diff mbox series

iconv: Fix matching of multi-character transliterations (bug 31859)

Message ID mvm8qzar1de.fsf@suse.de
State New
Headers show
Series iconv: Fix matching of multi-character transliterations (bug 31859) | expand

Commit Message

Andreas Schwab June 12, 2024, 8:50 a.m. UTC
Only return __GCONV_INCOMPLETE_INPUT for a partial match when the end of
the input buffer is reached.  Otherwise it is a non-match, and other
patterns should be tried.
---
 iconv/Makefile              | 13 ++++++++++
 iconv/gconv_trans.c         |  2 +-
 iconv/tst-translit-locale   | 10 ++++++++
 iconv/tst-translit-mchar.c  | 48 +++++++++++++++++++++++++++++++++++++
 iconv/tst-translit-mchar.sh | 48 +++++++++++++++++++++++++++++++++++++
 5 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 iconv/tst-translit-locale
 create mode 100644 iconv/tst-translit-mchar.c
 create mode 100644 iconv/tst-translit-mchar.sh

Comments

Carlos O'Donell June 20, 2024, 6:39 p.m. UTC | #1
On 6/12/24 4:50 AM, Andreas Schwab wrote:
> Only return __GCONV_INCOMPLETE_INPUT for a partial match when the end of
> the input buffer is reached.  Otherwise it is a non-match, and other
> patterns should be tried.

Looking forward to v4.

Fix looks correct to me.

I have comments about the test case.

> ---
>  iconv/Makefile              | 13 ++++++++++
>  iconv/gconv_trans.c         |  2 +-
>  iconv/tst-translit-locale   | 10 ++++++++
>  iconv/tst-translit-mchar.c  | 48 +++++++++++++++++++++++++++++++++++++
>  iconv/tst-translit-mchar.sh | 48 +++++++++++++++++++++++++++++++++++++
>  5 files changed, 120 insertions(+), 1 deletion(-)
>  create mode 100644 iconv/tst-translit-locale
>  create mode 100644 iconv/tst-translit-mchar.c
>  create mode 100644 iconv/tst-translit-mchar.sh
> 
> diff --git a/iconv/Makefile b/iconv/Makefile
> index 63afc853ff..e93322da85 100644
> --- a/iconv/Makefile
> +++ b/iconv/Makefile
> @@ -57,6 +57,10 @@ tests = \
>  	tst-iconv-opt \
>  	# tests
>  
> +test-srcs := \
> +	tst-translit-mchar \
> +	# test-srcs

OK. Add test sources.

> +
>  others		= iconv_prog iconvconfig
>  install-others-programs	= $(inst_bindir)/iconv
>  install-sbin	= iconvconfig
> @@ -73,6 +77,7 @@ include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
>  ifeq ($(run-built-tests),yes)
>  xtests-special += $(objpfx)test-iconvconfig.out
>  tests-special += $(objpfx)tst-iconv_prog.out
> +tests-special += $(objpfx)tst-translit-mchar.out

OK. Add special test.

>  endif
>  
>  # Make a copy of the file because gconv module names are constructed
> @@ -126,3 +131,11 @@ $(objpfx)tst-iconv_prog.out: tst-iconv_prog.sh $(objpfx)iconv_prog
>  	$(BASH) $< $(common-objdir) '$(test-wrapper-env)' \
>  		 '$(run-program-env)' > $@; \
>  	$(evaluate-test)
> +
> +$(objpfx)tst-translit-mchar.out: tst-translit-mchar.sh \
> +			$(objpfx)tst-translit-mchar \
> +			tst-translit-locale

We should be able to use LOCALES to just compile tst-tranlit-locale?

> +	$(SHELL) $< $(common-objpfx) '$(run-program-prefix-before-env)' \
> +		 '$(run-program-env)' '$(run-program-prefix-after-env)' \
> +		 $< > $@; \
> +	$(evaluate-test)

OK. 

> diff --git a/iconv/gconv_trans.c b/iconv/gconv_trans.c
> index 08b7a3f71d..44f0fd849a 100644
> --- a/iconv/gconv_trans.c
> +++ b/iconv/gconv_trans.c
> @@ -150,7 +150,7 @@ __gconv_transliterate (struct __gconv_step *step,
>  
>  	  /* Nothing found, continue searching.  */
>  	}
> -      else if (cnt > 0)
> +      else if (cnt > 0 && winbuf + cnt == winbufend)

OK. Looks correct to me, we not only need to match the prefix but must also be the
right length.

>  	/* This means that the input buffer contents matches a prefix of
>  	   an entry.  Since we cannot match it unless we get more input,
>  	   we will tell the caller about it.  */
> diff --git a/iconv/tst-translit-locale b/iconv/tst-translit-locale
> new file mode 100644
> index 0000000000..712b08628a
> --- /dev/null
> +++ b/iconv/tst-translit-locale
> @@ -0,0 +1,10 @@
> +# Test multi-character transliteration rule
> +
> +LC_CTYPE
> +copy "POSIX"
> +
> +translit_start
> +"ÄÄ" "AA"

OK. New locale, that provides a transliteration.

> +translit_end
> +
> +END LC_CTYPE
> diff --git a/iconv/tst-translit-mchar.c b/iconv/tst-translit-mchar.c
> new file mode 100644
> index 0000000000..72335976d6
> --- /dev/null
> +++ b/iconv/tst-translit-mchar.c
> @@ -0,0 +1,48 @@
> +/* Test multi-character transliterations.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <locale.h>
> +#include <iconv.h>
> +#include <support/support.h>
> +#include <support/check.h>
> +
> +static int
> +do_test (void)
> +{
> +  iconv_t cd;
> +  /* An input sequence that shares a common prefix with a transliteration
> +     rule.  */
> +  char input[] = "\xc3\x84\xc3\x85";

For a simple rule test like this may we just use raw UTF-8?

It makes it easier to read and understand.

e.g. "ÄÅ"

> +  char *inptr = input;
> +  char outbuf[10];
> +  char *outptr = outbuf;
> +  size_t inlen = sizeof (input), outlen = sizeof (outbuf);
> +  size_t n;
> +
> +  xsetlocale (LC_CTYPE, "tst-translit");
> +
> +  cd = iconv_open ("ASCII//TRANSLIT", "UTF-8");
> +  TEST_VERIFY (cd != (iconv_t) -1);
> +
> +  /* This call used to loop infinitely.  */
> +  n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
> +  TEST_VERIFY (iconv_close (cd) == 0);
> +  return n == 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/iconv/tst-translit-mchar.sh b/iconv/tst-translit-mchar.sh
> new file mode 100644
> index 0000000000..79efd6abc8
> --- /dev/null
> +++ b/iconv/tst-translit-mchar.sh
> @@ -0,0 +1,48 @@
> +#!/bin/sh
> +# Testing of multi-character transliterations
> +# Copyright (C) 2024 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +
> +set -e
> +
> +common_objpfx=$1
> +run_program_prefix_before_env=$2
> +run_program_env=$3
> +run_program_prefix_after_env=$4
> +
> +# Generate data files.
> +${run_program_prefix_before_env} \
> +${run_program_env} \
> +I18NPATH=../localedata \
> +${run_program_prefix_after_env} ${common_objpfx}locale/localedef \
> +--quiet -i tst-translit-locale -f UTF-8 ${common_objpfx}iconv/tst-translit || ret=$?
> +if [ $ret -ne 1 ]; then
> +  echo "FAIL: Locale compilation for tst-translit-locale failed (error $ret)."
> +  exit 1
> +fi

Can this just use LOCALES in iconv/Makefile?

It should be able to compile the locale for you as all the other locales are compiled.

> +
> +set -x
> +
> +# Run the test.
> +${run_program_prefix_before_env} \
> +${run_program_env} \
> +LOCPATH=${common_objpfx}iconv \
> +${run_program_prefix_after_env} ${common_objpfx}iconv/tst-translit-mchar

Then you can get rid of those whole shell script and fold it up the iconv/Makefile as a test?

> +
> +# Local Variables:
> +#  mode:shell-script
> +# End:
Andreas Schwab June 24, 2024, 7:20 a.m. UTC | #2
On Jun 20 2024, Carlos O'Donell wrote:

> We should be able to use LOCALES to just compile tst-tranlit-locale?

There are two issues here:

- LOCALES can only use locale sources from localedata/locales
- tst-tranlit-locale is a stub locale (only contains LC_CTYPE) that
  localedef warns about, and gen-locale.sh does not accept that.
diff mbox series

Patch

diff --git a/iconv/Makefile b/iconv/Makefile
index 63afc853ff..e93322da85 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -57,6 +57,10 @@  tests = \
 	tst-iconv-opt \
 	# tests
 
+test-srcs := \
+	tst-translit-mchar \
+	# test-srcs
+
 others		= iconv_prog iconvconfig
 install-others-programs	= $(inst_bindir)/iconv
 install-sbin	= iconvconfig
@@ -73,6 +77,7 @@  include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
 ifeq ($(run-built-tests),yes)
 xtests-special += $(objpfx)test-iconvconfig.out
 tests-special += $(objpfx)tst-iconv_prog.out
+tests-special += $(objpfx)tst-translit-mchar.out
 endif
 
 # Make a copy of the file because gconv module names are constructed
@@ -126,3 +131,11 @@  $(objpfx)tst-iconv_prog.out: tst-iconv_prog.sh $(objpfx)iconv_prog
 	$(BASH) $< $(common-objdir) '$(test-wrapper-env)' \
 		 '$(run-program-env)' > $@; \
 	$(evaluate-test)
+
+$(objpfx)tst-translit-mchar.out: tst-translit-mchar.sh \
+			$(objpfx)tst-translit-mchar \
+			tst-translit-locale
+	$(SHELL) $< $(common-objpfx) '$(run-program-prefix-before-env)' \
+		 '$(run-program-env)' '$(run-program-prefix-after-env)' \
+		 $< > $@; \
+	$(evaluate-test)
diff --git a/iconv/gconv_trans.c b/iconv/gconv_trans.c
index 08b7a3f71d..44f0fd849a 100644
--- a/iconv/gconv_trans.c
+++ b/iconv/gconv_trans.c
@@ -150,7 +150,7 @@  __gconv_transliterate (struct __gconv_step *step,
 
 	  /* Nothing found, continue searching.  */
 	}
-      else if (cnt > 0)
+      else if (cnt > 0 && winbuf + cnt == winbufend)
 	/* This means that the input buffer contents matches a prefix of
 	   an entry.  Since we cannot match it unless we get more input,
 	   we will tell the caller about it.  */
diff --git a/iconv/tst-translit-locale b/iconv/tst-translit-locale
new file mode 100644
index 0000000000..712b08628a
--- /dev/null
+++ b/iconv/tst-translit-locale
@@ -0,0 +1,10 @@ 
+# Test multi-character transliteration rule
+
+LC_CTYPE
+copy "POSIX"
+
+translit_start
+"ÄÄ" "AA"
+translit_end
+
+END LC_CTYPE
diff --git a/iconv/tst-translit-mchar.c b/iconv/tst-translit-mchar.c
new file mode 100644
index 0000000000..72335976d6
--- /dev/null
+++ b/iconv/tst-translit-mchar.c
@@ -0,0 +1,48 @@ 
+/* Test multi-character transliterations.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <locale.h>
+#include <iconv.h>
+#include <support/support.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  iconv_t cd;
+  /* An input sequence that shares a common prefix with a transliteration
+     rule.  */
+  char input[] = "\xc3\x84\xc3\x85";
+  char *inptr = input;
+  char outbuf[10];
+  char *outptr = outbuf;
+  size_t inlen = sizeof (input), outlen = sizeof (outbuf);
+  size_t n;
+
+  xsetlocale (LC_CTYPE, "tst-translit");
+
+  cd = iconv_open ("ASCII//TRANSLIT", "UTF-8");
+  TEST_VERIFY (cd != (iconv_t) -1);
+
+  /* This call used to loop infinitely.  */
+  n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
+  TEST_VERIFY (iconv_close (cd) == 0);
+  return n == 0;
+}
+
+#include <support/test-driver.c>
diff --git a/iconv/tst-translit-mchar.sh b/iconv/tst-translit-mchar.sh
new file mode 100644
index 0000000000..79efd6abc8
--- /dev/null
+++ b/iconv/tst-translit-mchar.sh
@@ -0,0 +1,48 @@ 
+#!/bin/sh
+# Testing of multi-character transliterations
+# Copyright (C) 2024 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+set -e
+
+common_objpfx=$1
+run_program_prefix_before_env=$2
+run_program_env=$3
+run_program_prefix_after_env=$4
+
+# Generate data files.
+${run_program_prefix_before_env} \
+${run_program_env} \
+I18NPATH=../localedata \
+${run_program_prefix_after_env} ${common_objpfx}locale/localedef \
+--quiet -i tst-translit-locale -f UTF-8 ${common_objpfx}iconv/tst-translit || ret=$?
+if [ $ret -ne 1 ]; then
+  echo "FAIL: Locale compilation for tst-translit-locale failed (error $ret)."
+  exit 1
+fi
+
+set -x
+
+# Run the test.
+${run_program_prefix_before_env} \
+${run_program_env} \
+LOCPATH=${common_objpfx}iconv \
+${run_program_prefix_after_env} ${common_objpfx}iconv/tst-translit-mchar
+
+# Local Variables:
+#  mode:shell-script
+# End: