[v6,2/2] POSIX locale covers every byte [BZ# 29511]

Message ID	969aa82c8d5904c1d2040bba87abe2f17a0dc647.1667409408.git.nabijaczleweli@nabijaczleweli.xyz
State	New
Headers	show Return-Path: <libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org> DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 75148385C421 Date: Wed, 2 Nov 2022 18:17:17 +0100 To: libc-alpha@sourceware.org Subject: [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511] Message-ID: <969aa82c8d5904c1d2040bba87abe2f17a0dc647.1667409408.git.nabijaczleweli@nabijaczleweli.xyz> References: <ad6720f44981d53ce50804d4ea3696ca1b7cd0b7.1663768863.git.nabijaczleweli@nabijaczleweli.xyz> MIME-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha512; protocol="application/pgp-signature"; boundary="xk24mnh6exgsd6mx" Content-Disposition: inline In-Reply-To: <ad6720f44981d53ce50804d4ea3696ca1b7cd0b7.1663768863.git.nabijaczleweli@nabijaczleweli.xyz> User-Agent: NeoMutt/20220429 Precedence: list From: =?utf-8?b?0L3QsNCxIHZpYSBMaWJjLWFscGhh?= <libc-alpha@sourceware.org> Reply-To: =?utf-8?b?0L3QsNCx?= <nabijaczleweli@nabijaczleweli.xyz> Cc: Florian Weimer <fweimer@redhat.com> Errors-To: libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org Sender: "Libc-alpha" <libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org>
Series	[v5,1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format \| expand [v5,1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format [v6,2/2] POSIX locale covers every byte [BZ# 29511]

* наб: > This is a logistically trivial patch, > largely duplicating the extant ASCII code with the error path changed I wouldn't say it's trivial in the commit message. 8-) > There are two user-facing changes: > * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968" > * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b > > Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively, > (a) is 1-byte, stateless, and contains 256 characters > (b) they collate in byte order > (c) the first 128 characters are equivalent to ASCII (like previous) > cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of > changes to the standard; > in short, this means that mbrtowc() must never fail and must return > b if b <= 0x7F else ab+c for all bytes b > where c is some constant >=0x80 > and a is a positive integer constant > > By strategically picking c=<UDF00> we land at the tail-end of the > Unicode Low Surrogate Area at DC00-DFFF, described as > > Isolated surrogate code points have no interpretation; > > consequently, no character code charts or names lists > > are provided for this range. > and match musl Sadly this doesn't match Python and PEP 540: >>> b'\x80'.decode('UTF-8', errors='surrogateescape') '\udc80' I believe the implementation translates this to 0xDF80 instead. Not sure what is more important here, musl compatibility or Python compatibility. Cc:ing Victor in case he has comments. I should probably ask on the musl list as well as how this divergence came to pass. This change definitely needs a NEWS entry. The mechanics of the patch look okay to me, just a few nits below. 
> diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h > index 1c6745043e..45ab1edfad 100644 > --- a/iconv/gconv_int.h > +++ b/iconv/gconv_int.h > @@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2) > > __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal); > __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii); > +__BUILTIN_TRANSFORM (__gconv_transform_posix_internal); > +__BUILTIN_TRANSFORM (__gconv_transform_internal_posix); > __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal); > __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8); > __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal); > @@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal); > only ASCII characters. */ > extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c); > > +/* Specialized conversion function for a single byte to INTERNAL, > + identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end > + of the Low Surrogate Area at [U+DF80, U+DFFF]. */ > +extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c); > + > #endif Missing attribute_hidden. Yes, it's also missing from __gconv_btwoc_ascii. The linker probably papers over it. > > __END_DECLS > diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c > new file mode 100644 > index 0000000000..dcb13fbb43 > --- /dev/null > +++ b/iconv/gconv_posix.c > @@ -0,0 +1,96 @@ > +/* Simple transformations functions. I think this line should say something about surrogate-escape encoding for the POSIX locale. 
> +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM > +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO > +#define LOOPFCT FROM_LOOP > +#define BODY \ > + { \ > + uint32_t val = *((const uint32_t *) inptr); \ > + if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff)) \ > + { \ > + UNICODE_TAG_HANDLER (val, 4); \ > + STANDARD_TO_LOOP_ERR_HANDLER (4); \ > + } \ > + else \ > + { \ > + if (__glibc_unlikely (val > 0x7f)) \ > + val -= 0xdf00; \ > + *outptr++ = val; \ > + inptr += sizeof (uint32_t); \ > + } \ > + } I suggest to drop the last __glibc_unlikely here because it's input-dependent. > +#define LOOP_NEED_FLAGS > +#include <iconv/loop.c> > +#include <iconv/skeleton.c> > diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh > index b3d8bf5110..a24d8d2207 100644 > --- a/iconv/tst-iconv_prog.sh > +++ b/iconv/tst-iconv_prog.sh > @@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do > execute_test > check_errtest_result > done > + > +allbytes () > +{ > + for (( i = 0; i <= 255; i++ )); do > + printf '\'"$(printf "%o" "$i")" > + done > +} > + > +allucs4be () > +{ > + for (( i = 0; i <= 127; i++ )); do > + printf '\0\0\0\'"$(printf "%o" "$i")" > + done > + for (( i = 128; i <= 255; i++ )); do > + printf '\0\0\xdf\'"$(printf "%o" "$i")" > + done > +} > + > +check_posix_result () > +{ > + if [ $? 
-eq 0 ]; then > + result=PASS > + else > + result=FAIL > + fi > + > + echo "$result: from \"$1\", to: \"$2\"" > + > + if [ "$result" != "PASS" ]; then > + exit 1 > + fi > +} > + > +check_posix_encoding () > +{ > + eval PROG=\"$ICONV\" > + allbytes | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be) > + check_posix_result POSIX UCS-4BE > + allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes) > + check_posix_result UCS-4BE POSIX > +} > + > +check_posix_encoding > diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh > index 4207b44175..33a02158ac 100755 > --- a/iconvdata/tst-tables.sh > +++ b/iconvdata/tst-tables.sh > @@ -31,6 +31,7 @@ cat <<EOF | > # Keep this list in the same order as gconv-modules. > # > # charset name table name comment > + POSIX > ASCII ANSI_X3.4-1968 > ISO646-GB BS_4730 > ISO646-CA CSA_Z243.4-1985-1 > diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c > index bfd34eee31..b379481844 100644 > --- a/inet/tst-idna_name_classify.c > +++ b/inet/tst-idna_name_classify.c > @@ -37,11 +37,11 @@ do_test (void) > puts ("info: C locale tests"); > locale_insensitive_tests (); > TEST_COMPARE (__idna_name_classify ("abc\200def"), > - idna_name_encoding_error); > + idna_name_nonascii); > TEST_COMPARE (__idna_name_classify ("abc\200\\def"), > - idna_name_encoding_error); > + idna_name_nonascii_backslash); > TEST_COMPARE (__idna_name_classify ("abc\377def"), > - idna_name_encoding_error); > + idna_name_nonascii); > > puts ("info: en_US.ISO-8859-1 locale tests"); > if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0) This seems to be okay, there is further test coverage for idna_name_encoding_error. 
> diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c > index 6bd0367069..f30396ae12 100644 > --- a/locale/tst-C-locale.c > +++ b/locale/tst-C-locale.c > @@ -229,6 +229,75 @@ run_test (const char *locname) > STRTEST (YESSTR, ""); > STRTEST (NOSTR, ""); > > +#define CONVTEST(b, v) \ > + { \ > + unsigned char bs[] = {b, 0}; \ > + mbstate_t ctx = {}; \ > + wchar_t wc = -1; \ > + size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx); \ Missing space before '(' (also in other cases below). Not sure if the macros are needed, maybe write one loop for each direction with a condition in it? > diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c > index 44844e71c3..e66242b58f 100644 > --- a/stdio-common/tst-printf-bz25691.c > +++ b/stdio-common/tst-printf-bz25691.c > @@ -30,6 +30,8 @@ > static int > do_test (void) > { > + setlocale(LC_CTYPE, "C.UTF-8"); > + > mtrace (); > > /* For 's' conversion specifier with 'l' modifier the array must be What's the rationale for this change? If it is really required, you must also update stdio-common/Makefile with a new dependency on $(gen-locales). Thanks, Florian

diff --git a/iconv/Makefile b/iconv/Makefile index a0d90cfeac..6e926f53e3 100644 --- a/iconv/Makefile +++ b/iconv/Makefile @@ -25,7 +25,7 @@ include ../Makeconfig headers = iconv.h gconv.h routines = iconv_open iconv iconv_close \ gconv_open gconv gconv_close gconv_db gconv_conf \ - gconv_builtin gconv_simple gconv_trans gconv_cache + gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache routines += gconv_dl gconv_charset vpath %.c ../locale/programs ../intl diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h index 68c2369b1f..cd1805b3ce 100644 --- a/iconv/gconv_builtin.h +++ b/iconv/gconv_builtin.h @@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii", __gconv_transform_internal_ascii, NULL, 4, 4, 1, 1) +BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL", + __gconv_transform_posix_internal, __gconv_btwoc_posix, + 1, 1, 4, 4) + +BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix", + __gconv_transform_internal_posix, NULL, 4, 4, 1, 1) + + #if BYTE_ORDER == BIG_ENDIAN BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/") BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/") diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h index 1c6745043e..45ab1edfad 100644 --- a/iconv/gconv_int.h +++ b/iconv/gconv_int.h @@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2) __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal); __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii); +__BUILTIN_TRANSFORM (__gconv_transform_posix_internal); +__BUILTIN_TRANSFORM (__gconv_transform_internal_posix); __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal); __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8); __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal); @@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal); only ASCII characters. 
*/ extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c); +/* Specialized conversion function for a single byte to INTERNAL, + identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end + of the Low Surrogate Area at [U+DF80, U+DFFF]. */ +extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c); + #endif __END_DECLS diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c new file mode 100644 index 0000000000..dcb13fbb43 --- /dev/null +++ b/iconv/gconv_posix.c @@ -0,0 +1,96 @@ +/* Simple transformations functions. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <gconv_int.h> + + +/* Specialized conversion function for a single byte to INTERNAL, + identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end + of the Low Surrogate Area at [U+DF80, U+DFFF]. */ +wint_t +__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c) +{ + if (c < 0x80) + return c; + else + return 0xdf00 + c; +} + + +/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]} + to the internal (UCS4-like) format. 
*/ +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 1 +#define MIN_NEEDED_TO 4 +#define FROM_DIRECTION 1 +#define FROM_LOOP posix_internal_loop +#define TO_LOOP posix_internal_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_posix_internal +#define ONE_DIRECTION 1 + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + if (__glibc_unlikely (*inptr > '\x7f')) \ + *((uint32_t *) outptr) = 0xdf00 + *inptr++; \ + else \ + *((uint32_t *) outptr) = *inptr++; \ + outptr += sizeof (uint32_t); \ + } +#include <iconv/loop.c> +#include <iconv/skeleton.c> + + +/* Convert from the internal (UCS4-like) format to + {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}. */ +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 4 +#define MIN_NEEDED_TO 1 +#define FROM_DIRECTION 1 +#define FROM_LOOP internal_posix_loop +#define TO_LOOP internal_posix_loop /* This is not used. 
*/ +#define FUNCTION_NAME __gconv_transform_internal_posix +#define ONE_DIRECTION 1 + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + uint32_t val = *((const uint32_t *) inptr); \ + if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff)) \ + { \ + UNICODE_TAG_HANDLER (val, 4); \ + STANDARD_TO_LOOP_ERR_HANDLER (4); \ + } \ + else \ + { \ + if (__glibc_unlikely (val > 0x7f)) \ + val -= 0xdf00; \ + *outptr++ = val; \ + inptr += sizeof (uint32_t); \ + } \ + } +#define LOOP_NEED_FLAGS +#include <iconv/loop.c> +#include <iconv/skeleton.c> diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh index b3d8bf5110..a24d8d2207 100644 --- a/iconv/tst-iconv_prog.sh +++ b/iconv/tst-iconv_prog.sh @@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do execute_test check_errtest_result done + +allbytes () +{ + for (( i = 0; i <= 255; i++ )); do + printf '\'"$(printf "%o" "$i")" + done +} + +allucs4be () +{ + for (( i = 0; i <= 127; i++ )); do + printf '\0\0\0\'"$(printf "%o" "$i")" + done + for (( i = 128; i <= 255; i++ )); do + printf '\0\0\xdf\'"$(printf "%o" "$i")" + done +} + +check_posix_result () +{ + if [ $? -eq 0 ]; then + result=PASS + else + result=FAIL + fi + + echo "$result: from \"$1\", to: \"$2\"" + + if [ "$result" != "PASS" ]; then + exit 1 + fi +} + +check_posix_encoding () +{ + eval PROG=\"$ICONV\" + allbytes | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be) + check_posix_result POSIX UCS-4BE + allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes) + check_posix_result UCS-4BE POSIX +} + +check_posix_encoding diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh index 4207b44175..33a02158ac 100755 --- a/iconvdata/tst-tables.sh +++ b/iconvdata/tst-tables.sh @@ -31,6 +31,7 @@ cat <<EOF | # Keep this list in the same order as gconv-modules. 
# # charset name table name comment + POSIX ASCII ANSI_X3.4-1968 ISO646-GB BS_4730 ISO646-CA CSA_Z243.4-1985-1 diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c index bfd34eee31..b379481844 100644 --- a/inet/tst-idna_name_classify.c +++ b/inet/tst-idna_name_classify.c @@ -37,11 +37,11 @@ do_test (void) puts ("info: C locale tests"); locale_insensitive_tests (); TEST_COMPARE (__idna_name_classify ("abc\200def"), - idna_name_encoding_error); + idna_name_nonascii); TEST_COMPARE (__idna_name_classify ("abc\200\\def"), - idna_name_encoding_error); + idna_name_nonascii_backslash); TEST_COMPARE (__idna_name_classify ("abc\377def"), - idna_name_encoding_error); + idna_name_nonascii); puts ("info: en_US.ISO-8859-1 locale tests"); if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0) diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c index 6bd0367069..f30396ae12 100644 --- a/locale/tst-C-locale.c +++ b/locale/tst-C-locale.c @@ -229,6 +229,75 @@ run_test (const char *locname) STRTEST (YESSTR, ""); STRTEST (NOSTR, ""); +#define CONVTEST(b, v) \ + { \ + unsigned char bs[] = {b, 0}; \ + mbstate_t ctx = {}; \ + wchar_t wc = -1; \ + size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx); \ + if (sz != !!b) \ + { \ + printf ("mbrtowc(%02hhx) width in locale %s wrong " \ + "(is %zd, should be %d)\n", *bs, locname, sz, !!b); \ + result = 1; \ + } \ + if (wc != v) \ + { \ + printf ("mbrtowc(%02hhx) value in locale %s wrong " \ + "(is %x, should be %x)\n", *bs, locname, wc, v); \ + result = 1; \ + } \ + } + for(int i = 0; i <= 0x7f; ++i) + CONVTEST(i, i); + for(int i = 0x80; i <= 0xff; ++i) + CONVTEST(i, 0xdf00 + i); + +#define DECONVTEST(v, b) \ + { \ + unsigned char ob = -1; \ + mbstate_t ctx = {}; \ + size_t sz = wcrtomb((char *) &ob, v, &ctx); \ + if (sz != 1) \ + { \ + printf ("wcrtomb(%x) width in locale %s wrong " \ + "(is %zd, should be 1)\n", v, locname, sz); \ + result = 1; \ + } \ + if (ob != b) \ + { \ + printf ("wcrtomb(%x) value in locale %s wrong " 
\ + "(is %hhx, should be %hhx)\n", v, locname, ob, b); \ + result = 1; \ + } \ + } +#define DECONVERR(v) \ + { \ + unsigned char ob = -1; \ + mbstate_t ctx = {}; \ + size_t sz = wcrtomb((char *) &ob, v, &ctx); \ + if (sz != (size_t) -1) \ + { \ + printf ("wcrtomb(%x) width in locale %s wrong " \ + "(is %zd, should be (size_t )-1)\n", v, locname, sz); \ + result = 1; \ + } \ + if (ob != (unsigned char) -1) \ + { \ + printf ("wcrtomb(%x) value in locale %s wrong " \ + "(is %hhx, should be unchanged)\n", v, locname, ob); \ + result = 1; \ + } \ + } + for(int i = 0; i <= 0x7f; ++i) + DECONVTEST(i, i); + for(int i = 0x80; i < 0xdf00; ++i) + DECONVERR(i); + for(int i = 0x80; i <= 0xff; ++i) + DECONVTEST(0xdf00 + i, i); + for(int i = 0xe000; i <= 0xffff; ++i) + DECONVERR(i); + /* Test the new locale mechanisms. */ loc = newlocale (LC_ALL_MASK, locname, NULL); if (loc == NULL) diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX new file mode 100644 index 0000000000..c44007ff49 --- /dev/null +++ b/localedata/charmaps/POSIX @@ -0,0 +1,136 @@ +<code_set_name> POSIX +<comment_char> % +<escape_char> / +% source: cf. 
localedata/locales/POSIX, LC_COLLATE + +CHARMAP +<U0000> /x00 NULL (NUL) +<U0001> /x01 START OF HEADING (SOH) +<U0002> /x02 START OF TEXT (STX) +<U0003> /x03 END OF TEXT (ETX) +<U0004> /x04 END OF TRANSMISSION (EOT) +<U0005> /x05 ENQUIRY (ENQ) +<U0006> /x06 ACKNOWLEDGE (ACK) +<U0007> /x07 BELL (BEL) +<U0008> /x08 BACKSPACE (BS) +<U0009> /x09 CHARACTER TABULATION (HT) +<U000A> /x0a LINE FEED (LF) +<U000B> /x0b LINE TABULATION (VT) +<U000C> /x0c FORM FEED (FF) +<U000D> /x0d CARRIAGE RETURN (CR) +<U000E> /x0e SHIFT OUT (SO) +<U000F> /x0f SHIFT IN (SI) +<U0010> /x10 DATALINK ESCAPE (DLE) +<U0011> /x11 DEVICE CONTROL ONE (DC1) +<U0012> /x12 DEVICE CONTROL TWO (DC2) +<U0013> /x13 DEVICE CONTROL THREE (DC3) +<U0014> /x14 DEVICE CONTROL FOUR (DC4) +<U0015> /x15 NEGATIVE ACKNOWLEDGE (NAK) +<U0016> /x16 SYNCHRONOUS IDLE (SYN) +<U0017> /x17 END OF TRANSMISSION BLOCK (ETB) +<U0018> /x18 CANCEL (CAN) +<U0019> /x19 END OF MEDIUM (EM) +<U001A> /x1a SUBSTITUTE (SUB) +<U001B> /x1b ESCAPE (ESC) +<U001C> /x1c FILE SEPARATOR (IS4) +<U001D> /x1d GROUP SEPARATOR (IS3) +<U001E> /x1e RECORD SEPARATOR (IS2) +<U001F> /x1f UNIT SEPARATOR (IS1) +<U0020> /x20 SPACE +<U0021> /x21 EXCLAMATION MARK +<U0022> /x22 QUOTATION MARK +<U0023> /x23 NUMBER SIGN +<U0024> /x24 DOLLAR SIGN +<U0025> /x25 PERCENT SIGN +<U0026> /x26 AMPERSAND +<U0027> /x27 APOSTROPHE +<U0028> /x28 LEFT PARENTHESIS +<U0029> /x29 RIGHT PARENTHESIS +<U002A> /x2a ASTERISK +<U002B> /x2b PLUS SIGN +<U002C> /x2c COMMA +<U002D> /x2d HYPHEN-MINUS +<U002E> /x2e FULL STOP +<U002F> /x2f SOLIDUS +<U0030> /x30 DIGIT ZERO +<U0031> /x31 DIGIT ONE +<U0032> /x32 DIGIT TWO +<U0033> /x33 DIGIT THREE +<U0034> /x34 DIGIT FOUR +<U0035> /x35 DIGIT FIVE +<U0036> /x36 DIGIT SIX +<U0037> /x37 DIGIT SEVEN +<U0038> /x38 DIGIT EIGHT +<U0039> /x39 DIGIT NINE +<U003A> /x3a COLON +<U003B> /x3b SEMICOLON +<U003C> /x3c LESS-THAN SIGN +<U003D> /x3d EQUALS SIGN +<U003E> /x3e GREATER-THAN SIGN +<U003F> /x3f QUESTION MARK +<U0040> /x40 COMMERCIAL AT +<U0041> /x41 
LATIN CAPITAL LETTER A +<U0042> /x42 LATIN CAPITAL LETTER B +<U0043> /x43 LATIN CAPITAL LETTER C +<U0044> /x44 LATIN CAPITAL LETTER D +<U0045> /x45 LATIN CAPITAL LETTER E +<U0046> /x46 LATIN CAPITAL LETTER F +<U0047> /x47 LATIN CAPITAL LETTER G +<U0048> /x48 LATIN CAPITAL LETTER H +<U0049> /x49 LATIN CAPITAL LETTER I +<U004A> /x4a LATIN CAPITAL LETTER J +<U004B> /x4b LATIN CAPITAL LETTER K +<U004C> /x4c LATIN CAPITAL LETTER L +<U004D> /x4d LATIN CAPITAL LETTER M +<U004E> /x4e LATIN CAPITAL LETTER N +<U004F> /x4f LATIN CAPITAL LETTER O +<U0050> /x50 LATIN CAPITAL LETTER P +<U0051> /x51 LATIN CAPITAL LETTER Q +<U0052> /x52 LATIN CAPITAL LETTER R +<U0053> /x53 LATIN CAPITAL LETTER S +<U0054> /x54 LATIN CAPITAL LETTER T +<U0055> /x55 LATIN CAPITAL LETTER U +<U0056> /x56 LATIN CAPITAL LETTER V +<U0057> /x57 LATIN CAPITAL LETTER W +<U0058> /x58 LATIN CAPITAL LETTER X +<U0059> /x59 LATIN CAPITAL LETTER Y +<U005A> /x5a LATIN CAPITAL LETTER Z +<U005B> /x5b LEFT SQUARE BRACKET +<U005C> /x5c REVERSE SOLIDUS +<U005D> /x5d RIGHT SQUARE BRACKET +<U005E> /x5e CIRCUMFLEX ACCENT +<U005F> /x5f LOW LINE +<U0060> /x60 GRAVE ACCENT +<U0061> /x61 LATIN SMALL LETTER A +<U0062> /x62 LATIN SMALL LETTER B +<U0063> /x63 LATIN SMALL LETTER C +<U0064> /x64 LATIN SMALL LETTER D +<U0065> /x65 LATIN SMALL LETTER E +<U0066> /x66 LATIN SMALL LETTER F +<U0067> /x67 LATIN SMALL LETTER G +<U0068> /x68 LATIN SMALL LETTER H +<U0069> /x69 LATIN SMALL LETTER I +<U006A> /x6a LATIN SMALL LETTER J +<U006B> /x6b LATIN SMALL LETTER K +<U006C> /x6c LATIN SMALL LETTER L +<U006D> /x6d LATIN SMALL LETTER M +<U006E> /x6e LATIN SMALL LETTER N +<U006F> /x6f LATIN SMALL LETTER O +<U0070> /x70 LATIN SMALL LETTER P +<U0071> /x71 LATIN SMALL LETTER Q +<U0072> /x72 LATIN SMALL LETTER R +<U0073> /x73 LATIN SMALL LETTER S +<U0074> /x74 LATIN SMALL LETTER T +<U0075> /x75 LATIN SMALL LETTER U +<U0076> /x76 LATIN SMALL LETTER V +<U0077> /x77 LATIN SMALL LETTER W +<U0078> /x78 LATIN SMALL LETTER X +<U0079> /x79 LATIN SMALL 
LETTER Y +<U007A> /x7a LATIN SMALL LETTER Z +<U007B> /x7b LEFT CURLY BRACKET +<U007C> /x7c VERTICAL LINE +<U007D> /x7d RIGHT CURLY BRACKET +<U007E> /x7e TILDE +<U007F> /x7f DELETE (DEL) +<UDF80>..<UDFFF> /x80 +END CHARMAP diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX index 7ec7f1c577..fc34a6abc1 100644 --- a/localedata/locales/POSIX +++ b/localedata/locales/POSIX @@ -97,6 +97,20 @@ END LC_CTYPE LC_COLLATE % This is the POSIX Locale definition for the LC_COLLATE category. % The order is the same as in the ASCII code set. +% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2, +% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale: +% > All characters not explicitly listed here shall be inserted +% > in the character collation order after the listed characters +% > and shall be assigned unique primary weights. If the listed +% > characters have ASCII encoding, the other characters shall +% > be in ascending order according to their coded character set values +% Since Issue 7 TC2 (XBD, 6.2 Character Encoding): +% > The POSIX locale shall contain 256 single-byte characters [...] +% (cf. bug 663, 674). +% this is in contrast to previous issues, which limited the POSIX +% locale to the Portable Character Set (7-bit ASCII). 
+% We use the end of the Low Surrogate Area to contain these, +% yielding [<UDF80>, <UDFFF>] order_start forward <U0000> <U0001> @@ -226,7 +240,134 @@ order_start forward <U007D> <U007E> <U007F> -UNDEFINED +<UDF80> +<UDF81> +<UDF82> +<UDF83> +<UDF84> +<UDF85> +<UDF86> +<UDF87> +<UDF88> +<UDF89> +<UDF8A> +<UDF8B> +<UDF8C> +<UDF8D> +<UDF8E> +<UDF8F> +<UDF90> +<UDF91> +<UDF92> +<UDF93> +<UDF94> +<UDF95> +<UDF96> +<UDF97> +<UDF98> +<UDF99> +<UDF9A> +<UDF9B> +<UDF9C> +<UDF9D> +<UDF9E> +<UDF9F> +<UDFA0> +<UDFA1> +<UDFA2> +<UDFA3> +<UDFA4> +<UDFA5> +<UDFA6> +<UDFA7> +<UDFA8> +<UDFA9> +<UDFAA> +<UDFAB> +<UDFAC> +<UDFAD> +<UDFAE> +<UDFAF> +<UDFB0> +<UDFB1> +<UDFB2> +<UDFB3> +<UDFB4> +<UDFB5> +<UDFB6> +<UDFB7> +<UDFB8> +<UDFB9> +<UDFBA> +<UDFBB> +<UDFBC> +<UDFBD> +<UDFBE> +<UDFBF> +<UDFC0> +<UDFC1> +<UDFC2> +<UDFC3> +<UDFC4> +<UDFC5> +<UDFC6> +<UDFC7> +<UDFC8> +<UDFC9> +<UDFCA> +<UDFCB> +<UDFCC> +<UDFCD> +<UDFCE> +<UDFCF> +<UDFD0> +<UDFD1> +<UDFD2> +<UDFD3> +<UDFD4> +<UDFD5> +<UDFD6> +<UDFD7> +<UDFD8> +<UDFD9> +<UDFDA> +<UDFDB> +<UDFDC> +<UDFDD> +<UDFDE> +<UDFDF> +<UDFE0> +<UDFE1> +<UDFE2> +<UDFE3> +<UDFE4> +<UDFE5> +<UDFE6> +<UDFE7> +<UDFE8> +<UDFE9> +<UDFEA> +<UDFEB> +<UDFEC> +<UDFED> +<UDFEE> +<UDFEF> +<UDFF0> +<UDFF1> +<UDFF2> +<UDFF3> +<UDFF4> +<UDFF5> +<UDFF6> +<UDFF7> +<UDFF8> +<UDFF9> +<UDFFA> +<UDFFB> +<UDFFC> +<UDFFD> +<UDFFE> +<UDFFF> order_end % END LC_COLLATE diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c index 44844e71c3..e66242b58f 100644 --- a/stdio-common/tst-printf-bz25691.c +++ b/stdio-common/tst-printf-bz25691.c @@ -30,6 +30,8 @@ static int do_test (void) { + setlocale(LC_CTYPE, "C.UTF-8"); + mtrace (); /* For 's' conversion specifier with 'l' modifier the array must be diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c index 0f0f55f9ed..97de9afd25 100644 --- a/wcsmbs/wcsmbsload.c +++ b/wcsmbs/wcsmbsload.c @@ -33,10 +33,10 @@ static const struct __gconv_step to_wc = .__shlib_handle = NULL, .__modname = NULL, 
.__counter = INT_MAX, - .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT", + .__from_name = (char *) "POSIX", .__to_name = (char *) "INTERNAL", - .__fct = __gconv_transform_ascii_internal, - .__btowc_fct = __gconv_btwoc_ascii, + .__fct = __gconv_transform_posix_internal, + .__btowc_fct = __gconv_btwoc_posix, .__init_fct = NULL, .__end_fct = NULL, .__min_needed_from = 1, @@ -53,8 +53,8 @@ static const struct __gconv_step to_mb = .__modname = NULL, .__counter = INT_MAX, .__from_name = (char *) "INTERNAL", - .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT", - .__fct = __gconv_transform_internal_ascii, + .__to_name = (char *) "POSIX", + .__fct = __gconv_transform_internal_posix, .__btowc_fct = NULL, .__init_fct = NULL, .__end_fct = NULL, @@ -67,7 +67,9 @@ static const struct __gconv_step to_mb = }; -/* For the default locale we only have to handle ANSI_X3.4-1968. */ +/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping + with ANSI_X3.4-1968 in the first 128 characters; + we lift the remaining bytes by <UDF00>. */ const struct gconv_fcts __wcsmbs_gconv_fcts_c = { .towc = (struct __gconv_step *) &to_wc,

[v6,2/2] POSIX locale covers every byte [BZ# 29511]

Commit Message

Comments

Patch