@@ -54,6 +54,16 @@ Major new features:
explicitly enabled, then fortify source is forcibly disabled so to keep
original behavior unchanged.
+* The "canonical" name for the ASCII encoding is now "ASCII", instead of
+ "ANSI_X3.4-1968". "ANSI_X3.4-1968" is no longer an alias for "ASCII".
+
+* The "ANSI_X3.4-1968" encoding is now a distinct, fully reversible,
+ 8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+ identity-mapping bytes in the ASCII [0, 0x7F] range,
+ and mapping [0x80, 0xFF] bytes to [<U+DC80>, <U+DCFF>].
+ The standard now requires the "POSIX"/"C" locale to have an encoding
+ with these features: 8-bit transparency and a continuous collation sequence.
+
Deprecated and removed features, and other changes affecting compatibility:
* libcrypt is no longer built by default, one may use the --enable-crypt
@@ -25,7 +25,7 @@ include ../Makeconfig
headers = iconv.h gconv.h
routines = iconv_open iconv iconv_close \
gconv_open gconv gconv_close gconv_db gconv_conf \
- gconv_builtin gconv_simple gconv_trans gconv_cache
+ gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
routines += gconv_dl gconv_charset
vpath %.c ../locale/programs ../intl
@@ -68,27 +68,34 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UCS2/", 1, "=INTERNAL->ucs2",
__gconv_transform_internal_ucs2, NULL, 4, 4, 2, 2)
-BUILTIN_ALIAS ("ANSI_X3.4//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("ISO-IR-6//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("ANSI_X3.4-1986//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("ISO_646.IRV:1991//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("ASCII//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("ISO646-US//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("US-ASCII//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("US//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("IBM367//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("CP367//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("CSASCII//", "ANSI_X3.4-1968//")
-BUILTIN_ALIAS ("OSF00010020//", "ANSI_X3.4-1968//")
-
-BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "INTERNAL", 1, "=ascii->INTERNAL",
+BUILTIN_ALIAS ("ANSI_X3.4//", "ASCII//") /* "ASCII//" itself is the module.  */
+BUILTIN_ALIAS ("ISO-IR-6//", "ASCII//")
+BUILTIN_ALIAS ("ANSI_X3.4-1986//", "ASCII//")
+BUILTIN_ALIAS ("ISO_646.IRV:1991//", "ASCII//")
+BUILTIN_ALIAS ("ISO646-US//", "ASCII//")
+BUILTIN_ALIAS ("US-ASCII//", "ASCII//")
+BUILTIN_ALIAS ("US//", "ASCII//")
+BUILTIN_ALIAS ("IBM367//", "ASCII//")
+BUILTIN_ALIAS ("CP367//", "ASCII//")
+BUILTIN_ALIAS ("CSASCII//", "ASCII//")
+BUILTIN_ALIAS ("OSF00010020//", "ASCII//")
+
+BUILTIN_TRANSFORMATION ("ASCII//", "INTERNAL", 1, "=ascii->INTERNAL",
__gconv_transform_ascii_internal, __gconv_btowc_ascii,
1, 1, 4, 4)
-BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
+BUILTIN_TRANSFORMATION ("INTERNAL", "ASCII//", 1, "=INTERNAL->ascii",
__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
+BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "INTERNAL", 1, "=posix->INTERNAL",
+ __gconv_transform_posix_internal, __gconv_btowc_posix,
+ 1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->posix",
+ __gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
#if BYTE_ORDER == BIG_ENDIAN
BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
@@ -309,6 +309,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
__BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
__BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
__BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
__BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
__BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -327,6 +329,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
only ASCII characters. */
extern wint_t __gconv_btowc_ascii (struct __gconv_step *step, unsigned char c);
+/* Specialized conversion function for a single byte to INTERNAL,
+ identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the
+ Low Surrogate Area at [U+DC80, U+DCFF]. */
+extern wint_t __gconv_btowc_posix (struct __gconv_step *step, unsigned char c)
+ attribute_hidden;
+
#endif
__END_DECLS
new file mode 100644
@@ -0,0 +1,94 @@
+/* "POSIX" locale transformation functions.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <gconv_int.h>
+
+
+/* Single-byte to INTERNAL conversion for the "POSIX" charset: ASCII
+   bytes [0, 0x7F] map to themselves, while high bytes [0x80, 0xFF] are
+   shifted up by 0xDC00 so that they land on [U+DC80, U+DCFF].  */
+wint_t
+__gconv_btowc_posix (struct __gconv_step *step, unsigned char c)
+{
+  /* STEP is unused; the mapping depends on the byte alone.  */
+  if (c >= 0x80)
+    return 0xdc00 + c;
+  return c;
+}
+
+
+/* Convert "POSIX" bytes to the internal (UCS4-like) format: [0, 0x7F]
+   (ISO 646-IRV) is kept as-is, [0x80, 0xFF] becomes [U+DC80, U+DCFF].  */
+#define DEFINE_INIT 0
+#define DEFINE_FINI 0
+#define MIN_NEEDED_FROM 1 /* BODY consumes one input byte per character. */
+#define MIN_NEEDED_TO 4 /* BODY stores one uint32_t per character. */
+#define FROM_DIRECTION 1
+#define FROM_LOOP posix_internal_loop
+#define TO_LOOP posix_internal_loop /* This is not used. */
+#define FUNCTION_NAME __gconv_transform_posix_internal
+#define ONE_DIRECTION 1
+
+#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
+#define LOOPFCT FROM_LOOP
+#define BODY \
+ { \
+ if (__glibc_unlikely (*inptr > '\x7f')) /* High byte: lift by 0xdc00. */ \
+ *((uint32_t *) outptr) = 0xdc00 + *inptr++; \
+ else /* ASCII byte: identity; this body has no error branch. */ \
+ *((uint32_t *) outptr) = *inptr++; \
+ outptr += sizeof (uint32_t); \
+ }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format back to "POSIX" bytes:
+   {ISO 646-IRV => [0, 0x7F]; [U+DC80, U+DCFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT 0
+#define DEFINE_FINI 0
+#define MIN_NEEDED_FROM 4 /* BODY reads one uint32_t per character. */
+#define MIN_NEEDED_TO 1 /* BODY writes one output byte per character. */
+#define FROM_DIRECTION 1
+#define FROM_LOOP internal_posix_loop
+#define TO_LOOP internal_posix_loop /* This is not used. */
+#define FUNCTION_NAME __gconv_transform_internal_posix
+#define ONE_DIRECTION 1
+
+#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
+#define LOOPFCT FROM_LOOP
+#define BODY \
+ { \
+ uint32_t val = *((const uint32_t *) inptr); \
+ if (__glibc_unlikely ((val > 0x7f && val < 0xdc80) || val > 0xdcff)) \
+ { /* Outside both accepted ranges: defer to the handler macros. */ \
+ UNICODE_TAG_HANDLER (val, 4); \
+ STANDARD_TO_LOOP_ERR_HANDLER (4); \
+ } \
+ else \
+ { /* The low 8 bits are the output byte in both ranges. */ \
+ *outptr++ = val & 0xff; \
+ inptr += sizeof (uint32_t); \
+ } \
+ }
+#define LOOP_NEED_FLAGS /* NOTE(review): presumably for the handlers above. */
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
execute_test
check_errtest_result
done
+
+# Emit every byte value from 0x00 to 0xff exactly once, in order.
+allbytes () {
+  for (( i = 0; i < 256; ++i )); do
+    printf "\\$(printf '%o' "$i")"
+  done
+}
+
+# UCS-4BE expected for allbytes: U+0000..U+007F, then U+DC80..U+DCFF.
+allucs4be () {
+  for (( i = 0; i < 128; ++i )); do
+    printf "\\0\\0\\0\\$(printf '%o' "$i")"
+  done
+  for (( i = 128; i < 256; ++i )); do
+    printf "\\0\\0\\xdc\\$(printf '%o' "$i")"
+  done
+}
+
+# Report the exit status of the command run immediately before this call
+# as PASS/FAIL for the conversion from $1 to $2; exit on FAIL.
+check_posix_result () {
+  case $? in
+    0) result=PASS ;;
+    *) result=FAIL ;;
+  esac
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" = "FAIL" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{ # Check both ANSI_X3.4-1968 conversion directions over all 256 bytes.
+ eval PROG=\"$ICONV\"
+ allbytes | $PROG -f ANSI_X3.4-1968 -t UCS-4BE | cmp -s - <(allucs4be)
+ check_posix_result ANSI_X3.4-1968 UCS-4BE # Must come next: it reads $?.
+ allucs4be | $PROG -f UCS-4BE -t ANSI_X3.4-1968 | cmp -s - <(allbytes)
+ check_posix_result UCS-4BE ANSI_X3.4-1968 # Likewise reads cmp's status.
+}
+
+check_posix_encoding
@@ -42,6 +42,7 @@ ISO-8859-10 ISO-8859-10 Y UCS-2BE UTF8
ISO-8859-14 ISO-8859-14 Y UTF8
ISO-8859-15 ISO-8859-15 Y UTF8
ANSI_X3.4-1968 ANSI_X3.4-1968 Y UTF8
+ASCII ASCII Y UTF8
BS_4730 BS_4730 Y UTF8
CSA_Z243.4-1985-1 CSA_Z243.4-1985-1 Y UCS-2BE
CSA_Z243.4-1985-2 CSA_Z243.4-1985-2 Y UCS4
deleted file mode 100644
@@ -1,6 +0,0 @@
- ! " # $ % & ' ( ) * + , - . /
- 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
- @ A B C D E F G H I J K L M N O
- P Q R S T U V W X Y Z [ \ ] ^ _
- ` a b c d e f g h i j k l m n o
- p q r s t u v w x y z { | } ~
new file mode 120000
@@ -0,0 +1 @@
+ASCII
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,6 @@
+ ! " # $ % & ' ( ) * + , - . /
+ 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ @ A B C D E F G H I J K L M N O
+ P Q R S T U V W X Y Z [ \ ] ^ _
+ ` a b c d e f g h i j k l m n o
+ p q r s t u v w x y z { | } ~
@@ -31,7 +31,8 @@ cat <<EOF |
# Keep this list in the same order as gconv-modules.
#
# charset name table name comment
- ASCII ANSI_X3.4-1968
+ ANSI_X3.4-1968
+ ASCII
ISO646-GB BS_4730
ISO646-CA CSA_Z243.4-1985-1
ISO646-CA2 CSA_Z243.4-1985-2
@@ -37,11 +37,11 @@ do_test (void)
puts ("info: C locale tests");
locale_insensitive_tests ();
TEST_COMPARE (__idna_name_classify ("abc\200def"),
- idna_name_encoding_error);
+ idna_name_nonascii);
TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
- idna_name_encoding_error);
+ idna_name_nonascii_backslash);
TEST_COMPARE (__idna_name_classify ("abc\377def"),
- idna_name_encoding_error);
+ idna_name_nonascii);
puts ("info: en_US.ISO-8859-1 locale tests");
if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
@@ -106,7 +106,7 @@ $(objpfx)tst-gettext3.out: $(codeset_mo)
$(objpfx)tst-gettext5.out: $(codeset_mo)
endif
-LOCALES := de_DE.ISO-8859-1 de_DE.UTF-8 en_US.ANSI_X3.4-1968 fr_FR.ISO-8859-1 \
+LOCALES := de_DE.ISO-8859-1 de_DE.UTF-8 en_US.ASCII fr_FR.ISO-8859-1 \
ja_JP.UTF-8
include ../gen-locales.mk
@@ -31,7 +31,7 @@ do_test (void)
setenv ("LANGUAGE", "existing-locale", 1);
unsetenv ("OUTPUT_CHARSET");
- setlocale (LC_ALL, "en_US.ANSI_X3.4-1968");
+ setlocale (LC_ALL, "en_US.ASCII");
textdomain ("translit");
bindtextdomain ("translit", OBJPFX "domaindir");
@@ -25,7 +25,7 @@
#include "../../version.h"
#endif
-#define DEFAULT_CHARMAP "ANSI_X3.4-1968" /* ASCII */
+#define DEFAULT_CHARMAP "ASCII"
/* This must be one higer than the last used LC_xxx category value. */
#define __LC_LAST 13
@@ -20,6 +20,7 @@
#include <langinfo.h>
#include <limits.h>
#include <locale.h>
+#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
STRTEST (YESSTR, "");
STRTEST (NOSTR, "");
+ for(int i = 0; i <= 0xff; ++i)
+ {
+ unsigned char bs[] = {i, 0};
+ mbstate_t ctx = {};
+ wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdc00 + i);
+ size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+ if (sz != !!i)
+ {
+ printf ("mbrtowc(%02hhx) width in locale %s wrong "
+ "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+ result = 1;
+ }
+ if (wc != exp)
+ {
+ printf ("mbrtowc(%02hhx) value in locale %s wrong "
+ "(is %x, should be %x)\n", *bs, locname, wc, exp);
+ result = 1;
+ }
+ }
+
+ for (int i = 0; i <= 0xffff; ++i)
+ {
+ bool expok = (i <= 0x7f) || (i >= 0xdc80 && i <= 0xdcff);
+ size_t expsz = expok ? 1 : (size_t) -1;
+ unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+ unsigned char ob = -1;
+ mbstate_t ctx = {};
+ size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+ if (sz != expsz)
+ {
+ printf ("wcrtomb(%x) width in locale %s wrong "
+ "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+ result = 1;
+ }
+ if (ob != expob)
+ {
+ printf ("wcrtomb(%x) value in locale %s wrong "
+ "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+ result = 1;
+ }
+ }
+
/* Test the new locale mechanisms. */
loc = newlocale (LC_ALL_MASK, locname, NULL);
if (loc == NULL)
@@ -243,7 +243,7 @@ LOCALES := \
dsb_DE.UTF-8 \
dz_BT.UTF-8 \
en_GB.UTF-8 \
- en_US.ANSI_X3.4-1968 \
+ en_US.ASCII \
en_US.ISO-8859-1\
en_US.UTF-8 \
eo.UTF-8 \
@@ -23,7 +23,7 @@ main (void)
return 1;
}
- cd = iconv_open ("ANSI_X3.4-1968//TRANSLIT", "ISO-8859-1");
+ cd = iconv_open ("ASCII//TRANSLIT", "ISO-8859-1");
if (cd == (iconv_t) -1)
{
puts ("iconv_open failed");
@@ -1,18 +1,8 @@
<code_set_name> ANSI_X3.4-1968
<comment_char> %
<escape_char> /
-% version: 1.0
-% source: ECMA registry
+% source: cf. localedata/locales/POSIX, LC_COLLATE
-% alias ISO-IR-6
-% alias ANSI_X3.4-1986
-% alias ISO_646.IRV:1991
-% alias ASCII
-% alias ISO646-US
-% alias US-ASCII
-% alias US
-% alias IBM367
-% alias CP367
CHARMAP
<U0000> /x00 NULL (NUL)
<U0001> /x01 START OF HEADING (SOH)
@@ -142,4 +132,5 @@
<U007D> /x7d RIGHT CURLY BRACKET
<U007E> /x7e TILDE
<U007F> /x7f DELETE (DEL)
+<UDC80>..<UDCFF> /x80
END CHARMAP
new file mode 100644
@@ -0,0 +1,144 @@
+<code_set_name> ASCII
+<comment_char> %
+<escape_char> /
+% version: 1.0
+% source: ECMA registry
+
+% alias ISO-IR-6
+% alias ANSI_X3.4-1986
+% alias ISO_646.IRV:1991
+% alias ISO646-US
+% alias US-ASCII
+% alias US
+% alias IBM367
+% alias CP367
+CHARMAP
+<U0000> /x00 NULL (NUL)
+<U0001> /x01 START OF HEADING (SOH)
+<U0002> /x02 START OF TEXT (STX)
+<U0003> /x03 END OF TEXT (ETX)
+<U0004> /x04 END OF TRANSMISSION (EOT)
+<U0005> /x05 ENQUIRY (ENQ)
+<U0006> /x06 ACKNOWLEDGE (ACK)
+<U0007> /x07 BELL (BEL)
+<U0008> /x08 BACKSPACE (BS)
+<U0009> /x09 CHARACTER TABULATION (HT)
+<U000A> /x0a LINE FEED (LF)
+<U000B> /x0b LINE TABULATION (VT)
+<U000C> /x0c FORM FEED (FF)
+<U000D> /x0d CARRIAGE RETURN (CR)
+<U000E> /x0e SHIFT OUT (SO)
+<U000F> /x0f SHIFT IN (SI)
+<U0010> /x10 DATALINK ESCAPE (DLE)
+<U0011> /x11 DEVICE CONTROL ONE (DC1)
+<U0012> /x12 DEVICE CONTROL TWO (DC2)
+<U0013> /x13 DEVICE CONTROL THREE (DC3)
+<U0014> /x14 DEVICE CONTROL FOUR (DC4)
+<U0015> /x15 NEGATIVE ACKNOWLEDGE (NAK)
+<U0016> /x16 SYNCHRONOUS IDLE (SYN)
+<U0017> /x17 END OF TRANSMISSION BLOCK (ETB)
+<U0018> /x18 CANCEL (CAN)
+<U0019> /x19 END OF MEDIUM (EM)
+<U001A> /x1a SUBSTITUTE (SUB)
+<U001B> /x1b ESCAPE (ESC)
+<U001C> /x1c FILE SEPARATOR (IS4)
+<U001D> /x1d GROUP SEPARATOR (IS3)
+<U001E> /x1e RECORD SEPARATOR (IS2)
+<U001F> /x1f UNIT SEPARATOR (IS1)
+<U0020> /x20 SPACE
+<U0021> /x21 EXCLAMATION MARK
+<U0022> /x22 QUOTATION MARK
+<U0023> /x23 NUMBER SIGN
+<U0024> /x24 DOLLAR SIGN
+<U0025> /x25 PERCENT SIGN
+<U0026> /x26 AMPERSAND
+<U0027> /x27 APOSTROPHE
+<U0028> /x28 LEFT PARENTHESIS
+<U0029> /x29 RIGHT PARENTHESIS
+<U002A> /x2a ASTERISK
+<U002B> /x2b PLUS SIGN
+<U002C> /x2c COMMA
+<U002D> /x2d HYPHEN-MINUS
+<U002E> /x2e FULL STOP
+<U002F> /x2f SOLIDUS
+<U0030> /x30 DIGIT ZERO
+<U0031> /x31 DIGIT ONE
+<U0032> /x32 DIGIT TWO
+<U0033> /x33 DIGIT THREE
+<U0034> /x34 DIGIT FOUR
+<U0035> /x35 DIGIT FIVE
+<U0036> /x36 DIGIT SIX
+<U0037> /x37 DIGIT SEVEN
+<U0038> /x38 DIGIT EIGHT
+<U0039> /x39 DIGIT NINE
+<U003A> /x3a COLON
+<U003B> /x3b SEMICOLON
+<U003C> /x3c LESS-THAN SIGN
+<U003D> /x3d EQUALS SIGN
+<U003E> /x3e GREATER-THAN SIGN
+<U003F> /x3f QUESTION MARK
+<U0040> /x40 COMMERCIAL AT
+<U0041> /x41 LATIN CAPITAL LETTER A
+<U0042> /x42 LATIN CAPITAL LETTER B
+<U0043> /x43 LATIN CAPITAL LETTER C
+<U0044> /x44 LATIN CAPITAL LETTER D
+<U0045> /x45 LATIN CAPITAL LETTER E
+<U0046> /x46 LATIN CAPITAL LETTER F
+<U0047> /x47 LATIN CAPITAL LETTER G
+<U0048> /x48 LATIN CAPITAL LETTER H
+<U0049> /x49 LATIN CAPITAL LETTER I
+<U004A> /x4a LATIN CAPITAL LETTER J
+<U004B> /x4b LATIN CAPITAL LETTER K
+<U004C> /x4c LATIN CAPITAL LETTER L
+<U004D> /x4d LATIN CAPITAL LETTER M
+<U004E> /x4e LATIN CAPITAL LETTER N
+<U004F> /x4f LATIN CAPITAL LETTER O
+<U0050> /x50 LATIN CAPITAL LETTER P
+<U0051> /x51 LATIN CAPITAL LETTER Q
+<U0052> /x52 LATIN CAPITAL LETTER R
+<U0053> /x53 LATIN CAPITAL LETTER S
+<U0054> /x54 LATIN CAPITAL LETTER T
+<U0055> /x55 LATIN CAPITAL LETTER U
+<U0056> /x56 LATIN CAPITAL LETTER V
+<U0057> /x57 LATIN CAPITAL LETTER W
+<U0058> /x58 LATIN CAPITAL LETTER X
+<U0059> /x59 LATIN CAPITAL LETTER Y
+<U005A> /x5a LATIN CAPITAL LETTER Z
+<U005B> /x5b LEFT SQUARE BRACKET
+<U005C> /x5c REVERSE SOLIDUS
+<U005D> /x5d RIGHT SQUARE BRACKET
+<U005E> /x5e CIRCUMFLEX ACCENT
+<U005F> /x5f LOW LINE
+<U0060> /x60 GRAVE ACCENT
+<U0061> /x61 LATIN SMALL LETTER A
+<U0062> /x62 LATIN SMALL LETTER B
+<U0063> /x63 LATIN SMALL LETTER C
+<U0064> /x64 LATIN SMALL LETTER D
+<U0065> /x65 LATIN SMALL LETTER E
+<U0066> /x66 LATIN SMALL LETTER F
+<U0067> /x67 LATIN SMALL LETTER G
+<U0068> /x68 LATIN SMALL LETTER H
+<U0069> /x69 LATIN SMALL LETTER I
+<U006A> /x6a LATIN SMALL LETTER J
+<U006B> /x6b LATIN SMALL LETTER K
+<U006C> /x6c LATIN SMALL LETTER L
+<U006D> /x6d LATIN SMALL LETTER M
+<U006E> /x6e LATIN SMALL LETTER N
+<U006F> /x6f LATIN SMALL LETTER O
+<U0070> /x70 LATIN SMALL LETTER P
+<U0071> /x71 LATIN SMALL LETTER Q
+<U0072> /x72 LATIN SMALL LETTER R
+<U0073> /x73 LATIN SMALL LETTER S
+<U0074> /x74 LATIN SMALL LETTER T
+<U0075> /x75 LATIN SMALL LETTER U
+<U0076> /x76 LATIN SMALL LETTER V
+<U0077> /x77 LATIN SMALL LETTER W
+<U0078> /x78 LATIN SMALL LETTER X
+<U0079> /x79 LATIN SMALL LETTER Y
+<U007A> /x7a LATIN SMALL LETTER Z
+<U007B> /x7b LEFT CURLY BRACKET
+<U007C> /x7c VERTICAL LINE
+<U007D> /x7d RIGHT CURLY BRACKET
+<U007E> /x7e TILDE
+<U007F> /x7f DELETE (DEL)
+END CHARMAP
@@ -97,6 +97,20 @@ END LC_CTYPE
LC_COLLATE
% This is the POSIX Locale definition for the LC_COLLATE category.
% The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% This is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the same part of the Low Surrogate Area as Python
+% to contain these, yielding [<UDC80>, <UDCFF>].
order_start forward
<U0000>
<U0001>
@@ -226,7 +240,134 @@ order_start forward
<U007D>
<U007E>
<U007F>
-UNDEFINED
+<UDC80>
+<UDC81>
+<UDC82>
+<UDC83>
+<UDC84>
+<UDC85>
+<UDC86>
+<UDC87>
+<UDC88>
+<UDC89>
+<UDC8A>
+<UDC8B>
+<UDC8C>
+<UDC8D>
+<UDC8E>
+<UDC8F>
+<UDC90>
+<UDC91>
+<UDC92>
+<UDC93>
+<UDC94>
+<UDC95>
+<UDC96>
+<UDC97>
+<UDC98>
+<UDC99>
+<UDC9A>
+<UDC9B>
+<UDC9C>
+<UDC9D>
+<UDC9E>
+<UDC9F>
+<UDCA0>
+<UDCA1>
+<UDCA2>
+<UDCA3>
+<UDCA4>
+<UDCA5>
+<UDCA6>
+<UDCA7>
+<UDCA8>
+<UDCA9>
+<UDCAA>
+<UDCAB>
+<UDCAC>
+<UDCAD>
+<UDCAE>
+<UDCAF>
+<UDCB0>
+<UDCB1>
+<UDCB2>
+<UDCB3>
+<UDCB4>
+<UDCB5>
+<UDCB6>
+<UDCB7>
+<UDCB8>
+<UDCB9>
+<UDCBA>
+<UDCBB>
+<UDCBC>
+<UDCBD>
+<UDCBE>
+<UDCBF>
+<UDCC0>
+<UDCC1>
+<UDCC2>
+<UDCC3>
+<UDCC4>
+<UDCC5>
+<UDCC6>
+<UDCC7>
+<UDCC8>
+<UDCC9>
+<UDCCA>
+<UDCCB>
+<UDCCC>
+<UDCCD>
+<UDCCE>
+<UDCCF>
+<UDCD0>
+<UDCD1>
+<UDCD2>
+<UDCD3>
+<UDCD4>
+<UDCD5>
+<UDCD6>
+<UDCD7>
+<UDCD8>
+<UDCD9>
+<UDCDA>
+<UDCDB>
+<UDCDC>
+<UDCDD>
+<UDCDE>
+<UDCDF>
+<UDCE0>
+<UDCE1>
+<UDCE2>
+<UDCE3>
+<UDCE4>
+<UDCE5>
+<UDCE6>
+<UDCE7>
+<UDCE8>
+<UDCE9>
+<UDCEA>
+<UDCEB>
+<UDCEC>
+<UDCED>
+<UDCEE>
+<UDCEF>
+<UDCF0>
+<UDCF1>
+<UDCF2>
+<UDCF3>
+<UDCF4>
+<UDCF5>
+<UDCF6>
+<UDCF7>
+<UDCF8>
+<UDCF9>
+<UDCFA>
+<UDCFB>
+<UDCFC>
+<UDCFD>
+<UDCFE>
+<UDCFF>
order_end
%
END LC_COLLATE
@@ -9,8 +9,8 @@
/* German locale with ISO-8859-1. */
#define TST_LOC_de "de_DE.ISO-8859-1"
-/* For US we use ANSI_X3.4-1968 (ASCII). */
-#define TST_LOC_enUS "en_US.ANSI_X3.4-1968"
+/* For US we use ASCII. */
+#define TST_LOC_enUS "en_US.ASCII"
/* Japanese locale with EUC-JP. */
#define TST_LOC_eucJP "ja_JP.EUC-JP"
@@ -27,7 +27,7 @@ status=0
# Run the test programs.
rm -f ${common_objpfx}localedata/tst-ctype.out
-for loc in C de_DE.ISO-8859-1 de_DE.UTF-8 en_US.ANSI_X3.4-1968 ja_JP.EUC-JP; do
+for loc in C de_DE.ISO-8859-1 de_DE.UTF-8 en_US.ASCII ja_JP.EUC-JP; do
if test -f tst-ctype-$loc.in; then
input=tst-ctype-$loc.in
else
@@ -89,40 +89,40 @@ C RADIXCHAR .
C THOUSEP ""
C YESEXPR ^[yY]
C NOEXPR ^[nN]
-en_US.ANSI_X3.4-1968 ABMON_1 Jan
-en_US.ANSI_X3.4-1968 ABMON_2 Feb
-en_US.ANSI_X3.4-1968 ABMON_3 Mar
-en_US.ANSI_X3.4-1968 ABMON_4 Apr
-en_US.ANSI_X3.4-1968 ABMON_5 May
-en_US.ANSI_X3.4-1968 ABMON_6 Jun
-en_US.ANSI_X3.4-1968 ABMON_7 Jul
-en_US.ANSI_X3.4-1968 ABMON_8 Aug
-en_US.ANSI_X3.4-1968 ABMON_9 Sep
-en_US.ANSI_X3.4-1968 ABMON_10 Oct
-en_US.ANSI_X3.4-1968 ABMON_11 Nov
-en_US.ANSI_X3.4-1968 ABMON_12 Dec
-en_US.ANSI_X3.4-1968 MON_1 January
-en_US.ANSI_X3.4-1968 MON_2 February
-en_US.ANSI_X3.4-1968 MON_3 March
-en_US.ANSI_X3.4-1968 MON_4 April
-en_US.ANSI_X3.4-1968 MON_5 May
-en_US.ANSI_X3.4-1968 MON_6 June
-en_US.ANSI_X3.4-1968 MON_7 July
-en_US.ANSI_X3.4-1968 MON_8 August
-en_US.ANSI_X3.4-1968 MON_9 September
-en_US.ANSI_X3.4-1968 MON_10 October
-en_US.ANSI_X3.4-1968 MON_11 November
-en_US.ANSI_X3.4-1968 MON_12 December
-en_US.ANSI_X3.4-1968 AM_STR AM
-en_US.ANSI_X3.4-1968 PM_STR PM
-en_US.ANSI_X3.4-1968 D_T_FMT "%a %d %b %Y %r %Z"
-en_US.ANSI_X3.4-1968 D_FMT "%m/%d/%Y"
-en_US.ANSI_X3.4-1968 T_FMT "%r"
-en_US.ANSI_X3.4-1968 T_FMT_AMPM "%I:%M:%S %p"
-en_US.ANSI_X3.4-1968 RADIXCHAR .
-en_US.ANSI_X3.4-1968 THOUSEP ,
-en_US.ANSI_X3.4-1968 YESEXPR ^[+1yY]
-en_US.ANSI_X3.4-1968 NOEXPR ^[-0nN]
+en_US.ASCII ABMON_1 Jan
+en_US.ASCII ABMON_2 Feb
+en_US.ASCII ABMON_3 Mar
+en_US.ASCII ABMON_4 Apr
+en_US.ASCII ABMON_5 May
+en_US.ASCII ABMON_6 Jun
+en_US.ASCII ABMON_7 Jul
+en_US.ASCII ABMON_8 Aug
+en_US.ASCII ABMON_9 Sep
+en_US.ASCII ABMON_10 Oct
+en_US.ASCII ABMON_11 Nov
+en_US.ASCII ABMON_12 Dec
+en_US.ASCII MON_1 January
+en_US.ASCII MON_2 February
+en_US.ASCII MON_3 March
+en_US.ASCII MON_4 April
+en_US.ASCII MON_5 May
+en_US.ASCII MON_6 June
+en_US.ASCII MON_7 July
+en_US.ASCII MON_8 August
+en_US.ASCII MON_9 September
+en_US.ASCII MON_10 October
+en_US.ASCII MON_11 November
+en_US.ASCII MON_12 December
+en_US.ASCII AM_STR AM
+en_US.ASCII PM_STR PM
+en_US.ASCII D_T_FMT "%a %d %b %Y %r %Z"
+en_US.ASCII D_FMT "%m/%d/%Y"
+en_US.ASCII T_FMT "%r"
+en_US.ASCII T_FMT_AMPM "%I:%M:%S %p"
+en_US.ASCII RADIXCHAR .
+en_US.ASCII THOUSEP ,
+en_US.ASCII YESEXPR ^[+1yY]
+en_US.ASCII NOEXPR ^[-0nN]
en_US.ISO-8859-1 ABMON_1 Jan
en_US.ISO-8859-1 ABMON_2 Feb
en_US.ISO-8859-1 ABMON_3 Mar
@@ -63,7 +63,7 @@ main (void)
res = do_test ("C");
res |= do_test ("de_DE.ISO-8859-1");
res |= do_test ("de_DE.UTF-8");
- res |= do_test ("en_US.ANSI_X3.4-1968");
+ res |= do_test ("en_US.ASCII");
res |= do_test ("ja_JP.EUC-JP");
res |= do_test ("hr_HR.ISO-8859-2");
//res |= do_test ("ru_RU.KOI8-R");
@@ -375,6 +375,7 @@ $(objpfx)test-vfprintf.out: $(gen-locales)
$(objpfx)tst-grouping.out: $(gen-locales)
$(objpfx)tst-grouping2.out: $(gen-locales)
$(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
$(objpfx)tst-sprintf.out: $(gen-locales)
$(objpfx)tst-sscanf.out: $(gen-locales)
$(objpfx)tst-swprintf.out: $(gen-locales)
@@ -30,6 +30,8 @@
static int
do_test (void)
{
+ setlocale(LC_CTYPE, "C.UTF-8");
+
mtrace ();
/* For 's' conversion specifier with 'l' modifier the array must be
@@ -207,7 +207,7 @@ ifeq ($(run-built-tests),yes)
LOCALES := \
de_DE.ISO-8859-1 \
de_DE.UTF-8 \
- en_US.ANSI_X3.4-1968 \
+ en_US.ASCII \
fa_IR.UTF-8 \
hr_HR.ISO-8859-2 \
ja_JP.EUC-JP \
@@ -78,10 +78,10 @@ do_test (void)
{
int result = 0;
- current_locale = setlocale (LC_ALL, "en_US.ANSI_X3.4-1968");
+ current_locale = setlocale (LC_ALL, "en_US.ASCII");
if (current_locale == NULL)
{
- puts ("cannot set locale \"en_US.ANSI_X3.4-1968\"");
+ puts ("cannot set locale \"en_US.ASCII\"");
result = 1;
}
else
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
.__shlib_handle = NULL,
.__modname = NULL,
.__counter = INT_MAX,
- .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+ .__from_name = (char *) "ANSI_X3.4-1968",
.__to_name = (char *) "INTERNAL",
- .__fct = __gconv_transform_ascii_internal,
- .__btowc_fct = __gconv_btowc_ascii,
+ .__fct = __gconv_transform_posix_internal,
+ .__btowc_fct = __gconv_btowc_posix,
.__init_fct = NULL,
.__end_fct = NULL,
.__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
.__modname = NULL,
.__counter = INT_MAX,
.__from_name = (char *) "INTERNAL",
- .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
- .__fct = __gconv_transform_internal_ascii,
+ .__to_name = (char *) "ANSI_X3.4-1968",
+ .__fct = __gconv_transform_internal_posix,
.__btowc_fct = NULL,
.__init_fct = NULL,
.__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
};
-/* For the default locale we only have to handle ANSI_X3.4-1968. */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+ with ASCII in the first 128 characters;
+ we lift the remaining bytes by <UDC00>. */
const struct gconv_fcts __wcsmbs_gconv_fcts_c =
{
.towc = (struct __gconv_step *) &to_wc,