@@ -49,7 +49,7 @@ static void new_width (struct linereader *cmfile, struct charmap_t *result,
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
size_t nbytes, unsigned char *bytes,
const char *from, const char *to,
- int decimal_ellipsis, int step);
+ int decimal_ellipsis, int step, bool is_utf8);
bool enc_not_ascii_compatible;
@@ -285,6 +285,27 @@ parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
enum token_t ellipsis = 0;
int step = 1;
+ /* POSIX explicitly requires that ellipsis processing do the
+ following: "Bytes shall be treated as unsigned octets, and carry
+ shall be propagated between the bytes as necessary to represent the
+ range." It then goes on to say that such a declaration should
+ never be specified because it creates null bytes. Therefore we
+ error on this condition (see charmap_new_char). However this still
+ leaves a problem for encodings which use fewer than the full 8 bits,
+ like UTF-8, and in such encodings you can use an ellipsis to
+ silently and accidentally create invalid ranges. In UTF-8 you have
+ only N-bits of the first byte and if your ellipsis covers a code
+ point range larger than this code point block the output is going
+ to be an invalid non-UTF-8 multi-byte sequence. Thus for
+ UTF-8 we add a special ellipsis handling loop that can increment
+ UTF-8 multi-byte output effectively and for UTF-8 we allow larger
+ ellipsis ranges without error. There may still be other encodings
+ for which the ellipsis will still generate invalid multi-byte
+ output, but not for UTF-8. The only alternative would be to call
+ gconv for each Unicode code point in the loop to convert it to the
+ appropriate multi-byte output, but that would be slow. */
+ bool is_utf8 = false;
+
/* We don't want symbolic names in string to be translated. */
cmfile->translate_strings = 0;
@@ -385,9 +406,14 @@ parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
}
if (nowtok == tok_code_set_name)
- result->code_set_name = obstack_copy0 (&result->mem_pool,
- arg->val.str.startmb,
- arg->val.str.lenmb);
+ {
+ result->code_set_name = obstack_copy0 (&result->mem_pool,
+ arg->val.str.startmb,
+ arg->val.str.lenmb);
+
+ if (strcmp (result->code_set_name, "UTF-8") == 0)
+ is_utf8 = true;
+ }
else
result->repertoiremap = obstack_copy0 (&result->mem_pool,
arg->val.str.startmb,
@@ -570,7 +596,7 @@ character sets with locking states are not supported"));
else
charmap_new_char (cmfile, result, now->val.charcode.nbytes,
now->val.charcode.bytes, from_name, to_name,
- ellipsis != tok_ellipsis2, step);
+ ellipsis != tok_ellipsis2, step, is_utf8);
/* Ignore trailing comment silently. */
lr_ignore_rest (cmfile, 0);
@@ -929,12 +955,81 @@ charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
< 0 ? NULL : (struct charseq *) result);
}
+/* Encode the Unicode code point CP as UTF-8 into the unsigned
+   character array BYTES, which has room for NBYTES bytes.
+
+   Return the number of bytes written (1 to 4) on success.
+
+   Return -2 if CP is above 0x10ffff and therefore outside every
+   UTF-8 encodable range; this is reported regardless of NBYTES.
+
+   Return -1 if CP is encodable but needs more than NBYTES bytes.
+
+   BYTES is left untouched whenever an error is returned.  */
+static int
+output_utf8_bytes (unsigned int cp, size_t nbytes, unsigned char *bytes)
+{
+  int needed;
+
+  /* Classify CP first so that an invalid code point is always
+     reported as -2, even when BYTES is also too small.  CP is
+     unsigned, so no lower-bound check is needed.  */
+  if (cp <= 0x7f)
+    needed = 1;
+  else if (cp <= 0x7ff)
+    needed = 2;
+  else if (cp <= 0xffff)
+    /* Explicitly allow the surrogate range from 0xd800 to 0xdfff
+       since we want consistent sorting of the invalid values that
+       might appear in UTF-8 data.  */
+    needed = 3;
+  else if (cp <= 0x10ffff)
+    needed = 4;
+  else
+    /* Invalid code point.  */
+    return -2;
+
+  /* The caller's buffer cannot hold the encoding.  */
+  if (nbytes < (size_t) needed)
+    return -1;
+
+  switch (needed)
+    {
+    case 1:
+      /* 0xxxxxxx  */
+      bytes[0] = cp;
+      break;
+
+    case 2:
+      /* 110xxxxx 10xxxxxx  */
+      bytes[0] = 0xc0 | (cp >> 6);
+      bytes[1] = 0x80 | (cp & 0x3f);
+      break;
+
+    case 3:
+      /* 1110xxxx 10xxxxxx 10xxxxxx  */
+      bytes[0] = 0xe0 | (cp >> 12);
+      bytes[1] = 0x80 | ((cp >> 6) & 0x3f);
+      bytes[2] = 0x80 | (cp & 0x3f);
+      break;
+
+    default:
+      /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
+      bytes[0] = 0xf0 | (cp >> 18);
+      bytes[1] = 0x80 | ((cp >> 12) & 0x3f);
+      bytes[2] = 0x80 | ((cp >> 6) & 0x3f);
+      bytes[3] = 0x80 | (cp & 0x3f);
+      break;
+    }
+
+  return needed;
+}
static void
charmap_new_char (struct linereader *lr, struct charmap_t *cm,
size_t nbytes, unsigned char *bytes,
const char *from, const char *to,
- int decimal_ellipsis, int step)
+ int decimal_ellipsis, int step, bool is_utf8)
{
hash_table *ht = &cm->char_table;
hash_table *bt = &cm->byte_table;
@@ -1039,11 +1134,56 @@ hexadecimal range format should use only capital characters"));
for (cnt = from_nr; cnt <= to_nr; cnt += step)
{
char *name_end;
+ unsigned char ubytes[4] = { '\0', '\0', '\0', '\0' };
obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
prefix_len, from, len1 - prefix_len, cnt);
obstack_1grow (ob, '\0');
name_end = obstack_finish (ob);
+ /* Either we have a UTF-8 charmap, and we compute the bytes (see
+ comment above), or we have a non-UTF-8 charmap and we follow
+ POSIX rules as further below for incrementing the bytes in an
+ ellipsis. */
+ if (is_utf8)
+ {
+ int nubytes;
+
+ /* Directly convert the code point to the UTF-8 encoded bytes. */
+ nubytes = output_utf8_bytes (cnt, 4, ubytes);
+
+ /* This should not happen, but we check for it just in case. */
+ if (nubytes == -1)
+ lr_error (lr,
+ _("not enough space to output UTF-8 encoding."));
+
+ /* The other defect here could be that we have a mismatch
+ between the code point and the encoded value or number of
+ output bytes. For example you specify U0000 but assign it
+ an encoded value that is 3-bytes long (an error), or U0000
+ is assigned a value of /x01. */
+ if (cnt == from_nr)
+ {
+ if (nubytes != nbytes)
+ lr_error (lr,
+ _("encoding length does not match "
+ "Unicode code point."));
+ else
+ if (memcmp (bytes, ubytes, nbytes) != 0)
+ lr_error (lr,
+ _("encoded value does not match "
+ "Unicode code point."));
+ }
+
+ /* The range does not cover one of the 4 UTF-8 code point ranges. */
+ if (nubytes == -2)
+ lr_error (lr,
+ _("invalid code point in the range."));
+
+ /* Use the generated UTF-8 bytes. */
+ bytes = ubytes;
+ nbytes = nubytes;
+ }
+
newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
newp->nbytes = nbytes;
memcpy (newp->bytes, bytes, nbytes);
@@ -1081,19 +1221,17 @@ hexadecimal range format should use only capital characters"));
/* Please note we don't examine the return value since it is no error
if we have two definitions for a symbol. */
- /* Increment the value in the byte sequence. */
- if (++bytes[nbytes - 1] == '\0')
- {
- int b = nbytes - 2;
+ /* Increment the byte stream following POSIX rules. */
+ if (!is_utf8)
+ bytes[nbytes - 1]++;
- do
- if (b < 0)
- {
- lr_error (lr,
- _("resulting bytes for range not representable."));
- return;
- }
- while (++bytes[b--] == 0);
+ /* If we overflowed then that generates a null byte which is an invalid
+ specification according to POSIX and we issue a parser error. */
+ if (bytes[nbytes - 1] == '\0')
+ {
+ lr_error (lr,
+ _("resulting bytes for range would contain null byte."));
+ return;
}
}
}