@@ -244,9 +244,9 @@ struct locale_collate_t
Therefore we keep all relevant input in a list. */
struct locale_collate_t *next;
- /* Arrays with heads of the list for each of the leading bytes in
+ /* Array with list heads indexed by leading byte (UTF-8: 2-byte index) of
the multibyte sequences. */
- struct element_t *mbheads[256];
+ struct element_t *mbheads[256 * 256];
/* Arrays with heads of the list for each of the leading bytes in
the multibyte sequences. */
@@ -1558,6 +1558,7 @@ collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
struct section_list *sect;
int ruleidx;
int nr_wide_elems = 0;
+ bool is_utf8 = strcmp (charmap->code_set_name, "UTF-8") == 0;
if (collate == NULL)
{
@@ -1664,7 +1665,49 @@ collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
struct element_t *lastp = NULL;
/* Find the point where to insert in the list. */
- eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
+ uint16_t index = ((unsigned char *) runp->mbs)[0];
+
+ /* Special handling of UTF-8: Generate a 2-byte index to mbheads.
+ Also check the lead byte and length. Keep locale/weight.h in sync. */
+ if (is_utf8 && index >= 0x80)
+ {
+ if ((index & 0xC0) == 0x80)
+ {
+ utf8_error:
+ WITH_CUR_LOCALE (error_at_line (0, 0, runp->file, runp->line,
+ _("\
+malformed UTF-8 character in `%s'"), runp->name););
+ goto dont_insert;
+ }
+ else if (index < 0xE0)
+ {
+ if (runp->nmbs < 2)
+ goto utf8_error;
+ uint16_t byte2 = ((unsigned char *) runp->mbs)[1];
+ index = (index << 6) + byte2 - 0x3080;
+ }
+ else if (index < 0xF0)
+ {
+ if (runp->nmbs < 3)
+ goto utf8_error;
+ uint16_t byte2 = ((unsigned char *) runp->mbs)[1];
+ uint16_t byte3 = ((unsigned char *) runp->mbs)[2];
+ index = (index << 12) + (byte2 << 6) + byte3 - 0xE2080;
+ }
+ else if (index < 0xF8)
+ {
+ if (runp->nmbs < 4)
+ goto utf8_error;
+ uint16_t byte2 = ((unsigned char *) runp->mbs)[1];
+ uint16_t byte3 = ((unsigned char *) runp->mbs)[2];
+ uint16_t byte4 = ((unsigned char *) runp->mbs)[3];
+ index = (byte2 << 12) + (byte3 << 6) + byte4 - 0x82080;
+ }
+ else
+ goto utf8_error;
+ }
+
+ eptr = &collate->mbheads[index];
while (*eptr != NULL)
{
if ((*eptr)->nmbs < runp->nmbs)
@@ -1735,7 +1778,7 @@ symbol `%s' has the same encoding as"), (*eptr)->name);
/* Find out whether any of the `mbheads' entries is unset. In this
case we use the UNDEFINED entry. */
- for (i = 1; i < 256; ++i)
+ for (i = 1; i < 256 * 256; ++i)
if (collate->mbheads[i] == NULL)
{
need_undefined = 1;
@@ -2108,7 +2151,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
struct locale_file file;
size_t ch;
- int32_t tablemb[256];
+ int32_t tablemb[256 * 256];
struct obstack weightpool;
struct obstack extrapool;
struct obstack indirectpool;
@@ -2186,7 +2229,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
if (collate->undefined.used_in_level != 0)
output_weight (&weightpool, collate, &collate->undefined);
- for (ch = 1; ch < 256; ++ch)
+ for (ch = 1; ch < 256 * 256; ++ch)
if (collate->mbheads[ch]->mbnext == NULL
&& collate->mbheads[ch]->nmbs <= 1)
{
@@ -2211,7 +2254,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
and add only one index into the weight table. We can find the
consecutive entries since they are also consecutive in the list. */
struct element_t *runp = collate->mbheads[ch];
- struct element_t *lastp;
assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
@@ -2239,7 +2281,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
/* Compute how much space we will need. */
added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
- + 2 * (runp->nmbs - 1));
+ + 2 * runp->nmbs);
assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
obstack_make_room (&extrapool, added);
@@ -2262,9 +2304,9 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
/* Now walk backward from here to the beginning. */
curp = runp;
- assert (runp->nmbs <= 256);
- obstack_1grow_fast (&extrapool, curp->nmbs - 1);
- for (i = 1; i < curp->nmbs; ++i)
+ assert (runp->nmbs <= 255);
+ obstack_1grow_fast (&extrapool, curp->nmbs);
+ for (i = 0; i < curp->nmbs; ++i)
obstack_1grow_fast (&extrapool, curp->mbs[i]);
/* Now find the end of the consecutive sequence and
@@ -2284,7 +2326,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
/* And add the end byte sequence. Without length this
time. */
- for (i = 1; i < curp->nmbs; ++i)
+ for (i = 0; i < curp->nmbs; ++i)
obstack_1grow_fast (&extrapool, curp->mbs[i]);
}
else
@@ -2298,15 +2340,15 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
weightidx = output_weight (&weightpool, collate, runp);
added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
- + runp->nmbs - 1);
+ + runp->nmbs);
assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
obstack_make_room (&extrapool, added);
obstack_int32_grow_fast (&extrapool, weightidx);
- assert (runp->nmbs <= 256);
- obstack_1grow_fast (&extrapool, runp->nmbs - 1);
+ assert (runp->nmbs <= 255);
+ obstack_1grow_fast (&extrapool, runp->nmbs);
- for (i = 1; i < runp->nmbs; ++i)
+ for (i = 0; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]);
}
@@ -2315,30 +2357,25 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
obstack_1grow_fast (&extrapool, '\0');
/* Next entry. */
- lastp = runp;
runp = runp->mbnext;
}
while (runp != NULL);
assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
- /* If the final entry in the list is not a single character we
- add an UNDEFINED entry here. */
- if (lastp->nmbs != 1)
- {
- int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
- obstack_make_room (&extrapool, added);
+ /* Add an UNDEFINED entry at the end of the list. */
+ int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
+ obstack_make_room (&extrapool, added);
- obstack_int32_grow_fast (&extrapool, 0);
- /* XXX What rule? We just pick the first. */
- obstack_1grow_fast (&extrapool, 0);
- /* Length is zero. */
- obstack_1grow_fast (&extrapool, 0);
+ obstack_int32_grow_fast (&extrapool, 0);
+ /* XXX What rule? We just pick the first. */
+ obstack_1grow_fast (&extrapool, 0);
+ /* Length is zero. */
+ obstack_1grow_fast (&extrapool, 0);
- /* Add alignment bytes if necessary. */
- while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
- obstack_1grow_fast (&extrapool, '\0');
- }
+ /* Add alignment bytes if necessary. */
+ while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
+ obstack_1grow_fast (&extrapool, '\0');
}
/* Add padding to the tables if necessary. */
@@ -2346,7 +2383,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
obstack_1grow (&weightpool, 0);
/* Now add the four tables. */
- add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256);
+ add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256 * 256);
add_locale_raw_obstack (&file, &weightpool);
add_locale_raw_obstack (&file, &extrapool);
add_locale_raw_obstack (&file, &indirectpool);
@@ -21,24 +21,65 @@
/* Find index of weight. */
static inline int32_t __attribute__ ((always_inline))
-findidx (const int32_t *table,
+findidx (uint_fast32_t locale_encoding,
+ const int32_t *table,
const int32_t *indirect,
const unsigned char *extra,
const unsigned char **cpp, size_t len)
{
- int_fast32_t i = table[*(*cpp)++];
const unsigned char *cp;
const unsigned char *usrc;
+ uint16_t index = (*cpp)[0];
+ /* Special handling of UTF-8: Generate a 2-byte index for table.
+ This has to be equal to the folding in locale/programs/ld-collate.c:
+ collate_finish(). */
+ if (locale_encoding == __cet_utf8 && index >= 0x80)
+ {
+ if (index < 0xE0)
+ {
+ if (len < 2)
+ goto utf8_error;
+ uint16_t byte2 = (*cpp)[1];
+ index = (index << 6) + byte2 - 0x3080;
+ }
+ else if (index < 0xF0)
+ {
+ if (len < 3)
+ goto utf8_error;
+ uint16_t byte2 = (*cpp)[1];
+ uint16_t byte3 = (*cpp)[2];
+ index = (index << 12) + (byte2 << 6) + byte3 - 0xE2080;
+ }
+ else if (index < 0xF8)
+ {
+ if (len < 4)
+ goto utf8_error;
+ uint16_t byte2 = (*cpp)[1];
+ uint16_t byte3 = (*cpp)[2];
+ uint16_t byte4 = (*cpp)[3];
+ index = (byte2 << 12) + (byte3 << 6) + byte4 - 0x82080;
+ }
+ else
+ {
+ utf8_error:
+ *cpp += 1;
+ return 0;
+ }
+ }
+
+ int_fast32_t i = table[index];
if (i >= 0)
- /* This is an index into the weight table. Cool. */
- return i;
+ {
+ /* This is an index into the weight table. Cool. */
+ *cpp += 1;
+ return i;
+ }
/* Oh well, more than one sequence starting with this byte.
Search for the correct one. */
cp = &extra[-i];
usrc = *cpp;
- --len;
while (1)
{
size_t nhere;
@@ -57,8 +98,7 @@ findidx (const int32_t *table,
/* It is a single character. If it matches we found our
index. Note that at the end of each list there is an
entry of length zero which represents the single byte
- sequence. The first (and here only) byte was tested
- already. */
+ sequence. */
size_t cnt;
for (cnt = 0; cnt < nhere && cnt < len; ++cnt)
@@ -68,7 +108,7 @@ findidx (const int32_t *table,
if (cnt == nhere)
{
/* Found it. */
- *cpp += nhere;
+ *cpp += nhere > 0 ? nhere : 1;
return i;
}
@@ -127,7 +167,7 @@ findidx (const int32_t *table,
while (++cnt < nhere);
}
- *cpp += nhere;
+ *cpp += nhere > 0 ? nhere : 1;
return indirect[-i + offset];
}
}
@@ -21,7 +21,8 @@
/* Find index of weight. */
static inline int32_t __attribute__ ((always_inline))
-findidx (const int32_t *table,
+findidx (uint_fast32_t encoding,
+ const int32_t *table,
const int32_t *indirect,
const wint_t *extra,
const wint_t **cpp, size_t len)
@@ -389,6 +389,8 @@ FCT (pattern, string, string_end, no_leading_period, flags, ends, alloca_used)
const int32_t *indirect;
int32_t idx;
const UCHAR *cp = (const UCHAR *) &str;
+ uint_fast32_t encoding = (uint32_t)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);
# if WIDE_CHAR_VERSION
table = (const int32_t *)
@@ -410,7 +412,7 @@ FCT (pattern, string, string_end, no_leading_period, flags, ends, alloca_used)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
# endif
- idx = FINDIDX (table, indirect, extra, &cp, 1);
+ idx = FINDIDX (encoding, table, indirect, extra, &cp, 1);
if (idx != 0)
{
/* We found a table entry. Now see whether the
@@ -420,7 +422,7 @@ FCT (pattern, string, string_end, no_leading_period, flags, ends, alloca_used)
int32_t idx2;
const UCHAR *np = (const UCHAR *) n;
- idx2 = FINDIDX (table, indirect, extra,
+ idx2 = FINDIDX (encoding, table, indirect, extra,
&np, string_end - n);
if (idx2 != 0
&& (idx >> 24) == (idx2 >> 24)
@@ -3426,6 +3426,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules != 0)
{
+ uint_fast32_t encoding;
const int32_t *table, *indirect;
const unsigned char *weights, *extra, *cp;
unsigned char char_buf[2];
@@ -3434,6 +3435,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
size_t len;
/* Calculate the index for equivalence class. */
cp = name;
+ encoding = (uint32_t) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_WEIGHTMB);
@@ -3441,7 +3443,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
_NL_COLLATE_EXTRAMB);
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_INDIRECTMB);
- idx1 = findidx (table, indirect, extra, &cp, -1);
+ idx1 = findidx (encoding, table, indirect, extra, &cp, -1);
if (BE (idx1 == 0 || *cp != '\0', 0))
/* This isn't a valid character. */
return REG_ECOLLATE;
@@ -3452,7 +3454,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
{
char_buf[0] = ch;
cp = char_buf;
- idx2 = findidx (table, indirect, extra, &cp, 1);
+ idx2 = findidx (encoding, table, indirect, extra, &cp, 1);
/*
idx2 = table[ch];
*/
@@ -743,17 +743,19 @@ re_string_elem_size_at (const re_string_t *pstr, int idx)
# ifdef _LIBC
const unsigned char *p, *extra;
const int32_t *table, *indirect;
+ uint_fast32_t encoding;
uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules != 0)
{
+ encoding = (uint32_t) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
extra = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_INDIRECTMB);
p = pstr->mbs + idx;
- findidx (table, indirect, extra, &p, pstr->len - idx);
+ findidx (encoding, table, indirect, extra, &p, pstr->len - idx);
return p - pstr->mbs - idx;
}
else
@@ -3869,6 +3869,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
if (nrules != 0)
{
unsigned int in_collseq = 0;
+ uint_fast32_t encoding;
const int32_t *table, *indirect;
const unsigned char *weights, *extra;
const char *collseqwc;
@@ -3919,6 +3920,8 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
if (cset->nequiv_classes)
{
const unsigned char *cp = pin;
+ encoding = (uint32_t)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);
table = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
weights = (const unsigned char *)
@@ -3927,7 +3930,8 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
indirect = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
- int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
+ int32_t idx = findidx (encoding, table, indirect, extra, &cp,
+ elem_len);
if (idx > 0)
for (i = 0; i < cset->nequiv_classes; ++i)
{
@@ -78,9 +78,9 @@ typedef struct
/* Get next sequence. Traverse the string as required. */
static __always_inline void
get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
- const USTRING_TYPE *weights, const int32_t *table,
- const USTRING_TYPE *extra, const int32_t *indirect,
- int pass)
+ const USTRING_TYPE *weights, uint_fast32_t encoding,
+ const int32_t *table, const USTRING_TYPE *extra,
+ const int32_t *indirect, int pass)
{
size_t val = seq->val = 0;
int len = seq->len;
@@ -124,7 +124,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
us = seq->back_us;
while (i < backw)
{
- int32_t tmp = findidx (table, indirect, extra, &us, -1);
+ int32_t tmp = findidx (encoding, table, indirect, extra, &us, -1);
idx = tmp & 0xffffff;
i++;
}
@@ -139,7 +139,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
while (*us != L('\0'))
{
- int32_t tmp = findidx (table, indirect, extra, &us, -1);
+ int32_t tmp = findidx (encoding, table, indirect, extra, &us, -1);
unsigned char rule = tmp >> 24;
prev_idx = idx;
idx = tmp & 0xffffff;
@@ -345,9 +345,9 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
while (1)
{
- get_next_seq (&seq1, nrules, rulesets, weights, table,
+ get_next_seq (&seq1, nrules, rulesets, weights, encoding, table,
extra, indirect, pass);
- get_next_seq (&seq2, nrules, rulesets, weights, table,
+ get_next_seq (&seq2, nrules, rulesets, weights, encoding, table,
extra, indirect, pass);
/* See whether any or both strings are empty. */
if (seq1.len == 0 || seq2.len == 0)
@@ -53,6 +53,7 @@ typedef struct
uint_fast32_t nrules;
unsigned char *rulesets;
USTRING_TYPE *weights;
+ uint_fast32_t encoding;
int32_t *table;
USTRING_TYPE *extra;
int32_t *indirect;
@@ -100,8 +101,8 @@ static __always_inline size_t
find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
{
- int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
- -1);
+ int32_t tmp = findidx (l_data->encoding, l_data->table, l_data->indirect,
+ l_data->extra, us, -1);
*rule_idx = tmp >> 24;
int32_t idx = tmp & 0xffffff;
size_t len = l_data->weights[idx++];
@@ -693,6 +694,8 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
/* Get the locale data. */
l_data.rulesets = (unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+ l_data.encoding =
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
l_data.table = (int32_t *)
current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
l_data.weights = (USTRING_TYPE *)
@@ -721,8 +724,8 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
do
{
- int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
- -1);
+ int32_t tmp = findidx (l_data.encoding, l_data.table, l_data.indirect,
+ l_data.extra, &cur, -1);
rulearr[idxmax] = tmp >> 24;
idxarr[idxmax] = tmp & 0xffffff;