@@ -51,6 +51,7 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
_NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQMB);
# endif
#endif
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
while ((c = *p++) != L_('\0'))
{
@@ -324,8 +325,6 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
diagnose a "used initialized" in a dead branch in the
findidx function. */
UCHAR str;
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
const CHAR *startp = p;
c = *++p;
@@ -437,8 +436,6 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
if (c == L_('[') && *p == L_('.'))
{
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
const CHAR *startp = p;
size_t c1 = 0;
@@ -600,42 +597,51 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
if (c == L_('-') && *p != L_(']'))
{
#if _LIBC
- /* We have to find the collation sequence
- value for C. Collation sequence is nothing
- we can regularly access. The sequence
- value is defined by the order in which the
- definitions of the collation values for the
- various characters appear in the source
- file. A strange concept, nowhere
- documented. */
- uint32_t fcollseq;
- uint32_t lcollseq;
+ /* We must find the collation sequence values for
+ the low part of the range, the high part of the
+ range and the searched value FN. We do this by
+ using the POSIX concept of Collation Element
+ Ordering, which is the defined order of elements
+ in the source locale. FCOLLSEQ is the searched
+ element in the range, while LCOLLSEQ is the low
+ element in the range. If we have no collation
+ rules (nrules == 0) then we must fall back to a
+ basic code point value for the collation
+ sequence value (which is correct for ASCII and
+ UTF-8). We must never use collseq if nrules ==
+ 0 since none of the tables we need will be
+ present in the compiled binary locale. We start
+ with fcollseq and lcollseq at unknown collation
+ sequences. We only compute hcollseq, the high
+ part of the range if required. */
+ uint32_t fcollseq = ~((uint32_t) 0);
+ uint32_t lcollseq = ~((uint32_t) 0);
UCHAR cend = *p++;
+ if (nrules != 0)
+ {
# if WIDE_CHAR_VERSION
- /* Search in the 'names' array for the characters. */
- fcollseq = __collseq_table_lookup (collseq, fn);
- if (fcollseq == ~((uint32_t) 0))
- /* XXX We don't know anything about the character
- we are supposed to match. This means we are
- failing. */
- goto range_not_matched;
-
- if (is_seqval)
- lcollseq = cold;
- else
- lcollseq = __collseq_table_lookup (collseq, cold);
+ /* Search the collation data for the character. */
+ fcollseq = __collseq_table_lookup (collseq, fn);
+ if (fcollseq == ~((uint32_t) 0))
+ /* We don't know anything about the character
+ we are supposed to match. This means we are
+ failing. */
+ goto range_not_matched;
+
+ if (is_seqval)
+ lcollseq = cold;
+ else
+ lcollseq = __collseq_table_lookup (collseq, cold);
# else
- fcollseq = collseq[fn];
- lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
+ fcollseq = collseq[fn];
+ lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
# endif
+ }
is_seqval = false;
if (cend == L_('[') && *p == L_('.'))
{
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE,
- _NL_COLLATE_NRULES);
const CHAR *startp = p;
size_t c1 = 0;
@@ -752,14 +758,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
cend = FOLD (cend);
}
- /* XXX It is not entirely clear to me how to handle
- characters which are not mentioned in the
- collation specification. */
- if (
+ /* If we have rules, and the low sequence is lower than
+ the value of the searched sequence then we must
+ lookup the high collation sequence value and
+ determine if the fcollseq falls within the range.
+ If hcollseq is unknown then we could still match
+ fcollseq on the low end of the range. If lcollseq
+ if unknown (0xffffffff) we will still fail to
+ match, but in the future we might consider matching
+ the high end of the range on an exact match. */
+ if (nrules != 0 && (
# if WIDE_CHAR_VERSION
lcollseq == 0xffffffff ||
# endif
- lcollseq <= fcollseq)
+ lcollseq <= fcollseq))
{
/* We have to look at the upper bound. */
uint32_t hcollseq;
@@ -789,6 +801,17 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
if (lcollseq <= hcollseq && fcollseq <= hcollseq)
goto matched;
}
+
+ /* No rules, but we have a range. */
+ if (nrules == 0)
+ {
+ if (cend == L_('\0'))
+ return FNM_NOMATCH;
+
+ /* Compare that fn is within the range. */
+ if ((UCHAR) cold <= fn && fn <= cend)
+ goto matched;
+ }
# if WIDE_CHAR_VERSION
range_not_matched:
# endif
@@ -2889,7 +2889,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
if (MB_CUR_MAX == 1)
*/
if (nrules == 0)
- return collseqmb[br_elem->opr.ch];
+ return br_elem->opr.ch;
else
{
wint_t wc = __btowc (br_elem->opr.ch);
@@ -2900,6 +2900,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
{
if (nrules != 0)
return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+ else
+ return br_elem->opr.wch;
}
else if (br_elem->type == COLL_SYM)
{
@@ -2935,7 +2937,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
}
}
else if (sym_name_len == 1)
- return collseqmb[br_elem->opr.name[0]];
+ return br_elem->opr.name[0];
}
return UINT_MAX;
}
@@ -3017,7 +3019,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
if (MB_CUR_MAX == 1)
*/
if (nrules == 0)
- ch_collseq = collseqmb[ch];
+ ch_collseq = ch;
else
ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
@@ -3103,11 +3105,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
int token_len;
bool first_round = true;
#ifdef _LIBC
- collseqmb = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules)
{
+ collseqmb = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
/*
if (MB_CUR_MAX > 1)
*/
@@ -3858,62 +3858,53 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
}
# ifdef _LIBC
+#include <assert.h>
+
static unsigned int
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
{
- uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
- if (nrules == 0)
- {
- if (mbs_len == 1)
- {
- /* No valid character. Match it as a single byte character. */
- const unsigned char *collseq = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
- return collseq[mbs[0]];
- }
- return UINT_MAX;
- }
- else
- {
- int32_t idx;
- const unsigned char *extra = (const unsigned char *)
+ int32_t idx;
+ const unsigned char *extra = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
- int32_t extrasize = (const unsigned char *)
+ int32_t extrasize = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+
+ /* Only called from within 'if (nrules != 0)'. */
+ assert (nrules != 0);
- for (idx = 0; idx < extrasize;)
+ for (idx = 0; idx < extrasize;)
+ {
+ int mbs_cnt;
+ bool found = false;
+ int32_t elem_mbs_len;
+ /* Skip the name of collating element name. */
+ idx = idx + extra[idx] + 1;
+ elem_mbs_len = extra[idx++];
+ if (mbs_len == elem_mbs_len)
{
- int mbs_cnt;
- bool found = false;
- int32_t elem_mbs_len;
- /* Skip the name of collating element name. */
- idx = idx + extra[idx] + 1;
- elem_mbs_len = extra[idx++];
- if (mbs_len == elem_mbs_len)
- {
- for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
- if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
- break;
- if (mbs_cnt == elem_mbs_len)
- /* Found the entry. */
- found = true;
- }
- /* Skip the byte sequence of the collating element. */
- idx += elem_mbs_len;
- /* Adjust for the alignment. */
- idx = (idx + 3) & ~3;
- /* Skip the collation sequence value. */
- idx += sizeof (uint32_t);
- /* Skip the wide char sequence of the collating element. */
- idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
- /* If we found the entry, return the sequence value. */
- if (found)
- return *(uint32_t *) (extra + idx);
- /* Skip the collation sequence value. */
- idx += sizeof (uint32_t);
+ for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
+ if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
+ break;
+ if (mbs_cnt == elem_mbs_len)
+ /* Found the entry. */
+ found = true;
}
- return UINT_MAX;
+ /* Skip the byte sequence of the collating element. */
+ idx += elem_mbs_len;
+ /* Adjust for the alignment. */
+ idx = (idx + 3) & ~3;
+ /* Skip the collation sequence value. */
+ idx += sizeof (uint32_t);
+ /* Skip the wide char sequence of the collating element. */
+ idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
+ /* If we found the entry, return the sequence value. */
+ if (found)
+ return *(uint32_t *) (extra + idx);
+ /* Skip the collation sequence value. */
+ idx += sizeof (uint32_t);
}
+ return UINT_MAX;
}
# endif /* _LIBC */
#endif /* RE_ENABLE_I18N */