1999-12-25  Ulrich Drepper  <drepper@cygnus.com>

	* locale/C-collate.c (_nl_C_LC_COLLATE): Add one more entry for the
	indirect table.
	* locale/langinfo.h: Likewise.
	* locale/categories.def: Likewise.  Remove reference to postload
	functions.
	* locale/lc-collate.c (_nl_postload_collate): Removed.  Also remove
	__collate_tablemb, __collate_weightmb, and __collate_extramb.
	* locale/localeinfo.h: Remove declaration for removed variables above.
	Remove prototype for _nl_init_era_entries.
	* locale/weight.h: Complete rewrite for new collate implementation.
	* locale/programs/ld-collate.c: Many changes to make output file
	usable in strxfrm/strcoll.
	* string/strxfrm.c: Complete rewrite for new collate implementation.
	* wcsmbs/wcsxfrm.c: Don't use strxfrm.c, implement dummy implementation
	locally.

1999-12-25  Shinya Hanataka  <hanataka@abyss.rim.or.jp>

	* locale/programs/ld-ctype.c (allocate_arrays): Correctly assign
	transformation values for chars >255.
	* wctype/wctrans.c: Return pointer unmodified.
This commit is contained in:
Ulrich Drepper 1999-12-25 23:41:39 +00:00
parent ce40141c6b
commit 450bf66ef2
12 changed files with 537 additions and 471 deletions

View File

@ -1,3 +1,27 @@
1999-12-25 Ulrich Drepper <drepper@cygnus.com>
* locale/C-collate.c (_nl_C_LC_COLLATE): Add one more entry for the
indirect table.
* locale/langinfo.h: Likewise.
* locale/categories.def: Likewise. Remove reference to postload
functions.
* locale/lc-collate.c (_nl_postload_collate): Removed. Also remove
__collate_tablemb, __collate_weightmb, and __collate_extramb.
* locale/localeinfo.h: Remove declaration for removed variables above.
Remove prototype for _nl_init_era_entries.
* locale/weight.h: Complete rewrite for new collate implementation.
* locale/programs/ld-collate.c: Many changes to make output file
usable in strxfrm/strcoll.
* string/strxfrm.c: Complete rewrite for new collate implementation.
* wcsmbs/wcsxfrm.c: Don't use strxfrm.c, implement dummy implementation
locally.
1999-12-25 Shinya Hanataka <hanataka@abyss.rim.or.jp>
* locale/programs/ld-ctype.c (allocate_arrays): Correctly assign
transformation values for chars >255.
* wctype/wctrans.c: Return pointer unmodified.
1999-12-24 Ulrich Drepper <drepper@cygnus.com> 1999-12-24 Ulrich Drepper <drepper@cygnus.com>
* sysdeps/posix/system.c (__libc_system): Check whether command * sysdeps/posix/system.c (__libc_system): Check whether command

View File

@ -150,12 +150,13 @@ const struct locale_data _nl_C_LC_COLLATE =
_nl_C_name, _nl_C_name,
NULL, 0, 0, /* no file mapped */ NULL, 0, 0, /* no file mapped */
UNDELETABLE, UNDELETABLE,
5, 6,
{ {
{ word: 0 }, { word: 0 },
{ string: NULL }, { string: NULL },
{ string: NULL }, { string: NULL },
{ string: NULL }, { string: NULL },
{ string: NULL },
{ string: NULL } { string: NULL }
} }
}; };

View File

@ -47,7 +47,8 @@ DEFINE_CATEGORY
DEFINE_ELEMENT (_NL_COLLATE_TABLEMB, "collate-tablemb", std, string) DEFINE_ELEMENT (_NL_COLLATE_TABLEMB, "collate-tablemb", std, string)
DEFINE_ELEMENT (_NL_COLLATE_WEIGHTMB, "collate-weightmb", std, string) DEFINE_ELEMENT (_NL_COLLATE_WEIGHTMB, "collate-weightmb", std, string)
DEFINE_ELEMENT (_NL_COLLATE_EXTRAMB, "collate-extramb", std, string) DEFINE_ELEMENT (_NL_COLLATE_EXTRAMB, "collate-extramb", std, string)
), _nl_postload_collate) DEFINE_ELEMENT (_NL_COLLATE_INDIRECTMB, "collate-indirectmb", std, string)
), NO_POSTLOAD)
/* The actual definition of ctype is meaningless here. It is hard coded in /* The actual definition of ctype is meaningless here. It is hard coded in

View File

@ -235,6 +235,7 @@ enum
_NL_COLLATE_TABLEMB, _NL_COLLATE_TABLEMB,
_NL_COLLATE_WEIGHTMB, _NL_COLLATE_WEIGHTMB,
_NL_COLLATE_EXTRAMB, _NL_COLLATE_EXTRAMB,
_NL_COLLATE_INDIRECTMB,
_NL_NUM_LC_COLLATE, _NL_NUM_LC_COLLATE,
/* LC_CTYPE category: character classification. /* LC_CTYPE category: character classification.

View File

@ -22,21 +22,3 @@
_NL_CURRENT_DEFINE (LC_COLLATE); _NL_CURRENT_DEFINE (LC_COLLATE);
const int32_t *__collate_tablemb;
const unsigned char *__collate_weightmb;
const unsigned char *__collate_extramb;
/* We are called after loading LC_COLLATE data to load it into
the variables used by the collation functions and regex. */
void
_nl_postload_collate (void)
{
#define paste(a,b) paste1(a,b)
#define paste1(a,b) a##b
#define current(x) _NL_CURRENT (LC_COLLATE, paste(_NL_COLLATE_,x))
__collate_tablemb = (const int32_t *) current (TABLEMB);
__collate_weightmb = (const unsigned char *) current (WEIGHTMB);
__collate_extramb = (const unsigned char *) current (EXTRAMB);
}

View File

@ -165,9 +165,6 @@ extern void _nl_unload_locale (struct locale_data *locale);
extern void _nl_remove_locale (int locale, struct locale_data *data); extern void _nl_remove_locale (int locale, struct locale_data *data);
/* initialize `era' entries */
extern void _nl_init_era_entries (void);
/* Return `era' entry which corresponds to TP. Used in strftime. */ /* Return `era' entry which corresponds to TP. Used in strftime. */
extern struct era_entry *_nl_get_era_entry (const struct tm *tp); extern struct era_entry *_nl_get_era_entry (const struct tm *tp);
@ -180,10 +177,4 @@ extern const char *_nl_get_alt_digit (unsigned int number);
/* Similar, but now for wide characters. */ /* Similar, but now for wide characters. */
extern const wchar_t *_nl_get_walt_digit (unsigned int number); extern const wchar_t *_nl_get_walt_digit (unsigned int number);
/* Global variables for LC_COLLATE category data. */
extern const int32_t *__collate_tablemb;
extern const unsigned char *__collate_extrweightmb;
extern const unsigned char *__collate_extramb;
#endif /* localeinfo.h */ #endif /* localeinfo.h */

View File

@ -137,9 +137,6 @@ struct locale_collate_t
/* To make handling of errors easier we have another section. */ /* To make handling of errors easier we have another section. */
struct section_list error_section; struct section_list error_section;
/* Number of sorting rules given in order_start line. */
uint32_t nrules;
/* Start of the order list. */ /* Start of the order list. */
struct element_t *start; struct element_t *start;
@ -176,7 +173,7 @@ struct locale_collate_t
/* We have a few global variables which are used for reading all /* We have a few global variables which are used for reading all
LC_COLLATE category descriptions in all files. */ LC_COLLATE category descriptions in all files. */
static int nrules; static uint32_t nrules;
/* These are definitions used by some of the functions for handling /* These are definitions used by some of the functions for handling
@ -426,7 +423,7 @@ read_directions (struct linereader *ldfile, struct token *arg,
if (! warned) if (! warned)
{ {
lr_error (ldfile, _("\ lr_error (ldfile, _("\
%s: `%s' mentioned twice in definition of weight %d in category `%s'"), %s: `%s' mentioned twice in definition of weight %d"),
"LC_COLLATE", "position", cnt + 1); "LC_COLLATE", "position", cnt + 1);
} }
} }
@ -450,7 +447,13 @@ read_directions (struct linereader *ldfile, struct token *arg,
/* See whether we have to increment the counter. */ /* See whether we have to increment the counter. */
if (arg->tok != tok_comma && rules[cnt] != 0) if (arg->tok != tok_comma && rules[cnt] != 0)
{
/* Add the default `forward' if we have seen only `position'. */
if (rules[cnt] == sort_position)
rules[cnt] = sort_position | sort_forward;
++cnt; ++cnt;
}
if (arg->tok == tok_eof || arg->tok == tok_eol) if (arg->tok == tok_eof || arg->tok == tok_eol)
/* End of line or file, so we exit the loop. */ /* End of line or file, so we exit the loop. */
@ -876,7 +879,7 @@ insert_value (struct linereader *ldfile, struct token *arg,
elem->nmbs = seq->nbytes; elem->nmbs = seq->nbytes;
} }
if (elem->wcs == NULL && seq != ILLEGAL_CHAR_VALUE) if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
{ {
uint32_t wcs[2] = { wc, 0 }; uint32_t wcs[2] = { wc, 0 };
@ -1552,7 +1555,7 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
} }
static inline int32_t static int32_t
output_weight (struct obstack *pool, struct locale_collate_t *collate, output_weight (struct obstack *pool, struct locale_collate_t *collate,
struct element_t *elem) struct element_t *elem)
{ {
@ -1575,25 +1578,18 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate,
int len = 0; int len = 0;
int i; int i;
/* Add the direction. */
obstack_1grow (pool, elem->section->rules[cnt]);
for (i = 0; i < elem->weights[cnt].cnt; ++i) for (i = 0; i < elem->weights[cnt].cnt; ++i)
/* Encode the weight value. */ /* Encode the weight value. We do nothing for IGNORE entries. */
if (elem->weights[cnt].w[i] == NULL) if (elem->weights[cnt].w[i] != NULL)
{
/* This entry was IGNORE. */
buf[len++] = IGNORE_CHAR;
}
else
len += utf8_encode (&buf[len], len += utf8_encode (&buf[len],
elem->weights[cnt].w[i]->mborder[cnt]); elem->weights[cnt].w[i]->mborder[cnt]);
/* And add the buffer content. */ /* And add the buffer content. */
obstack_1grow (pool, len);
obstack_grow (pool, buf, len); obstack_grow (pool, buf, len);
} }
return retval; return retval | ((elem->section->ruleidx & 0x7f) << 24);
} }
@ -1611,11 +1607,13 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
int32_t tablemb[256]; int32_t tablemb[256];
struct obstack weightpool; struct obstack weightpool;
struct obstack extrapool; struct obstack extrapool;
struct obstack indirectpool;
struct section_list *sect; struct section_list *sect;
int i; int i;
obstack_init (&weightpool); obstack_init (&weightpool);
obstack_init (&extrapool); obstack_init (&extrapool);
obstack_init (&indirectpool);
data.magic = LIMAGIC (LC_COLLATE); data.magic = LIMAGIC (LC_COLLATE);
data.n = nelems; data.n = nelems;
@ -1629,7 +1627,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
cnt = 0; cnt = 0;
assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES)); assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
iov[2 + cnt].iov_base = &collate->nrules; iov[2 + cnt].iov_base = &nrules;
iov[2 + cnt].iov_len = sizeof (uint32_t); iov[2 + cnt].iov_len = sizeof (uint32_t);
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt; ++cnt;
@ -1638,7 +1636,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next) for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
if (sect->ruleidx == i) if (sect->ruleidx == i)
{ {
obstack_grow (&weightpool, sect->rules, nrules); int j;
obstack_make_room (&weightpool, nrules);
for (j = 0; j < nrules; ++j)
obstack_1grow_fast (&weightpool, sect->rules[j]);
++i; ++i;
} }
/* And align the output. */ /* And align the output. */
@ -1719,38 +1722,60 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
{ {
int i; int i;
/* More than one consecutive entry. We mark this by having
a negative index into the weight table. */
weightidx = -weightidx;
/* Now add first the initial byte sequence. */ /* Now add first the initial byte sequence. */
added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1) added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1)
+ __alignof__ (int32_t) - 1) + __alignof__ (int32_t) - 1)
& ~(__alignof__ (int32_t) - 1)); & ~(__alignof__ (int32_t) - 1));
obstack_make_room (&extrapool, added); obstack_make_room (&extrapool, added);
/* More than one consecutive entry. We mark this by having
a negative index into the indirect table. */
if (sizeof (int32_t) == sizeof (int)) if (sizeof (int32_t) == sizeof (int))
obstack_int_grow_fast (&extrapool, weightidx); obstack_int_grow_fast (&extrapool,
obstack_object_size (&indirectpool)
/ sizeof (int32_t));
else else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t)); {
obstack_1grow_fast (&extrapool, runp->section->ruleidx); int32_t i = (obstack_object_size (&indirectpool)
/ sizeof (int32_t));
obstack_grow (&extrapool, &i, sizeof (int32_t));
}
obstack_1grow_fast (&extrapool, runp->nmbs - 1); obstack_1grow_fast (&extrapool, runp->nmbs - 1);
for (i = 1; i < runp->nmbs; ++i) for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]); obstack_1grow_fast (&extrapool, runp->mbs[i]);
/* Now find the end of the consecutive sequence. */ /* Now find the end of the consecutive sequence and
add all the indices to the indirect pool. */
runp = runp->next; while (1)
while (runp->mbnext != NULL {
&& runp->nmbs == runp->mbnext->nmbs if (sizeof (int32_t) == sizeof (int))
&& memcmp (runp->mbs, runp->mbnext->mbs, obstack_int_grow_fast (&extrapool, weightidx);
runp->nmbs - 1) == 0 else
&& (runp->mbs[runp->nmbs - 1] + 1 obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
== runp->mbnext->mbs[runp->nmbs - 1]));
/* And add the end byte sequence. Without length this time. */ runp = runp->next;
if (runp->mbnext == NULL
|| runp->nmbs != runp->mbnext->nmbs
|| memcmp (runp->mbs, runp->mbnext->mbs,
runp->nmbs - 1) != 0
|| (runp->mbs[runp->nmbs - 1] + 1
!= runp->mbnext->mbs[runp->nmbs - 1]))
break;
/* Insert the weight. */
weightidx = output_weight (&weightpool, collate, runp);
}
/* And add the end byte sequence. Without length this
time. */
for (i = 1; i < runp->nmbs; ++i) for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]); obstack_1grow_fast (&extrapool, runp->mbs[i]);
weightidx = output_weight (&weightpool, collate, runp);
if (sizeof (int32_t) == sizeof (int))
obstack_int_grow_fast (&extrapool, weightidx);
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
} }
else else
{ {
@ -1768,7 +1793,6 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
obstack_int_grow_fast (&extrapool, weightidx); obstack_int_grow_fast (&extrapool, weightidx);
else else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t)); obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
obstack_1grow_fast (&extrapool, runp->section->ruleidx);
obstack_1grow_fast (&extrapool, runp->nmbs - 1); obstack_1grow_fast (&extrapool, runp->nmbs - 1);
for (i = 1; i < runp->nmbs; ++i) for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]); obstack_1grow_fast (&extrapool, runp->mbs[i]);
@ -1835,6 +1859,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt; ++cnt;
assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt;
assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE)); assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
@ -1842,6 +1872,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
obstack_free (&weightpool, NULL); obstack_free (&weightpool, NULL);
obstack_free (&extrapool, NULL); obstack_free (&extrapool, NULL);
obstack_free (&indirectpool, NULL);
} }
@ -2291,16 +2322,16 @@ error while adding equivalent collating symbol"));
uint32_t cnt; uint32_t cnt;
/* This means we have exactly one rule: `forward'. */ /* This means we have exactly one rule: `forward'. */
if (collate->nrules > 1) if (nrules > 1)
lr_error (ldfile, _("\ lr_error (ldfile, _("\
%s: invalid number of sorting rules"), %s: invalid number of sorting rules"),
"LC_COLLATE"); "LC_COLLATE");
else else
collate->nrules = 1; nrules = 1;
sp->rules = obstack_alloc (&collate->mempool, sp->rules = obstack_alloc (&collate->mempool,
(sizeof (enum coll_sort_rule) (sizeof (enum coll_sort_rule)
* collate->nrules)); * nrules));
for (cnt = 0; cnt < collate->nrules; ++cnt) for (cnt = 0; cnt < nrules; ++cnt)
sp->rules[cnt] = sort_forward; sp->rules[cnt] = sort_forward;
/* Next line. */ /* Next line. */

View File

@ -3073,10 +3073,8 @@ Computing table size for character classes might take a while..."),
while (idx2 < ctype->map_collection_act[idx]) while (idx2 < ctype->map_collection_act[idx])
{ {
if (ctype->map_collection[idx][idx2] != 0) if (ctype->map_collection[idx][idx2] != 0)
*find_idx (ctype, &ctype->map32[idx], ctype->map32[idx][ctype->charnames[idx2]] =
&ctype->map_collection_max[idx], ctype->map_collection[idx][idx2];
&ctype->map_collection_act[idx],
ctype->names[idx2]) = ctype->map_collection[idx][idx2];
++idx2; ++idx2;
} }
} }

View File

@ -17,191 +17,106 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */ Boston, MA 02111-1307, USA. */
#include <alloca.h> /* Find index of weight. */
#include <errno.h> static inline int32_t
#include <langinfo.h> findidx (const unsigned char **cpp)
#include "localeinfo.h"
#ifndef STRING_TYPE
# error STRING_TYPE not defined
#endif
#ifndef USTRING_TYPE
# error USTRING_TYPE not defined
#endif
typedef struct weight_t
{ {
struct weight_t *prev; int_fast32_t i = table[*(*cpp)++];
struct weight_t *next; const unsigned char *cp;
struct data_pair
{
int number;
const uint32_t *value;
} data[0];
} weight_t;
if (i >= 0)
/* This is an index into the weight table. Cool. */
return i;
/* The following five macros grant access to the values in the /* Oh well, more than one sequence starting with this byte.
collate locale file that do not depend on byte order. */ Search for the correct one. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL cp = &extra[-i];
# define collate_nrules \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES))
# define collate_hash_size \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE))
# define collate_hash_layers \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS))
# define collate_undefined \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_UNDEFINED_WC))
# define collate_rules \
((uint32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULES))
static __inline void get_weight (const STRING_TYPE **str, weight_t *result);
static __inline void
get_weight (const STRING_TYPE **str, weight_t *result)
#else
# define collate_nrules \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word
# define collate_hash_size \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word
# define collate_hash_layers \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word
# define collate_undefined \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED_WC)].word
# define collate_rules \
((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULES)].string)
static __inline void get_weight (const STRING_TYPE **str, weight_t *result,
struct locale_data *current,
const uint32_t *__collate_tablewc,
const uint32_t *__collate_extrawc);
static __inline void
get_weight (const STRING_TYPE **str, weight_t *result,
struct locale_data *current, const uint32_t *__collate_tablewc,
const uint32_t *__collate_extrawc)
#endif
{
unsigned int ch = *((USTRING_TYPE *) (*str))++;
size_t slot;
if (sizeof (STRING_TYPE) == 1)
slot = ch * (collate_nrules + 1);
else
{
const size_t level_size = collate_hash_size * (collate_nrules + 1);
size_t level;
slot = (ch % collate_hash_size) * (collate_nrules + 1);
level = 0;
while (__collate_tablewc[slot] != (uint32_t) ch)
{
if (__collate_tablewc[slot + 1] == 0
|| ++level >= collate_hash_layers)
{
size_t idx = collate_undefined;
size_t cnt;
for (cnt = 0; cnt < collate_nrules; ++cnt)
{
result->data[cnt].number = __collate_extrawc[idx++];
result->data[cnt].value = &__collate_extrawc[idx];
idx += result->data[cnt].number;
}
/* The Unix standard requires that a character outside
the domain is signalled by setting `errno'. */
__set_errno (EINVAL);
return;
}
slot += level_size;
}
}
if (__collate_tablewc[slot + 1] != (uint32_t) FORWARD_CHAR)
{
/* We have a simple form. One value for each weight. */
size_t cnt;
for (cnt = 0; cnt < collate_nrules; ++cnt)
{
result->data[cnt].number = 1;
result->data[cnt].value = &__collate_tablewc[slot + 1 + cnt];
}
return;
}
/* We now look for any collation element which starts with CH.
There might be none, but the last list member is a catch-all case
because it is simply the character CH. The value of this entry
might be the same as UNDEFINED. */
slot = __collate_tablewc[slot + 2];
while (1) while (1)
{ {
size_t idx; size_t nhere;
const unsigned char *usrc = *cpp;
/* This is a comparison between a uint32_t array (aka wchar_t) and /* The first thing is the index. */
an 8-bit string. */ i = *((int32_t *) cp);
for (idx = 0; __collate_extrawc[slot + 2 + idx] != 0; ++idx) cp += sizeof (int32_t);
if (__collate_extrawc[slot + 2 + idx] != (uint32_t) (*str)[idx])
break;
/* When the loop finished with all character of the collation /* Next is the length of the byte sequence. These are always
element used, we found the longest prefix. */ short byte sequences so there is no reason to call any
if (__collate_extrawc[slot + 2 + idx] == 0) function (even if they are inlined). */
nhere = *cp++;
if (i >= 0)
{ {
/* It is a single character. If it matches we found our
index. Note that at the end of each list there is an
entry of length zero which represents the single byte
sequence. The first (and here only) byte was tested
already. */
size_t cnt; size_t cnt;
*str += idx; for (cnt = 0; cnt < nhere; ++cnt)
idx += slot + 3; if (cp[cnt] != usrc[cnt])
for (cnt = 0; cnt < collate_nrules; ++cnt) break;
if (cnt == nhere)
{ {
result->data[cnt].number = __collate_extrawc[idx++]; /* Found it. */
result->data[cnt].value = &__collate_extrawc[idx]; *cpp += nhere;
idx += result->data[cnt].number; return i;
}
return;
} }
/* To next entry in list. */ /* Up to the next entry. */
slot += __collate_extrawc[slot]; cp += nhere;
} }
else
{
/* This is a range of characters. First decide whether the
current byte sequence lies in the range. */
size_t cnt;
size_t offset = 0;
for (cnt = 0; cnt < nhere; ++cnt)
if (cp[cnt] != usrc[cnt])
break;
if (cnt != nhere)
{
if (cp[cnt] > usrc[cnt])
{
/* Cannot be in this range. */
cp += 2 * nhere;
continue;
}
/* Test against the end of the range. */
for (cnt = 0; cnt < nhere; ++cnt)
if (cp[nhere + cnt] != usrc[cnt])
break;
if (cnt != nhere && cp[nhere + cnt] < usrc[cnt])
{
/* Cannot be in this range. */
cp += 2 * nhere;
continue;
}
/* This range matches the next characters. Now find
the offset in the indirect table. */
for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt);
do
{
offset <<= 8;
offset += usrc[cnt] - cp[cnt];
}
while (++cnt < nhere);
}
*cpp += nhere;
return offset;
}
}
/* NOTREACHED */
return 0x43219876;
} }
/* To process a string efficiently we retrieve all information about
the string at once. The following macro constructs a double linked
list of this information. It is a macro because we use `alloca'
and we use a double linked list because of the backward collation
order.
We have this strange extra macro since the functions which use the
given locale (not the global one) cannot use the global tables. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
# define call_get_weight(strp, newp) get_weight ((strp), (newp))
#else
# define call_get_weight(strp, newp) \
get_weight ((strp), (newp), current, collate_table, collate_extra)
#endif
#define get_string(str, forw, backw) \
do \
{ \
weight_t *newp; \
while (*str != '\0') \
{ \
newp = (weight_t *) alloca (sizeof (weight_t) \
+ (collate_nrules \
* sizeof (struct data_pair))); \
\
newp->prev = backw; \
if (backw == NULL) \
forw = newp; \
else \
backw->next = newp; \
newp->next = NULL; \
backw = newp; \
call_get_weight (&str, newp); \
} \
} \
while (0)

View File

@ -17,282 +17,397 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */ Boston, MA 02111-1307, USA. */
#include <langinfo.h>
#include <stddef.h> #include <stddef.h>
#include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#ifndef WIDE_VERSION #include "../locale/localeinfo.h"
# define STRING_TYPE char
# define USTRING_TYPE unsigned char #ifdef USE_IN_EXTENDED_LOCALE_MODEL
# define L_(Ch) Ch
# ifdef USE_IN_EXTENDED_LOCALE_MODEL
# define STRXFRM __strxfrm_l # define STRXFRM __strxfrm_l
# else #else
# define STRXFRM strxfrm # define STRXFRM strxfrm
# endif
# define STRLEN strlen
# define STPNCPY __stpncpy
#endif #endif
/* These are definitions used by some of the functions for handling
UTF-8 encoding below. */
static const uint32_t encoding_mask[] =
{
~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
};
static const unsigned char encoding_byte[] =
{
0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
/* We need UTF-8 encoding of numbers. */
static inline int
utf8_encode (char *buf, int val)
{
char *startp = buf;
int retval;
if (val < 0x80)
{
*buf++ = (char) val;
retval = 1;
}
else
{
int step;
for (step = 2; step < 6; ++step)
if ((val & encoding_mask[step - 2]) == 0)
break;
retval = step;
*buf = encoding_byte[step - 2];
--step;
do
{
buf[step] = 0x80 | (val & 0x3f);
val >>= 6;
}
while (--step > 0);
*buf |= val;
}
return buf - startp;
}
#ifndef USE_IN_EXTENDED_LOCALE_MODEL #ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n) STRXFRM (char *dest, const char *src, size_t n)
#else #else
size_t size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l) STRXFRM (char *dest, const char *src, size_t n, __locale_t l)
#endif
{
if (n != 0)
STPNCPY (dest, src, n);
return STRLEN (src);
}
#if 0
/* Include the shared helper functions. `strxfrm'/`wcsxfrm' also use
these functions. */
#include "../locale/weight.h"
#ifndef WIDE_VERSION
/* Write 32 bit value UTF-8 encoded but only if enough space is left. */
static __inline size_t
print_val (u_int32_t value, char *dest, size_t max, size_t act)
{
char tmp[6];
int idx = 0;
if (value < 0x80)
tmp[idx++] = (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x20)
tmp[idx++] = '\xc0' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x10)
tmp[idx++] = '\xe0' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x08)
tmp[idx++] = '\xf0' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x04)
tmp[idx++] = '\xf8' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
tmp[idx++] = '\xfc' + (char) (value >> 6);
}
}
}
}
}
while (idx-- > 0)
{
if (act < max)
dest[act] = tmp[idx];
++act;
}
return act;
}
#else
static __inline size_t
print_val (u_int32_t value, wchar_t *dest, size_t max, size_t act)
{
/* We cannot really assume wchar_t is 32 bits wide. But it is for
GCC and so we don't do much optimization for the other case. */
if (sizeof (wchar_t) == 4)
{
if (act < max)
dest[act] = (wchar_t) value;
++act;
}
else
{
wchar_t tmp[3];
size_t idx = 0;
if (value < 0x8000)
tmp[idx++] = (wchar_t) act;
else
{
tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
value >>= 14;
if (value < 0x2000)
tmp[idx++] = (wchar_t) (0xc000 + value);
else
{
tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
value >>= 14;
tmp[idx++] = (wchar_t) (0xe000 + value);
}
}
while (idx-- > 0)
{
if (act < max)
dest[act] = tmp[idx];
++act;
}
}
return act;
}
#endif
/* Transform SRC into a form such that the result of strcmp
on two strings that have been transformed by strxfrm is
the same as the result of strcoll on the two strings before
their transformation. The transformed string is put in at
most N characters of DEST and its length is returned. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
#else
size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
#endif #endif
{ {
#ifdef USE_IN_EXTENDED_LOCALE_MODEL #ifdef USE_IN_EXTENDED_LOCALE_MODEL
struct locale_data *current = l->__locales[LC_COLLATE]; struct locale_data *current = l->__locales[LC_COLLATE];
# if BYTE_ORDER == BIG_ENDIAN uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
const u_int32_t *collate_table = (const u_int32_t *) #else
current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].string; uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
const u_int32_t *collate_extra = (const u_int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].string;
# elif BYTE_ORDER == LITTLE_ENDIAN
const u_int32_t *collate_table = (const u_int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].string;
const u_int32_t *collate_extra = (const u_int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].string;
# else
# error bizarre byte order
# endif
#endif #endif
weight_t *forw = NULL; /* We don't assign the following values right away since it might be
weight_t *backw = NULL; unnecessary in case there are no rules. */
size_t pass; const unsigned char *rulesets;
size_t written; const int32_t *table;
const unsigned char *weights;
const unsigned char *extra;
const int32_t *indirect;
uint_fast32_t pass;
size_t needed;
const unsigned char *usrc;
size_t srclen = strlen (src);
int32_t *idxarr;
unsigned char *rulearr;
size_t idxmax;
size_t idxcnt;
int use_malloc = 0;
/* If the current locale does not specify locale data we use normal #include "../locale/weight.h"
8-bit string comparison. */
if (collate_nrules == 0) if (nrules == 0)
{ {
if (n != 0) if (n != 0)
STPNCPY (dest, src, n); __stpncpy (dest, src, n);
return STRLEN (src); return srclen;
} }
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
rulesets = (const unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
table = (const int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLEMB)].string;
weights = (const unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB)].string;
extra = (const unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB)].string;
indirect = (const int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB)].string;
#else
rulesets = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
table = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
weights = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
extra = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
indirect = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
#endif
/* Handle an empty string as a special case. */ /* Handle an empty string as a special case. */
if (*src == '\0') if (srclen == 0)
{ {
if (n != 0) if (n != 0)
*dest = '\0'; *dest = '\0';
return 1; return 1;
} }
/* Get full information about the string. This means we get /* We need the elements of the string as unsigned values since they
information for all passes in a special data structure. */ are used as indices. */
get_string (src, forw, backw); usrc = (const unsigned char *) src;
/* Now we have all the information. In at most the given number of /* Perform the first pass over the string and while doing this find
passes we can finally decide about the order. */ and store the weights for each character. Since we want this to
written = 0; be as fast as possible we are using `alloca' to store the temporary
for (pass = 0; pass < collate_nrules; ++pass) values. But since there is no limit on the length of the string
we have to use `malloc' if the string is too long. We should be
very conservative here. */
if (srclen >= 16384)
{ {
int forward = (collate_rules[pass] & sort_forward) != 0; idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
const weight_t *run = forward ? forw : backw; rulearr = (unsigned char *) &idxarr[srclen];
int idx = forward ? 0 : run->data[pass].number - 1;
while (1) if (idxarr == NULL)
{ /* No memory. Well, go with the stack then.
int ignore = 0;
u_int32_t w = 0;
/* Here we have to check for IGNORE entries. If these are XXX Once this implementation is stable we will handle this
found we count them and go on with he next value. */ differently. Instead of precomputing the indeces we will
while (run != NULL do this in time. This means, though, that this happens for
&& ((w = run->data[pass].value[idx]) every pass again. */
== (u_int32_t) IGNORE_CHAR)) goto try_stack;
{ use_malloc = 1;
++ignore;
if (forward
? ++idx >= run->data[pass].number
: --idx < 0)
{
weight_t *nextp = forward ? run->next : run->prev;
if (nextp == NULL)
{
w = 0;
/* No more non-INGOREd elements means lowest
possible value. */
ignore = -1;
} }
else else
idx = forward ? 0 : nextp->data[pass].number - 1; {
run = nextp; try_stack:
idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
rulearr = (unsigned char *) alloca (srclen);
}
idxmax = 0;
do
{
int32_t tmp = findidx (&usrc);
rulearr[idxmax] = tmp >> 24;
idxarr[idxmax] = tmp & 0x80ffffff;
++idxmax;
}
while (*usrc != '\0');
/* Now the passes over the weights. We now use the indices we found
before. */
needed = 0;
for (pass = 0; pass < nrules; ++pass)
{
size_t backw_stop = ~0ul;
int rule = rulesets[rulearr[0] * nrules + pass];
/* We assume that if a rule has defined `position' in one section
this is true for all of them. */
int position = rule & sort_position;
if (position == 0)
{
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
if ((rule & sort_forward) != 0)
{
size_t len;
if (backw_stop != ~0ul)
{
/* Handle the pushed elements now. */
size_t backw;
for (backw = idxcnt - 1; backw >= backw_stop; --backw)
{
len = weights[idxarr[backw]++];
if (needed + len < n)
while (len-- > 0)
dest[needed++] = weights[idxarr[backw]++];
else
{
/* No more characters fit into the buffer. */
needed += len;
idxarr[backw] += len;
} }
} }
/* Stop if all characters are processed. */ backw_stop = ~0ul;
if (run == NULL) }
break;
/* Now we have information of the number of ignored weights /* Now handle the forward element. */
and the value of the next weight. We have to add 2 len = weights[idxarr[idxcnt]++];
because 0 means EOS and 1 is the intermediate string end. */ if (needed + len < n)
if ((collate_rules[pass] & sort_position) != 0) while (len-- > 0)
written = print_val (ignore + 2, dest, n, written); dest[needed++] = weights[idxarr[idxcnt]++];
else
if (w != 0)
written = print_val (w, dest, n, written);
/* We have to increment the index counters. */
if (forward)
{ {
if (++idx >= run->data[pass].number) /* No more characters fit into the buffer. */
{ needed += len;
run = run->next; idxarr[idxcnt] += len;
idx = 0;
} }
} }
else else
{ {
if (--idx < 0) /* Remember where the backwards series started. */
if (backw_stop == ~0ul)
backw_stop = idxcnt;
}
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
if (backw_stop != ~0ul)
{ {
run = run->prev; /* Handle the pushed elements now. */
if (run != NULL) size_t backw;
idx = run->data[pass].number - 1;
for (backw = idxcnt - 1; backw >= backw_stop; --backw)
{
size_t len = weights[idxarr[backw]++];
if (needed + len < n)
while (len-- > 0)
dest[needed++] = weights[idxarr[backw]++];
else
{
/* No more characters fit into the buffer. */
needed += len;
idxarr[backw] += len;
}
}
}
}
else
{
int val = 1;
char buf[7];
size_t buflen;
size_t i;
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
if ((rule & sort_forward) != 0)
{
size_t len;
if (backw_stop != ~0ul)
{
/* Handle the pushed elements now. */
size_t backw;
for (backw = idxcnt - 1; backw >= backw_stop; --backw)
{
len = weights[idxarr[backw]++];
if (len != 0)
{
buflen = utf8_encode (buf, val);
if (needed + buflen + len < n)
{
for (i = 0; i < buflen; ++i)
dest[needed + i] = buf[i];
for (i = 0; i < len; ++i)
dest[needed + buflen + i] =
weights[idxarr[backw] + i];
}
idxarr[backw] += len;
needed += buflen + len;
val = 1;
}
else
++val;
}
backw_stop = ~0ul;
}
/* Now handle the forward element. */
len = weights[idxarr[idxcnt]++];
if (len != 0)
{
buflen = utf8_encode (buf, val);
if (needed + buflen + len < n)
{
for (i = 0; i < buflen; ++i)
dest[needed + i] = buf[i];
for (i = 0; i < len; ++i)
dest[needed + buflen + i] =
weights[idxarr[idxcnt] + i];
}
idxarr[idxcnt] += len;
needed += buflen + len;
val = 1;
}
else
/* Note that we don't have to increment `idxarr[idxcnt]'
since the length is zero. */
++val;
}
else
{
/* Remember where the backwards series started. */
if (backw_stop == ~0ul)
backw_stop = idxcnt;
}
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
if (backw_stop != ~0)
{
/* Handle the pushed elements now. */
size_t backw;
for (backw = idxmax - 1; backw >= backw_stop; --backw)
{
size_t len = weights[idxarr[backw]++];
if (len != 0)
{
buflen = utf8_encode (buf, val);
if (needed + buflen + len < n)
{
for (i = 0; i < buflen; ++i)
dest[needed + i] = buf[i];
for (i = 0; i < len; ++i)
dest[needed + buflen + i] =
weights[idxarr[backw] + i];
}
idxarr[backw] += len;
needed += buflen + len;
val = 1;
}
else
++val;
} }
} }
} }
/* Write marker for end of word. */ /* Finally store the byte to separate the passes or terminate
if (pass + 1 < collate_nrules) the string. */
written = print_val (1, dest, n, written); if (needed < n)
dest[needed] = pass + 1 < nrules ? '\1' : '\0';
++needed;
} }
/* Terminate string. */ /* This is a little optimization: many collation specifications have
if (written < n) a `position' rule at the end and if no non-ignored character
dest[written] = L_('\0'); is found the last \1 byte is immediately followed by a \0 byte
signalling this. We can avoid the \1 byte(s). */
if (needed > 2 && dest[needed - 2] == '\1')
{
/* Remove the \1 byte. */
--needed;
dest[needed - 1] = '\0';
}
/* Return length without counting the terminating '\0'. */ /* Free the memory if needed. */
return written; if (use_malloc)
free (idxarr);
return needed;
} }
#endif

View File

@ -1,4 +1,4 @@
/* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc. /* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
This file is part of the GNU C Library. This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
@ -19,16 +19,23 @@
#include <wchar.h> #include <wchar.h>
#define WIDE_VERSION 1
#define STRING_TYPE wchar_t
#define USTRING_TYPE wint_t
#define L_(Ch) L##Ch
#ifdef USE_IN_EXTENDED_LOCALE_MODEL #ifdef USE_IN_EXTENDED_LOCALE_MODEL
# define STRXFRM __wcsxfrm_l # define STRXFRM __wcsxfrm_l
#else #else
# define STRXFRM wcsxfrm # define STRXFRM wcsxfrm
#endif #endif
#define STRLEN __wcslen
#define STPNCPY __wcpncpy
#include <string/strxfrm.c>
/* Dummy transformation for wide strings: no collation data is consulted.
   SRC is handed to __wcpncpy together with DEST and the buffer size N
   (skipped entirely when N is zero) and the value of __wcslen (SRC) is
   returned.  */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t
STRXFRM (wchar_t *dest, const wchar_t *src, size_t n)
#else
/* The locale argument L is not used by this implementation.  */
size_t
STRXFRM (wchar_t *dest, const wchar_t *src, size_t n, __locale_t l)
#endif
{
  /* With a zero-sized buffer DEST is never touched; only the length
     is reported.  */
  if (n == 0)
    return __wcslen (src);

  __wcpncpy (dest, src, n);
  return __wcslen (src);
}

View File

@ -52,5 +52,5 @@ wctrans (const char *property)
/* We have to search the table. */ /* We have to search the table. */
result = (int32_t *) _NL_CURRENT (LC_CTYPE, _NL_NUM_LC_CTYPE + cnt - 2); result = (int32_t *) _NL_CURRENT (LC_CTYPE, _NL_NUM_LC_CTYPE + cnt - 2);
return (wctrans_t) (result + 128); return (wctrans_t) result;
} }