1999-12-25  Ulrich Drepper  <drepper@cygnus.com>

	* locale/C-collate.c (_nl_C_LC_COLLATE): Add one more entry for the
	indirect table.
	* locale/langinfo.h: Likewise.
	* locale/categories.def: Likewise.  Remove reference to postload
	functions.
	* locale/lc-collate.c (_nl_postload_collate): Removed.  Also remove
	__collate_tablemb, __collate_weightmb, and __collate_extramb.
	* locale/localeinfo.h: Remove declaration for removed variables above.
	Remove prototype for _nl_get_era_entry.
	* locale/weight.h: Complete rewrite for new collate implementation.
	* locale/programs/ld-collate.c: Many changes to make output file
	usable in strxfrm/strcoll.
	* string/strxfrm.c: Complete rewrite for new collate implementation.
	* wcsmbs/wcsxfrm.c: Don't use strxfrm.c, implement dummy implementation
	locally.

1999-12-25  Shinya Hanataka  <hanataka@abyss.rim.or.jp>

	* locale/programs/ld-ctype.c (allocate_arrays): Correctly assign
	transformation values for chars >255.
	* wctype/wctrans.c: Return pointer unmodified.
This commit is contained in:
Ulrich Drepper 1999-12-25 23:41:39 +00:00
parent ce40141c6b
commit 450bf66ef2
12 changed files with 537 additions and 471 deletions

View File

@ -1,3 +1,27 @@
1999-12-25 Ulrich Drepper <drepper@cygnus.com>
* locale/C-collate.c (_nl_C_LC_COLLATE): Add one more entry for the
indirect table.
* locale/langinfo.h: Likewise.
* locale/categories.def: Likewise. Remove reference to postload
functions.
* locale/lc-collate.c (_nl_postload_collate): Removed. Also remove
__collate_tablemb, __collate_weightmb, and __collate_extramb.
* locale/localeinfo.h: Remove declaration for removed variables above.
Remove prototype for _nl_get_era_entry.
* locale/weight.h: Complete rewrite for new collate implementation.
* locale/programs/ld-collate.c: Many changes to make output file
usable in strxfrm/strcoll.
* string/strxfrm.c: Complete rewrite for new collate implementation.
* wcsmbs/wcsxfrm.c: Don't use strxfrm.c, implement dummy implementation
locally.
1999-12-25 Shinya Hanataka <hanataka@abyss.rim.or.jp>
* locale/programs/ld-ctype.c (allocate_arrays): Correctly assign
transformation values for chars >255.
* wctype/wctrans.c: Return pointer unmodified.
1999-12-24 Ulrich Drepper <drepper@cygnus.com>
* sysdeps/posix/system.c (__libc_system): Check whether command

View File

@ -150,12 +150,13 @@ const struct locale_data _nl_C_LC_COLLATE =
_nl_C_name,
NULL, 0, 0, /* no file mapped */
UNDELETABLE,
5,
6,
{
{ word: 0 },
{ string: NULL },
{ string: NULL },
{ string: NULL },
{ string: NULL },
{ string: NULL }
}
};

View File

@ -47,7 +47,8 @@ DEFINE_CATEGORY
DEFINE_ELEMENT (_NL_COLLATE_TABLEMB, "collate-tablemb", std, string)
DEFINE_ELEMENT (_NL_COLLATE_WEIGHTMB, "collate-weightmb", std, string)
DEFINE_ELEMENT (_NL_COLLATE_EXTRAMB, "collate-extramb", std, string)
), _nl_postload_collate)
DEFINE_ELEMENT (_NL_COLLATE_INDIRECTMB, "collate-indirectmb", std, string)
), NO_POSTLOAD)
/* The actual definition of ctype is meaningless here. It is hard coded in

View File

@ -235,6 +235,7 @@ enum
_NL_COLLATE_TABLEMB,
_NL_COLLATE_WEIGHTMB,
_NL_COLLATE_EXTRAMB,
_NL_COLLATE_INDIRECTMB,
_NL_NUM_LC_COLLATE,
/* LC_CTYPE category: character classification.

View File

@ -22,21 +22,3 @@
_NL_CURRENT_DEFINE (LC_COLLATE);
const int32_t *__collate_tablemb;
const unsigned char *__collate_weightmb;
const unsigned char *__collate_extramb;
/* We are called after loading LC_COLLATE data to load it into
the variables used by the collation functions and regex. */
void
_nl_postload_collate (void)
{
#define paste(a,b) paste1(a,b)
#define paste1(a,b) a##b
#define current(x) _NL_CURRENT (LC_COLLATE, paste(_NL_COLLATE_,x))
__collate_tablemb = (const int32_t *) current (TABLEMB);
__collate_weightmb = (const unsigned char *) current (WEIGHTMB);
__collate_extramb = (const unsigned char *) current (EXTRAMB);
}

View File

@ -165,9 +165,6 @@ extern void _nl_unload_locale (struct locale_data *locale);
extern void _nl_remove_locale (int locale, struct locale_data *data);
/* initialize `era' entries */
extern void _nl_init_era_entries (void);
/* Return `era' entry which corresponds to TP. Used in strftime. */
extern struct era_entry *_nl_get_era_entry (const struct tm *tp);
@ -180,10 +177,4 @@ extern const char *_nl_get_alt_digit (unsigned int number);
/* Similar, but now for wide characters. */
extern const wchar_t *_nl_get_walt_digit (unsigned int number);
/* Global variables for LC_COLLATE category data. */
extern const int32_t *__collate_tablemb;
extern const unsigned char *__collate_extrweightmb;
extern const unsigned char *__collate_extramb;
#endif /* localeinfo.h */

View File

@ -137,9 +137,6 @@ struct locale_collate_t
/* To make handling of errors easier we have another section. */
struct section_list error_section;
/* Number of sorting rules given in order_start line. */
uint32_t nrules;
/* Start of the order list. */
struct element_t *start;
@ -176,7 +173,7 @@ struct locale_collate_t
/* We have a few global variables which are used for reading all
LC_COLLATE category descriptions in all files. */
static int nrules;
static uint32_t nrules;
/* These are definitions used by some of the functions for handling
@ -426,7 +423,7 @@ read_directions (struct linereader *ldfile, struct token *arg,
if (! warned)
{
lr_error (ldfile, _("\
%s: `%s' mentioned twice in definition of weight %d in category `%s'"),
%s: `%s' mentioned twice in definition of weight %d"),
"LC_COLLATE", "position", cnt + 1);
}
}
@ -450,7 +447,13 @@ read_directions (struct linereader *ldfile, struct token *arg,
/* See whether we have to increment the counter. */
if (arg->tok != tok_comma && rules[cnt] != 0)
++cnt;
{
/* Add the default `forward' if we have seen only `position'. */
if (rules[cnt] == sort_position)
rules[cnt] = sort_position | sort_forward;
++cnt;
}
if (arg->tok == tok_eof || arg->tok == tok_eol)
/* End of line or file, so we exit the loop. */
@ -876,7 +879,7 @@ insert_value (struct linereader *ldfile, struct token *arg,
elem->nmbs = seq->nbytes;
}
if (elem->wcs == NULL && seq != ILLEGAL_CHAR_VALUE)
if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
{
uint32_t wcs[2] = { wc, 0 };
@ -1552,7 +1555,7 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
}
static inline int32_t
static int32_t
output_weight (struct obstack *pool, struct locale_collate_t *collate,
struct element_t *elem)
{
@ -1575,25 +1578,18 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate,
int len = 0;
int i;
/* Add the direction. */
obstack_1grow (pool, elem->section->rules[cnt]);
for (i = 0; i < elem->weights[cnt].cnt; ++i)
/* Encode the weight value. */
if (elem->weights[cnt].w[i] == NULL)
{
/* This entry was IGNORE. */
buf[len++] = IGNORE_CHAR;
}
else
/* Encode the weight value. We do nothing for IGNORE entries. */
if (elem->weights[cnt].w[i] != NULL)
len += utf8_encode (&buf[len],
elem->weights[cnt].w[i]->mborder[cnt]);
/* And add the buffer content. */
obstack_1grow (pool, len);
obstack_grow (pool, buf, len);
}
return retval;
return retval | ((elem->section->ruleidx & 0x7f) << 24);
}
@ -1611,11 +1607,13 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
int32_t tablemb[256];
struct obstack weightpool;
struct obstack extrapool;
struct obstack indirectpool;
struct section_list *sect;
int i;
obstack_init (&weightpool);
obstack_init (&extrapool);
obstack_init (&indirectpool);
data.magic = LIMAGIC (LC_COLLATE);
data.n = nelems;
@ -1629,7 +1627,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
cnt = 0;
assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
iov[2 + cnt].iov_base = &collate->nrules;
iov[2 + cnt].iov_base = &nrules;
iov[2 + cnt].iov_len = sizeof (uint32_t);
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt;
@ -1638,7 +1636,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
if (sect->ruleidx == i)
{
obstack_grow (&weightpool, sect->rules, nrules);
int j;
obstack_make_room (&weightpool, nrules);
for (j = 0; j < nrules; ++j)
obstack_1grow_fast (&weightpool, sect->rules[j]);
++i;
}
/* And align the output. */
@ -1674,7 +1677,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
&& collate->mbheads[ch]->nmbs == 1)
{
tablemb[ch] = output_weight (&weightpool, collate,
collate->mbheads[ch]);
collate->mbheads[ch]);
}
else
{
@ -1719,38 +1722,60 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
{
int i;
/* More than one consecutive entry. We mark this by having
a negative index into the weight table. */
weightidx = -weightidx;
/* Now add first the initial byte sequence. */
added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1)
+ __alignof__ (int32_t) - 1)
& ~(__alignof__ (int32_t) - 1));
obstack_make_room (&extrapool, added);
/* More than one consecutive entry. We mark this by having
a negative index into the indirect table. */
if (sizeof (int32_t) == sizeof (int))
obstack_int_grow_fast (&extrapool, weightidx);
obstack_int_grow_fast (&extrapool,
obstack_object_size (&indirectpool)
/ sizeof (int32_t));
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
obstack_1grow_fast (&extrapool, runp->section->ruleidx);
{
int32_t i = (obstack_object_size (&indirectpool)
/ sizeof (int32_t));
obstack_grow (&extrapool, &i, sizeof (int32_t));
}
obstack_1grow_fast (&extrapool, runp->nmbs - 1);
for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]);
/* Now find the end of the consecutive sequence. */
do
runp = runp->next;
while (runp->mbnext != NULL
&& runp->nmbs == runp->mbnext->nmbs
&& memcmp (runp->mbs, runp->mbnext->mbs,
runp->nmbs - 1) == 0
&& (runp->mbs[runp->nmbs - 1] + 1
== runp->mbnext->mbs[runp->nmbs - 1]));
/* Now find the end of the consecutive sequence and
add all the indices in the indirect pool. */
while (1)
{
if (sizeof (int32_t) == sizeof (int))
obstack_int_grow_fast (&extrapool, weightidx);
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
/* And add the end by sequence. Without length this time. */
runp = runp->next;
if (runp->mbnext == NULL
|| runp->nmbs != runp->mbnext->nmbs
|| memcmp (runp->mbs, runp->mbnext->mbs,
runp->nmbs - 1) != 0
|| (runp->mbs[runp->nmbs - 1] + 1
!= runp->mbnext->mbs[runp->nmbs - 1]))
break;
/* Insert the weight. */
weightidx = output_weight (&weightpool, collate, runp);
}
/* And add the end byte sequence. Without length this
time. */
for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]);
weightidx = output_weight (&weightpool, collate, runp);
if (sizeof (int32_t) == sizeof (int))
obstack_int_grow_fast (&extrapool, weightidx);
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
}
else
{
@ -1768,7 +1793,6 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
obstack_int_grow_fast (&extrapool, weightidx);
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
obstack_1grow_fast (&extrapool, runp->section->ruleidx);
obstack_1grow_fast (&extrapool, runp->nmbs - 1);
for (i = 1; i < runp->nmbs; ++i)
obstack_1grow_fast (&extrapool, runp->mbs[i]);
@ -1835,6 +1859,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt;
assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
++cnt;
assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
@ -1842,6 +1872,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
obstack_free (&weightpool, NULL);
obstack_free (&extrapool, NULL);
obstack_free (&indirectpool, NULL);
}
@ -2291,16 +2322,16 @@ error while adding equivalent collating symbol"));
uint32_t cnt;
/* This means we have exactly one rule: `forward'. */
if (collate->nrules > 1)
if (nrules > 1)
lr_error (ldfile, _("\
%s: invalid number of sorting rules"),
"LC_COLLATE");
else
collate->nrules = 1;
nrules = 1;
sp->rules = obstack_alloc (&collate->mempool,
(sizeof (enum coll_sort_rule)
* collate->nrules));
for (cnt = 0; cnt < collate->nrules; ++cnt)
* nrules));
for (cnt = 0; cnt < nrules; ++cnt)
sp->rules[cnt] = sort_forward;
/* Next line. */

View File

@ -3073,10 +3073,8 @@ Computing table size for character classes might take a while..."),
while (idx2 < ctype->map_collection_act[idx])
{
if (ctype->map_collection[idx][idx2] != 0)
*find_idx (ctype, &ctype->map32[idx],
&ctype->map_collection_max[idx],
&ctype->map_collection_act[idx],
ctype->names[idx2]) = ctype->map_collection[idx][idx2];
ctype->map32[idx][ctype->charnames[idx2]] =
ctype->map_collection[idx][idx2];
++idx2;
}
}

View File

@ -17,191 +17,106 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <alloca.h>
#include <errno.h>
#include <langinfo.h>
#include "localeinfo.h"
#ifndef STRING_TYPE
# error STRING_TYPE not defined
#endif
#ifndef USTRING_TYPE
# error USTRING_TYPE not defined
#endif
typedef struct weight_t
/* Find index of weight. */
static inline int32_t
findidx (const unsigned char **cpp)
{
struct weight_t *prev;
struct weight_t *next;
struct data_pair
{
int number;
const uint32_t *value;
} data[0];
} weight_t;
int_fast32_t i = table[*(*cpp)++];
const unsigned char *cp;
if (i >= 0)
/* This is an index into the weight table. Cool. */
return i;
/* The following five macros grant access to the values in the
collate locale file that do not depend on byte order. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
# define collate_nrules \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES))
# define collate_hash_size \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE))
# define collate_hash_layers \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS))
# define collate_undefined \
(_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_UNDEFINED_WC))
# define collate_rules \
((uint32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULES))
static __inline void get_weight (const STRING_TYPE **str, weight_t *result);
static __inline void
get_weight (const STRING_TYPE **str, weight_t *result)
#else
# define collate_nrules \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word
# define collate_hash_size \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word
# define collate_hash_layers \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word
# define collate_undefined \
current->values[_NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED_WC)].word
# define collate_rules \
((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULES)].string)
static __inline void get_weight (const STRING_TYPE **str, weight_t *result,
struct locale_data *current,
const uint32_t *__collate_tablewc,
const uint32_t *__collate_extrawc);
static __inline void
get_weight (const STRING_TYPE **str, weight_t *result,
struct locale_data *current, const uint32_t *__collate_tablewc,
const uint32_t *__collate_extrawc)
#endif
{
unsigned int ch = *((USTRING_TYPE *) (*str))++;
size_t slot;
if (sizeof (STRING_TYPE) == 1)
slot = ch * (collate_nrules + 1);
else
{
const size_t level_size = collate_hash_size * (collate_nrules + 1);
size_t level;
slot = (ch % collate_hash_size) * (collate_nrules + 1);
level = 0;
while (__collate_tablewc[slot] != (uint32_t) ch)
{
if (__collate_tablewc[slot + 1] == 0
|| ++level >= collate_hash_layers)
{
size_t idx = collate_undefined;
size_t cnt;
for (cnt = 0; cnt < collate_nrules; ++cnt)
{
result->data[cnt].number = __collate_extrawc[idx++];
result->data[cnt].value = &__collate_extrawc[idx];
idx += result->data[cnt].number;
}
/* The Unix standard requires that a character outside
the domain is signalled by setting `errno'. */
__set_errno (EINVAL);
return;
}
slot += level_size;
}
}
if (__collate_tablewc[slot + 1] != (uint32_t) FORWARD_CHAR)
{
/* We have a simple form. One value for each weight. */
size_t cnt;
for (cnt = 0; cnt < collate_nrules; ++cnt)
{
result->data[cnt].number = 1;
result->data[cnt].value = &__collate_tablewc[slot + 1 + cnt];
}
return;
}
/* We now look for any collation element which starts with CH.
There might be none, but the last list member is a catch-all case
because it is simply the character CH. The value of this entry
might be the same as UNDEFINED. */
slot = __collate_tablewc[slot + 2];
/* Oh well, more than one sequence starting with this byte.
Search for the correct one. */
cp = &extra[-i];
while (1)
{
size_t idx;
size_t nhere;
const unsigned char *usrc = *cpp;
/* This is a comparison between a uint32_t array (aka wchar_t) and
an 8-bit string. */
for (idx = 0; __collate_extrawc[slot + 2 + idx] != 0; ++idx)
if (__collate_extrawc[slot + 2 + idx] != (uint32_t) (*str)[idx])
break;
/* The first thing is the index. */
i = *((int32_t *) cp);
cp += sizeof (int32_t);
/* When the loop finished with all character of the collation
element used, we found the longest prefix. */
if (__collate_extrawc[slot + 2 + idx] == 0)
/* Next is the length of the byte sequence. These are always
short byte sequences so there is no reason to call any
function (even if they are inlined). */
nhere = *cp++;
if (i >= 0)
{
/* It is a single character. If it matches we found our
index. Note that at the end of each list there is an
entry of length zero which represents the single byte
sequence. The first (and here only) byte was tested
already. */
size_t cnt;
*str += idx;
idx += slot + 3;
for (cnt = 0; cnt < collate_nrules; ++cnt)
for (cnt = 0; cnt < nhere; ++cnt)
if (cp[cnt] != usrc[cnt])
break;
if (cnt == nhere)
{
result->data[cnt].number = __collate_extrawc[idx++];
result->data[cnt].value = &__collate_extrawc[idx];
idx += result->data[cnt].number;
/* Found it. */
*cpp += nhere;
return i;
}
return;
/* Up to the next entry. */
cp += nhere;
}
else
{
/* This is a range of characters. First decide whether the
current byte sequence lies in the range. */
size_t cnt;
size_t offset = 0;
/* To next entry in list. */
slot += __collate_extrawc[slot];
for (cnt = 0; cnt < nhere; ++cnt)
if (cp[cnt] != usrc[cnt])
break;
if (cnt != nhere)
{
if (cp[cnt] > usrc[cnt])
{
/* Cannot be in this range. */
cp += 2 * nhere;
continue;
}
/* Test against the end of the range. */
for (cnt = 0; cnt < nhere; ++cnt)
if (cp[nhere + cnt] != usrc[cnt])
break;
if (cnt != nhere && cp[nhere + cnt] < usrc[cnt])
{
/* Cannot be in this range. */
cp += 2 * nhere;
continue;
}
/* This range matches the next characters. Now find
the offset in the indirect table. */
for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt);
do
{
offset <<= 8;
offset += usrc[cnt] - cp[cnt];
}
while (++cnt < nhere);
}
*cpp += nhere;
return offset;
}
}
/* NOTREACHED */
return 0x43219876;
}
/* To process a string efficiently we retrieve all information about
the string at once. The following macro constructs a double linked
list of this information. It is a macro because we use `alloca'
and we use a double linked list because of the backward collation
order.
We have this strange extra macro since the functions which use the
given locale (not the global one) cannot use the global tables. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
# define call_get_weight(strp, newp) get_weight ((strp), (newp))
#else
# define call_get_weight(strp, newp) \
get_weight ((strp), (newp), current, collate_table, collate_extra)
#endif
#define get_string(str, forw, backw) \
do \
{ \
weight_t *newp; \
while (*str != '\0') \
{ \
newp = (weight_t *) alloca (sizeof (weight_t) \
+ (collate_nrules \
* sizeof (struct data_pair))); \
\
newp->prev = backw; \
if (backw == NULL) \
forw = newp; \
else \
backw->next = newp; \
newp->next = NULL; \
backw = newp; \
call_get_weight (&str, newp); \
} \
} \
while (0)

View File

@ -17,282 +17,397 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <langinfo.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifndef WIDE_VERSION
# define STRING_TYPE char
# define USTRING_TYPE unsigned char
# define L_(Ch) Ch
# ifdef USE_IN_EXTENDED_LOCALE_MODEL
# define STRXFRM __strxfrm_l
# else
# define STRXFRM strxfrm
# endif
# define STRLEN strlen
# define STPNCPY __stpncpy
#include "../locale/localeinfo.h"
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
# define STRXFRM __strxfrm_l
#else
# define STRXFRM strxfrm
#endif
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
#else
size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
#endif
/* These are definitions used by some of the functions for handling
UTF-8 encoding below. */
static const uint32_t encoding_mask[] =
{
if (n != 0)
STPNCPY (dest, src, n);
~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
};
return STRLEN (src);
}
#if 0
/* Include the shared helper functions. `strxfrm'/`wcsxfrm' also use
these functions. */
#include "../locale/weight.h"
#ifndef WIDE_VERSION
/* Write 32 bit value UTF-8 encoded but only if enough space is left. */
static __inline size_t
print_val (u_int32_t value, char *dest, size_t max, size_t act)
static const unsigned char encoding_byte[] =
{
char tmp[6];
int idx = 0;
0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
if (value < 0x80)
tmp[idx++] = (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x20)
tmp[idx++] = '\xc0' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x10)
tmp[idx++] = '\xe0' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x08)
tmp[idx++] = '\xf0' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
value >>= 6;
if (value < 0x04)
tmp[idx++] = '\xf8' + (char) value;
else
{
tmp[idx++] = '\x80' + (char) (value & 0x3f);
tmp[idx++] = '\xfc' + (char) (value >> 6);
}
}
}
}
}
while (idx-- > 0)
{
if (act < max)
dest[act] = tmp[idx];
++act;
}
return act;
}
#else
static __inline size_t
print_val (u_int32_t value, wchar_t *dest, size_t max, size_t act)
/* We need UTF-8 encoding of numbers. */
static inline int
utf8_encode (char *buf, int val)
{
/* We cannot really assume wchar_t is 32 bits wide. But it is for
GCC and so we don't do much optimization for the other case. */
if (sizeof (wchar_t) == 4)
char *startp = buf;
int retval;
if (val < 0x80)
{
if (act < max)
dest[act] = (wchar_t) value;
++act;
*buf++ = (char) val;
retval = 1;
}
else
{
wchar_t tmp[3];
size_t idx = 0;
int step;
if (value < 0x8000)
tmp[idx++] = (wchar_t) act;
else
for (step = 2; step < 6; ++step)
if ((val & encoding_mask[step - 2]) == 0)
break;
retval = step;
*buf = encoding_byte[step - 2];
--step;
do
{
tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
value >>= 14;
if (value < 0x2000)
tmp[idx++] = (wchar_t) (0xc000 + value);
else
{
tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
value >>= 14;
tmp[idx++] = (wchar_t) (0xe000 + value);
}
}
while (idx-- > 0)
{
if (act < max)
dest[act] = tmp[idx];
++act;
buf[step] = 0x80 | (val & 0x3f);
val >>= 6;
}
while (--step > 0);
*buf |= val;
}
return act;
return buf - startp;
}
#endif
/* Transform SRC into a form such that the result of strcmp
on two strings that have been transformed by strxfrm is
the same as the result of strcoll on the two strings before
their transformation. The transformed string is put in at
most N characters of DEST and its length is returned. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
STRXFRM (char *dest, const char *src, size_t n)
#else
size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
STRXFRM (char *dest, const char *src, size_t n, __locale_t l)
#endif
{
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
struct locale_data *current = l->__locales[LC_COLLATE];
# if BYTE_ORDER == BIG_ENDIAN
const u_int32_t *collate_table = (const u_int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].string;
const u_int32_t *collate_extra = (const u_int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].string;
# elif BYTE_ORDER == LITTLE_ENDIAN
const u_int32_t *collate_table = (const u_int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].string;
const u_int32_t *collate_extra = (const u_int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].string;
# else
# error bizarre byte order
# endif
uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
#else
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
#endif
weight_t *forw = NULL;
weight_t *backw = NULL;
size_t pass;
size_t written;
/* We don't assign the following values right away since it might be
unnecessary in case there are no rules. */
const unsigned char *rulesets;
const int32_t *table;
const unsigned char *weights;
const unsigned char *extra;
const int32_t *indirect;
uint_fast32_t pass;
size_t needed;
const unsigned char *usrc;
size_t srclen = strlen (src);
int32_t *idxarr;
unsigned char *rulearr;
size_t idxmax;
size_t idxcnt;
int use_malloc = 0;
/* If the current locale does not specify locale data we use normal
8-bit string comparison. */
if (collate_nrules == 0)
#include "../locale/weight.h"
if (nrules == 0)
{
if (n != 0)
STPNCPY (dest, src, n);
__stpncpy (dest, src, n);
return STRLEN (src);
return srclen;
}
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
rulesets = (const unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
table = (const int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLEMB)].string;
weights = (const unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB)].string;
extra = (const unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB)].string;
indirect = (const int32_t *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB)].string;
#else
rulesets = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
table = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
weights = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
extra = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
indirect = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
#endif
/* Handle an empty string as a special case. */
if (*src == '\0')
if (srclen == 0)
{
if (n != 0)
*dest = '\0';
*dest = '\0';
return 1;
}
/* Get full information about the string. This means we get
information for all passes in a special data structure. */
get_string (src, forw, backw);
/* We need the elements of the string as unsigned values since they
are used as indices. */
usrc = (const unsigned char *) src;
/* Now we have all the information. In at most the given number of
passes we can finally decide about the order. */
written = 0;
for (pass = 0; pass < collate_nrules; ++pass)
/* Perform the first pass over the string and while doing this find
and store the weights for each character. Since we want this to
be as fast as possible we are using `alloca' to store the temporary
values. But since there is no limit on the length of the string
we have to use `malloc' if the string is too long. We should be
very conservative here. */
if (srclen >= 16384)
{
int forward = (collate_rules[pass] & sort_forward) != 0;
const weight_t *run = forward ? forw : backw;
int idx = forward ? 0 : run->data[pass].number - 1;
idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
rulearr = (unsigned char *) &idxarr[srclen];
while (1)
if (idxarr == NULL)
/* No memory. Well, go with the stack then.
XXX Once this implementation is stable we will handle this
differently. Instead of precomputing the indices we will
compute them on demand. This means, though, that this happens for
every pass again. */
goto try_stack;
use_malloc = 1;
}
else
{
try_stack:
idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
rulearr = (unsigned char *) alloca (srclen);
}
idxmax = 0;
do
{
int32_t tmp = findidx (&usrc);
rulearr[idxmax] = tmp >> 24;
idxarr[idxmax] = tmp & 0x80ffffff;
++idxmax;
}
while (*usrc != '\0');
/* Now the passes over the weights. We now use the indices we found
before. */
needed = 0;
for (pass = 0; pass < nrules; ++pass)
{
size_t backw_stop = ~0ul;
int rule = rulesets[rulearr[0] * nrules + pass];
/* We assume that if a rule has defined `position' in one section
this is true for all of them. */
int position = rule & sort_position;
if (position == 0)
{
int ignore = 0;
u_int32_t w = 0;
/* Here we have to check for IGNORE entries. If these are
found we count them and go on with the next value. */
while (run != NULL
&& ((w = run->data[pass].value[idx])
== (u_int32_t) IGNORE_CHAR))
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
++ignore;
if (forward
? ++idx >= run->data[pass].number
: --idx < 0)
if ((rule & sort_forward) != 0)
{
weight_t *nextp = forward ? run->next : run->prev;
if (nextp == NULL)
size_t len;
if (backw_stop != ~0ul)
{
w = 0;
/* No more non-IGNOREd elements means lowest
possible value. */
ignore = -1;
/* Handle the pushed elements now. */
size_t backw;
for (backw = idxcnt - 1; backw >= backw_stop; --backw)
{
len = weights[idxarr[backw]++];
if (needed + len < n)
while (len-- > 0)
dest[needed++] = weights[idxarr[backw]++];
else
{
/* No more characters fit into the buffer. */
needed += len;
idxarr[backw] += len;
}
}
backw_stop = ~0ul;
}
/* Now handle the forward element. */
len = weights[idxarr[idxcnt]++];
if (needed + len < n)
while (len-- > 0)
dest[needed++] = weights[idxarr[idxcnt]++];
else
{
/* No more characters fit into the buffer. */
needed += len;
idxarr[idxcnt] += len;
}
}
else
{
/* Remember where the backwards series started. */
if (backw_stop == ~0ul)
backw_stop = idxcnt;
}
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
if (backw_stop != ~0ul)
{
/* Handle the pushed elements now. */
size_t backw;
for (backw = idxcnt - 1; backw >= backw_stop; --backw)
{
size_t len = weights[idxarr[backw]++];
if (needed + len < n)
while (len-- > 0)
dest[needed++] = weights[idxarr[backw]++];
else
{
/* No more characters fit into the buffer. */
needed += len;
idxarr[backw] += len;
}
}
}
}
else
{
int val = 1;
char buf[7];
size_t buflen;
size_t i;
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
if ((rule & sort_forward) != 0)
{
size_t len;
if (backw_stop != ~0ul)
{
/* Handle the pushed elements now. */
size_t backw;
for (backw = idxcnt - 1; backw >= backw_stop; --backw)
{
len = weights[idxarr[backw]++];
if (len != 0)
{
buflen = utf8_encode (buf, val);
if (needed + buflen + len < n)
{
for (i = 0; i < buflen; ++i)
dest[needed + i] = buf[i];
for (i = 0; i < len; ++i)
dest[needed + buflen + i] =
weights[idxarr[backw] + i];
}
idxarr[backw] += len;
needed += buflen + len;
val = 1;
}
else
++val;
}
backw_stop = ~0ul;
}
/* Now handle the forward element. */
len = weights[idxarr[idxcnt]++];
if (len != 0)
{
buflen = utf8_encode (buf, val);
if (needed + buflen + len < n)
{
for (i = 0; i < buflen; ++i)
dest[needed + i] = buf[i];
for (i = 0; i < len; ++i)
dest[needed + buflen + i] =
weights[idxarr[idxcnt] + i];
}
idxarr[idxcnt] += len;
needed += buflen + len;
val = 1;
}
else
idx = forward ? 0 : nextp->data[pass].number - 1;
run = nextp;
/* Note that we don't have to increment `idxarr[idxcnt]'
since the length is zero. */
++val;
}
else
{
/* Remember where the backwards series started. */
if (backw_stop == ~0ul)
backw_stop = idxcnt;
}
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
/* Stop if all characters are processed. */
if (run == NULL)
break;
/* Now we have information of the number of ignored weights
and the value of the next weight. We have to add 2
because 0 means EOS and 1 is the intermediate string end. */
if ((collate_rules[pass] & sort_position) != 0)
written = print_val (ignore + 2, dest, n, written);
if (w != 0)
written = print_val (w, dest, n, written);
/* We have to increment the index counters. */
if (forward)
if (backw_stop != ~0)
{
if (++idx >= run->data[pass].number)
/* Handle the pushed elements now. */
size_t backw;
for (backw = idxmax - 1; backw >= backw_stop; --backw)
{
run = run->next;
idx = 0;
}
}
else
{
if (--idx < 0)
{
run = run->prev;
if (run != NULL)
idx = run->data[pass].number - 1;
size_t len = weights[idxarr[backw]++];
if (len != 0)
{
buflen = utf8_encode (buf, val);
if (needed + buflen + len < n)
{
for (i = 0; i < buflen; ++i)
dest[needed + i] = buf[i];
for (i = 0; i < len; ++i)
dest[needed + buflen + i] =
weights[idxarr[backw] + i];
}
idxarr[backw] += len;
needed += buflen + len;
val = 1;
}
else
++val;
}
}
}
/* Write marker for end of word. */
if (pass + 1 < collate_nrules)
written = print_val (1, dest, n, written);
/* Finally store the byte to separate the passes or terminate
the string. */
if (needed < n)
dest[needed] = pass + 1 < nrules ? '\1' : '\0';
++needed;
}
/* Terminate string. */
if (written < n)
dest[written] = L_('\0');
/* This is a little optimization: many collation specifications have
a `position' rule at the end and if no non-ignored character
is found the last \1 byte is immediately followed by a \0 byte
signalling this. We can avoid the \1 byte(s). */
if (needed > 2 && dest[needed - 2] == '\1')
{
/* Remove the \1 byte. */
--needed;
dest[needed - 1] = '\0';
}
/* Return length without counting the terminating '\0'. */
return written;
/* Free the memory if needed. */
if (use_malloc)
free (idxarr);
return needed;
}
#endif

View File

@ -1,4 +1,4 @@
/* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc.
/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
@ -19,16 +19,23 @@
#include <wchar.h>
#define WIDE_VERSION 1
#define STRING_TYPE wchar_t
#define USTRING_TYPE wint_t
#define L_(Ch) L##Ch
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
# define STRXFRM __wcsxfrm_l
#else
# define STRXFRM wcsxfrm
#endif
#define STRLEN __wcslen
#define STPNCPY __wcpncpy
#include <string/strxfrm.c>
/* Placeholder wide-character transformation: no collation data is
consulted here. When N is non-zero, SRC is copied into DEST with
__wcpncpy (at most N wide characters); the return value is always
the length of SRC as reported by __wcslen. In the extended locale
model variant the locale argument L is accepted but not used. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t
STRXFRM (wchar_t *dest, const wchar_t *src, size_t n)
#else
size_t
STRXFRM (wchar_t *dest, const wchar_t *src, size_t n, __locale_t l)
#endif
{
/* Skip the copy entirely for a zero-sized destination. */
if (n != 0)
__wcpncpy (dest, src, n);
return __wcslen (src);
}

View File

@ -52,5 +52,5 @@ wctrans (const char *property)
/* We have to search the table. */
result = (int32_t *) _NL_CURRENT (LC_CTYPE, _NL_NUM_LC_CTYPE + cnt - 2);
return (wctrans_t) (result + 128);
return (wctrans_t) result;
}