419 lines
11 KiB
C
419 lines
11 KiB
C
/* Transliteration using the locale's data.
   Copyright (C) 2000, 2009 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 2000.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
|
|
|
|
#include <assert.h>
|
|
#include <dlfcn.h>
|
|
#include <search.h>
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
|
|
#include <bits/libc-lock.h>
|
|
#include "gconv_int.h"
|
|
#include "../locale/localeinfo.h"
|
|
|
|
|
|
int
|
|
__gconv_transliterate (struct __gconv_step *step,
|
|
struct __gconv_step_data *step_data,
|
|
void *trans_data __attribute__ ((unused)),
|
|
const unsigned char *inbufstart,
|
|
const unsigned char **inbufp,
|
|
const unsigned char *inbufend,
|
|
unsigned char **outbufstart, size_t *irreversible)
|
|
{
|
|
/* Find out about the locale's transliteration. */
|
|
uint_fast32_t size;
|
|
const uint32_t *from_idx;
|
|
const uint32_t *from_tbl;
|
|
const uint32_t *to_idx;
|
|
const uint32_t *to_tbl;
|
|
const uint32_t *winbuf;
|
|
const uint32_t *winbufend;
|
|
uint_fast32_t low;
|
|
uint_fast32_t high;
|
|
|
|
/* The input buffer. There are actually 4-byte values. */
|
|
winbuf = (const uint32_t *) *inbufp;
|
|
winbufend = (const uint32_t *) inbufend;
|
|
|
|
__gconv_fct fct = step->__fct;
|
|
#ifdef PTR_DEMANGLE
|
|
if (step->__shlib_handle != NULL)
|
|
PTR_DEMANGLE (fct);
|
|
#endif
|
|
|
|
/* If there is no transliteration information in the locale don't do
|
|
anything and return the error. */
|
|
size = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_TAB_SIZE);
|
|
if (size == 0)
|
|
goto no_rules;
|
|
|
|
/* Get the rest of the values. */
|
|
from_idx =
|
|
(const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_IDX);
|
|
from_tbl =
|
|
(const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_TBL);
|
|
to_idx =
|
|
(const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_IDX);
|
|
to_tbl =
|
|
(const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_TBL);
|
|
|
|
/* Test whether there is enough input. */
|
|
if (winbuf + 1 > winbufend)
|
|
return (winbuf == winbufend
|
|
? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
|
|
|
|
/* The array starting at FROM_IDX contains indeces to the string table
|
|
in FROM_TBL. The indeces are sorted wrt to the strings. I.e., we
|
|
are doing binary search. */
|
|
low = 0;
|
|
high = size;
|
|
while (low < high)
|
|
{
|
|
uint_fast32_t med = (low + high) / 2;
|
|
uint32_t idx;
|
|
int cnt;
|
|
|
|
/* Compare the string at this index with the string at the current
|
|
position in the input buffer. */
|
|
idx = from_idx[med];
|
|
cnt = 0;
|
|
do
|
|
{
|
|
if (from_tbl[idx + cnt] != winbuf[cnt])
|
|
/* Does not match. */
|
|
break;
|
|
++cnt;
|
|
}
|
|
while (from_tbl[idx + cnt] != L'\0' && winbuf + cnt < winbufend);
|
|
|
|
if (cnt > 0 && from_tbl[idx + cnt] == L'\0')
|
|
{
|
|
/* Found a matching input sequence. Now try to convert the
|
|
possible replacements. */
|
|
uint32_t idx2 = to_idx[med];
|
|
|
|
do
|
|
{
|
|
/* Determine length of replacement. */
|
|
uint_fast32_t len = 0;
|
|
int res;
|
|
const unsigned char *toinptr;
|
|
unsigned char *outptr;
|
|
|
|
while (to_tbl[idx2 + len] != L'\0')
|
|
++len;
|
|
|
|
/* Try this input text. */
|
|
toinptr = (const unsigned char *) &to_tbl[idx2];
|
|
outptr = *outbufstart;
|
|
res = DL_CALL_FCT (fct,
|
|
(step, step_data, &toinptr,
|
|
(const unsigned char *) &to_tbl[idx2 + len],
|
|
&outptr, NULL, 0, 0));
|
|
if (res != __GCONV_ILLEGAL_INPUT)
|
|
{
|
|
/* If the conversion succeeds we have to increment the
|
|
input buffer. */
|
|
if (res == __GCONV_EMPTY_INPUT)
|
|
{
|
|
*inbufp += cnt * sizeof (uint32_t);
|
|
++*irreversible;
|
|
res = __GCONV_OK;
|
|
}
|
|
/* Do not increment the output pointer if we could not
|
|
store the entire output. */
|
|
if (res != __GCONV_FULL_OUTPUT)
|
|
*outbufstart = outptr;
|
|
|
|
return res;
|
|
}
|
|
|
|
/* Next replacement. */
|
|
idx2 += len + 1;
|
|
}
|
|
while (to_tbl[idx2] != L'\0');
|
|
|
|
/* Nothing found, continue searching. */
|
|
}
|
|
else if (cnt > 0)
|
|
/* This means that the input buffer contents matches a prefix of
|
|
an entry. Since we cannot match it unless we get more input,
|
|
we will tell the caller about it. */
|
|
return __GCONV_INCOMPLETE_INPUT;
|
|
|
|
if (winbuf + cnt >= winbufend || from_tbl[idx + cnt] < winbuf[cnt])
|
|
low = med + 1;
|
|
else
|
|
high = med;
|
|
}
|
|
|
|
no_rules:
|
|
/* Maybe the character is supposed to be ignored. */
|
|
if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN) != 0)
|
|
{
|
|
int n = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN);
|
|
const uint32_t *ranges =
|
|
(const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE);
|
|
const uint32_t wc = *(const uint32_t *) (*inbufp);
|
|
int i;
|
|
|
|
/* Test whether there is enough input. */
|
|
if (winbuf + 1 > winbufend)
|
|
return (winbuf == winbufend
|
|
? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
|
|
|
|
for (i = 0; i < n; ranges += 3, ++i)
|
|
if (ranges[0] <= wc && wc <= ranges[1]
|
|
&& (wc - ranges[0]) % ranges[2] == 0)
|
|
{
|
|
/* Matches the range. Ignore it. */
|
|
*inbufp += 4;
|
|
++*irreversible;
|
|
return __GCONV_OK;
|
|
}
|
|
else if (wc < ranges[0])
|
|
/* There cannot be any other matching range since they are
|
|
sorted. */
|
|
break;
|
|
}
|
|
|
|
/* One last chance: use the default replacement. */
|
|
if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN) != 0)
|
|
{
|
|
const uint32_t *default_missing = (const uint32_t *)
|
|
_NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING);
|
|
const unsigned char *toinptr = (const unsigned char *) default_missing;
|
|
uint32_t len = _NL_CURRENT_WORD (LC_CTYPE,
|
|
_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN);
|
|
unsigned char *outptr;
|
|
int res;
|
|
|
|
/* Test whether there is enough input. */
|
|
if (winbuf + 1 > winbufend)
|
|
return (winbuf == winbufend
|
|
? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
|
|
|
|
outptr = *outbufstart;
|
|
res = DL_CALL_FCT (fct,
|
|
(step, step_data, &toinptr,
|
|
(const unsigned char *) (default_missing + len),
|
|
&outptr, NULL, 0, 0));
|
|
|
|
if (res != __GCONV_ILLEGAL_INPUT)
|
|
{
|
|
/* If the conversion succeeds we have to increment the
|
|
input buffer. */
|
|
if (res == __GCONV_EMPTY_INPUT)
|
|
{
|
|
/* This worked but is not reversible. */
|
|
++*irreversible;
|
|
*inbufp += 4;
|
|
res = __GCONV_OK;
|
|
}
|
|
*outbufstart = outptr;
|
|
|
|
return res;
|
|
}
|
|
}
|
|
|
|
/* Haven't found a match. */
|
|
return __GCONV_ILLEGAL_INPUT;
|
|
}
|
|
|
|
|
|
/* Structure to represent results of found (or not) transliteration
|
|
modules. */
|
|
struct known_trans
|
|
{
|
|
/* This structure must remain the first member. */
|
|
struct trans_struct info;
|
|
|
|
char *fname;
|
|
void *handle;
|
|
int open_count;
|
|
};
|
|
|
|
|
|
/* Tree with results of previous calls to __gconv_translit_find. */
|
|
static void *search_tree;
|
|
|
|
/* We modify global data. */
|
|
__libc_lock_define_initialized (static, lock);
|
|
|
|
|
|
/* Compare two transliteration entries. */
|
|
static int
|
|
trans_compare (const void *p1, const void *p2)
|
|
{
|
|
const struct known_trans *s1 = (const struct known_trans *) p1;
|
|
const struct known_trans *s2 = (const struct known_trans *) p2;
|
|
|
|
return strcmp (s1->info.name, s2->info.name);
|
|
}
|
|
|
|
|
|
/* Open (maybe reopen) the module named in the struct. Get the function
|
|
and data structure pointers we need. */
|
|
static int
|
|
open_translit (struct known_trans *trans)
|
|
{
|
|
__gconv_trans_query_fct queryfct;
|
|
|
|
trans->handle = __libc_dlopen (trans->fname);
|
|
if (trans->handle == NULL)
|
|
/* Not available. */
|
|
return 1;
|
|
|
|
/* Find the required symbol. */
|
|
queryfct = __libc_dlsym (trans->handle, "gconv_trans_context");
|
|
if (queryfct == NULL)
|
|
{
|
|
/* We cannot live with that. */
|
|
close_and_out:
|
|
__libc_dlclose (trans->handle);
|
|
trans->handle = NULL;
|
|
return 1;
|
|
}
|
|
|
|
/* Get the context. */
|
|
if (queryfct (trans->info.name, &trans->info.csnames, &trans->info.ncsnames)
|
|
!= 0)
|
|
goto close_and_out;
|
|
|
|
/* Of course we also have to have the actual function. */
|
|
trans->info.trans_fct = __libc_dlsym (trans->handle, "gconv_trans");
|
|
if (trans->info.trans_fct == NULL)
|
|
goto close_and_out;
|
|
|
|
/* Now the optional functions. */
|
|
trans->info.trans_init_fct =
|
|
__libc_dlsym (trans->handle, "gconv_trans_init");
|
|
trans->info.trans_context_fct =
|
|
__libc_dlsym (trans->handle, "gconv_trans_context");
|
|
trans->info.trans_end_fct =
|
|
__libc_dlsym (trans->handle, "gconv_trans_end");
|
|
|
|
trans->open_count = 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
internal_function
|
|
__gconv_translit_find (struct trans_struct *trans)
|
|
{
|
|
struct known_trans **found;
|
|
const struct path_elem *runp;
|
|
int res = 1;
|
|
|
|
/* We have to have a name. */
|
|
assert (trans->name != NULL);
|
|
|
|
/* Acquire the lock. */
|
|
__libc_lock_lock (lock);
|
|
|
|
/* See whether we know this module already. */
|
|
found = __tfind (trans, &search_tree, trans_compare);
|
|
if (found != NULL)
|
|
{
|
|
/* Is this module available? */
|
|
if ((*found)->handle != NULL)
|
|
{
|
|
/* Maybe we have to reopen the file. */
|
|
if ((*found)->handle != (void *) -1)
|
|
/* The object is not unloaded. */
|
|
res = 0;
|
|
else if (open_translit (*found) == 0)
|
|
{
|
|
/* Copy the data. */
|
|
*trans = (*found)->info;
|
|
(*found)->open_count++;
|
|
res = 0;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
size_t name_len = strlen (trans->name) + 1;
|
|
int need_so = 0;
|
|
struct known_trans *newp;
|
|
|
|
/* We have to continue looking for the module. */
|
|
if (__gconv_path_elem == NULL)
|
|
__gconv_get_path ();
|
|
|
|
/* See whether we have to append .so. */
|
|
if (name_len <= 4 || memcmp (&trans->name[name_len - 4], ".so", 3) != 0)
|
|
need_so = 1;
|
|
|
|
/* Create a new entry. */
|
|
newp = (struct known_trans *) malloc (sizeof (struct known_trans)
|
|
+ (__gconv_max_path_elem_len
|
|
+ name_len + 3)
|
|
+ name_len);
|
|
if (newp != NULL)
|
|
{
|
|
char *cp;
|
|
|
|
/* Clear the struct. */
|
|
memset (newp, '\0', sizeof (struct known_trans));
|
|
|
|
/* Store a copy of the module name. */
|
|
newp->info.name = cp = (char *) (newp + 1);
|
|
cp = __mempcpy (cp, trans->name, name_len);
|
|
|
|
newp->fname = cp;
|
|
|
|
/* Search in all the directories. */
|
|
for (runp = __gconv_path_elem; runp->name != NULL; ++runp)
|
|
{
|
|
cp = __mempcpy (__stpcpy ((char *) newp->fname, runp->name),
|
|
trans->name, name_len);
|
|
if (need_so)
|
|
memcpy (cp, ".so", sizeof (".so"));
|
|
|
|
if (open_translit (newp) == 0)
|
|
{
|
|
/* We found a module. */
|
|
res = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (res)
|
|
newp->fname = NULL;
|
|
|
|
/* In any case we'll add the entry to our search tree. */
|
|
if (__tsearch (newp, &search_tree, trans_compare) == NULL)
|
|
{
|
|
/* Yickes, this should not happen. Unload the object. */
|
|
res = 1;
|
|
/* XXX unload here. */
|
|
}
|
|
}
|
|
}
|
|
|
|
__libc_lock_unlock (lock);
|
|
|
|
return res;
|
|
}
|