/* glibc/locale/locfile-lex.c */

/* Copyright (C) 1995 Free Software Foundation, Inc.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Library General Public License for more details.

You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB.  If
not, write to the Free Software Foundation, Inc., 675 Mass Ave,
Cambridge, MA 02139, USA.  */

#include <ctype.h>
#include <langinfo.h>
#include <libintl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "localedef.h"
#include "token.h"


/* Include the hashing table for the keywords.  `in_word_set' is the
   lookup routine the lexer calls with the start of a candidate
   identifier and its length; a NULL result means the text is not a
   keyword.  It is declared here ahead of the include; "keyword.h"
   presumably supplies the definition -- TODO confirm.  */
const struct locale_keyword* in_word_set (register const char *str,
                                          register int len);
#include "keyword.h"


/* Contains the status of reading the locale definition file.  */
struct locfile_data locfile_data;

/* Flag consulted by `get_char' when a symbolic character name cannot be
   resolved: the "character not defined" diagnostic is printed only
   while this flag is nonzero (and `verbose' is set).  It is meant to be
   cleared while collation input is read, since that is the only place
   where element names besides the ones defined in the character map are
   allowed and no error message must be given for them.  */
int reject_new_char = 1;

/* Prototypes for local functions.  */
static int get_char (void);


/* Shorthand for the global lexer state used throughout this file.  */
#define LD locfile_data

/* Open the locale definition file and initialize the status data
   structure for the following calls of `locfile_lex'.

   FNAME is the name of the file to read; it is attached to `stdin' with
   `freopen'.  If FNAME is NULL the already open `stdin' is used and
   reported as "<stdin>" in messages.  A file that cannot be opened is
   reported through `error' with status 4.

   The line buffer and the string buffer are both allocated with
   LD.bufsize bytes, where LD.bufsize is the LINE_MAX value of the
   system.  */
void
locfile_open (const char *fname)
{
  /* Used when the system cannot tell us LINE_MAX.  2048 is the value
     POSIX guarantees as the minimum (_POSIX2_LINE_MAX).  */
  enum { FALLBACK_LINE_MAX = 2048 };
  long int line_max;

  if (fname == NULL)
    /* We read from stdin.  */
    LD.filename = "<stdin>";
  else
    {
      if (freopen (fname, "r", stdin) == NULL)
	error (4, 0, gettext ("input file `%s' not found"), fname);
      LD.filename = fname;
    }

  /* Set default values.  */
  LD.escape_char = '\\';
  LD.comment_char = '#';

  /* `sysconf' returns -1 when the limit is indeterminate or the name is
     not supported.  Using that value as a buffer size would request an
     absurd allocation and hand a bogus size to `fgets', so substitute
     the guaranteed minimum instead.  */
  line_max = sysconf (_SC_LINE_MAX);
  if (line_max <= 0)
    line_max = FALLBACK_LINE_MAX;

  LD.bufsize = line_max;
  LD.buf = (char *) xmalloc (LD.bufsize);
  LD.strbuf = (char *) xmalloc (LD.bufsize);

  LD.buf_ptr = LD.returned_tokens = LD.line_no = 0;

  /* Now signal that we want to read a line immediately: an empty buffer
     together with a set continuation flag makes `locfile_lex' fetch the
     first line on its first call.  */
  LD.continue_line = 1;
  LD.buf[LD.buf_ptr] = '\0';
}


/* Like `locfile_lex', but treat the end of the input as a fatal error.
   TOKEN and TOKEN_LEN are passed through unchanged; every token code
   other than zero is handed back to the caller as is.  */
int
xlocfile_lex (char **token, int *token_len)
{
  const int tok = locfile_lex (token, token_len);

  /* Anything but zero is a real token.  */
  if (tok != 0)
    return tok;

  /* Zero means end of file, which the caller did not expect here.  */
  error (4, 0, gettext ("%s: unexpected end of file in locale defintion "
			"file"), locfile_data.filename);

  return tok;
}

/* Return the next token of the locale definition file.

   The return value is the token code:
     0              end of file;
     TOK_ENDOFLINE  end of a logical line;
     TOK_IDENT      identifier that is not a keyword, *TOKEN points to
		    it in the line buffer and *TOKEN_LEN is its length;
     keyword id     the `token_id' of a keyword found by `in_word_set',
		    *TOKEN and *TOKEN_LEN as for TOK_IDENT;
     TOK_STRING     quoted string, *TOKEN points to the expanded text in
		    LD.strbuf and *TOKEN_LEN is its length in bytes;
     TOK_ELLIPSIS   the sequence `...';
     TOK_NUMBER     decimal number, its value is stored in *TOKEN_LEN;
     TOK_MINUS1     the sequence `-1';
     TOK_CHAR       single or symbolic character, its value is stored
		    in *TOKEN_LEN;
     TOK_ILL_CHAR   symbolic character for which `get_char' returned -1.
   For an escape sequence (escape character followed by `x', `d', octal
   digits or a letter) the value of the sequence itself is returned as
   the token code.

   The `escape_char' and `comment_char' keywords are consumed here and
   never handed to the caller.  */
int
locfile_lex (char **token, int *token_len)
{
  /* Largest number of bytes `char_to_utf' stores for one character.  */
  enum { UTF_MAX_BYTES = 6 };
  int start_again;
  int retval = 0;

  do
    {
      int start_ptr;

      start_again = 0;

      /* Read the next line.  Skip over empty lines and comments.  The
	 bounds test has to come first: `ignore_to_eol' and the
	 end-of-line case below park LD.buf_ptr at LD.bufsize, which is
	 one past the last element of LD.buf and must not be read.  */
      if (LD.buf_ptr >= LD.bufsize
	  || (LD.buf[LD.buf_ptr] == '\0' && LD.continue_line != 0)
	  || (posix_conformance == 0 && LD.buf[LD.buf_ptr] == LD.comment_char))
	do
	  {
	    size_t linelen;

	    LD.buf_ptr = 0;

	    if (fgets (LD.buf, LD.bufsize, stdin) == NULL)
	      {
		/* Leave an empty line behind so that later calls find
		   nothing to tokenize in the stale buffer.  */
		LD.buf[0] = '\0';
		return 0;
	      }

	    /* Increment line number counter.  */
	    ++LD.line_no;

	    /* We now have to look whether this line is continued and
	       whether it at all fits into our buffer.  */
	    linelen = strlen (LD.buf);

	    if (linelen == LD.bufsize - 1)
	      /* The line did not fit into the buffer.  */
	      error (2, 0, gettext ("%s:%Zd: line too long;  use "
				    "`getconf LINE_MAX' to get the maximum "
				    "line length"), LD.filename, LD.line_no);

	    /* Remove '\n' at end of line.  LINELEN is zero when the line
	       starts with a NUL byte, so it must be tested before the
	       last character is looked at.  */
	    if (linelen > 0 && LD.buf[linelen - 1] == '\n')
	      LD.buf[--linelen] = '\0';

	    if (linelen > 0 && LD.buf[linelen - 1] == LD.escape_char)
	      {
		LD.buf[--linelen] = '\0';
		LD.continue_line = 1;
	      }
	    else
	      LD.continue_line = 0;

	    /* The <ctype.h> functions need a value in the range of
	       `unsigned char'; plain `char' may be negative.  */
	    while (isspace ((unsigned char) LD.buf[LD.buf_ptr]))
	      ++LD.buf_ptr;

	    /* White space before a comment is accepted, but reported.
	       NOTE(review): the message is given when POSIX conformance
	       is *not* requested -- confirm that this is the intended
	       direction of the test.  */
	    if (posix_conformance == 0
		&& LD.buf[LD.buf_ptr] == LD.comment_char
		&& LD.buf_ptr != 0)
	      error (0, 0, gettext ("%s:%Zd: comment does not start in "
				    "column 1"), LD.filename, LD.line_no);
	  }
	while (LD.buf[LD.buf_ptr] == '\0'
	       || LD.buf[LD.buf_ptr] == LD.comment_char);


      /* Get information for return values.  */
      *token = LD.buf + LD.buf_ptr;
      start_ptr = LD.buf_ptr;

      /* If no further character is in the line this is the end of a logical
	 line.  This information is needed in the parser.  */
      if (LD.buf[LD.buf_ptr] == '\0')
	{
	  LD.buf_ptr = LD.bufsize;
	  retval = TOK_ENDOFLINE;
	}
      else if (isalpha ((unsigned char) LD.buf[LD.buf_ptr]))
	/* The token is an identifier.  The POSIX standard does not say
	   what characters might be contained but official POSIX locale
	   definition files contain beside alnum characters '_', '-' and
	   '+'.  */
	{
	  const struct locale_keyword *kw;

	  do
	    ++LD.buf_ptr;
	  while (isalnum ((unsigned char) LD.buf[LD.buf_ptr])
		 || LD.buf[LD.buf_ptr] == '_'
		 || LD.buf[LD.buf_ptr] == '-' || LD.buf[LD.buf_ptr] == '+');

	  /* Look in table of keywords.  */
	  kw = in_word_set (*token, LD.buf_ptr - start_ptr);
	  if (kw == NULL)
	    retval = TOK_IDENT;
	  else if (kw->token_id == TOK_ESCAPE_CHAR
		   || kw->token_id == TOK_COMMENT_CHAR)
	    /* `escape_char' and `comment_char' are keywords for the
	       lexer.  Do not give them to the parser.  */
	    {
	      /* The keyword must be followed by white space and, in
		 POSIX mode, must come before any other token.  */
	      int malformed
		= (!isspace ((unsigned char) LD.buf[LD.buf_ptr])
		   || (posix_conformance && LD.returned_tokens > 0));

	      start_again = 1;

	      /* Skip the separator and the white space after it, but
		 never step over the terminating NUL of the line.  */
	      if (LD.buf[LD.buf_ptr] != '\0')
		{
		  do
		    ++LD.buf_ptr;
		  while (isspace ((unsigned char) LD.buf[LD.buf_ptr]));
		}

	      /* A conditional expression is not an lvalue in C, so the
		 variable to set is selected with an explicit branch.
		 A missing argument leaves the current setting alone.  */
	      if (LD.buf[LD.buf_ptr] == '\0')
		malformed = 1;
	      else if (kw->token_id == TOK_ESCAPE_CHAR)
		LD.escape_char = LD.buf[LD.buf_ptr++];
	      else
		LD.comment_char = LD.buf[LD.buf_ptr++];

	      if (malformed)
		error (0, 0, gettext ("%s:%Zd: syntax error in locale "
				      "definition file"),
		       LD.filename, LD.line_no);

	      ignore_to_eol (0, posix_conformance);
	    }
	  else
	    /* It is one of the normal keywords.  */
	    retval = kw->token_id;

	  *token_len = LD.buf_ptr - start_ptr;
	}
      else if (LD.buf[LD.buf_ptr] == '"')
	/* Read a string.  All symbolic character descriptions are expanded.
	   This has to be done in a local buffer because a simple symbolic
	   character like <A> may expand to up to 6 bytes.  */
	{
	  char *last = LD.strbuf;

	  ++LD.buf_ptr;
	  while (LD.buf[LD.buf_ptr] != '"')
	    {
	      int pre = LD.buf_ptr;
	      int char_val = get_char ();

	      if (char_val == 0)
		{
		  error (4, 0, gettext ("%s:%Zd: unterminated string at end "
					"of line"), LD.filename, LD.line_no);
		  /* NOTREACHED */
		}

	      if (char_val > 0)
		{
		  /* Encode into a scratch area first: the expansion can
		     be longer than the text it replaces, so the result
		     has to be checked against the LD.bufsize bytes of
		     LD.strbuf (one of them reserved for the NUL).  */
		  char enc[UTF_MAX_BYTES];
		  int enc_len = char_to_utf (enc, char_val);

		  if ((size_t) (last - LD.strbuf) + enc_len
		      >= (size_t) LD.bufsize)
		    {
		      error (4, 0, gettext ("%s:%Zd: string too long"),
			     LD.filename, LD.line_no);
		      /* NOTREACHED */
		    }

		  memcpy (last, enc, enc_len);
		  last += enc_len;
		}
	      else
		{
		  /* Unknown characters are simply not stored.  */
		  char tmp[LD.buf_ptr - pre + 1];
		  memcpy (tmp, &LD.buf[pre], LD.buf_ptr - pre);
		  tmp[LD.buf_ptr - pre] = '\0';
		  error (0, 0, gettext ("%s:%Zd: character `%s' not defined"),
			 LD.filename, LD.line_no, tmp);
		}
	    }
	  if (LD.buf[LD.buf_ptr] != '\0')
	    ++LD.buf_ptr;

	  *last = '\0';
	  *token = LD.strbuf;
	  *token_len = last  - LD.strbuf;
	  retval = TOK_STRING;
	}
      else if (LD.buf[LD.buf_ptr] == '.' && LD.buf[LD.buf_ptr + 1] == '.'
	       && LD.buf[LD.buf_ptr + 2] == '.')
	{
	  LD.buf_ptr += 3;
	  retval = TOK_ELLIPSIS;
	}
      else if (LD.buf[LD.buf_ptr] == LD.escape_char)
	/* An escape sequence.  Its value is returned as the token code.
	   Numeric forms need at least two digits and a value of at most
	   255; otherwise the character after the escape character is
	   returned literally.  */
	{
	  char *endp;

	  ++LD.buf_ptr;
	  switch (LD.buf[LD.buf_ptr])
	    {
	    case 'x':
	      /* The number is hexadecimal, so its first digit may be a
		 letter as well.  */
	      if (isxdigit ((unsigned char) LD.buf[++LD.buf_ptr]))
		{
		  retval = strtol (&LD.buf[LD.buf_ptr], &endp, 16);
		  if (endp - (LD.buf + LD.buf_ptr) < 2 || retval > 255)
		    retval = 'x';
		  else
		    LD.buf_ptr = endp - LD.buf;
		}
	      else
		retval = 'x';
	      break;
	    case 'd':
	      if (isdigit ((unsigned char) LD.buf[++LD.buf_ptr]))
		{
		  retval = strtol (&LD.buf[LD.buf_ptr], &endp, 10);
		  if (endp - (LD.buf + LD.buf_ptr) < 2 || retval > 255)
		    retval = 'd';
		  else
		    LD.buf_ptr = endp - LD.buf;
		}
	      else
		retval = 'd';
	      break;
	    case '0'...'9':
	      retval = strtol (&LD.buf[LD.buf_ptr], &endp, 8);
	      if (endp - (LD.buf + LD.buf_ptr) < 2 || retval > 255)
		retval = LD.buf[LD.buf_ptr++];
	      else
		LD.buf_ptr = endp - LD.buf;
	      break;
	    case 'a':
	      retval = '\a';
	      ++LD.buf_ptr;
	      break;
	    case 'b':
	      retval = '\b';
	      ++LD.buf_ptr;
	      break;
	    case 'f':
	      retval = '\f';
	      ++LD.buf_ptr;
	      break;
	    case 'n':
	      retval = '\n';
	      ++LD.buf_ptr;
	      break;
	    case 'r':
	      retval = '\r';
	      ++LD.buf_ptr;
	      break;
	    case 't':
	      retval = '\t';
	      ++LD.buf_ptr;
	      break;
	    case 'v':
	      retval = '\v';
	      ++LD.buf_ptr;
	      break;
	    default:
	      retval = LD.buf[LD.buf_ptr++];
	      break;
	    }
	}
      else if (isdigit ((unsigned char) LD.buf[LD.buf_ptr]))
	{
	  char *endp;

	  /* The value of the number travels in *TOKEN_LEN.  */
	  *token_len = strtol (&LD.buf[LD.buf_ptr], &endp, 10);
	  LD.buf_ptr = endp - LD.buf;
	  retval = TOK_NUMBER;
	}
      else if (LD.buf[LD.buf_ptr] == '-' && LD.buf[LD.buf_ptr + 1] == '1')
	{
	  LD.buf_ptr += 2;
	  retval = TOK_MINUS1;
	}
      else
	{
	  int ch = get_char ();
	  if (ch != -1)
	    {
	      *token_len = ch;
	      retval = TOK_CHAR;
	    }
	  else
	    retval = TOK_ILL_CHAR;
	}

      /* Ignore white space.  LD.buf_ptr may have been parked at
	 LD.bufsize above or by `ignore_to_eol'; there is nothing to
	 skip in that case and the buffer must not be read there.  */
      while (LD.buf_ptr < LD.bufsize
	     && isspace ((unsigned char) LD.buf[LD.buf_ptr]))
	++LD.buf_ptr;
    }
  while (start_again != 0);

  ++LD.returned_tokens;
  return retval;
}


/* Store the character CHAR_VAL at BUF and return the number of bytes
   written.  If the character map has only single-byte characters the
   value is stored as one byte.  Otherwise it is coded with UTF-8 using
   between one and six bytes: every byte after the first carries six
   bits of the value below the marker 0x80, the first byte carries the
   remaining bits below a marker which depends on the length.  No
   terminating NUL is written.  */
int
char_to_utf (char *buf, int char_val)
{
/* The number of bits coded in each character.  */
#define CBPC 6
  /* For every length, starting with one byte: the largest value which
     still fits and the marker bits of the leading byte.  */
  static const struct utf_form
    {
      int limit;
      int lead;
    }
  forms[] =
    {
      { 0x7f,       0x00 },
      { 0x7ff,      0xc0 },
      { 0xffff,     0xe0 },
      { 0x1fffff,   0xf0 },
      { 0x3ffffff,  0xf8 },
      { 0x7fffffff, 0xfc }
    };
  int length;
  int idx;

  if (charmap_data.mb_cur_max == 1)
    {
      buf[0] = char_val;
      return 1;
    }

  /* Find the shortest form which can hold the value.  The last limit
     is the largest `int' value used in the table, so the search always
     ends inside it.  */
  length = 1;
  while (char_val > forms[length - 1].limit)
    ++length;

  /* Fill in the trailing bytes from the end towards the front, taking
     the low bits of the value first.  */
  for (idx = length - 1; idx >= 1; --idx)
    {
      buf[idx] = 0x80 | (char_val & ((1 << CBPC) - 1));
      char_val >>= CBPC;
    }

  /* What is left of the value goes into the leading byte.  */
  buf[0] = forms[length - 1].lead | char_val;

  return length;
}


/* Ignore the rest of the logical line up to the ENDOFLINE token,
   starting with the given token.

   TOKEN is the token the caller has just read; if it already is
   TOK_ENDOFLINE there is nothing to skip and the function returns at
   once.  If WARN_FLAG is set and unread characters remain on the
   current line a "trailing garbage" message is given.

   All continuation lines belonging to the current logical line are read
   and thrown away.  On return LD.buf_ptr is set to LD.bufsize, which
   makes `locfile_lex' begin with a fresh line.  */
void
ignore_to_eol (int token, int warn_flag)
{
  if (token == TOK_ENDOFLINE)
    return;

  /* Only look into the buffer when the position really is inside it.  */
  if (warn_flag
      && LD.buf_ptr < LD.bufsize
      && LD.buf[LD.buf_ptr] != '\0')
    error (0, 0, gettext ("%s:%Zd: trailing garbage at end of line"),
	   LD.filename, LD.line_no);

  while (LD.continue_line)
    {
      LD.continue_line = 0;

      if (fgets (LD.buf, LD.bufsize, stdin) != NULL)
	{
	  /* We now have to look whether this line is continued and
	     whether it at all fits into our buffer.  */
	  size_t linelen = strlen (LD.buf);

	  /* Count the line only once it has really been read.  */
	  ++LD.line_no;

	  if (linelen == LD.bufsize - 1)
	    /* The line did not fit into the buffer.  */
	    error (2, 0, gettext ("%s:%Zd: line too long;  use `getconf "
				  "LINE_MAX' to get the current maximum "
				  "line length"), LD.filename, LD.line_no);

	  /* Remove '\n' at end of line.  */
	  if (linelen > 0 && LD.buf[linelen - 1] == '\n')
	    --linelen;

	  /* An empty continuation line leaves LINELEN at zero; there is
	     no last character to inspect then.  */
	  if (linelen > 0 && LD.buf[linelen - 1] == LD.escape_char)
	    LD.continue_line = 1;
	}
    }

  /* This causes to begin the next line.  */
  LD.buf_ptr = LD.bufsize;
}


/* Return the value of the character at the current position of the
   input buffer and advance the position behind it.

   A character other than `<' is returned as it is.  NOTE(review): the
   value is converted from plain `char', so where `char' is signed a
   byte above 0x7f comes back negative and 0xff cannot be told from the
   -1 described below -- confirm whether callers depend on this.

   `<' starts a symbolic character name which extends to the next `>'.
   Inside the name the escape character makes the following character
   part of the name; the name is compacted in place in the line buffer
   while it is collected.  The name is looked up with `find_char' and
   the result of the lookup is returned, -1 standing for an unknown
   name.  A missing name is only reported if both `verbose' and
   `reject_new_char' are set.  A line ending inside the name is
   reported, and -1 is returned right away if the name is empty.  */
static int
get_char (void)
{
  int char_val;
  char *startp;
  char *endp;

  if (LD.buf[LD.buf_ptr] != '<')
    /* An ordinary character.  */
    return (int) LD.buf[LD.buf_ptr++];

  /* This is a symbolic character name.  */
  startp = LD.buf + (++LD.buf_ptr);
  endp = startp;

  /* `isprint' needs a value in the range of `unsigned char'; plain
     `char' may be negative.  The NUL at the end of the line is not
     printable and therefore ends the loop as well.  */
  while (LD.buf[LD.buf_ptr] != '>'
	 && isprint ((unsigned char) LD.buf[LD.buf_ptr]))
    {
      /* Drop the escape character and take over what follows, unless
	 the line ends right behind it.  */
      if (LD.buf[LD.buf_ptr] == LD.escape_char
	  && LD.buf[++LD.buf_ptr] == '\0')
	break;

      *endp++ = LD.buf[LD.buf_ptr++];
    }

  if (LD.buf[LD.buf_ptr] == '\0')
    {
      error (0, 0, gettext ("%s:%Zd: end of line in character symbol"),
	     LD.filename, LD.line_no);

      if (startp == endp)
	return -1;
    }
  else
    /* Step over the character which ended the name.  */
    ++LD.buf_ptr;

  char_val = find_char (startp, endp - startp);
  if (char_val == -1 && verbose != 0 && reject_new_char != 0)
    {
      /* Locale definitions are often given very general.  Missing
	 characters are only reported when explicitly requested.  */
      char tmp[endp - startp + 3];

      tmp[0] = '<';
      memcpy (tmp + 1, startp, endp - startp);
      tmp[endp - startp + 1] = '>';
      tmp[endp - startp + 2] = '\0';

      error (0, 0, gettext ("%s:%Zd: character `%s' not defined"),
	     LD.filename, LD.line_no, tmp);
    }

  return char_val;
}

/*
 * Local Variables:
 *  mode:c
 *  c-basic-offset:2
 * End:
 */