/*****************************************************************************/
/*                                                                           */
/*  THE NONPAREIL DOCUMENT FORMATTING SYSTEM                                 */
/*  COPYRIGHT (C) 2002, 2005 Jeffrey H. Kingston                             */
/*                                                                           */
/*  Jeffrey H. Kingston (jeff@it.usyd.edu.au)                                */
/*  School of Information Technologies                                       */
/*  The University of Sydney 2006                                            */
/*  AUSTRALIA                                                                */
/*                                                                           */
/*  This program is free software; you can redistribute it and/or modify     */
/*  it under the terms of the GNU General Public License as published by     */
/*  the Free Software Foundation; either Version 2, or (at your option)      */
/*  any later version.                                                       */
/*                                                                           */
/*  This program is distributed in the hope that it will be useful,          */
/*  but WITHOUT ANY WARRANTY; without even the implied warranty of           */
/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            */
/*  GNU General Public License for more details.                             */
/*                                                                           */
/*  You should have received a copy of the GNU General Public License        */
/*  along with this program; if not, write to the Free Software              */
/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston MA 02111-1307 USA   */
/*                                                                           */
/*  FILE:         uchar.c                                                    */
/*  DESCRIPTION:  32-bit unsigned Unicode scalar values                      */
/*                                                                           */
/*****************************************************************************/
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include "memory.h"
#include "uchar.h"
#include "ustring.h"
#include "ustring_pool.h"
#include "array.h"
#include "usymtab.h"
#include "trie.h"
/* File names, each converted with AStringToUString wherever the macro is    */
/* expanded.  NOTE(review): the code that uses them is outside this view;    */
/* presumably the two .txt names are Unicode data files to be read and       */
/* char_trie / char_pool are saved forms of the trie and string pool         */
/* declared further down - confirm against the loading code.                 */
#define DATA_FILE AStringToUString("UnicodeData.txt")
#define CASE_FILE AStringToUString("SpecialCasing.txt")
#define TRIE_FILE AStringToUString("char_trie")
#define POOL_FILE AStringToUString("char_pool")
/* Inclusive range test.  Evaluates code twice, so do not pass an argument   */
/* with side effects.                                                        */
#define in_range(code, lower, upper) ((code) >= (lower) && (code) <= (upper) )
#define MAX_CHAR 0x10FFFF	/* maximum Unicode character         */
/* NOTE(review): presumably the size of the text buffers used when reading   */
/* the data files; its uses are outside this view - confirm.                 */
#define MAX_BUFF 400
/* Debug switches, all set to 0 here.  The code that tests them is outside   */
/* this view.                                                                */
#define DEBUG1 0
#define DEBUG2 0
#define DEBUG3 0
#define DEBUG4 0
#define DEBUG5 0


/*****************************************************************************/
/*                                                                           */
/*  DATA_SEQ - whether a data point is a first or last in sequence           */
/*                                                                           */
/*****************************************************************************/

/* Tag saying where a data record stands relative to a sequence of records.  */
/* NOTE(review): presumably DATA_FIRST / DATA_LAST mark the two records that */
/* bracket a range of characters in the data file; the code that assigns     */
/* these values is outside this view - confirm.                              */
typedef enum {
  DATA_ORDINARY,			/* just an ordinary character record */
  DATA_FIRST,				/* first of a sequence               */
  DATA_LAST				/* last of a sequence                */
} DATA_SEQ;


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_OBJ - Unicode character objects (pointers to property records)     */
/*                                                                           */
/*****************************************************************************/
/* Sentinel stored in decimal_digit, digit and numeric_numerator when the    */
/* character has no such value (UCharObjDebug skips fields equal to it).     */
#define NO_DIGIT 255

/* One record of character properties.  UCHAR_OBJ is a pointer to such a     */
/* record.  The enumerated properties are held in unsigned char fields, and  */
/* the string-valued ones (cmap, uppercase, lowercase, titlecase) are held   */
/* as indexes that UStringPoolGet resolves against ustring_pool.             */
/* numeric_denominator is only displayed by UCharObjDebug when               */
/* numeric_numerator is not NO_DIGIT.                                        */
typedef struct uchar_obj_rec {
  unsigned char			general_category;  /* UCHAR_GENERAL_CATEGORY */
  unsigned char			lex_class;	   /* UCHAR_LEX_CLASS        */
  unsigned char			canonical_combining_class;
  unsigned char			bidi_class;	   /* UCHAR_BIDI_CLASS       */
  USTRING_POOL_INDEX		cmap;		   /* index to USTRING pool  */
  unsigned char			cmap_class;	   /* UCHAR_CMAP_CLASS       */
  unsigned char			decimal_digit;	   /* 255 means none         */
  unsigned char			digit;		   /* 255 means none         */
  unsigned char			numeric_numerator; /* 255 means none         */
  unsigned char			numeric_denominator;
  unsigned char			bidi_mirrored;	   /* BOOLEAN                */
  USTRING_POOL_INDEX		uppercase;	   /* these are              */
  USTRING_POOL_INDEX		lowercase;	   /* indexes into the       */
  USTRING_POOL_INDEX		titlecase;	   /* USTRING pool           */
} *UCHAR_OBJ;

/* Array of UCHAR_OBJ built with the project's ARRAY macro.                  */
typedef ARRAY(UCHAR_OBJ) ARRAY_UCHAR_OBJ;


/*****************************************************************************/
/*                                                                           */
/*  SPCASE_OBJ - Special Casing object for special casing data.              */
/*                                                                           */
/*****************************************************************************/

/* The three case mappings of one special-casing entry, each held as an      */
/* index that UStringPoolGet resolves against ustring_pool (see              */
/* SpCaseObjDebug).  SPCASE_OBJ is a pointer to such a record.               */
typedef struct spcase_obj_rec {
  USTRING_POOL_INDEX		uppercase;		/* these are        */
  USTRING_POOL_INDEX		lowercase;		/* indexes into the */
  USTRING_POOL_INDEX		titlecase;		/* USTRING pool     */
} *SPCASE_OBJ;


/*****************************************************************************/
/*                                                                           */
/*  char_trie and ustring_pool                                               */
/*                                                                           */
/*  When loaded, char_trie is a trie in which you can look up character      */
/*  codes and get back character properties objects, and ustring_pool is     */
/*  a pool of USTRING objects referenced by those objects.                   */
/*                                                                           */
/*****************************************************************************/

/* Both start out NULL, i.e. not yet loaded.  ustring_pool is the pool that  */
/* the debug functions below pass to UStringPoolGet; the code that fills     */
/* either variable is outside this view.                                     */
static USTRING_POOL ustring_pool = NULL;
static TRIE char_trie = NULL;


/*****************************************************************************/
/*                                                                           */
/*  ASTRING UCharGeneralCategoryShow(UCHAR_GENERAL_CATEGORY gc)              */
/*                                                                           */
/*  Return the string name of general category gc.                           */
/*                                                                           */
/*****************************************************************************/

ASTRING UCharGeneralCategoryShow(UCHAR_GENERAL_CATEGORY gc)
{
  ASTRING abbrev = NULL;

  /* choose the abbreviation for gc; every known code sets abbrev, and an   */
  /* unknown code never gets past the default case                           */
  switch( gc )
  {
    /* letters */
    case UCHAR_LU: abbrev = "Lu"; break;
    case UCHAR_LL: abbrev = "Ll"; break;
    case UCHAR_LT: abbrev = "Lt"; break;
    case UCHAR_LM: abbrev = "Lm"; break;
    case UCHAR_LO: abbrev = "Lo"; break;

    /* marks */
    case UCHAR_MN: abbrev = "Mn"; break;
    case UCHAR_MC: abbrev = "Mc"; break;
    case UCHAR_ME: abbrev = "Me"; break;

    /* numbers */
    case UCHAR_ND: abbrev = "Nd"; break;
    case UCHAR_NL: abbrev = "Nl"; break;
    case UCHAR_NO: abbrev = "No"; break;

    /* punctuation */
    case UCHAR_PC: abbrev = "Pc"; break;
    case UCHAR_PD: abbrev = "Pd"; break;
    case UCHAR_PS: abbrev = "Ps"; break;
    case UCHAR_PE: abbrev = "Pe"; break;
    case UCHAR_PI: abbrev = "Pi"; break;
    case UCHAR_PF: abbrev = "Pf"; break;
    case UCHAR_PO: abbrev = "Po"; break;

    /* symbols */
    case UCHAR_SM: abbrev = "Sm"; break;
    case UCHAR_SC: abbrev = "Sc"; break;
    case UCHAR_SK: abbrev = "Sk"; break;
    case UCHAR_SO: abbrev = "So"; break;

    /* separators */
    case UCHAR_ZS: abbrev = "Zs"; break;
    case UCHAR_ZL: abbrev = "Zl"; break;
    case UCHAR_ZP: abbrev = "Zp"; break;

    /* others, plus the extra code UCHAR_I */
    case UCHAR_CC: abbrev = "Cc"; break;
    case UCHAR_CF: abbrev = "Cf"; break;
    case UCHAR_CS: abbrev = "Cs"; break;
    case UCHAR_CO: abbrev = "Co"; break;
    case UCHAR_CN: abbrev = "Cn"; break;
    case UCHAR_I:  abbrev = "I";  break;

    default:
      /* not a category code at all: report it and stop the program */
      fprintf(stderr, "UCharGeneralCategoryShow: unknown category %d\n", gc);
      exit(1);
  }
  return abbrev;
}


/*****************************************************************************/
/*                                                                           */
/*  ASTRING UCharLexClassShow(UCHAR_LEX_CLASS lc)                            */
/*                                                                           */
/*  Return the string name of lex class lc.                                  */
/*                                                                           */
/*****************************************************************************/

ASTRING UCharLexClassShow(UCHAR_LEX_CLASS lc)
{
  /* Each lex class is shown as the name of its own enumeration constant.   */
  /* A value that is not a lex class is a programming error: it is reported */
  /* on stderr and the program exits with status 1.                         */
  switch( lc )
  {
    case UCHAR_LEX_HASH:	  return "UCHAR_LEX_HASH";
    case UCHAR_LEX_QUOTE_DOUBLE:  return "UCHAR_LEX_QUOTE_DOUBLE";
    case UCHAR_LEX_QUOTE_SINGLE:  return "UCHAR_LEX_QUOTE_SINGLE";
    case UCHAR_LEX_LEFT_PAREN:	  return "UCHAR_LEX_LEFT_PAREN";
    case UCHAR_LEX_RIGHT_PAREN:	  return "UCHAR_LEX_RIGHT_PAREN";
    case UCHAR_LEX_COMMA:	  return "UCHAR_LEX_COMMA";
    case UCHAR_LEX_EXCLAM:	  return "UCHAR_LEX_EXCLAM";
    case UCHAR_LEX_DOT:		  return "UCHAR_LEX_DOT";
    case UCHAR_LEX_COLON:	  return "UCHAR_LEX_COLON";
    case UCHAR_LEX_LEFT_BRACKET:  return "UCHAR_LEX_LEFT_BRACKET";
    case UCHAR_LEX_BACKSLASH:	  return "UCHAR_LEX_BACKSLASH";
    case UCHAR_LEX_RIGHT_BRACKET: return "UCHAR_LEX_RIGHT_BRACKET";
    case UCHAR_LEX_LEFT_BRACE:	  return "UCHAR_LEX_LEFT_BRACE";
    case UCHAR_LEX_RIGHT_BRACE:	  return "UCHAR_LEX_RIGHT_BRACE";
    case UCHAR_LEX_ID_BEGIN:	  return "UCHAR_LEX_ID_BEGIN";
    case UCHAR_LEX_ID_EXTEND:	  return "UCHAR_LEX_ID_EXTEND";
    case UCHAR_LEX_DIGIT:	  return "UCHAR_LEX_DIGIT";
    case UCHAR_LEX_OTHER_PUNCT:	  return "UCHAR_LEX_OTHER_PUNCT";
    case UCHAR_LEX_SPACE:	  return "UCHAR_LEX_SPACE";
    case UCHAR_LEX_TAB:		  return "UCHAR_LEX_TAB";
    case UCHAR_LEX_ENDLINE:	  return "UCHAR_LEX_ENDLINE";
    case UCHAR_LEX_OTHER:	  return "UCHAR_LEX_OTHER";

    default:
      /* the message names this function exactly, so that it can be found */
      fprintf(stderr, "UCharLexClassShow: unknown class %d\n", lc);
      exit(1);
  }
}


/*****************************************************************************/
/*                                                                           */
/*  ASTRING UCharBidiClassShow(UCHAR_BIDI_CLASS bc)                          */
/*                                                                           */
/*  Show the Bidi class bc.                                                  */
/*                                                                           */
/*****************************************************************************/

ASTRING UCharBidiClassShow(UCHAR_BIDI_CLASS bc)
{
  /* Each Bidi class is shown as its abbreviation.  A value that is not a  */
  /* Bidi class is a programming error and never produces a result.        */
  switch( bc )
  {
    case UCHAR_BIDI_L:		return "L";
    case UCHAR_BIDI_LRE:	return "LRE";
    case UCHAR_BIDI_LRO:	return "LRO";
    case UCHAR_BIDI_R:		return "R";
    case UCHAR_BIDI_AL:		return "AL";
    case UCHAR_BIDI_RLE:	return "RLE";
    case UCHAR_BIDI_RLO:	return "RLO";
    case UCHAR_BIDI_PDF:	return "PDF";
    case UCHAR_BIDI_EN:		return "EN";
    case UCHAR_BIDI_ES:		return "ES";
    case UCHAR_BIDI_ET:		return "ET";
    case UCHAR_BIDI_AN:		return "AN";
    case UCHAR_BIDI_CS:		return "CS";
    case UCHAR_BIDI_NSM:	return "NSM";
    case UCHAR_BIDI_BN:		return "BN";
    case UCHAR_BIDI_B:		return "B";
    case UCHAR_BIDI_S:		return "S";
    case UCHAR_BIDI_WS:		return "WS";
    case UCHAR_BIDI_ON:		return "ON";

    default:
      /* Builds with assertions enabled stop at the assert, as before.     */
      /* When assertions are compiled out (NDEBUG), report and exit in the */
      /* same way as the other Show functions, rather than handing NULL to */
      /* a caller that will print it with %s.                              */
      assert(FALSE);
      fprintf(stderr, "UCharBidiClassShow: unknown class %d\n", bc);
      exit(1);
      return NULL;  /* keep compiler happy */
  }
}


/*****************************************************************************/
/*                                                                           */
/*  void UCharObjDebug(UCHAR_OBJ uo, int indent, FILE *fp)                   */
/*                                                                           */
/*  Debug print of UCHAR object uo.                                          */
/*                                                                           */
/*****************************************************************************/

static void UCharObjDebug(UCHAR_OBJ uo, int indent, FILE *fp)
{
  ASTRING general_name, lex_name, bidi_name;

  /* opening line of the bracketed display, indented by indent columns */
  fprintf(fp, "%*s[ UCHAR_OBJ:\n", indent, "");

  /* the three classifications that have Show functions, by name */
  general_name = UCharGeneralCategoryShow(uo->general_category);
  lex_name = UCharLexClassShow(uo->lex_class);
  bidi_name = UCharBidiClassShow(uo->bidi_class);
  fprintf(fp, "%*s  general %s, lex %s, combining class %d, bidi class %s\n",
    indent, "", general_name, lex_name, uo->canonical_combining_class,
    bidi_name);

  /* cmap is held as a pool index, so fetch the string before showing it */
  fprintf(fp, "%*s  cmap class %d, cmap [%s]\n", indent, "", uo->cmap_class,
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, uo->cmap)));

  /* numeric properties appear only when present; NO_DIGIT marks absence */
  if( uo->decimal_digit != NO_DIGIT )
  {
    fprintf(fp, "%*s  decimal_digit %d\n", indent, "", uo->decimal_digit);
  }
  if( uo->digit != NO_DIGIT )
  {
    fprintf(fp, "%*s  digit %d\n", indent, "", uo->digit);
  }
  if( uo->numeric_numerator != NO_DIGIT )
  {
    fprintf(fp, "%*s  numerator %d, denominator %d\n", indent, "",
      uo->numeric_numerator, uo->numeric_denominator);
  }

  fprintf(fp, "%*s  bidi_mirrored %s\n", indent, "", bool(uo->bidi_mirrored));

  /* the three case mappings, again fetched from the pool; the calls are   */
  /* left as arguments of a single fprintf so that they are evaluated just */
  /* as they always have been                                              */
  fprintf(fp, "%*s  upper [%s], lower[%s], title[%s]\n", indent, "",
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, uo->uppercase)),
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, uo->lowercase)),
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, uo->titlecase)));

  /* closing bracket */
  fprintf(fp, "%*s]\n", indent, "");
}


/*****************************************************************************/
/*                                                                           */
/*  void SpCaseObjDebug(SPCASE_OBJ spc, int indent, FILE *fp)                */
/*                                                                           */
/*  Debug print of special casing object spc.                                */
/*                                                                           */
/*****************************************************************************/

static void SpCaseObjDebug(SPCASE_OBJ spc, int indent, FILE *fp)
{
  /* opening line, indented by indent columns */
  fprintf(fp, "%*s[ SPCASE_OBJ:\n", indent, "");
  /* each mapping is held as a pool index: fetch its string from            */
  /* ustring_pool and show it via UStringToDisplayedHex.                    */
  /* NOTE(review): three UStringToDisplayedHex results are consumed by one  */
  /* fprintf; that is only correct if each call returns distinct storage -  */
  /* confirm against ustring.c.                                             */
  fprintf(fp, "%*s  upper [%s], lower[%s], title[%s]\n", indent, "",
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, spc->uppercase)),
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, spc->lowercase)),
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, spc->titlecase)));
  /* closing bracket */
  fprintf(fp, "%*s]\n", indent, "");
}


/*****************************************************************************/
/*                                                                           */
/*  int UCharObjCmp(const void *t1, const void *t2)                          */
/*                                                                           */
/*  Comparison functions for sorting UCHAR_OBJS into an arbitrary but        */
/*  fixed ordering.                                                          */
/*                                                                           */
/*****************************************************************************/

static int UCharObjCmp(const void *t1, const void *t2)
{
  /* t1 and t2 each point at a UCHAR_OBJ (itself a pointer to a record).    */
  /* The result is negative, zero or positive as the first record sorts     */
  /* before, equal to, or after the second.  Fields are compared in a fixed */
  /* order and the first one that differs decides.                          */
  /*                                                                         */
  /* Every decision is made with < rather than by subtracting the fields.   */
  /* Subtraction is harmless for the unsigned char fields, but the pool     */
  /* index fields have a type declared elsewhere, and if that type is       */
  /* unsigned int or wider a difference converted to int can have the wrong */
  /* sign.  Only the sign of the result is meaningful.                      */
  UCHAR_OBJ u1 = *(const UCHAR_OBJ *) t1;
  UCHAR_OBJ u2 = *(const UCHAR_OBJ *) t2;

  /* the very same record is trivially equal to itself */
  if( u1 == u2 )
    return 0;

  if( u1->general_category != u2->general_category )
    return u1->general_category < u2->general_category ? -1 : 1;
  if( u1->lex_class != u2->lex_class )
    return u1->lex_class < u2->lex_class ? -1 : 1;
  if( u1->canonical_combining_class != u2->canonical_combining_class )
    return u1->canonical_combining_class < u2->canonical_combining_class
      ? -1 : 1;
  if( u1->bidi_class != u2->bidi_class )
    return u1->bidi_class < u2->bidi_class ? -1 : 1;
  if( u1->cmap_class != u2->cmap_class )
    return u1->cmap_class < u2->cmap_class ? -1 : 1;
  if( u1->cmap != u2->cmap )
    return u1->cmap < u2->cmap ? -1 : 1;
  if( u1->decimal_digit != u2->decimal_digit )
    return u1->decimal_digit < u2->decimal_digit ? -1 : 1;
  if( u1->digit != u2->digit )
    return u1->digit < u2->digit ? -1 : 1;
  if( u1->numeric_numerator != u2->numeric_numerator )
    return u1->numeric_numerator < u2->numeric_numerator ? -1 : 1;
  if( u1->numeric_denominator != u2->numeric_denominator )
    return u1->numeric_denominator < u2->numeric_denominator ? -1 : 1;
  if( u1->bidi_mirrored != u2->bidi_mirrored )
    return u1->bidi_mirrored < u2->bidi_mirrored ? -1 : 1;
  if( u1->uppercase != u2->uppercase )
    return u1->uppercase < u2->uppercase ? -1 : 1;
  if( u1->lowercase != u2->lowercase )
    return u1->lowercase < u2->lowercase ? -1 : 1;
  if( u1->titlecase != u2->titlecase )
    return u1->titlecase < u2->titlecase ? -1 : 1;

  /* every field agrees */
  return 0;
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharObjEqual(UCHAR_OBJ u1, UCHAR_OBJ u2)                        */
/*                                                                           */
/*  Return TRUE if u1 and u2 are equal.                                      */
/*                                                                           */
/*****************************************************************************/

static BOOLEAN UCharObjEqual(UCHAR_OBJ u1, UCHAR_OBJ u2)
{
  /* UCharObjCmp takes pointers to UCHAR_OBJ values and reads them back as */
  /* UCHAR_OBJ, so hand it the addresses of the parameters themselves.     */
  /* Copying them into void * variables first would make UCharObjCmp read  */
  /* a void * object as though it were a UCHAR_OBJ.                        */
  return UCharObjCmp(&u1, &u2) == 0;
}


/*****************************************************************************/
/*                                                                           */
/*  UTF-16 macros included for reference but not actually used.              */
/*                                                                           */
/*****************************************************************************/

/* a high surrogate is a code unit that is not outside 0xD800 .. 0xDBFF */
#define is_high_surrogate(ch) (!((ch) < 0xD800 || (ch) > 0xDBFF))
/* a low surrogate is a code unit that is not outside 0xDC00 .. 0xDFFF */
#define is_low_surrogate(ch)  (!((ch) < 0xDC00 || (ch) > 0xDFFF))

/* scalar value of the pair (h, l): the offset of h within its range gives */
/* the upper ten bits and that of l the lower ten, all above 0x10000       */
#define scalar_code(h, l)  (0x10000 + (((h) - 0xD800) << 10) + ((l) - 0xDC00))


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_ERROR uchar_error_type                                             */
/*                                                                           */
/*  Contains the type of error when UCharGet returns FALSE.  Undefined       */
/*  at other times.                                                          */
/*                                                                           */
/*****************************************************************************/

/* UCharGet assigns this immediately before each of its FALSE returns. */
UCHAR_ERROR uchar_error_type;


/*****************************************************************************/
/*                                                                           */
/*  unsigned int uchar_error_byte                                            */
/*                                                                           */
/*  Contains the illegal byte or char value when uchar_error_type is set to  */
/*  UE_ILLEGAL_CODE, UE_ILLEGAL_BYTE1, UE_ILLEGAL_BYTE2, UE_ILLEGAL_BYTE3,   */
/*  or UE_ILLEGAL_BYTE4.  Is undefined at other times.                       */
/*                                                                           */
/*****************************************************************************/

/* UCharGet assigns this together with uchar_error_type for every error    */
/* except UE_UNEXPECTED_EOF, where there is no offending value to record.  */
unsigned int uchar_error_byte;


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharGet(FILE *fp, UCHAR *ch)                                    */
/*                                                                           */
/*  Read the next character from fp, or UEOF if none.  The file format       */
/*  is UTF-8 but the result is a 32-bit Unicode scalar value.  Return        */
/*  TRUE if successful and set *ch to the character found, otherwise         */
/*  return FALSE with ch undefined, and set uchar_error_type to the          */
/*  type of the error.                                                       */
/*                                                                           */
/*  This code is based on Table 3-1 (UTF-8 Bit Distribution) on page 47 of   */
/*  the Unicode book.  Here is an expanded version of how to interpret the   */
/*  first byte:                                                              */
/*                                                                           */
/*    Prefix  Binary range         Hex. range    Interpretation              */
/*    ---------------------------------------------------------------------  */
/*    0       00000000 - 01111111  0x00 - 0x7F   One-byte character          */
/*    10      10000000 - 10111111  0x80 - 0xBF   Illegal (labels non-first)  */
/*    110     11000000 - 11011111  0xC0 - 0xDF   Two-byte character          */
/*    1110    11100000 - 11101111  0xE0 - 0xEF   Three-byte character        */
/*    11110   11110000 - 11110111  0xF0 - 0xF7   Four-byte character         */
/*    11111   11111000 - 11111111  0xF8 - 0xFF   Illegal (labels 5-6-byte)   */
/*    ---------------------------------------------------------------------  */
/*                                                                           */
/*****************************************************************************/

/* payload bits carried by the various kinds of UTF-8 byte */
#define last_three_bits(ch)	( (ch) & 0x07 )
#define last_four_bits(ch)	( (ch) & 0x0F )
#define last_five_bits(ch)	( (ch) & 0x1F )
#define last_six_bits(ch)	( (ch) & 0x3F )

/* smallest scalar values that genuinely need two, three and four bytes;   */
/* anything smaller arriving in that many bytes is an overlong encoding    */
#define MIN_TWO_BYTE_CODE	0x80
#define MIN_THREE_BYTE_CODE	0x800
#define MIN_FOUR_BYTE_CODE	0x10000

/* The three macros below are for use inside UCharGet only: each one makes */
/* the enclosing function return FALSE when its test fails.                */

/* fail with UE_UNEXPECTED_EOF if the input ended inside a character */
#define check_for_unexpected_eof(ch)					\
if( (ch) == EOF )							\
{									\
  uchar_error_type = UE_UNEXPECTED_EOF;					\
  return FALSE;								\
}

/* fail with error_type if ch is not a continuation byte (0x80 - 0xBF) */
#define check_for_illegal_byte(ch, error_type)				\
if( (ch) < 0x80 || (ch) > 0xBF )					\
{									\
  uchar_error_type = (error_type);					\
  uchar_error_byte = (ch);						\
  return FALSE;								\
}

/* fail with UE_ILLEGAL_CODE if code needed fewer bytes than were used */
#define check_for_overlong_form(code, min_code)				\
if( (code) < (min_code) )						\
{									\
  uchar_error_type = UE_ILLEGAL_CODE;					\
  uchar_error_byte = (code);						\
  return FALSE;								\
}

/* Read one UTF-8 encoded character from fp into *ch.  End of file before  */
/* the first byte is a success with *ch set to UEOF.  On failure the       */
/* result is FALSE, uchar_error_type says what went wrong, and, except for */
/* UE_UNEXPECTED_EOF, uchar_error_byte holds the offending byte or code.   */
/* Codes above MAX_CHAR and overlong encodings (a code carried in more     */
/* bytes than it needs, which is not well-formed UTF-8) both fail with     */
/* UE_ILLEGAL_CODE.                                                        */
/* NOTE(review): codes in the surrogate range 0xD800 - 0xDFFF are not      */
/* rejected here; the mapping of "Cs" to UCHAR_I elsewhere in this file    */
/* suggests they are dealt with via character properties - confirm.        */
BOOLEAN UCharGet(FILE *fp, UCHAR *ch)
{
  register int ch1, ch2, ch3, ch4;
  ch1 = getc(fp);
  if( ch1 == EOF )
  {
    /* end of file */
    *ch = UEOF;
    return TRUE;
  }
  else if( ch1 <= 0x7F )
  {
    /* one-byte case, nothing more needed */
    *ch = ch1;
    return TRUE;
  }
  else if( ch1 <= 0xBF )
  {
    /* illegal, no initial byte should be in this range */
    uchar_error_type = UE_ILLEGAL_BYTE1;
    uchar_error_byte = ch1;
    return FALSE;
  }
  else if( ch1 <= 0xDF )
  {
    /* two-byte case, read next character and assemble result */
    ch2 = getc(fp);
    check_for_unexpected_eof(ch2);
    check_for_illegal_byte(ch2, UE_ILLEGAL_BYTE2);
    *ch = (last_five_bits(ch1) << 6) | last_six_bits(ch2);
    check_for_overlong_form(*ch, MIN_TWO_BYTE_CODE);
    return TRUE;
  }
  else if( ch1 <= 0xEF )
  {
    /* three-byte case, read next two characters and assemble result */
    ch2 = getc(fp);
    check_for_unexpected_eof(ch2);
    check_for_illegal_byte(ch2, UE_ILLEGAL_BYTE2);
    ch3 = getc(fp);
    check_for_unexpected_eof(ch3);
    check_for_illegal_byte(ch3, UE_ILLEGAL_BYTE3);
    *ch = (last_four_bits(ch1) << 12) | (last_six_bits(ch2) << 6) |
	    last_six_bits(ch3);
    check_for_overlong_form(*ch, MIN_THREE_BYTE_CODE);
    return TRUE;
  }
  else if( ch1 <= 0xF7 )
  {
    /* four-byte case, read next three characters and assemble result */
    ch2 = getc(fp);
    check_for_unexpected_eof(ch2);
    check_for_illegal_byte(ch2, UE_ILLEGAL_BYTE2);
    ch3 = getc(fp);
    check_for_unexpected_eof(ch3);
    check_for_illegal_byte(ch3, UE_ILLEGAL_BYTE3);
    ch4 = getc(fp);
    check_for_unexpected_eof(ch4);
    check_for_illegal_byte(ch4, UE_ILLEGAL_BYTE4);
    *ch = (last_three_bits(ch1) << 18) | (last_six_bits(ch2) << 12) |
	   (last_six_bits(ch3) << 6) | last_six_bits(ch4);
    check_for_overlong_form(*ch, MIN_FOUR_BYTE_CODE);
    if( (*ch) <= MAX_CHAR )
      return TRUE;
    else
    {
      /* four bytes can express values beyond the end of Unicode */
      uchar_error_type = UE_ILLEGAL_CODE;
      uchar_error_byte = *ch;
      return FALSE;
    }
  }
  else /* ch1 <= 0xFF */
  {
    /* illegal, no initial byte should be in this range */
    uchar_error_type = UE_ILLEGAL_BYTE1;
    uchar_error_byte = ch1;
    return FALSE;
  }
}


/*****************************************************************************/
/*                                                                           */
/*  void UCharPut(UCHAR ch, FILE *fp)                                        */
/*                                                                           */
/*  Write one 32-bit Unicode scalar value to fp in UTF-8 format.             */
/*                                                                           */
/*****************************************************************************/

/* add the marker bits that label each kind of UTF-8 byte */
#define non_first_byte(ch)	( (ch) | 0x80 )
#define first_of_two(ch)	( (ch) | 0xC0 )
#define first_of_three(ch)	( (ch) | 0xE0 )
#define first_of_four(ch)	( (ch) | 0xF0 )

/* true when ch has no bits set above the stated width */
#define seven_bit_value(ch)	( ((ch) & ~0x7F    ) == 0 )
#define eleven_bit_value(ch)	( ((ch) & ~0x7FF   ) == 0 )
#define sixteen_bit_value(ch)	( ((ch) & ~0xFFFF  ) == 0 )
#define twentyone_bit_value(ch)	( ((ch) & ~0x1FFFFF) == 0 )

/* Write ch to fp as UTF-8, in the fewest bytes that can hold it.  A value */
/* above MAX_CHAR is not a Unicode character: it is reported on stderr and */
/* the program exits with status 1, so that nothing is ever written which  */
/* UCharGet would refuse to read back.                                     */
void UCharPut(UCHAR ch, FILE *fp)
{
  if( seven_bit_value(ch) )
  {
    /* seven-bit goes out as is */
    putc(ch, fp);
  }
  else if( eleven_bit_value(ch) )
  {
    /* eleven-bit goes out in two bytes */
    putc(first_of_two(ch >> 6), fp);
    putc(non_first_byte(last_six_bits(ch)), fp);
  }
  else if( sixteen_bit_value(ch) )
  {
    /* sixteen-bit value goes out in three bytes */
    putc(first_of_three(ch >> 12), fp);
    putc(non_first_byte(last_six_bits(ch >> 6)), fp);
    putc(non_first_byte(last_six_bits(ch)), fp);
  }
  else if( twentyone_bit_value(ch) && ch <= MAX_CHAR )
  {
    /* twentyone-bit value goes out in four bytes; the extra test against */
    /* MAX_CHAR excludes 0x110000 - 0x1FFFFF, which fit in 21 bits but    */
    /* lie beyond the last Unicode character                              */
    putc(first_of_four(ch >> 18), fp);
    putc(non_first_byte(last_six_bits(ch >> 12)), fp);
    putc(non_first_byte(last_six_bits(ch >> 6)), fp);
    putc(non_first_byte(last_six_bits(ch)), fp);
  }
  else
  {
    /* illegal value, not Unicode */
    fprintf(stderr, "illegal character (not Unicode) 0x%x\n", ch);
    exit(1);
  }
}


/*****************************************************************************/
/*                                                                           */
/*  Loading character properties.                                            */
/*                                                                           */
/*****************************************************************************/

/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN Scan(ASTRING *p, ASTRING res, ASTRING stop)                      */
/*                                                                           */
/*  Scan *p onwards for the sequence of characters not including any         */
/*  characters from *stop.  Copy the scanned result and skip the stop        */
/*  character, ready for the next scan.  Return TRUE if a field was          */
/*  found, even if empty, and FALSE if not (which can only happen if         */
/*  **p == '\0' initially).                                                  */
/*                                                                           */
/*****************************************************************************/

static BOOLEAN ch_in(char ch, ASTRING str)
{
  /* walk str up to, but not including, its terminator, looking for ch;   */
  /* hence the terminator itself is never reported as a member            */
  ASTRING cursor = str;
  while( *cursor != '\0' )
  {
    if( *cursor == ch )
      return TRUE;
    cursor++;
  }
  return FALSE;
}

static BOOLEAN Scan(ASTRING *p, ASTRING res, ASTRING stop)
{
  ASTRING src = *p;

  /* nothing at all left to scan: no field, and *p stays where it was */
  if( *src == '\0' )
    return FALSE;

  /* copy the field, which runs up to the first stop character or the end */
  /* of the string, whichever comes first                                 */
  /* NOTE(review): nothing here limits how much is written to res; the    */
  /* caller has to supply a buffer large enough for the longest field     */
  for( ;  *src != '\0' && !ch_in(*src, stop);  src++, res++ )
    *res = *src;
  *res = '\0';

  /* step over the stop character, if that is what ended the field */
  if( *src != '\0' )
    src++;

  /* tell the caller where the next scan should begin */
  *p = src;
  return TRUE;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_GENERAL_CATEGORY GeneralCategoryNameToCode(ASTRING name,           */
/*    UTF8 fname, int line_num, UCHAR ch)                                    */
/*                                                                           */
/*  Convert the general category name *name into its code.  The other        */
/*  parameters are for error messages only.                                  */
/*                                                                           */
/*  Note.  Because Nonpareil deals only in UTF-8, all occurrences of         */
/*  surrogate characters are invalid.  So in this table, "Cs" (i.e.          */
/*  surrogate) is mapped not to UCHAR_CS but to UCHAR_I.                     */
/*                                                                           */
/*****************************************************************************/

static UCHAR_GENERAL_CATEGORY GeneralCategoryNameToCode(ASTRING name,
  UTF8 fname, int line_num, UCHAR ch)
{
  /* every category abbreviation accepted, with the code it stands for;   */
  /* the entries are inserted into the symbol table in this order         */
  static const struct { ASTRING abbrev; int code; } known[] = {
    { "Lu", UCHAR_LU }, { "Ll", UCHAR_LL }, { "Lt", UCHAR_LT },
    { "Lm", UCHAR_LM }, { "Lo", UCHAR_LO },
    { "Mn", UCHAR_MN }, { "Mc", UCHAR_MC }, { "Me", UCHAR_ME },
    { "Nd", UCHAR_ND }, { "Nl", UCHAR_NL }, { "No", UCHAR_NO },
    { "Pc", UCHAR_PC }, { "Pd", UCHAR_PD }, { "Ps", UCHAR_PS },
    { "Pe", UCHAR_PE }, { "Pi", UCHAR_PI }, { "Pf", UCHAR_PF },
    { "Po", UCHAR_PO },
    { "Sm", UCHAR_SM }, { "Sc", UCHAR_SC }, { "Sk", UCHAR_SK },
    { "So", UCHAR_SO },
    { "Zs", UCHAR_ZS }, { "Zl", UCHAR_ZL }, { "Zp", UCHAR_ZP },
    { "Cc", UCHAR_CC }, { "Cf", UCHAR_CF },
    { "Cs", UCHAR_I },  /* not Cs! */
    { "Co", UCHAR_CO }, { "Cn", UCHAR_CN }
  };
  static SYMTAB_INT gc_codes = NULL;
  UCHAR_GENERAL_CATEGORY res, err;
  int i, count;

  /* the symbol table is built once, on the first call, and then kept */
  if( gc_codes == NULL )
  {
    SymInit(&gc_codes);
    count = sizeof(known) / sizeof(known[0]);
    for( i = 0;  i < count;  i++ )
      SymInsert(gc_codes, AStringToUString(known[i].abbrev), known[i].code,
	&err);
  }

  /* a name with no entry is fatal; fname, line_num and ch say where the */
  /* offending name was met                                              */
  if( !SymRetrieve(gc_codes, AStringToUString(name), &res) )
  {
    fprintf(stderr, "%s:%d: %x has unknown general category name %s\n",
      fname, line_num, ch, name);
    exit(1);
  }
  return res;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_LEX_CLASS FindLexClass(UCHAR ch, UCHAR_GENERAL_CATEGORY gc)        */
/*                                                                           */
/*  Return the Nonpareil lexical class of ch, given that it is already       */
/*  known to have general category gc.                                       */
/*                                                                           */
/*****************************************************************************/

static UCHAR_LEX_CLASS FindLexClass(UCHAR ch, UCHAR_GENERAL_CATEGORY gc)
{
  /* first, the characters that have a lexical class of their own; */
  /* for these the general category is not consulted at all        */
  switch( ch )
  {
    case '\t':	return UCHAR_LEX_TAB;

    case '\n':
    case '\v':
    case '\r':	return UCHAR_LEX_ENDLINE;

    case '!':	return UCHAR_LEX_EXCLAM;
    case '\"':	return UCHAR_LEX_QUOTE_DOUBLE;
    case '#':	return UCHAR_LEX_HASH;
    case '\'':	return UCHAR_LEX_QUOTE_SINGLE;
    case '(':	return UCHAR_LEX_LEFT_PAREN;
    case ')':	return UCHAR_LEX_RIGHT_PAREN;
    case ',':	return UCHAR_LEX_COMMA;
    case '.':	return UCHAR_LEX_DOT;
    case ':':	return UCHAR_LEX_COLON;
    case '[':	return UCHAR_LEX_LEFT_BRACKET;
    case '\\':	return UCHAR_LEX_BACKSLASH;
    case ']':	return UCHAR_LEX_RIGHT_BRACKET;
    case '{':	return UCHAR_LEX_LEFT_BRACE;
    case '}':	return UCHAR_LEX_RIGHT_BRACE;

    default:	break;
  }

  /* every other character is classified by its general category alone */
  switch( gc )
  {
    /* categories whose characters may begin an identifier */
    case UCHAR_LU:
    case UCHAR_LL:
    case UCHAR_LT:
    case UCHAR_LM:
    case UCHAR_LO:
    case UCHAR_NL:
      return UCHAR_LEX_ID_BEGIN;

    /* categories whose characters may only extend an identifier */
    case UCHAR_MN:
    case UCHAR_MC:
    case UCHAR_PC:
    case UCHAR_CF:
      return UCHAR_LEX_ID_EXTEND;

    case UCHAR_ND:
      return UCHAR_LEX_DIGIT;

    /* punctuation and symbols without a lexical class of their own */
    case UCHAR_PD:
    case UCHAR_PS:
    case UCHAR_PE:
    case UCHAR_PI:
    case UCHAR_PF:
    case UCHAR_PO:
    case UCHAR_SM:
    case UCHAR_SC:
    case UCHAR_SK:
    case UCHAR_SO:
      return UCHAR_LEX_OTHER_PUNCT;

    case UCHAR_ZS:
      return UCHAR_LEX_SPACE;

    case UCHAR_ZL:
    case UCHAR_ZP:
      return UCHAR_LEX_ENDLINE;

    /* everything else, including invalid and unassigned codes */
    case UCHAR_ME:
    case UCHAR_NO:
    case UCHAR_CC:
    case UCHAR_CS:
    case UCHAR_CO:
    case UCHAR_CN:
    case UCHAR_I:
      return UCHAR_LEX_OTHER;

    default:
      break;
  }

  /* gc is none of the general categories handled above; give up */
  fprintf(stderr, "FindLexClass: unknown general category %d\n", gc);
  exit(1);
  return 0; /* keep compiler happy */
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_BIDI_CLASS BidiNameToCode(ASTRING name,                            */
/*    UTF8 fname, int line_num, UCHAR ch)                                    */
/*                                                                           */
/*  Convert the Bidi name *name into its code.  The other parameters are     */
/*  for error messages only.                                                 */
/*                                                                           */
/*****************************************************************************/

static UCHAR_BIDI_CLASS BidiNameToCode(ASTRING name,
  UTF8 fname, int line_num, UCHAR ch)
{
  /* every bidi class name that is recognized, paired with its code */
  static struct {
    ASTRING          class_name;
    UCHAR_BIDI_CLASS class_code;
  } known[] = {
    { "L",   UCHAR_BIDI_L   },
    { "LRE", UCHAR_BIDI_LRE },
    { "LRO", UCHAR_BIDI_LRO },
    { "R",   UCHAR_BIDI_R   },
    { "AL",  UCHAR_BIDI_AL  },
    { "RLE", UCHAR_BIDI_RLE },
    { "RLO", UCHAR_BIDI_RLO },
    { "PDF", UCHAR_BIDI_PDF },
    { "EN",  UCHAR_BIDI_EN  },
    { "ES",  UCHAR_BIDI_ES  },
    { "ET",  UCHAR_BIDI_ET  },
    { "AN",  UCHAR_BIDI_AN  },
    { "CS",  UCHAR_BIDI_CS  },
    { "NSM", UCHAR_BIDI_NSM },
    { "BN",  UCHAR_BIDI_BN  },
    { "B",   UCHAR_BIDI_B   },
    { "S",   UCHAR_BIDI_S   },
    { "WS",  UCHAR_BIDI_WS  },
    { "ON",  UCHAR_BIDI_ON  }
  };
  static SYMTAB_INT bidi_codes = NULL;
  UCHAR_BIDI_CLASS res, err;  int known_pos, known_count;

  /* the symbol table is built once, on the first call */
  if( bidi_codes == NULL )
  {
    SymInit(&bidi_codes);
    known_count = (int) (sizeof(known) / sizeof(known[0]));
    for( known_pos = 0;  known_pos < known_count;  known_pos++ )
      SymInsert(bidi_codes, AStringToUString(known[known_pos].class_name),
        known[known_pos].class_code, &err);
  }

  /* an unrecognized name is fatal; report where it came from */
  if( !SymRetrieve(bidi_codes, AStringToUString(name), &res) )
  {
    fprintf(stderr, "%s:%d: %x has unknown bi-directional category name %s\n",
      fname, line_num, ch, name);
    exit(1);
  }
  return res;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_CMAP_CLASS CMapClassNameToCode(ASTRING name,                       */
/*    UTF8 fname, int line_num, UCHAR ch)                                    */
/*                                                                           */
/*  Convert the CMap name *name into its code.  The other parameters are     */
/*  for error messages only.                                                 */
/*                                                                           */
/*****************************************************************************/

static UCHAR_CMAP_CLASS CMapClassNameToCode(ASTRING name,
  UTF8 fname, int line_num, UCHAR ch)
{
  /* every mapping tag that is recognized, paired with its code */
  static struct {
    ASTRING          tag_name;
    UCHAR_CMAP_CLASS tag_code;
  } known[] = {
    { "<font>",     UCHAR_CMAP_FONT     },
    { "<noBreak>",  UCHAR_CMAP_NOBREAK  },
    { "<initial>",  UCHAR_CMAP_INITIAL  },
    { "<medial>",   UCHAR_CMAP_MEDIAL   },
    { "<final>",    UCHAR_CMAP_FINAL    },
    { "<isolated>", UCHAR_CMAP_ISOLATED },
    { "<circle>",   UCHAR_CMAP_CIRCLE   },
    { "<super>",    UCHAR_CMAP_SUPER    },
    { "<sub>",      UCHAR_CMAP_SUB      },
    { "<vertical>", UCHAR_CMAP_VERTICAL },
    { "<wide>",     UCHAR_CMAP_WIDE     },
    { "<narrow>",   UCHAR_CMAP_NARROW   },
    { "<small>",    UCHAR_CMAP_SMALL    },
    { "<square>",   UCHAR_CMAP_SQUARE   },
    { "<fraction>", UCHAR_CMAP_FRACTION },
    { "<compat>",   UCHAR_CMAP_COMPAT   }
  };
  static SYMTAB_INT cm = NULL;
  UCHAR_CMAP_CLASS res, err;  int known_pos, known_count;

  /* the symbol table is built once, on the first call */
  if( cm == NULL )
  {
    SymInit(&cm);
    known_count = (int) (sizeof(known) / sizeof(known[0]));
    for( known_pos = 0;  known_pos < known_count;  known_pos++ )
      SymInsert(cm, AStringToUString(known[known_pos].tag_name),
        known[known_pos].tag_code, &err);
  }

  /* an unrecognized tag is fatal; report where it came from */
  if( !SymRetrieve(cm, AStringToUString(name), &res) )
  {
    fprintf(stderr, "%s:%d: %x has unknown cmap class name %s\n",
      fname, line_num, ch, name);
    exit(1);
  }
  return res;
}


/*****************************************************************************/
/*                                                                           */
/*  ASTRING UCharCMapClassShow(UCHAR_CMAP_CLASS cc)                          */
/*                                                                           */
/*  Show a map type.                                                         */
/*                                                                           */
/*****************************************************************************/

ASTRING UCharCMapClassShow(UCHAR_CMAP_CLASS cc)
{
  ASTRING shown;

  /* each class maps to a fixed lower-case word; the words for tagged */
  /* classes are the tag names without their angle brackets           */
  switch( cc )
  {
    case UCHAR_CMAP_NONE:	shown = "none";		break;
    case UCHAR_CMAP_CANONICAL:	shown = "canonical";	break;
    case UCHAR_CMAP_FONT:	shown = "font";		break;
    case UCHAR_CMAP_NOBREAK:	shown = "nobreak";	break;
    case UCHAR_CMAP_INITIAL:	shown = "initial";	break;
    case UCHAR_CMAP_MEDIAL:	shown = "medial";	break;
    case UCHAR_CMAP_FINAL:	shown = "final";	break;
    case UCHAR_CMAP_ISOLATED:	shown = "isolated";	break;
    case UCHAR_CMAP_CIRCLE:	shown = "circle";	break;
    case UCHAR_CMAP_SUPER:	shown = "super";	break;
    case UCHAR_CMAP_SUB:	shown = "sub";		break;
    case UCHAR_CMAP_VERTICAL:	shown = "vertical";	break;
    case UCHAR_CMAP_WIDE:	shown = "wide";		break;
    case UCHAR_CMAP_NARROW:	shown = "narrow";	break;
    case UCHAR_CMAP_SMALL:	shown = "small";	break;
    case UCHAR_CMAP_SQUARE:	shown = "square";	break;
    case UCHAR_CMAP_FRACTION:	shown = "fraction";	break;
    case UCHAR_CMAP_COMPAT:	shown = "compat";	break;

    default:

      /* cc is not a value this function knows how to show */
      assert(FALSE);
      shown = NULL; /* keep compiler happy */
      break;
  }
  return shown;
}


/*****************************************************************************/
/*                                                                           */
/*  unsigned short ScanHexCodeList(ASTRING field, UTF8 fname, int line_num,  */
/*    UCHAR ch, ASTRING field_name)                                          */
/*                                                                           */
/*  Scan *field for a sequence of hexadecimal characters separated by        */
/*  spaces.  Put them into the string pool and return the string's index.    */
/*  If an error occurs, use the last four parameters when generating the     */
/*  error message.                                                           */
/*                                                                           */
/*****************************************************************************/

static unsigned short ScanHexCodeList(ASTRING field, UTF8 fname, int line_num,
  UCHAR ch, ASTRING field_name)
{
  char subfield[MAX_BUFF], *p;  unsigned int code;  char extra;

  /* accumulate one character per space-separated hex number in field */
  p = field;
  UStringBegin();
  while( Scan(&p, subfield, " ") )
  {
    /* empty subfields contribute nothing to the string */
    if( strlen(subfield) > 0 )
    {
      /* %x converts the number and %c then succeeds only if something   */
      /* follows it, so any result other than 1 means that the subfield  */
      /* either has no hex number or has trailing non-hex characters     */
      if( sscanf(subfield, "%x%c", &code, &extra) != 1 )
      {
        fprintf(stderr, "%s:%d: %x %s has bad hex character %s\n",
          fname, line_num, ch, field_name, subfield);
        exit(1);
      }
      UStringAdd((UCHAR) code);
    }
  }

  /* store the finished string in the pool; its index is the result */
  return UStringPoolAdd(ustring_pool, UStringEnd());
}


/*****************************************************************************/
/*                                                                           */
/*  int ScanCount(ASTRING field, char ch)                                    */
/*                                                                           */
/*  Return the number of occurrences of ch in field.                         */
/*                                                                           */
/*****************************************************************************/

static int ScanCount(ASTRING field, char ch)
{
  int count = 0;  ASTRING cursor = field;

  /* walk to the terminating NUL, tallying each character equal to ch */
  while( *cursor != '\0' )
  {
    if( *cursor == ch )
      count++;
    cursor++;
  }
  return count;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_GENERAL_CATEGORY DefaultGeneralCategory(UCHAR ch)                  */
/*                                                                           */
/*  Return the general category that is to be used for ch if nothing         */
/*  is known about it apart from its code.  This will be either Cn           */
/*  (unassigned) or I (invalid).                                             */
/*                                                                           */
/*  The invalid codes are surrogates (D800 - DFFF) and the noncharacters     */
/*  FFFE and FFFF.  The others are unassigned.                               */
/*                                                                           */
/*****************************************************************************/

static UCHAR_GENERAL_CATEGORY DefaultGeneralCategory(UCHAR ch)
{
  /* the two kinds of code that are treated as invalid */
  int in_surrogate_block = (ch >= 0xD800 && ch <= 0xDFFF);
  int is_fffe_or_ffff = (ch == 0xFFFE || ch == 0xFFFF);

  /* anything else about which nothing is known is merely unassigned */
  return (in_surrogate_block || is_fffe_or_ffff) ? UCHAR_I : UCHAR_CN;
}


/*****************************************************************************/
/*                                                                           */
/*  unsigned char DefaultBidiClass(UCHAR hex_code)                           */
/*                                                                           */
/*  Return the bidi class of hex_code when it does not have an explicitly    */
/*  assigned value.  This function is derived from Table 3-8 in the          */
/*  Unicode book.                                                            */
/*                                                                           */
/*****************************************************************************/

static unsigned char DefaultBidiClass(UCHAR hex_code)
{
  /* code ranges whose default class is R */
  if( in_range(hex_code, 0x0590, 0x05FF) ||
      in_range(hex_code, 0x07C0, 0x08FF) ||
      in_range(hex_code, 0xFB1D, 0xFB4F) ||
      in_range(hex_code, 0x10800, 0x10FFF) )
    return UCHAR_BIDI_R;

  /* code ranges whose default class is AL */
  if( in_range(hex_code, 0x0600, 0x07BF) ||
      in_range(hex_code, 0xFB50, 0xFDCF) ||
      in_range(hex_code, 0xFDF0, 0xFDFF) ||
      in_range(hex_code, 0xFE70, 0xFEFE) )
    return UCHAR_BIDI_AL;

  /* every other code defaults to L */
  return UCHAR_BIDI_L;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_OBJ DefaultUCharObj(UCHAR ch)                                      */
/*                                                                           */
/*  Return the UCHAR_OBJ that is to be used if no information about a        */
/*  character is available except its code.  Any strings created while       */
/*  making this object are to be stored in ustring_pool.                     */
/*                                                                           */
/*****************************************************************************/

static UCHAR_OBJ DefaultUCharObj(UCHAR ch)
{
  USTRING_POOL_INDEX no_string;  UCHAR_OBJ obj;

  /* one pooled empty string serves all the string-valued fields */
  no_string = UStringPoolAdd(ustring_pool, UStringEmpty());
  GetMemory(obj, UCHAR_OBJ);

  /* the two properties whose default depends on the code itself */
  obj->general_category = DefaultGeneralCategory(ch);
  obj->bidi_class = DefaultBidiClass(ch);

  /* classification properties with fixed defaults */
  obj->lex_class = UCHAR_LEX_OTHER;
  obj->canonical_combining_class = 0;
  obj->bidi_mirrored = FALSE;

  /* no compatibility or canonical mapping */
  obj->cmap_class = UCHAR_CMAP_NONE;
  obj->cmap = no_string;

  /* no numeric value of any kind */
  /* NOTE(review): NextUCharObj defaults numeric_denominator to 1 when  */
  /* the numeric field is empty, but 0 is used here; confirm that the   */
  /* difference is intended                                             */
  obj->decimal_digit = NO_DIGIT;
  obj->digit = NO_DIGIT;
  obj->numeric_numerator = NO_DIGIT;
  obj->numeric_denominator = 0;

  /* no case mappings */
  obj->uppercase = no_string;
  obj->lowercase = no_string;
  obj->titlecase = no_string;
  return obj;
}



/*****************************************************************************/
/*                                                                           */
/*  UCHAR_OBJ NextUCharObj(FILE *fp, UCHAR *hex_code,                        */
/*    UTF8 fname, int *line_num, DATA_SEQ *data_seq)                         */
/*                                                                           */
/*  Return a UCHAR_OBJ containing the object on the next line of fp, or      */
/*  NULL if the file is finished.  If non-NULL, hex_code is its code,        */
/*  and *data_seq is DATA_ORDINARY, DATA_FIRST, or DATA_LAST accordingly     */
/*  as the character is not related to a sequence, or is the first in a      */
/*  sequence, or is the last.                                                */
/*                                                                           */
/*  Parameters fname and line_num are for error message printing only.       */
/*                                                                           */
/*****************************************************************************/

static UCHAR_OBJ NextUCharObj(FILE *fp, UCHAR *hex_code,
  UTF8 fname, int *line_num, DATA_SEQ *data_seq)
{
  char buff[MAX_BUFF], field[MAX_BUFF], subfield[MAX_BUFF];
  ASTRING p, sp;  int tmp, fields;  UCHAR_OBJ uo;  size_t len;

  /* return NULL if no more file */
  if( fgets(buff, MAX_BUFF, fp) == NULL )
    return NULL;

  /* make sure we got the whole line; the test for an empty buffer stops */
  /* buff[len - 1] from indexing before the start of buff, which would   */
  /* otherwise happen if the line began with a NUL byte                  */
  (*line_num)++;
  len = strlen(buff);
  if( len == 0 || buff[len - 1] != '\n' )
  {
    fprintf(stderr, "%s:%d: over-long line %s\n", (ASTRING) fname,
      *line_num, buff);
    exit(1);
  }
  p = buff;

  /* make sure we have the right number of fields */
  fields = ScanCount(buff, ';') + 1;
  if( fields != 15 )
  {
    fprintf(stderr, "%s:%d: %d fields on line (expected 15)\n",
      (ASTRING) fname, *line_num, fields);
    exit(1);
  }

  /* field 0 is character code; not saved in record, but used as index */
  Scan(&p, field, ";\n");
  if( sscanf(field, "%x", hex_code) != 1 )
  {
    fprintf(stderr, "%s:%d: can't find hex code in field 0: %s\n",
      (ASTRING) fname, *line_num, field);
    exit(1);
  }
  if( DEBUG1 )
    fprintf(stderr, "\nhex code: %x\n", *hex_code);

  GetMemory(uo, UCHAR_OBJ);

  /* field 1 is character name; don't save it, but do check for data seq */
  Scan(&p, field, ";\n");
  if( strstr(field, "First>") != NULL )
    *data_seq = DATA_FIRST;
  else if( strstr(field, "Last>") != NULL )
    *data_seq = DATA_LAST;
  else
    *data_seq = DATA_ORDINARY;
  if( DEBUG1 )
    fprintf(stderr, "name: %s (%s)\n", field, *data_seq == DATA_FIRST ?
      "first" : *data_seq == DATA_LAST ? "last" : "ordinary");

  /* field 2 is general category; also derive lex class from this */
  Scan(&p, field, ";\n");
  uo->general_category = GeneralCategoryNameToCode(field,
    fname, *line_num, *hex_code);
  uo->lex_class = FindLexClass(*hex_code, uo->general_category);
  if( DEBUG1 )
    fprintf(stderr, "general category: %s (%s), lex %s\n", field,
      UCharGeneralCategoryShow(uo->general_category),
      UCharLexClassShow(uo->lex_class));

  /* field 3 is canonical combining class */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "combining class: %s\n", field);
  if( sscanf(field, "%d", &tmp) != 1 )
  {
    fprintf(stderr, "%s:%d: %x combining class %s not numeric\n",
      (ASTRING) fname, *line_num, *hex_code, field);
    exit(1);
  }
  if( tmp < 0 || tmp > 255 )
  {
    fprintf(stderr, "%s:%d: %x combining class %d out of range\n",
      (ASTRING) fname, *line_num, *hex_code, tmp);
    exit(1);
  }
  uo->canonical_combining_class = tmp;

  /* field 4 is bidi category; default depends on code point (see docs) */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "bidi category: %s\n", field);
  if( strlen(field) > 0 )
    uo->bidi_class = BidiNameToCode(field, fname, *line_num, *hex_code);
  else
    uo->bidi_class = DefaultBidiClass(*hex_code);

  /* field 5 is canonical/compatibility mapping; its first subfield is */
  /* examined to find out which of the three possible forms it has     */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "cmap: %s\n", field);
  sp = field;
  if( !Scan(&sp, subfield, " \n") )
  {
    /* no entry, take default values */
    uo->cmap_class = UCHAR_CMAP_NONE;
    uo->cmap = UStringPoolAdd(ustring_pool, UStringEmpty());
  }
  else if( subfield[0] != '<' )
  {
    /* mapping but no tag (mapping is canonical); the first subfield is */
    /* itself a code, so the whole of field is scanned for hex codes    */
    uo->cmap_class = UCHAR_CMAP_CANONICAL;
    uo->cmap = ScanHexCodeList(field, fname, *line_num, *hex_code, "cmap");
  }
  else
  {
    /* mapping and tag (look up the tag); only what follows the tag, */
    /* beginning at sp, is scanned for hex codes                     */
    uo->cmap_class = CMapClassNameToCode(subfield, fname, *line_num, *hex_code);
    uo->cmap = ScanHexCodeList(sp, fname, *line_num, *hex_code, "cmap");
  }

  /* field 6 is decimal digit */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "decimal digit: %s\n", field);
  if( strlen(field) > 0 )
  {
    if( sscanf(field, "%d", &tmp) != 1 )
    {
      fprintf(stderr, "%s:%d: %x has bad decimal digit %s\n",
        (ASTRING) fname, *line_num, *hex_code, field);
      exit(1);
    }
    if( tmp < 0 || tmp > 9 )
    {
      fprintf(stderr, "%s:%d: %x has decimal digit %d out of range\n",
        (ASTRING) fname, *line_num, *hex_code, tmp);
      exit(1);
    }
    uo->decimal_digit = tmp;
  }
  else
    uo->decimal_digit = NO_DIGIT;

  /* field 7 is digit */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "digit: %s\n", field);
  if( strlen(field) > 0 )
  {
    if( sscanf(field, "%d", &tmp) != 1 )
    {
      fprintf(stderr, "%s:%d: %x has bad digit %s\n",
        (ASTRING) fname, *line_num, *hex_code, field);
      exit(1);
    }
    if( tmp < 0 || tmp > 9 )
    {
      fprintf(stderr, "%s:%d: %x has digit %d out of range\n",
        (ASTRING) fname, *line_num, *hex_code, tmp);
      exit(1);
    }
    uo->digit = tmp;
  }
  else
    uo->digit = NO_DIGIT;

  /* field 8 is numeric; may be a/b.  When the field is empty the      */
  /* numerator stays NO_DIGIT, and when there is no /b part the        */
  /* denominator stays 1                                               */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "numeric: %s\n", field);
  uo->numeric_numerator = NO_DIGIT;
  uo->numeric_denominator = 1;
  if( strlen(field) > 0 )
  {
    sp = field;
    Scan(&sp, subfield, "/");
    if( sscanf(subfield, "%d", &tmp) != 1 )
    {
      fprintf(stderr, "%s:%d: %x has bad numeric numerator %s\n",
        (ASTRING) fname, *line_num, *hex_code, subfield);
      exit(1);
    }
    uo->numeric_numerator = tmp;
    if( Scan(&sp, subfield, "/") )
    {
      if( sscanf(subfield, "%d", &tmp) != 1 )
      {
        fprintf(stderr, "%s:%d: %x has bad numeric denominator %s\n",
          (ASTRING) fname, *line_num, *hex_code, subfield);
        exit(1);
      }
      uo->numeric_denominator = tmp;
    }
  }

  /* field 9 is Bidi mirroring; an empty field is taken to mean N */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "bidi mirroring: %s\n", field);
  if( strlen(field) == 0 || strcmp(field, "N") == 0 )
    uo->bidi_mirrored = FALSE;
  else if( strcmp(field, "Y") == 0 )
    uo->bidi_mirrored = TRUE;
  else
  {
    fprintf(stderr, "%s:%d: %x has bad bidi mirrored value %s\n",
      (ASTRING) fname, *line_num, *hex_code, field);
    exit(1);
  }

  /* field 10 is Unicode 1 name (not stored) */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "unicode 1 name: %s\n", field);

  /* field 11 is ISO 10646 comment field (not stored) */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "ISO 10646 comment: %s\n", field);

  /* field 12 is simple uppercase (one character) */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "simple uppercase: %s\n", field);
  uo->uppercase = ScanHexCodeList(field, fname, *line_num,
    *hex_code, "simple uppercase");

  /* field 13 is simple lowercase (one character) */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "simple lowercase: %s\n", field);
  uo->lowercase = ScanHexCodeList(field, fname, *line_num,
    *hex_code, "simple lowercase");

  /* field 14 is simple titlecase (one character) */
  Scan(&p, field, ";\n");
  if( DEBUG1 )
    fprintf(stderr, "simple titlecase: %s\n", field);
  uo->titlecase = ScanHexCodeList(field, fname, *line_num,
    *hex_code, "simple titlecase");

  /* debug print */
  if( DEBUG1 )
  {
    fprintf(stderr, "object for %04X:\n", *hex_code);
    UCharObjDebug(uo, 2, stderr);
  }
  return uo;
}


/*****************************************************************************/
/*                                                                           */
/*  SPCASE_OBJ NextSpCaseObj(FILE *fp, UCHAR *hex_code,                      */
/*    UTF8 fname, int *line_num)                                             */
/*                                                                           */
/*  Read the next special casing object from fp, or return NULL if the       */
/*  file is exhausted.  Set *hex_code to the object's hex code.              */
/*  Parameters fname and line_num are for error message printing only.       */
/*                                                                           */
/*****************************************************************************/

static SPCASE_OBJ NextSpCaseObj(FILE *fp, UCHAR *hex_code,
  UTF8 fname, int *line_num)
{
  char buff[MAX_BUFF], field[MAX_BUFF], subfield[MAX_BUFF];
  ASTRING p, sp;  int semicolons;  SPCASE_OBJ res;  size_t len;

  /* read lines until one yields an object or the file is exhausted */
  while( fgets(buff, MAX_BUFF, fp) != NULL )
  {
    /* make sure we got the whole line; the test for an empty buffer   */
    /* stops buff[len - 1] from indexing before the start of buff,     */
    /* which would otherwise happen if the line began with a NUL byte  */
    (*line_num)++;
    len = strlen(buff);
    if( len == 0 || buff[len - 1] != '\n' )
    {
      fprintf(stderr, "%s:%d: over-long line %s\n", (ASTRING) fname,
        *line_num, buff);
      exit(1);
    }

    /* get the non-comment part of the line into field */
    p = buff;
    if( Scan(&p, field, "#\n") && strlen(field) > 0 )
    {
      /* <code>; <lower> ; <title> ; <upper> ; [ <condition_list> ; ] */
      semicolons = ScanCount(field, ';');
      if( semicolons < 4 || semicolons > 5 )
      {
        fprintf(stderr, "%s:%d: %d fields on line (expected 4 or 5)\n",
          (ASTRING) fname, *line_num, semicolons);
        exit(1);
      }

      /* lines with a condition list (5 semicolons) are passed over */
      if( semicolons == 4 ) /* not handling conditional casing at the moment */
      {
        if( DEBUG1 )
          fprintf(stderr, "casing: %s\n", field);

        /* get hex code */
        sp = field;
        Scan(&sp, subfield, ";");
        if( sscanf(subfield, "%x", hex_code) != 1 )
        {
          fprintf(stderr, "%s:%d: can't read hex code in first field\n",
            (ASTRING) fname, *line_num);
          exit(1);
        }

        /* get lower-case entry */
        GetMemory(res, SPCASE_OBJ);
        Scan(&sp, subfield, ";");
        res->lowercase = ScanHexCodeList(subfield,
          fname, *line_num, *hex_code, "lowercase");

        /* get title-case entry */
        Scan(&sp, subfield, ";");
        res->titlecase = ScanHexCodeList(subfield,
          fname, *line_num, *hex_code, "titlecase");

        /* get upper-case entry */
        Scan(&sp, subfield, ";");
        res->uppercase = ScanHexCodeList(subfield,
          fname, *line_num, *hex_code, "uppercase");

        if( DEBUG1 )
        {
          fprintf(stderr, "special casing for %04X:\n", *hex_code);
          SpCaseObjDebug(res, 2, stderr);
        }
        return res;
      }
    }
  }
  return NULL;  /* didn't find any non-comment, non-conditional line */
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN MachineIsBigEndian()                                             */
/*                                                                           */
/*  Return TRUE if this program is running on a big-endian machine.          */
/*                                                                           */
/*****************************************************************************/

static BOOLEAN MachineIsBigEndian(void)
{
  size_t i;  unsigned int low_byte;
  union {
    unsigned int  whole_word;
    unsigned char part_word[sizeof(unsigned int)];
  } test;

  /* store 0, 1, 2, ... in successive bytes, lowest address first */
  for( i = 0;  i < sizeof(unsigned int);  i++ )
    test.part_word[i] = (unsigned char) i;

  /* A big-endian machine keeps the least significant byte at the       */
  /* highest address, so there the low-order byte of the word is the    */
  /* last value stored; a little-endian machine keeps it at the lowest  */
  /* address, where 0 was stored.                                       */
  /* NOTE(review): the earlier version of this test returned the        */
  /* opposite answer, so any file whose name was derived from it        */
  /* carries the other suffix and may need to be regenerated            */
  low_byte = test.whole_word % 256;
  if( low_byte == sizeof(unsigned int) - 1 )
    return TRUE;
  else if( low_byte == 0 )
    return FALSE;

  /* neither byte order was recognized */
  assert(FALSE);
  return FALSE; /* keep compiler happy, and defined when NDEBUG is set */
}


/*****************************************************************************/
/*                                                                           */
/*  void HandleMissingRange(INIT_TRIE init_trie, UCHAR first_code,           */
/*    UCHAR last_code, UTF8 utf8_char_fname, int line_num)                   */
/*                                                                           */
/*  Insert trie entries for code range first_code to last_code, which        */
/*  is missing from the character database.  Use default values for          */
/*  the properties.                                                          */
/*                                                                           */
/*  Implementation note.  Most default values are constant over all          */
/*  code values.  The exceptions are the general category and the            */
/*  bidi class.  For their sakes, we have to investigate every code          */
/*  point and create default objects for constant sub-ranges.                */
/*                                                                           */
/*****************************************************************************/

static void HandleMissingRange(INIT_TRIE init_trie, UCHAR first_code,
  UCHAR last_code, UTF8 utf8_char_fname, int line_num)
{
  UCHAR run_start, code;  UCHAR_OBJ run_obj;

  if( DEBUG5 )
    fprintf(stderr, "[ HandleMissingRange(0x%X, 0x%X, %s:%d)\n",
      first_code, last_code, (ASTRING) utf8_char_fname, line_num);

  /* sweep the range; each time a code's default bidi class or general */
  /* category differs from that of run_start, the run before it ends   */
  run_start = first_code;
  code = first_code + 1;
  while( code <= last_code )
  {
    if( DefaultBidiClass(run_start) != DefaultBidiClass(code) ||
        DefaultGeneralCategory(run_start) != DefaultGeneralCategory(code) )
    {
      /* run_start .. code - 1 is a maximal run with constant defaults */
      run_obj = DefaultUCharObj(run_start);
      if( DEBUG5 )
      {
        fprintf(stderr, "  range 0x%X - 0x%X:\n", run_start, code - 1);
        UCharObjDebug(run_obj, 4, stderr);
      }
      TrieInsert(init_trie, run_start, code - 1, run_obj);
      run_start = code;
    }
    code++;
  }

  /* the final run, which extends to the end of the range */
  run_obj = DefaultUCharObj(run_start);
  if( DEBUG5 )
  {
    fprintf(stderr, "  range 0x%X - 0x%X\n", run_start, last_code);
    UCharObjDebug(run_obj, 4, stderr);
  }
  TrieInsert(init_trie, run_start, last_code, run_obj);

  if( DEBUG5 )
    fprintf(stderr, "] HandleMissingRange returning\n");
}


/*****************************************************************************/
/*                                                                           */
/*  void UCharObjBriefDebug(void *uo, FILE *fp)                              */
/*                                                                           */
/*  Here uo is really a UCHAR_OBJ.  Do a brief, one-line debug onto fp.      */
/*                                                                           */
/*****************************************************************************/

static void UCharObjBriefDebug(void *uo, FILE *fp)
{
  UCHAR_OBJ obj = (UCHAR_OBJ) uo;
  fprintf(fp, "{%s [%s] [%s] [%s]}",
    UCharGeneralCategoryShow(obj->general_category),
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, obj->uppercase)),
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, obj->lowercase)),
    UStringToDisplayedHex(UStringPoolGet(ustring_pool, obj->titlecase)));
}


/*****************************************************************************/
/*                                                                           */
/*  void UCharInit(USTRING directory)                                        */
/*                                                                           */
/*  Initialize this module.  Character properties data are to be read        */
/*  from directory *directory, and packed versions are to be written         */
/*  there too.                                                               */
/*                                                                           */
/*****************************************************************************/

void UCharInit(USTRING directory)
{
  FILE *char_fp;  int char_line_num;
  FILE *case_fp;  int case_line_num;
  USTRING char_fname, case_fname, trie_fname, pool_fname;
  UTF8 utf8_char_fname, utf8_case_fname;
  UCHAR char_hex_code, prev_char_hex_code, case_hex_code;
  INIT_TRIE init_trie;  UCHAR_OBJ uo, prev_uo;  SPCASE_OBJ spc;
  DATA_SEQ data_seq, prev_data_seq;

  if( DEBUG4 )
    fprintf(stderr, "[ UCharInit(%s)\n", (ASTRING) UStringToUTF8(directory));
  trie_fname = UStringCat4(directory, AStringToUString(NPC_DIR_SEP), TRIE_FILE,
    AStringToUString(MachineIsBigEndian() ? "_be" : "_le"));
  pool_fname = UStringCat4(directory, AStringToUString(NPC_DIR_SEP), POOL_FILE,
    AStringToUString(MachineIsBigEndian() ? "_be" : "_le"));
  char_trie = TrieRestore(trie_fname);
  ustring_pool = UStringPoolRestore(pool_fname);
  if( char_trie == NULL || ustring_pool == NULL )
  {
    /* data files are not present so have to be created and saved */
    ustring_pool = UStringPoolNew();
    init_trie = TrieNew(sizeof(struct uchar_obj_rec), &UCharObjCmp, 
      &UCharObjBriefDebug, DefaultUCharObj(0));

    /* open and read character properties file */
    char_fname = UStringCat3(directory, AStringToUString(NPC_DIR_SEP),
      DATA_FILE);
    utf8_char_fname = UStringToUTF8(char_fname);
    char_fp = fopen((ASTRING) utf8_char_fname, "r");
    if( char_fp == NULL )
    {
      fprintf(stderr, "cannot open character properties file %s\n",
	(ASTRING) utf8_char_fname);
      exit(1);
    }
    char_line_num = 0;
    prev_uo = NULL;
    prev_char_hex_code = 0;
    uo = NextUCharObj(char_fp, &char_hex_code, utf8_char_fname, &char_line_num,
      &data_seq);
    while( uo != NULL )
    {
      /* check for errors possibly occurring at first character */
      if( prev_uo == NULL && data_seq == DATA_LAST )
      {
	/* error, sequence ends but doesn't begin */
	fprintf(stderr, "%s:%d: sequence end not expected\n",
	  (ASTRING) utf8_char_fname, char_line_num);
	exit(1);
      }

      /* check for errors possibly occuring at non-first character */
      if( prev_uo != NULL )
      {
	if( prev_char_hex_code >= char_hex_code )
	{
	  /* error, character out of order */
	  fprintf(stderr, "%s:%d: %x %s\n", (ASTRING) utf8_char_fname,
	    char_line_num, char_hex_code,
	    prev_char_hex_code == char_hex_code ?  "repeated" :"out of order");
	  exit(1);
	}
	if( prev_data_seq == DATA_FIRST && data_seq != DATA_LAST )
	{
	  /* error, sequence begins but doesn't end */
	  fprintf(stderr, "%s:%d: sequence end expected\n",
	    (ASTRING) utf8_char_fname, char_line_num);
	  exit(1);
	}
	if( prev_data_seq != DATA_FIRST && data_seq == DATA_LAST )
	{
	  /* error, sequence ends but doesn't begin */
	  fprintf(stderr, "%s:%d: sequence end not expected\n",
	    (ASTRING) utf8_char_fname, char_line_num);
	  exit(1);
	}
      }

      /* handle missing code ranges */
      if( data_seq != DATA_LAST &&
	  (int) prev_char_hex_code < (int) char_hex_code - 1 )
	HandleMissingRange(init_trie, prev_char_hex_code + 1,
	  char_hex_code - 1, utf8_char_fname, char_line_num);

      /* do whatever insertions are required */
      switch( data_seq )
      {
	case DATA_ORDINARY:

	  /* ordinary insertion of single character */
	  TrieInsert(init_trie, char_hex_code, char_hex_code, uo);
	  break;


	case DATA_FIRST:

	  /* first element of range; do nothing here */
	  break;


	case DATA_LAST:

	  /* last element of range; insert whole range */
	  if( !UCharObjEqual(prev_uo, uo) )
	  {
	    fprintf(stderr, "%s:%d: sequence endpoints have different data\n",
	      (ASTRING) utf8_char_fname, char_line_num);
	    exit(1);
	  }
	  if( DEBUG4 )
	    fprintf(stderr, "  character range 0x%x -- 0x%x\n",
	      prev_char_hex_code, char_hex_code);
	  TrieInsert(init_trie, prev_char_hex_code, char_hex_code, uo);
	  break;


	default:

	  assert(FALSE);
      }

      /* move on to next object */
      prev_uo = uo;
      prev_data_seq = data_seq;
      prev_char_hex_code = char_hex_code;
      uo = NextUCharObj(char_fp, &char_hex_code, utf8_char_fname,
	&char_line_num, &data_seq);
    }

    /* handle missing final code ranges */
    if( prev_char_hex_code < MAX_CHAR )
      HandleMissingRange(init_trie, prev_char_hex_code + 1,
	MAX_CHAR, utf8_char_fname, char_line_num);

    /* open and read special casing file */
    case_fname = UStringCat3(directory, AStringToUString(NPC_DIR_SEP),
      CASE_FILE);
    utf8_case_fname = UStringToUTF8(case_fname);
    case_fp = fopen((ASTRING) utf8_case_fname, "r");
    if( case_fp == NULL )
    {
      fprintf(stderr, "cannot open special casing file %s\n",
	(ASTRING) utf8_case_fname);
      exit(1);
    }
    case_line_num = 0;
    spc = NextSpCaseObj(case_fp, &case_hex_code, utf8_case_fname,
      &case_line_num);
    while( spc != NULL )
    {
      uo = (UCHAR_OBJ) TrieInitRetrieve(init_trie, case_hex_code);
      if( uo == NULL )
      {
	/* error, special casing has entry for non-existent character */
	fprintf(stderr, "%s:%d: %x not present in main file %s\n",
	  (ASTRING) utf8_case_fname, case_line_num, case_hex_code,
	  (ASTRING) utf8_char_fname);
	exit(1);
      }
      uo->lowercase = spc->lowercase;
      uo->uppercase = spc->uppercase;
      uo->titlecase = spc->titlecase;
      spc = NextSpCaseObj(case_fp, &case_hex_code, utf8_case_fname,
	&case_line_num);
    }

    if( DEBUG3 )
    {
      uo = TrieInitRetrieve(init_trie, 0x00DF);
      fprintf(stderr, "init_trie character %04X:\n", 0x00DF);
      UCharObjDebug(uo, 2, stderr);
      uo = TrieInitRetrieve(init_trie, 0x0061);
      fprintf(stderr, "init_trie character %04X:\n", 0x0061);
      UCharObjDebug(uo, 2, stderr);
      uo = TrieInitRetrieve(init_trie, 0x005F);
      fprintf(stderr, "init_trie character %04X:\n", 0x005F);
      UCharObjDebug(uo, 2, stderr);
      uo = TrieInitRetrieve(init_trie, 0x0073);
      fprintf(stderr, "init_trie character %04X:\n", 0x0073);
      UCharObjDebug(uo, 2, stderr);
    }

    /* save and restore the trie and string pool */
    TrieSave(init_trie, trie_fname);
    char_trie = TrieRestore(trie_fname);
    UStringPoolSave(ustring_pool, pool_fname);
    ustring_pool = UStringPoolRestore(pool_fname);
    assert(char_trie != NULL && ustring_pool != NULL);
  }
  if( DEBUG4 )
    fprintf(stderr, "] UCharInit returning\n");
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR UCharMax()                                                         */
/*                                                                           */
/*  Return the largest known character, as reported by TrieMaxKey for        */
/*  char_trie.  The module must have been initialized (char_trie is          */
/*  asserted to be non-NULL).                                                */
/*                                                                           */
/*****************************************************************************/

UCHAR UCharMax()
{
  UCHAR max_key;
  assert(char_trie != NULL);
  max_key = TrieMaxKey(char_trie);
  return max_key;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_OBJ UCharObjGet(UCHAR ch)                                          */
/*                                                                           */
/*  Return the object that char_trie associates with ch, cast to             */
/*  UCHAR_OBJ.  char_trie is asserted to be non-NULL first.                  */
/*                                                                           */
/*****************************************************************************/

static UCHAR_OBJ UCharObjGet(UCHAR ch)
{
  UCHAR_OBJ found;
  assert(char_trie != NULL);
  found = (UCHAR_OBJ) TrieRetrieve(char_trie, ch);
  return found;
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharEqualProperties(UCHAR ch1, UCHAR ch2)                       */
/*                                                                           */
/*  Return TRUE if the objects associated with ch1 and ch2 compare equal     */
/*  under UCharObjEqual, i.e. the two characters have the same properties.   */
/*                                                                           */
/*****************************************************************************/

BOOLEAN UCharEqualProperties(UCHAR ch1, UCHAR ch2)
{
  UCHAR_OBJ first, second;
  first = UCharObjGet(ch1);
  second = UCharObjGet(ch2);
  return UCharObjEqual(first, second);
}



/*****************************************************************************/
/*                                                                           */
/*  Submodule "character properties".                                        */
/*                                                                           */
/*****************************************************************************/

/*****************************************************************************/
/*                                                                           */
/*  UCHAR_GENERAL_CATEGORY UCharGeneralCategory(UCHAR ch)                    */
/*                                                                           */
/*  Return the Unicode General Category of ch, read from the                 */
/*  general_category field of the object associated with ch.                 */
/*                                                                           */
/*****************************************************************************/

UCHAR_GENERAL_CATEGORY UCharGeneralCategory(UCHAR ch)
{
  return UCharObjGet(ch)->general_category;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_LEX_CLASS UCharLexClass(UCHAR ch)                                  */
/*                                                                           */
/*  Return the Nonpareil lexical class of ch, read from the lex_class        */
/*  field of the object associated with ch.                                  */
/*                                                                           */
/*****************************************************************************/

UCHAR_LEX_CLASS UCharLexClass(UCHAR ch)
{
  return UCharObjGet(ch)->lex_class;
}


/*****************************************************************************/
/*                                                                           */
/*  unsigned char UCharCanonicalCombiningClass(UCHAR ch)                     */
/*                                                                           */
/*  Return the canonical combining class of ch, read from the                */
/*  canonical_combining_class field of the object associated with ch.        */
/*                                                                           */
/*****************************************************************************/

unsigned char UCharCanonicalCombiningClass(UCHAR ch)
{
  return UCharObjGet(ch)->canonical_combining_class;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_BIDI_CLASS UCharBidiClass(UCHAR ch)                                */
/*                                                                           */
/*  Return the Bidi class of ch, read from the bidi_class field of the       */
/*  object associated with ch.                                               */
/*                                                                           */
/*****************************************************************************/

UCHAR_BIDI_CLASS UCharBidiClass(UCHAR ch)
{
  return UCharObjGet(ch)->bidi_class;
}


/*****************************************************************************/
/*                                                                           */
/*  UCHAR_CMAP_CLASS UCharCMapClass(UCHAR ch)                                */
/*                                                                           */
/*  Return the canonical/compatibility mapping class of ch, read from        */
/*  the cmap_class field of the object associated with ch.                   */
/*                                                                           */
/*****************************************************************************/

UCHAR_CMAP_CLASS UCharCMapClass(UCHAR ch)
{
  return UCharObjGet(ch)->cmap_class;
}


/*****************************************************************************/
/*                                                                           */
/*  USTRING UCharCMap(UCHAR ch)                                              */
/*                                                                           */
/*  Return the canonical/compatibility mapping of ch, fetched from           */
/*  ustring_pool.  This is allowed only if UCharCMapClass(ch) is not         */
/*  UCHAR_CMAP_NONE; the condition is asserted.                              */
/*                                                                           */
/*****************************************************************************/

USTRING UCharCMap(UCHAR ch)
{
  UCHAR_OBJ obj;
  obj = UCharObjGet(ch);

  /* a character with no mapping class has no mapping to return */
  assert(obj->cmap_class != UCHAR_CMAP_NONE);
  return UStringPoolGet(ustring_pool, obj->cmap);
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharIsPrintableAscii(UCHAR ch)                                  */
/*                                                                           */
/*  Return TRUE if ch is a printable ASCII character, that is, if it         */
/*  lies in the range ' ' to '~' inclusive.                                  */
/*                                                                           */
/*****************************************************************************/

BOOLEAN UCharIsPrintableAscii(UCHAR ch)
{
  /* printable means not below space and not above tilde */
  return !(ch < ' ' || ch > '~');
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharIsDecimalDigit(UCHAR ch, int *value)                        */
/*                                                                           */
/*  If ch is a decimal digit, set *value to its value and return TRUE.       */
/*  Otherwise return FALSE; *value is then not assigned and must be          */
/*  treated as undefined.  A character is a decimal digit when its           */
/*  decimal_digit field differs from NO_DIGIT.                               */
/*                                                                           */
/*****************************************************************************/

BOOLEAN UCharIsDecimalDigit(UCHAR ch, int *value)
{
  UCHAR_OBJ obj;
  obj = UCharObjGet(ch);
  if( obj->decimal_digit == NO_DIGIT )
    return FALSE;
  *value = obj->decimal_digit;
  return TRUE;
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharIsDigit(UCHAR ch, int *value)                               */
/*                                                                           */
/*  If ch is a digit (NB not necessarily a decimal digit!), set *value       */
/*  to its value and return TRUE.  Otherwise return FALSE; *value is         */
/*  then not assigned and must be treated as undefined.  The test is         */
/*  whether the digit field differs from NO_DIGIT.                           */
/*                                                                           */
/*****************************************************************************/

BOOLEAN UCharIsDigit(UCHAR ch, int *value)
{
  UCHAR_OBJ obj;
  obj = UCharObjGet(ch);
  if( obj->digit == NO_DIGIT )
    return FALSE;
  *value = obj->digit;
  return TRUE;
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharIsNumeric(UCHAR ch, int *numerator, int *denominator)       */
/*                                                                           */
/*  If ch is numeric, set *numerator and *denominator to its rational        */
/*  value and return TRUE; the character is a fraction if                    */
/*  *denominator > 1.  Otherwise return FALSE; the two results are then      */
/*  not assigned and must be treated as undefined.  The test is whether      */
/*  the numeric_numerator field differs from NO_DIGIT.                       */
/*                                                                           */
/*****************************************************************************/

BOOLEAN UCharIsNumeric(UCHAR ch, int *numerator, int *denominator)
{
  UCHAR_OBJ obj;
  obj = UCharObjGet(ch);
  if( obj->numeric_numerator == NO_DIGIT )
    return FALSE;
  *numerator = obj->numeric_numerator;
  *denominator = obj->numeric_denominator;
  return TRUE;
}


/*****************************************************************************/
/*                                                                           */
/*  BOOLEAN UCharIsBidiMirrored(UCHAR ch)                                    */
/*                                                                           */
/*  Return TRUE if ch has the Bidi mirrored property, as recorded in the     */
/*  bidi_mirrored field of the object associated with ch.                    */
/*                                                                           */
/*****************************************************************************/

BOOLEAN UCharIsBidiMirrored(UCHAR ch)
{
  return UCharObjGet(ch)->bidi_mirrored;
}


/*****************************************************************************/
/*                                                                           */
/*  USTRING UCharUpperCase(UCHAR ch)                                         */
/*                                                                           */
/*  Return the uppercase equivalent of ch, fetched from ustring_pool.        */
/*  The result is a string because in some rare cases the equivalent is      */
/*  a sequence of characters rather than a single character.                 */
/*                                                                           */
/*  Note: a few case conversions depend on conditions, either the language   */
/*  or the context.  These conditional conversions are not implemented       */
/*  by this function.                                                        */
/*                                                                           */
/*****************************************************************************/

USTRING UCharUpperCase(UCHAR ch)
{
  return UStringPoolGet(ustring_pool, UCharObjGet(ch)->uppercase);
}


/*****************************************************************************/
/*                                                                           */
/*  USTRING UCharLowerCase(UCHAR ch)                                         */
/*                                                                           */
/*  Return the lowercase equivalent of ch, fetched from ustring_pool.        */
/*  See UCharUpperCase for further information.                              */
/*                                                                           */
/*****************************************************************************/

USTRING UCharLowerCase(UCHAR ch)
{
  return UStringPoolGet(ustring_pool, UCharObjGet(ch)->lowercase);
}


/*****************************************************************************/
/*                                                                           */
/*  USTRING UCharTitleCase(UCHAR ch)                                         */
/*                                                                           */
/*  Return the title case equivalent of ch, fetched from ustring_pool.       */
/*  See UCharUpperCase for further information.                              */
/*                                                                           */
/*****************************************************************************/

USTRING UCharTitleCase(UCHAR ch)
{
  return UStringPoolGet(ustring_pool, UCharObjGet(ch)->titlecase);
}


/*****************************************************************************/
/*                                                                           */
/*  Submodule "debug and test"                                               */
/*                                                                           */
/*****************************************************************************/

/*****************************************************************************/
/*                                                                           */
/*  void UCharTestChar(UCHAR ch, ASTRING descr, FILE *fp)                    */
/*                                                                           */
/*  Show ch on fp: a heading line giving its code in hex and descr,          */
/*  followed by UCharObjDebug's output for the object associated with ch.    */
/*                                                                           */
/*****************************************************************************/

static void UCharTestChar(UCHAR ch, ASTRING descr, FILE *fp)
{
  UCHAR_OBJ obj;

  /* fetch the object first, so nothing is printed if the lookup asserts */
  obj = UCharObjGet(ch);
  fprintf(fp, "character %04X (%s):\n", ch, descr);
  UCharObjDebug(obj, 2, fp);
}


/*****************************************************************************/
/*                                                                           */
/*  void UCharTest(FILE *fp)                                                 */
/*                                                                           */
/*  Test this module, assuming that it has been initialized.                 */
/*                                                                           */
/*****************************************************************************/

void UCharTest(FILE *fp)
{
  assert(char_trie != NULL);
  UCharTestChar(0x00DF, "german small s", fp);
  UCharTestChar(0x0061, "small letter a", fp);
  UCharTestChar(0x005F, "underscore", fp);
}
