xref: /openbmc/linux/fs/unicode/mkutf8data.c (revision 2b3d0478)
128ba53c0SMasahiro Yamada /*
228ba53c0SMasahiro Yamada  * Copyright (c) 2014 SGI.
328ba53c0SMasahiro Yamada  * All rights reserved.
428ba53c0SMasahiro Yamada  *
528ba53c0SMasahiro Yamada  * This program is free software; you can redistribute it and/or
628ba53c0SMasahiro Yamada  * modify it under the terms of the GNU General Public License as
728ba53c0SMasahiro Yamada  * published by the Free Software Foundation.
828ba53c0SMasahiro Yamada  *
928ba53c0SMasahiro Yamada  * This program is distributed in the hope that it would be useful,
1028ba53c0SMasahiro Yamada  * but WITHOUT ANY WARRANTY; without even the implied warranty of
1128ba53c0SMasahiro Yamada  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1228ba53c0SMasahiro Yamada  * GNU General Public License for more details.
1328ba53c0SMasahiro Yamada  *
1428ba53c0SMasahiro Yamada  * You should have received a copy of the GNU General Public License
1528ba53c0SMasahiro Yamada  * along with this program; if not, write the Free Software Foundation,
1628ba53c0SMasahiro Yamada  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
1728ba53c0SMasahiro Yamada  */
1828ba53c0SMasahiro Yamada 
1928ba53c0SMasahiro Yamada /* Generator for a compact trie for unicode normalization */
2028ba53c0SMasahiro Yamada 
2128ba53c0SMasahiro Yamada #include <sys/types.h>
2228ba53c0SMasahiro Yamada #include <stddef.h>
2328ba53c0SMasahiro Yamada #include <stdlib.h>
2428ba53c0SMasahiro Yamada #include <stdio.h>
2528ba53c0SMasahiro Yamada #include <assert.h>
2628ba53c0SMasahiro Yamada #include <string.h>
2728ba53c0SMasahiro Yamada #include <unistd.h>
2828ba53c0SMasahiro Yamada #include <errno.h>
2928ba53c0SMasahiro Yamada 
3028ba53c0SMasahiro Yamada /* Default names of the in- and output files. */
3128ba53c0SMasahiro Yamada 
3228ba53c0SMasahiro Yamada #define AGE_NAME	"DerivedAge.txt"
3328ba53c0SMasahiro Yamada #define CCC_NAME	"DerivedCombiningClass.txt"
3428ba53c0SMasahiro Yamada #define PROP_NAME	"DerivedCoreProperties.txt"
3528ba53c0SMasahiro Yamada #define DATA_NAME	"UnicodeData.txt"
3628ba53c0SMasahiro Yamada #define FOLD_NAME	"CaseFolding.txt"
3728ba53c0SMasahiro Yamada #define NORM_NAME	"NormalizationCorrections.txt"
3828ba53c0SMasahiro Yamada #define TEST_NAME	"NormalizationTest.txt"
3928ba53c0SMasahiro Yamada #define UTF8_NAME	"utf8data.h"
4028ba53c0SMasahiro Yamada 
4128ba53c0SMasahiro Yamada const char	*age_name  = AGE_NAME;
4228ba53c0SMasahiro Yamada const char	*ccc_name  = CCC_NAME;
4328ba53c0SMasahiro Yamada const char	*prop_name = PROP_NAME;
4428ba53c0SMasahiro Yamada const char	*data_name = DATA_NAME;
4528ba53c0SMasahiro Yamada const char	*fold_name = FOLD_NAME;
4628ba53c0SMasahiro Yamada const char	*norm_name = NORM_NAME;
4728ba53c0SMasahiro Yamada const char	*test_name = TEST_NAME;
4828ba53c0SMasahiro Yamada const char	*utf8_name = UTF8_NAME;
4928ba53c0SMasahiro Yamada 
5028ba53c0SMasahiro Yamada int verbose = 0;
5128ba53c0SMasahiro Yamada 
5228ba53c0SMasahiro Yamada /* An arbitrary line size limit on input lines. */
5328ba53c0SMasahiro Yamada 
5428ba53c0SMasahiro Yamada #define LINESIZE	1024
5528ba53c0SMasahiro Yamada char line[LINESIZE];
5628ba53c0SMasahiro Yamada char buf0[LINESIZE];
5728ba53c0SMasahiro Yamada char buf1[LINESIZE];
5828ba53c0SMasahiro Yamada char buf2[LINESIZE];
5928ba53c0SMasahiro Yamada char buf3[LINESIZE];
6028ba53c0SMasahiro Yamada 
6128ba53c0SMasahiro Yamada const char *argv0;
6228ba53c0SMasahiro Yamada 
6328ba53c0SMasahiro Yamada #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
6428ba53c0SMasahiro Yamada 
6528ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
6628ba53c0SMasahiro Yamada 
6728ba53c0SMasahiro Yamada /*
6828ba53c0SMasahiro Yamada  * Unicode version numbers consist of three parts: major, minor, and a
6928ba53c0SMasahiro Yamada  * revision.  These numbers are packed into an unsigned int to obtain
7028ba53c0SMasahiro Yamada  * a single version number.
7128ba53c0SMasahiro Yamada  *
7228ba53c0SMasahiro Yamada  * To save space in the generated trie, the unicode version is not
7328ba53c0SMasahiro Yamada  * stored directly, instead we calculate a generation number from the
7428ba53c0SMasahiro Yamada  * unicode versions seen in the DerivedAge file, and use that as an
7528ba53c0SMasahiro Yamada  * index into a table of unicode versions.
7628ba53c0SMasahiro Yamada  */
7728ba53c0SMasahiro Yamada #define UNICODE_MAJ_SHIFT		(16)
7828ba53c0SMasahiro Yamada #define UNICODE_MIN_SHIFT		(8)
7928ba53c0SMasahiro Yamada 
8028ba53c0SMasahiro Yamada #define UNICODE_MAJ_MAX			((unsigned short)-1)
8128ba53c0SMasahiro Yamada #define UNICODE_MIN_MAX			((unsigned char)-1)
8228ba53c0SMasahiro Yamada #define UNICODE_REV_MAX			((unsigned char)-1)
8328ba53c0SMasahiro Yamada 
8428ba53c0SMasahiro Yamada #define UNICODE_AGE(MAJ,MIN,REV)			\
8528ba53c0SMasahiro Yamada 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
8628ba53c0SMasahiro Yamada 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
8728ba53c0SMasahiro Yamada 	 ((unsigned int)(REV)))
8828ba53c0SMasahiro Yamada 
8928ba53c0SMasahiro Yamada unsigned int *ages;
9028ba53c0SMasahiro Yamada int ages_count;
9128ba53c0SMasahiro Yamada 
9228ba53c0SMasahiro Yamada unsigned int unicode_maxage;
9328ba53c0SMasahiro Yamada 
age_valid(unsigned int major,unsigned int minor,unsigned int revision)9428ba53c0SMasahiro Yamada static int age_valid(unsigned int major, unsigned int minor,
9528ba53c0SMasahiro Yamada 		     unsigned int revision)
9628ba53c0SMasahiro Yamada {
9728ba53c0SMasahiro Yamada 	if (major > UNICODE_MAJ_MAX)
9828ba53c0SMasahiro Yamada 		return 0;
9928ba53c0SMasahiro Yamada 	if (minor > UNICODE_MIN_MAX)
10028ba53c0SMasahiro Yamada 		return 0;
10128ba53c0SMasahiro Yamada 	if (revision > UNICODE_REV_MAX)
10228ba53c0SMasahiro Yamada 		return 0;
10328ba53c0SMasahiro Yamada 	return 1;
10428ba53c0SMasahiro Yamada }
10528ba53c0SMasahiro Yamada 
10628ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
10728ba53c0SMasahiro Yamada 
10828ba53c0SMasahiro Yamada /*
10928ba53c0SMasahiro Yamada  * utf8trie_t
11028ba53c0SMasahiro Yamada  *
11128ba53c0SMasahiro Yamada  * A compact binary tree, used to decode UTF-8 characters.
11228ba53c0SMasahiro Yamada  *
11328ba53c0SMasahiro Yamada  * Internal nodes are one byte for the node itself, and up to three
11428ba53c0SMasahiro Yamada  * bytes for an offset into the tree.  The first byte contains the
11528ba53c0SMasahiro Yamada  * following information:
11628ba53c0SMasahiro Yamada  *  NEXTBYTE  - flag        - advance to next byte if set
11728ba53c0SMasahiro Yamada  *  BITNUM    - 3 bit field - the bit number to tested
11828ba53c0SMasahiro Yamada  *  OFFLEN    - 2 bit field - number of bytes in the offset
11928ba53c0SMasahiro Yamada  * if offlen == 0 (non-branching node)
12028ba53c0SMasahiro Yamada  *  RIGHTPATH - 1 bit field - set if the following node is for the
12128ba53c0SMasahiro Yamada  *                            right-hand path (tested bit is set)
12228ba53c0SMasahiro Yamada  *  TRIENODE  - 1 bit field - set if the following node is an internal
12328ba53c0SMasahiro Yamada  *                            node, otherwise it is a leaf node
12428ba53c0SMasahiro Yamada  * if offlen != 0 (branching node)
12528ba53c0SMasahiro Yamada  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
12628ba53c0SMasahiro Yamada  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
12728ba53c0SMasahiro Yamada  *
12828ba53c0SMasahiro Yamada  * Due to the way utf8 works, there cannot be branching nodes with
12928ba53c0SMasahiro Yamada  * NEXTBYTE set, and moreover those nodes always have a righthand
13028ba53c0SMasahiro Yamada  * descendant.
13128ba53c0SMasahiro Yamada  */
13228ba53c0SMasahiro Yamada typedef unsigned char utf8trie_t;
13328ba53c0SMasahiro Yamada #define BITNUM		0x07
13428ba53c0SMasahiro Yamada #define NEXTBYTE	0x08
13528ba53c0SMasahiro Yamada #define OFFLEN		0x30
13628ba53c0SMasahiro Yamada #define OFFLEN_SHIFT	4
13728ba53c0SMasahiro Yamada #define RIGHTPATH	0x40
13828ba53c0SMasahiro Yamada #define TRIENODE	0x80
13928ba53c0SMasahiro Yamada #define RIGHTNODE	0x40
14028ba53c0SMasahiro Yamada #define LEFTNODE	0x80
14128ba53c0SMasahiro Yamada 
14228ba53c0SMasahiro Yamada /*
14328ba53c0SMasahiro Yamada  * utf8leaf_t
14428ba53c0SMasahiro Yamada  *
14528ba53c0SMasahiro Yamada  * The leaves of the trie are embedded in the trie, and so the same
14628ba53c0SMasahiro Yamada  * underlying datatype, unsigned char.
14728ba53c0SMasahiro Yamada  *
14828ba53c0SMasahiro Yamada  * leaf[0]: The unicode version, stored as a generation number that is
14928ba53c0SMasahiro Yamada  *          an index into utf8agetab[].  With this we can filter code
15028ba53c0SMasahiro Yamada  *          points based on the unicode version in which they were
15128ba53c0SMasahiro Yamada  *          defined.  The CCC of a non-defined code point is 0.
15228ba53c0SMasahiro Yamada  * leaf[1]: Canonical Combining Class. During normalization, we need
15328ba53c0SMasahiro Yamada  *          to do a stable sort into ascending order of all characters
15428ba53c0SMasahiro Yamada  *          with a non-zero CCC that occur between two characters with
15528ba53c0SMasahiro Yamada  *          a CCC of 0, or at the begin or end of a string.
15628ba53c0SMasahiro Yamada  *          The unicode standard guarantees that all CCC values are
15728ba53c0SMasahiro Yamada  *          between 0 and 254 inclusive, which leaves 255 available as
15828ba53c0SMasahiro Yamada  *          a special value.
15928ba53c0SMasahiro Yamada  *          Code points with CCC 0 are known as stoppers.
16028ba53c0SMasahiro Yamada  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
16128ba53c0SMasahiro Yamada  *          start of a NUL-terminated string that is the decomposition
16228ba53c0SMasahiro Yamada  *          of the character.
16328ba53c0SMasahiro Yamada  *          The CCC of a decomposable character is the same as the CCC
16428ba53c0SMasahiro Yamada  *          of the first character of its decomposition.
16528ba53c0SMasahiro Yamada  *          Some characters decompose as the empty string: these are
16628ba53c0SMasahiro Yamada  *          characters with the Default_Ignorable_Code_Point property.
16728ba53c0SMasahiro Yamada  *          These do affect normalization, as they all have CCC 0.
16828ba53c0SMasahiro Yamada  *
16928ba53c0SMasahiro Yamada  * The decompositions in the trie have been fully expanded.
17028ba53c0SMasahiro Yamada  *
17128ba53c0SMasahiro Yamada  * Casefolding, if applicable, is also done using decompositions.
17228ba53c0SMasahiro Yamada  */
17328ba53c0SMasahiro Yamada typedef unsigned char utf8leaf_t;
17428ba53c0SMasahiro Yamada 
17528ba53c0SMasahiro Yamada #define LEAF_GEN(LEAF)	((LEAF)[0])
17628ba53c0SMasahiro Yamada #define LEAF_CCC(LEAF)	((LEAF)[1])
17728ba53c0SMasahiro Yamada #define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
17828ba53c0SMasahiro Yamada 
17928ba53c0SMasahiro Yamada #define MAXGEN		(255)
18028ba53c0SMasahiro Yamada 
18128ba53c0SMasahiro Yamada #define MINCCC		(0)
18228ba53c0SMasahiro Yamada #define MAXCCC		(254)
18328ba53c0SMasahiro Yamada #define STOPPER		(0)
18428ba53c0SMasahiro Yamada #define DECOMPOSE	(255)
18528ba53c0SMasahiro Yamada #define HANGUL		((char)(255))
18628ba53c0SMasahiro Yamada 
18728ba53c0SMasahiro Yamada #define UTF8HANGULLEAF	(12)
18828ba53c0SMasahiro Yamada 
18928ba53c0SMasahiro Yamada struct tree;
19028ba53c0SMasahiro Yamada static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
19128ba53c0SMasahiro Yamada 			       const char *, size_t);
19228ba53c0SMasahiro Yamada static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
19328ba53c0SMasahiro Yamada 
19428ba53c0SMasahiro Yamada unsigned char *utf8data;
19528ba53c0SMasahiro Yamada size_t utf8data_size;
19628ba53c0SMasahiro Yamada 
19728ba53c0SMasahiro Yamada utf8trie_t *nfdi;
19828ba53c0SMasahiro Yamada utf8trie_t *nfdicf;
19928ba53c0SMasahiro Yamada 
20028ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
20128ba53c0SMasahiro Yamada 
20228ba53c0SMasahiro Yamada /*
20328ba53c0SMasahiro Yamada  * UTF8 valid ranges.
20428ba53c0SMasahiro Yamada  *
20528ba53c0SMasahiro Yamada  * The UTF-8 encoding spreads the bits of a 32bit word over several
20628ba53c0SMasahiro Yamada  * bytes. This table gives the ranges that can be held and how they'd
20728ba53c0SMasahiro Yamada  * be represented.
20828ba53c0SMasahiro Yamada  *
20928ba53c0SMasahiro Yamada  * 0x00000000 0x0000007F: 0xxxxxxx
21028ba53c0SMasahiro Yamada  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
21128ba53c0SMasahiro Yamada  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
21228ba53c0SMasahiro Yamada  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
21328ba53c0SMasahiro Yamada  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
21428ba53c0SMasahiro Yamada  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
21528ba53c0SMasahiro Yamada  *
21628ba53c0SMasahiro Yamada  * There is an additional requirement on UTF-8, in that only the
21728ba53c0SMasahiro Yamada  * shortest representation of a 32bit value is to be used.  A decoder
21828ba53c0SMasahiro Yamada  * must not decode sequences that do not satisfy this requirement.
21928ba53c0SMasahiro Yamada  * Thus the allowed ranges have a lower bound.
22028ba53c0SMasahiro Yamada  *
22128ba53c0SMasahiro Yamada  * 0x00000000 0x0000007F: 0xxxxxxx
22228ba53c0SMasahiro Yamada  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
22328ba53c0SMasahiro Yamada  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
22428ba53c0SMasahiro Yamada  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
22528ba53c0SMasahiro Yamada  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
22628ba53c0SMasahiro Yamada  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
22728ba53c0SMasahiro Yamada  *
22828ba53c0SMasahiro Yamada  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
22928ba53c0SMasahiro Yamada  * 17 planes of 65536 values.  This limits the sequences actually seen
23028ba53c0SMasahiro Yamada  * even more, to just the following.
23128ba53c0SMasahiro Yamada  *
23228ba53c0SMasahiro Yamada  *          0 -     0x7f: 0                     0x7f
23328ba53c0SMasahiro Yamada  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
23428ba53c0SMasahiro Yamada  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
23528ba53c0SMasahiro Yamada  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
23628ba53c0SMasahiro Yamada  *
23728ba53c0SMasahiro Yamada  * Even within those ranges not all values are allowed: the surrogates
23828ba53c0SMasahiro Yamada  * 0xd800 - 0xdfff should never be seen.
23928ba53c0SMasahiro Yamada  *
24028ba53c0SMasahiro Yamada  * Note that the longest sequence seen with valid usage is 4 bytes,
24128ba53c0SMasahiro Yamada  * the same a single UTF-32 character.  This makes the UTF-8
24228ba53c0SMasahiro Yamada  * representation of Unicode strictly smaller than UTF-32.
24328ba53c0SMasahiro Yamada  *
24428ba53c0SMasahiro Yamada  * The shortest sequence requirement was introduced by:
24528ba53c0SMasahiro Yamada  *    Corrigendum #1: UTF-8 Shortest Form
24628ba53c0SMasahiro Yamada  * It can be found here:
24728ba53c0SMasahiro Yamada  *    http://www.unicode.org/versions/corrigendum1.html
24828ba53c0SMasahiro Yamada  *
24928ba53c0SMasahiro Yamada  */
25028ba53c0SMasahiro Yamada 
25128ba53c0SMasahiro Yamada #define UTF8_2_BITS     0xC0
25228ba53c0SMasahiro Yamada #define UTF8_3_BITS     0xE0
25328ba53c0SMasahiro Yamada #define UTF8_4_BITS     0xF0
25428ba53c0SMasahiro Yamada #define UTF8_N_BITS     0x80
25528ba53c0SMasahiro Yamada #define UTF8_2_MASK     0xE0
25628ba53c0SMasahiro Yamada #define UTF8_3_MASK     0xF0
25728ba53c0SMasahiro Yamada #define UTF8_4_MASK     0xF8
25828ba53c0SMasahiro Yamada #define UTF8_N_MASK     0xC0
25928ba53c0SMasahiro Yamada #define UTF8_V_MASK     0x3F
26028ba53c0SMasahiro Yamada #define UTF8_V_SHIFT    6
26128ba53c0SMasahiro Yamada 
utf8encode(char * str,unsigned int val)26228ba53c0SMasahiro Yamada static int utf8encode(char *str, unsigned int val)
26328ba53c0SMasahiro Yamada {
26428ba53c0SMasahiro Yamada 	int len;
26528ba53c0SMasahiro Yamada 
26628ba53c0SMasahiro Yamada 	if (val < 0x80) {
26728ba53c0SMasahiro Yamada 		str[0] = val;
26828ba53c0SMasahiro Yamada 		len = 1;
26928ba53c0SMasahiro Yamada 	} else if (val < 0x800) {
27028ba53c0SMasahiro Yamada 		str[1] = val & UTF8_V_MASK;
27128ba53c0SMasahiro Yamada 		str[1] |= UTF8_N_BITS;
27228ba53c0SMasahiro Yamada 		val >>= UTF8_V_SHIFT;
27328ba53c0SMasahiro Yamada 		str[0] = val;
27428ba53c0SMasahiro Yamada 		str[0] |= UTF8_2_BITS;
27528ba53c0SMasahiro Yamada 		len = 2;
27628ba53c0SMasahiro Yamada 	} else if (val < 0x10000) {
27728ba53c0SMasahiro Yamada 		str[2] = val & UTF8_V_MASK;
27828ba53c0SMasahiro Yamada 		str[2] |= UTF8_N_BITS;
27928ba53c0SMasahiro Yamada 		val >>= UTF8_V_SHIFT;
28028ba53c0SMasahiro Yamada 		str[1] = val & UTF8_V_MASK;
28128ba53c0SMasahiro Yamada 		str[1] |= UTF8_N_BITS;
28228ba53c0SMasahiro Yamada 		val >>= UTF8_V_SHIFT;
28328ba53c0SMasahiro Yamada 		str[0] = val;
28428ba53c0SMasahiro Yamada 		str[0] |= UTF8_3_BITS;
28528ba53c0SMasahiro Yamada 		len = 3;
28628ba53c0SMasahiro Yamada 	} else if (val < 0x110000) {
28728ba53c0SMasahiro Yamada 		str[3] = val & UTF8_V_MASK;
28828ba53c0SMasahiro Yamada 		str[3] |= UTF8_N_BITS;
28928ba53c0SMasahiro Yamada 		val >>= UTF8_V_SHIFT;
29028ba53c0SMasahiro Yamada 		str[2] = val & UTF8_V_MASK;
29128ba53c0SMasahiro Yamada 		str[2] |= UTF8_N_BITS;
29228ba53c0SMasahiro Yamada 		val >>= UTF8_V_SHIFT;
29328ba53c0SMasahiro Yamada 		str[1] = val & UTF8_V_MASK;
29428ba53c0SMasahiro Yamada 		str[1] |= UTF8_N_BITS;
29528ba53c0SMasahiro Yamada 		val >>= UTF8_V_SHIFT;
29628ba53c0SMasahiro Yamada 		str[0] = val;
29728ba53c0SMasahiro Yamada 		str[0] |= UTF8_4_BITS;
29828ba53c0SMasahiro Yamada 		len = 4;
29928ba53c0SMasahiro Yamada 	} else {
30028ba53c0SMasahiro Yamada 		printf("%#x: illegal val\n", val);
30128ba53c0SMasahiro Yamada 		len = 0;
30228ba53c0SMasahiro Yamada 	}
30328ba53c0SMasahiro Yamada 	return len;
30428ba53c0SMasahiro Yamada }
30528ba53c0SMasahiro Yamada 
utf8decode(const char * str)30628ba53c0SMasahiro Yamada static unsigned int utf8decode(const char *str)
30728ba53c0SMasahiro Yamada {
30828ba53c0SMasahiro Yamada 	const unsigned char *s = (const unsigned char*)str;
30928ba53c0SMasahiro Yamada 	unsigned int unichar = 0;
31028ba53c0SMasahiro Yamada 
31128ba53c0SMasahiro Yamada 	if (*s < 0x80) {
31228ba53c0SMasahiro Yamada 		unichar = *s;
31328ba53c0SMasahiro Yamada 	} else if (*s < UTF8_3_BITS) {
31428ba53c0SMasahiro Yamada 		unichar = *s++ & 0x1F;
31528ba53c0SMasahiro Yamada 		unichar <<= UTF8_V_SHIFT;
31628ba53c0SMasahiro Yamada 		unichar |= *s & 0x3F;
31728ba53c0SMasahiro Yamada 	} else if (*s < UTF8_4_BITS) {
31828ba53c0SMasahiro Yamada 		unichar = *s++ & 0x0F;
31928ba53c0SMasahiro Yamada 		unichar <<= UTF8_V_SHIFT;
32028ba53c0SMasahiro Yamada 		unichar |= *s++ & 0x3F;
32128ba53c0SMasahiro Yamada 		unichar <<= UTF8_V_SHIFT;
32228ba53c0SMasahiro Yamada 		unichar |= *s & 0x3F;
32328ba53c0SMasahiro Yamada 	} else {
32428ba53c0SMasahiro Yamada 		unichar = *s++ & 0x0F;
32528ba53c0SMasahiro Yamada 		unichar <<= UTF8_V_SHIFT;
32628ba53c0SMasahiro Yamada 		unichar |= *s++ & 0x3F;
32728ba53c0SMasahiro Yamada 		unichar <<= UTF8_V_SHIFT;
32828ba53c0SMasahiro Yamada 		unichar |= *s++ & 0x3F;
32928ba53c0SMasahiro Yamada 		unichar <<= UTF8_V_SHIFT;
33028ba53c0SMasahiro Yamada 		unichar |= *s & 0x3F;
33128ba53c0SMasahiro Yamada 	}
33228ba53c0SMasahiro Yamada 	return unichar;
33328ba53c0SMasahiro Yamada }
33428ba53c0SMasahiro Yamada 
utf32valid(unsigned int unichar)33528ba53c0SMasahiro Yamada static int utf32valid(unsigned int unichar)
33628ba53c0SMasahiro Yamada {
33728ba53c0SMasahiro Yamada 	return unichar < 0x110000;
33828ba53c0SMasahiro Yamada }
33928ba53c0SMasahiro Yamada 
34028ba53c0SMasahiro Yamada #define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
34128ba53c0SMasahiro Yamada 
34228ba53c0SMasahiro Yamada #define NODE 1
34328ba53c0SMasahiro Yamada #define LEAF 0
34428ba53c0SMasahiro Yamada 
34528ba53c0SMasahiro Yamada struct tree {
34628ba53c0SMasahiro Yamada 	void *root;
34728ba53c0SMasahiro Yamada 	int childnode;
34828ba53c0SMasahiro Yamada 	const char *type;
34928ba53c0SMasahiro Yamada 	unsigned int maxage;
35028ba53c0SMasahiro Yamada 	struct tree *next;
35128ba53c0SMasahiro Yamada 	int (*leaf_equal)(void *, void *);
35228ba53c0SMasahiro Yamada 	void (*leaf_print)(void *, int);
35328ba53c0SMasahiro Yamada 	int (*leaf_mark)(void *);
35428ba53c0SMasahiro Yamada 	int (*leaf_size)(void *);
35528ba53c0SMasahiro Yamada 	int *(*leaf_index)(struct tree *, void *);
35628ba53c0SMasahiro Yamada 	unsigned char *(*leaf_emit)(void *, unsigned char *);
35728ba53c0SMasahiro Yamada 	int leafindex[0x110000];
35828ba53c0SMasahiro Yamada 	int index;
35928ba53c0SMasahiro Yamada };
36028ba53c0SMasahiro Yamada 
36128ba53c0SMasahiro Yamada struct node {
36228ba53c0SMasahiro Yamada 	int index;
36328ba53c0SMasahiro Yamada 	int offset;
36428ba53c0SMasahiro Yamada 	int mark;
36528ba53c0SMasahiro Yamada 	int size;
36628ba53c0SMasahiro Yamada 	struct node *parent;
36728ba53c0SMasahiro Yamada 	void *left;
36828ba53c0SMasahiro Yamada 	void *right;
36928ba53c0SMasahiro Yamada 	unsigned char bitnum;
37028ba53c0SMasahiro Yamada 	unsigned char nextbyte;
37128ba53c0SMasahiro Yamada 	unsigned char leftnode;
37228ba53c0SMasahiro Yamada 	unsigned char rightnode;
37328ba53c0SMasahiro Yamada 	unsigned int keybits;
37428ba53c0SMasahiro Yamada 	unsigned int keymask;
37528ba53c0SMasahiro Yamada };
37628ba53c0SMasahiro Yamada 
37728ba53c0SMasahiro Yamada /*
37828ba53c0SMasahiro Yamada  * Example lookup function for a tree.
37928ba53c0SMasahiro Yamada  */
lookup(struct tree * tree,const char * key)38028ba53c0SMasahiro Yamada static void *lookup(struct tree *tree, const char *key)
38128ba53c0SMasahiro Yamada {
38228ba53c0SMasahiro Yamada 	struct node *node;
38328ba53c0SMasahiro Yamada 	void *leaf = NULL;
38428ba53c0SMasahiro Yamada 
38528ba53c0SMasahiro Yamada 	node = tree->root;
38628ba53c0SMasahiro Yamada 	while (!leaf && node) {
38728ba53c0SMasahiro Yamada 		if (node->nextbyte)
38828ba53c0SMasahiro Yamada 			key++;
38928ba53c0SMasahiro Yamada 		if (*key & (1 << (node->bitnum & 7))) {
39028ba53c0SMasahiro Yamada 			/* Right leg */
39128ba53c0SMasahiro Yamada 			if (node->rightnode == NODE) {
39228ba53c0SMasahiro Yamada 				node = node->right;
39328ba53c0SMasahiro Yamada 			} else if (node->rightnode == LEAF) {
39428ba53c0SMasahiro Yamada 				leaf = node->right;
39528ba53c0SMasahiro Yamada 			} else {
39628ba53c0SMasahiro Yamada 				node = NULL;
39728ba53c0SMasahiro Yamada 			}
39828ba53c0SMasahiro Yamada 		} else {
39928ba53c0SMasahiro Yamada 			/* Left leg */
40028ba53c0SMasahiro Yamada 			if (node->leftnode == NODE) {
40128ba53c0SMasahiro Yamada 				node = node->left;
40228ba53c0SMasahiro Yamada 			} else if (node->leftnode == LEAF) {
40328ba53c0SMasahiro Yamada 				leaf = node->left;
40428ba53c0SMasahiro Yamada 			} else {
40528ba53c0SMasahiro Yamada 				node = NULL;
40628ba53c0SMasahiro Yamada 			}
40728ba53c0SMasahiro Yamada 		}
40828ba53c0SMasahiro Yamada 	}
40928ba53c0SMasahiro Yamada 
41028ba53c0SMasahiro Yamada 	return leaf;
41128ba53c0SMasahiro Yamada }
41228ba53c0SMasahiro Yamada 
41328ba53c0SMasahiro Yamada /*
41428ba53c0SMasahiro Yamada  * A simple non-recursive tree walker: keep track of visits to the
41528ba53c0SMasahiro Yamada  * left and right branches in the leftmask and rightmask.
41628ba53c0SMasahiro Yamada  */
tree_walk(struct tree * tree)41728ba53c0SMasahiro Yamada static void tree_walk(struct tree *tree)
41828ba53c0SMasahiro Yamada {
41928ba53c0SMasahiro Yamada 	struct node *node;
42028ba53c0SMasahiro Yamada 	unsigned int leftmask;
42128ba53c0SMasahiro Yamada 	unsigned int rightmask;
42228ba53c0SMasahiro Yamada 	unsigned int bitmask;
42328ba53c0SMasahiro Yamada 	int indent = 1;
42428ba53c0SMasahiro Yamada 	int nodes, singletons, leaves;
42528ba53c0SMasahiro Yamada 
42628ba53c0SMasahiro Yamada 	nodes = singletons = leaves = 0;
42728ba53c0SMasahiro Yamada 
42828ba53c0SMasahiro Yamada 	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
42928ba53c0SMasahiro Yamada 	if (tree->childnode == LEAF) {
43028ba53c0SMasahiro Yamada 		assert(tree->root);
43128ba53c0SMasahiro Yamada 		tree->leaf_print(tree->root, indent);
43228ba53c0SMasahiro Yamada 		leaves = 1;
43328ba53c0SMasahiro Yamada 	} else {
43428ba53c0SMasahiro Yamada 		assert(tree->childnode == NODE);
43528ba53c0SMasahiro Yamada 		node = tree->root;
43628ba53c0SMasahiro Yamada 		leftmask = rightmask = 0;
43728ba53c0SMasahiro Yamada 		while (node) {
43828ba53c0SMasahiro Yamada 			printf("%*snode @ %p bitnum %d nextbyte %d"
43928ba53c0SMasahiro Yamada 			       " left %p right %p mask %x bits %x\n",
44028ba53c0SMasahiro Yamada 				indent, "", node,
44128ba53c0SMasahiro Yamada 				node->bitnum, node->nextbyte,
44228ba53c0SMasahiro Yamada 				node->left, node->right,
44328ba53c0SMasahiro Yamada 				node->keymask, node->keybits);
44428ba53c0SMasahiro Yamada 			nodes += 1;
44528ba53c0SMasahiro Yamada 			if (!(node->left && node->right))
44628ba53c0SMasahiro Yamada 				singletons += 1;
44728ba53c0SMasahiro Yamada 
44828ba53c0SMasahiro Yamada 			while (node) {
44928ba53c0SMasahiro Yamada 				bitmask = 1 << node->bitnum;
45028ba53c0SMasahiro Yamada 				if ((leftmask & bitmask) == 0) {
45128ba53c0SMasahiro Yamada 					leftmask |= bitmask;
45228ba53c0SMasahiro Yamada 					if (node->leftnode == LEAF) {
45328ba53c0SMasahiro Yamada 						assert(node->left);
45428ba53c0SMasahiro Yamada 						tree->leaf_print(node->left,
45528ba53c0SMasahiro Yamada 								 indent+1);
45628ba53c0SMasahiro Yamada 						leaves += 1;
45728ba53c0SMasahiro Yamada 					} else if (node->left) {
45828ba53c0SMasahiro Yamada 						assert(node->leftnode == NODE);
45928ba53c0SMasahiro Yamada 						indent += 1;
46028ba53c0SMasahiro Yamada 						node = node->left;
46128ba53c0SMasahiro Yamada 						break;
46228ba53c0SMasahiro Yamada 					}
46328ba53c0SMasahiro Yamada 				}
46428ba53c0SMasahiro Yamada 				if ((rightmask & bitmask) == 0) {
46528ba53c0SMasahiro Yamada 					rightmask |= bitmask;
46628ba53c0SMasahiro Yamada 					if (node->rightnode == LEAF) {
46728ba53c0SMasahiro Yamada 						assert(node->right);
46828ba53c0SMasahiro Yamada 						tree->leaf_print(node->right,
46928ba53c0SMasahiro Yamada 								 indent+1);
47028ba53c0SMasahiro Yamada 						leaves += 1;
47128ba53c0SMasahiro Yamada 					} else if (node->right) {
47228ba53c0SMasahiro Yamada 						assert(node->rightnode == NODE);
47328ba53c0SMasahiro Yamada 						indent += 1;
47428ba53c0SMasahiro Yamada 						node = node->right;
47528ba53c0SMasahiro Yamada 						break;
47628ba53c0SMasahiro Yamada 					}
47728ba53c0SMasahiro Yamada 				}
47828ba53c0SMasahiro Yamada 				leftmask &= ~bitmask;
47928ba53c0SMasahiro Yamada 				rightmask &= ~bitmask;
48028ba53c0SMasahiro Yamada 				node = node->parent;
48128ba53c0SMasahiro Yamada 				indent -= 1;
48228ba53c0SMasahiro Yamada 			}
48328ba53c0SMasahiro Yamada 		}
48428ba53c0SMasahiro Yamada 	}
48528ba53c0SMasahiro Yamada 	printf("nodes %d leaves %d singletons %d\n",
48628ba53c0SMasahiro Yamada 	       nodes, leaves, singletons);
48728ba53c0SMasahiro Yamada }
48828ba53c0SMasahiro Yamada 
48928ba53c0SMasahiro Yamada /*
49028ba53c0SMasahiro Yamada  * Allocate an initialize a new internal node.
49128ba53c0SMasahiro Yamada  */
alloc_node(struct node * parent)49228ba53c0SMasahiro Yamada static struct node *alloc_node(struct node *parent)
49328ba53c0SMasahiro Yamada {
49428ba53c0SMasahiro Yamada 	struct node *node;
49528ba53c0SMasahiro Yamada 	int bitnum;
49628ba53c0SMasahiro Yamada 
49728ba53c0SMasahiro Yamada 	node = malloc(sizeof(*node));
49828ba53c0SMasahiro Yamada 	node->left = node->right = NULL;
49928ba53c0SMasahiro Yamada 	node->parent = parent;
50028ba53c0SMasahiro Yamada 	node->leftnode = NODE;
50128ba53c0SMasahiro Yamada 	node->rightnode = NODE;
50228ba53c0SMasahiro Yamada 	node->keybits = 0;
50328ba53c0SMasahiro Yamada 	node->keymask = 0;
50428ba53c0SMasahiro Yamada 	node->mark = 0;
50528ba53c0SMasahiro Yamada 	node->index = 0;
50628ba53c0SMasahiro Yamada 	node->offset = -1;
50728ba53c0SMasahiro Yamada 	node->size = 4;
50828ba53c0SMasahiro Yamada 
50928ba53c0SMasahiro Yamada 	if (node->parent) {
51028ba53c0SMasahiro Yamada 		bitnum = parent->bitnum;
51128ba53c0SMasahiro Yamada 		if ((bitnum & 7) == 0) {
51228ba53c0SMasahiro Yamada 			node->bitnum = bitnum + 7 + 8;
51328ba53c0SMasahiro Yamada 			node->nextbyte = 1;
51428ba53c0SMasahiro Yamada 		} else {
51528ba53c0SMasahiro Yamada 			node->bitnum = bitnum - 1;
51628ba53c0SMasahiro Yamada 			node->nextbyte = 0;
51728ba53c0SMasahiro Yamada 		}
51828ba53c0SMasahiro Yamada 	} else {
51928ba53c0SMasahiro Yamada 		node->bitnum = 7;
52028ba53c0SMasahiro Yamada 		node->nextbyte = 0;
52128ba53c0SMasahiro Yamada 	}
52228ba53c0SMasahiro Yamada 
52328ba53c0SMasahiro Yamada 	return node;
52428ba53c0SMasahiro Yamada }
52528ba53c0SMasahiro Yamada 
52628ba53c0SMasahiro Yamada /*
52728ba53c0SMasahiro Yamada  * Insert a new leaf into the tree, and collapse any subtrees that are
52828ba53c0SMasahiro Yamada  * fully populated and end in identical leaves. A nextbyte tagged
52928ba53c0SMasahiro Yamada  * internal node will not be removed to preserve the tree's integrity.
53028ba53c0SMasahiro Yamada  * Note that due to the structure of utf8, no nextbyte tagged node
53128ba53c0SMasahiro Yamada  * will be a candidate for removal.
53228ba53c0SMasahiro Yamada  */
insert(struct tree * tree,char * key,int keylen,void * leaf)53328ba53c0SMasahiro Yamada static int insert(struct tree *tree, char *key, int keylen, void *leaf)
53428ba53c0SMasahiro Yamada {
53528ba53c0SMasahiro Yamada 	struct node *node;
53628ba53c0SMasahiro Yamada 	struct node *parent;
53728ba53c0SMasahiro Yamada 	void **cursor;
53828ba53c0SMasahiro Yamada 	int keybits;
53928ba53c0SMasahiro Yamada 
54028ba53c0SMasahiro Yamada 	assert(keylen >= 1 && keylen <= 4);
54128ba53c0SMasahiro Yamada 
54228ba53c0SMasahiro Yamada 	node = NULL;
54328ba53c0SMasahiro Yamada 	cursor = &tree->root;
54428ba53c0SMasahiro Yamada 	keybits = 8 * keylen;
54528ba53c0SMasahiro Yamada 
54628ba53c0SMasahiro Yamada 	/* Insert, creating path along the way. */
54728ba53c0SMasahiro Yamada 	while (keybits) {
54828ba53c0SMasahiro Yamada 		if (!*cursor)
54928ba53c0SMasahiro Yamada 			*cursor = alloc_node(node);
55028ba53c0SMasahiro Yamada 		node = *cursor;
55128ba53c0SMasahiro Yamada 		if (node->nextbyte)
55228ba53c0SMasahiro Yamada 			key++;
55328ba53c0SMasahiro Yamada 		if (*key & (1 << (node->bitnum & 7)))
55428ba53c0SMasahiro Yamada 			cursor = &node->right;
55528ba53c0SMasahiro Yamada 		else
55628ba53c0SMasahiro Yamada 			cursor = &node->left;
55728ba53c0SMasahiro Yamada 		keybits--;
55828ba53c0SMasahiro Yamada 	}
55928ba53c0SMasahiro Yamada 	*cursor = leaf;
56028ba53c0SMasahiro Yamada 
56128ba53c0SMasahiro Yamada 	/* Merge subtrees if possible. */
56228ba53c0SMasahiro Yamada 	while (node) {
56328ba53c0SMasahiro Yamada 		if (*key & (1 << (node->bitnum & 7)))
56428ba53c0SMasahiro Yamada 			node->rightnode = LEAF;
56528ba53c0SMasahiro Yamada 		else
56628ba53c0SMasahiro Yamada 			node->leftnode = LEAF;
56728ba53c0SMasahiro Yamada 		if (node->nextbyte)
56828ba53c0SMasahiro Yamada 			break;
56928ba53c0SMasahiro Yamada 		if (node->leftnode == NODE || node->rightnode == NODE)
57028ba53c0SMasahiro Yamada 			break;
57128ba53c0SMasahiro Yamada 		assert(node->left);
57228ba53c0SMasahiro Yamada 		assert(node->right);
57328ba53c0SMasahiro Yamada 		/* Compare */
57428ba53c0SMasahiro Yamada 		if (! tree->leaf_equal(node->left, node->right))
57528ba53c0SMasahiro Yamada 			break;
57628ba53c0SMasahiro Yamada 		/* Keep left, drop right leaf. */
57728ba53c0SMasahiro Yamada 		leaf = node->left;
57828ba53c0SMasahiro Yamada 		/* Check in parent */
57928ba53c0SMasahiro Yamada 		parent = node->parent;
58028ba53c0SMasahiro Yamada 		if (!parent) {
58128ba53c0SMasahiro Yamada 			/* root of tree! */
58228ba53c0SMasahiro Yamada 			tree->root = leaf;
58328ba53c0SMasahiro Yamada 			tree->childnode = LEAF;
58428ba53c0SMasahiro Yamada 		} else if (parent->left == node) {
58528ba53c0SMasahiro Yamada 			parent->left = leaf;
58628ba53c0SMasahiro Yamada 			parent->leftnode = LEAF;
58728ba53c0SMasahiro Yamada 			if (parent->right) {
58828ba53c0SMasahiro Yamada 				parent->keymask = 0;
58928ba53c0SMasahiro Yamada 				parent->keybits = 0;
59028ba53c0SMasahiro Yamada 			} else {
59128ba53c0SMasahiro Yamada 				parent->keymask |= (1 << node->bitnum);
59228ba53c0SMasahiro Yamada 			}
59328ba53c0SMasahiro Yamada 		} else if (parent->right == node) {
59428ba53c0SMasahiro Yamada 			parent->right = leaf;
59528ba53c0SMasahiro Yamada 			parent->rightnode = LEAF;
59628ba53c0SMasahiro Yamada 			if (parent->left) {
59728ba53c0SMasahiro Yamada 				parent->keymask = 0;
59828ba53c0SMasahiro Yamada 				parent->keybits = 0;
59928ba53c0SMasahiro Yamada 			} else {
60028ba53c0SMasahiro Yamada 				parent->keymask |= (1 << node->bitnum);
60128ba53c0SMasahiro Yamada 				parent->keybits |= (1 << node->bitnum);
60228ba53c0SMasahiro Yamada 			}
60328ba53c0SMasahiro Yamada 		} else {
60428ba53c0SMasahiro Yamada 			/* internal tree error */
60528ba53c0SMasahiro Yamada 			assert(0);
60628ba53c0SMasahiro Yamada 		}
60728ba53c0SMasahiro Yamada 		free(node);
60828ba53c0SMasahiro Yamada 		node = parent;
60928ba53c0SMasahiro Yamada 	}
61028ba53c0SMasahiro Yamada 
61128ba53c0SMasahiro Yamada 	/* Propagate keymasks up along singleton chains. */
61228ba53c0SMasahiro Yamada 	while (node) {
61328ba53c0SMasahiro Yamada 		parent = node->parent;
61428ba53c0SMasahiro Yamada 		if (!parent)
61528ba53c0SMasahiro Yamada 			break;
61628ba53c0SMasahiro Yamada 		/* Nix the mask for parents with two children. */
61728ba53c0SMasahiro Yamada 		if (node->keymask == 0) {
61828ba53c0SMasahiro Yamada 			parent->keymask = 0;
61928ba53c0SMasahiro Yamada 			parent->keybits = 0;
62028ba53c0SMasahiro Yamada 		} else if (parent->left && parent->right) {
62128ba53c0SMasahiro Yamada 			parent->keymask = 0;
62228ba53c0SMasahiro Yamada 			parent->keybits = 0;
62328ba53c0SMasahiro Yamada 		} else {
62428ba53c0SMasahiro Yamada 			assert((parent->keymask & node->keymask) == 0);
62528ba53c0SMasahiro Yamada 			parent->keymask |= node->keymask;
62628ba53c0SMasahiro Yamada 			parent->keymask |= (1 << parent->bitnum);
62728ba53c0SMasahiro Yamada 			parent->keybits |= node->keybits;
62828ba53c0SMasahiro Yamada 			if (parent->right)
62928ba53c0SMasahiro Yamada 				parent->keybits |= (1 << parent->bitnum);
63028ba53c0SMasahiro Yamada 		}
63128ba53c0SMasahiro Yamada 		node = parent;
63228ba53c0SMasahiro Yamada 	}
63328ba53c0SMasahiro Yamada 
63428ba53c0SMasahiro Yamada 	return 0;
63528ba53c0SMasahiro Yamada }
63628ba53c0SMasahiro Yamada 
63728ba53c0SMasahiro Yamada /*
63828ba53c0SMasahiro Yamada  * Prune internal nodes.
63928ba53c0SMasahiro Yamada  *
64028ba53c0SMasahiro Yamada  * Fully populated subtrees that end at the same leaf have already
64128ba53c0SMasahiro Yamada  * been collapsed.  There are still internal nodes that have for both
64228ba53c0SMasahiro Yamada  * their left and right branches a sequence of singletons that make
64328ba53c0SMasahiro Yamada  * identical choices and end in identical leaves.  The keymask and
64428ba53c0SMasahiro Yamada  * keybits collected in the nodes describe the choices made in these
64528ba53c0SMasahiro Yamada  * singleton chains.  When they are identical for the left and right
64628ba53c0SMasahiro Yamada  * branch of a node, and the two leaves comare identical, the node in
64728ba53c0SMasahiro Yamada  * question can be removed.
64828ba53c0SMasahiro Yamada  *
64928ba53c0SMasahiro Yamada  * Note that nodes with the nextbyte tag set will not be removed by
65028ba53c0SMasahiro Yamada  * this to ensure tree integrity.  Note as well that the structure of
65128ba53c0SMasahiro Yamada  * utf8 ensures that these nodes would not have been candidates for
65228ba53c0SMasahiro Yamada  * removal in any case.
65328ba53c0SMasahiro Yamada  */
prune(struct tree * tree)65428ba53c0SMasahiro Yamada static void prune(struct tree *tree)
65528ba53c0SMasahiro Yamada {
65628ba53c0SMasahiro Yamada 	struct node *node;
65728ba53c0SMasahiro Yamada 	struct node *left;
65828ba53c0SMasahiro Yamada 	struct node *right;
65928ba53c0SMasahiro Yamada 	struct node *parent;
66028ba53c0SMasahiro Yamada 	void *leftleaf;
66128ba53c0SMasahiro Yamada 	void *rightleaf;
66228ba53c0SMasahiro Yamada 	unsigned int leftmask;
66328ba53c0SMasahiro Yamada 	unsigned int rightmask;
66428ba53c0SMasahiro Yamada 	unsigned int bitmask;
66528ba53c0SMasahiro Yamada 	int count;
66628ba53c0SMasahiro Yamada 
66728ba53c0SMasahiro Yamada 	if (verbose > 0)
66828ba53c0SMasahiro Yamada 		printf("Pruning %s_%x\n", tree->type, tree->maxage);
66928ba53c0SMasahiro Yamada 
67028ba53c0SMasahiro Yamada 	count = 0;
67128ba53c0SMasahiro Yamada 	if (tree->childnode == LEAF)
67228ba53c0SMasahiro Yamada 		return;
67328ba53c0SMasahiro Yamada 	if (!tree->root)
67428ba53c0SMasahiro Yamada 		return;
67528ba53c0SMasahiro Yamada 
67628ba53c0SMasahiro Yamada 	leftmask = rightmask = 0;
67728ba53c0SMasahiro Yamada 	node = tree->root;
67828ba53c0SMasahiro Yamada 	while (node) {
67928ba53c0SMasahiro Yamada 		if (node->nextbyte)
68028ba53c0SMasahiro Yamada 			goto advance;
68128ba53c0SMasahiro Yamada 		if (node->leftnode == LEAF)
68228ba53c0SMasahiro Yamada 			goto advance;
68328ba53c0SMasahiro Yamada 		if (node->rightnode == LEAF)
68428ba53c0SMasahiro Yamada 			goto advance;
68528ba53c0SMasahiro Yamada 		if (!node->left)
68628ba53c0SMasahiro Yamada 			goto advance;
68728ba53c0SMasahiro Yamada 		if (!node->right)
68828ba53c0SMasahiro Yamada 			goto advance;
68928ba53c0SMasahiro Yamada 		left = node->left;
69028ba53c0SMasahiro Yamada 		right = node->right;
69128ba53c0SMasahiro Yamada 		if (left->keymask == 0)
69228ba53c0SMasahiro Yamada 			goto advance;
69328ba53c0SMasahiro Yamada 		if (right->keymask == 0)
69428ba53c0SMasahiro Yamada 			goto advance;
69528ba53c0SMasahiro Yamada 		if (left->keymask != right->keymask)
69628ba53c0SMasahiro Yamada 			goto advance;
69728ba53c0SMasahiro Yamada 		if (left->keybits != right->keybits)
69828ba53c0SMasahiro Yamada 			goto advance;
69928ba53c0SMasahiro Yamada 		leftleaf = NULL;
70028ba53c0SMasahiro Yamada 		while (!leftleaf) {
70128ba53c0SMasahiro Yamada 			assert(left->left || left->right);
70228ba53c0SMasahiro Yamada 			if (left->leftnode == LEAF)
70328ba53c0SMasahiro Yamada 				leftleaf = left->left;
70428ba53c0SMasahiro Yamada 			else if (left->rightnode == LEAF)
70528ba53c0SMasahiro Yamada 				leftleaf = left->right;
70628ba53c0SMasahiro Yamada 			else if (left->left)
70728ba53c0SMasahiro Yamada 				left = left->left;
70828ba53c0SMasahiro Yamada 			else if (left->right)
70928ba53c0SMasahiro Yamada 				left = left->right;
71028ba53c0SMasahiro Yamada 			else
71128ba53c0SMasahiro Yamada 				assert(0);
71228ba53c0SMasahiro Yamada 		}
71328ba53c0SMasahiro Yamada 		rightleaf = NULL;
71428ba53c0SMasahiro Yamada 		while (!rightleaf) {
71528ba53c0SMasahiro Yamada 			assert(right->left || right->right);
71628ba53c0SMasahiro Yamada 			if (right->leftnode == LEAF)
71728ba53c0SMasahiro Yamada 				rightleaf = right->left;
71828ba53c0SMasahiro Yamada 			else if (right->rightnode == LEAF)
71928ba53c0SMasahiro Yamada 				rightleaf = right->right;
72028ba53c0SMasahiro Yamada 			else if (right->left)
72128ba53c0SMasahiro Yamada 				right = right->left;
72228ba53c0SMasahiro Yamada 			else if (right->right)
72328ba53c0SMasahiro Yamada 				right = right->right;
72428ba53c0SMasahiro Yamada 			else
72528ba53c0SMasahiro Yamada 				assert(0);
72628ba53c0SMasahiro Yamada 		}
72728ba53c0SMasahiro Yamada 		if (! tree->leaf_equal(leftleaf, rightleaf))
72828ba53c0SMasahiro Yamada 			goto advance;
72928ba53c0SMasahiro Yamada 		/*
73028ba53c0SMasahiro Yamada 		 * This node has identical singleton-only subtrees.
73128ba53c0SMasahiro Yamada 		 * Remove it.
73228ba53c0SMasahiro Yamada 		 */
73328ba53c0SMasahiro Yamada 		parent = node->parent;
73428ba53c0SMasahiro Yamada 		left = node->left;
73528ba53c0SMasahiro Yamada 		right = node->right;
73628ba53c0SMasahiro Yamada 		if (parent->left == node)
73728ba53c0SMasahiro Yamada 			parent->left = left;
73828ba53c0SMasahiro Yamada 		else if (parent->right == node)
73928ba53c0SMasahiro Yamada 			parent->right = left;
74028ba53c0SMasahiro Yamada 		else
74128ba53c0SMasahiro Yamada 			assert(0);
74228ba53c0SMasahiro Yamada 		left->parent = parent;
74328ba53c0SMasahiro Yamada 		left->keymask |= (1 << node->bitnum);
74428ba53c0SMasahiro Yamada 		node->left = NULL;
74528ba53c0SMasahiro Yamada 		while (node) {
74628ba53c0SMasahiro Yamada 			bitmask = 1 << node->bitnum;
74728ba53c0SMasahiro Yamada 			leftmask &= ~bitmask;
74828ba53c0SMasahiro Yamada 			rightmask &= ~bitmask;
74928ba53c0SMasahiro Yamada 			if (node->leftnode == NODE && node->left) {
75028ba53c0SMasahiro Yamada 				left = node->left;
75128ba53c0SMasahiro Yamada 				free(node);
75228ba53c0SMasahiro Yamada 				count++;
75328ba53c0SMasahiro Yamada 				node = left;
75428ba53c0SMasahiro Yamada 			} else if (node->rightnode == NODE && node->right) {
75528ba53c0SMasahiro Yamada 				right = node->right;
75628ba53c0SMasahiro Yamada 				free(node);
75728ba53c0SMasahiro Yamada 				count++;
75828ba53c0SMasahiro Yamada 				node = right;
75928ba53c0SMasahiro Yamada 			} else {
76028ba53c0SMasahiro Yamada 				node = NULL;
76128ba53c0SMasahiro Yamada 			}
76228ba53c0SMasahiro Yamada 		}
76328ba53c0SMasahiro Yamada 		/* Propagate keymasks up along singleton chains. */
76428ba53c0SMasahiro Yamada 		node = parent;
76528ba53c0SMasahiro Yamada 		/* Force re-check */
76628ba53c0SMasahiro Yamada 		bitmask = 1 << node->bitnum;
76728ba53c0SMasahiro Yamada 		leftmask &= ~bitmask;
76828ba53c0SMasahiro Yamada 		rightmask &= ~bitmask;
76928ba53c0SMasahiro Yamada 		for (;;) {
77028ba53c0SMasahiro Yamada 			if (node->left && node->right)
77128ba53c0SMasahiro Yamada 				break;
77228ba53c0SMasahiro Yamada 			if (node->left) {
77328ba53c0SMasahiro Yamada 				left = node->left;
77428ba53c0SMasahiro Yamada 				node->keymask |= left->keymask;
77528ba53c0SMasahiro Yamada 				node->keybits |= left->keybits;
77628ba53c0SMasahiro Yamada 			}
77728ba53c0SMasahiro Yamada 			if (node->right) {
77828ba53c0SMasahiro Yamada 				right = node->right;
77928ba53c0SMasahiro Yamada 				node->keymask |= right->keymask;
78028ba53c0SMasahiro Yamada 				node->keybits |= right->keybits;
78128ba53c0SMasahiro Yamada 			}
78228ba53c0SMasahiro Yamada 			node->keymask |= (1 << node->bitnum);
78328ba53c0SMasahiro Yamada 			node = node->parent;
78428ba53c0SMasahiro Yamada 			/* Force re-check */
78528ba53c0SMasahiro Yamada 			bitmask = 1 << node->bitnum;
78628ba53c0SMasahiro Yamada 			leftmask &= ~bitmask;
78728ba53c0SMasahiro Yamada 			rightmask &= ~bitmask;
78828ba53c0SMasahiro Yamada 		}
78928ba53c0SMasahiro Yamada 	advance:
79028ba53c0SMasahiro Yamada 		bitmask = 1 << node->bitnum;
79128ba53c0SMasahiro Yamada 		if ((leftmask & bitmask) == 0 &&
79228ba53c0SMasahiro Yamada 		    node->leftnode == NODE &&
79328ba53c0SMasahiro Yamada 		    node->left) {
79428ba53c0SMasahiro Yamada 			leftmask |= bitmask;
79528ba53c0SMasahiro Yamada 			node = node->left;
79628ba53c0SMasahiro Yamada 		} else if ((rightmask & bitmask) == 0 &&
79728ba53c0SMasahiro Yamada 			   node->rightnode == NODE &&
79828ba53c0SMasahiro Yamada 			   node->right) {
79928ba53c0SMasahiro Yamada 			rightmask |= bitmask;
80028ba53c0SMasahiro Yamada 			node = node->right;
80128ba53c0SMasahiro Yamada 		} else {
80228ba53c0SMasahiro Yamada 			leftmask &= ~bitmask;
80328ba53c0SMasahiro Yamada 			rightmask &= ~bitmask;
80428ba53c0SMasahiro Yamada 			node = node->parent;
80528ba53c0SMasahiro Yamada 		}
80628ba53c0SMasahiro Yamada 	}
80728ba53c0SMasahiro Yamada 	if (verbose > 0)
80828ba53c0SMasahiro Yamada 		printf("Pruned %d nodes\n", count);
80928ba53c0SMasahiro Yamada }
81028ba53c0SMasahiro Yamada 
81128ba53c0SMasahiro Yamada /*
81228ba53c0SMasahiro Yamada  * Mark the nodes in the tree that lead to leaves that must be
81328ba53c0SMasahiro Yamada  * emitted.
81428ba53c0SMasahiro Yamada  */
mark_nodes(struct tree * tree)81528ba53c0SMasahiro Yamada static void mark_nodes(struct tree *tree)
81628ba53c0SMasahiro Yamada {
81728ba53c0SMasahiro Yamada 	struct node *node;
81828ba53c0SMasahiro Yamada 	struct node *n;
81928ba53c0SMasahiro Yamada 	unsigned int leftmask;
82028ba53c0SMasahiro Yamada 	unsigned int rightmask;
82128ba53c0SMasahiro Yamada 	unsigned int bitmask;
82228ba53c0SMasahiro Yamada 	int marked;
82328ba53c0SMasahiro Yamada 
82428ba53c0SMasahiro Yamada 	marked = 0;
82528ba53c0SMasahiro Yamada 	if (verbose > 0)
82628ba53c0SMasahiro Yamada 		printf("Marking %s_%x\n", tree->type, tree->maxage);
82728ba53c0SMasahiro Yamada 	if (tree->childnode == LEAF)
82828ba53c0SMasahiro Yamada 		goto done;
82928ba53c0SMasahiro Yamada 
83028ba53c0SMasahiro Yamada 	assert(tree->childnode == NODE);
83128ba53c0SMasahiro Yamada 	node = tree->root;
83228ba53c0SMasahiro Yamada 	leftmask = rightmask = 0;
83328ba53c0SMasahiro Yamada 	while (node) {
83428ba53c0SMasahiro Yamada 		bitmask = 1 << node->bitnum;
83528ba53c0SMasahiro Yamada 		if ((leftmask & bitmask) == 0) {
83628ba53c0SMasahiro Yamada 			leftmask |= bitmask;
83728ba53c0SMasahiro Yamada 			if (node->leftnode == LEAF) {
83828ba53c0SMasahiro Yamada 				assert(node->left);
83928ba53c0SMasahiro Yamada 				if (tree->leaf_mark(node->left)) {
84028ba53c0SMasahiro Yamada 					n = node;
84128ba53c0SMasahiro Yamada 					while (n && !n->mark) {
84228ba53c0SMasahiro Yamada 						marked++;
84328ba53c0SMasahiro Yamada 						n->mark = 1;
84428ba53c0SMasahiro Yamada 						n = n->parent;
84528ba53c0SMasahiro Yamada 					}
84628ba53c0SMasahiro Yamada 				}
84728ba53c0SMasahiro Yamada 			} else if (node->left) {
84828ba53c0SMasahiro Yamada 				assert(node->leftnode == NODE);
84928ba53c0SMasahiro Yamada 				node = node->left;
85028ba53c0SMasahiro Yamada 				continue;
85128ba53c0SMasahiro Yamada 			}
85228ba53c0SMasahiro Yamada 		}
85328ba53c0SMasahiro Yamada 		if ((rightmask & bitmask) == 0) {
85428ba53c0SMasahiro Yamada 			rightmask |= bitmask;
85528ba53c0SMasahiro Yamada 			if (node->rightnode == LEAF) {
85628ba53c0SMasahiro Yamada 				assert(node->right);
85728ba53c0SMasahiro Yamada 				if (tree->leaf_mark(node->right)) {
85828ba53c0SMasahiro Yamada 					n = node;
85928ba53c0SMasahiro Yamada 					while (n && !n->mark) {
86028ba53c0SMasahiro Yamada 						marked++;
86128ba53c0SMasahiro Yamada 						n->mark = 1;
86228ba53c0SMasahiro Yamada 						n = n->parent;
86328ba53c0SMasahiro Yamada 					}
86428ba53c0SMasahiro Yamada 				}
86528ba53c0SMasahiro Yamada 			} else if (node->right) {
86628ba53c0SMasahiro Yamada 				assert(node->rightnode == NODE);
86728ba53c0SMasahiro Yamada 				node = node->right;
86828ba53c0SMasahiro Yamada 				continue;
86928ba53c0SMasahiro Yamada 			}
87028ba53c0SMasahiro Yamada 		}
87128ba53c0SMasahiro Yamada 		leftmask &= ~bitmask;
87228ba53c0SMasahiro Yamada 		rightmask &= ~bitmask;
87328ba53c0SMasahiro Yamada 		node = node->parent;
87428ba53c0SMasahiro Yamada 	}
87528ba53c0SMasahiro Yamada 
87628ba53c0SMasahiro Yamada 	/* second pass: left siblings and singletons */
87728ba53c0SMasahiro Yamada 
87828ba53c0SMasahiro Yamada 	assert(tree->childnode == NODE);
87928ba53c0SMasahiro Yamada 	node = tree->root;
88028ba53c0SMasahiro Yamada 	leftmask = rightmask = 0;
88128ba53c0SMasahiro Yamada 	while (node) {
88228ba53c0SMasahiro Yamada 		bitmask = 1 << node->bitnum;
88328ba53c0SMasahiro Yamada 		if ((leftmask & bitmask) == 0) {
88428ba53c0SMasahiro Yamada 			leftmask |= bitmask;
88528ba53c0SMasahiro Yamada 			if (node->leftnode == LEAF) {
88628ba53c0SMasahiro Yamada 				assert(node->left);
88728ba53c0SMasahiro Yamada 				if (tree->leaf_mark(node->left)) {
88828ba53c0SMasahiro Yamada 					n = node;
88928ba53c0SMasahiro Yamada 					while (n && !n->mark) {
89028ba53c0SMasahiro Yamada 						marked++;
89128ba53c0SMasahiro Yamada 						n->mark = 1;
89228ba53c0SMasahiro Yamada 						n = n->parent;
89328ba53c0SMasahiro Yamada 					}
89428ba53c0SMasahiro Yamada 				}
89528ba53c0SMasahiro Yamada 			} else if (node->left) {
89628ba53c0SMasahiro Yamada 				assert(node->leftnode == NODE);
89728ba53c0SMasahiro Yamada 				node = node->left;
89828ba53c0SMasahiro Yamada 				if (!node->mark && node->parent->mark) {
89928ba53c0SMasahiro Yamada 					marked++;
90028ba53c0SMasahiro Yamada 					node->mark = 1;
90128ba53c0SMasahiro Yamada 				}
90228ba53c0SMasahiro Yamada 				continue;
90328ba53c0SMasahiro Yamada 			}
90428ba53c0SMasahiro Yamada 		}
90528ba53c0SMasahiro Yamada 		if ((rightmask & bitmask) == 0) {
90628ba53c0SMasahiro Yamada 			rightmask |= bitmask;
90728ba53c0SMasahiro Yamada 			if (node->rightnode == LEAF) {
90828ba53c0SMasahiro Yamada 				assert(node->right);
90928ba53c0SMasahiro Yamada 				if (tree->leaf_mark(node->right)) {
91028ba53c0SMasahiro Yamada 					n = node;
91128ba53c0SMasahiro Yamada 					while (n && !n->mark) {
91228ba53c0SMasahiro Yamada 						marked++;
91328ba53c0SMasahiro Yamada 						n->mark = 1;
91428ba53c0SMasahiro Yamada 						n = n->parent;
91528ba53c0SMasahiro Yamada 					}
91628ba53c0SMasahiro Yamada 				}
91728ba53c0SMasahiro Yamada 			} else if (node->right) {
91828ba53c0SMasahiro Yamada 				assert(node->rightnode == NODE);
91928ba53c0SMasahiro Yamada 				node = node->right;
92028ba53c0SMasahiro Yamada 				if (!node->mark && node->parent->mark &&
92128ba53c0SMasahiro Yamada 				    !node->parent->left) {
92228ba53c0SMasahiro Yamada 					marked++;
92328ba53c0SMasahiro Yamada 					node->mark = 1;
92428ba53c0SMasahiro Yamada 				}
92528ba53c0SMasahiro Yamada 				continue;
92628ba53c0SMasahiro Yamada 			}
92728ba53c0SMasahiro Yamada 		}
92828ba53c0SMasahiro Yamada 		leftmask &= ~bitmask;
92928ba53c0SMasahiro Yamada 		rightmask &= ~bitmask;
93028ba53c0SMasahiro Yamada 		node = node->parent;
93128ba53c0SMasahiro Yamada 	}
93228ba53c0SMasahiro Yamada done:
93328ba53c0SMasahiro Yamada 	if (verbose > 0)
93428ba53c0SMasahiro Yamada 		printf("Marked %d nodes\n", marked);
93528ba53c0SMasahiro Yamada }
93628ba53c0SMasahiro Yamada 
93728ba53c0SMasahiro Yamada /*
93828ba53c0SMasahiro Yamada  * Compute the index of each node and leaf, which is the offset in the
93928ba53c0SMasahiro Yamada  * emitted trie.  These values must be pre-computed because relative
94028ba53c0SMasahiro Yamada  * offsets between nodes are used to navigate the tree.
94128ba53c0SMasahiro Yamada  */
index_nodes(struct tree * tree,int index)94228ba53c0SMasahiro Yamada static int index_nodes(struct tree *tree, int index)
94328ba53c0SMasahiro Yamada {
94428ba53c0SMasahiro Yamada 	struct node *node;
94528ba53c0SMasahiro Yamada 	unsigned int leftmask;
94628ba53c0SMasahiro Yamada 	unsigned int rightmask;
94728ba53c0SMasahiro Yamada 	unsigned int bitmask;
94828ba53c0SMasahiro Yamada 	int count;
94928ba53c0SMasahiro Yamada 	int indent;
95028ba53c0SMasahiro Yamada 
95128ba53c0SMasahiro Yamada 	/* Align to a cache line (or half a cache line?). */
95228ba53c0SMasahiro Yamada 	while (index % 64)
95328ba53c0SMasahiro Yamada 		index++;
95428ba53c0SMasahiro Yamada 	tree->index = index;
95528ba53c0SMasahiro Yamada 	indent = 1;
95628ba53c0SMasahiro Yamada 	count = 0;
95728ba53c0SMasahiro Yamada 
95828ba53c0SMasahiro Yamada 	if (verbose > 0)
95928ba53c0SMasahiro Yamada 		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
96028ba53c0SMasahiro Yamada 	if (tree->childnode == LEAF) {
96128ba53c0SMasahiro Yamada 		index += tree->leaf_size(tree->root);
96228ba53c0SMasahiro Yamada 		goto done;
96328ba53c0SMasahiro Yamada 	}
96428ba53c0SMasahiro Yamada 
96528ba53c0SMasahiro Yamada 	assert(tree->childnode == NODE);
96628ba53c0SMasahiro Yamada 	node = tree->root;
96728ba53c0SMasahiro Yamada 	leftmask = rightmask = 0;
96828ba53c0SMasahiro Yamada 	while (node) {
96928ba53c0SMasahiro Yamada 		if (!node->mark)
97028ba53c0SMasahiro Yamada 			goto skip;
97128ba53c0SMasahiro Yamada 		count++;
97228ba53c0SMasahiro Yamada 		if (node->index != index)
97328ba53c0SMasahiro Yamada 			node->index = index;
97428ba53c0SMasahiro Yamada 		index += node->size;
97528ba53c0SMasahiro Yamada skip:
97628ba53c0SMasahiro Yamada 		while (node) {
97728ba53c0SMasahiro Yamada 			bitmask = 1 << node->bitnum;
97828ba53c0SMasahiro Yamada 			if (node->mark && (leftmask & bitmask) == 0) {
97928ba53c0SMasahiro Yamada 				leftmask |= bitmask;
98028ba53c0SMasahiro Yamada 				if (node->leftnode == LEAF) {
98128ba53c0SMasahiro Yamada 					assert(node->left);
98228ba53c0SMasahiro Yamada 					*tree->leaf_index(tree, node->left) =
98328ba53c0SMasahiro Yamada 									index;
98428ba53c0SMasahiro Yamada 					index += tree->leaf_size(node->left);
98528ba53c0SMasahiro Yamada 					count++;
98628ba53c0SMasahiro Yamada 				} else if (node->left) {
98728ba53c0SMasahiro Yamada 					assert(node->leftnode == NODE);
98828ba53c0SMasahiro Yamada 					indent += 1;
98928ba53c0SMasahiro Yamada 					node = node->left;
99028ba53c0SMasahiro Yamada 					break;
99128ba53c0SMasahiro Yamada 				}
99228ba53c0SMasahiro Yamada 			}
99328ba53c0SMasahiro Yamada 			if (node->mark && (rightmask & bitmask) == 0) {
99428ba53c0SMasahiro Yamada 				rightmask |= bitmask;
99528ba53c0SMasahiro Yamada 				if (node->rightnode == LEAF) {
99628ba53c0SMasahiro Yamada 					assert(node->right);
99728ba53c0SMasahiro Yamada 					*tree->leaf_index(tree, node->right) = index;
99828ba53c0SMasahiro Yamada 					index += tree->leaf_size(node->right);
99928ba53c0SMasahiro Yamada 					count++;
100028ba53c0SMasahiro Yamada 				} else if (node->right) {
100128ba53c0SMasahiro Yamada 					assert(node->rightnode == NODE);
100228ba53c0SMasahiro Yamada 					indent += 1;
100328ba53c0SMasahiro Yamada 					node = node->right;
100428ba53c0SMasahiro Yamada 					break;
100528ba53c0SMasahiro Yamada 				}
100628ba53c0SMasahiro Yamada 			}
100728ba53c0SMasahiro Yamada 			leftmask &= ~bitmask;
100828ba53c0SMasahiro Yamada 			rightmask &= ~bitmask;
100928ba53c0SMasahiro Yamada 			node = node->parent;
101028ba53c0SMasahiro Yamada 			indent -= 1;
101128ba53c0SMasahiro Yamada 		}
101228ba53c0SMasahiro Yamada 	}
101328ba53c0SMasahiro Yamada done:
101428ba53c0SMasahiro Yamada 	/* Round up to a multiple of 16 */
101528ba53c0SMasahiro Yamada 	while (index % 16)
101628ba53c0SMasahiro Yamada 		index++;
101728ba53c0SMasahiro Yamada 	if (verbose > 0)
101828ba53c0SMasahiro Yamada 		printf("Final index %d\n", index);
101928ba53c0SMasahiro Yamada 	return index;
102028ba53c0SMasahiro Yamada }
102128ba53c0SMasahiro Yamada 
102228ba53c0SMasahiro Yamada /*
102328ba53c0SMasahiro Yamada  * Mark the nodes in a subtree, helper for size_nodes().
102428ba53c0SMasahiro Yamada  */
mark_subtree(struct node * node)102528ba53c0SMasahiro Yamada static int mark_subtree(struct node *node)
102628ba53c0SMasahiro Yamada {
102728ba53c0SMasahiro Yamada 	int changed;
102828ba53c0SMasahiro Yamada 
102928ba53c0SMasahiro Yamada 	if (!node || node->mark)
103028ba53c0SMasahiro Yamada 		return 0;
103128ba53c0SMasahiro Yamada 	node->mark = 1;
103228ba53c0SMasahiro Yamada 	node->index = node->parent->index;
103328ba53c0SMasahiro Yamada 	changed = 1;
103428ba53c0SMasahiro Yamada 	if (node->leftnode == NODE)
103528ba53c0SMasahiro Yamada 		changed += mark_subtree(node->left);
103628ba53c0SMasahiro Yamada 	if (node->rightnode == NODE)
103728ba53c0SMasahiro Yamada 		changed += mark_subtree(node->right);
103828ba53c0SMasahiro Yamada 	return changed;
103928ba53c0SMasahiro Yamada }
104028ba53c0SMasahiro Yamada 
104128ba53c0SMasahiro Yamada /*
104228ba53c0SMasahiro Yamada  * Compute the size of nodes and leaves. We start by assuming that
104328ba53c0SMasahiro Yamada  * each node needs to store a three-byte offset. The indexes of the
104428ba53c0SMasahiro Yamada  * nodes are calculated based on that, and then this function is
104528ba53c0SMasahiro Yamada  * called to see if the sizes of some nodes can be reduced.  This is
104628ba53c0SMasahiro Yamada  * repeated until no more changes are seen.
104728ba53c0SMasahiro Yamada  */
size_nodes(struct tree * tree)104828ba53c0SMasahiro Yamada static int size_nodes(struct tree *tree)
104928ba53c0SMasahiro Yamada {
105028ba53c0SMasahiro Yamada 	struct tree *next;
105128ba53c0SMasahiro Yamada 	struct node *node;
105228ba53c0SMasahiro Yamada 	struct node *right;
105328ba53c0SMasahiro Yamada 	struct node *n;
105428ba53c0SMasahiro Yamada 	unsigned int leftmask;
105528ba53c0SMasahiro Yamada 	unsigned int rightmask;
105628ba53c0SMasahiro Yamada 	unsigned int bitmask;
105728ba53c0SMasahiro Yamada 	unsigned int pathbits;
105828ba53c0SMasahiro Yamada 	unsigned int pathmask;
105928ba53c0SMasahiro Yamada 	unsigned int nbit;
106028ba53c0SMasahiro Yamada 	int changed;
106128ba53c0SMasahiro Yamada 	int offset;
106228ba53c0SMasahiro Yamada 	int size;
106328ba53c0SMasahiro Yamada 	int indent;
106428ba53c0SMasahiro Yamada 
106528ba53c0SMasahiro Yamada 	indent = 1;
106628ba53c0SMasahiro Yamada 	changed = 0;
106728ba53c0SMasahiro Yamada 	size = 0;
106828ba53c0SMasahiro Yamada 
106928ba53c0SMasahiro Yamada 	if (verbose > 0)
107028ba53c0SMasahiro Yamada 		printf("Sizing %s_%x\n", tree->type, tree->maxage);
107128ba53c0SMasahiro Yamada 	if (tree->childnode == LEAF)
107228ba53c0SMasahiro Yamada 		goto done;
107328ba53c0SMasahiro Yamada 
107428ba53c0SMasahiro Yamada 	assert(tree->childnode == NODE);
107528ba53c0SMasahiro Yamada 	pathbits = 0;
107628ba53c0SMasahiro Yamada 	pathmask = 0;
107728ba53c0SMasahiro Yamada 	node = tree->root;
107828ba53c0SMasahiro Yamada 	leftmask = rightmask = 0;
107928ba53c0SMasahiro Yamada 	while (node) {
108028ba53c0SMasahiro Yamada 		if (!node->mark)
108128ba53c0SMasahiro Yamada 			goto skip;
108228ba53c0SMasahiro Yamada 		offset = 0;
108328ba53c0SMasahiro Yamada 		if (!node->left || !node->right) {
108428ba53c0SMasahiro Yamada 			size = 1;
108528ba53c0SMasahiro Yamada 		} else {
108628ba53c0SMasahiro Yamada 			if (node->rightnode == NODE) {
108728ba53c0SMasahiro Yamada 				/*
108828ba53c0SMasahiro Yamada 				 * If the right node is not marked,
108928ba53c0SMasahiro Yamada 				 * look for a corresponding node in
109028ba53c0SMasahiro Yamada 				 * the next tree.  Such a node need
109128ba53c0SMasahiro Yamada 				 * not exist.
109228ba53c0SMasahiro Yamada 				 */
109328ba53c0SMasahiro Yamada 				right = node->right;
109428ba53c0SMasahiro Yamada 				next = tree->next;
109528ba53c0SMasahiro Yamada 				while (!right->mark) {
109628ba53c0SMasahiro Yamada 					assert(next);
109728ba53c0SMasahiro Yamada 					n = next->root;
109828ba53c0SMasahiro Yamada 					while (n->bitnum != node->bitnum) {
109928ba53c0SMasahiro Yamada 						nbit = 1 << n->bitnum;
110028ba53c0SMasahiro Yamada 						if (!(pathmask & nbit))
110128ba53c0SMasahiro Yamada 							break;
110228ba53c0SMasahiro Yamada 						if (pathbits & nbit) {
110328ba53c0SMasahiro Yamada 							if (n->rightnode == LEAF)
110428ba53c0SMasahiro Yamada 								break;
110528ba53c0SMasahiro Yamada 							n = n->right;
110628ba53c0SMasahiro Yamada 						} else {
110728ba53c0SMasahiro Yamada 							if (n->leftnode == LEAF)
110828ba53c0SMasahiro Yamada 								break;
110928ba53c0SMasahiro Yamada 							n = n->left;
111028ba53c0SMasahiro Yamada 						}
111128ba53c0SMasahiro Yamada 					}
111228ba53c0SMasahiro Yamada 					if (n->bitnum != node->bitnum)
111328ba53c0SMasahiro Yamada 						break;
111428ba53c0SMasahiro Yamada 					n = n->right;
111528ba53c0SMasahiro Yamada 					right = n;
111628ba53c0SMasahiro Yamada 					next = next->next;
111728ba53c0SMasahiro Yamada 				}
111828ba53c0SMasahiro Yamada 				/* Make sure the right node is marked. */
111928ba53c0SMasahiro Yamada 				if (!right->mark)
112028ba53c0SMasahiro Yamada 					changed += mark_subtree(right);
112128ba53c0SMasahiro Yamada 				offset = right->index - node->index;
112228ba53c0SMasahiro Yamada 			} else {
112328ba53c0SMasahiro Yamada 				offset = *tree->leaf_index(tree, node->right);
112428ba53c0SMasahiro Yamada 				offset -= node->index;
112528ba53c0SMasahiro Yamada 			}
112628ba53c0SMasahiro Yamada 			assert(offset >= 0);
112728ba53c0SMasahiro Yamada 			assert(offset <= 0xffffff);
112828ba53c0SMasahiro Yamada 			if (offset <= 0xff) {
112928ba53c0SMasahiro Yamada 				size = 2;
113028ba53c0SMasahiro Yamada 			} else if (offset <= 0xffff) {
113128ba53c0SMasahiro Yamada 				size = 3;
113228ba53c0SMasahiro Yamada 			} else { /* offset <= 0xffffff */
113328ba53c0SMasahiro Yamada 				size = 4;
113428ba53c0SMasahiro Yamada 			}
113528ba53c0SMasahiro Yamada 		}
113628ba53c0SMasahiro Yamada 		if (node->size != size || node->offset != offset) {
113728ba53c0SMasahiro Yamada 			node->size = size;
113828ba53c0SMasahiro Yamada 			node->offset = offset;
113928ba53c0SMasahiro Yamada 			changed++;
114028ba53c0SMasahiro Yamada 		}
114128ba53c0SMasahiro Yamada skip:
114228ba53c0SMasahiro Yamada 		while (node) {
114328ba53c0SMasahiro Yamada 			bitmask = 1 << node->bitnum;
114428ba53c0SMasahiro Yamada 			pathmask |= bitmask;
114528ba53c0SMasahiro Yamada 			if (node->mark && (leftmask & bitmask) == 0) {
114628ba53c0SMasahiro Yamada 				leftmask |= bitmask;
114728ba53c0SMasahiro Yamada 				if (node->leftnode == LEAF) {
114828ba53c0SMasahiro Yamada 					assert(node->left);
114928ba53c0SMasahiro Yamada 				} else if (node->left) {
115028ba53c0SMasahiro Yamada 					assert(node->leftnode == NODE);
115128ba53c0SMasahiro Yamada 					indent += 1;
115228ba53c0SMasahiro Yamada 					node = node->left;
115328ba53c0SMasahiro Yamada 					break;
115428ba53c0SMasahiro Yamada 				}
115528ba53c0SMasahiro Yamada 			}
115628ba53c0SMasahiro Yamada 			if (node->mark && (rightmask & bitmask) == 0) {
115728ba53c0SMasahiro Yamada 				rightmask |= bitmask;
115828ba53c0SMasahiro Yamada 				pathbits |= bitmask;
115928ba53c0SMasahiro Yamada 				if (node->rightnode == LEAF) {
116028ba53c0SMasahiro Yamada 					assert(node->right);
116128ba53c0SMasahiro Yamada 				} else if (node->right) {
116228ba53c0SMasahiro Yamada 					assert(node->rightnode == NODE);
116328ba53c0SMasahiro Yamada 					indent += 1;
116428ba53c0SMasahiro Yamada 					node = node->right;
116528ba53c0SMasahiro Yamada 					break;
116628ba53c0SMasahiro Yamada 				}
116728ba53c0SMasahiro Yamada 			}
116828ba53c0SMasahiro Yamada 			leftmask &= ~bitmask;
116928ba53c0SMasahiro Yamada 			rightmask &= ~bitmask;
117028ba53c0SMasahiro Yamada 			pathmask &= ~bitmask;
117128ba53c0SMasahiro Yamada 			pathbits &= ~bitmask;
117228ba53c0SMasahiro Yamada 			node = node->parent;
117328ba53c0SMasahiro Yamada 			indent -= 1;
117428ba53c0SMasahiro Yamada 		}
117528ba53c0SMasahiro Yamada 	}
117628ba53c0SMasahiro Yamada done:
117728ba53c0SMasahiro Yamada 	if (verbose > 0)
117828ba53c0SMasahiro Yamada 		printf("Found %d changes\n", changed);
117928ba53c0SMasahiro Yamada 	return changed;
118028ba53c0SMasahiro Yamada }
118128ba53c0SMasahiro Yamada 
118228ba53c0SMasahiro Yamada /*
118328ba53c0SMasahiro Yamada  * Emit a trie for the given tree into the data array.
118428ba53c0SMasahiro Yamada  */
emit(struct tree * tree,unsigned char * data)118528ba53c0SMasahiro Yamada static void emit(struct tree *tree, unsigned char *data)
118628ba53c0SMasahiro Yamada {
118728ba53c0SMasahiro Yamada 	struct node *node;
118828ba53c0SMasahiro Yamada 	unsigned int leftmask;
118928ba53c0SMasahiro Yamada 	unsigned int rightmask;
119028ba53c0SMasahiro Yamada 	unsigned int bitmask;
119128ba53c0SMasahiro Yamada 	int offlen;
119228ba53c0SMasahiro Yamada 	int offset;
119328ba53c0SMasahiro Yamada 	int index;
119428ba53c0SMasahiro Yamada 	int indent;
119528ba53c0SMasahiro Yamada 	int size;
119628ba53c0SMasahiro Yamada 	int bytes;
119728ba53c0SMasahiro Yamada 	int leaves;
119828ba53c0SMasahiro Yamada 	int nodes[4];
119928ba53c0SMasahiro Yamada 	unsigned char byte;
120028ba53c0SMasahiro Yamada 
120128ba53c0SMasahiro Yamada 	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
120228ba53c0SMasahiro Yamada 	leaves = 0;
120328ba53c0SMasahiro Yamada 	bytes = 0;
120428ba53c0SMasahiro Yamada 	index = tree->index;
120528ba53c0SMasahiro Yamada 	data += index;
120628ba53c0SMasahiro Yamada 	indent = 1;
120728ba53c0SMasahiro Yamada 	if (verbose > 0)
120828ba53c0SMasahiro Yamada 		printf("Emitting %s_%x\n", tree->type, tree->maxage);
120928ba53c0SMasahiro Yamada 	if (tree->childnode == LEAF) {
121028ba53c0SMasahiro Yamada 		assert(tree->root);
121128ba53c0SMasahiro Yamada 		tree->leaf_emit(tree->root, data);
121228ba53c0SMasahiro Yamada 		size = tree->leaf_size(tree->root);
121328ba53c0SMasahiro Yamada 		index += size;
121428ba53c0SMasahiro Yamada 		leaves++;
121528ba53c0SMasahiro Yamada 		goto done;
121628ba53c0SMasahiro Yamada 	}
121728ba53c0SMasahiro Yamada 
121828ba53c0SMasahiro Yamada 	assert(tree->childnode == NODE);
121928ba53c0SMasahiro Yamada 	node = tree->root;
122028ba53c0SMasahiro Yamada 	leftmask = rightmask = 0;
122128ba53c0SMasahiro Yamada 	while (node) {
122228ba53c0SMasahiro Yamada 		if (!node->mark)
122328ba53c0SMasahiro Yamada 			goto skip;
122428ba53c0SMasahiro Yamada 		assert(node->offset != -1);
122528ba53c0SMasahiro Yamada 		assert(node->index == index);
122628ba53c0SMasahiro Yamada 
122728ba53c0SMasahiro Yamada 		byte = 0;
122828ba53c0SMasahiro Yamada 		if (node->nextbyte)
122928ba53c0SMasahiro Yamada 			byte |= NEXTBYTE;
123028ba53c0SMasahiro Yamada 		byte |= (node->bitnum & BITNUM);
123128ba53c0SMasahiro Yamada 		if (node->left && node->right) {
123228ba53c0SMasahiro Yamada 			if (node->leftnode == NODE)
123328ba53c0SMasahiro Yamada 				byte |= LEFTNODE;
123428ba53c0SMasahiro Yamada 			if (node->rightnode == NODE)
123528ba53c0SMasahiro Yamada 				byte |= RIGHTNODE;
123628ba53c0SMasahiro Yamada 			if (node->offset <= 0xff)
123728ba53c0SMasahiro Yamada 				offlen = 1;
123828ba53c0SMasahiro Yamada 			else if (node->offset <= 0xffff)
123928ba53c0SMasahiro Yamada 				offlen = 2;
124028ba53c0SMasahiro Yamada 			else
124128ba53c0SMasahiro Yamada 				offlen = 3;
124228ba53c0SMasahiro Yamada 			nodes[offlen]++;
124328ba53c0SMasahiro Yamada 			offset = node->offset;
124428ba53c0SMasahiro Yamada 			byte |= offlen << OFFLEN_SHIFT;
124528ba53c0SMasahiro Yamada 			*data++ = byte;
124628ba53c0SMasahiro Yamada 			index++;
124728ba53c0SMasahiro Yamada 			while (offlen--) {
124828ba53c0SMasahiro Yamada 				*data++ = offset & 0xff;
124928ba53c0SMasahiro Yamada 				index++;
125028ba53c0SMasahiro Yamada 				offset >>= 8;
125128ba53c0SMasahiro Yamada 			}
125228ba53c0SMasahiro Yamada 		} else if (node->left) {
125328ba53c0SMasahiro Yamada 			if (node->leftnode == NODE)
125428ba53c0SMasahiro Yamada 				byte |= TRIENODE;
125528ba53c0SMasahiro Yamada 			nodes[0]++;
125628ba53c0SMasahiro Yamada 			*data++ = byte;
125728ba53c0SMasahiro Yamada 			index++;
125828ba53c0SMasahiro Yamada 		} else if (node->right) {
125928ba53c0SMasahiro Yamada 			byte |= RIGHTNODE;
126028ba53c0SMasahiro Yamada 			if (node->rightnode == NODE)
126128ba53c0SMasahiro Yamada 				byte |= TRIENODE;
126228ba53c0SMasahiro Yamada 			nodes[0]++;
126328ba53c0SMasahiro Yamada 			*data++ = byte;
126428ba53c0SMasahiro Yamada 			index++;
126528ba53c0SMasahiro Yamada 		} else {
126628ba53c0SMasahiro Yamada 			assert(0);
126728ba53c0SMasahiro Yamada 		}
126828ba53c0SMasahiro Yamada skip:
126928ba53c0SMasahiro Yamada 		while (node) {
127028ba53c0SMasahiro Yamada 			bitmask = 1 << node->bitnum;
127128ba53c0SMasahiro Yamada 			if (node->mark && (leftmask & bitmask) == 0) {
127228ba53c0SMasahiro Yamada 				leftmask |= bitmask;
127328ba53c0SMasahiro Yamada 				if (node->leftnode == LEAF) {
127428ba53c0SMasahiro Yamada 					assert(node->left);
127528ba53c0SMasahiro Yamada 					data = tree->leaf_emit(node->left,
127628ba53c0SMasahiro Yamada 							       data);
127728ba53c0SMasahiro Yamada 					size = tree->leaf_size(node->left);
127828ba53c0SMasahiro Yamada 					index += size;
127928ba53c0SMasahiro Yamada 					bytes += size;
128028ba53c0SMasahiro Yamada 					leaves++;
128128ba53c0SMasahiro Yamada 				} else if (node->left) {
128228ba53c0SMasahiro Yamada 					assert(node->leftnode == NODE);
128328ba53c0SMasahiro Yamada 					indent += 1;
128428ba53c0SMasahiro Yamada 					node = node->left;
128528ba53c0SMasahiro Yamada 					break;
128628ba53c0SMasahiro Yamada 				}
128728ba53c0SMasahiro Yamada 			}
128828ba53c0SMasahiro Yamada 			if (node->mark && (rightmask & bitmask) == 0) {
128928ba53c0SMasahiro Yamada 				rightmask |= bitmask;
129028ba53c0SMasahiro Yamada 				if (node->rightnode == LEAF) {
129128ba53c0SMasahiro Yamada 					assert(node->right);
129228ba53c0SMasahiro Yamada 					data = tree->leaf_emit(node->right,
129328ba53c0SMasahiro Yamada 							       data);
129428ba53c0SMasahiro Yamada 					size = tree->leaf_size(node->right);
129528ba53c0SMasahiro Yamada 					index += size;
129628ba53c0SMasahiro Yamada 					bytes += size;
129728ba53c0SMasahiro Yamada 					leaves++;
129828ba53c0SMasahiro Yamada 				} else if (node->right) {
129928ba53c0SMasahiro Yamada 					assert(node->rightnode == NODE);
130028ba53c0SMasahiro Yamada 					indent += 1;
130128ba53c0SMasahiro Yamada 					node = node->right;
130228ba53c0SMasahiro Yamada 					break;
130328ba53c0SMasahiro Yamada 				}
130428ba53c0SMasahiro Yamada 			}
130528ba53c0SMasahiro Yamada 			leftmask &= ~bitmask;
130628ba53c0SMasahiro Yamada 			rightmask &= ~bitmask;
130728ba53c0SMasahiro Yamada 			node = node->parent;
130828ba53c0SMasahiro Yamada 			indent -= 1;
130928ba53c0SMasahiro Yamada 		}
131028ba53c0SMasahiro Yamada 	}
131128ba53c0SMasahiro Yamada done:
131228ba53c0SMasahiro Yamada 	if (verbose > 0) {
131328ba53c0SMasahiro Yamada 		printf("Emitted %d (%d) leaves",
131428ba53c0SMasahiro Yamada 			leaves, bytes);
131528ba53c0SMasahiro Yamada 		printf(" %d (%d+%d+%d+%d) nodes",
131628ba53c0SMasahiro Yamada 			nodes[0] + nodes[1] + nodes[2] + nodes[3],
131728ba53c0SMasahiro Yamada 			nodes[0], nodes[1], nodes[2], nodes[3]);
131828ba53c0SMasahiro Yamada 		printf(" %d total\n", index - tree->index);
131928ba53c0SMasahiro Yamada 	}
132028ba53c0SMasahiro Yamada }
132128ba53c0SMasahiro Yamada 
132228ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
132328ba53c0SMasahiro Yamada 
132428ba53c0SMasahiro Yamada /*
132528ba53c0SMasahiro Yamada  * Unicode data.
132628ba53c0SMasahiro Yamada  *
132728ba53c0SMasahiro Yamada  * We need to keep track of the Canonical Combining Class, the Age,
132828ba53c0SMasahiro Yamada  * and decompositions for a code point.
132928ba53c0SMasahiro Yamada  *
133028ba53c0SMasahiro Yamada  * For the Age, we store the index into the ages table.  Effectively
133128ba53c0SMasahiro Yamada  * this is a generation number that the table maps to a unicode
133228ba53c0SMasahiro Yamada  * version.
133328ba53c0SMasahiro Yamada  *
133428ba53c0SMasahiro Yamada  * The correction field is used to indicate that this entry is in the
133528ba53c0SMasahiro Yamada  * corrections array, which contains decompositions that were
133628ba53c0SMasahiro Yamada  * corrected in later revisions.  The value of the correction field is
133728ba53c0SMasahiro Yamada  * the Unicode version in which the mapping was corrected.
133828ba53c0SMasahiro Yamada  */
133928ba53c0SMasahiro Yamada struct unicode_data {
134028ba53c0SMasahiro Yamada 	unsigned int code;
134128ba53c0SMasahiro Yamada 	int ccc;
134228ba53c0SMasahiro Yamada 	int gen;
134328ba53c0SMasahiro Yamada 	int correction;
134428ba53c0SMasahiro Yamada 	unsigned int *utf32nfdi;
134528ba53c0SMasahiro Yamada 	unsigned int *utf32nfdicf;
134628ba53c0SMasahiro Yamada 	char *utf8nfdi;
134728ba53c0SMasahiro Yamada 	char *utf8nfdicf;
134828ba53c0SMasahiro Yamada };
134928ba53c0SMasahiro Yamada 
135028ba53c0SMasahiro Yamada struct unicode_data unicode_data[0x110000];
135128ba53c0SMasahiro Yamada struct unicode_data *corrections;
135228ba53c0SMasahiro Yamada int    corrections_count;
135328ba53c0SMasahiro Yamada 
135428ba53c0SMasahiro Yamada struct tree *nfdi_tree;
135528ba53c0SMasahiro Yamada struct tree *nfdicf_tree;
135628ba53c0SMasahiro Yamada 
135728ba53c0SMasahiro Yamada struct tree *trees;
135828ba53c0SMasahiro Yamada int          trees_count;
135928ba53c0SMasahiro Yamada 
136028ba53c0SMasahiro Yamada /*
136128ba53c0SMasahiro Yamada  * Check the corrections array to see if this entry was corrected at
136228ba53c0SMasahiro Yamada  * some point.
136328ba53c0SMasahiro Yamada  */
corrections_lookup(struct unicode_data * u)136428ba53c0SMasahiro Yamada static struct unicode_data *corrections_lookup(struct unicode_data *u)
136528ba53c0SMasahiro Yamada {
136628ba53c0SMasahiro Yamada 	int i;
136728ba53c0SMasahiro Yamada 
136828ba53c0SMasahiro Yamada 	for (i = 0; i != corrections_count; i++)
136928ba53c0SMasahiro Yamada 		if (u->code == corrections[i].code)
137028ba53c0SMasahiro Yamada 			return &corrections[i];
137128ba53c0SMasahiro Yamada 	return u;
137228ba53c0SMasahiro Yamada }
137328ba53c0SMasahiro Yamada 
nfdi_equal(void * l,void * r)137428ba53c0SMasahiro Yamada static int nfdi_equal(void *l, void *r)
137528ba53c0SMasahiro Yamada {
137628ba53c0SMasahiro Yamada 	struct unicode_data *left = l;
137728ba53c0SMasahiro Yamada 	struct unicode_data *right = r;
137828ba53c0SMasahiro Yamada 
137928ba53c0SMasahiro Yamada 	if (left->gen != right->gen)
138028ba53c0SMasahiro Yamada 		return 0;
138128ba53c0SMasahiro Yamada 	if (left->ccc != right->ccc)
138228ba53c0SMasahiro Yamada 		return 0;
138328ba53c0SMasahiro Yamada 	if (left->utf8nfdi && right->utf8nfdi &&
138428ba53c0SMasahiro Yamada 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
138528ba53c0SMasahiro Yamada 		return 1;
138628ba53c0SMasahiro Yamada 	if (left->utf8nfdi || right->utf8nfdi)
138728ba53c0SMasahiro Yamada 		return 0;
138828ba53c0SMasahiro Yamada 	return 1;
138928ba53c0SMasahiro Yamada }
139028ba53c0SMasahiro Yamada 
nfdicf_equal(void * l,void * r)139128ba53c0SMasahiro Yamada static int nfdicf_equal(void *l, void *r)
139228ba53c0SMasahiro Yamada {
139328ba53c0SMasahiro Yamada 	struct unicode_data *left = l;
139428ba53c0SMasahiro Yamada 	struct unicode_data *right = r;
139528ba53c0SMasahiro Yamada 
139628ba53c0SMasahiro Yamada 	if (left->gen != right->gen)
139728ba53c0SMasahiro Yamada 		return 0;
139828ba53c0SMasahiro Yamada 	if (left->ccc != right->ccc)
139928ba53c0SMasahiro Yamada 		return 0;
140028ba53c0SMasahiro Yamada 	if (left->utf8nfdicf && right->utf8nfdicf &&
140128ba53c0SMasahiro Yamada 	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
140228ba53c0SMasahiro Yamada 		return 1;
140328ba53c0SMasahiro Yamada 	if (left->utf8nfdicf && right->utf8nfdicf)
140428ba53c0SMasahiro Yamada 		return 0;
140528ba53c0SMasahiro Yamada 	if (left->utf8nfdicf || right->utf8nfdicf)
140628ba53c0SMasahiro Yamada 		return 0;
140728ba53c0SMasahiro Yamada 	if (left->utf8nfdi && right->utf8nfdi &&
140828ba53c0SMasahiro Yamada 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
140928ba53c0SMasahiro Yamada 		return 1;
141028ba53c0SMasahiro Yamada 	if (left->utf8nfdi || right->utf8nfdi)
141128ba53c0SMasahiro Yamada 		return 0;
141228ba53c0SMasahiro Yamada 	return 1;
141328ba53c0SMasahiro Yamada }
141428ba53c0SMasahiro Yamada 
nfdi_print(void * l,int indent)141528ba53c0SMasahiro Yamada static void nfdi_print(void *l, int indent)
141628ba53c0SMasahiro Yamada {
141728ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
141828ba53c0SMasahiro Yamada 
141928ba53c0SMasahiro Yamada 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
142028ba53c0SMasahiro Yamada 		leaf->code, leaf->ccc, leaf->gen);
142128ba53c0SMasahiro Yamada 
142228ba53c0SMasahiro Yamada 	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
142328ba53c0SMasahiro Yamada 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
142428ba53c0SMasahiro Yamada 	else if (leaf->utf8nfdi)
142528ba53c0SMasahiro Yamada 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
142628ba53c0SMasahiro Yamada 
142728ba53c0SMasahiro Yamada 	printf("\n");
142828ba53c0SMasahiro Yamada }
142928ba53c0SMasahiro Yamada 
nfdicf_print(void * l,int indent)143028ba53c0SMasahiro Yamada static void nfdicf_print(void *l, int indent)
143128ba53c0SMasahiro Yamada {
143228ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
143328ba53c0SMasahiro Yamada 
143428ba53c0SMasahiro Yamada 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
143528ba53c0SMasahiro Yamada 		leaf->code, leaf->ccc, leaf->gen);
143628ba53c0SMasahiro Yamada 
143728ba53c0SMasahiro Yamada 	if (leaf->utf8nfdicf)
143828ba53c0SMasahiro Yamada 		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
143928ba53c0SMasahiro Yamada 	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
144028ba53c0SMasahiro Yamada 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
144128ba53c0SMasahiro Yamada 	else if (leaf->utf8nfdi)
144228ba53c0SMasahiro Yamada 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
144328ba53c0SMasahiro Yamada 	printf("\n");
144428ba53c0SMasahiro Yamada }
144528ba53c0SMasahiro Yamada 
nfdi_mark(void * l)144628ba53c0SMasahiro Yamada static int nfdi_mark(void *l)
144728ba53c0SMasahiro Yamada {
144828ba53c0SMasahiro Yamada 	return 1;
144928ba53c0SMasahiro Yamada }
145028ba53c0SMasahiro Yamada 
nfdicf_mark(void * l)145128ba53c0SMasahiro Yamada static int nfdicf_mark(void *l)
145228ba53c0SMasahiro Yamada {
145328ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
145428ba53c0SMasahiro Yamada 
145528ba53c0SMasahiro Yamada 	if (leaf->utf8nfdicf)
145628ba53c0SMasahiro Yamada 		return 1;
145728ba53c0SMasahiro Yamada 	return 0;
145828ba53c0SMasahiro Yamada }
145928ba53c0SMasahiro Yamada 
correction_mark(void * l)146028ba53c0SMasahiro Yamada static int correction_mark(void *l)
146128ba53c0SMasahiro Yamada {
146228ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
146328ba53c0SMasahiro Yamada 
146428ba53c0SMasahiro Yamada 	return leaf->correction;
146528ba53c0SMasahiro Yamada }
146628ba53c0SMasahiro Yamada 
nfdi_size(void * l)146728ba53c0SMasahiro Yamada static int nfdi_size(void *l)
146828ba53c0SMasahiro Yamada {
146928ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
147028ba53c0SMasahiro Yamada 	int size = 2;
147128ba53c0SMasahiro Yamada 
147228ba53c0SMasahiro Yamada 	if (HANGUL_SYLLABLE(leaf->code))
147328ba53c0SMasahiro Yamada 		size += 1;
147428ba53c0SMasahiro Yamada 	else if (leaf->utf8nfdi)
147528ba53c0SMasahiro Yamada 		size += strlen(leaf->utf8nfdi) + 1;
147628ba53c0SMasahiro Yamada 	return size;
147728ba53c0SMasahiro Yamada }
147828ba53c0SMasahiro Yamada 
nfdicf_size(void * l)147928ba53c0SMasahiro Yamada static int nfdicf_size(void *l)
148028ba53c0SMasahiro Yamada {
148128ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
148228ba53c0SMasahiro Yamada 	int size = 2;
148328ba53c0SMasahiro Yamada 
148428ba53c0SMasahiro Yamada 	if (HANGUL_SYLLABLE(leaf->code))
148528ba53c0SMasahiro Yamada 		size += 1;
148628ba53c0SMasahiro Yamada 	else if (leaf->utf8nfdicf)
148728ba53c0SMasahiro Yamada 		size += strlen(leaf->utf8nfdicf) + 1;
148828ba53c0SMasahiro Yamada 	else if (leaf->utf8nfdi)
148928ba53c0SMasahiro Yamada 		size += strlen(leaf->utf8nfdi) + 1;
149028ba53c0SMasahiro Yamada 	return size;
149128ba53c0SMasahiro Yamada }
149228ba53c0SMasahiro Yamada 
nfdi_index(struct tree * tree,void * l)149328ba53c0SMasahiro Yamada static int *nfdi_index(struct tree *tree, void *l)
149428ba53c0SMasahiro Yamada {
149528ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
149628ba53c0SMasahiro Yamada 
149728ba53c0SMasahiro Yamada 	return &tree->leafindex[leaf->code];
149828ba53c0SMasahiro Yamada }
149928ba53c0SMasahiro Yamada 
nfdicf_index(struct tree * tree,void * l)150028ba53c0SMasahiro Yamada static int *nfdicf_index(struct tree *tree, void *l)
150128ba53c0SMasahiro Yamada {
150228ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
150328ba53c0SMasahiro Yamada 
150428ba53c0SMasahiro Yamada 	return &tree->leafindex[leaf->code];
150528ba53c0SMasahiro Yamada }
150628ba53c0SMasahiro Yamada 
nfdi_emit(void * l,unsigned char * data)150728ba53c0SMasahiro Yamada static unsigned char *nfdi_emit(void *l, unsigned char *data)
150828ba53c0SMasahiro Yamada {
150928ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
151028ba53c0SMasahiro Yamada 	unsigned char *s;
151128ba53c0SMasahiro Yamada 
151228ba53c0SMasahiro Yamada 	*data++ = leaf->gen;
151328ba53c0SMasahiro Yamada 
151428ba53c0SMasahiro Yamada 	if (HANGUL_SYLLABLE(leaf->code)) {
151528ba53c0SMasahiro Yamada 		*data++ = DECOMPOSE;
151628ba53c0SMasahiro Yamada 		*data++ = HANGUL;
151728ba53c0SMasahiro Yamada 	} else if (leaf->utf8nfdi) {
151828ba53c0SMasahiro Yamada 		*data++ = DECOMPOSE;
151928ba53c0SMasahiro Yamada 		s = (unsigned char*)leaf->utf8nfdi;
152028ba53c0SMasahiro Yamada 		while ((*data++ = *s++) != 0)
152128ba53c0SMasahiro Yamada 			;
152228ba53c0SMasahiro Yamada 	} else {
152328ba53c0SMasahiro Yamada 		*data++ = leaf->ccc;
152428ba53c0SMasahiro Yamada 	}
152528ba53c0SMasahiro Yamada 	return data;
152628ba53c0SMasahiro Yamada }
152728ba53c0SMasahiro Yamada 
nfdicf_emit(void * l,unsigned char * data)152828ba53c0SMasahiro Yamada static unsigned char *nfdicf_emit(void *l, unsigned char *data)
152928ba53c0SMasahiro Yamada {
153028ba53c0SMasahiro Yamada 	struct unicode_data *leaf = l;
153128ba53c0SMasahiro Yamada 	unsigned char *s;
153228ba53c0SMasahiro Yamada 
153328ba53c0SMasahiro Yamada 	*data++ = leaf->gen;
153428ba53c0SMasahiro Yamada 
153528ba53c0SMasahiro Yamada 	if (HANGUL_SYLLABLE(leaf->code)) {
153628ba53c0SMasahiro Yamada 		*data++ = DECOMPOSE;
153728ba53c0SMasahiro Yamada 		*data++ = HANGUL;
153828ba53c0SMasahiro Yamada 	} else if (leaf->utf8nfdicf) {
153928ba53c0SMasahiro Yamada 		*data++ = DECOMPOSE;
154028ba53c0SMasahiro Yamada 		s = (unsigned char*)leaf->utf8nfdicf;
154128ba53c0SMasahiro Yamada 		while ((*data++ = *s++) != 0)
154228ba53c0SMasahiro Yamada 			;
154328ba53c0SMasahiro Yamada 	} else if (leaf->utf8nfdi) {
154428ba53c0SMasahiro Yamada 		*data++ = DECOMPOSE;
154528ba53c0SMasahiro Yamada 		s = (unsigned char*)leaf->utf8nfdi;
154628ba53c0SMasahiro Yamada 		while ((*data++ = *s++) != 0)
154728ba53c0SMasahiro Yamada 			;
154828ba53c0SMasahiro Yamada 	} else {
154928ba53c0SMasahiro Yamada 		*data++ = leaf->ccc;
155028ba53c0SMasahiro Yamada 	}
155128ba53c0SMasahiro Yamada 	return data;
155228ba53c0SMasahiro Yamada }
155328ba53c0SMasahiro Yamada 
utf8_create(struct unicode_data * data)155428ba53c0SMasahiro Yamada static void utf8_create(struct unicode_data *data)
155528ba53c0SMasahiro Yamada {
155628ba53c0SMasahiro Yamada 	char utf[18*4+1];
155728ba53c0SMasahiro Yamada 	char *u;
155828ba53c0SMasahiro Yamada 	unsigned int *um;
155928ba53c0SMasahiro Yamada 	int i;
156028ba53c0SMasahiro Yamada 
156128ba53c0SMasahiro Yamada 	if (data->utf8nfdi) {
156228ba53c0SMasahiro Yamada 		assert(data->utf8nfdi[0] == HANGUL);
156328ba53c0SMasahiro Yamada 		return;
156428ba53c0SMasahiro Yamada 	}
156528ba53c0SMasahiro Yamada 
156628ba53c0SMasahiro Yamada 	u = utf;
156728ba53c0SMasahiro Yamada 	um = data->utf32nfdi;
156828ba53c0SMasahiro Yamada 	if (um) {
156928ba53c0SMasahiro Yamada 		for (i = 0; um[i]; i++)
157028ba53c0SMasahiro Yamada 			u += utf8encode(u, um[i]);
157128ba53c0SMasahiro Yamada 		*u = '\0';
157228ba53c0SMasahiro Yamada 		data->utf8nfdi = strdup(utf);
157328ba53c0SMasahiro Yamada 	}
157428ba53c0SMasahiro Yamada 	u = utf;
157528ba53c0SMasahiro Yamada 	um = data->utf32nfdicf;
157628ba53c0SMasahiro Yamada 	if (um) {
157728ba53c0SMasahiro Yamada 		for (i = 0; um[i]; i++)
157828ba53c0SMasahiro Yamada 			u += utf8encode(u, um[i]);
157928ba53c0SMasahiro Yamada 		*u = '\0';
158028ba53c0SMasahiro Yamada 		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
158128ba53c0SMasahiro Yamada 			data->utf8nfdicf = strdup(utf);
158228ba53c0SMasahiro Yamada 	}
158328ba53c0SMasahiro Yamada }
158428ba53c0SMasahiro Yamada 
utf8_init(void)158528ba53c0SMasahiro Yamada static void utf8_init(void)
158628ba53c0SMasahiro Yamada {
158728ba53c0SMasahiro Yamada 	unsigned int unichar;
158828ba53c0SMasahiro Yamada 	int i;
158928ba53c0SMasahiro Yamada 
159028ba53c0SMasahiro Yamada 	for (unichar = 0; unichar != 0x110000; unichar++)
159128ba53c0SMasahiro Yamada 		utf8_create(&unicode_data[unichar]);
159228ba53c0SMasahiro Yamada 
159328ba53c0SMasahiro Yamada 	for (i = 0; i != corrections_count; i++)
159428ba53c0SMasahiro Yamada 		utf8_create(&corrections[i]);
159528ba53c0SMasahiro Yamada }
159628ba53c0SMasahiro Yamada 
trees_init(void)159728ba53c0SMasahiro Yamada static void trees_init(void)
159828ba53c0SMasahiro Yamada {
159928ba53c0SMasahiro Yamada 	struct unicode_data *data;
160028ba53c0SMasahiro Yamada 	unsigned int maxage;
160128ba53c0SMasahiro Yamada 	unsigned int nextage;
160228ba53c0SMasahiro Yamada 	int count;
160328ba53c0SMasahiro Yamada 	int i;
160428ba53c0SMasahiro Yamada 	int j;
160528ba53c0SMasahiro Yamada 
160628ba53c0SMasahiro Yamada 	/* Count the number of different ages. */
160728ba53c0SMasahiro Yamada 	count = 0;
160828ba53c0SMasahiro Yamada 	nextage = (unsigned int)-1;
160928ba53c0SMasahiro Yamada 	do {
161028ba53c0SMasahiro Yamada 		maxage = nextage;
161128ba53c0SMasahiro Yamada 		nextage = 0;
161228ba53c0SMasahiro Yamada 		for (i = 0; i <= corrections_count; i++) {
161328ba53c0SMasahiro Yamada 			data = &corrections[i];
161428ba53c0SMasahiro Yamada 			if (nextage < data->correction &&
161528ba53c0SMasahiro Yamada 			    data->correction < maxage)
161628ba53c0SMasahiro Yamada 				nextage = data->correction;
161728ba53c0SMasahiro Yamada 		}
161828ba53c0SMasahiro Yamada 		count++;
161928ba53c0SMasahiro Yamada 	} while (nextage);
162028ba53c0SMasahiro Yamada 
162128ba53c0SMasahiro Yamada 	/* Two trees per age: nfdi and nfdicf */
162228ba53c0SMasahiro Yamada 	trees_count = count * 2;
162328ba53c0SMasahiro Yamada 	trees = calloc(trees_count, sizeof(struct tree));
162428ba53c0SMasahiro Yamada 
162528ba53c0SMasahiro Yamada 	/* Assign ages to the trees. */
162628ba53c0SMasahiro Yamada 	count = trees_count;
162728ba53c0SMasahiro Yamada 	nextage = (unsigned int)-1;
162828ba53c0SMasahiro Yamada 	do {
162928ba53c0SMasahiro Yamada 		maxage = nextage;
163028ba53c0SMasahiro Yamada 		trees[--count].maxage = maxage;
163128ba53c0SMasahiro Yamada 		trees[--count].maxage = maxage;
163228ba53c0SMasahiro Yamada 		nextage = 0;
163328ba53c0SMasahiro Yamada 		for (i = 0; i <= corrections_count; i++) {
163428ba53c0SMasahiro Yamada 			data = &corrections[i];
163528ba53c0SMasahiro Yamada 			if (nextage < data->correction &&
163628ba53c0SMasahiro Yamada 			    data->correction < maxage)
163728ba53c0SMasahiro Yamada 				nextage = data->correction;
163828ba53c0SMasahiro Yamada 		}
163928ba53c0SMasahiro Yamada 	} while (nextage);
164028ba53c0SMasahiro Yamada 
164128ba53c0SMasahiro Yamada 	/* The ages assigned above are off by one. */
164228ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i++) {
164328ba53c0SMasahiro Yamada 		j = 0;
164428ba53c0SMasahiro Yamada 		while (ages[j] < trees[i].maxage)
164528ba53c0SMasahiro Yamada 			j++;
164628ba53c0SMasahiro Yamada 		trees[i].maxage = ages[j-1];
164728ba53c0SMasahiro Yamada 	}
164828ba53c0SMasahiro Yamada 
164928ba53c0SMasahiro Yamada 	/* Set up the forwarding between trees. */
165028ba53c0SMasahiro Yamada 	trees[trees_count-2].next = &trees[trees_count-1];
165128ba53c0SMasahiro Yamada 	trees[trees_count-1].leaf_mark = nfdi_mark;
165228ba53c0SMasahiro Yamada 	trees[trees_count-2].leaf_mark = nfdicf_mark;
165328ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count-2; i += 2) {
165428ba53c0SMasahiro Yamada 		trees[i].next = &trees[trees_count-2];
165528ba53c0SMasahiro Yamada 		trees[i].leaf_mark = correction_mark;
165628ba53c0SMasahiro Yamada 		trees[i+1].next = &trees[trees_count-1];
165728ba53c0SMasahiro Yamada 		trees[i+1].leaf_mark = correction_mark;
165828ba53c0SMasahiro Yamada 	}
165928ba53c0SMasahiro Yamada 
166028ba53c0SMasahiro Yamada 	/* Assign the callouts. */
166128ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i += 2) {
166228ba53c0SMasahiro Yamada 		trees[i].type = "nfdicf";
166328ba53c0SMasahiro Yamada 		trees[i].leaf_equal = nfdicf_equal;
166428ba53c0SMasahiro Yamada 		trees[i].leaf_print = nfdicf_print;
166528ba53c0SMasahiro Yamada 		trees[i].leaf_size = nfdicf_size;
166628ba53c0SMasahiro Yamada 		trees[i].leaf_index = nfdicf_index;
166728ba53c0SMasahiro Yamada 		trees[i].leaf_emit = nfdicf_emit;
166828ba53c0SMasahiro Yamada 
166928ba53c0SMasahiro Yamada 		trees[i+1].type = "nfdi";
167028ba53c0SMasahiro Yamada 		trees[i+1].leaf_equal = nfdi_equal;
167128ba53c0SMasahiro Yamada 		trees[i+1].leaf_print = nfdi_print;
167228ba53c0SMasahiro Yamada 		trees[i+1].leaf_size = nfdi_size;
167328ba53c0SMasahiro Yamada 		trees[i+1].leaf_index = nfdi_index;
167428ba53c0SMasahiro Yamada 		trees[i+1].leaf_emit = nfdi_emit;
167528ba53c0SMasahiro Yamada 	}
167628ba53c0SMasahiro Yamada 
167728ba53c0SMasahiro Yamada 	/* Finish init. */
167828ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i++)
167928ba53c0SMasahiro Yamada 		trees[i].childnode = NODE;
168028ba53c0SMasahiro Yamada }
168128ba53c0SMasahiro Yamada 
trees_populate(void)168228ba53c0SMasahiro Yamada static void trees_populate(void)
168328ba53c0SMasahiro Yamada {
168428ba53c0SMasahiro Yamada 	struct unicode_data *data;
168528ba53c0SMasahiro Yamada 	unsigned int unichar;
168628ba53c0SMasahiro Yamada 	char keyval[4];
168728ba53c0SMasahiro Yamada 	int keylen;
168828ba53c0SMasahiro Yamada 	int i;
168928ba53c0SMasahiro Yamada 
169028ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i++) {
169128ba53c0SMasahiro Yamada 		if (verbose > 0) {
169228ba53c0SMasahiro Yamada 			printf("Populating %s_%x\n",
169328ba53c0SMasahiro Yamada 				trees[i].type, trees[i].maxage);
169428ba53c0SMasahiro Yamada 		}
169528ba53c0SMasahiro Yamada 		for (unichar = 0; unichar != 0x110000; unichar++) {
169628ba53c0SMasahiro Yamada 			if (unicode_data[unichar].gen < 0)
169728ba53c0SMasahiro Yamada 				continue;
169828ba53c0SMasahiro Yamada 			keylen = utf8encode(keyval, unichar);
169928ba53c0SMasahiro Yamada 			data = corrections_lookup(&unicode_data[unichar]);
170028ba53c0SMasahiro Yamada 			if (data->correction <= trees[i].maxage)
170128ba53c0SMasahiro Yamada 				data = &unicode_data[unichar];
170228ba53c0SMasahiro Yamada 			insert(&trees[i], keyval, keylen, data);
170328ba53c0SMasahiro Yamada 		}
170428ba53c0SMasahiro Yamada 	}
170528ba53c0SMasahiro Yamada }
170628ba53c0SMasahiro Yamada 
trees_reduce(void)170728ba53c0SMasahiro Yamada static void trees_reduce(void)
170828ba53c0SMasahiro Yamada {
170928ba53c0SMasahiro Yamada 	int i;
171028ba53c0SMasahiro Yamada 	int size;
171128ba53c0SMasahiro Yamada 	int changed;
171228ba53c0SMasahiro Yamada 
171328ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i++)
171428ba53c0SMasahiro Yamada 		prune(&trees[i]);
171528ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i++)
171628ba53c0SMasahiro Yamada 		mark_nodes(&trees[i]);
171728ba53c0SMasahiro Yamada 	do {
171828ba53c0SMasahiro Yamada 		size = 0;
171928ba53c0SMasahiro Yamada 		for (i = 0; i != trees_count; i++)
172028ba53c0SMasahiro Yamada 			size = index_nodes(&trees[i], size);
172128ba53c0SMasahiro Yamada 		changed = 0;
172228ba53c0SMasahiro Yamada 		for (i = 0; i != trees_count; i++)
172328ba53c0SMasahiro Yamada 			changed += size_nodes(&trees[i]);
172428ba53c0SMasahiro Yamada 	} while (changed);
172528ba53c0SMasahiro Yamada 
172628ba53c0SMasahiro Yamada 	utf8data = calloc(size, 1);
172728ba53c0SMasahiro Yamada 	utf8data_size = size;
172828ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i++)
172928ba53c0SMasahiro Yamada 		emit(&trees[i], utf8data);
173028ba53c0SMasahiro Yamada 
173128ba53c0SMasahiro Yamada 	if (verbose > 0) {
173228ba53c0SMasahiro Yamada 		for (i = 0; i != trees_count; i++) {
173328ba53c0SMasahiro Yamada 			printf("%s_%x idx %d\n",
173428ba53c0SMasahiro Yamada 				trees[i].type, trees[i].maxage, trees[i].index);
173528ba53c0SMasahiro Yamada 		}
173628ba53c0SMasahiro Yamada 	}
173728ba53c0SMasahiro Yamada 
173828ba53c0SMasahiro Yamada 	nfdi = utf8data + trees[trees_count-1].index;
173928ba53c0SMasahiro Yamada 	nfdicf = utf8data + trees[trees_count-2].index;
174028ba53c0SMasahiro Yamada 
174128ba53c0SMasahiro Yamada 	nfdi_tree = &trees[trees_count-1];
174228ba53c0SMasahiro Yamada 	nfdicf_tree = &trees[trees_count-2];
174328ba53c0SMasahiro Yamada }
174428ba53c0SMasahiro Yamada 
verify(struct tree * tree)174528ba53c0SMasahiro Yamada static void verify(struct tree *tree)
174628ba53c0SMasahiro Yamada {
174728ba53c0SMasahiro Yamada 	struct unicode_data *data;
174828ba53c0SMasahiro Yamada 	utf8leaf_t	*leaf;
174928ba53c0SMasahiro Yamada 	unsigned int	unichar;
175028ba53c0SMasahiro Yamada 	char		key[4];
175128ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
175228ba53c0SMasahiro Yamada 	int		report;
175328ba53c0SMasahiro Yamada 	int		nocf;
175428ba53c0SMasahiro Yamada 
175528ba53c0SMasahiro Yamada 	if (verbose > 0)
175628ba53c0SMasahiro Yamada 		printf("Verifying %s_%x\n", tree->type, tree->maxage);
175728ba53c0SMasahiro Yamada 	nocf = strcmp(tree->type, "nfdicf");
175828ba53c0SMasahiro Yamada 
175928ba53c0SMasahiro Yamada 	for (unichar = 0; unichar != 0x110000; unichar++) {
176028ba53c0SMasahiro Yamada 		report = 0;
176128ba53c0SMasahiro Yamada 		data = corrections_lookup(&unicode_data[unichar]);
176228ba53c0SMasahiro Yamada 		if (data->correction <= tree->maxage)
176328ba53c0SMasahiro Yamada 			data = &unicode_data[unichar];
176428ba53c0SMasahiro Yamada 		utf8encode(key,unichar);
176528ba53c0SMasahiro Yamada 		leaf = utf8lookup(tree, hangul, key);
176628ba53c0SMasahiro Yamada 
176728ba53c0SMasahiro Yamada 		if (!leaf) {
176828ba53c0SMasahiro Yamada 			if (data->gen != -1)
176928ba53c0SMasahiro Yamada 				report++;
177028ba53c0SMasahiro Yamada 			if (unichar < 0xd800 || unichar > 0xdfff)
177128ba53c0SMasahiro Yamada 				report++;
177228ba53c0SMasahiro Yamada 		} else {
177328ba53c0SMasahiro Yamada 			if (unichar >= 0xd800 && unichar <= 0xdfff)
177428ba53c0SMasahiro Yamada 				report++;
177528ba53c0SMasahiro Yamada 			if (data->gen == -1)
177628ba53c0SMasahiro Yamada 				report++;
177728ba53c0SMasahiro Yamada 			if (data->gen != LEAF_GEN(leaf))
177828ba53c0SMasahiro Yamada 				report++;
177928ba53c0SMasahiro Yamada 			if (LEAF_CCC(leaf) == DECOMPOSE) {
178028ba53c0SMasahiro Yamada 				if (HANGUL_SYLLABLE(data->code)) {
178128ba53c0SMasahiro Yamada 					if (data->utf8nfdi[0] != HANGUL)
178228ba53c0SMasahiro Yamada 						report++;
178328ba53c0SMasahiro Yamada 				} else if (nocf) {
178428ba53c0SMasahiro Yamada 					if (!data->utf8nfdi) {
178528ba53c0SMasahiro Yamada 						report++;
178628ba53c0SMasahiro Yamada 					} else if (strcmp(data->utf8nfdi,
178728ba53c0SMasahiro Yamada 							  LEAF_STR(leaf))) {
178828ba53c0SMasahiro Yamada 						report++;
178928ba53c0SMasahiro Yamada 					}
179028ba53c0SMasahiro Yamada 				} else {
179128ba53c0SMasahiro Yamada 					if (!data->utf8nfdicf &&
179228ba53c0SMasahiro Yamada 					    !data->utf8nfdi) {
179328ba53c0SMasahiro Yamada 						report++;
179428ba53c0SMasahiro Yamada 					} else if (data->utf8nfdicf) {
179528ba53c0SMasahiro Yamada 						if (strcmp(data->utf8nfdicf,
179628ba53c0SMasahiro Yamada 							   LEAF_STR(leaf)))
179728ba53c0SMasahiro Yamada 							report++;
179828ba53c0SMasahiro Yamada 					} else if (strcmp(data->utf8nfdi,
179928ba53c0SMasahiro Yamada 							  LEAF_STR(leaf))) {
180028ba53c0SMasahiro Yamada 						report++;
180128ba53c0SMasahiro Yamada 					}
180228ba53c0SMasahiro Yamada 				}
180328ba53c0SMasahiro Yamada 			} else if (data->ccc != LEAF_CCC(leaf)) {
180428ba53c0SMasahiro Yamada 				report++;
180528ba53c0SMasahiro Yamada 			}
180628ba53c0SMasahiro Yamada 		}
180728ba53c0SMasahiro Yamada 		if (report) {
180828ba53c0SMasahiro Yamada 			printf("%X code %X gen %d ccc %d"
180928ba53c0SMasahiro Yamada 				" nfdi -> \"%s\"",
181028ba53c0SMasahiro Yamada 				unichar, data->code, data->gen,
181128ba53c0SMasahiro Yamada 				data->ccc,
181228ba53c0SMasahiro Yamada 				data->utf8nfdi);
181328ba53c0SMasahiro Yamada 			if (leaf) {
181428ba53c0SMasahiro Yamada 				printf(" gen %d ccc %d"
181528ba53c0SMasahiro Yamada 					" nfdi -> \"%s\"",
181628ba53c0SMasahiro Yamada 					LEAF_GEN(leaf),
181728ba53c0SMasahiro Yamada 					LEAF_CCC(leaf),
181828ba53c0SMasahiro Yamada 					LEAF_CCC(leaf) == DECOMPOSE ?
181928ba53c0SMasahiro Yamada 						LEAF_STR(leaf) : "");
182028ba53c0SMasahiro Yamada 			}
182128ba53c0SMasahiro Yamada 			printf("\n");
182228ba53c0SMasahiro Yamada 		}
182328ba53c0SMasahiro Yamada 	}
182428ba53c0SMasahiro Yamada }
182528ba53c0SMasahiro Yamada 
trees_verify(void)182628ba53c0SMasahiro Yamada static void trees_verify(void)
182728ba53c0SMasahiro Yamada {
182828ba53c0SMasahiro Yamada 	int i;
182928ba53c0SMasahiro Yamada 
183028ba53c0SMasahiro Yamada 	for (i = 0; i != trees_count; i++)
183128ba53c0SMasahiro Yamada 		verify(&trees[i]);
183228ba53c0SMasahiro Yamada }
183328ba53c0SMasahiro Yamada 
183428ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
183528ba53c0SMasahiro Yamada 
help(void)183628ba53c0SMasahiro Yamada static void help(void)
183728ba53c0SMasahiro Yamada {
183828ba53c0SMasahiro Yamada 	printf("Usage: %s [options]\n", argv0);
183928ba53c0SMasahiro Yamada 	printf("\n");
184028ba53c0SMasahiro Yamada 	printf("This program creates an a data trie used for parsing and\n");
184128ba53c0SMasahiro Yamada 	printf("normalization of UTF-8 strings. The trie is derived from\n");
184228ba53c0SMasahiro Yamada 	printf("a set of input files from the Unicode character database\n");
184328ba53c0SMasahiro Yamada 	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
184428ba53c0SMasahiro Yamada 	printf("\n");
184528ba53c0SMasahiro Yamada 	printf("The generated tree supports two normalization forms:\n");
184628ba53c0SMasahiro Yamada 	printf("\n");
184728ba53c0SMasahiro Yamada 	printf("\tnfdi:\n");
184828ba53c0SMasahiro Yamada 	printf("\t- Apply unicode normalization form NFD.\n");
184928ba53c0SMasahiro Yamada 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
185028ba53c0SMasahiro Yamada 	printf("\n");
185128ba53c0SMasahiro Yamada 	printf("\tnfdicf:\n");
185228ba53c0SMasahiro Yamada 	printf("\t- Apply unicode normalization form NFD.\n");
185328ba53c0SMasahiro Yamada 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
185428ba53c0SMasahiro Yamada 	printf("\t- Apply a full casefold (C + F).\n");
185528ba53c0SMasahiro Yamada 	printf("\n");
185628ba53c0SMasahiro Yamada 	printf("These forms were chosen as being most useful when dealing\n");
185728ba53c0SMasahiro Yamada 	printf("with file names: NFD catches most cases where characters\n");
185828ba53c0SMasahiro Yamada 	printf("should be considered equivalent. The ignorables are mostly\n");
185928ba53c0SMasahiro Yamada 	printf("invisible, making names hard to type.\n");
186028ba53c0SMasahiro Yamada 	printf("\n");
186128ba53c0SMasahiro Yamada 	printf("The options to specify the files to be used are listed\n");
186228ba53c0SMasahiro Yamada 	printf("below with their default values, which are the names used\n");
186328ba53c0SMasahiro Yamada 	printf("by version 11.0.0 of the Unicode Character Database.\n");
186428ba53c0SMasahiro Yamada 	printf("\n");
186528ba53c0SMasahiro Yamada 	printf("The input files:\n");
186628ba53c0SMasahiro Yamada 	printf("\t-a %s\n", AGE_NAME);
186728ba53c0SMasahiro Yamada 	printf("\t-c %s\n", CCC_NAME);
186828ba53c0SMasahiro Yamada 	printf("\t-p %s\n", PROP_NAME);
186928ba53c0SMasahiro Yamada 	printf("\t-d %s\n", DATA_NAME);
187028ba53c0SMasahiro Yamada 	printf("\t-f %s\n", FOLD_NAME);
187128ba53c0SMasahiro Yamada 	printf("\t-n %s\n", NORM_NAME);
187228ba53c0SMasahiro Yamada 	printf("\n");
187328ba53c0SMasahiro Yamada 	printf("Additionally, the generated tables are tested using:\n");
187428ba53c0SMasahiro Yamada 	printf("\t-t %s\n", TEST_NAME);
187528ba53c0SMasahiro Yamada 	printf("\n");
187628ba53c0SMasahiro Yamada 	printf("Finally, the output file:\n");
187728ba53c0SMasahiro Yamada 	printf("\t-o %s\n", UTF8_NAME);
187828ba53c0SMasahiro Yamada 	printf("\n");
187928ba53c0SMasahiro Yamada }
188028ba53c0SMasahiro Yamada 
usage(void)188128ba53c0SMasahiro Yamada static void usage(void)
188228ba53c0SMasahiro Yamada {
188328ba53c0SMasahiro Yamada 	help();
188428ba53c0SMasahiro Yamada 	exit(1);
188528ba53c0SMasahiro Yamada }
188628ba53c0SMasahiro Yamada 
open_fail(const char * name,int error)188728ba53c0SMasahiro Yamada static void open_fail(const char *name, int error)
188828ba53c0SMasahiro Yamada {
188928ba53c0SMasahiro Yamada 	printf("Error %d opening %s: %s\n", error, name, strerror(error));
189028ba53c0SMasahiro Yamada 	exit(1);
189128ba53c0SMasahiro Yamada }
189228ba53c0SMasahiro Yamada 
file_fail(const char * filename)189328ba53c0SMasahiro Yamada static void file_fail(const char *filename)
189428ba53c0SMasahiro Yamada {
189528ba53c0SMasahiro Yamada 	printf("Error parsing %s\n", filename);
189628ba53c0SMasahiro Yamada 	exit(1);
189728ba53c0SMasahiro Yamada }
189828ba53c0SMasahiro Yamada 
line_fail(const char * filename,const char * line)189928ba53c0SMasahiro Yamada static void line_fail(const char *filename, const char *line)
190028ba53c0SMasahiro Yamada {
190128ba53c0SMasahiro Yamada 	printf("Error parsing %s:%s\n", filename, line);
190228ba53c0SMasahiro Yamada 	exit(1);
190328ba53c0SMasahiro Yamada }
190428ba53c0SMasahiro Yamada 
190528ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
190628ba53c0SMasahiro Yamada 
print_utf32(unsigned int * utf32str)190728ba53c0SMasahiro Yamada static void print_utf32(unsigned int *utf32str)
190828ba53c0SMasahiro Yamada {
190928ba53c0SMasahiro Yamada 	int	i;
191028ba53c0SMasahiro Yamada 
191128ba53c0SMasahiro Yamada 	for (i = 0; utf32str[i]; i++)
191228ba53c0SMasahiro Yamada 		printf(" %X", utf32str[i]);
191328ba53c0SMasahiro Yamada }
191428ba53c0SMasahiro Yamada 
print_utf32nfdi(unsigned int unichar)191528ba53c0SMasahiro Yamada static void print_utf32nfdi(unsigned int unichar)
191628ba53c0SMasahiro Yamada {
191728ba53c0SMasahiro Yamada 	printf(" %X ->", unichar);
191828ba53c0SMasahiro Yamada 	print_utf32(unicode_data[unichar].utf32nfdi);
191928ba53c0SMasahiro Yamada 	printf("\n");
192028ba53c0SMasahiro Yamada }
192128ba53c0SMasahiro Yamada 
print_utf32nfdicf(unsigned int unichar)192228ba53c0SMasahiro Yamada static void print_utf32nfdicf(unsigned int unichar)
192328ba53c0SMasahiro Yamada {
192428ba53c0SMasahiro Yamada 	printf(" %X ->", unichar);
192528ba53c0SMasahiro Yamada 	print_utf32(unicode_data[unichar].utf32nfdicf);
192628ba53c0SMasahiro Yamada 	printf("\n");
192728ba53c0SMasahiro Yamada }
192828ba53c0SMasahiro Yamada 
192928ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
193028ba53c0SMasahiro Yamada 
age_init(void)193128ba53c0SMasahiro Yamada static void age_init(void)
193228ba53c0SMasahiro Yamada {
193328ba53c0SMasahiro Yamada 	FILE *file;
193428ba53c0SMasahiro Yamada 	unsigned int first;
193528ba53c0SMasahiro Yamada 	unsigned int last;
193628ba53c0SMasahiro Yamada 	unsigned int unichar;
193728ba53c0SMasahiro Yamada 	unsigned int major;
193828ba53c0SMasahiro Yamada 	unsigned int minor;
193928ba53c0SMasahiro Yamada 	unsigned int revision;
194028ba53c0SMasahiro Yamada 	int gen;
194128ba53c0SMasahiro Yamada 	int count;
194228ba53c0SMasahiro Yamada 	int ret;
194328ba53c0SMasahiro Yamada 
194428ba53c0SMasahiro Yamada 	if (verbose > 0)
194528ba53c0SMasahiro Yamada 		printf("Parsing %s\n", age_name);
194628ba53c0SMasahiro Yamada 
194728ba53c0SMasahiro Yamada 	file = fopen(age_name, "r");
194828ba53c0SMasahiro Yamada 	if (!file)
194928ba53c0SMasahiro Yamada 		open_fail(age_name, errno);
195028ba53c0SMasahiro Yamada 	count = 0;
195128ba53c0SMasahiro Yamada 
195228ba53c0SMasahiro Yamada 	gen = 0;
195328ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
195428ba53c0SMasahiro Yamada 		ret = sscanf(line, "# Age=V%d_%d_%d",
195528ba53c0SMasahiro Yamada 				&major, &minor, &revision);
195628ba53c0SMasahiro Yamada 		if (ret == 3) {
195728ba53c0SMasahiro Yamada 			ages_count++;
195828ba53c0SMasahiro Yamada 			if (verbose > 1)
195928ba53c0SMasahiro Yamada 				printf(" Age V%d_%d_%d\n",
196028ba53c0SMasahiro Yamada 					major, minor, revision);
196128ba53c0SMasahiro Yamada 			if (!age_valid(major, minor, revision))
196228ba53c0SMasahiro Yamada 				line_fail(age_name, line);
196328ba53c0SMasahiro Yamada 			continue;
196428ba53c0SMasahiro Yamada 		}
196528ba53c0SMasahiro Yamada 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
196628ba53c0SMasahiro Yamada 		if (ret == 2) {
196728ba53c0SMasahiro Yamada 			ages_count++;
196828ba53c0SMasahiro Yamada 			if (verbose > 1)
196928ba53c0SMasahiro Yamada 				printf(" Age V%d_%d\n", major, minor);
197028ba53c0SMasahiro Yamada 			if (!age_valid(major, minor, 0))
197128ba53c0SMasahiro Yamada 				line_fail(age_name, line);
197228ba53c0SMasahiro Yamada 			continue;
197328ba53c0SMasahiro Yamada 		}
197428ba53c0SMasahiro Yamada 	}
197528ba53c0SMasahiro Yamada 
197628ba53c0SMasahiro Yamada 	/* We must have found something above. */
197728ba53c0SMasahiro Yamada 	if (verbose > 1)
197828ba53c0SMasahiro Yamada 		printf("%d age entries\n", ages_count);
197928ba53c0SMasahiro Yamada 	if (ages_count == 0 || ages_count > MAXGEN)
198028ba53c0SMasahiro Yamada 		file_fail(age_name);
198128ba53c0SMasahiro Yamada 
198228ba53c0SMasahiro Yamada 	/* There is a 0 entry. */
198328ba53c0SMasahiro Yamada 	ages_count++;
198428ba53c0SMasahiro Yamada 	ages = calloc(ages_count + 1, sizeof(*ages));
198528ba53c0SMasahiro Yamada 	/* And a guard entry. */
198628ba53c0SMasahiro Yamada 	ages[ages_count] = (unsigned int)-1;
198728ba53c0SMasahiro Yamada 
198828ba53c0SMasahiro Yamada 	rewind(file);
198928ba53c0SMasahiro Yamada 	count = 0;
199028ba53c0SMasahiro Yamada 	gen = 0;
199128ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
199228ba53c0SMasahiro Yamada 		ret = sscanf(line, "# Age=V%d_%d_%d",
199328ba53c0SMasahiro Yamada 				&major, &minor, &revision);
199428ba53c0SMasahiro Yamada 		if (ret == 3) {
199528ba53c0SMasahiro Yamada 			ages[++gen] =
199628ba53c0SMasahiro Yamada 				UNICODE_AGE(major, minor, revision);
199728ba53c0SMasahiro Yamada 			if (verbose > 1)
199828ba53c0SMasahiro Yamada 				printf(" Age V%d_%d_%d = gen %d\n",
199928ba53c0SMasahiro Yamada 					major, minor, revision, gen);
200028ba53c0SMasahiro Yamada 			if (!age_valid(major, minor, revision))
200128ba53c0SMasahiro Yamada 				line_fail(age_name, line);
200228ba53c0SMasahiro Yamada 			continue;
200328ba53c0SMasahiro Yamada 		}
200428ba53c0SMasahiro Yamada 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
200528ba53c0SMasahiro Yamada 		if (ret == 2) {
200628ba53c0SMasahiro Yamada 			ages[++gen] = UNICODE_AGE(major, minor, 0);
200728ba53c0SMasahiro Yamada 			if (verbose > 1)
200828ba53c0SMasahiro Yamada 				printf(" Age V%d_%d = %d\n",
200928ba53c0SMasahiro Yamada 					major, minor, gen);
201028ba53c0SMasahiro Yamada 			if (!age_valid(major, minor, 0))
201128ba53c0SMasahiro Yamada 				line_fail(age_name, line);
201228ba53c0SMasahiro Yamada 			continue;
201328ba53c0SMasahiro Yamada 		}
201428ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X..%X ; %d.%d #",
201528ba53c0SMasahiro Yamada 			     &first, &last, &major, &minor);
201628ba53c0SMasahiro Yamada 		if (ret == 4) {
201728ba53c0SMasahiro Yamada 			for (unichar = first; unichar <= last; unichar++)
201828ba53c0SMasahiro Yamada 				unicode_data[unichar].gen = gen;
201928ba53c0SMasahiro Yamada 			count += 1 + last - first;
202028ba53c0SMasahiro Yamada 			if (verbose > 1)
202128ba53c0SMasahiro Yamada 				printf("  %X..%X gen %d\n", first, last, gen);
202228ba53c0SMasahiro Yamada 			if (!utf32valid(first) || !utf32valid(last))
202328ba53c0SMasahiro Yamada 				line_fail(age_name, line);
202428ba53c0SMasahiro Yamada 			continue;
202528ba53c0SMasahiro Yamada 		}
202628ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
202728ba53c0SMasahiro Yamada 		if (ret == 3) {
202828ba53c0SMasahiro Yamada 			unicode_data[unichar].gen = gen;
202928ba53c0SMasahiro Yamada 			count++;
203028ba53c0SMasahiro Yamada 			if (verbose > 1)
203128ba53c0SMasahiro Yamada 				printf("  %X gen %d\n", unichar, gen);
203228ba53c0SMasahiro Yamada 			if (!utf32valid(unichar))
203328ba53c0SMasahiro Yamada 				line_fail(age_name, line);
203428ba53c0SMasahiro Yamada 			continue;
203528ba53c0SMasahiro Yamada 		}
203628ba53c0SMasahiro Yamada 	}
203728ba53c0SMasahiro Yamada 	unicode_maxage = ages[gen];
203828ba53c0SMasahiro Yamada 	fclose(file);
203928ba53c0SMasahiro Yamada 
204028ba53c0SMasahiro Yamada 	/* Nix surrogate block */
204128ba53c0SMasahiro Yamada 	if (verbose > 1)
204228ba53c0SMasahiro Yamada 		printf(" Removing surrogate block D800..DFFF\n");
204328ba53c0SMasahiro Yamada 	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
204428ba53c0SMasahiro Yamada 		unicode_data[unichar].gen = -1;
204528ba53c0SMasahiro Yamada 
204628ba53c0SMasahiro Yamada 	if (verbose > 0)
204728ba53c0SMasahiro Yamada 	        printf("Found %d entries\n", count);
204828ba53c0SMasahiro Yamada 	if (count == 0)
204928ba53c0SMasahiro Yamada 		file_fail(age_name);
205028ba53c0SMasahiro Yamada }
205128ba53c0SMasahiro Yamada 
ccc_init(void)205228ba53c0SMasahiro Yamada static void ccc_init(void)
205328ba53c0SMasahiro Yamada {
205428ba53c0SMasahiro Yamada 	FILE *file;
205528ba53c0SMasahiro Yamada 	unsigned int first;
205628ba53c0SMasahiro Yamada 	unsigned int last;
205728ba53c0SMasahiro Yamada 	unsigned int unichar;
205828ba53c0SMasahiro Yamada 	unsigned int value;
205928ba53c0SMasahiro Yamada 	int count;
206028ba53c0SMasahiro Yamada 	int ret;
206128ba53c0SMasahiro Yamada 
206228ba53c0SMasahiro Yamada 	if (verbose > 0)
206328ba53c0SMasahiro Yamada 		printf("Parsing %s\n", ccc_name);
206428ba53c0SMasahiro Yamada 
206528ba53c0SMasahiro Yamada 	file = fopen(ccc_name, "r");
206628ba53c0SMasahiro Yamada 	if (!file)
206728ba53c0SMasahiro Yamada 		open_fail(ccc_name, errno);
206828ba53c0SMasahiro Yamada 
206928ba53c0SMasahiro Yamada 	count = 0;
207028ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
207128ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
207228ba53c0SMasahiro Yamada 		if (ret == 3) {
207328ba53c0SMasahiro Yamada 			for (unichar = first; unichar <= last; unichar++) {
207428ba53c0SMasahiro Yamada 				unicode_data[unichar].ccc = value;
207528ba53c0SMasahiro Yamada                                 count++;
207628ba53c0SMasahiro Yamada 			}
207728ba53c0SMasahiro Yamada 			if (verbose > 1)
207828ba53c0SMasahiro Yamada 				printf(" %X..%X ccc %d\n", first, last, value);
207928ba53c0SMasahiro Yamada 			if (!utf32valid(first) || !utf32valid(last))
208028ba53c0SMasahiro Yamada 				line_fail(ccc_name, line);
208128ba53c0SMasahiro Yamada 			continue;
208228ba53c0SMasahiro Yamada 		}
208328ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X ; %d #", &unichar, &value);
208428ba53c0SMasahiro Yamada 		if (ret == 2) {
208528ba53c0SMasahiro Yamada 			unicode_data[unichar].ccc = value;
208628ba53c0SMasahiro Yamada                         count++;
208728ba53c0SMasahiro Yamada 			if (verbose > 1)
208828ba53c0SMasahiro Yamada 				printf(" %X ccc %d\n", unichar, value);
208928ba53c0SMasahiro Yamada 			if (!utf32valid(unichar))
209028ba53c0SMasahiro Yamada 				line_fail(ccc_name, line);
209128ba53c0SMasahiro Yamada 			continue;
209228ba53c0SMasahiro Yamada 		}
209328ba53c0SMasahiro Yamada 	}
209428ba53c0SMasahiro Yamada 	fclose(file);
209528ba53c0SMasahiro Yamada 
209628ba53c0SMasahiro Yamada 	if (verbose > 0)
209728ba53c0SMasahiro Yamada 		printf("Found %d entries\n", count);
209828ba53c0SMasahiro Yamada 	if (count == 0)
209928ba53c0SMasahiro Yamada 		file_fail(ccc_name);
210028ba53c0SMasahiro Yamada }
210128ba53c0SMasahiro Yamada 
ignore_compatibility_form(char * type)210228ba53c0SMasahiro Yamada static int ignore_compatibility_form(char *type)
210328ba53c0SMasahiro Yamada {
210428ba53c0SMasahiro Yamada 	int i;
210528ba53c0SMasahiro Yamada 	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
210628ba53c0SMasahiro Yamada 				 "final", "isolated", "circle", "super",
210728ba53c0SMasahiro Yamada 				 "sub", "vertical", "wide", "narrow",
210828ba53c0SMasahiro Yamada 				 "small", "square", "fraction", "compat"};
210928ba53c0SMasahiro Yamada 
211028ba53c0SMasahiro Yamada 	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
211128ba53c0SMasahiro Yamada 		if (strcmp(type, ignored_types[i]) == 0)
211228ba53c0SMasahiro Yamada 			return 1;
211328ba53c0SMasahiro Yamada 	return 0;
211428ba53c0SMasahiro Yamada }
211528ba53c0SMasahiro Yamada 
nfdi_init(void)211628ba53c0SMasahiro Yamada static void nfdi_init(void)
211728ba53c0SMasahiro Yamada {
211828ba53c0SMasahiro Yamada 	FILE *file;
211928ba53c0SMasahiro Yamada 	unsigned int unichar;
212028ba53c0SMasahiro Yamada 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
212128ba53c0SMasahiro Yamada 	char *s;
212228ba53c0SMasahiro Yamada 	char *type;
212328ba53c0SMasahiro Yamada 	unsigned int *um;
212428ba53c0SMasahiro Yamada 	int count;
212528ba53c0SMasahiro Yamada 	int i;
212628ba53c0SMasahiro Yamada 	int ret;
212728ba53c0SMasahiro Yamada 
212828ba53c0SMasahiro Yamada 	if (verbose > 0)
212928ba53c0SMasahiro Yamada 		printf("Parsing %s\n", data_name);
213028ba53c0SMasahiro Yamada 	file = fopen(data_name, "r");
213128ba53c0SMasahiro Yamada 	if (!file)
213228ba53c0SMasahiro Yamada 		open_fail(data_name, errno);
213328ba53c0SMasahiro Yamada 
213428ba53c0SMasahiro Yamada 	count = 0;
213528ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
213628ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
213728ba53c0SMasahiro Yamada 			     &unichar, buf0);
213828ba53c0SMasahiro Yamada 		if (ret != 2)
213928ba53c0SMasahiro Yamada 			continue;
214028ba53c0SMasahiro Yamada 		if (!utf32valid(unichar))
214128ba53c0SMasahiro Yamada 			line_fail(data_name, line);
214228ba53c0SMasahiro Yamada 
214328ba53c0SMasahiro Yamada 		s = buf0;
214428ba53c0SMasahiro Yamada 		/* skip over <tag> */
214528ba53c0SMasahiro Yamada 		if (*s == '<') {
214628ba53c0SMasahiro Yamada 			type = ++s;
214728ba53c0SMasahiro Yamada 			while (*++s != '>');
214828ba53c0SMasahiro Yamada 			*s++ = '\0';
214928ba53c0SMasahiro Yamada 			if(ignore_compatibility_form(type))
215028ba53c0SMasahiro Yamada 				continue;
215128ba53c0SMasahiro Yamada 		}
215228ba53c0SMasahiro Yamada 		/* decode the decomposition into UTF-32 */
215328ba53c0SMasahiro Yamada 		i = 0;
215428ba53c0SMasahiro Yamada 		while (*s) {
215528ba53c0SMasahiro Yamada 			mapping[i] = strtoul(s, &s, 16);
215628ba53c0SMasahiro Yamada 			if (!utf32valid(mapping[i]))
215728ba53c0SMasahiro Yamada 				line_fail(data_name, line);
215828ba53c0SMasahiro Yamada 			i++;
215928ba53c0SMasahiro Yamada 		}
216028ba53c0SMasahiro Yamada 		mapping[i++] = 0;
216128ba53c0SMasahiro Yamada 
216228ba53c0SMasahiro Yamada 		um = malloc(i * sizeof(unsigned int));
216328ba53c0SMasahiro Yamada 		memcpy(um, mapping, i * sizeof(unsigned int));
216428ba53c0SMasahiro Yamada 		unicode_data[unichar].utf32nfdi = um;
216528ba53c0SMasahiro Yamada 
216628ba53c0SMasahiro Yamada 		if (verbose > 1)
216728ba53c0SMasahiro Yamada 			print_utf32nfdi(unichar);
216828ba53c0SMasahiro Yamada 		count++;
216928ba53c0SMasahiro Yamada 	}
217028ba53c0SMasahiro Yamada 	fclose(file);
217128ba53c0SMasahiro Yamada 	if (verbose > 0)
217228ba53c0SMasahiro Yamada 		printf("Found %d entries\n", count);
217328ba53c0SMasahiro Yamada 	if (count == 0)
217428ba53c0SMasahiro Yamada 		file_fail(data_name);
217528ba53c0SMasahiro Yamada }
217628ba53c0SMasahiro Yamada 
nfdicf_init(void)217728ba53c0SMasahiro Yamada static void nfdicf_init(void)
217828ba53c0SMasahiro Yamada {
217928ba53c0SMasahiro Yamada 	FILE *file;
218028ba53c0SMasahiro Yamada 	unsigned int unichar;
218128ba53c0SMasahiro Yamada 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
218228ba53c0SMasahiro Yamada 	char status;
218328ba53c0SMasahiro Yamada 	char *s;
218428ba53c0SMasahiro Yamada 	unsigned int *um;
218528ba53c0SMasahiro Yamada 	int i;
218628ba53c0SMasahiro Yamada 	int count;
218728ba53c0SMasahiro Yamada 	int ret;
218828ba53c0SMasahiro Yamada 
218928ba53c0SMasahiro Yamada 	if (verbose > 0)
219028ba53c0SMasahiro Yamada 		printf("Parsing %s\n", fold_name);
219128ba53c0SMasahiro Yamada 	file = fopen(fold_name, "r");
219228ba53c0SMasahiro Yamada 	if (!file)
219328ba53c0SMasahiro Yamada 		open_fail(fold_name, errno);
219428ba53c0SMasahiro Yamada 
219528ba53c0SMasahiro Yamada 	count = 0;
219628ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
219728ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
219828ba53c0SMasahiro Yamada 		if (ret != 3)
219928ba53c0SMasahiro Yamada 			continue;
220028ba53c0SMasahiro Yamada 		if (!utf32valid(unichar))
220128ba53c0SMasahiro Yamada 			line_fail(fold_name, line);
220228ba53c0SMasahiro Yamada 		/* Use the C+F casefold. */
220328ba53c0SMasahiro Yamada 		if (status != 'C' && status != 'F')
220428ba53c0SMasahiro Yamada 			continue;
220528ba53c0SMasahiro Yamada 		s = buf0;
220628ba53c0SMasahiro Yamada 		if (*s == '<')
220728ba53c0SMasahiro Yamada 			while (*s++ != ' ')
220828ba53c0SMasahiro Yamada 				;
220928ba53c0SMasahiro Yamada 		i = 0;
221028ba53c0SMasahiro Yamada 		while (*s) {
221128ba53c0SMasahiro Yamada 			mapping[i] = strtoul(s, &s, 16);
221228ba53c0SMasahiro Yamada 			if (!utf32valid(mapping[i]))
221328ba53c0SMasahiro Yamada 				line_fail(fold_name, line);
221428ba53c0SMasahiro Yamada 			i++;
221528ba53c0SMasahiro Yamada 		}
221628ba53c0SMasahiro Yamada 		mapping[i++] = 0;
221728ba53c0SMasahiro Yamada 
221828ba53c0SMasahiro Yamada 		um = malloc(i * sizeof(unsigned int));
221928ba53c0SMasahiro Yamada 		memcpy(um, mapping, i * sizeof(unsigned int));
222028ba53c0SMasahiro Yamada 		unicode_data[unichar].utf32nfdicf = um;
222128ba53c0SMasahiro Yamada 
222228ba53c0SMasahiro Yamada 		if (verbose > 1)
222328ba53c0SMasahiro Yamada 			print_utf32nfdicf(unichar);
222428ba53c0SMasahiro Yamada 		count++;
222528ba53c0SMasahiro Yamada 	}
222628ba53c0SMasahiro Yamada 	fclose(file);
222728ba53c0SMasahiro Yamada 	if (verbose > 0)
222828ba53c0SMasahiro Yamada 		printf("Found %d entries\n", count);
222928ba53c0SMasahiro Yamada 	if (count == 0)
223028ba53c0SMasahiro Yamada 		file_fail(fold_name);
223128ba53c0SMasahiro Yamada }
223228ba53c0SMasahiro Yamada 
ignore_init(void)223328ba53c0SMasahiro Yamada static void ignore_init(void)
223428ba53c0SMasahiro Yamada {
223528ba53c0SMasahiro Yamada 	FILE *file;
223628ba53c0SMasahiro Yamada 	unsigned int unichar;
223728ba53c0SMasahiro Yamada 	unsigned int first;
223828ba53c0SMasahiro Yamada 	unsigned int last;
223928ba53c0SMasahiro Yamada 	unsigned int *um;
224028ba53c0SMasahiro Yamada 	int count;
224128ba53c0SMasahiro Yamada 	int ret;
224228ba53c0SMasahiro Yamada 
224328ba53c0SMasahiro Yamada 	if (verbose > 0)
224428ba53c0SMasahiro Yamada 		printf("Parsing %s\n", prop_name);
224528ba53c0SMasahiro Yamada 	file = fopen(prop_name, "r");
224628ba53c0SMasahiro Yamada 	if (!file)
224728ba53c0SMasahiro Yamada 		open_fail(prop_name, errno);
224828ba53c0SMasahiro Yamada 	assert(file);
224928ba53c0SMasahiro Yamada 	count = 0;
225028ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
225128ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
225228ba53c0SMasahiro Yamada 		if (ret == 3) {
225328ba53c0SMasahiro Yamada 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
225428ba53c0SMasahiro Yamada 				continue;
225528ba53c0SMasahiro Yamada 			if (!utf32valid(first) || !utf32valid(last))
225628ba53c0SMasahiro Yamada 				line_fail(prop_name, line);
225728ba53c0SMasahiro Yamada 			for (unichar = first; unichar <= last; unichar++) {
225828ba53c0SMasahiro Yamada 				free(unicode_data[unichar].utf32nfdi);
225928ba53c0SMasahiro Yamada 				um = malloc(sizeof(unsigned int));
226028ba53c0SMasahiro Yamada 				*um = 0;
226128ba53c0SMasahiro Yamada 				unicode_data[unichar].utf32nfdi = um;
226228ba53c0SMasahiro Yamada 				free(unicode_data[unichar].utf32nfdicf);
226328ba53c0SMasahiro Yamada 				um = malloc(sizeof(unsigned int));
226428ba53c0SMasahiro Yamada 				*um = 0;
226528ba53c0SMasahiro Yamada 				unicode_data[unichar].utf32nfdicf = um;
226628ba53c0SMasahiro Yamada 				count++;
226728ba53c0SMasahiro Yamada 			}
226828ba53c0SMasahiro Yamada 			if (verbose > 1)
226928ba53c0SMasahiro Yamada 				printf(" %X..%X Default_Ignorable_Code_Point\n",
227028ba53c0SMasahiro Yamada 					first, last);
227128ba53c0SMasahiro Yamada 			continue;
227228ba53c0SMasahiro Yamada 		}
227328ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
227428ba53c0SMasahiro Yamada 		if (ret == 2) {
227528ba53c0SMasahiro Yamada 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
227628ba53c0SMasahiro Yamada 				continue;
227728ba53c0SMasahiro Yamada 			if (!utf32valid(unichar))
227828ba53c0SMasahiro Yamada 				line_fail(prop_name, line);
227928ba53c0SMasahiro Yamada 			free(unicode_data[unichar].utf32nfdi);
228028ba53c0SMasahiro Yamada 			um = malloc(sizeof(unsigned int));
228128ba53c0SMasahiro Yamada 			*um = 0;
228228ba53c0SMasahiro Yamada 			unicode_data[unichar].utf32nfdi = um;
228328ba53c0SMasahiro Yamada 			free(unicode_data[unichar].utf32nfdicf);
228428ba53c0SMasahiro Yamada 			um = malloc(sizeof(unsigned int));
228528ba53c0SMasahiro Yamada 			*um = 0;
228628ba53c0SMasahiro Yamada 			unicode_data[unichar].utf32nfdicf = um;
228728ba53c0SMasahiro Yamada 			if (verbose > 1)
228828ba53c0SMasahiro Yamada 				printf(" %X Default_Ignorable_Code_Point\n",
228928ba53c0SMasahiro Yamada 					unichar);
229028ba53c0SMasahiro Yamada 			count++;
229128ba53c0SMasahiro Yamada 			continue;
229228ba53c0SMasahiro Yamada 		}
229328ba53c0SMasahiro Yamada 	}
229428ba53c0SMasahiro Yamada 	fclose(file);
229528ba53c0SMasahiro Yamada 
229628ba53c0SMasahiro Yamada 	if (verbose > 0)
229728ba53c0SMasahiro Yamada 		printf("Found %d entries\n", count);
229828ba53c0SMasahiro Yamada 	if (count == 0)
229928ba53c0SMasahiro Yamada 		file_fail(prop_name);
230028ba53c0SMasahiro Yamada }
230128ba53c0SMasahiro Yamada 
corrections_init(void)230228ba53c0SMasahiro Yamada static void corrections_init(void)
230328ba53c0SMasahiro Yamada {
230428ba53c0SMasahiro Yamada 	FILE *file;
230528ba53c0SMasahiro Yamada 	unsigned int unichar;
230628ba53c0SMasahiro Yamada 	unsigned int major;
230728ba53c0SMasahiro Yamada 	unsigned int minor;
230828ba53c0SMasahiro Yamada 	unsigned int revision;
230928ba53c0SMasahiro Yamada 	unsigned int age;
231028ba53c0SMasahiro Yamada 	unsigned int *um;
231128ba53c0SMasahiro Yamada 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
231228ba53c0SMasahiro Yamada 	char *s;
231328ba53c0SMasahiro Yamada 	int i;
231428ba53c0SMasahiro Yamada 	int count;
231528ba53c0SMasahiro Yamada 	int ret;
231628ba53c0SMasahiro Yamada 
231728ba53c0SMasahiro Yamada 	if (verbose > 0)
231828ba53c0SMasahiro Yamada 		printf("Parsing %s\n", norm_name);
231928ba53c0SMasahiro Yamada 	file = fopen(norm_name, "r");
232028ba53c0SMasahiro Yamada 	if (!file)
232128ba53c0SMasahiro Yamada 		open_fail(norm_name, errno);
232228ba53c0SMasahiro Yamada 
232328ba53c0SMasahiro Yamada 	count = 0;
232428ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
232528ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
232628ba53c0SMasahiro Yamada 				&unichar, buf0, buf1,
232728ba53c0SMasahiro Yamada 				&major, &minor, &revision);
232828ba53c0SMasahiro Yamada 		if (ret != 6)
232928ba53c0SMasahiro Yamada 			continue;
233028ba53c0SMasahiro Yamada 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
233128ba53c0SMasahiro Yamada 			line_fail(norm_name, line);
233228ba53c0SMasahiro Yamada 		count++;
233328ba53c0SMasahiro Yamada 	}
233428ba53c0SMasahiro Yamada 	corrections = calloc(count, sizeof(struct unicode_data));
233528ba53c0SMasahiro Yamada 	corrections_count = count;
233628ba53c0SMasahiro Yamada 	rewind(file);
233728ba53c0SMasahiro Yamada 
233828ba53c0SMasahiro Yamada 	count = 0;
233928ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
234028ba53c0SMasahiro Yamada 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
234128ba53c0SMasahiro Yamada 				&unichar, buf0, buf1,
234228ba53c0SMasahiro Yamada 				&major, &minor, &revision);
234328ba53c0SMasahiro Yamada 		if (ret != 6)
234428ba53c0SMasahiro Yamada 			continue;
234528ba53c0SMasahiro Yamada 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
234628ba53c0SMasahiro Yamada 			line_fail(norm_name, line);
234728ba53c0SMasahiro Yamada 		corrections[count] = unicode_data[unichar];
234828ba53c0SMasahiro Yamada 		assert(corrections[count].code == unichar);
234928ba53c0SMasahiro Yamada 		age = UNICODE_AGE(major, minor, revision);
235028ba53c0SMasahiro Yamada 		corrections[count].correction = age;
235128ba53c0SMasahiro Yamada 
235228ba53c0SMasahiro Yamada 		i = 0;
235328ba53c0SMasahiro Yamada 		s = buf0;
235428ba53c0SMasahiro Yamada 		while (*s) {
235528ba53c0SMasahiro Yamada 			mapping[i] = strtoul(s, &s, 16);
235628ba53c0SMasahiro Yamada 			if (!utf32valid(mapping[i]))
235728ba53c0SMasahiro Yamada 				line_fail(norm_name, line);
235828ba53c0SMasahiro Yamada 			i++;
235928ba53c0SMasahiro Yamada 		}
236028ba53c0SMasahiro Yamada 		mapping[i++] = 0;
236128ba53c0SMasahiro Yamada 
236228ba53c0SMasahiro Yamada 		um = malloc(i * sizeof(unsigned int));
236328ba53c0SMasahiro Yamada 		memcpy(um, mapping, i * sizeof(unsigned int));
236428ba53c0SMasahiro Yamada 		corrections[count].utf32nfdi = um;
236528ba53c0SMasahiro Yamada 
236628ba53c0SMasahiro Yamada 		if (verbose > 1)
236728ba53c0SMasahiro Yamada 			printf(" %X -> %s -> %s V%d_%d_%d\n",
236828ba53c0SMasahiro Yamada 				unichar, buf0, buf1, major, minor, revision);
236928ba53c0SMasahiro Yamada 		count++;
237028ba53c0SMasahiro Yamada 	}
237128ba53c0SMasahiro Yamada 	fclose(file);
237228ba53c0SMasahiro Yamada 
237328ba53c0SMasahiro Yamada 	if (verbose > 0)
237428ba53c0SMasahiro Yamada 	        printf("Found %d entries\n", count);
237528ba53c0SMasahiro Yamada 	if (count == 0)
237628ba53c0SMasahiro Yamada 		file_fail(norm_name);
237728ba53c0SMasahiro Yamada }
237828ba53c0SMasahiro Yamada 
237928ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
238028ba53c0SMasahiro Yamada 
238128ba53c0SMasahiro Yamada /*
238228ba53c0SMasahiro Yamada  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
238328ba53c0SMasahiro Yamada  *
238428ba53c0SMasahiro Yamada  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
238528ba53c0SMasahiro Yamada  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
238628ba53c0SMasahiro Yamada  *
238728ba53c0SMasahiro Yamada  * SBase = 0xAC00
238828ba53c0SMasahiro Yamada  * LBase = 0x1100
238928ba53c0SMasahiro Yamada  * VBase = 0x1161
239028ba53c0SMasahiro Yamada  * TBase = 0x11A7
239128ba53c0SMasahiro Yamada  * LCount = 19
239228ba53c0SMasahiro Yamada  * VCount = 21
239328ba53c0SMasahiro Yamada  * TCount = 28
239428ba53c0SMasahiro Yamada  * NCount = 588 (VCount * TCount)
239528ba53c0SMasahiro Yamada  * SCount = 11172 (LCount * NCount)
239628ba53c0SMasahiro Yamada  *
239728ba53c0SMasahiro Yamada  * Decomposition:
239828ba53c0SMasahiro Yamada  *   SIndex = s - SBase
239928ba53c0SMasahiro Yamada  *
240028ba53c0SMasahiro Yamada  * LV (Canonical/Full)
240128ba53c0SMasahiro Yamada  *   LIndex = SIndex / NCount
240228ba53c0SMasahiro Yamada  *   VIndex = (Sindex % NCount) / TCount
240328ba53c0SMasahiro Yamada  *   LPart = LBase + LIndex
240428ba53c0SMasahiro Yamada  *   VPart = VBase + VIndex
240528ba53c0SMasahiro Yamada  *
240628ba53c0SMasahiro Yamada  * LVT (Canonical)
240728ba53c0SMasahiro Yamada  *   LVIndex = (SIndex / TCount) * TCount
240828ba53c0SMasahiro Yamada  *   TIndex = (Sindex % TCount)
240928ba53c0SMasahiro Yamada  *   LVPart = SBase + LVIndex
241028ba53c0SMasahiro Yamada  *   TPart = TBase + TIndex
241128ba53c0SMasahiro Yamada  *
241228ba53c0SMasahiro Yamada  * LVT (Full)
241328ba53c0SMasahiro Yamada  *   LIndex = SIndex / NCount
241428ba53c0SMasahiro Yamada  *   VIndex = (Sindex % NCount) / TCount
241528ba53c0SMasahiro Yamada  *   TIndex = (Sindex % TCount)
241628ba53c0SMasahiro Yamada  *   LPart = LBase + LIndex
241728ba53c0SMasahiro Yamada  *   VPart = VBase + VIndex
241828ba53c0SMasahiro Yamada  *   if (TIndex == 0) {
241928ba53c0SMasahiro Yamada  *          d = <LPart, VPart>
242028ba53c0SMasahiro Yamada  *   } else {
242128ba53c0SMasahiro Yamada  *          TPart = TBase + TIndex
242228ba53c0SMasahiro Yamada  *          d = <LPart, VPart, TPart>
242328ba53c0SMasahiro Yamada  *   }
242428ba53c0SMasahiro Yamada  *
242528ba53c0SMasahiro Yamada  */
242628ba53c0SMasahiro Yamada 
hangul_decompose(void)242728ba53c0SMasahiro Yamada static void hangul_decompose(void)
242828ba53c0SMasahiro Yamada {
242928ba53c0SMasahiro Yamada 	unsigned int sb = 0xAC00;
243028ba53c0SMasahiro Yamada 	unsigned int lb = 0x1100;
243128ba53c0SMasahiro Yamada 	unsigned int vb = 0x1161;
243228ba53c0SMasahiro Yamada 	unsigned int tb = 0x11a7;
243328ba53c0SMasahiro Yamada 	/* unsigned int lc = 19; */
243428ba53c0SMasahiro Yamada 	unsigned int vc = 21;
243528ba53c0SMasahiro Yamada 	unsigned int tc = 28;
243628ba53c0SMasahiro Yamada 	unsigned int nc = (vc * tc);
243728ba53c0SMasahiro Yamada 	/* unsigned int sc = (lc * nc); */
243828ba53c0SMasahiro Yamada 	unsigned int unichar;
243928ba53c0SMasahiro Yamada 	unsigned int mapping[4];
244028ba53c0SMasahiro Yamada 	unsigned int *um;
244128ba53c0SMasahiro Yamada         int count;
244228ba53c0SMasahiro Yamada 	int i;
244328ba53c0SMasahiro Yamada 
244428ba53c0SMasahiro Yamada 	if (verbose > 0)
244528ba53c0SMasahiro Yamada 		printf("Decomposing hangul\n");
244628ba53c0SMasahiro Yamada 	/* Hangul */
244728ba53c0SMasahiro Yamada 	count = 0;
244828ba53c0SMasahiro Yamada 	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
244928ba53c0SMasahiro Yamada 		unsigned int si = unichar - sb;
245028ba53c0SMasahiro Yamada 		unsigned int li = si / nc;
245128ba53c0SMasahiro Yamada 		unsigned int vi = (si % nc) / tc;
245228ba53c0SMasahiro Yamada 		unsigned int ti = si % tc;
245328ba53c0SMasahiro Yamada 
245428ba53c0SMasahiro Yamada 		i = 0;
245528ba53c0SMasahiro Yamada 		mapping[i++] = lb + li;
245628ba53c0SMasahiro Yamada 		mapping[i++] = vb + vi;
245728ba53c0SMasahiro Yamada 		if (ti)
245828ba53c0SMasahiro Yamada 			mapping[i++] = tb + ti;
245928ba53c0SMasahiro Yamada 		mapping[i++] = 0;
246028ba53c0SMasahiro Yamada 
246128ba53c0SMasahiro Yamada 		assert(!unicode_data[unichar].utf32nfdi);
246228ba53c0SMasahiro Yamada 		um = malloc(i * sizeof(unsigned int));
246328ba53c0SMasahiro Yamada 		memcpy(um, mapping, i * sizeof(unsigned int));
246428ba53c0SMasahiro Yamada 		unicode_data[unichar].utf32nfdi = um;
246528ba53c0SMasahiro Yamada 
246628ba53c0SMasahiro Yamada 		assert(!unicode_data[unichar].utf32nfdicf);
246728ba53c0SMasahiro Yamada 		um = malloc(i * sizeof(unsigned int));
246828ba53c0SMasahiro Yamada 		memcpy(um, mapping, i * sizeof(unsigned int));
246928ba53c0SMasahiro Yamada 		unicode_data[unichar].utf32nfdicf = um;
247028ba53c0SMasahiro Yamada 
247128ba53c0SMasahiro Yamada 		/*
247228ba53c0SMasahiro Yamada 		 * Add a cookie as a reminder that the hangul syllable
247328ba53c0SMasahiro Yamada 		 * decompositions must not be stored in the generated
247428ba53c0SMasahiro Yamada 		 * trie.
247528ba53c0SMasahiro Yamada 		 */
247628ba53c0SMasahiro Yamada 		unicode_data[unichar].utf8nfdi = malloc(2);
247728ba53c0SMasahiro Yamada 		unicode_data[unichar].utf8nfdi[0] = HANGUL;
247828ba53c0SMasahiro Yamada 		unicode_data[unichar].utf8nfdi[1] = '\0';
247928ba53c0SMasahiro Yamada 
248028ba53c0SMasahiro Yamada 		if (verbose > 1)
248128ba53c0SMasahiro Yamada 			print_utf32nfdi(unichar);
248228ba53c0SMasahiro Yamada 
248328ba53c0SMasahiro Yamada 		count++;
248428ba53c0SMasahiro Yamada 	}
248528ba53c0SMasahiro Yamada 	if (verbose > 0)
248628ba53c0SMasahiro Yamada 		printf("Created %d entries\n", count);
248728ba53c0SMasahiro Yamada }
248828ba53c0SMasahiro Yamada 
nfdi_decompose(void)248928ba53c0SMasahiro Yamada static void nfdi_decompose(void)
249028ba53c0SMasahiro Yamada {
249128ba53c0SMasahiro Yamada 	unsigned int unichar;
249228ba53c0SMasahiro Yamada 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
249328ba53c0SMasahiro Yamada 	unsigned int *um;
249428ba53c0SMasahiro Yamada 	unsigned int *dc;
249528ba53c0SMasahiro Yamada 	int count;
249628ba53c0SMasahiro Yamada 	int i;
249728ba53c0SMasahiro Yamada 	int j;
249828ba53c0SMasahiro Yamada 	int ret;
249928ba53c0SMasahiro Yamada 
250028ba53c0SMasahiro Yamada 	if (verbose > 0)
250128ba53c0SMasahiro Yamada 		printf("Decomposing nfdi\n");
250228ba53c0SMasahiro Yamada 
250328ba53c0SMasahiro Yamada 	count = 0;
250428ba53c0SMasahiro Yamada 	for (unichar = 0; unichar != 0x110000; unichar++) {
250528ba53c0SMasahiro Yamada 		if (!unicode_data[unichar].utf32nfdi)
250628ba53c0SMasahiro Yamada 			continue;
250728ba53c0SMasahiro Yamada 		for (;;) {
250828ba53c0SMasahiro Yamada 			ret = 1;
250928ba53c0SMasahiro Yamada 			i = 0;
251028ba53c0SMasahiro Yamada 			um = unicode_data[unichar].utf32nfdi;
251128ba53c0SMasahiro Yamada 			while (*um) {
251228ba53c0SMasahiro Yamada 				dc = unicode_data[*um].utf32nfdi;
251328ba53c0SMasahiro Yamada 				if (dc) {
251428ba53c0SMasahiro Yamada 					for (j = 0; dc[j]; j++)
251528ba53c0SMasahiro Yamada 						mapping[i++] = dc[j];
251628ba53c0SMasahiro Yamada 					ret = 0;
251728ba53c0SMasahiro Yamada 				} else {
251828ba53c0SMasahiro Yamada 					mapping[i++] = *um;
251928ba53c0SMasahiro Yamada 				}
252028ba53c0SMasahiro Yamada 				um++;
252128ba53c0SMasahiro Yamada 			}
252228ba53c0SMasahiro Yamada 			mapping[i++] = 0;
252328ba53c0SMasahiro Yamada 			if (ret)
252428ba53c0SMasahiro Yamada 				break;
252528ba53c0SMasahiro Yamada 			free(unicode_data[unichar].utf32nfdi);
252628ba53c0SMasahiro Yamada 			um = malloc(i * sizeof(unsigned int));
252728ba53c0SMasahiro Yamada 			memcpy(um, mapping, i * sizeof(unsigned int));
252828ba53c0SMasahiro Yamada 			unicode_data[unichar].utf32nfdi = um;
252928ba53c0SMasahiro Yamada 		}
253028ba53c0SMasahiro Yamada 		/* Add this decomposition to nfdicf if there is no entry. */
253128ba53c0SMasahiro Yamada 		if (!unicode_data[unichar].utf32nfdicf) {
253228ba53c0SMasahiro Yamada 			um = malloc(i * sizeof(unsigned int));
253328ba53c0SMasahiro Yamada 			memcpy(um, mapping, i * sizeof(unsigned int));
253428ba53c0SMasahiro Yamada 			unicode_data[unichar].utf32nfdicf = um;
253528ba53c0SMasahiro Yamada 		}
253628ba53c0SMasahiro Yamada 		if (verbose > 1)
253728ba53c0SMasahiro Yamada 			print_utf32nfdi(unichar);
253828ba53c0SMasahiro Yamada 		count++;
253928ba53c0SMasahiro Yamada 	}
254028ba53c0SMasahiro Yamada 	if (verbose > 0)
254128ba53c0SMasahiro Yamada 		printf("Processed %d entries\n", count);
254228ba53c0SMasahiro Yamada }
254328ba53c0SMasahiro Yamada 
nfdicf_decompose(void)254428ba53c0SMasahiro Yamada static void nfdicf_decompose(void)
254528ba53c0SMasahiro Yamada {
254628ba53c0SMasahiro Yamada 	unsigned int unichar;
254728ba53c0SMasahiro Yamada 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
254828ba53c0SMasahiro Yamada 	unsigned int *um;
254928ba53c0SMasahiro Yamada 	unsigned int *dc;
255028ba53c0SMasahiro Yamada 	int count;
255128ba53c0SMasahiro Yamada 	int i;
255228ba53c0SMasahiro Yamada 	int j;
255328ba53c0SMasahiro Yamada 	int ret;
255428ba53c0SMasahiro Yamada 
255528ba53c0SMasahiro Yamada 	if (verbose > 0)
255628ba53c0SMasahiro Yamada 		printf("Decomposing nfdicf\n");
255728ba53c0SMasahiro Yamada 	count = 0;
255828ba53c0SMasahiro Yamada 	for (unichar = 0; unichar != 0x110000; unichar++) {
255928ba53c0SMasahiro Yamada 		if (!unicode_data[unichar].utf32nfdicf)
256028ba53c0SMasahiro Yamada 			continue;
256128ba53c0SMasahiro Yamada 		for (;;) {
256228ba53c0SMasahiro Yamada 			ret = 1;
256328ba53c0SMasahiro Yamada 			i = 0;
256428ba53c0SMasahiro Yamada 			um = unicode_data[unichar].utf32nfdicf;
256528ba53c0SMasahiro Yamada 			while (*um) {
256628ba53c0SMasahiro Yamada 				dc = unicode_data[*um].utf32nfdicf;
256728ba53c0SMasahiro Yamada 				if (dc) {
256828ba53c0SMasahiro Yamada 					for (j = 0; dc[j]; j++)
256928ba53c0SMasahiro Yamada 						mapping[i++] = dc[j];
257028ba53c0SMasahiro Yamada 					ret = 0;
257128ba53c0SMasahiro Yamada 				} else {
257228ba53c0SMasahiro Yamada 					mapping[i++] = *um;
257328ba53c0SMasahiro Yamada 				}
257428ba53c0SMasahiro Yamada 				um++;
257528ba53c0SMasahiro Yamada 			}
257628ba53c0SMasahiro Yamada 			mapping[i++] = 0;
257728ba53c0SMasahiro Yamada 			if (ret)
257828ba53c0SMasahiro Yamada 				break;
257928ba53c0SMasahiro Yamada 			free(unicode_data[unichar].utf32nfdicf);
258028ba53c0SMasahiro Yamada 			um = malloc(i * sizeof(unsigned int));
258128ba53c0SMasahiro Yamada 			memcpy(um, mapping, i * sizeof(unsigned int));
258228ba53c0SMasahiro Yamada 			unicode_data[unichar].utf32nfdicf = um;
258328ba53c0SMasahiro Yamada 		}
258428ba53c0SMasahiro Yamada 		if (verbose > 1)
258528ba53c0SMasahiro Yamada 			print_utf32nfdicf(unichar);
258628ba53c0SMasahiro Yamada 		count++;
258728ba53c0SMasahiro Yamada 	}
258828ba53c0SMasahiro Yamada 	if (verbose > 0)
258928ba53c0SMasahiro Yamada 		printf("Processed %d entries\n", count);
259028ba53c0SMasahiro Yamada }
259128ba53c0SMasahiro Yamada 
259228ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
259328ba53c0SMasahiro Yamada 
259428ba53c0SMasahiro Yamada int utf8agemax(struct tree *, const char *);
259528ba53c0SMasahiro Yamada int utf8nagemax(struct tree *, const char *, size_t);
259628ba53c0SMasahiro Yamada int utf8agemin(struct tree *, const char *);
259728ba53c0SMasahiro Yamada int utf8nagemin(struct tree *, const char *, size_t);
259828ba53c0SMasahiro Yamada ssize_t utf8len(struct tree *, const char *);
259928ba53c0SMasahiro Yamada ssize_t utf8nlen(struct tree *, const char *, size_t);
260028ba53c0SMasahiro Yamada struct utf8cursor;
260128ba53c0SMasahiro Yamada int utf8cursor(struct utf8cursor *, struct tree *, const char *);
260228ba53c0SMasahiro Yamada int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
260328ba53c0SMasahiro Yamada int utf8byte(struct utf8cursor *);
260428ba53c0SMasahiro Yamada 
260528ba53c0SMasahiro Yamada /*
260628ba53c0SMasahiro Yamada  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
260728ba53c0SMasahiro Yamada  *
260828ba53c0SMasahiro Yamada  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
260928ba53c0SMasahiro Yamada  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
261028ba53c0SMasahiro Yamada  *
261128ba53c0SMasahiro Yamada  * SBase = 0xAC00
261228ba53c0SMasahiro Yamada  * LBase = 0x1100
261328ba53c0SMasahiro Yamada  * VBase = 0x1161
261428ba53c0SMasahiro Yamada  * TBase = 0x11A7
261528ba53c0SMasahiro Yamada  * LCount = 19
261628ba53c0SMasahiro Yamada  * VCount = 21
261728ba53c0SMasahiro Yamada  * TCount = 28
261828ba53c0SMasahiro Yamada  * NCount = 588 (VCount * TCount)
261928ba53c0SMasahiro Yamada  * SCount = 11172 (LCount * NCount)
262028ba53c0SMasahiro Yamada  *
262128ba53c0SMasahiro Yamada  * Decomposition:
262228ba53c0SMasahiro Yamada  *   SIndex = s - SBase
262328ba53c0SMasahiro Yamada  *
262428ba53c0SMasahiro Yamada  * LV (Canonical/Full)
262528ba53c0SMasahiro Yamada  *   LIndex = SIndex / NCount
262628ba53c0SMasahiro Yamada  *   VIndex = (Sindex % NCount) / TCount
262728ba53c0SMasahiro Yamada  *   LPart = LBase + LIndex
262828ba53c0SMasahiro Yamada  *   VPart = VBase + VIndex
262928ba53c0SMasahiro Yamada  *
263028ba53c0SMasahiro Yamada  * LVT (Canonical)
263128ba53c0SMasahiro Yamada  *   LVIndex = (SIndex / TCount) * TCount
263228ba53c0SMasahiro Yamada  *   TIndex = (Sindex % TCount)
263328ba53c0SMasahiro Yamada  *   LVPart = SBase + LVIndex
263428ba53c0SMasahiro Yamada  *   TPart = TBase + TIndex
263528ba53c0SMasahiro Yamada  *
263628ba53c0SMasahiro Yamada  * LVT (Full)
263728ba53c0SMasahiro Yamada  *   LIndex = SIndex / NCount
263828ba53c0SMasahiro Yamada  *   VIndex = (Sindex % NCount) / TCount
263928ba53c0SMasahiro Yamada  *   TIndex = (Sindex % TCount)
264028ba53c0SMasahiro Yamada  *   LPart = LBase + LIndex
264128ba53c0SMasahiro Yamada  *   VPart = VBase + VIndex
264228ba53c0SMasahiro Yamada  *   if (TIndex == 0) {
264328ba53c0SMasahiro Yamada  *          d = <LPart, VPart>
264428ba53c0SMasahiro Yamada  *   } else {
264528ba53c0SMasahiro Yamada  *          TPart = TBase + TIndex
264628ba53c0SMasahiro Yamada  *          d = <LPart, VPart, TPart>
264728ba53c0SMasahiro Yamada  *   }
264828ba53c0SMasahiro Yamada  */
264928ba53c0SMasahiro Yamada 
265028ba53c0SMasahiro Yamada /* Constants */
265128ba53c0SMasahiro Yamada #define SB	(0xAC00)
265228ba53c0SMasahiro Yamada #define LB	(0x1100)
265328ba53c0SMasahiro Yamada #define VB	(0x1161)
265428ba53c0SMasahiro Yamada #define TB	(0x11A7)
265528ba53c0SMasahiro Yamada #define LC	(19)
265628ba53c0SMasahiro Yamada #define VC	(21)
265728ba53c0SMasahiro Yamada #define TC	(28)
265828ba53c0SMasahiro Yamada #define NC	(VC * TC)
265928ba53c0SMasahiro Yamada #define SC	(LC * NC)
266028ba53c0SMasahiro Yamada 
266128ba53c0SMasahiro Yamada /* Algorithmic decomposition of hangul syllable. */
utf8hangul(const char * str,unsigned char * hangul)266228ba53c0SMasahiro Yamada static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
266328ba53c0SMasahiro Yamada {
266428ba53c0SMasahiro Yamada 	unsigned int	si;
266528ba53c0SMasahiro Yamada 	unsigned int	li;
266628ba53c0SMasahiro Yamada 	unsigned int	vi;
266728ba53c0SMasahiro Yamada 	unsigned int	ti;
266828ba53c0SMasahiro Yamada 	unsigned char	*h;
266928ba53c0SMasahiro Yamada 
267028ba53c0SMasahiro Yamada 	/* Calculate the SI, LI, VI, and TI values. */
267128ba53c0SMasahiro Yamada 	si = utf8decode(str) - SB;
267228ba53c0SMasahiro Yamada 	li = si / NC;
267328ba53c0SMasahiro Yamada 	vi = (si % NC) / TC;
267428ba53c0SMasahiro Yamada 	ti = si % TC;
267528ba53c0SMasahiro Yamada 
267628ba53c0SMasahiro Yamada 	/* Fill in base of leaf. */
267728ba53c0SMasahiro Yamada 	h = hangul;
267828ba53c0SMasahiro Yamada 	LEAF_GEN(h) = 2;
267928ba53c0SMasahiro Yamada 	LEAF_CCC(h) = DECOMPOSE;
268028ba53c0SMasahiro Yamada 	h += 2;
268128ba53c0SMasahiro Yamada 
268228ba53c0SMasahiro Yamada 	/* Add LPart, a 3-byte UTF-8 sequence. */
268328ba53c0SMasahiro Yamada 	h += utf8encode((char *)h, li + LB);
268428ba53c0SMasahiro Yamada 
268528ba53c0SMasahiro Yamada 	/* Add VPart, a 3-byte UTF-8 sequence. */
268628ba53c0SMasahiro Yamada 	h += utf8encode((char *)h, vi + VB);
268728ba53c0SMasahiro Yamada 
268828ba53c0SMasahiro Yamada 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
268928ba53c0SMasahiro Yamada 	if (ti)
269028ba53c0SMasahiro Yamada 		h += utf8encode((char *)h, ti + TB);
269128ba53c0SMasahiro Yamada 
269228ba53c0SMasahiro Yamada 	/* Terminate string. */
269328ba53c0SMasahiro Yamada 	h[0] = '\0';
269428ba53c0SMasahiro Yamada 
269528ba53c0SMasahiro Yamada 	return hangul;
269628ba53c0SMasahiro Yamada }
269728ba53c0SMasahiro Yamada 
269828ba53c0SMasahiro Yamada /*
269928ba53c0SMasahiro Yamada  * Use trie to scan s, touching at most len bytes.
270028ba53c0SMasahiro Yamada  * Returns the leaf if one exists, NULL otherwise.
270128ba53c0SMasahiro Yamada  *
270228ba53c0SMasahiro Yamada  * A non-NULL return guarantees that the UTF-8 sequence starting at s
270328ba53c0SMasahiro Yamada  * is well-formed and corresponds to a known unicode code point.  The
270428ba53c0SMasahiro Yamada  * shorthand for this will be "is valid UTF-8 unicode".
270528ba53c0SMasahiro Yamada  */
utf8nlookup(struct tree * tree,unsigned char * hangul,const char * s,size_t len)270628ba53c0SMasahiro Yamada static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
270728ba53c0SMasahiro Yamada 			       const char *s, size_t len)
270828ba53c0SMasahiro Yamada {
270928ba53c0SMasahiro Yamada 	utf8trie_t	*trie;
271028ba53c0SMasahiro Yamada 	int		offlen;
271128ba53c0SMasahiro Yamada 	int		offset;
271228ba53c0SMasahiro Yamada 	int		mask;
271328ba53c0SMasahiro Yamada 	int		node;
271428ba53c0SMasahiro Yamada 
271528ba53c0SMasahiro Yamada 	if (!tree)
271628ba53c0SMasahiro Yamada 		return NULL;
271728ba53c0SMasahiro Yamada 	if (len == 0)
271828ba53c0SMasahiro Yamada 		return NULL;
271928ba53c0SMasahiro Yamada 	node = 1;
272028ba53c0SMasahiro Yamada 	trie = utf8data + tree->index;
272128ba53c0SMasahiro Yamada 	while (node) {
272228ba53c0SMasahiro Yamada 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
272328ba53c0SMasahiro Yamada 		if (*trie & NEXTBYTE) {
272428ba53c0SMasahiro Yamada 			if (--len == 0)
272528ba53c0SMasahiro Yamada 				return NULL;
272628ba53c0SMasahiro Yamada 			s++;
272728ba53c0SMasahiro Yamada 		}
272828ba53c0SMasahiro Yamada 		mask = 1 << (*trie & BITNUM);
272928ba53c0SMasahiro Yamada 		if (*s & mask) {
273028ba53c0SMasahiro Yamada 			/* Right leg */
273128ba53c0SMasahiro Yamada 			if (offlen) {
273228ba53c0SMasahiro Yamada 				/* Right node at offset of trie */
273328ba53c0SMasahiro Yamada 				node = (*trie & RIGHTNODE);
273428ba53c0SMasahiro Yamada 				offset = trie[offlen];
273528ba53c0SMasahiro Yamada 				while (--offlen) {
273628ba53c0SMasahiro Yamada 					offset <<= 8;
273728ba53c0SMasahiro Yamada 					offset |= trie[offlen];
273828ba53c0SMasahiro Yamada 				}
273928ba53c0SMasahiro Yamada 				trie += offset;
274028ba53c0SMasahiro Yamada 			} else if (*trie & RIGHTPATH) {
274128ba53c0SMasahiro Yamada 				/* Right node after this node */
274228ba53c0SMasahiro Yamada 				node = (*trie & TRIENODE);
274328ba53c0SMasahiro Yamada 				trie++;
274428ba53c0SMasahiro Yamada 			} else {
274528ba53c0SMasahiro Yamada 				/* No right node. */
274628ba53c0SMasahiro Yamada 				return NULL;
274728ba53c0SMasahiro Yamada 			}
274828ba53c0SMasahiro Yamada 		} else {
274928ba53c0SMasahiro Yamada 			/* Left leg */
275028ba53c0SMasahiro Yamada 			if (offlen) {
275128ba53c0SMasahiro Yamada 				/* Left node after this node. */
275228ba53c0SMasahiro Yamada 				node = (*trie & LEFTNODE);
275328ba53c0SMasahiro Yamada 				trie += offlen + 1;
275428ba53c0SMasahiro Yamada 			} else if (*trie & RIGHTPATH) {
275528ba53c0SMasahiro Yamada 				/* No left node. */
275628ba53c0SMasahiro Yamada 				return NULL;
275728ba53c0SMasahiro Yamada 			} else {
275828ba53c0SMasahiro Yamada 				/* Left node after this node */
275928ba53c0SMasahiro Yamada 				node = (*trie & TRIENODE);
276028ba53c0SMasahiro Yamada 				trie++;
276128ba53c0SMasahiro Yamada 			}
276228ba53c0SMasahiro Yamada 		}
276328ba53c0SMasahiro Yamada 	}
276428ba53c0SMasahiro Yamada 	/*
276528ba53c0SMasahiro Yamada 	 * Hangul decomposition is done algorithmically. These are the
276628ba53c0SMasahiro Yamada 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
276728ba53c0SMasahiro Yamada 	 * always 3 bytes long, so s has been advanced twice, and the
276828ba53c0SMasahiro Yamada 	 * start of the sequence is at s-2.
276928ba53c0SMasahiro Yamada 	 */
277028ba53c0SMasahiro Yamada 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
277128ba53c0SMasahiro Yamada 		trie = utf8hangul(s - 2, hangul);
277228ba53c0SMasahiro Yamada 	return trie;
277328ba53c0SMasahiro Yamada }
277428ba53c0SMasahiro Yamada 
277528ba53c0SMasahiro Yamada /*
277628ba53c0SMasahiro Yamada  * Use trie to scan s.
277728ba53c0SMasahiro Yamada  * Returns the leaf if one exists, NULL otherwise.
277828ba53c0SMasahiro Yamada  *
277928ba53c0SMasahiro Yamada  * Forwards to trie_nlookup().
278028ba53c0SMasahiro Yamada  */
utf8lookup(struct tree * tree,unsigned char * hangul,const char * s)278128ba53c0SMasahiro Yamada static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
278228ba53c0SMasahiro Yamada 			      const char *s)
278328ba53c0SMasahiro Yamada {
278428ba53c0SMasahiro Yamada 	return utf8nlookup(tree, hangul, s, (size_t)-1);
278528ba53c0SMasahiro Yamada }
278628ba53c0SMasahiro Yamada 
278728ba53c0SMasahiro Yamada /*
278828ba53c0SMasahiro Yamada  * Return the number of bytes used by the current UTF-8 sequence.
278928ba53c0SMasahiro Yamada  * Assumes the input points to the first byte of a valid UTF-8
279028ba53c0SMasahiro Yamada  * sequence.
279128ba53c0SMasahiro Yamada  */
utf8clen(const char * s)279228ba53c0SMasahiro Yamada static inline int utf8clen(const char *s)
279328ba53c0SMasahiro Yamada {
279428ba53c0SMasahiro Yamada 	unsigned char c = *s;
279528ba53c0SMasahiro Yamada 	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
279628ba53c0SMasahiro Yamada }
279728ba53c0SMasahiro Yamada 
279828ba53c0SMasahiro Yamada /*
279928ba53c0SMasahiro Yamada  * Maximum age of any character in s.
280028ba53c0SMasahiro Yamada  * Return -1 if s is not valid UTF-8 unicode.
280128ba53c0SMasahiro Yamada  * Return 0 if only non-assigned code points are used.
280228ba53c0SMasahiro Yamada  */
utf8agemax(struct tree * tree,const char * s)280328ba53c0SMasahiro Yamada int utf8agemax(struct tree *tree, const char *s)
280428ba53c0SMasahiro Yamada {
280528ba53c0SMasahiro Yamada 	utf8leaf_t	*leaf;
280628ba53c0SMasahiro Yamada 	int		age = 0;
280728ba53c0SMasahiro Yamada 	int		leaf_age;
280828ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
280928ba53c0SMasahiro Yamada 
281028ba53c0SMasahiro Yamada 	if (!tree)
281128ba53c0SMasahiro Yamada 		return -1;
281228ba53c0SMasahiro Yamada 
281328ba53c0SMasahiro Yamada 	while (*s) {
281428ba53c0SMasahiro Yamada 		leaf = utf8lookup(tree, hangul, s);
281528ba53c0SMasahiro Yamada 		if (!leaf)
281628ba53c0SMasahiro Yamada 			return -1;
281728ba53c0SMasahiro Yamada 		leaf_age = ages[LEAF_GEN(leaf)];
281828ba53c0SMasahiro Yamada 		if (leaf_age <= tree->maxage && leaf_age > age)
281928ba53c0SMasahiro Yamada 			age = leaf_age;
282028ba53c0SMasahiro Yamada 		s += utf8clen(s);
282128ba53c0SMasahiro Yamada 	}
282228ba53c0SMasahiro Yamada 	return age;
282328ba53c0SMasahiro Yamada }
282428ba53c0SMasahiro Yamada 
282528ba53c0SMasahiro Yamada /*
282628ba53c0SMasahiro Yamada  * Minimum age of any character in s.
282728ba53c0SMasahiro Yamada  * Return -1 if s is not valid UTF-8 unicode.
282828ba53c0SMasahiro Yamada  * Return 0 if non-assigned code points are used.
282928ba53c0SMasahiro Yamada  */
utf8agemin(struct tree * tree,const char * s)283028ba53c0SMasahiro Yamada int utf8agemin(struct tree *tree, const char *s)
283128ba53c0SMasahiro Yamada {
283228ba53c0SMasahiro Yamada 	utf8leaf_t	*leaf;
283328ba53c0SMasahiro Yamada 	int		age;
283428ba53c0SMasahiro Yamada 	int		leaf_age;
283528ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
283628ba53c0SMasahiro Yamada 
283728ba53c0SMasahiro Yamada 	if (!tree)
283828ba53c0SMasahiro Yamada 		return -1;
283928ba53c0SMasahiro Yamada 	age = tree->maxage;
284028ba53c0SMasahiro Yamada 	while (*s) {
284128ba53c0SMasahiro Yamada 		leaf = utf8lookup(tree, hangul, s);
284228ba53c0SMasahiro Yamada 		if (!leaf)
284328ba53c0SMasahiro Yamada 			return -1;
284428ba53c0SMasahiro Yamada 		leaf_age = ages[LEAF_GEN(leaf)];
284528ba53c0SMasahiro Yamada 		if (leaf_age <= tree->maxage && leaf_age < age)
284628ba53c0SMasahiro Yamada 			age = leaf_age;
284728ba53c0SMasahiro Yamada 		s += utf8clen(s);
284828ba53c0SMasahiro Yamada 	}
284928ba53c0SMasahiro Yamada 	return age;
285028ba53c0SMasahiro Yamada }
285128ba53c0SMasahiro Yamada 
285228ba53c0SMasahiro Yamada /*
285328ba53c0SMasahiro Yamada  * Maximum age of any character in s, touch at most len bytes.
285428ba53c0SMasahiro Yamada  * Return -1 if s is not valid UTF-8 unicode.
285528ba53c0SMasahiro Yamada  */
utf8nagemax(struct tree * tree,const char * s,size_t len)285628ba53c0SMasahiro Yamada int utf8nagemax(struct tree *tree, const char *s, size_t len)
285728ba53c0SMasahiro Yamada {
285828ba53c0SMasahiro Yamada 	utf8leaf_t	*leaf;
285928ba53c0SMasahiro Yamada 	int		age = 0;
286028ba53c0SMasahiro Yamada 	int		leaf_age;
286128ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
286228ba53c0SMasahiro Yamada 
286328ba53c0SMasahiro Yamada 	if (!tree)
286428ba53c0SMasahiro Yamada 		return -1;
286528ba53c0SMasahiro Yamada 
286628ba53c0SMasahiro Yamada         while (len && *s) {
286728ba53c0SMasahiro Yamada 		leaf = utf8nlookup(tree, hangul, s, len);
286828ba53c0SMasahiro Yamada 		if (!leaf)
286928ba53c0SMasahiro Yamada 			return -1;
287028ba53c0SMasahiro Yamada 		leaf_age = ages[LEAF_GEN(leaf)];
287128ba53c0SMasahiro Yamada 		if (leaf_age <= tree->maxage && leaf_age > age)
287228ba53c0SMasahiro Yamada 			age = leaf_age;
287328ba53c0SMasahiro Yamada 		len -= utf8clen(s);
287428ba53c0SMasahiro Yamada 		s += utf8clen(s);
287528ba53c0SMasahiro Yamada 	}
287628ba53c0SMasahiro Yamada 	return age;
287728ba53c0SMasahiro Yamada }
287828ba53c0SMasahiro Yamada 
287928ba53c0SMasahiro Yamada /*
288028ba53c0SMasahiro Yamada  * Maximum age of any character in s, touch at most len bytes.
288128ba53c0SMasahiro Yamada  * Return -1 if s is not valid UTF-8 unicode.
288228ba53c0SMasahiro Yamada  */
utf8nagemin(struct tree * tree,const char * s,size_t len)288328ba53c0SMasahiro Yamada int utf8nagemin(struct tree *tree, const char *s, size_t len)
288428ba53c0SMasahiro Yamada {
288528ba53c0SMasahiro Yamada 	utf8leaf_t	*leaf;
288628ba53c0SMasahiro Yamada 	int		leaf_age;
288728ba53c0SMasahiro Yamada 	int		age;
288828ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
288928ba53c0SMasahiro Yamada 
289028ba53c0SMasahiro Yamada 	if (!tree)
289128ba53c0SMasahiro Yamada 		return -1;
289228ba53c0SMasahiro Yamada 	age = tree->maxage;
289328ba53c0SMasahiro Yamada         while (len && *s) {
289428ba53c0SMasahiro Yamada 		leaf = utf8nlookup(tree, hangul, s, len);
289528ba53c0SMasahiro Yamada 		if (!leaf)
289628ba53c0SMasahiro Yamada 			return -1;
289728ba53c0SMasahiro Yamada 		leaf_age = ages[LEAF_GEN(leaf)];
289828ba53c0SMasahiro Yamada 		if (leaf_age <= tree->maxage && leaf_age < age)
289928ba53c0SMasahiro Yamada 			age = leaf_age;
290028ba53c0SMasahiro Yamada 		len -= utf8clen(s);
290128ba53c0SMasahiro Yamada 		s += utf8clen(s);
290228ba53c0SMasahiro Yamada 	}
290328ba53c0SMasahiro Yamada 	return age;
290428ba53c0SMasahiro Yamada }
290528ba53c0SMasahiro Yamada 
290628ba53c0SMasahiro Yamada /*
290728ba53c0SMasahiro Yamada  * Length of the normalization of s.
290828ba53c0SMasahiro Yamada  * Return -1 if s is not valid UTF-8 unicode.
290928ba53c0SMasahiro Yamada  *
291028ba53c0SMasahiro Yamada  * A string of Default_Ignorable_Code_Point has length 0.
291128ba53c0SMasahiro Yamada  */
utf8len(struct tree * tree,const char * s)291228ba53c0SMasahiro Yamada ssize_t utf8len(struct tree *tree, const char *s)
291328ba53c0SMasahiro Yamada {
291428ba53c0SMasahiro Yamada 	utf8leaf_t	*leaf;
291528ba53c0SMasahiro Yamada 	size_t		ret = 0;
291628ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
291728ba53c0SMasahiro Yamada 
291828ba53c0SMasahiro Yamada 	if (!tree)
291928ba53c0SMasahiro Yamada 		return -1;
292028ba53c0SMasahiro Yamada 	while (*s) {
292128ba53c0SMasahiro Yamada 		leaf = utf8lookup(tree, hangul, s);
292228ba53c0SMasahiro Yamada 		if (!leaf)
292328ba53c0SMasahiro Yamada 			return -1;
292428ba53c0SMasahiro Yamada 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
292528ba53c0SMasahiro Yamada 			ret += utf8clen(s);
292628ba53c0SMasahiro Yamada 		else if (LEAF_CCC(leaf) == DECOMPOSE)
292728ba53c0SMasahiro Yamada 			ret += strlen(LEAF_STR(leaf));
292828ba53c0SMasahiro Yamada 		else
292928ba53c0SMasahiro Yamada 			ret += utf8clen(s);
293028ba53c0SMasahiro Yamada 		s += utf8clen(s);
293128ba53c0SMasahiro Yamada 	}
293228ba53c0SMasahiro Yamada 	return ret;
293328ba53c0SMasahiro Yamada }
293428ba53c0SMasahiro Yamada 
293528ba53c0SMasahiro Yamada /*
293628ba53c0SMasahiro Yamada  * Length of the normalization of s, touch at most len bytes.
293728ba53c0SMasahiro Yamada  * Return -1 if s is not valid UTF-8 unicode.
293828ba53c0SMasahiro Yamada  */
utf8nlen(struct tree * tree,const char * s,size_t len)293928ba53c0SMasahiro Yamada ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
294028ba53c0SMasahiro Yamada {
294128ba53c0SMasahiro Yamada 	utf8leaf_t	*leaf;
294228ba53c0SMasahiro Yamada 	size_t		ret = 0;
294328ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
294428ba53c0SMasahiro Yamada 
294528ba53c0SMasahiro Yamada 	if (!tree)
294628ba53c0SMasahiro Yamada 		return -1;
294728ba53c0SMasahiro Yamada 	while (len && *s) {
294828ba53c0SMasahiro Yamada 		leaf = utf8nlookup(tree, hangul, s, len);
294928ba53c0SMasahiro Yamada 		if (!leaf)
295028ba53c0SMasahiro Yamada 			return -1;
295128ba53c0SMasahiro Yamada 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
295228ba53c0SMasahiro Yamada 			ret += utf8clen(s);
295328ba53c0SMasahiro Yamada 		else if (LEAF_CCC(leaf) == DECOMPOSE)
295428ba53c0SMasahiro Yamada 			ret += strlen(LEAF_STR(leaf));
295528ba53c0SMasahiro Yamada 		else
295628ba53c0SMasahiro Yamada 			ret += utf8clen(s);
295728ba53c0SMasahiro Yamada 		len -= utf8clen(s);
295828ba53c0SMasahiro Yamada 		s += utf8clen(s);
295928ba53c0SMasahiro Yamada 	}
296028ba53c0SMasahiro Yamada 	return ret;
296128ba53c0SMasahiro Yamada }
296228ba53c0SMasahiro Yamada 
296328ba53c0SMasahiro Yamada /*
296428ba53c0SMasahiro Yamada  * Cursor structure used by the normalizer.
296528ba53c0SMasahiro Yamada  */
296628ba53c0SMasahiro Yamada struct utf8cursor {
296728ba53c0SMasahiro Yamada 	struct tree	*tree;
296828ba53c0SMasahiro Yamada 	const char	*s;
296928ba53c0SMasahiro Yamada 	const char	*p;
297028ba53c0SMasahiro Yamada 	const char	*ss;
297128ba53c0SMasahiro Yamada 	const char	*sp;
297228ba53c0SMasahiro Yamada 	unsigned int	len;
297328ba53c0SMasahiro Yamada 	unsigned int	slen;
297428ba53c0SMasahiro Yamada 	short int	ccc;
297528ba53c0SMasahiro Yamada 	short int	nccc;
297628ba53c0SMasahiro Yamada 	unsigned int	unichar;
297728ba53c0SMasahiro Yamada 	unsigned char	hangul[UTF8HANGULLEAF];
297828ba53c0SMasahiro Yamada };
297928ba53c0SMasahiro Yamada 
298028ba53c0SMasahiro Yamada /*
298128ba53c0SMasahiro Yamada  * Set up an utf8cursor for use by utf8byte().
298228ba53c0SMasahiro Yamada  *
298328ba53c0SMasahiro Yamada  *   s      : string.
298428ba53c0SMasahiro Yamada  *   len    : length of s.
298528ba53c0SMasahiro Yamada  *   u8c    : pointer to cursor.
298628ba53c0SMasahiro Yamada  *   trie   : utf8trie_t to use for normalization.
298728ba53c0SMasahiro Yamada  *
298828ba53c0SMasahiro Yamada  * Returns -1 on error, 0 on success.
298928ba53c0SMasahiro Yamada  */
utf8ncursor(struct utf8cursor * u8c,struct tree * tree,const char * s,size_t len)299028ba53c0SMasahiro Yamada int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
299128ba53c0SMasahiro Yamada 		size_t len)
299228ba53c0SMasahiro Yamada {
299328ba53c0SMasahiro Yamada 	if (!tree)
299428ba53c0SMasahiro Yamada 		return -1;
299528ba53c0SMasahiro Yamada 	if (!s)
299628ba53c0SMasahiro Yamada 		return -1;
299728ba53c0SMasahiro Yamada 	u8c->tree = tree;
299828ba53c0SMasahiro Yamada 	u8c->s = s;
299928ba53c0SMasahiro Yamada 	u8c->p = NULL;
300028ba53c0SMasahiro Yamada 	u8c->ss = NULL;
300128ba53c0SMasahiro Yamada 	u8c->sp = NULL;
300228ba53c0SMasahiro Yamada 	u8c->len = len;
300328ba53c0SMasahiro Yamada 	u8c->slen = 0;
300428ba53c0SMasahiro Yamada 	u8c->ccc = STOPPER;
300528ba53c0SMasahiro Yamada 	u8c->nccc = STOPPER;
300628ba53c0SMasahiro Yamada 	u8c->unichar = 0;
300728ba53c0SMasahiro Yamada 	/* Check we didn't clobber the maximum length. */
300828ba53c0SMasahiro Yamada 	if (u8c->len != len)
300928ba53c0SMasahiro Yamada 		return -1;
301028ba53c0SMasahiro Yamada 	/* The first byte of s may not be an utf8 continuation. */
301128ba53c0SMasahiro Yamada 	if (len > 0 && (*s & 0xC0) == 0x80)
301228ba53c0SMasahiro Yamada 		return -1;
301328ba53c0SMasahiro Yamada 	return 0;
301428ba53c0SMasahiro Yamada }
301528ba53c0SMasahiro Yamada 
301628ba53c0SMasahiro Yamada /*
301728ba53c0SMasahiro Yamada  * Set up an utf8cursor for use by utf8byte().
301828ba53c0SMasahiro Yamada  *
301928ba53c0SMasahiro Yamada  *   s      : NUL-terminated string.
302028ba53c0SMasahiro Yamada  *   u8c    : pointer to cursor.
302128ba53c0SMasahiro Yamada  *   trie   : utf8trie_t to use for normalization.
302228ba53c0SMasahiro Yamada  *
302328ba53c0SMasahiro Yamada  * Returns -1 on error, 0 on success.
302428ba53c0SMasahiro Yamada  */
utf8cursor(struct utf8cursor * u8c,struct tree * tree,const char * s)302528ba53c0SMasahiro Yamada int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
302628ba53c0SMasahiro Yamada {
302728ba53c0SMasahiro Yamada 	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
302828ba53c0SMasahiro Yamada }
302928ba53c0SMasahiro Yamada 
303028ba53c0SMasahiro Yamada /*
303128ba53c0SMasahiro Yamada  * Get one byte from the normalized form of the string described by u8c.
303228ba53c0SMasahiro Yamada  *
303328ba53c0SMasahiro Yamada  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
303428ba53c0SMasahiro Yamada  *
303528ba53c0SMasahiro Yamada  * The cursor keeps track of the location in the string in u8c->s.
303628ba53c0SMasahiro Yamada  * When a character is decomposed, the current location is stored in
303728ba53c0SMasahiro Yamada  * u8c->p, and u8c->s is set to the start of the decomposition. Note
303828ba53c0SMasahiro Yamada  * that bytes from a decomposition do not count against u8c->len.
303928ba53c0SMasahiro Yamada  *
304028ba53c0SMasahiro Yamada  * Characters are emitted if they match the current CCC in u8c->ccc.
304128ba53c0SMasahiro Yamada  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
304228ba53c0SMasahiro Yamada  * and the function returns 0 in that case.
304328ba53c0SMasahiro Yamada  *
304428ba53c0SMasahiro Yamada  * Sorting by CCC is done by repeatedly scanning the string.  The
304528ba53c0SMasahiro Yamada  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
304628ba53c0SMasahiro Yamada  * the start of the scan.  The first pass finds the lowest CCC to be
304728ba53c0SMasahiro Yamada  * emitted and stores it in u8c->nccc, the second pass emits the
304828ba53c0SMasahiro Yamada  * characters with this CCC and finds the next lowest CCC. This limits
304928ba53c0SMasahiro Yamada  * the number of passes to 1 + the number of different CCCs in the
305028ba53c0SMasahiro Yamada  * sequence being scanned.
305128ba53c0SMasahiro Yamada  *
305228ba53c0SMasahiro Yamada  * Therefore:
305328ba53c0SMasahiro Yamada  *  u8c->p  != NULL -> a decomposition is being scanned.
305428ba53c0SMasahiro Yamada  *  u8c->ss != NULL -> this is a repeating scan.
305528ba53c0SMasahiro Yamada  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
305628ba53c0SMasahiro Yamada  */
utf8byte(struct utf8cursor * u8c)305728ba53c0SMasahiro Yamada int utf8byte(struct utf8cursor *u8c)
305828ba53c0SMasahiro Yamada {
305928ba53c0SMasahiro Yamada 	utf8leaf_t *leaf;
306028ba53c0SMasahiro Yamada 	int ccc;
306128ba53c0SMasahiro Yamada 
306228ba53c0SMasahiro Yamada 	for (;;) {
306328ba53c0SMasahiro Yamada 		/* Check for the end of a decomposed character. */
306428ba53c0SMasahiro Yamada 		if (u8c->p && *u8c->s == '\0') {
306528ba53c0SMasahiro Yamada 			u8c->s = u8c->p;
306628ba53c0SMasahiro Yamada 			u8c->p = NULL;
306728ba53c0SMasahiro Yamada 		}
306828ba53c0SMasahiro Yamada 
306928ba53c0SMasahiro Yamada 		/* Check for end-of-string. */
307028ba53c0SMasahiro Yamada 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
307128ba53c0SMasahiro Yamada 			/* There is no next byte. */
307228ba53c0SMasahiro Yamada 			if (u8c->ccc == STOPPER)
307328ba53c0SMasahiro Yamada 				return 0;
307428ba53c0SMasahiro Yamada 			/* End-of-string during a scan counts as a stopper. */
307528ba53c0SMasahiro Yamada 			ccc = STOPPER;
307628ba53c0SMasahiro Yamada 			goto ccc_mismatch;
307728ba53c0SMasahiro Yamada 		} else if ((*u8c->s & 0xC0) == 0x80) {
307828ba53c0SMasahiro Yamada 			/* This is a continuation of the current character. */
307928ba53c0SMasahiro Yamada 			if (!u8c->p)
308028ba53c0SMasahiro Yamada 				u8c->len--;
308128ba53c0SMasahiro Yamada 			return (unsigned char)*u8c->s++;
308228ba53c0SMasahiro Yamada 		}
308328ba53c0SMasahiro Yamada 
308428ba53c0SMasahiro Yamada 		/* Look up the data for the current character. */
308528ba53c0SMasahiro Yamada 		if (u8c->p) {
308628ba53c0SMasahiro Yamada 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
308728ba53c0SMasahiro Yamada 		} else {
308828ba53c0SMasahiro Yamada 			leaf = utf8nlookup(u8c->tree, u8c->hangul,
308928ba53c0SMasahiro Yamada 					   u8c->s, u8c->len);
309028ba53c0SMasahiro Yamada 		}
309128ba53c0SMasahiro Yamada 
309228ba53c0SMasahiro Yamada 		/* No leaf found implies that the input is a binary blob. */
309328ba53c0SMasahiro Yamada 		if (!leaf)
309428ba53c0SMasahiro Yamada 			return -1;
309528ba53c0SMasahiro Yamada 
309628ba53c0SMasahiro Yamada 		/* Characters that are too new have CCC 0. */
309728ba53c0SMasahiro Yamada 		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
309828ba53c0SMasahiro Yamada 			ccc = STOPPER;
309928ba53c0SMasahiro Yamada 		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
310028ba53c0SMasahiro Yamada 			u8c->len -= utf8clen(u8c->s);
310128ba53c0SMasahiro Yamada 			u8c->p = u8c->s + utf8clen(u8c->s);
310228ba53c0SMasahiro Yamada 			u8c->s = LEAF_STR(leaf);
310328ba53c0SMasahiro Yamada 			/* Empty decomposition implies CCC 0. */
310428ba53c0SMasahiro Yamada 			if (*u8c->s == '\0') {
310528ba53c0SMasahiro Yamada 				if (u8c->ccc == STOPPER)
310628ba53c0SMasahiro Yamada 					continue;
310728ba53c0SMasahiro Yamada 				ccc = STOPPER;
310828ba53c0SMasahiro Yamada 				goto ccc_mismatch;
310928ba53c0SMasahiro Yamada 			}
311028ba53c0SMasahiro Yamada 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
311128ba53c0SMasahiro Yamada 			ccc = LEAF_CCC(leaf);
311228ba53c0SMasahiro Yamada 		}
311328ba53c0SMasahiro Yamada 		u8c->unichar = utf8decode(u8c->s);
311428ba53c0SMasahiro Yamada 
311528ba53c0SMasahiro Yamada 		/*
311628ba53c0SMasahiro Yamada 		 * If this is not a stopper, then see if it updates
311728ba53c0SMasahiro Yamada 		 * the next canonical class to be emitted.
311828ba53c0SMasahiro Yamada 		 */
311928ba53c0SMasahiro Yamada 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
312028ba53c0SMasahiro Yamada 			u8c->nccc = ccc;
312128ba53c0SMasahiro Yamada 
312228ba53c0SMasahiro Yamada 		/*
312328ba53c0SMasahiro Yamada 		 * Return the current byte if this is the current
312428ba53c0SMasahiro Yamada 		 * combining class.
312528ba53c0SMasahiro Yamada 		 */
312628ba53c0SMasahiro Yamada 		if (ccc == u8c->ccc) {
312728ba53c0SMasahiro Yamada 			if (!u8c->p)
312828ba53c0SMasahiro Yamada 				u8c->len--;
312928ba53c0SMasahiro Yamada 			return (unsigned char)*u8c->s++;
313028ba53c0SMasahiro Yamada 		}
313128ba53c0SMasahiro Yamada 
313228ba53c0SMasahiro Yamada 		/* Current combining class mismatch. */
313328ba53c0SMasahiro Yamada 	ccc_mismatch:
313428ba53c0SMasahiro Yamada 		if (u8c->nccc == STOPPER) {
313528ba53c0SMasahiro Yamada 			/*
313628ba53c0SMasahiro Yamada 			 * Scan forward for the first canonical class
313728ba53c0SMasahiro Yamada 			 * to be emitted.  Save the position from
313828ba53c0SMasahiro Yamada 			 * which to restart.
313928ba53c0SMasahiro Yamada 			 */
314028ba53c0SMasahiro Yamada 			assert(u8c->ccc == STOPPER);
314128ba53c0SMasahiro Yamada 			u8c->ccc = MINCCC - 1;
314228ba53c0SMasahiro Yamada 			u8c->nccc = ccc;
314328ba53c0SMasahiro Yamada 			u8c->sp = u8c->p;
314428ba53c0SMasahiro Yamada 			u8c->ss = u8c->s;
314528ba53c0SMasahiro Yamada 			u8c->slen = u8c->len;
314628ba53c0SMasahiro Yamada 			if (!u8c->p)
314728ba53c0SMasahiro Yamada 				u8c->len -= utf8clen(u8c->s);
314828ba53c0SMasahiro Yamada 			u8c->s += utf8clen(u8c->s);
314928ba53c0SMasahiro Yamada 		} else if (ccc != STOPPER) {
315028ba53c0SMasahiro Yamada 			/* Not a stopper, and not the ccc we're emitting. */
315128ba53c0SMasahiro Yamada 			if (!u8c->p)
315228ba53c0SMasahiro Yamada 				u8c->len -= utf8clen(u8c->s);
315328ba53c0SMasahiro Yamada 			u8c->s += utf8clen(u8c->s);
315428ba53c0SMasahiro Yamada 		} else if (u8c->nccc != MAXCCC + 1) {
315528ba53c0SMasahiro Yamada 			/* At a stopper, restart for next ccc. */
315628ba53c0SMasahiro Yamada 			u8c->ccc = u8c->nccc;
315728ba53c0SMasahiro Yamada 			u8c->nccc = MAXCCC + 1;
315828ba53c0SMasahiro Yamada 			u8c->s = u8c->ss;
315928ba53c0SMasahiro Yamada 			u8c->p = u8c->sp;
316028ba53c0SMasahiro Yamada 			u8c->len = u8c->slen;
316128ba53c0SMasahiro Yamada 		} else {
316228ba53c0SMasahiro Yamada 			/* All done, proceed from here. */
316328ba53c0SMasahiro Yamada 			u8c->ccc = STOPPER;
316428ba53c0SMasahiro Yamada 			u8c->nccc = STOPPER;
316528ba53c0SMasahiro Yamada 			u8c->sp = NULL;
316628ba53c0SMasahiro Yamada 			u8c->ss = NULL;
316728ba53c0SMasahiro Yamada 			u8c->slen = 0;
316828ba53c0SMasahiro Yamada 		}
316928ba53c0SMasahiro Yamada 	}
317028ba53c0SMasahiro Yamada }
317128ba53c0SMasahiro Yamada 
317228ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
317328ba53c0SMasahiro Yamada 
normalize_line(struct tree * tree)317428ba53c0SMasahiro Yamada static int normalize_line(struct tree *tree)
317528ba53c0SMasahiro Yamada {
317628ba53c0SMasahiro Yamada 	char *s;
317728ba53c0SMasahiro Yamada 	char *t;
317828ba53c0SMasahiro Yamada 	int c;
317928ba53c0SMasahiro Yamada 	struct utf8cursor u8c;
318028ba53c0SMasahiro Yamada 
318128ba53c0SMasahiro Yamada 	/* First test: null-terminated string. */
318228ba53c0SMasahiro Yamada 	s = buf2;
318328ba53c0SMasahiro Yamada 	t = buf3;
318428ba53c0SMasahiro Yamada 	if (utf8cursor(&u8c, tree, s))
318528ba53c0SMasahiro Yamada 		return -1;
318628ba53c0SMasahiro Yamada 	while ((c = utf8byte(&u8c)) > 0)
318728ba53c0SMasahiro Yamada 		if (c != (unsigned char)*t++)
318828ba53c0SMasahiro Yamada 			return -1;
318928ba53c0SMasahiro Yamada 	if (c < 0)
319028ba53c0SMasahiro Yamada 		return -1;
319128ba53c0SMasahiro Yamada 	if (*t != 0)
319228ba53c0SMasahiro Yamada 		return -1;
319328ba53c0SMasahiro Yamada 
319428ba53c0SMasahiro Yamada 	/* Second test: length-limited string. */
319528ba53c0SMasahiro Yamada 	s = buf2;
319628ba53c0SMasahiro Yamada 	/* Replace NUL with a value that will cause an error if seen. */
319728ba53c0SMasahiro Yamada 	s[strlen(s) + 1] = -1;
319828ba53c0SMasahiro Yamada 	t = buf3;
319928ba53c0SMasahiro Yamada 	if (utf8cursor(&u8c, tree, s))
320028ba53c0SMasahiro Yamada 		return -1;
320128ba53c0SMasahiro Yamada 	while ((c = utf8byte(&u8c)) > 0)
320228ba53c0SMasahiro Yamada 		if (c != (unsigned char)*t++)
320328ba53c0SMasahiro Yamada 			return -1;
320428ba53c0SMasahiro Yamada 	if (c < 0)
320528ba53c0SMasahiro Yamada 		return -1;
320628ba53c0SMasahiro Yamada 	if (*t != 0)
320728ba53c0SMasahiro Yamada 		return -1;
320828ba53c0SMasahiro Yamada 
320928ba53c0SMasahiro Yamada 	return 0;
321028ba53c0SMasahiro Yamada }
321128ba53c0SMasahiro Yamada 
normalization_test(void)321228ba53c0SMasahiro Yamada static void normalization_test(void)
321328ba53c0SMasahiro Yamada {
321428ba53c0SMasahiro Yamada 	FILE *file;
321528ba53c0SMasahiro Yamada 	unsigned int unichar;
321628ba53c0SMasahiro Yamada 	struct unicode_data *data;
321728ba53c0SMasahiro Yamada 	char *s;
321828ba53c0SMasahiro Yamada 	char *t;
321928ba53c0SMasahiro Yamada 	int ret;
322028ba53c0SMasahiro Yamada 	int ignorables;
322128ba53c0SMasahiro Yamada 	int tests = 0;
322228ba53c0SMasahiro Yamada 	int failures = 0;
322328ba53c0SMasahiro Yamada 
322428ba53c0SMasahiro Yamada 	if (verbose > 0)
322528ba53c0SMasahiro Yamada 		printf("Parsing %s\n", test_name);
322628ba53c0SMasahiro Yamada 	/* Step one, read data from file. */
322728ba53c0SMasahiro Yamada 	file = fopen(test_name, "r");
322828ba53c0SMasahiro Yamada 	if (!file)
322928ba53c0SMasahiro Yamada 		open_fail(test_name, errno);
323028ba53c0SMasahiro Yamada 
323128ba53c0SMasahiro Yamada 	while (fgets(line, LINESIZE, file)) {
323228ba53c0SMasahiro Yamada 		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
323328ba53c0SMasahiro Yamada 			     buf0, buf1);
323428ba53c0SMasahiro Yamada 		if (ret != 2 || *line == '#')
323528ba53c0SMasahiro Yamada 			continue;
323628ba53c0SMasahiro Yamada 		s = buf0;
323728ba53c0SMasahiro Yamada 		t = buf2;
323828ba53c0SMasahiro Yamada 		while (*s) {
323928ba53c0SMasahiro Yamada 			unichar = strtoul(s, &s, 16);
324028ba53c0SMasahiro Yamada 			t += utf8encode(t, unichar);
324128ba53c0SMasahiro Yamada 		}
324228ba53c0SMasahiro Yamada 		*t = '\0';
324328ba53c0SMasahiro Yamada 
324428ba53c0SMasahiro Yamada 		ignorables = 0;
324528ba53c0SMasahiro Yamada 		s = buf1;
324628ba53c0SMasahiro Yamada 		t = buf3;
324728ba53c0SMasahiro Yamada 		while (*s) {
324828ba53c0SMasahiro Yamada 			unichar = strtoul(s, &s, 16);
324928ba53c0SMasahiro Yamada 			data = &unicode_data[unichar];
325028ba53c0SMasahiro Yamada 			if (data->utf8nfdi && !*data->utf8nfdi)
325128ba53c0SMasahiro Yamada 				ignorables = 1;
325228ba53c0SMasahiro Yamada 			else
325328ba53c0SMasahiro Yamada 				t += utf8encode(t, unichar);
325428ba53c0SMasahiro Yamada 		}
325528ba53c0SMasahiro Yamada 		*t = '\0';
325628ba53c0SMasahiro Yamada 
325728ba53c0SMasahiro Yamada 		tests++;
325828ba53c0SMasahiro Yamada 		if (normalize_line(nfdi_tree) < 0) {
325928ba53c0SMasahiro Yamada 			printf("Line %s -> %s", buf0, buf1);
326028ba53c0SMasahiro Yamada 			if (ignorables)
326128ba53c0SMasahiro Yamada 				printf(" (ignorables removed)");
326228ba53c0SMasahiro Yamada 			printf(" failure\n");
326328ba53c0SMasahiro Yamada 			failures++;
326428ba53c0SMasahiro Yamada 		}
326528ba53c0SMasahiro Yamada 	}
326628ba53c0SMasahiro Yamada 	fclose(file);
326728ba53c0SMasahiro Yamada 	if (verbose > 0)
326828ba53c0SMasahiro Yamada 		printf("Ran %d tests with %d failures\n", tests, failures);
326928ba53c0SMasahiro Yamada 	if (failures)
327028ba53c0SMasahiro Yamada 		file_fail(test_name);
327128ba53c0SMasahiro Yamada }
327228ba53c0SMasahiro Yamada 
327328ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
327428ba53c0SMasahiro Yamada 
write_file(void)327528ba53c0SMasahiro Yamada static void write_file(void)
327628ba53c0SMasahiro Yamada {
327728ba53c0SMasahiro Yamada 	FILE *file;
327828ba53c0SMasahiro Yamada 	int i;
327928ba53c0SMasahiro Yamada 	int j;
328028ba53c0SMasahiro Yamada 	int t;
328128ba53c0SMasahiro Yamada 	int gen;
328228ba53c0SMasahiro Yamada 
328328ba53c0SMasahiro Yamada 	if (verbose > 0)
328428ba53c0SMasahiro Yamada 		printf("Writing %s\n", utf8_name);
328528ba53c0SMasahiro Yamada 	file = fopen(utf8_name, "w");
328628ba53c0SMasahiro Yamada 	if (!file)
328728ba53c0SMasahiro Yamada 		open_fail(utf8_name, errno);
328828ba53c0SMasahiro Yamada 
328928ba53c0SMasahiro Yamada 	fprintf(file, "/* This file is generated code, do not edit. */\n");
329028ba53c0SMasahiro Yamada 	fprintf(file, "\n");
3291*2b3d0478SChristoph Hellwig 	fprintf(file, "#include <linux/module.h>\n");
3292*2b3d0478SChristoph Hellwig 	fprintf(file, "#include <linux/kernel.h>\n");
3293*2b3d0478SChristoph Hellwig 	fprintf(file, "#include \"utf8n.h\"\n");
329428ba53c0SMasahiro Yamada 	fprintf(file, "\n");
329528ba53c0SMasahiro Yamada 	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
329628ba53c0SMasahiro Yamada 	for (i = 0; i != ages_count; i++)
329728ba53c0SMasahiro Yamada 		fprintf(file, "\t%#x%s\n", ages[i],
329828ba53c0SMasahiro Yamada 			ages[i] == unicode_maxage ? "" : ",");
329928ba53c0SMasahiro Yamada 	fprintf(file, "};\n");
330028ba53c0SMasahiro Yamada 	fprintf(file, "\n");
330128ba53c0SMasahiro Yamada 	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
330228ba53c0SMasahiro Yamada 	t = 0;
330328ba53c0SMasahiro Yamada 	for (gen = 0; gen < ages_count; gen++) {
330428ba53c0SMasahiro Yamada 		fprintf(file, "\t{ %#x, %d }%s\n",
330528ba53c0SMasahiro Yamada 			ages[gen], trees[t].index,
330628ba53c0SMasahiro Yamada 			ages[gen] == unicode_maxage ? "" : ",");
330728ba53c0SMasahiro Yamada 		if (trees[t].maxage == ages[gen])
330828ba53c0SMasahiro Yamada 			t += 2;
330928ba53c0SMasahiro Yamada 	}
331028ba53c0SMasahiro Yamada 	fprintf(file, "};\n");
331128ba53c0SMasahiro Yamada 	fprintf(file, "\n");
331228ba53c0SMasahiro Yamada 	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
331328ba53c0SMasahiro Yamada 	t = 1;
331428ba53c0SMasahiro Yamada 	for (gen = 0; gen < ages_count; gen++) {
331528ba53c0SMasahiro Yamada 		fprintf(file, "\t{ %#x, %d }%s\n",
331628ba53c0SMasahiro Yamada 			ages[gen], trees[t].index,
331728ba53c0SMasahiro Yamada 			ages[gen] == unicode_maxage ? "" : ",");
331828ba53c0SMasahiro Yamada 		if (trees[t].maxage == ages[gen])
331928ba53c0SMasahiro Yamada 			t += 2;
332028ba53c0SMasahiro Yamada 	}
332128ba53c0SMasahiro Yamada 	fprintf(file, "};\n");
332228ba53c0SMasahiro Yamada 	fprintf(file, "\n");
332328ba53c0SMasahiro Yamada 	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
332428ba53c0SMasahiro Yamada 		utf8data_size);
332528ba53c0SMasahiro Yamada 	t = 0;
332628ba53c0SMasahiro Yamada 	for (i = 0; i != utf8data_size; i += 16) {
332728ba53c0SMasahiro Yamada 		if (i == trees[t].index) {
332828ba53c0SMasahiro Yamada 			fprintf(file, "\t/* %s_%x */\n",
332928ba53c0SMasahiro Yamada 				trees[t].type, trees[t].maxage);
333028ba53c0SMasahiro Yamada 			if (t < trees_count-1)
333128ba53c0SMasahiro Yamada 				t++;
333228ba53c0SMasahiro Yamada 		}
333328ba53c0SMasahiro Yamada 		fprintf(file, "\t");
333428ba53c0SMasahiro Yamada 		for (j = i; j != i + 16; j++)
333528ba53c0SMasahiro Yamada 			fprintf(file, "0x%.2x%s", utf8data[j],
333628ba53c0SMasahiro Yamada 				(j < utf8data_size -1 ? "," : ""));
333728ba53c0SMasahiro Yamada 		fprintf(file, "\n");
333828ba53c0SMasahiro Yamada 	}
333928ba53c0SMasahiro Yamada 	fprintf(file, "};\n");
3340*2b3d0478SChristoph Hellwig 	fprintf(file, "\n");
3341*2b3d0478SChristoph Hellwig 	fprintf(file, "struct utf8data_table utf8_data_table = {\n");
3342*2b3d0478SChristoph Hellwig 	fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3343*2b3d0478SChristoph Hellwig 	fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3344*2b3d0478SChristoph Hellwig 	fprintf(file, "\n");
3345*2b3d0478SChristoph Hellwig 	fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3346*2b3d0478SChristoph Hellwig 	fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3347*2b3d0478SChristoph Hellwig 	fprintf(file, "\n");
3348*2b3d0478SChristoph Hellwig 	fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3349*2b3d0478SChristoph Hellwig 	fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3350*2b3d0478SChristoph Hellwig 	fprintf(file, "\n");
3351*2b3d0478SChristoph Hellwig 	fprintf(file, "\t.utf8data = utf8data,\n");
3352*2b3d0478SChristoph Hellwig 	fprintf(file, "};\n");
3353*2b3d0478SChristoph Hellwig 	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3354*2b3d0478SChristoph Hellwig 	fprintf(file, "\n");
3355*2b3d0478SChristoph Hellwig 	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
335628ba53c0SMasahiro Yamada 	fclose(file);
335728ba53c0SMasahiro Yamada }
335828ba53c0SMasahiro Yamada 
335928ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */
336028ba53c0SMasahiro Yamada 
main(int argc,char * argv[])336128ba53c0SMasahiro Yamada int main(int argc, char *argv[])
336228ba53c0SMasahiro Yamada {
336328ba53c0SMasahiro Yamada 	unsigned int unichar;
336428ba53c0SMasahiro Yamada 	int opt;
336528ba53c0SMasahiro Yamada 
336628ba53c0SMasahiro Yamada 	argv0 = argv[0];
336728ba53c0SMasahiro Yamada 
336828ba53c0SMasahiro Yamada 	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
336928ba53c0SMasahiro Yamada 		switch (opt) {
337028ba53c0SMasahiro Yamada 		case 'a':
337128ba53c0SMasahiro Yamada 			age_name = optarg;
337228ba53c0SMasahiro Yamada 			break;
337328ba53c0SMasahiro Yamada 		case 'c':
337428ba53c0SMasahiro Yamada 			ccc_name = optarg;
337528ba53c0SMasahiro Yamada 			break;
337628ba53c0SMasahiro Yamada 		case 'd':
337728ba53c0SMasahiro Yamada 			data_name = optarg;
337828ba53c0SMasahiro Yamada 			break;
337928ba53c0SMasahiro Yamada 		case 'f':
338028ba53c0SMasahiro Yamada 			fold_name = optarg;
338128ba53c0SMasahiro Yamada 			break;
338228ba53c0SMasahiro Yamada 		case 'n':
338328ba53c0SMasahiro Yamada 			norm_name = optarg;
338428ba53c0SMasahiro Yamada 			break;
338528ba53c0SMasahiro Yamada 		case 'o':
338628ba53c0SMasahiro Yamada 			utf8_name = optarg;
338728ba53c0SMasahiro Yamada 			break;
338828ba53c0SMasahiro Yamada 		case 'p':
338928ba53c0SMasahiro Yamada 			prop_name = optarg;
339028ba53c0SMasahiro Yamada 			break;
339128ba53c0SMasahiro Yamada 		case 't':
339228ba53c0SMasahiro Yamada 			test_name = optarg;
339328ba53c0SMasahiro Yamada 			break;
339428ba53c0SMasahiro Yamada 		case 'v':
339528ba53c0SMasahiro Yamada 			verbose++;
339628ba53c0SMasahiro Yamada 			break;
339728ba53c0SMasahiro Yamada 		case 'h':
339828ba53c0SMasahiro Yamada 			help();
339928ba53c0SMasahiro Yamada 			exit(0);
340028ba53c0SMasahiro Yamada 		default:
340128ba53c0SMasahiro Yamada 			usage();
340228ba53c0SMasahiro Yamada 		}
340328ba53c0SMasahiro Yamada 	}
340428ba53c0SMasahiro Yamada 
340528ba53c0SMasahiro Yamada 	if (verbose > 1)
340628ba53c0SMasahiro Yamada 		help();
340728ba53c0SMasahiro Yamada 	for (unichar = 0; unichar != 0x110000; unichar++)
340828ba53c0SMasahiro Yamada 		unicode_data[unichar].code = unichar;
340928ba53c0SMasahiro Yamada 	age_init();
341028ba53c0SMasahiro Yamada 	ccc_init();
341128ba53c0SMasahiro Yamada 	nfdi_init();
341228ba53c0SMasahiro Yamada 	nfdicf_init();
341328ba53c0SMasahiro Yamada 	ignore_init();
341428ba53c0SMasahiro Yamada 	corrections_init();
341528ba53c0SMasahiro Yamada 	hangul_decompose();
341628ba53c0SMasahiro Yamada 	nfdi_decompose();
341728ba53c0SMasahiro Yamada 	nfdicf_decompose();
341828ba53c0SMasahiro Yamada 	utf8_init();
341928ba53c0SMasahiro Yamada 	trees_init();
342028ba53c0SMasahiro Yamada 	trees_populate();
342128ba53c0SMasahiro Yamada 	trees_reduce();
342228ba53c0SMasahiro Yamada 	trees_verify();
342328ba53c0SMasahiro Yamada 	/* Prevent "unused function" warning. */
342428ba53c0SMasahiro Yamada 	(void)lookup(nfdi_tree, " ");
342528ba53c0SMasahiro Yamada 	if (verbose > 2)
342628ba53c0SMasahiro Yamada 		tree_walk(nfdi_tree);
342728ba53c0SMasahiro Yamada 	if (verbose > 2)
342828ba53c0SMasahiro Yamada 		tree_walk(nfdicf_tree);
342928ba53c0SMasahiro Yamada 	normalization_test();
343028ba53c0SMasahiro Yamada 	write_file();
343128ba53c0SMasahiro Yamada 
343228ba53c0SMasahiro Yamada 	return 0;
343328ba53c0SMasahiro Yamada }
3434