/*
 * Copyright (c) 2014 SGI.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* Generator for a compact trie for unicode normalization */

#include <sys/types.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>

/* Default names of the in- and output files. */

#define AGE_NAME	"DerivedAge.txt"
#define CCC_NAME	"DerivedCombiningClass.txt"
#define PROP_NAME	"DerivedCoreProperties.txt"
#define DATA_NAME	"UnicodeData.txt"
#define FOLD_NAME	"CaseFolding.txt"
#define NORM_NAME	"NormalizationCorrections.txt"
#define TEST_NAME	"NormalizationTest.txt"
#define UTF8_NAME	"utf8data.h"

/*
 * File names in effect, initialised to the defaults above.  They are
 * plain (non-const) pointers, so other code can repoint them.
 * NOTE(review): where they are overridden is not visible in this part
 * of the file.
 */
const char *age_name  = AGE_NAME;
const char *ccc_name  = CCC_NAME;
const char *prop_name = PROP_NAME;
const char *data_name = DATA_NAME;
const char *fold_name = FOLD_NAME;
const char *norm_name = NORM_NAME;
const char *test_name = TEST_NAME;
const char *utf8_name = UTF8_NAME;

/* Verbosity level; the tree passes print progress when it is > 0. */
int verbose = 0;

/* An arbitrary line size limit on input lines.
*/ 53 54 #define LINESIZE 1024 55 char line[LINESIZE]; 56 char buf0[LINESIZE]; 57 char buf1[LINESIZE]; 58 char buf2[LINESIZE]; 59 char buf3[LINESIZE]; 60 61 const char *argv0; 62 63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 64 65 /* ------------------------------------------------------------------ */ 66 67 /* 68 * Unicode version numbers consist of three parts: major, minor, and a 69 * revision. These numbers are packed into an unsigned int to obtain 70 * a single version number. 71 * 72 * To save space in the generated trie, the unicode version is not 73 * stored directly, instead we calculate a generation number from the 74 * unicode versions seen in the DerivedAge file, and use that as an 75 * index into a table of unicode versions. 76 */ 77 #define UNICODE_MAJ_SHIFT (16) 78 #define UNICODE_MIN_SHIFT (8) 79 80 #define UNICODE_MAJ_MAX ((unsigned short)-1) 81 #define UNICODE_MIN_MAX ((unsigned char)-1) 82 #define UNICODE_REV_MAX ((unsigned char)-1) 83 84 #define UNICODE_AGE(MAJ,MIN,REV) \ 85 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ 86 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ 87 ((unsigned int)(REV))) 88 89 unsigned int *ages; 90 int ages_count; 91 92 unsigned int unicode_maxage; 93 94 static int age_valid(unsigned int major, unsigned int minor, 95 unsigned int revision) 96 { 97 if (major > UNICODE_MAJ_MAX) 98 return 0; 99 if (minor > UNICODE_MIN_MAX) 100 return 0; 101 if (revision > UNICODE_REV_MAX) 102 return 0; 103 return 1; 104 } 105 106 /* ------------------------------------------------------------------ */ 107 108 /* 109 * utf8trie_t 110 * 111 * A compact binary tree, used to decode UTF-8 characters. 112 * 113 * Internal nodes are one byte for the node itself, and up to three 114 * bytes for an offset into the tree. 
 * The first byte contains the
 * following information:
 *  NEXTBYTE  - flag        - advance to next byte if set
 *  BITNUM    - 3 bit field - the bit number to be tested
 *  OFFLEN    - 2 bit field - number of bytes in the offset
 * if offlen == 0 (non-branching node)
 *  RIGHTPATH - 1 bit field - set if the following node is for the
 *                            right-hand path (tested bit is set)
 *  TRIENODE  - 1 bit field - set if the following node is an internal
 *                            node, otherwise it is a leaf node
 * if offlen != 0 (branching node)
 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 *
 * Due to the way utf8 works, there cannot be branching nodes with
 * NEXTBYTE set, and moreover those nodes always have a righthand
 * descendant.
 *
 * RIGHTPATH/TRIENODE and RIGHTNODE/LEFTNODE occupy the same two bits
 * (0x40 and 0x80); which pair applies is decided by OFFLEN.
 */
typedef unsigned char utf8trie_t;
#define BITNUM		0x07
#define NEXTBYTE	0x08
#define OFFLEN		0x30
#define OFFLEN_SHIFT	4
#define RIGHTPATH	0x40
#define TRIENODE	0x80
#define RIGHTNODE	0x40
#define LEFTNODE	0x80

/*
 * utf8leaf_t
 *
 * The leaves of the trie are embedded in the trie, and so the same
 * underlying datatype, unsigned char.
 *
 * leaf[0]: The unicode version, stored as a generation number that is
 *          an index into utf8agetab[].  With this we can filter code
 *          points based on the unicode version in which they were
 *          defined.  The CCC of a non-defined code point is 0.
 * leaf[1]: Canonical Combining Class.  During normalization, we need
 *          to do a stable sort into ascending order of all characters
 *          with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the beginning or end of a string.
 *          The unicode standard guarantees that all CCC values are
 *          between 0 and 254 inclusive, which leaves 255 available as
 *          a special value.
 *          Code points with CCC 0 are known as stoppers.
 * leaf[2]: Decomposition.
 *          If leaf[1] == 255, then leaf[2] is the
 *          start of a NUL-terminated string that is the decomposition
 *          of the character.
 *          The CCC of a decomposable character is the same as the CCC
 *          of the first character of its decomposition.
 *          Some characters decompose as the empty string: these are
 *          characters with the Default_Ignorable_Code_Point property.
 *          These do affect normalization, as they all have CCC 0.
 *
 * The decompositions in the trie have been fully expanded.
 *
 * Casefolding, if applicable, is also done using decompositions.
 */
typedef unsigned char utf8leaf_t;

/* Accessors for the leaf layout described above. */
#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))

/* Largest generation number that fits in leaf[0]. */
#define MAXGEN		(255)

/* CCC range and the special values stored in leaf[1]. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)
#define DECOMPOSE	(255)
/* NOTE(review): the use of HANGUL is not visible in this part of the file. */
#define HANGUL		((char)(255))

/* NOTE(review): presumably a leaf size in bytes for Hangul; confirm at the users. */
#define UTF8HANGULLEAF	(12)

/* Lookup helpers, declared here and defined outside this part of the file. */
struct tree;
static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
			       const char *, size_t);
static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);

unsigned char *utf8data;
size_t utf8data_size;

utf8trie_t *nfdi;
utf8trie_t *nfdicf;

/* ------------------------------------------------------------------ */

/*
 * UTF8 valid ranges.
 *
 * The UTF-8 encoding spreads the bits of a 32bit word over several
 * bytes. This table gives the ranges that can be held and how they'd
 * be represented.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * There is an additional requirement on UTF-8, in that only the
 * shortest representation of a 32bit value is to be used.  A decoder
 * must not decode sequences that do not satisfy this requirement.
 * Thus the allowed ranges have a lower bound.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 * 17 planes of 65536 values.  This limits the sequences actually seen
 * even more, to just the following.
 *
 *          0 -     0x7f: 0                    0x7f
 *       0x80 -    0x7ff: 0xc2 0x80            0xdf 0xbf
 *      0x800 -   0xffff: 0xe0 0xa0 0x80       0xef 0xbf 0xbf
 *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80  0xf4 0x8f 0xbf 0xbf
 *
 * Even within those ranges not all values are allowed: the surrogates
 * 0xd800 - 0xdfff should never be seen.
 *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
 * representation of Unicode strictly smaller than UTF-32.
243 * 244 * The shortest sequence requirement was introduced by: 245 * Corrigendum #1: UTF-8 Shortest Form 246 * It can be found here: 247 * http://www.unicode.org/versions/corrigendum1.html 248 * 249 */ 250 251 #define UTF8_2_BITS 0xC0 252 #define UTF8_3_BITS 0xE0 253 #define UTF8_4_BITS 0xF0 254 #define UTF8_N_BITS 0x80 255 #define UTF8_2_MASK 0xE0 256 #define UTF8_3_MASK 0xF0 257 #define UTF8_4_MASK 0xF8 258 #define UTF8_N_MASK 0xC0 259 #define UTF8_V_MASK 0x3F 260 #define UTF8_V_SHIFT 6 261 262 static int utf8encode(char *str, unsigned int val) 263 { 264 int len; 265 266 if (val < 0x80) { 267 str[0] = val; 268 len = 1; 269 } else if (val < 0x800) { 270 str[1] = val & UTF8_V_MASK; 271 str[1] |= UTF8_N_BITS; 272 val >>= UTF8_V_SHIFT; 273 str[0] = val; 274 str[0] |= UTF8_2_BITS; 275 len = 2; 276 } else if (val < 0x10000) { 277 str[2] = val & UTF8_V_MASK; 278 str[2] |= UTF8_N_BITS; 279 val >>= UTF8_V_SHIFT; 280 str[1] = val & UTF8_V_MASK; 281 str[1] |= UTF8_N_BITS; 282 val >>= UTF8_V_SHIFT; 283 str[0] = val; 284 str[0] |= UTF8_3_BITS; 285 len = 3; 286 } else if (val < 0x110000) { 287 str[3] = val & UTF8_V_MASK; 288 str[3] |= UTF8_N_BITS; 289 val >>= UTF8_V_SHIFT; 290 str[2] = val & UTF8_V_MASK; 291 str[2] |= UTF8_N_BITS; 292 val >>= UTF8_V_SHIFT; 293 str[1] = val & UTF8_V_MASK; 294 str[1] |= UTF8_N_BITS; 295 val >>= UTF8_V_SHIFT; 296 str[0] = val; 297 str[0] |= UTF8_4_BITS; 298 len = 4; 299 } else { 300 printf("%#x: illegal val\n", val); 301 len = 0; 302 } 303 return len; 304 } 305 306 static unsigned int utf8decode(const char *str) 307 { 308 const unsigned char *s = (const unsigned char*)str; 309 unsigned int unichar = 0; 310 311 if (*s < 0x80) { 312 unichar = *s; 313 } else if (*s < UTF8_3_BITS) { 314 unichar = *s++ & 0x1F; 315 unichar <<= UTF8_V_SHIFT; 316 unichar |= *s & 0x3F; 317 } else if (*s < UTF8_4_BITS) { 318 unichar = *s++ & 0x0F; 319 unichar <<= UTF8_V_SHIFT; 320 unichar |= *s++ & 0x3F; 321 unichar <<= UTF8_V_SHIFT; 322 unichar |= *s & 0x3F; 323 } else 
{ 324 unichar = *s++ & 0x0F; 325 unichar <<= UTF8_V_SHIFT; 326 unichar |= *s++ & 0x3F; 327 unichar <<= UTF8_V_SHIFT; 328 unichar |= *s++ & 0x3F; 329 unichar <<= UTF8_V_SHIFT; 330 unichar |= *s & 0x3F; 331 } 332 return unichar; 333 } 334 335 static int utf32valid(unsigned int unichar) 336 { 337 return unichar < 0x110000; 338 } 339 340 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) 341 342 #define NODE 1 343 #define LEAF 0 344 345 struct tree { 346 void *root; 347 int childnode; 348 const char *type; 349 unsigned int maxage; 350 struct tree *next; 351 int (*leaf_equal)(void *, void *); 352 void (*leaf_print)(void *, int); 353 int (*leaf_mark)(void *); 354 int (*leaf_size)(void *); 355 int *(*leaf_index)(struct tree *, void *); 356 unsigned char *(*leaf_emit)(void *, unsigned char *); 357 int leafindex[0x110000]; 358 int index; 359 }; 360 361 struct node { 362 int index; 363 int offset; 364 int mark; 365 int size; 366 struct node *parent; 367 void *left; 368 void *right; 369 unsigned char bitnum; 370 unsigned char nextbyte; 371 unsigned char leftnode; 372 unsigned char rightnode; 373 unsigned int keybits; 374 unsigned int keymask; 375 }; 376 377 /* 378 * Example lookup function for a tree. 
379 */ 380 static void *lookup(struct tree *tree, const char *key) 381 { 382 struct node *node; 383 void *leaf = NULL; 384 385 node = tree->root; 386 while (!leaf && node) { 387 if (node->nextbyte) 388 key++; 389 if (*key & (1 << (node->bitnum & 7))) { 390 /* Right leg */ 391 if (node->rightnode == NODE) { 392 node = node->right; 393 } else if (node->rightnode == LEAF) { 394 leaf = node->right; 395 } else { 396 node = NULL; 397 } 398 } else { 399 /* Left leg */ 400 if (node->leftnode == NODE) { 401 node = node->left; 402 } else if (node->leftnode == LEAF) { 403 leaf = node->left; 404 } else { 405 node = NULL; 406 } 407 } 408 } 409 410 return leaf; 411 } 412 413 /* 414 * A simple non-recursive tree walker: keep track of visits to the 415 * left and right branches in the leftmask and rightmask. 416 */ 417 static void tree_walk(struct tree *tree) 418 { 419 struct node *node; 420 unsigned int leftmask; 421 unsigned int rightmask; 422 unsigned int bitmask; 423 int indent = 1; 424 int nodes, singletons, leaves; 425 426 nodes = singletons = leaves = 0; 427 428 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); 429 if (tree->childnode == LEAF) { 430 assert(tree->root); 431 tree->leaf_print(tree->root, indent); 432 leaves = 1; 433 } else { 434 assert(tree->childnode == NODE); 435 node = tree->root; 436 leftmask = rightmask = 0; 437 while (node) { 438 printf("%*snode @ %p bitnum %d nextbyte %d" 439 " left %p right %p mask %x bits %x\n", 440 indent, "", node, 441 node->bitnum, node->nextbyte, 442 node->left, node->right, 443 node->keymask, node->keybits); 444 nodes += 1; 445 if (!(node->left && node->right)) 446 singletons += 1; 447 448 while (node) { 449 bitmask = 1 << node->bitnum; 450 if ((leftmask & bitmask) == 0) { 451 leftmask |= bitmask; 452 if (node->leftnode == LEAF) { 453 assert(node->left); 454 tree->leaf_print(node->left, 455 indent+1); 456 leaves += 1; 457 } else if (node->left) { 458 assert(node->leftnode == NODE); 459 indent += 1; 460 node = 
node->left; 461 break; 462 } 463 } 464 if ((rightmask & bitmask) == 0) { 465 rightmask |= bitmask; 466 if (node->rightnode == LEAF) { 467 assert(node->right); 468 tree->leaf_print(node->right, 469 indent+1); 470 leaves += 1; 471 } else if (node->right) { 472 assert(node->rightnode == NODE); 473 indent += 1; 474 node = node->right; 475 break; 476 } 477 } 478 leftmask &= ~bitmask; 479 rightmask &= ~bitmask; 480 node = node->parent; 481 indent -= 1; 482 } 483 } 484 } 485 printf("nodes %d leaves %d singletons %d\n", 486 nodes, leaves, singletons); 487 } 488 489 /* 490 * Allocate an initialize a new internal node. 491 */ 492 static struct node *alloc_node(struct node *parent) 493 { 494 struct node *node; 495 int bitnum; 496 497 node = malloc(sizeof(*node)); 498 node->left = node->right = NULL; 499 node->parent = parent; 500 node->leftnode = NODE; 501 node->rightnode = NODE; 502 node->keybits = 0; 503 node->keymask = 0; 504 node->mark = 0; 505 node->index = 0; 506 node->offset = -1; 507 node->size = 4; 508 509 if (node->parent) { 510 bitnum = parent->bitnum; 511 if ((bitnum & 7) == 0) { 512 node->bitnum = bitnum + 7 + 8; 513 node->nextbyte = 1; 514 } else { 515 node->bitnum = bitnum - 1; 516 node->nextbyte = 0; 517 } 518 } else { 519 node->bitnum = 7; 520 node->nextbyte = 0; 521 } 522 523 return node; 524 } 525 526 /* 527 * Insert a new leaf into the tree, and collapse any subtrees that are 528 * fully populated and end in identical leaves. A nextbyte tagged 529 * internal node will not be removed to preserve the tree's integrity. 530 * Note that due to the structure of utf8, no nextbyte tagged node 531 * will be a candidate for removal. 532 */ 533 static int insert(struct tree *tree, char *key, int keylen, void *leaf) 534 { 535 struct node *node; 536 struct node *parent; 537 void **cursor; 538 int keybits; 539 540 assert(keylen >= 1 && keylen <= 4); 541 542 node = NULL; 543 cursor = &tree->root; 544 keybits = 8 * keylen; 545 546 /* Insert, creating path along the way. 
*/ 547 while (keybits) { 548 if (!*cursor) 549 *cursor = alloc_node(node); 550 node = *cursor; 551 if (node->nextbyte) 552 key++; 553 if (*key & (1 << (node->bitnum & 7))) 554 cursor = &node->right; 555 else 556 cursor = &node->left; 557 keybits--; 558 } 559 *cursor = leaf; 560 561 /* Merge subtrees if possible. */ 562 while (node) { 563 if (*key & (1 << (node->bitnum & 7))) 564 node->rightnode = LEAF; 565 else 566 node->leftnode = LEAF; 567 if (node->nextbyte) 568 break; 569 if (node->leftnode == NODE || node->rightnode == NODE) 570 break; 571 assert(node->left); 572 assert(node->right); 573 /* Compare */ 574 if (! tree->leaf_equal(node->left, node->right)) 575 break; 576 /* Keep left, drop right leaf. */ 577 leaf = node->left; 578 /* Check in parent */ 579 parent = node->parent; 580 if (!parent) { 581 /* root of tree! */ 582 tree->root = leaf; 583 tree->childnode = LEAF; 584 } else if (parent->left == node) { 585 parent->left = leaf; 586 parent->leftnode = LEAF; 587 if (parent->right) { 588 parent->keymask = 0; 589 parent->keybits = 0; 590 } else { 591 parent->keymask |= (1 << node->bitnum); 592 } 593 } else if (parent->right == node) { 594 parent->right = leaf; 595 parent->rightnode = LEAF; 596 if (parent->left) { 597 parent->keymask = 0; 598 parent->keybits = 0; 599 } else { 600 parent->keymask |= (1 << node->bitnum); 601 parent->keybits |= (1 << node->bitnum); 602 } 603 } else { 604 /* internal tree error */ 605 assert(0); 606 } 607 free(node); 608 node = parent; 609 } 610 611 /* Propagate keymasks up along singleton chains. */ 612 while (node) { 613 parent = node->parent; 614 if (!parent) 615 break; 616 /* Nix the mask for parents with two children. 
*/ 617 if (node->keymask == 0) { 618 parent->keymask = 0; 619 parent->keybits = 0; 620 } else if (parent->left && parent->right) { 621 parent->keymask = 0; 622 parent->keybits = 0; 623 } else { 624 assert((parent->keymask & node->keymask) == 0); 625 parent->keymask |= node->keymask; 626 parent->keymask |= (1 << parent->bitnum); 627 parent->keybits |= node->keybits; 628 if (parent->right) 629 parent->keybits |= (1 << parent->bitnum); 630 } 631 node = parent; 632 } 633 634 return 0; 635 } 636 637 /* 638 * Prune internal nodes. 639 * 640 * Fully populated subtrees that end at the same leaf have already 641 * been collapsed. There are still internal nodes that have for both 642 * their left and right branches a sequence of singletons that make 643 * identical choices and end in identical leaves. The keymask and 644 * keybits collected in the nodes describe the choices made in these 645 * singleton chains. When they are identical for the left and right 646 * branch of a node, and the two leaves comare identical, the node in 647 * question can be removed. 648 * 649 * Note that nodes with the nextbyte tag set will not be removed by 650 * this to ensure tree integrity. Note as well that the structure of 651 * utf8 ensures that these nodes would not have been candidates for 652 * removal in any case. 
653 */ 654 static void prune(struct tree *tree) 655 { 656 struct node *node; 657 struct node *left; 658 struct node *right; 659 struct node *parent; 660 void *leftleaf; 661 void *rightleaf; 662 unsigned int leftmask; 663 unsigned int rightmask; 664 unsigned int bitmask; 665 int count; 666 667 if (verbose > 0) 668 printf("Pruning %s_%x\n", tree->type, tree->maxage); 669 670 count = 0; 671 if (tree->childnode == LEAF) 672 return; 673 if (!tree->root) 674 return; 675 676 leftmask = rightmask = 0; 677 node = tree->root; 678 while (node) { 679 if (node->nextbyte) 680 goto advance; 681 if (node->leftnode == LEAF) 682 goto advance; 683 if (node->rightnode == LEAF) 684 goto advance; 685 if (!node->left) 686 goto advance; 687 if (!node->right) 688 goto advance; 689 left = node->left; 690 right = node->right; 691 if (left->keymask == 0) 692 goto advance; 693 if (right->keymask == 0) 694 goto advance; 695 if (left->keymask != right->keymask) 696 goto advance; 697 if (left->keybits != right->keybits) 698 goto advance; 699 leftleaf = NULL; 700 while (!leftleaf) { 701 assert(left->left || left->right); 702 if (left->leftnode == LEAF) 703 leftleaf = left->left; 704 else if (left->rightnode == LEAF) 705 leftleaf = left->right; 706 else if (left->left) 707 left = left->left; 708 else if (left->right) 709 left = left->right; 710 else 711 assert(0); 712 } 713 rightleaf = NULL; 714 while (!rightleaf) { 715 assert(right->left || right->right); 716 if (right->leftnode == LEAF) 717 rightleaf = right->left; 718 else if (right->rightnode == LEAF) 719 rightleaf = right->right; 720 else if (right->left) 721 right = right->left; 722 else if (right->right) 723 right = right->right; 724 else 725 assert(0); 726 } 727 if (! tree->leaf_equal(leftleaf, rightleaf)) 728 goto advance; 729 /* 730 * This node has identical singleton-only subtrees. 731 * Remove it. 
732 */ 733 parent = node->parent; 734 left = node->left; 735 right = node->right; 736 if (parent->left == node) 737 parent->left = left; 738 else if (parent->right == node) 739 parent->right = left; 740 else 741 assert(0); 742 left->parent = parent; 743 left->keymask |= (1 << node->bitnum); 744 node->left = NULL; 745 while (node) { 746 bitmask = 1 << node->bitnum; 747 leftmask &= ~bitmask; 748 rightmask &= ~bitmask; 749 if (node->leftnode == NODE && node->left) { 750 left = node->left; 751 free(node); 752 count++; 753 node = left; 754 } else if (node->rightnode == NODE && node->right) { 755 right = node->right; 756 free(node); 757 count++; 758 node = right; 759 } else { 760 node = NULL; 761 } 762 } 763 /* Propagate keymasks up along singleton chains. */ 764 node = parent; 765 /* Force re-check */ 766 bitmask = 1 << node->bitnum; 767 leftmask &= ~bitmask; 768 rightmask &= ~bitmask; 769 for (;;) { 770 if (node->left && node->right) 771 break; 772 if (node->left) { 773 left = node->left; 774 node->keymask |= left->keymask; 775 node->keybits |= left->keybits; 776 } 777 if (node->right) { 778 right = node->right; 779 node->keymask |= right->keymask; 780 node->keybits |= right->keybits; 781 } 782 node->keymask |= (1 << node->bitnum); 783 node = node->parent; 784 /* Force re-check */ 785 bitmask = 1 << node->bitnum; 786 leftmask &= ~bitmask; 787 rightmask &= ~bitmask; 788 } 789 advance: 790 bitmask = 1 << node->bitnum; 791 if ((leftmask & bitmask) == 0 && 792 node->leftnode == NODE && 793 node->left) { 794 leftmask |= bitmask; 795 node = node->left; 796 } else if ((rightmask & bitmask) == 0 && 797 node->rightnode == NODE && 798 node->right) { 799 rightmask |= bitmask; 800 node = node->right; 801 } else { 802 leftmask &= ~bitmask; 803 rightmask &= ~bitmask; 804 node = node->parent; 805 } 806 } 807 if (verbose > 0) 808 printf("Pruned %d nodes\n", count); 809 } 810 811 /* 812 * Mark the nodes in the tree that lead to leaves that must be 813 * emitted. 
814 */ 815 static void mark_nodes(struct tree *tree) 816 { 817 struct node *node; 818 struct node *n; 819 unsigned int leftmask; 820 unsigned int rightmask; 821 unsigned int bitmask; 822 int marked; 823 824 marked = 0; 825 if (verbose > 0) 826 printf("Marking %s_%x\n", tree->type, tree->maxage); 827 if (tree->childnode == LEAF) 828 goto done; 829 830 assert(tree->childnode == NODE); 831 node = tree->root; 832 leftmask = rightmask = 0; 833 while (node) { 834 bitmask = 1 << node->bitnum; 835 if ((leftmask & bitmask) == 0) { 836 leftmask |= bitmask; 837 if (node->leftnode == LEAF) { 838 assert(node->left); 839 if (tree->leaf_mark(node->left)) { 840 n = node; 841 while (n && !n->mark) { 842 marked++; 843 n->mark = 1; 844 n = n->parent; 845 } 846 } 847 } else if (node->left) { 848 assert(node->leftnode == NODE); 849 node = node->left; 850 continue; 851 } 852 } 853 if ((rightmask & bitmask) == 0) { 854 rightmask |= bitmask; 855 if (node->rightnode == LEAF) { 856 assert(node->right); 857 if (tree->leaf_mark(node->right)) { 858 n = node; 859 while (n && !n->mark) { 860 marked++; 861 n->mark = 1; 862 n = n->parent; 863 } 864 } 865 } else if (node->right) { 866 assert(node->rightnode == NODE); 867 node = node->right; 868 continue; 869 } 870 } 871 leftmask &= ~bitmask; 872 rightmask &= ~bitmask; 873 node = node->parent; 874 } 875 876 /* second pass: left siblings and singletons */ 877 878 assert(tree->childnode == NODE); 879 node = tree->root; 880 leftmask = rightmask = 0; 881 while (node) { 882 bitmask = 1 << node->bitnum; 883 if ((leftmask & bitmask) == 0) { 884 leftmask |= bitmask; 885 if (node->leftnode == LEAF) { 886 assert(node->left); 887 if (tree->leaf_mark(node->left)) { 888 n = node; 889 while (n && !n->mark) { 890 marked++; 891 n->mark = 1; 892 n = n->parent; 893 } 894 } 895 } else if (node->left) { 896 assert(node->leftnode == NODE); 897 node = node->left; 898 if (!node->mark && node->parent->mark) { 899 marked++; 900 node->mark = 1; 901 } 902 continue; 903 } 904 
} 905 if ((rightmask & bitmask) == 0) { 906 rightmask |= bitmask; 907 if (node->rightnode == LEAF) { 908 assert(node->right); 909 if (tree->leaf_mark(node->right)) { 910 n = node; 911 while (n && !n->mark) { 912 marked++; 913 n->mark = 1; 914 n = n->parent; 915 } 916 } 917 } else if (node->right) { 918 assert(node->rightnode == NODE); 919 node = node->right; 920 if (!node->mark && node->parent->mark && 921 !node->parent->left) { 922 marked++; 923 node->mark = 1; 924 } 925 continue; 926 } 927 } 928 leftmask &= ~bitmask; 929 rightmask &= ~bitmask; 930 node = node->parent; 931 } 932 done: 933 if (verbose > 0) 934 printf("Marked %d nodes\n", marked); 935 } 936 937 /* 938 * Compute the index of each node and leaf, which is the offset in the 939 * emitted trie. These values must be pre-computed because relative 940 * offsets between nodes are used to navigate the tree. 941 */ 942 static int index_nodes(struct tree *tree, int index) 943 { 944 struct node *node; 945 unsigned int leftmask; 946 unsigned int rightmask; 947 unsigned int bitmask; 948 int count; 949 int indent; 950 951 /* Align to a cache line (or half a cache line?). 
*/ 952 while (index % 64) 953 index++; 954 tree->index = index; 955 indent = 1; 956 count = 0; 957 958 if (verbose > 0) 959 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); 960 if (tree->childnode == LEAF) { 961 index += tree->leaf_size(tree->root); 962 goto done; 963 } 964 965 assert(tree->childnode == NODE); 966 node = tree->root; 967 leftmask = rightmask = 0; 968 while (node) { 969 if (!node->mark) 970 goto skip; 971 count++; 972 if (node->index != index) 973 node->index = index; 974 index += node->size; 975 skip: 976 while (node) { 977 bitmask = 1 << node->bitnum; 978 if (node->mark && (leftmask & bitmask) == 0) { 979 leftmask |= bitmask; 980 if (node->leftnode == LEAF) { 981 assert(node->left); 982 *tree->leaf_index(tree, node->left) = 983 index; 984 index += tree->leaf_size(node->left); 985 count++; 986 } else if (node->left) { 987 assert(node->leftnode == NODE); 988 indent += 1; 989 node = node->left; 990 break; 991 } 992 } 993 if (node->mark && (rightmask & bitmask) == 0) { 994 rightmask |= bitmask; 995 if (node->rightnode == LEAF) { 996 assert(node->right); 997 *tree->leaf_index(tree, node->right) = index; 998 index += tree->leaf_size(node->right); 999 count++; 1000 } else if (node->right) { 1001 assert(node->rightnode == NODE); 1002 indent += 1; 1003 node = node->right; 1004 break; 1005 } 1006 } 1007 leftmask &= ~bitmask; 1008 rightmask &= ~bitmask; 1009 node = node->parent; 1010 indent -= 1; 1011 } 1012 } 1013 done: 1014 /* Round up to a multiple of 16 */ 1015 while (index % 16) 1016 index++; 1017 if (verbose > 0) 1018 printf("Final index %d\n", index); 1019 return index; 1020 } 1021 1022 /* 1023 * Mark the nodes in a subtree, helper for size_nodes(). 
1024 */ 1025 static int mark_subtree(struct node *node) 1026 { 1027 int changed; 1028 1029 if (!node || node->mark) 1030 return 0; 1031 node->mark = 1; 1032 node->index = node->parent->index; 1033 changed = 1; 1034 if (node->leftnode == NODE) 1035 changed += mark_subtree(node->left); 1036 if (node->rightnode == NODE) 1037 changed += mark_subtree(node->right); 1038 return changed; 1039 } 1040 1041 /* 1042 * Compute the size of nodes and leaves. We start by assuming that 1043 * each node needs to store a three-byte offset. The indexes of the 1044 * nodes are calculated based on that, and then this function is 1045 * called to see if the sizes of some nodes can be reduced. This is 1046 * repeated until no more changes are seen. 1047 */ 1048 static int size_nodes(struct tree *tree) 1049 { 1050 struct tree *next; 1051 struct node *node; 1052 struct node *right; 1053 struct node *n; 1054 unsigned int leftmask; 1055 unsigned int rightmask; 1056 unsigned int bitmask; 1057 unsigned int pathbits; 1058 unsigned int pathmask; 1059 unsigned int nbit; 1060 int changed; 1061 int offset; 1062 int size; 1063 int indent; 1064 1065 indent = 1; 1066 changed = 0; 1067 size = 0; 1068 1069 if (verbose > 0) 1070 printf("Sizing %s_%x\n", tree->type, tree->maxage); 1071 if (tree->childnode == LEAF) 1072 goto done; 1073 1074 assert(tree->childnode == NODE); 1075 pathbits = 0; 1076 pathmask = 0; 1077 node = tree->root; 1078 leftmask = rightmask = 0; 1079 while (node) { 1080 if (!node->mark) 1081 goto skip; 1082 offset = 0; 1083 if (!node->left || !node->right) { 1084 size = 1; 1085 } else { 1086 if (node->rightnode == NODE) { 1087 /* 1088 * If the right node is not marked, 1089 * look for a corresponding node in 1090 * the next tree. Such a node need 1091 * not exist. 
1092 */ 1093 right = node->right; 1094 next = tree->next; 1095 while (!right->mark) { 1096 assert(next); 1097 n = next->root; 1098 while (n->bitnum != node->bitnum) { 1099 nbit = 1 << n->bitnum; 1100 if (!(pathmask & nbit)) 1101 break; 1102 if (pathbits & nbit) { 1103 if (n->rightnode == LEAF) 1104 break; 1105 n = n->right; 1106 } else { 1107 if (n->leftnode == LEAF) 1108 break; 1109 n = n->left; 1110 } 1111 } 1112 if (n->bitnum != node->bitnum) 1113 break; 1114 n = n->right; 1115 right = n; 1116 next = next->next; 1117 } 1118 /* Make sure the right node is marked. */ 1119 if (!right->mark) 1120 changed += mark_subtree(right); 1121 offset = right->index - node->index; 1122 } else { 1123 offset = *tree->leaf_index(tree, node->right); 1124 offset -= node->index; 1125 } 1126 assert(offset >= 0); 1127 assert(offset <= 0xffffff); 1128 if (offset <= 0xff) { 1129 size = 2; 1130 } else if (offset <= 0xffff) { 1131 size = 3; 1132 } else { /* offset <= 0xffffff */ 1133 size = 4; 1134 } 1135 } 1136 if (node->size != size || node->offset != offset) { 1137 node->size = size; 1138 node->offset = offset; 1139 changed++; 1140 } 1141 skip: 1142 while (node) { 1143 bitmask = 1 << node->bitnum; 1144 pathmask |= bitmask; 1145 if (node->mark && (leftmask & bitmask) == 0) { 1146 leftmask |= bitmask; 1147 if (node->leftnode == LEAF) { 1148 assert(node->left); 1149 } else if (node->left) { 1150 assert(node->leftnode == NODE); 1151 indent += 1; 1152 node = node->left; 1153 break; 1154 } 1155 } 1156 if (node->mark && (rightmask & bitmask) == 0) { 1157 rightmask |= bitmask; 1158 pathbits |= bitmask; 1159 if (node->rightnode == LEAF) { 1160 assert(node->right); 1161 } else if (node->right) { 1162 assert(node->rightnode == NODE); 1163 indent += 1; 1164 node = node->right; 1165 break; 1166 } 1167 } 1168 leftmask &= ~bitmask; 1169 rightmask &= ~bitmask; 1170 pathmask &= ~bitmask; 1171 pathbits &= ~bitmask; 1172 node = node->parent; 1173 indent -= 1; 1174 } 1175 } 1176 done: 1177 if (verbose > 0) 
1178 printf("Found %d changes\n", changed); 1179 return changed; 1180 } 1181 1182 /* 1183 * Emit a trie for the given tree into the data array. 1184 */ 1185 static void emit(struct tree *tree, unsigned char *data) 1186 { 1187 struct node *node; 1188 unsigned int leftmask; 1189 unsigned int rightmask; 1190 unsigned int bitmask; 1191 int offlen; 1192 int offset; 1193 int index; 1194 int indent; 1195 int size; 1196 int bytes; 1197 int leaves; 1198 int nodes[4]; 1199 unsigned char byte; 1200 1201 nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; 1202 leaves = 0; 1203 bytes = 0; 1204 index = tree->index; 1205 data += index; 1206 indent = 1; 1207 if (verbose > 0) 1208 printf("Emitting %s_%x\n", tree->type, tree->maxage); 1209 if (tree->childnode == LEAF) { 1210 assert(tree->root); 1211 tree->leaf_emit(tree->root, data); 1212 size = tree->leaf_size(tree->root); 1213 index += size; 1214 leaves++; 1215 goto done; 1216 } 1217 1218 assert(tree->childnode == NODE); 1219 node = tree->root; 1220 leftmask = rightmask = 0; 1221 while (node) { 1222 if (!node->mark) 1223 goto skip; 1224 assert(node->offset != -1); 1225 assert(node->index == index); 1226 1227 byte = 0; 1228 if (node->nextbyte) 1229 byte |= NEXTBYTE; 1230 byte |= (node->bitnum & BITNUM); 1231 if (node->left && node->right) { 1232 if (node->leftnode == NODE) 1233 byte |= LEFTNODE; 1234 if (node->rightnode == NODE) 1235 byte |= RIGHTNODE; 1236 if (node->offset <= 0xff) 1237 offlen = 1; 1238 else if (node->offset <= 0xffff) 1239 offlen = 2; 1240 else 1241 offlen = 3; 1242 nodes[offlen]++; 1243 offset = node->offset; 1244 byte |= offlen << OFFLEN_SHIFT; 1245 *data++ = byte; 1246 index++; 1247 while (offlen--) { 1248 *data++ = offset & 0xff; 1249 index++; 1250 offset >>= 8; 1251 } 1252 } else if (node->left) { 1253 if (node->leftnode == NODE) 1254 byte |= TRIENODE; 1255 nodes[0]++; 1256 *data++ = byte; 1257 index++; 1258 } else if (node->right) { 1259 byte |= RIGHTNODE; 1260 if (node->rightnode == NODE) 1261 byte |= TRIENODE; 
1262 nodes[0]++; 1263 *data++ = byte; 1264 index++; 1265 } else { 1266 assert(0); 1267 } 1268 skip: 1269 while (node) { 1270 bitmask = 1 << node->bitnum; 1271 if (node->mark && (leftmask & bitmask) == 0) { 1272 leftmask |= bitmask; 1273 if (node->leftnode == LEAF) { 1274 assert(node->left); 1275 data = tree->leaf_emit(node->left, 1276 data); 1277 size = tree->leaf_size(node->left); 1278 index += size; 1279 bytes += size; 1280 leaves++; 1281 } else if (node->left) { 1282 assert(node->leftnode == NODE); 1283 indent += 1; 1284 node = node->left; 1285 break; 1286 } 1287 } 1288 if (node->mark && (rightmask & bitmask) == 0) { 1289 rightmask |= bitmask; 1290 if (node->rightnode == LEAF) { 1291 assert(node->right); 1292 data = tree->leaf_emit(node->right, 1293 data); 1294 size = tree->leaf_size(node->right); 1295 index += size; 1296 bytes += size; 1297 leaves++; 1298 } else if (node->right) { 1299 assert(node->rightnode == NODE); 1300 indent += 1; 1301 node = node->right; 1302 break; 1303 } 1304 } 1305 leftmask &= ~bitmask; 1306 rightmask &= ~bitmask; 1307 node = node->parent; 1308 indent -= 1; 1309 } 1310 } 1311 done: 1312 if (verbose > 0) { 1313 printf("Emitted %d (%d) leaves", 1314 leaves, bytes); 1315 printf(" %d (%d+%d+%d+%d) nodes", 1316 nodes[0] + nodes[1] + nodes[2] + nodes[3], 1317 nodes[0], nodes[1], nodes[2], nodes[3]); 1318 printf(" %d total\n", index - tree->index); 1319 } 1320 } 1321 1322 /* ------------------------------------------------------------------ */ 1323 1324 /* 1325 * Unicode data. 1326 * 1327 * We need to keep track of the Canonical Combining Class, the Age, 1328 * and decompositions for a code point. 1329 * 1330 * For the Age, we store the index into the ages table. Effectively 1331 * this is a generation number that the table maps to a unicode 1332 * version. 1333 * 1334 * The correction field is used to indicate that this entry is in the 1335 * corrections array, which contains decompositions that were 1336 * corrected in later revisions. 
The value of the correction field is 1337 * the Unicode version in which the mapping was corrected. 1338 */ 1339 struct unicode_data { 1340 unsigned int code; 1341 int ccc; 1342 int gen; 1343 int correction; 1344 unsigned int *utf32nfdi; 1345 unsigned int *utf32nfdicf; 1346 char *utf8nfdi; 1347 char *utf8nfdicf; 1348 }; 1349 1350 struct unicode_data unicode_data[0x110000]; 1351 struct unicode_data *corrections; 1352 int corrections_count; 1353 1354 struct tree *nfdi_tree; 1355 struct tree *nfdicf_tree; 1356 1357 struct tree *trees; 1358 int trees_count; 1359 1360 /* 1361 * Check the corrections array to see if this entry was corrected at 1362 * some point. 1363 */ 1364 static struct unicode_data *corrections_lookup(struct unicode_data *u) 1365 { 1366 int i; 1367 1368 for (i = 0; i != corrections_count; i++) 1369 if (u->code == corrections[i].code) 1370 return &corrections[i]; 1371 return u; 1372 } 1373 1374 static int nfdi_equal(void *l, void *r) 1375 { 1376 struct unicode_data *left = l; 1377 struct unicode_data *right = r; 1378 1379 if (left->gen != right->gen) 1380 return 0; 1381 if (left->ccc != right->ccc) 1382 return 0; 1383 if (left->utf8nfdi && right->utf8nfdi && 1384 strcmp(left->utf8nfdi, right->utf8nfdi) == 0) 1385 return 1; 1386 if (left->utf8nfdi || right->utf8nfdi) 1387 return 0; 1388 return 1; 1389 } 1390 1391 static int nfdicf_equal(void *l, void *r) 1392 { 1393 struct unicode_data *left = l; 1394 struct unicode_data *right = r; 1395 1396 if (left->gen != right->gen) 1397 return 0; 1398 if (left->ccc != right->ccc) 1399 return 0; 1400 if (left->utf8nfdicf && right->utf8nfdicf && 1401 strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) 1402 return 1; 1403 if (left->utf8nfdicf && right->utf8nfdicf) 1404 return 0; 1405 if (left->utf8nfdicf || right->utf8nfdicf) 1406 return 0; 1407 if (left->utf8nfdi && right->utf8nfdi && 1408 strcmp(left->utf8nfdi, right->utf8nfdi) == 0) 1409 return 1; 1410 if (left->utf8nfdi || right->utf8nfdi) 1411 return 0; 1412 
return 1; 1413 } 1414 1415 static void nfdi_print(void *l, int indent) 1416 { 1417 struct unicode_data *leaf = l; 1418 1419 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, 1420 leaf->code, leaf->ccc, leaf->gen); 1421 1422 if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) 1423 printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); 1424 else if (leaf->utf8nfdi) 1425 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); 1426 1427 printf("\n"); 1428 } 1429 1430 static void nfdicf_print(void *l, int indent) 1431 { 1432 struct unicode_data *leaf = l; 1433 1434 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, 1435 leaf->code, leaf->ccc, leaf->gen); 1436 1437 if (leaf->utf8nfdicf) 1438 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); 1439 else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) 1440 printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); 1441 else if (leaf->utf8nfdi) 1442 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); 1443 printf("\n"); 1444 } 1445 1446 static int nfdi_mark(void *l) 1447 { 1448 return 1; 1449 } 1450 1451 static int nfdicf_mark(void *l) 1452 { 1453 struct unicode_data *leaf = l; 1454 1455 if (leaf->utf8nfdicf) 1456 return 1; 1457 return 0; 1458 } 1459 1460 static int correction_mark(void *l) 1461 { 1462 struct unicode_data *leaf = l; 1463 1464 return leaf->correction; 1465 } 1466 1467 static int nfdi_size(void *l) 1468 { 1469 struct unicode_data *leaf = l; 1470 int size = 2; 1471 1472 if (HANGUL_SYLLABLE(leaf->code)) 1473 size += 1; 1474 else if (leaf->utf8nfdi) 1475 size += strlen(leaf->utf8nfdi) + 1; 1476 return size; 1477 } 1478 1479 static int nfdicf_size(void *l) 1480 { 1481 struct unicode_data *leaf = l; 1482 int size = 2; 1483 1484 if (HANGUL_SYLLABLE(leaf->code)) 1485 size += 1; 1486 else if (leaf->utf8nfdicf) 1487 size += strlen(leaf->utf8nfdicf) + 1; 1488 else if (leaf->utf8nfdi) 1489 size += strlen(leaf->utf8nfdi) + 1; 1490 return size; 1491 } 1492 1493 static int *nfdi_index(struct tree *tree, void *l) 1494 { 
1495 struct unicode_data *leaf = l; 1496 1497 return &tree->leafindex[leaf->code]; 1498 } 1499 1500 static int *nfdicf_index(struct tree *tree, void *l) 1501 { 1502 struct unicode_data *leaf = l; 1503 1504 return &tree->leafindex[leaf->code]; 1505 } 1506 1507 static unsigned char *nfdi_emit(void *l, unsigned char *data) 1508 { 1509 struct unicode_data *leaf = l; 1510 unsigned char *s; 1511 1512 *data++ = leaf->gen; 1513 1514 if (HANGUL_SYLLABLE(leaf->code)) { 1515 *data++ = DECOMPOSE; 1516 *data++ = HANGUL; 1517 } else if (leaf->utf8nfdi) { 1518 *data++ = DECOMPOSE; 1519 s = (unsigned char*)leaf->utf8nfdi; 1520 while ((*data++ = *s++) != 0) 1521 ; 1522 } else { 1523 *data++ = leaf->ccc; 1524 } 1525 return data; 1526 } 1527 1528 static unsigned char *nfdicf_emit(void *l, unsigned char *data) 1529 { 1530 struct unicode_data *leaf = l; 1531 unsigned char *s; 1532 1533 *data++ = leaf->gen; 1534 1535 if (HANGUL_SYLLABLE(leaf->code)) { 1536 *data++ = DECOMPOSE; 1537 *data++ = HANGUL; 1538 } else if (leaf->utf8nfdicf) { 1539 *data++ = DECOMPOSE; 1540 s = (unsigned char*)leaf->utf8nfdicf; 1541 while ((*data++ = *s++) != 0) 1542 ; 1543 } else if (leaf->utf8nfdi) { 1544 *data++ = DECOMPOSE; 1545 s = (unsigned char*)leaf->utf8nfdi; 1546 while ((*data++ = *s++) != 0) 1547 ; 1548 } else { 1549 *data++ = leaf->ccc; 1550 } 1551 return data; 1552 } 1553 1554 static void utf8_create(struct unicode_data *data) 1555 { 1556 char utf[18*4+1]; 1557 char *u; 1558 unsigned int *um; 1559 int i; 1560 1561 if (data->utf8nfdi) { 1562 assert(data->utf8nfdi[0] == HANGUL); 1563 return; 1564 } 1565 1566 u = utf; 1567 um = data->utf32nfdi; 1568 if (um) { 1569 for (i = 0; um[i]; i++) 1570 u += utf8encode(u, um[i]); 1571 *u = '\0'; 1572 data->utf8nfdi = strdup(utf); 1573 } 1574 u = utf; 1575 um = data->utf32nfdicf; 1576 if (um) { 1577 for (i = 0; um[i]; i++) 1578 u += utf8encode(u, um[i]); 1579 *u = '\0'; 1580 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) 1581 data->utf8nfdicf = strdup(utf); 
1582 } 1583 } 1584 1585 static void utf8_init(void) 1586 { 1587 unsigned int unichar; 1588 int i; 1589 1590 for (unichar = 0; unichar != 0x110000; unichar++) 1591 utf8_create(&unicode_data[unichar]); 1592 1593 for (i = 0; i != corrections_count; i++) 1594 utf8_create(&corrections[i]); 1595 } 1596 1597 static void trees_init(void) 1598 { 1599 struct unicode_data *data; 1600 unsigned int maxage; 1601 unsigned int nextage; 1602 int count; 1603 int i; 1604 int j; 1605 1606 /* Count the number of different ages. */ 1607 count = 0; 1608 nextage = (unsigned int)-1; 1609 do { 1610 maxage = nextage; 1611 nextage = 0; 1612 for (i = 0; i <= corrections_count; i++) { 1613 data = &corrections[i]; 1614 if (nextage < data->correction && 1615 data->correction < maxage) 1616 nextage = data->correction; 1617 } 1618 count++; 1619 } while (nextage); 1620 1621 /* Two trees per age: nfdi and nfdicf */ 1622 trees_count = count * 2; 1623 trees = calloc(trees_count, sizeof(struct tree)); 1624 1625 /* Assign ages to the trees. */ 1626 count = trees_count; 1627 nextage = (unsigned int)-1; 1628 do { 1629 maxage = nextage; 1630 trees[--count].maxage = maxage; 1631 trees[--count].maxage = maxage; 1632 nextage = 0; 1633 for (i = 0; i <= corrections_count; i++) { 1634 data = &corrections[i]; 1635 if (nextage < data->correction && 1636 data->correction < maxage) 1637 nextage = data->correction; 1638 } 1639 } while (nextage); 1640 1641 /* The ages assigned above are off by one. */ 1642 for (i = 0; i != trees_count; i++) { 1643 j = 0; 1644 while (ages[j] < trees[i].maxage) 1645 j++; 1646 trees[i].maxage = ages[j-1]; 1647 } 1648 1649 /* Set up the forwarding between trees. 
*/ 1650 trees[trees_count-2].next = &trees[trees_count-1]; 1651 trees[trees_count-1].leaf_mark = nfdi_mark; 1652 trees[trees_count-2].leaf_mark = nfdicf_mark; 1653 for (i = 0; i != trees_count-2; i += 2) { 1654 trees[i].next = &trees[trees_count-2]; 1655 trees[i].leaf_mark = correction_mark; 1656 trees[i+1].next = &trees[trees_count-1]; 1657 trees[i+1].leaf_mark = correction_mark; 1658 } 1659 1660 /* Assign the callouts. */ 1661 for (i = 0; i != trees_count; i += 2) { 1662 trees[i].type = "nfdicf"; 1663 trees[i].leaf_equal = nfdicf_equal; 1664 trees[i].leaf_print = nfdicf_print; 1665 trees[i].leaf_size = nfdicf_size; 1666 trees[i].leaf_index = nfdicf_index; 1667 trees[i].leaf_emit = nfdicf_emit; 1668 1669 trees[i+1].type = "nfdi"; 1670 trees[i+1].leaf_equal = nfdi_equal; 1671 trees[i+1].leaf_print = nfdi_print; 1672 trees[i+1].leaf_size = nfdi_size; 1673 trees[i+1].leaf_index = nfdi_index; 1674 trees[i+1].leaf_emit = nfdi_emit; 1675 } 1676 1677 /* Finish init. */ 1678 for (i = 0; i != trees_count; i++) 1679 trees[i].childnode = NODE; 1680 } 1681 1682 static void trees_populate(void) 1683 { 1684 struct unicode_data *data; 1685 unsigned int unichar; 1686 char keyval[4]; 1687 int keylen; 1688 int i; 1689 1690 for (i = 0; i != trees_count; i++) { 1691 if (verbose > 0) { 1692 printf("Populating %s_%x\n", 1693 trees[i].type, trees[i].maxage); 1694 } 1695 for (unichar = 0; unichar != 0x110000; unichar++) { 1696 if (unicode_data[unichar].gen < 0) 1697 continue; 1698 keylen = utf8encode(keyval, unichar); 1699 data = corrections_lookup(&unicode_data[unichar]); 1700 if (data->correction <= trees[i].maxage) 1701 data = &unicode_data[unichar]; 1702 insert(&trees[i], keyval, keylen, data); 1703 } 1704 } 1705 } 1706 1707 static void trees_reduce(void) 1708 { 1709 int i; 1710 int size; 1711 int changed; 1712 1713 for (i = 0; i != trees_count; i++) 1714 prune(&trees[i]); 1715 for (i = 0; i != trees_count; i++) 1716 mark_nodes(&trees[i]); 1717 do { 1718 size = 0; 1719 for (i = 0; i 
!= trees_count; i++) 1720 size = index_nodes(&trees[i], size); 1721 changed = 0; 1722 for (i = 0; i != trees_count; i++) 1723 changed += size_nodes(&trees[i]); 1724 } while (changed); 1725 1726 utf8data = calloc(size, 1); 1727 utf8data_size = size; 1728 for (i = 0; i != trees_count; i++) 1729 emit(&trees[i], utf8data); 1730 1731 if (verbose > 0) { 1732 for (i = 0; i != trees_count; i++) { 1733 printf("%s_%x idx %d\n", 1734 trees[i].type, trees[i].maxage, trees[i].index); 1735 } 1736 } 1737 1738 nfdi = utf8data + trees[trees_count-1].index; 1739 nfdicf = utf8data + trees[trees_count-2].index; 1740 1741 nfdi_tree = &trees[trees_count-1]; 1742 nfdicf_tree = &trees[trees_count-2]; 1743 } 1744 1745 static void verify(struct tree *tree) 1746 { 1747 struct unicode_data *data; 1748 utf8leaf_t *leaf; 1749 unsigned int unichar; 1750 char key[4]; 1751 unsigned char hangul[UTF8HANGULLEAF]; 1752 int report; 1753 int nocf; 1754 1755 if (verbose > 0) 1756 printf("Verifying %s_%x\n", tree->type, tree->maxage); 1757 nocf = strcmp(tree->type, "nfdicf"); 1758 1759 for (unichar = 0; unichar != 0x110000; unichar++) { 1760 report = 0; 1761 data = corrections_lookup(&unicode_data[unichar]); 1762 if (data->correction <= tree->maxage) 1763 data = &unicode_data[unichar]; 1764 utf8encode(key,unichar); 1765 leaf = utf8lookup(tree, hangul, key); 1766 1767 if (!leaf) { 1768 if (data->gen != -1) 1769 report++; 1770 if (unichar < 0xd800 || unichar > 0xdfff) 1771 report++; 1772 } else { 1773 if (unichar >= 0xd800 && unichar <= 0xdfff) 1774 report++; 1775 if (data->gen == -1) 1776 report++; 1777 if (data->gen != LEAF_GEN(leaf)) 1778 report++; 1779 if (LEAF_CCC(leaf) == DECOMPOSE) { 1780 if (HANGUL_SYLLABLE(data->code)) { 1781 if (data->utf8nfdi[0] != HANGUL) 1782 report++; 1783 } else if (nocf) { 1784 if (!data->utf8nfdi) { 1785 report++; 1786 } else if (strcmp(data->utf8nfdi, 1787 LEAF_STR(leaf))) { 1788 report++; 1789 } 1790 } else { 1791 if (!data->utf8nfdicf && 1792 !data->utf8nfdi) { 1793 
report++; 1794 } else if (data->utf8nfdicf) { 1795 if (strcmp(data->utf8nfdicf, 1796 LEAF_STR(leaf))) 1797 report++; 1798 } else if (strcmp(data->utf8nfdi, 1799 LEAF_STR(leaf))) { 1800 report++; 1801 } 1802 } 1803 } else if (data->ccc != LEAF_CCC(leaf)) { 1804 report++; 1805 } 1806 } 1807 if (report) { 1808 printf("%X code %X gen %d ccc %d" 1809 " nfdi -> \"%s\"", 1810 unichar, data->code, data->gen, 1811 data->ccc, 1812 data->utf8nfdi); 1813 if (leaf) { 1814 printf(" gen %d ccc %d" 1815 " nfdi -> \"%s\"", 1816 LEAF_GEN(leaf), 1817 LEAF_CCC(leaf), 1818 LEAF_CCC(leaf) == DECOMPOSE ? 1819 LEAF_STR(leaf) : ""); 1820 } 1821 printf("\n"); 1822 } 1823 } 1824 } 1825 1826 static void trees_verify(void) 1827 { 1828 int i; 1829 1830 for (i = 0; i != trees_count; i++) 1831 verify(&trees[i]); 1832 } 1833 1834 /* ------------------------------------------------------------------ */ 1835 1836 static void help(void) 1837 { 1838 printf("Usage: %s [options]\n", argv0); 1839 printf("\n"); 1840 printf("This program creates an a data trie used for parsing and\n"); 1841 printf("normalization of UTF-8 strings. The trie is derived from\n"); 1842 printf("a set of input files from the Unicode character database\n"); 1843 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n"); 1844 printf("\n"); 1845 printf("The generated tree supports two normalization forms:\n"); 1846 printf("\n"); 1847 printf("\tnfdi:\n"); 1848 printf("\t- Apply unicode normalization form NFD.\n"); 1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); 1850 printf("\n"); 1851 printf("\tnfdicf:\n"); 1852 printf("\t- Apply unicode normalization form NFD.\n"); 1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); 1854 printf("\t- Apply a full casefold (C + F).\n"); 1855 printf("\n"); 1856 printf("These forms were chosen as being most useful when dealing\n"); 1857 printf("with file names: NFD catches most cases where characters\n"); 1858 printf("should be considered equivalent. 
The ignorables are mostly\n"); 1859 printf("invisible, making names hard to type.\n"); 1860 printf("\n"); 1861 printf("The options to specify the files to be used are listed\n"); 1862 printf("below with their default values, which are the names used\n"); 1863 printf("by version 11.0.0 of the Unicode Character Database.\n"); 1864 printf("\n"); 1865 printf("The input files:\n"); 1866 printf("\t-a %s\n", AGE_NAME); 1867 printf("\t-c %s\n", CCC_NAME); 1868 printf("\t-p %s\n", PROP_NAME); 1869 printf("\t-d %s\n", DATA_NAME); 1870 printf("\t-f %s\n", FOLD_NAME); 1871 printf("\t-n %s\n", NORM_NAME); 1872 printf("\n"); 1873 printf("Additionally, the generated tables are tested using:\n"); 1874 printf("\t-t %s\n", TEST_NAME); 1875 printf("\n"); 1876 printf("Finally, the output file:\n"); 1877 printf("\t-o %s\n", UTF8_NAME); 1878 printf("\n"); 1879 } 1880 1881 static void usage(void) 1882 { 1883 help(); 1884 exit(1); 1885 } 1886 1887 static void open_fail(const char *name, int error) 1888 { 1889 printf("Error %d opening %s: %s\n", error, name, strerror(error)); 1890 exit(1); 1891 } 1892 1893 static void file_fail(const char *filename) 1894 { 1895 printf("Error parsing %s\n", filename); 1896 exit(1); 1897 } 1898 1899 static void line_fail(const char *filename, const char *line) 1900 { 1901 printf("Error parsing %s:%s\n", filename, line); 1902 exit(1); 1903 } 1904 1905 /* ------------------------------------------------------------------ */ 1906 1907 static void print_utf32(unsigned int *utf32str) 1908 { 1909 int i; 1910 1911 for (i = 0; utf32str[i]; i++) 1912 printf(" %X", utf32str[i]); 1913 } 1914 1915 static void print_utf32nfdi(unsigned int unichar) 1916 { 1917 printf(" %X ->", unichar); 1918 print_utf32(unicode_data[unichar].utf32nfdi); 1919 printf("\n"); 1920 } 1921 1922 static void print_utf32nfdicf(unsigned int unichar) 1923 { 1924 printf(" %X ->", unichar); 1925 print_utf32(unicode_data[unichar].utf32nfdicf); 1926 printf("\n"); 1927 } 1928 1929 /* 
------------------------------------------------------------------ */ 1930 1931 static void age_init(void) 1932 { 1933 FILE *file; 1934 unsigned int first; 1935 unsigned int last; 1936 unsigned int unichar; 1937 unsigned int major; 1938 unsigned int minor; 1939 unsigned int revision; 1940 int gen; 1941 int count; 1942 int ret; 1943 1944 if (verbose > 0) 1945 printf("Parsing %s\n", age_name); 1946 1947 file = fopen(age_name, "r"); 1948 if (!file) 1949 open_fail(age_name, errno); 1950 count = 0; 1951 1952 gen = 0; 1953 while (fgets(line, LINESIZE, file)) { 1954 ret = sscanf(line, "# Age=V%d_%d_%d", 1955 &major, &minor, &revision); 1956 if (ret == 3) { 1957 ages_count++; 1958 if (verbose > 1) 1959 printf(" Age V%d_%d_%d\n", 1960 major, minor, revision); 1961 if (!age_valid(major, minor, revision)) 1962 line_fail(age_name, line); 1963 continue; 1964 } 1965 ret = sscanf(line, "# Age=V%d_%d", &major, &minor); 1966 if (ret == 2) { 1967 ages_count++; 1968 if (verbose > 1) 1969 printf(" Age V%d_%d\n", major, minor); 1970 if (!age_valid(major, minor, 0)) 1971 line_fail(age_name, line); 1972 continue; 1973 } 1974 } 1975 1976 /* We must have found something above. */ 1977 if (verbose > 1) 1978 printf("%d age entries\n", ages_count); 1979 if (ages_count == 0 || ages_count > MAXGEN) 1980 file_fail(age_name); 1981 1982 /* There is a 0 entry. */ 1983 ages_count++; 1984 ages = calloc(ages_count + 1, sizeof(*ages)); 1985 /* And a guard entry. 
*/ 1986 ages[ages_count] = (unsigned int)-1; 1987 1988 rewind(file); 1989 count = 0; 1990 gen = 0; 1991 while (fgets(line, LINESIZE, file)) { 1992 ret = sscanf(line, "# Age=V%d_%d_%d", 1993 &major, &minor, &revision); 1994 if (ret == 3) { 1995 ages[++gen] = 1996 UNICODE_AGE(major, minor, revision); 1997 if (verbose > 1) 1998 printf(" Age V%d_%d_%d = gen %d\n", 1999 major, minor, revision, gen); 2000 if (!age_valid(major, minor, revision)) 2001 line_fail(age_name, line); 2002 continue; 2003 } 2004 ret = sscanf(line, "# Age=V%d_%d", &major, &minor); 2005 if (ret == 2) { 2006 ages[++gen] = UNICODE_AGE(major, minor, 0); 2007 if (verbose > 1) 2008 printf(" Age V%d_%d = %d\n", 2009 major, minor, gen); 2010 if (!age_valid(major, minor, 0)) 2011 line_fail(age_name, line); 2012 continue; 2013 } 2014 ret = sscanf(line, "%X..%X ; %d.%d #", 2015 &first, &last, &major, &minor); 2016 if (ret == 4) { 2017 for (unichar = first; unichar <= last; unichar++) 2018 unicode_data[unichar].gen = gen; 2019 count += 1 + last - first; 2020 if (verbose > 1) 2021 printf(" %X..%X gen %d\n", first, last, gen); 2022 if (!utf32valid(first) || !utf32valid(last)) 2023 line_fail(age_name, line); 2024 continue; 2025 } 2026 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); 2027 if (ret == 3) { 2028 unicode_data[unichar].gen = gen; 2029 count++; 2030 if (verbose > 1) 2031 printf(" %X gen %d\n", unichar, gen); 2032 if (!utf32valid(unichar)) 2033 line_fail(age_name, line); 2034 continue; 2035 } 2036 } 2037 unicode_maxage = ages[gen]; 2038 fclose(file); 2039 2040 /* Nix surrogate block */ 2041 if (verbose > 1) 2042 printf(" Removing surrogate block D800..DFFF\n"); 2043 for (unichar = 0xd800; unichar <= 0xdfff; unichar++) 2044 unicode_data[unichar].gen = -1; 2045 2046 if (verbose > 0) 2047 printf("Found %d entries\n", count); 2048 if (count == 0) 2049 file_fail(age_name); 2050 } 2051 2052 static void ccc_init(void) 2053 { 2054 FILE *file; 2055 unsigned int first; 2056 unsigned int last; 2057 
unsigned int unichar; 2058 unsigned int value; 2059 int count; 2060 int ret; 2061 2062 if (verbose > 0) 2063 printf("Parsing %s\n", ccc_name); 2064 2065 file = fopen(ccc_name, "r"); 2066 if (!file) 2067 open_fail(ccc_name, errno); 2068 2069 count = 0; 2070 while (fgets(line, LINESIZE, file)) { 2071 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value); 2072 if (ret == 3) { 2073 for (unichar = first; unichar <= last; unichar++) { 2074 unicode_data[unichar].ccc = value; 2075 count++; 2076 } 2077 if (verbose > 1) 2078 printf(" %X..%X ccc %d\n", first, last, value); 2079 if (!utf32valid(first) || !utf32valid(last)) 2080 line_fail(ccc_name, line); 2081 continue; 2082 } 2083 ret = sscanf(line, "%X ; %d #", &unichar, &value); 2084 if (ret == 2) { 2085 unicode_data[unichar].ccc = value; 2086 count++; 2087 if (verbose > 1) 2088 printf(" %X ccc %d\n", unichar, value); 2089 if (!utf32valid(unichar)) 2090 line_fail(ccc_name, line); 2091 continue; 2092 } 2093 } 2094 fclose(file); 2095 2096 if (verbose > 0) 2097 printf("Found %d entries\n", count); 2098 if (count == 0) 2099 file_fail(ccc_name); 2100 } 2101 2102 static int ignore_compatibility_form(char *type) 2103 { 2104 int i; 2105 char *ignored_types[] = {"font", "noBreak", "initial", "medial", 2106 "final", "isolated", "circle", "super", 2107 "sub", "vertical", "wide", "narrow", 2108 "small", "square", "fraction", "compat"}; 2109 2110 for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) 2111 if (strcmp(type, ignored_types[i]) == 0) 2112 return 1; 2113 return 0; 2114 } 2115 2116 static void nfdi_init(void) 2117 { 2118 FILE *file; 2119 unsigned int unichar; 2120 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. 
*/ 2121 char *s; 2122 char *type; 2123 unsigned int *um; 2124 int count; 2125 int i; 2126 int ret; 2127 2128 if (verbose > 0) 2129 printf("Parsing %s\n", data_name); 2130 file = fopen(data_name, "r"); 2131 if (!file) 2132 open_fail(data_name, errno); 2133 2134 count = 0; 2135 while (fgets(line, LINESIZE, file)) { 2136 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];", 2137 &unichar, buf0); 2138 if (ret != 2) 2139 continue; 2140 if (!utf32valid(unichar)) 2141 line_fail(data_name, line); 2142 2143 s = buf0; 2144 /* skip over <tag> */ 2145 if (*s == '<') { 2146 type = ++s; 2147 while (*++s != '>'); 2148 *s++ = '\0'; 2149 if(ignore_compatibility_form(type)) 2150 continue; 2151 } 2152 /* decode the decomposition into UTF-32 */ 2153 i = 0; 2154 while (*s) { 2155 mapping[i] = strtoul(s, &s, 16); 2156 if (!utf32valid(mapping[i])) 2157 line_fail(data_name, line); 2158 i++; 2159 } 2160 mapping[i++] = 0; 2161 2162 um = malloc(i * sizeof(unsigned int)); 2163 memcpy(um, mapping, i * sizeof(unsigned int)); 2164 unicode_data[unichar].utf32nfdi = um; 2165 2166 if (verbose > 1) 2167 print_utf32nfdi(unichar); 2168 count++; 2169 } 2170 fclose(file); 2171 if (verbose > 0) 2172 printf("Found %d entries\n", count); 2173 if (count == 0) 2174 file_fail(data_name); 2175 } 2176 2177 static void nfdicf_init(void) 2178 { 2179 FILE *file; 2180 unsigned int unichar; 2181 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 2182 char status; 2183 char *s; 2184 unsigned int *um; 2185 int i; 2186 int count; 2187 int ret; 2188 2189 if (verbose > 0) 2190 printf("Parsing %s\n", fold_name); 2191 file = fopen(fold_name, "r"); 2192 if (!file) 2193 open_fail(fold_name, errno); 2194 2195 count = 0; 2196 while (fgets(line, LINESIZE, file)) { 2197 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0); 2198 if (ret != 3) 2199 continue; 2200 if (!utf32valid(unichar)) 2201 line_fail(fold_name, line); 2202 /* Use the C+F casefold. 
*/ 2203 if (status != 'C' && status != 'F') 2204 continue; 2205 s = buf0; 2206 if (*s == '<') 2207 while (*s++ != ' ') 2208 ; 2209 i = 0; 2210 while (*s) { 2211 mapping[i] = strtoul(s, &s, 16); 2212 if (!utf32valid(mapping[i])) 2213 line_fail(fold_name, line); 2214 i++; 2215 } 2216 mapping[i++] = 0; 2217 2218 um = malloc(i * sizeof(unsigned int)); 2219 memcpy(um, mapping, i * sizeof(unsigned int)); 2220 unicode_data[unichar].utf32nfdicf = um; 2221 2222 if (verbose > 1) 2223 print_utf32nfdicf(unichar); 2224 count++; 2225 } 2226 fclose(file); 2227 if (verbose > 0) 2228 printf("Found %d entries\n", count); 2229 if (count == 0) 2230 file_fail(fold_name); 2231 } 2232 2233 static void ignore_init(void) 2234 { 2235 FILE *file; 2236 unsigned int unichar; 2237 unsigned int first; 2238 unsigned int last; 2239 unsigned int *um; 2240 int count; 2241 int ret; 2242 2243 if (verbose > 0) 2244 printf("Parsing %s\n", prop_name); 2245 file = fopen(prop_name, "r"); 2246 if (!file) 2247 open_fail(prop_name, errno); 2248 assert(file); 2249 count = 0; 2250 while (fgets(line, LINESIZE, file)) { 2251 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0); 2252 if (ret == 3) { 2253 if (strcmp(buf0, "Default_Ignorable_Code_Point")) 2254 continue; 2255 if (!utf32valid(first) || !utf32valid(last)) 2256 line_fail(prop_name, line); 2257 for (unichar = first; unichar <= last; unichar++) { 2258 free(unicode_data[unichar].utf32nfdi); 2259 um = malloc(sizeof(unsigned int)); 2260 *um = 0; 2261 unicode_data[unichar].utf32nfdi = um; 2262 free(unicode_data[unichar].utf32nfdicf); 2263 um = malloc(sizeof(unsigned int)); 2264 *um = 0; 2265 unicode_data[unichar].utf32nfdicf = um; 2266 count++; 2267 } 2268 if (verbose > 1) 2269 printf(" %X..%X Default_Ignorable_Code_Point\n", 2270 first, last); 2271 continue; 2272 } 2273 ret = sscanf(line, "%X ; %s # ", &unichar, buf0); 2274 if (ret == 2) { 2275 if (strcmp(buf0, "Default_Ignorable_Code_Point")) 2276 continue; 2277 if (!utf32valid(unichar)) 2278 
line_fail(prop_name, line); 2279 free(unicode_data[unichar].utf32nfdi); 2280 um = malloc(sizeof(unsigned int)); 2281 *um = 0; 2282 unicode_data[unichar].utf32nfdi = um; 2283 free(unicode_data[unichar].utf32nfdicf); 2284 um = malloc(sizeof(unsigned int)); 2285 *um = 0; 2286 unicode_data[unichar].utf32nfdicf = um; 2287 if (verbose > 1) 2288 printf(" %X Default_Ignorable_Code_Point\n", 2289 unichar); 2290 count++; 2291 continue; 2292 } 2293 } 2294 fclose(file); 2295 2296 if (verbose > 0) 2297 printf("Found %d entries\n", count); 2298 if (count == 0) 2299 file_fail(prop_name); 2300 } 2301 2302 static void corrections_init(void) 2303 { 2304 FILE *file; 2305 unsigned int unichar; 2306 unsigned int major; 2307 unsigned int minor; 2308 unsigned int revision; 2309 unsigned int age; 2310 unsigned int *um; 2311 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 2312 char *s; 2313 int i; 2314 int count; 2315 int ret; 2316 2317 if (verbose > 0) 2318 printf("Parsing %s\n", norm_name); 2319 file = fopen(norm_name, "r"); 2320 if (!file) 2321 open_fail(norm_name, errno); 2322 2323 count = 0; 2324 while (fgets(line, LINESIZE, file)) { 2325 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", 2326 &unichar, buf0, buf1, 2327 &major, &minor, &revision); 2328 if (ret != 6) 2329 continue; 2330 if (!utf32valid(unichar) || !age_valid(major, minor, revision)) 2331 line_fail(norm_name, line); 2332 count++; 2333 } 2334 corrections = calloc(count, sizeof(struct unicode_data)); 2335 corrections_count = count; 2336 rewind(file); 2337 2338 count = 0; 2339 while (fgets(line, LINESIZE, file)) { 2340 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", 2341 &unichar, buf0, buf1, 2342 &major, &minor, &revision); 2343 if (ret != 6) 2344 continue; 2345 if (!utf32valid(unichar) || !age_valid(major, minor, revision)) 2346 line_fail(norm_name, line); 2347 corrections[count] = unicode_data[unichar]; 2348 assert(corrections[count].code == unichar); 2349 age = UNICODE_AGE(major, minor, revision); 
2350 corrections[count].correction = age; 2351 2352 i = 0; 2353 s = buf0; 2354 while (*s) { 2355 mapping[i] = strtoul(s, &s, 16); 2356 if (!utf32valid(mapping[i])) 2357 line_fail(norm_name, line); 2358 i++; 2359 } 2360 mapping[i++] = 0; 2361 2362 um = malloc(i * sizeof(unsigned int)); 2363 memcpy(um, mapping, i * sizeof(unsigned int)); 2364 corrections[count].utf32nfdi = um; 2365 2366 if (verbose > 1) 2367 printf(" %X -> %s -> %s V%d_%d_%d\n", 2368 unichar, buf0, buf1, major, minor, revision); 2369 count++; 2370 } 2371 fclose(file); 2372 2373 if (verbose > 0) 2374 printf("Found %d entries\n", count); 2375 if (count == 0) 2376 file_fail(norm_name); 2377 } 2378 2379 /* ------------------------------------------------------------------ */ 2380 2381 /* 2382 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 2383 * 2384 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 2385 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 2386 * 2387 * SBase = 0xAC00 2388 * LBase = 0x1100 2389 * VBase = 0x1161 2390 * TBase = 0x11A7 2391 * LCount = 19 2392 * VCount = 21 2393 * TCount = 28 2394 * NCount = 588 (VCount * TCount) 2395 * SCount = 11172 (LCount * NCount) 2396 * 2397 * Decomposition: 2398 * SIndex = s - SBase 2399 * 2400 * LV (Canonical/Full) 2401 * LIndex = SIndex / NCount 2402 * VIndex = (Sindex % NCount) / TCount 2403 * LPart = LBase + LIndex 2404 * VPart = VBase + VIndex 2405 * 2406 * LVT (Canonical) 2407 * LVIndex = (SIndex / TCount) * TCount 2408 * TIndex = (Sindex % TCount) 2409 * LVPart = SBase + LVIndex 2410 * TPart = TBase + TIndex 2411 * 2412 * LVT (Full) 2413 * LIndex = SIndex / NCount 2414 * VIndex = (Sindex % NCount) / TCount 2415 * TIndex = (Sindex % TCount) 2416 * LPart = LBase + LIndex 2417 * VPart = VBase + VIndex 2418 * if (TIndex == 0) { 2419 * d = <LPart, VPart> 2420 * } else { 2421 * TPart = TBase + TIndex 2422 * d = <LPart, VPart, TPart> 2423 * } 2424 * 2425 */ 2426 2427 static void hangul_decompose(void) 2428 { 2429 unsigned int sb = 
0xAC00; 2430 unsigned int lb = 0x1100; 2431 unsigned int vb = 0x1161; 2432 unsigned int tb = 0x11a7; 2433 /* unsigned int lc = 19; */ 2434 unsigned int vc = 21; 2435 unsigned int tc = 28; 2436 unsigned int nc = (vc * tc); 2437 /* unsigned int sc = (lc * nc); */ 2438 unsigned int unichar; 2439 unsigned int mapping[4]; 2440 unsigned int *um; 2441 int count; 2442 int i; 2443 2444 if (verbose > 0) 2445 printf("Decomposing hangul\n"); 2446 /* Hangul */ 2447 count = 0; 2448 for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { 2449 unsigned int si = unichar - sb; 2450 unsigned int li = si / nc; 2451 unsigned int vi = (si % nc) / tc; 2452 unsigned int ti = si % tc; 2453 2454 i = 0; 2455 mapping[i++] = lb + li; 2456 mapping[i++] = vb + vi; 2457 if (ti) 2458 mapping[i++] = tb + ti; 2459 mapping[i++] = 0; 2460 2461 assert(!unicode_data[unichar].utf32nfdi); 2462 um = malloc(i * sizeof(unsigned int)); 2463 memcpy(um, mapping, i * sizeof(unsigned int)); 2464 unicode_data[unichar].utf32nfdi = um; 2465 2466 assert(!unicode_data[unichar].utf32nfdicf); 2467 um = malloc(i * sizeof(unsigned int)); 2468 memcpy(um, mapping, i * sizeof(unsigned int)); 2469 unicode_data[unichar].utf32nfdicf = um; 2470 2471 /* 2472 * Add a cookie as a reminder that the hangul syllable 2473 * decompositions must not be stored in the generated 2474 * trie. 2475 */ 2476 unicode_data[unichar].utf8nfdi = malloc(2); 2477 unicode_data[unichar].utf8nfdi[0] = HANGUL; 2478 unicode_data[unichar].utf8nfdi[1] = '\0'; 2479 2480 if (verbose > 1) 2481 print_utf32nfdi(unichar); 2482 2483 count++; 2484 } 2485 if (verbose > 0) 2486 printf("Created %d entries\n", count); 2487 } 2488 2489 static void nfdi_decompose(void) 2490 { 2491 unsigned int unichar; 2492 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. 
*/ 2493 unsigned int *um; 2494 unsigned int *dc; 2495 int count; 2496 int i; 2497 int j; 2498 int ret; 2499 2500 if (verbose > 0) 2501 printf("Decomposing nfdi\n"); 2502 2503 count = 0; 2504 for (unichar = 0; unichar != 0x110000; unichar++) { 2505 if (!unicode_data[unichar].utf32nfdi) 2506 continue; 2507 for (;;) { 2508 ret = 1; 2509 i = 0; 2510 um = unicode_data[unichar].utf32nfdi; 2511 while (*um) { 2512 dc = unicode_data[*um].utf32nfdi; 2513 if (dc) { 2514 for (j = 0; dc[j]; j++) 2515 mapping[i++] = dc[j]; 2516 ret = 0; 2517 } else { 2518 mapping[i++] = *um; 2519 } 2520 um++; 2521 } 2522 mapping[i++] = 0; 2523 if (ret) 2524 break; 2525 free(unicode_data[unichar].utf32nfdi); 2526 um = malloc(i * sizeof(unsigned int)); 2527 memcpy(um, mapping, i * sizeof(unsigned int)); 2528 unicode_data[unichar].utf32nfdi = um; 2529 } 2530 /* Add this decomposition to nfdicf if there is no entry. */ 2531 if (!unicode_data[unichar].utf32nfdicf) { 2532 um = malloc(i * sizeof(unsigned int)); 2533 memcpy(um, mapping, i * sizeof(unsigned int)); 2534 unicode_data[unichar].utf32nfdicf = um; 2535 } 2536 if (verbose > 1) 2537 print_utf32nfdi(unichar); 2538 count++; 2539 } 2540 if (verbose > 0) 2541 printf("Processed %d entries\n", count); 2542 } 2543 2544 static void nfdicf_decompose(void) 2545 { 2546 unsigned int unichar; 2547 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. 
*/ 2548 unsigned int *um; 2549 unsigned int *dc; 2550 int count; 2551 int i; 2552 int j; 2553 int ret; 2554 2555 if (verbose > 0) 2556 printf("Decomposing nfdicf\n"); 2557 count = 0; 2558 for (unichar = 0; unichar != 0x110000; unichar++) { 2559 if (!unicode_data[unichar].utf32nfdicf) 2560 continue; 2561 for (;;) { 2562 ret = 1; 2563 i = 0; 2564 um = unicode_data[unichar].utf32nfdicf; 2565 while (*um) { 2566 dc = unicode_data[*um].utf32nfdicf; 2567 if (dc) { 2568 for (j = 0; dc[j]; j++) 2569 mapping[i++] = dc[j]; 2570 ret = 0; 2571 } else { 2572 mapping[i++] = *um; 2573 } 2574 um++; 2575 } 2576 mapping[i++] = 0; 2577 if (ret) 2578 break; 2579 free(unicode_data[unichar].utf32nfdicf); 2580 um = malloc(i * sizeof(unsigned int)); 2581 memcpy(um, mapping, i * sizeof(unsigned int)); 2582 unicode_data[unichar].utf32nfdicf = um; 2583 } 2584 if (verbose > 1) 2585 print_utf32nfdicf(unichar); 2586 count++; 2587 } 2588 if (verbose > 0) 2589 printf("Processed %d entries\n", count); 2590 } 2591 2592 /* ------------------------------------------------------------------ */ 2593 2594 int utf8agemax(struct tree *, const char *); 2595 int utf8nagemax(struct tree *, const char *, size_t); 2596 int utf8agemin(struct tree *, const char *); 2597 int utf8nagemin(struct tree *, const char *, size_t); 2598 ssize_t utf8len(struct tree *, const char *); 2599 ssize_t utf8nlen(struct tree *, const char *, size_t); 2600 struct utf8cursor; 2601 int utf8cursor(struct utf8cursor *, struct tree *, const char *); 2602 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); 2603 int utf8byte(struct utf8cursor *); 2604 2605 /* 2606 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 2607 * 2608 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 2609 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 2610 * 2611 * SBase = 0xAC00 2612 * LBase = 0x1100 2613 * VBase = 0x1161 2614 * TBase = 0x11A7 2615 * LCount = 19 2616 * VCount = 21 2617 * TCount = 28 2618 * NCount = 588 
(VCount * TCount) 2619 * SCount = 11172 (LCount * NCount) 2620 * 2621 * Decomposition: 2622 * SIndex = s - SBase 2623 * 2624 * LV (Canonical/Full) 2625 * LIndex = SIndex / NCount 2626 * VIndex = (Sindex % NCount) / TCount 2627 * LPart = LBase + LIndex 2628 * VPart = VBase + VIndex 2629 * 2630 * LVT (Canonical) 2631 * LVIndex = (SIndex / TCount) * TCount 2632 * TIndex = (Sindex % TCount) 2633 * LVPart = SBase + LVIndex 2634 * TPart = TBase + TIndex 2635 * 2636 * LVT (Full) 2637 * LIndex = SIndex / NCount 2638 * VIndex = (Sindex % NCount) / TCount 2639 * TIndex = (Sindex % TCount) 2640 * LPart = LBase + LIndex 2641 * VPart = VBase + VIndex 2642 * if (TIndex == 0) { 2643 * d = <LPart, VPart> 2644 * } else { 2645 * TPart = TBase + TIndex 2646 * d = <LPart, VPart, TPart> 2647 * } 2648 */ 2649 2650 /* Constants */ 2651 #define SB (0xAC00) 2652 #define LB (0x1100) 2653 #define VB (0x1161) 2654 #define TB (0x11A7) 2655 #define LC (19) 2656 #define VC (21) 2657 #define TC (28) 2658 #define NC (VC * TC) 2659 #define SC (LC * NC) 2660 2661 /* Algorithmic decomposition of hangul syllable. */ 2662 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) 2663 { 2664 unsigned int si; 2665 unsigned int li; 2666 unsigned int vi; 2667 unsigned int ti; 2668 unsigned char *h; 2669 2670 /* Calculate the SI, LI, VI, and TI values. */ 2671 si = utf8decode(str) - SB; 2672 li = si / NC; 2673 vi = (si % NC) / TC; 2674 ti = si % TC; 2675 2676 /* Fill in base of leaf. */ 2677 h = hangul; 2678 LEAF_GEN(h) = 2; 2679 LEAF_CCC(h) = DECOMPOSE; 2680 h += 2; 2681 2682 /* Add LPart, a 3-byte UTF-8 sequence. */ 2683 h += utf8encode((char *)h, li + LB); 2684 2685 /* Add VPart, a 3-byte UTF-8 sequence. */ 2686 h += utf8encode((char *)h, vi + VB); 2687 2688 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 2689 if (ti) 2690 h += utf8encode((char *)h, ti + TB); 2691 2692 /* Terminate string. 
*/ 2693 h[0] = '\0'; 2694 2695 return hangul; 2696 } 2697 2698 /* 2699 * Use trie to scan s, touching at most len bytes. 2700 * Returns the leaf if one exists, NULL otherwise. 2701 * 2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s 2703 * is well-formed and corresponds to a known unicode code point. The 2704 * shorthand for this will be "is valid UTF-8 unicode". 2705 */ 2706 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, 2707 const char *s, size_t len) 2708 { 2709 utf8trie_t *trie; 2710 int offlen; 2711 int offset; 2712 int mask; 2713 int node; 2714 2715 if (!tree) 2716 return NULL; 2717 if (len == 0) 2718 return NULL; 2719 node = 1; 2720 trie = utf8data + tree->index; 2721 while (node) { 2722 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 2723 if (*trie & NEXTBYTE) { 2724 if (--len == 0) 2725 return NULL; 2726 s++; 2727 } 2728 mask = 1 << (*trie & BITNUM); 2729 if (*s & mask) { 2730 /* Right leg */ 2731 if (offlen) { 2732 /* Right node at offset of trie */ 2733 node = (*trie & RIGHTNODE); 2734 offset = trie[offlen]; 2735 while (--offlen) { 2736 offset <<= 8; 2737 offset |= trie[offlen]; 2738 } 2739 trie += offset; 2740 } else if (*trie & RIGHTPATH) { 2741 /* Right node after this node */ 2742 node = (*trie & TRIENODE); 2743 trie++; 2744 } else { 2745 /* No right node. */ 2746 return NULL; 2747 } 2748 } else { 2749 /* Left leg */ 2750 if (offlen) { 2751 /* Left node after this node. */ 2752 node = (*trie & LEFTNODE); 2753 trie += offlen + 1; 2754 } else if (*trie & RIGHTPATH) { 2755 /* No left node. */ 2756 return NULL; 2757 } else { 2758 /* Left node after this node */ 2759 node = (*trie & TRIENODE); 2760 trie++; 2761 } 2762 } 2763 } 2764 /* 2765 * Hangul decomposition is done algorithmically. These are the 2766 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 2767 * always 3 bytes long, so s has been advanced twice, and the 2768 * start of the sequence is at s-2. 
2769 */ 2770 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 2771 trie = utf8hangul(s - 2, hangul); 2772 return trie; 2773 } 2774 2775 /* 2776 * Use trie to scan s. 2777 * Returns the leaf if one exists, NULL otherwise. 2778 * 2779 * Forwards to trie_nlookup(). 2780 */ 2781 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, 2782 const char *s) 2783 { 2784 return utf8nlookup(tree, hangul, s, (size_t)-1); 2785 } 2786 2787 /* 2788 * Return the number of bytes used by the current UTF-8 sequence. 2789 * Assumes the input points to the first byte of a valid UTF-8 2790 * sequence. 2791 */ 2792 static inline int utf8clen(const char *s) 2793 { 2794 unsigned char c = *s; 2795 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 2796 } 2797 2798 /* 2799 * Maximum age of any character in s. 2800 * Return -1 if s is not valid UTF-8 unicode. 2801 * Return 0 if only non-assigned code points are used. 2802 */ 2803 int utf8agemax(struct tree *tree, const char *s) 2804 { 2805 utf8leaf_t *leaf; 2806 int age = 0; 2807 int leaf_age; 2808 unsigned char hangul[UTF8HANGULLEAF]; 2809 2810 if (!tree) 2811 return -1; 2812 2813 while (*s) { 2814 leaf = utf8lookup(tree, hangul, s); 2815 if (!leaf) 2816 return -1; 2817 leaf_age = ages[LEAF_GEN(leaf)]; 2818 if (leaf_age <= tree->maxage && leaf_age > age) 2819 age = leaf_age; 2820 s += utf8clen(s); 2821 } 2822 return age; 2823 } 2824 2825 /* 2826 * Minimum age of any character in s. 2827 * Return -1 if s is not valid UTF-8 unicode. 2828 * Return 0 if non-assigned code points are used. 
2829 */ 2830 int utf8agemin(struct tree *tree, const char *s) 2831 { 2832 utf8leaf_t *leaf; 2833 int age; 2834 int leaf_age; 2835 unsigned char hangul[UTF8HANGULLEAF]; 2836 2837 if (!tree) 2838 return -1; 2839 age = tree->maxage; 2840 while (*s) { 2841 leaf = utf8lookup(tree, hangul, s); 2842 if (!leaf) 2843 return -1; 2844 leaf_age = ages[LEAF_GEN(leaf)]; 2845 if (leaf_age <= tree->maxage && leaf_age < age) 2846 age = leaf_age; 2847 s += utf8clen(s); 2848 } 2849 return age; 2850 } 2851 2852 /* 2853 * Maximum age of any character in s, touch at most len bytes. 2854 * Return -1 if s is not valid UTF-8 unicode. 2855 */ 2856 int utf8nagemax(struct tree *tree, const char *s, size_t len) 2857 { 2858 utf8leaf_t *leaf; 2859 int age = 0; 2860 int leaf_age; 2861 unsigned char hangul[UTF8HANGULLEAF]; 2862 2863 if (!tree) 2864 return -1; 2865 2866 while (len && *s) { 2867 leaf = utf8nlookup(tree, hangul, s, len); 2868 if (!leaf) 2869 return -1; 2870 leaf_age = ages[LEAF_GEN(leaf)]; 2871 if (leaf_age <= tree->maxage && leaf_age > age) 2872 age = leaf_age; 2873 len -= utf8clen(s); 2874 s += utf8clen(s); 2875 } 2876 return age; 2877 } 2878 2879 /* 2880 * Maximum age of any character in s, touch at most len bytes. 2881 * Return -1 if s is not valid UTF-8 unicode. 2882 */ 2883 int utf8nagemin(struct tree *tree, const char *s, size_t len) 2884 { 2885 utf8leaf_t *leaf; 2886 int leaf_age; 2887 int age; 2888 unsigned char hangul[UTF8HANGULLEAF]; 2889 2890 if (!tree) 2891 return -1; 2892 age = tree->maxage; 2893 while (len && *s) { 2894 leaf = utf8nlookup(tree, hangul, s, len); 2895 if (!leaf) 2896 return -1; 2897 leaf_age = ages[LEAF_GEN(leaf)]; 2898 if (leaf_age <= tree->maxage && leaf_age < age) 2899 age = leaf_age; 2900 len -= utf8clen(s); 2901 s += utf8clen(s); 2902 } 2903 return age; 2904 } 2905 2906 /* 2907 * Length of the normalization of s. 2908 * Return -1 if s is not valid UTF-8 unicode. 2909 * 2910 * A string of Default_Ignorable_Code_Point has length 0. 
2911 */ 2912 ssize_t utf8len(struct tree *tree, const char *s) 2913 { 2914 utf8leaf_t *leaf; 2915 size_t ret = 0; 2916 unsigned char hangul[UTF8HANGULLEAF]; 2917 2918 if (!tree) 2919 return -1; 2920 while (*s) { 2921 leaf = utf8lookup(tree, hangul, s); 2922 if (!leaf) 2923 return -1; 2924 if (ages[LEAF_GEN(leaf)] > tree->maxage) 2925 ret += utf8clen(s); 2926 else if (LEAF_CCC(leaf) == DECOMPOSE) 2927 ret += strlen(LEAF_STR(leaf)); 2928 else 2929 ret += utf8clen(s); 2930 s += utf8clen(s); 2931 } 2932 return ret; 2933 } 2934 2935 /* 2936 * Length of the normalization of s, touch at most len bytes. 2937 * Return -1 if s is not valid UTF-8 unicode. 2938 */ 2939 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) 2940 { 2941 utf8leaf_t *leaf; 2942 size_t ret = 0; 2943 unsigned char hangul[UTF8HANGULLEAF]; 2944 2945 if (!tree) 2946 return -1; 2947 while (len && *s) { 2948 leaf = utf8nlookup(tree, hangul, s, len); 2949 if (!leaf) 2950 return -1; 2951 if (ages[LEAF_GEN(leaf)] > tree->maxage) 2952 ret += utf8clen(s); 2953 else if (LEAF_CCC(leaf) == DECOMPOSE) 2954 ret += strlen(LEAF_STR(leaf)); 2955 else 2956 ret += utf8clen(s); 2957 len -= utf8clen(s); 2958 s += utf8clen(s); 2959 } 2960 return ret; 2961 } 2962 2963 /* 2964 * Cursor structure used by the normalizer. 2965 */ 2966 struct utf8cursor { 2967 struct tree *tree; 2968 const char *s; 2969 const char *p; 2970 const char *ss; 2971 const char *sp; 2972 unsigned int len; 2973 unsigned int slen; 2974 short int ccc; 2975 short int nccc; 2976 unsigned int unichar; 2977 unsigned char hangul[UTF8HANGULLEAF]; 2978 }; 2979 2980 /* 2981 * Set up an utf8cursor for use by utf8byte(). 2982 * 2983 * s : string. 2984 * len : length of s. 2985 * u8c : pointer to cursor. 2986 * trie : utf8trie_t to use for normalization. 2987 * 2988 * Returns -1 on error, 0 on success. 
2989 */ 2990 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s, 2991 size_t len) 2992 { 2993 if (!tree) 2994 return -1; 2995 if (!s) 2996 return -1; 2997 u8c->tree = tree; 2998 u8c->s = s; 2999 u8c->p = NULL; 3000 u8c->ss = NULL; 3001 u8c->sp = NULL; 3002 u8c->len = len; 3003 u8c->slen = 0; 3004 u8c->ccc = STOPPER; 3005 u8c->nccc = STOPPER; 3006 u8c->unichar = 0; 3007 /* Check we didn't clobber the maximum length. */ 3008 if (u8c->len != len) 3009 return -1; 3010 /* The first byte of s may not be an utf8 continuation. */ 3011 if (len > 0 && (*s & 0xC0) == 0x80) 3012 return -1; 3013 return 0; 3014 } 3015 3016 /* 3017 * Set up an utf8cursor for use by utf8byte(). 3018 * 3019 * s : NUL-terminated string. 3020 * u8c : pointer to cursor. 3021 * trie : utf8trie_t to use for normalization. 3022 * 3023 * Returns -1 on error, 0 on success. 3024 */ 3025 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s) 3026 { 3027 return utf8ncursor(u8c, tree, s, (unsigned int)-1); 3028 } 3029 3030 /* 3031 * Get one byte from the normalized form of the string described by u8c. 3032 * 3033 * Returns the byte cast to an unsigned char on succes, and -1 on failure. 3034 * 3035 * The cursor keeps track of the location in the string in u8c->s. 3036 * When a character is decomposed, the current location is stored in 3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note 3038 * that bytes from a decomposition do not count against u8c->len. 3039 * 3040 * Characters are emitted if they match the current CCC in u8c->ccc. 3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 3042 * and the function returns 0 in that case. 3043 * 3044 * Sorting by CCC is done by repeatedly scanning the string. The 3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 3046 * the start of the scan. 
The first pass finds the lowest CCC to be 3047 * emitted and stores it in u8c->nccc, the second pass emits the 3048 * characters with this CCC and finds the next lowest CCC. This limits 3049 * the number of passes to 1 + the number of different CCCs in the 3050 * sequence being scanned. 3051 * 3052 * Therefore: 3053 * u8c->p != NULL -> a decomposition is being scanned. 3054 * u8c->ss != NULL -> this is a repeating scan. 3055 * u8c->ccc == -1 -> this is the first scan of a repeating scan. 3056 */ 3057 int utf8byte(struct utf8cursor *u8c) 3058 { 3059 utf8leaf_t *leaf; 3060 int ccc; 3061 3062 for (;;) { 3063 /* Check for the end of a decomposed character. */ 3064 if (u8c->p && *u8c->s == '\0') { 3065 u8c->s = u8c->p; 3066 u8c->p = NULL; 3067 } 3068 3069 /* Check for end-of-string. */ 3070 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 3071 /* There is no next byte. */ 3072 if (u8c->ccc == STOPPER) 3073 return 0; 3074 /* End-of-string during a scan counts as a stopper. */ 3075 ccc = STOPPER; 3076 goto ccc_mismatch; 3077 } else if ((*u8c->s & 0xC0) == 0x80) { 3078 /* This is a continuation of the current character. */ 3079 if (!u8c->p) 3080 u8c->len--; 3081 return (unsigned char)*u8c->s++; 3082 } 3083 3084 /* Look up the data for the current character. */ 3085 if (u8c->p) { 3086 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); 3087 } else { 3088 leaf = utf8nlookup(u8c->tree, u8c->hangul, 3089 u8c->s, u8c->len); 3090 } 3091 3092 /* No leaf found implies that the input is a binary blob. */ 3093 if (!leaf) 3094 return -1; 3095 3096 /* Characters that are too new have CCC 0. */ 3097 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { 3098 ccc = STOPPER; 3099 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { 3100 u8c->len -= utf8clen(u8c->s); 3101 u8c->p = u8c->s + utf8clen(u8c->s); 3102 u8c->s = LEAF_STR(leaf); 3103 /* Empty decomposition implies CCC 0. 
*/ 3104 if (*u8c->s == '\0') { 3105 if (u8c->ccc == STOPPER) 3106 continue; 3107 ccc = STOPPER; 3108 goto ccc_mismatch; 3109 } 3110 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); 3111 ccc = LEAF_CCC(leaf); 3112 } 3113 u8c->unichar = utf8decode(u8c->s); 3114 3115 /* 3116 * If this is not a stopper, then see if it updates 3117 * the next canonical class to be emitted. 3118 */ 3119 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 3120 u8c->nccc = ccc; 3121 3122 /* 3123 * Return the current byte if this is the current 3124 * combining class. 3125 */ 3126 if (ccc == u8c->ccc) { 3127 if (!u8c->p) 3128 u8c->len--; 3129 return (unsigned char)*u8c->s++; 3130 } 3131 3132 /* Current combining class mismatch. */ 3133 ccc_mismatch: 3134 if (u8c->nccc == STOPPER) { 3135 /* 3136 * Scan forward for the first canonical class 3137 * to be emitted. Save the position from 3138 * which to restart. 3139 */ 3140 assert(u8c->ccc == STOPPER); 3141 u8c->ccc = MINCCC - 1; 3142 u8c->nccc = ccc; 3143 u8c->sp = u8c->p; 3144 u8c->ss = u8c->s; 3145 u8c->slen = u8c->len; 3146 if (!u8c->p) 3147 u8c->len -= utf8clen(u8c->s); 3148 u8c->s += utf8clen(u8c->s); 3149 } else if (ccc != STOPPER) { 3150 /* Not a stopper, and not the ccc we're emitting. */ 3151 if (!u8c->p) 3152 u8c->len -= utf8clen(u8c->s); 3153 u8c->s += utf8clen(u8c->s); 3154 } else if (u8c->nccc != MAXCCC + 1) { 3155 /* At a stopper, restart for next ccc. */ 3156 u8c->ccc = u8c->nccc; 3157 u8c->nccc = MAXCCC + 1; 3158 u8c->s = u8c->ss; 3159 u8c->p = u8c->sp; 3160 u8c->len = u8c->slen; 3161 } else { 3162 /* All done, proceed from here. */ 3163 u8c->ccc = STOPPER; 3164 u8c->nccc = STOPPER; 3165 u8c->sp = NULL; 3166 u8c->ss = NULL; 3167 u8c->slen = 0; 3168 } 3169 } 3170 } 3171 3172 /* ------------------------------------------------------------------ */ 3173 3174 static int normalize_line(struct tree *tree) 3175 { 3176 char *s; 3177 char *t; 3178 int c; 3179 struct utf8cursor u8c; 3180 3181 /* First test: null-terminated string. 
*/ 3182 s = buf2; 3183 t = buf3; 3184 if (utf8cursor(&u8c, tree, s)) 3185 return -1; 3186 while ((c = utf8byte(&u8c)) > 0) 3187 if (c != (unsigned char)*t++) 3188 return -1; 3189 if (c < 0) 3190 return -1; 3191 if (*t != 0) 3192 return -1; 3193 3194 /* Second test: length-limited string. */ 3195 s = buf2; 3196 /* Replace NUL with a value that will cause an error if seen. */ 3197 s[strlen(s) + 1] = -1; 3198 t = buf3; 3199 if (utf8cursor(&u8c, tree, s)) 3200 return -1; 3201 while ((c = utf8byte(&u8c)) > 0) 3202 if (c != (unsigned char)*t++) 3203 return -1; 3204 if (c < 0) 3205 return -1; 3206 if (*t != 0) 3207 return -1; 3208 3209 return 0; 3210 } 3211 3212 static void normalization_test(void) 3213 { 3214 FILE *file; 3215 unsigned int unichar; 3216 struct unicode_data *data; 3217 char *s; 3218 char *t; 3219 int ret; 3220 int ignorables; 3221 int tests = 0; 3222 int failures = 0; 3223 3224 if (verbose > 0) 3225 printf("Parsing %s\n", test_name); 3226 /* Step one, read data from file. */ 3227 file = fopen(test_name, "r"); 3228 if (!file) 3229 open_fail(test_name, errno); 3230 3231 while (fgets(line, LINESIZE, file)) { 3232 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];", 3233 buf0, buf1); 3234 if (ret != 2 || *line == '#') 3235 continue; 3236 s = buf0; 3237 t = buf2; 3238 while (*s) { 3239 unichar = strtoul(s, &s, 16); 3240 t += utf8encode(t, unichar); 3241 } 3242 *t = '\0'; 3243 3244 ignorables = 0; 3245 s = buf1; 3246 t = buf3; 3247 while (*s) { 3248 unichar = strtoul(s, &s, 16); 3249 data = &unicode_data[unichar]; 3250 if (data->utf8nfdi && !*data->utf8nfdi) 3251 ignorables = 1; 3252 else 3253 t += utf8encode(t, unichar); 3254 } 3255 *t = '\0'; 3256 3257 tests++; 3258 if (normalize_line(nfdi_tree) < 0) { 3259 printf("Line %s -> %s", buf0, buf1); 3260 if (ignorables) 3261 printf(" (ignorables removed)"); 3262 printf(" failure\n"); 3263 failures++; 3264 } 3265 } 3266 fclose(file); 3267 if (verbose > 0) 3268 printf("Ran %d tests with %d failures\n", tests, 
failures); 3269 if (failures) 3270 file_fail(test_name); 3271 } 3272 3273 /* ------------------------------------------------------------------ */ 3274 3275 static void write_file(void) 3276 { 3277 FILE *file; 3278 int i; 3279 int j; 3280 int t; 3281 int gen; 3282 3283 if (verbose > 0) 3284 printf("Writing %s\n", utf8_name); 3285 file = fopen(utf8_name, "w"); 3286 if (!file) 3287 open_fail(utf8_name, errno); 3288 3289 fprintf(file, "/* This file is generated code, do not edit. */\n"); 3290 fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); 3291 fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); 3292 fprintf(file, "#endif\n"); 3293 fprintf(file, "\n"); 3294 fprintf(file, "static const unsigned int utf8vers = %#x;\n", 3295 unicode_maxage); 3296 fprintf(file, "\n"); 3297 fprintf(file, "static const unsigned int utf8agetab[] = {\n"); 3298 for (i = 0; i != ages_count; i++) 3299 fprintf(file, "\t%#x%s\n", ages[i], 3300 ages[i] == unicode_maxage ? "" : ","); 3301 fprintf(file, "};\n"); 3302 fprintf(file, "\n"); 3303 fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n"); 3304 t = 0; 3305 for (gen = 0; gen < ages_count; gen++) { 3306 fprintf(file, "\t{ %#x, %d }%s\n", 3307 ages[gen], trees[t].index, 3308 ages[gen] == unicode_maxage ? "" : ","); 3309 if (trees[t].maxage == ages[gen]) 3310 t += 2; 3311 } 3312 fprintf(file, "};\n"); 3313 fprintf(file, "\n"); 3314 fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n"); 3315 t = 1; 3316 for (gen = 0; gen < ages_count; gen++) { 3317 fprintf(file, "\t{ %#x, %d }%s\n", 3318 ages[gen], trees[t].index, 3319 ages[gen] == unicode_maxage ? 
"" : ","); 3320 if (trees[t].maxage == ages[gen]) 3321 t += 2; 3322 } 3323 fprintf(file, "};\n"); 3324 fprintf(file, "\n"); 3325 fprintf(file, "static const unsigned char utf8data[%zd] = {\n", 3326 utf8data_size); 3327 t = 0; 3328 for (i = 0; i != utf8data_size; i += 16) { 3329 if (i == trees[t].index) { 3330 fprintf(file, "\t/* %s_%x */\n", 3331 trees[t].type, trees[t].maxage); 3332 if (t < trees_count-1) 3333 t++; 3334 } 3335 fprintf(file, "\t"); 3336 for (j = i; j != i + 16; j++) 3337 fprintf(file, "0x%.2x%s", utf8data[j], 3338 (j < utf8data_size -1 ? "," : "")); 3339 fprintf(file, "\n"); 3340 } 3341 fprintf(file, "};\n"); 3342 fclose(file); 3343 } 3344 3345 /* ------------------------------------------------------------------ */ 3346 3347 int main(int argc, char *argv[]) 3348 { 3349 unsigned int unichar; 3350 int opt; 3351 3352 argv0 = argv[0]; 3353 3354 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { 3355 switch (opt) { 3356 case 'a': 3357 age_name = optarg; 3358 break; 3359 case 'c': 3360 ccc_name = optarg; 3361 break; 3362 case 'd': 3363 data_name = optarg; 3364 break; 3365 case 'f': 3366 fold_name = optarg; 3367 break; 3368 case 'n': 3369 norm_name = optarg; 3370 break; 3371 case 'o': 3372 utf8_name = optarg; 3373 break; 3374 case 'p': 3375 prop_name = optarg; 3376 break; 3377 case 't': 3378 test_name = optarg; 3379 break; 3380 case 'v': 3381 verbose++; 3382 break; 3383 case 'h': 3384 help(); 3385 exit(0); 3386 default: 3387 usage(); 3388 } 3389 } 3390 3391 if (verbose > 1) 3392 help(); 3393 for (unichar = 0; unichar != 0x110000; unichar++) 3394 unicode_data[unichar].code = unichar; 3395 age_init(); 3396 ccc_init(); 3397 nfdi_init(); 3398 nfdicf_init(); 3399 ignore_init(); 3400 corrections_init(); 3401 hangul_decompose(); 3402 nfdi_decompose(); 3403 nfdicf_decompose(); 3404 utf8_init(); 3405 trees_init(); 3406 trees_populate(); 3407 trees_reduce(); 3408 trees_verify(); 3409 /* Prevent "unused function" warning. 
*/ 3410 (void)lookup(nfdi_tree, " "); 3411 if (verbose > 2) 3412 tree_walk(nfdi_tree); 3413 if (verbose > 2) 3414 tree_walk(nfdicf_tree); 3415 normalization_test(); 3416 write_file(); 3417 3418 return 0; 3419 } 3420