1 /* 2 * Copyright (c) 2014 SGI. 3 * All rights reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 */ 15 16 #include "utf8n.h" 17 18 struct utf8data { 19 unsigned int maxage; 20 unsigned int offset; 21 }; 22 23 #define __INCLUDED_FROM_UTF8NORM_C__ 24 #include "utf8data.h" 25 #undef __INCLUDED_FROM_UTF8NORM_C__ 26 27 int utf8version_is_supported(u8 maj, u8 min, u8 rev) 28 { 29 int i = ARRAY_SIZE(utf8agetab) - 1; 30 unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); 31 32 while (i >= 0 && utf8agetab[i] != 0) { 33 if (sb_utf8version == utf8agetab[i]) 34 return 1; 35 i--; 36 } 37 return 0; 38 } 39 EXPORT_SYMBOL(utf8version_is_supported); 40 41 /* 42 * UTF-8 valid ranges. 43 * 44 * The UTF-8 encoding spreads the bits of a 32bit word over several 45 * bytes. This table gives the ranges that can be held and how they'd 46 * be represented. 47 * 48 * 0x00000000 0x0000007F: 0xxxxxxx 49 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx 50 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 51 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 52 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 53 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 54 * 55 * There is an additional requirement on UTF-8, in that only the 56 * shortest representation of a 32bit value is to be used. A decoder 57 * must not decode sequences that do not satisfy this requirement. 58 * Thus the allowed ranges have a lower bound. 59 * 60 * 0x00000000 0x0000007F: 0xxxxxxx 61 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx 62 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 63 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 64 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 65 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 66 * 67 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, 68 * 17 planes of 65536 values. This limits the sequences actually seen 69 * even more, to just the following. 70 * 71 * 0 - 0x7F: 0 - 0x7F 72 * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF 73 * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF 74 * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF 75 * 76 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. 77 * 78 * Note that the longest sequence seen with valid usage is 4 bytes, 79 * the same a single UTF-32 character. This makes the UTF-8 80 * representation of Unicode strictly smaller than UTF-32. 81 * 82 * The shortest sequence requirement was introduced by: 83 * Corrigendum #1: UTF-8 Shortest Form 84 * It can be found here: 85 * http://www.unicode.org/versions/corrigendum1.html 86 * 87 */ 88 89 /* 90 * Return the number of bytes used by the current UTF-8 sequence. 91 * Assumes the input points to the first byte of a valid UTF-8 92 * sequence. 93 */ 94 static inline int utf8clen(const char *s) 95 { 96 unsigned char c = *s; 97 98 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 99 } 100 101 /* 102 * Decode a 3-byte UTF-8 sequence. 103 */ 104 static unsigned int 105 utf8decode3(const char *str) 106 { 107 unsigned int uc; 108 109 uc = *str++ & 0x0F; 110 uc <<= 6; 111 uc |= *str++ & 0x3F; 112 uc <<= 6; 113 uc |= *str++ & 0x3F; 114 115 return uc; 116 } 117 118 /* 119 * Encode a 3-byte UTF-8 sequence. 120 */ 121 static int 122 utf8encode3(char *str, unsigned int val) 123 { 124 str[2] = (val & 0x3F) | 0x80; 125 val >>= 6; 126 str[1] = (val & 0x3F) | 0x80; 127 val >>= 6; 128 str[0] = val | 0xE0; 129 130 return 3; 131 } 132 133 /* 134 * utf8trie_t 135 * 136 * A compact binary tree, used to decode UTF-8 characters. 137 * 138 * Internal nodes are one byte for the node itself, and up to three 139 * bytes for an offset into the tree. The first byte contains the 140 * following information: 141 * NEXTBYTE - flag - advance to next byte if set 142 * BITNUM - 3 bit field - the bit number to tested 143 * OFFLEN - 2 bit field - number of bytes in the offset 144 * if offlen == 0 (non-branching node) 145 * RIGHTPATH - 1 bit field - set if the following node is for the 146 * right-hand path (tested bit is set) 147 * TRIENODE - 1 bit field - set if the following node is an internal 148 * node, otherwise it is a leaf node 149 * if offlen != 0 (branching node) 150 * LEFTNODE - 1 bit field - set if the left-hand node is internal 151 * RIGHTNODE - 1 bit field - set if the right-hand node is internal 152 * 153 * Due to the way utf8 works, there cannot be branching nodes with 154 * NEXTBYTE set, and moreover those nodes always have a righthand 155 * descendant. 156 */ 157 typedef const unsigned char utf8trie_t; 158 #define BITNUM 0x07 159 #define NEXTBYTE 0x08 160 #define OFFLEN 0x30 161 #define OFFLEN_SHIFT 4 162 #define RIGHTPATH 0x40 163 #define TRIENODE 0x80 164 #define RIGHTNODE 0x40 165 #define LEFTNODE 0x80 166 167 /* 168 * utf8leaf_t 169 * 170 * The leaves of the trie are embedded in the trie, and so the same 171 * underlying datatype: unsigned char. 172 * 173 * leaf[0]: The unicode version, stored as a generation number that is 174 * an index into utf8agetab[]. With this we can filter code 175 * points based on the unicode version in which they were 176 * defined. The CCC of a non-defined code point is 0. 177 * leaf[1]: Canonical Combining Class. During normalization, we need 178 * to do a stable sort into ascending order of all characters 179 * with a non-zero CCC that occur between two characters with 180 * a CCC of 0, or at the begin or end of a string. 181 * The unicode standard guarantees that all CCC values are 182 * between 0 and 254 inclusive, which leaves 255 available as 183 * a special value. 184 * Code points with CCC 0 are known as stoppers. 185 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the 186 * start of a NUL-terminated string that is the decomposition 187 * of the character. 188 * The CCC of a decomposable character is the same as the CCC 189 * of the first character of its decomposition. 190 * Some characters decompose as the empty string: these are 191 * characters with the Default_Ignorable_Code_Point property. 192 * These do affect normalization, as they all have CCC 0. 193 * 194 * The decompositions in the trie have been fully expanded, with the 195 * exception of Hangul syllables, which are decomposed algorithmically. 196 * 197 * Casefolding, if applicable, is also done using decompositions. 198 * 199 * The trie is constructed in such a way that leaves exist for all 200 * UTF-8 sequences that match the criteria from the "UTF-8 valid 201 * ranges" comment above, and only for those sequences. Therefore a 202 * lookup in the trie can be used to validate the UTF-8 input. 203 */ 204 typedef const unsigned char utf8leaf_t; 205 206 #define LEAF_GEN(LEAF) ((LEAF)[0]) 207 #define LEAF_CCC(LEAF) ((LEAF)[1]) 208 #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) 209 210 #define MINCCC (0) 211 #define MAXCCC (254) 212 #define STOPPER (0) 213 #define DECOMPOSE (255) 214 215 /* Marker for hangul syllable decomposition. */ 216 #define HANGUL ((char)(255)) 217 /* Size of the synthesized leaf used for Hangul syllable decomposition. */ 218 #define UTF8HANGULLEAF (12) 219 220 /* 221 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 222 * 223 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 224 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 225 * 226 * SBase = 0xAC00 227 * LBase = 0x1100 228 * VBase = 0x1161 229 * TBase = 0x11A7 230 * LCount = 19 231 * VCount = 21 232 * TCount = 28 233 * NCount = 588 (VCount * TCount) 234 * SCount = 11172 (LCount * NCount) 235 * 236 * Decomposition: 237 * SIndex = s - SBase 238 * 239 * LV (Canonical/Full) 240 * LIndex = SIndex / NCount 241 * VIndex = (Sindex % NCount) / TCount 242 * LPart = LBase + LIndex 243 * VPart = VBase + VIndex 244 * 245 * LVT (Canonical) 246 * LVIndex = (SIndex / TCount) * TCount 247 * TIndex = (Sindex % TCount) 248 * LVPart = SBase + LVIndex 249 * TPart = TBase + TIndex 250 * 251 * LVT (Full) 252 * LIndex = SIndex / NCount 253 * VIndex = (Sindex % NCount) / TCount 254 * TIndex = (Sindex % TCount) 255 * LPart = LBase + LIndex 256 * VPart = VBase + VIndex 257 * if (TIndex == 0) { 258 * d = <LPart, VPart> 259 * } else { 260 * TPart = TBase + TIndex 261 * d = <LPart, TPart, VPart> 262 * } 263 */ 264 265 /* Constants */ 266 #define SB (0xAC00) 267 #define LB (0x1100) 268 #define VB (0x1161) 269 #define TB (0x11A7) 270 #define LC (19) 271 #define VC (21) 272 #define TC (28) 273 #define NC (VC * TC) 274 #define SC (LC * NC) 275 276 /* Algorithmic decomposition of hangul syllable. */ 277 static utf8leaf_t * 278 utf8hangul(const char *str, unsigned char *hangul) 279 { 280 unsigned int si; 281 unsigned int li; 282 unsigned int vi; 283 unsigned int ti; 284 unsigned char *h; 285 286 /* Calculate the SI, LI, VI, and TI values. */ 287 si = utf8decode3(str) - SB; 288 li = si / NC; 289 vi = (si % NC) / TC; 290 ti = si % TC; 291 292 /* Fill in base of leaf. */ 293 h = hangul; 294 LEAF_GEN(h) = 2; 295 LEAF_CCC(h) = DECOMPOSE; 296 h += 2; 297 298 /* Add LPart, a 3-byte UTF-8 sequence. */ 299 h += utf8encode3((char *)h, li + LB); 300 301 /* Add VPart, a 3-byte UTF-8 sequence. */ 302 h += utf8encode3((char *)h, vi + VB); 303 304 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 305 if (ti) 306 h += utf8encode3((char *)h, ti + TB); 307 308 /* Terminate string. */ 309 h[0] = '\0'; 310 311 return hangul; 312 } 313 314 /* 315 * Use trie to scan s, touching at most len bytes. 316 * Returns the leaf if one exists, NULL otherwise. 317 * 318 * A non-NULL return guarantees that the UTF-8 sequence starting at s 319 * is well-formed and corresponds to a known unicode code point. The 320 * shorthand for this will be "is valid UTF-8 unicode". 321 */ 322 static utf8leaf_t *utf8nlookup(const struct utf8data *data, 323 unsigned char *hangul, const char *s, size_t len) 324 { 325 utf8trie_t *trie = NULL; 326 int offlen; 327 int offset; 328 int mask; 329 int node; 330 331 if (!data) 332 return NULL; 333 if (len == 0) 334 return NULL; 335 336 trie = utf8data + data->offset; 337 node = 1; 338 while (node) { 339 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 340 if (*trie & NEXTBYTE) { 341 if (--len == 0) 342 return NULL; 343 s++; 344 } 345 mask = 1 << (*trie & BITNUM); 346 if (*s & mask) { 347 /* Right leg */ 348 if (offlen) { 349 /* Right node at offset of trie */ 350 node = (*trie & RIGHTNODE); 351 offset = trie[offlen]; 352 while (--offlen) { 353 offset <<= 8; 354 offset |= trie[offlen]; 355 } 356 trie += offset; 357 } else if (*trie & RIGHTPATH) { 358 /* Right node after this node */ 359 node = (*trie & TRIENODE); 360 trie++; 361 } else { 362 /* No right node. */ 363 return NULL; 364 } 365 } else { 366 /* Left leg */ 367 if (offlen) { 368 /* Left node after this node. */ 369 node = (*trie & LEFTNODE); 370 trie += offlen + 1; 371 } else if (*trie & RIGHTPATH) { 372 /* No left node. */ 373 return NULL; 374 } else { 375 /* Left node after this node */ 376 node = (*trie & TRIENODE); 377 trie++; 378 } 379 } 380 } 381 /* 382 * Hangul decomposition is done algorithmically. These are the 383 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 384 * always 3 bytes long, so s has been advanced twice, and the 385 * start of the sequence is at s-2. 386 */ 387 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 388 trie = utf8hangul(s - 2, hangul); 389 return trie; 390 } 391 392 /* 393 * Use trie to scan s. 394 * Returns the leaf if one exists, NULL otherwise. 395 * 396 * Forwards to utf8nlookup(). 397 */ 398 static utf8leaf_t *utf8lookup(const struct utf8data *data, 399 unsigned char *hangul, const char *s) 400 { 401 return utf8nlookup(data, hangul, s, (size_t)-1); 402 } 403 404 /* 405 * Maximum age of any character in s. 406 * Return -1 if s is not valid UTF-8 unicode. 407 * Return 0 if only non-assigned code points are used. 408 */ 409 int utf8agemax(const struct utf8data *data, const char *s) 410 { 411 utf8leaf_t *leaf; 412 int age = 0; 413 int leaf_age; 414 unsigned char hangul[UTF8HANGULLEAF]; 415 416 if (!data) 417 return -1; 418 419 while (*s) { 420 leaf = utf8lookup(data, hangul, s); 421 if (!leaf) 422 return -1; 423 424 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 425 if (leaf_age <= data->maxage && leaf_age > age) 426 age = leaf_age; 427 s += utf8clen(s); 428 } 429 return age; 430 } 431 EXPORT_SYMBOL(utf8agemax); 432 433 /* 434 * Minimum age of any character in s. 435 * Return -1 if s is not valid UTF-8 unicode. 436 * Return 0 if non-assigned code points are used. 437 */ 438 int utf8agemin(const struct utf8data *data, const char *s) 439 { 440 utf8leaf_t *leaf; 441 int age; 442 int leaf_age; 443 unsigned char hangul[UTF8HANGULLEAF]; 444 445 if (!data) 446 return -1; 447 age = data->maxage; 448 while (*s) { 449 leaf = utf8lookup(data, hangul, s); 450 if (!leaf) 451 return -1; 452 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 453 if (leaf_age <= data->maxage && leaf_age < age) 454 age = leaf_age; 455 s += utf8clen(s); 456 } 457 return age; 458 } 459 EXPORT_SYMBOL(utf8agemin); 460 461 /* 462 * Maximum age of any character in s, touch at most len bytes. 463 * Return -1 if s is not valid UTF-8 unicode. 464 */ 465 int utf8nagemax(const struct utf8data *data, const char *s, size_t len) 466 { 467 utf8leaf_t *leaf; 468 int age = 0; 469 int leaf_age; 470 unsigned char hangul[UTF8HANGULLEAF]; 471 472 if (!data) 473 return -1; 474 475 while (len && *s) { 476 leaf = utf8nlookup(data, hangul, s, len); 477 if (!leaf) 478 return -1; 479 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 480 if (leaf_age <= data->maxage && leaf_age > age) 481 age = leaf_age; 482 len -= utf8clen(s); 483 s += utf8clen(s); 484 } 485 return age; 486 } 487 EXPORT_SYMBOL(utf8nagemax); 488 489 /* 490 * Maximum age of any character in s, touch at most len bytes. 491 * Return -1 if s is not valid UTF-8 unicode. 492 */ 493 int utf8nagemin(const struct utf8data *data, const char *s, size_t len) 494 { 495 utf8leaf_t *leaf; 496 int leaf_age; 497 int age; 498 unsigned char hangul[UTF8HANGULLEAF]; 499 500 if (!data) 501 return -1; 502 age = data->maxage; 503 while (len && *s) { 504 leaf = utf8nlookup(data, hangul, s, len); 505 if (!leaf) 506 return -1; 507 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 508 if (leaf_age <= data->maxage && leaf_age < age) 509 age = leaf_age; 510 len -= utf8clen(s); 511 s += utf8clen(s); 512 } 513 return age; 514 } 515 EXPORT_SYMBOL(utf8nagemin); 516 517 /* 518 * Length of the normalization of s. 519 * Return -1 if s is not valid UTF-8 unicode. 520 * 521 * A string of Default_Ignorable_Code_Point has length 0. 522 */ 523 ssize_t utf8len(const struct utf8data *data, const char *s) 524 { 525 utf8leaf_t *leaf; 526 size_t ret = 0; 527 unsigned char hangul[UTF8HANGULLEAF]; 528 529 if (!data) 530 return -1; 531 while (*s) { 532 leaf = utf8lookup(data, hangul, s); 533 if (!leaf) 534 return -1; 535 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 536 ret += utf8clen(s); 537 else if (LEAF_CCC(leaf) == DECOMPOSE) 538 ret += strlen(LEAF_STR(leaf)); 539 else 540 ret += utf8clen(s); 541 s += utf8clen(s); 542 } 543 return ret; 544 } 545 EXPORT_SYMBOL(utf8len); 546 547 /* 548 * Length of the normalization of s, touch at most len bytes. 549 * Return -1 if s is not valid UTF-8 unicode. 550 */ 551 ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) 552 { 553 utf8leaf_t *leaf; 554 size_t ret = 0; 555 unsigned char hangul[UTF8HANGULLEAF]; 556 557 if (!data) 558 return -1; 559 while (len && *s) { 560 leaf = utf8nlookup(data, hangul, s, len); 561 if (!leaf) 562 return -1; 563 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 564 ret += utf8clen(s); 565 else if (LEAF_CCC(leaf) == DECOMPOSE) 566 ret += strlen(LEAF_STR(leaf)); 567 else 568 ret += utf8clen(s); 569 len -= utf8clen(s); 570 s += utf8clen(s); 571 } 572 return ret; 573 } 574 EXPORT_SYMBOL(utf8nlen); 575 576 /* 577 * Set up an utf8cursor for use by utf8byte(). 578 * 579 * u8c : pointer to cursor. 580 * data : const struct utf8data to use for normalization. 581 * s : string. 582 * len : length of s. 583 * 584 * Returns -1 on error, 0 on success. 585 */ 586 int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, 587 const char *s, size_t len) 588 { 589 if (!data) 590 return -1; 591 if (!s) 592 return -1; 593 u8c->data = data; 594 u8c->s = s; 595 u8c->p = NULL; 596 u8c->ss = NULL; 597 u8c->sp = NULL; 598 u8c->len = len; 599 u8c->slen = 0; 600 u8c->ccc = STOPPER; 601 u8c->nccc = STOPPER; 602 /* Check we didn't clobber the maximum length. */ 603 if (u8c->len != len) 604 return -1; 605 /* The first byte of s may not be an utf8 continuation. */ 606 if (len > 0 && (*s & 0xC0) == 0x80) 607 return -1; 608 return 0; 609 } 610 EXPORT_SYMBOL(utf8ncursor); 611 612 /* 613 * Set up an utf8cursor for use by utf8byte(). 614 * 615 * u8c : pointer to cursor. 616 * data : const struct utf8data to use for normalization. 617 * s : NUL-terminated string. 618 * 619 * Returns -1 on error, 0 on success. 620 */ 621 int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, 622 const char *s) 623 { 624 return utf8ncursor(u8c, data, s, (unsigned int)-1); 625 } 626 EXPORT_SYMBOL(utf8cursor); 627 628 /* 629 * Get one byte from the normalized form of the string described by u8c. 630 * 631 * Returns the byte cast to an unsigned char on succes, and -1 on failure. 632 * 633 * The cursor keeps track of the location in the string in u8c->s. 634 * When a character is decomposed, the current location is stored in 635 * u8c->p, and u8c->s is set to the start of the decomposition. Note 636 * that bytes from a decomposition do not count against u8c->len. 637 * 638 * Characters are emitted if they match the current CCC in u8c->ccc. 639 * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 640 * and the function returns 0 in that case. 641 * 642 * Sorting by CCC is done by repeatedly scanning the string. The 643 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 644 * the start of the scan. The first pass finds the lowest CCC to be 645 * emitted and stores it in u8c->nccc, the second pass emits the 646 * characters with this CCC and finds the next lowest CCC. This limits 647 * the number of passes to 1 + the number of different CCCs in the 648 * sequence being scanned. 649 * 650 * Therefore: 651 * u8c->p != NULL -> a decomposition is being scanned. 652 * u8c->ss != NULL -> this is a repeating scan. 653 * u8c->ccc == -1 -> this is the first scan of a repeating scan. 654 */ 655 int utf8byte(struct utf8cursor *u8c) 656 { 657 utf8leaf_t *leaf; 658 int ccc; 659 660 for (;;) { 661 /* Check for the end of a decomposed character. */ 662 if (u8c->p && *u8c->s == '\0') { 663 u8c->s = u8c->p; 664 u8c->p = NULL; 665 } 666 667 /* Check for end-of-string. */ 668 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 669 /* There is no next byte. */ 670 if (u8c->ccc == STOPPER) 671 return 0; 672 /* End-of-string during a scan counts as a stopper. */ 673 ccc = STOPPER; 674 goto ccc_mismatch; 675 } else if ((*u8c->s & 0xC0) == 0x80) { 676 /* This is a continuation of the current character. */ 677 if (!u8c->p) 678 u8c->len--; 679 return (unsigned char)*u8c->s++; 680 } 681 682 /* Look up the data for the current character. */ 683 if (u8c->p) { 684 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 685 } else { 686 leaf = utf8nlookup(u8c->data, u8c->hangul, 687 u8c->s, u8c->len); 688 } 689 690 /* No leaf found implies that the input is a binary blob. */ 691 if (!leaf) 692 return -1; 693 694 ccc = LEAF_CCC(leaf); 695 /* Characters that are too new have CCC 0. */ 696 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { 697 ccc = STOPPER; 698 } else if (ccc == DECOMPOSE) { 699 u8c->len -= utf8clen(u8c->s); 700 u8c->p = u8c->s + utf8clen(u8c->s); 701 u8c->s = LEAF_STR(leaf); 702 /* Empty decomposition implies CCC 0. */ 703 if (*u8c->s == '\0') { 704 if (u8c->ccc == STOPPER) 705 continue; 706 ccc = STOPPER; 707 goto ccc_mismatch; 708 } 709 710 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 711 ccc = LEAF_CCC(leaf); 712 } 713 714 /* 715 * If this is not a stopper, then see if it updates 716 * the next canonical class to be emitted. 717 */ 718 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 719 u8c->nccc = ccc; 720 721 /* 722 * Return the current byte if this is the current 723 * combining class. 724 */ 725 if (ccc == u8c->ccc) { 726 if (!u8c->p) 727 u8c->len--; 728 return (unsigned char)*u8c->s++; 729 } 730 731 /* Current combining class mismatch. */ 732 ccc_mismatch: 733 if (u8c->nccc == STOPPER) { 734 /* 735 * Scan forward for the first canonical class 736 * to be emitted. Save the position from 737 * which to restart. 738 */ 739 u8c->ccc = MINCCC - 1; 740 u8c->nccc = ccc; 741 u8c->sp = u8c->p; 742 u8c->ss = u8c->s; 743 u8c->slen = u8c->len; 744 if (!u8c->p) 745 u8c->len -= utf8clen(u8c->s); 746 u8c->s += utf8clen(u8c->s); 747 } else if (ccc != STOPPER) { 748 /* Not a stopper, and not the ccc we're emitting. */ 749 if (!u8c->p) 750 u8c->len -= utf8clen(u8c->s); 751 u8c->s += utf8clen(u8c->s); 752 } else if (u8c->nccc != MAXCCC + 1) { 753 /* At a stopper, restart for next ccc. */ 754 u8c->ccc = u8c->nccc; 755 u8c->nccc = MAXCCC + 1; 756 u8c->s = u8c->ss; 757 u8c->p = u8c->sp; 758 u8c->len = u8c->slen; 759 } else { 760 /* All done, proceed from here. */ 761 u8c->ccc = STOPPER; 762 u8c->nccc = STOPPER; 763 u8c->sp = NULL; 764 u8c->ss = NULL; 765 u8c->slen = 0; 766 } 767 } 768 } 769 EXPORT_SYMBOL(utf8byte); 770 771 const struct utf8data *utf8nfdi(unsigned int maxage) 772 { 773 int i = ARRAY_SIZE(utf8nfdidata) - 1; 774 775 while (maxage < utf8nfdidata[i].maxage) 776 i--; 777 if (maxage > utf8nfdidata[i].maxage) 778 return NULL; 779 return &utf8nfdidata[i]; 780 } 781 EXPORT_SYMBOL(utf8nfdi); 782 783 const struct utf8data *utf8nfdicf(unsigned int maxage) 784 { 785 int i = ARRAY_SIZE(utf8nfdicfdata) - 1; 786 787 while (maxage < utf8nfdicfdata[i].maxage) 788 i--; 789 if (maxage > utf8nfdicfdata[i].maxage) 790 return NULL; 791 return &utf8nfdicfdata[i]; 792 } 793 EXPORT_SYMBOL(utf8nfdicf); 794