xref: /openbmc/linux/fs/unicode/utf8-norm.c (revision 151f4e2b)
1 /*
2  * Copyright (c) 2014 SGI.
3  * All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  */
15 
16 #include "utf8n.h"
17 
/*
 * Describes one normalization trie stored inside the utf8data[] blob.
 */
struct utf8data {
	/*
	 * Newest unicode age this trie is used for.  Leaves whose age is
	 * greater than this are handled as if they were not assigned.
	 */
	unsigned int maxage;
	/* Index into utf8data[] at which the root node of the trie starts. */
	unsigned int offset;
};
22 
23 #define __INCLUDED_FROM_UTF8NORM_C__
24 #include "utf8data.h"
25 #undef __INCLUDED_FROM_UTF8NORM_C__
26 
27 int utf8version_is_supported(u8 maj, u8 min, u8 rev)
28 {
29 	int i = ARRAY_SIZE(utf8agetab) - 1;
30 	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
31 
32 	while (i >= 0 && utf8agetab[i] != 0) {
33 		if (sb_utf8version == utf8agetab[i])
34 			return 1;
35 		i--;
36 	}
37 	return 0;
38 }
39 EXPORT_SYMBOL(utf8version_is_supported);
40 
/*
 * Return utf8vers, the version constant supplied by the included
 * utf8data.h.
 */
int utf8version_latest(void)
{
	return utf8vers;
}
EXPORT_SYMBOL(utf8version_latest);
46 
47 /*
48  * UTF-8 valid ranges.
49  *
50  * The UTF-8 encoding spreads the bits of a 32bit word over several
51  * bytes. This table gives the ranges that can be held and how they'd
52  * be represented.
53  *
54  * 0x00000000 0x0000007F: 0xxxxxxx
55  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
56  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
57  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
58  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
59  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
60  *
61  * There is an additional requirement on UTF-8, in that only the
62  * shortest representation of a 32bit value is to be used.  A decoder
63  * must not decode sequences that do not satisfy this requirement.
64  * Thus the allowed ranges have a lower bound.
65  *
66  * 0x00000000 0x0000007F: 0xxxxxxx
67  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
68  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
69  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
70  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
71  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
72  *
73  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
74  * 17 planes of 65536 values.  This limits the sequences actually seen
75  * even more, to just the following.
76  *
77  *          0 -     0x7F: 0                   - 0x7F
78  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
79  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
80  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
81  *
82  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
83  *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
 * representation of Unicode never larger than UTF-32.
87  *
88  * The shortest sequence requirement was introduced by:
89  *    Corrigendum #1: UTF-8 Shortest Form
90  * It can be found here:
91  *    http://www.unicode.org/versions/corrigendum1.html
92  *
93  */
94 
/*
 * Length in bytes of the UTF-8 sequence starting at s.
 *
 * Only the lead byte is examined, so s must point at the first byte
 * of a valid UTF-8 sequence.  Lead bytes below 0xC0 give 1; each of
 * the thresholds 0xC0, 0xE0 and 0xF0 adds one more byte.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
106 
/*
 * Decode the 3-byte UTF-8 sequence at str into its code point.
 *
 * The lead byte contributes its low 4 bits and each of the two
 * following bytes its low 6 bits.  No validation is performed.
 */
static unsigned int
utf8decode3(const char *str)
{
	const unsigned int	top = str[0] & 0x0F;
	const unsigned int	mid = str[1] & 0x3F;
	const unsigned int	low = str[2] & 0x3F;

	return (top << 12) | (mid << 6) | low;
}
123 
/*
 * Encode val as a 3-byte UTF-8 sequence into str[0..2].
 *
 * The two trailing bytes carry 6 bits each behind a 10xxxxxx marker;
 * whatever remains above those 12 bits is or-ed into the 0xE0 lead
 * byte without masking.  No terminator is written.
 *
 * Returns the number of bytes stored, which is always 3.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	const unsigned int	top = val >> 12;
	const unsigned int	mid = (val >> 6) & 0x3F;
	const unsigned int	low = val & 0x3F;

	str[0] = top | 0xE0;
	str[1] = mid | 0x80;
	str[2] = low | 0x80;

	return 3;
}
138 
139 /*
140  * utf8trie_t
141  *
142  * A compact binary tree, used to decode UTF-8 characters.
143  *
144  * Internal nodes are one byte for the node itself, and up to three
145  * bytes for an offset into the tree.  The first byte contains the
146  * following information:
 *  BITNUM    - 3 bit field - the bit number to be tested
148  *  BITNUM    - 3 bit field - the bit number to tested
149  *  OFFLEN    - 2 bit field - number of bytes in the offset
150  * if offlen == 0 (non-branching node)
151  *  RIGHTPATH - 1 bit field - set if the following node is for the
152  *                            right-hand path (tested bit is set)
153  *  TRIENODE  - 1 bit field - set if the following node is an internal
154  *                            node, otherwise it is a leaf node
155  * if offlen != 0 (branching node)
156  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
157  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
158  *
159  * Due to the way utf8 works, there cannot be branching nodes with
160  * NEXTBYTE set, and moreover those nodes always have a righthand
161  * descendant.
162  */
/* Trie data is read-only and is walked one unsigned byte at a time. */
typedef const unsigned char utf8trie_t;
#define BITNUM		0x07	/* bits 0-2: which bit of the input byte to test */
#define NEXTBYTE	0x08	/* bit 3: step to the next input byte before testing */
#define OFFLEN		0x30	/* bits 4-5: number of offset bytes following the node */
#define OFFLEN_SHIFT	4	/* shift that moves the OFFLEN field down to bit 0 */
/* Bits 6-7 as used by a non-branching node (OFFLEN field is 0). */
#define RIGHTPATH	0x40
#define TRIENODE	0x80
/* The same two bits as used by a branching node (OFFLEN field is non-zero). */
#define RIGHTNODE	0x40
#define LEFTNODE	0x80
172 
173 /*
174  * utf8leaf_t
175  *
176  * The leaves of the trie are embedded in the trie, and so the same
177  * underlying datatype: unsigned char.
178  *
179  * leaf[0]: The unicode version, stored as a generation number that is
180  *          an index into utf8agetab[].  With this we can filter code
181  *          points based on the unicode version in which they were
182  *          defined.  The CCC of a non-defined code point is 0.
183  * leaf[1]: Canonical Combining Class. During normalization, we need
184  *          to do a stable sort into ascending order of all characters
185  *          with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the beginning or end of a string.
187  *          The unicode standard guarantees that all CCC values are
188  *          between 0 and 254 inclusive, which leaves 255 available as
189  *          a special value.
190  *          Code points with CCC 0 are known as stoppers.
191  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
192  *          start of a NUL-terminated string that is the decomposition
193  *          of the character.
194  *          The CCC of a decomposable character is the same as the CCC
195  *          of the first character of its decomposition.
196  *          Some characters decompose as the empty string: these are
197  *          characters with the Default_Ignorable_Code_Point property.
198  *          These do affect normalization, as they all have CCC 0.
199  *
200  * The decompositions in the trie have been fully expanded, with the
201  * exception of Hangul syllables, which are decomposed algorithmically.
202  *
203  * Casefolding, if applicable, is also done using decompositions.
204  *
205  * The trie is constructed in such a way that leaves exist for all
206  * UTF-8 sequences that match the criteria from the "UTF-8 valid
207  * ranges" comment above, and only for those sequences.  Therefore a
208  * lookup in the trie can be used to validate the UTF-8 input.
209  */
/* A leaf is addressed as a read-only run of unsigned bytes. */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three leaf fields: generation, CCC, decomposition. */
#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))

/* Range of real CCC values, plus the two special meanings used here. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)	/* CCC 0 */
#define	DECOMPOSE	(255)	/* leaf carries a decomposition string */

/* Marker for hangul syllable decomposition. */
#define HANGUL		((char)(255))
/*
 * Size of the synthesized leaf used for Hangul syllable decomposition:
 * 2 header bytes, up to three 3-byte UTF-8 sequences, and a NUL.
 */
#define UTF8HANGULLEAF	(12)
225 
226 /*
227  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
228  *
229  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
230  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
231  *
232  * SBase = 0xAC00
233  * LBase = 0x1100
234  * VBase = 0x1161
235  * TBase = 0x11A7
236  * LCount = 19
237  * VCount = 21
238  * TCount = 28
239  * NCount = 588 (VCount * TCount)
240  * SCount = 11172 (LCount * NCount)
241  *
242  * Decomposition:
243  *   SIndex = s - SBase
244  *
245  * LV (Canonical/Full)
246  *   LIndex = SIndex / NCount
247  *   VIndex = (Sindex % NCount) / TCount
248  *   LPart = LBase + LIndex
249  *   VPart = VBase + VIndex
250  *
251  * LVT (Canonical)
252  *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
254  *   LVPart = SBase + LVIndex
255  *   TPart = TBase + TIndex
256  *
 * LVT (Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
 */
269  */
270 
/* Constants */
#define SB	(0xAC00)	/* SBase: first Hangul syllable */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
281 
282 /* Algorithmic decomposition of hangul syllable. */
283 static utf8leaf_t *
284 utf8hangul(const char *str, unsigned char *hangul)
285 {
286 	unsigned int	si;
287 	unsigned int	li;
288 	unsigned int	vi;
289 	unsigned int	ti;
290 	unsigned char	*h;
291 
292 	/* Calculate the SI, LI, VI, and TI values. */
293 	si = utf8decode3(str) - SB;
294 	li = si / NC;
295 	vi = (si % NC) / TC;
296 	ti = si % TC;
297 
298 	/* Fill in base of leaf. */
299 	h = hangul;
300 	LEAF_GEN(h) = 2;
301 	LEAF_CCC(h) = DECOMPOSE;
302 	h += 2;
303 
304 	/* Add LPart, a 3-byte UTF-8 sequence. */
305 	h += utf8encode3((char *)h, li + LB);
306 
307 	/* Add VPart, a 3-byte UTF-8 sequence. */
308 	h += utf8encode3((char *)h, vi + VB);
309 
310 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
311 	if (ti)
312 		h += utf8encode3((char *)h, ti + TB);
313 
314 	/* Terminate string. */
315 	h[0] = '\0';
316 
317 	return hangul;
318 }
319 
/*
 * Use trie to scan s, touching at most len bytes.
 * Returns the leaf if one exists, NULL otherwise.
 *
 * A non-NULL return guarantees that the UTF-8 sequence starting at s
 * is well-formed and corresponds to a known unicode code point.  The
 * shorthand for this will be "is valid UTF-8 unicode".
 *
 *   data   : selects the trie inside utf8data[]; NULL yields NULL.
 *   hangul : scratch buffer that receives a synthesized leaf when the
 *            sequence is a Hangul syllable.  In that case the return
 *            value points into this buffer rather than into the trie.
 *   s      : first byte of the sequence to look up.
 *   len    : number of bytes available at s; 0 yields NULL.
 */
static utf8leaf_t *utf8nlookup(const struct utf8data *data,
			       unsigned char *hangul, const char *s, size_t len)
{
	utf8trie_t	*trie = NULL;
	int		offlen;
	int		offset;
	int		mask;
	int		node;

	if (!data)
		return NULL;
	if (len == 0)
		return NULL;

	trie = utf8data + data->offset;
	/* node stays non-zero for as long as trie points at an internal node. */
	node = 1;
	while (node) {
		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
		if (*trie & NEXTBYTE) {
			/* The sequence needs another byte; fail if none is left. */
			if (--len == 0)
				return NULL;
			s++;
		}
		mask = 1 << (*trie & BITNUM);
		if (*s & mask) {
			/* Right leg */
			if (offlen) {
				/* Right node at offset of trie */
				node = (*trie & RIGHTNODE);
				/*
				 * The offset occupies trie[1..offlen], least
				 * significant byte first.
				 */
				offset = trie[offlen];
				while (--offlen) {
					offset <<= 8;
					offset |= trie[offlen];
				}
				trie += offset;
			} else if (*trie & RIGHTPATH) {
				/* Right node after this node */
				node = (*trie & TRIENODE);
				trie++;
			} else {
				/* No right node. */
				return NULL;
			}
		} else {
			/* Left leg */
			if (offlen) {
				/* Left node after this node. */
				node = (*trie & LEFTNODE);
				/* Skip the node byte and its offset bytes. */
				trie += offlen + 1;
			} else if (*trie & RIGHTPATH) {
				/* No left node. */
				return NULL;
			} else {
				/* Left node after this node */
				node = (*trie & TRIENODE);
				trie++;
			}
		}
	}
	/*
	 * Hangul decomposition is done algorithmically. These are the
	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
	 * always 3 bytes long, so s has been advanced twice, and the
	 * start of the sequence is at s-2.
	 */
	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
		trie = utf8hangul(s - 2, hangul);
	return trie;
}
397 
/*
 * Use trie to scan s.
 * Returns the leaf if one exists, NULL otherwise.
 *
 * Forwards to utf8nlookup(), passing the largest size_t value as the
 * length so that the byte limit does not come into play.
 *
 *   data   : selects the trie; passed through unchanged.
 *   hangul : scratch buffer for a synthesized Hangul leaf.
 *   s      : first byte of the sequence to look up.
 *
 * NOTE(review): the callers in this file pass NUL-terminated strings;
 * confirm that the trie never walks past a terminating NUL.
 */
static utf8leaf_t *utf8lookup(const struct utf8data *data,
			      unsigned char *hangul, const char *s)
{
	return utf8nlookup(data, hangul, s, (size_t)-1);
}
409 
410 /*
411  * Maximum age of any character in s.
412  * Return -1 if s is not valid UTF-8 unicode.
413  * Return 0 if only non-assigned code points are used.
414  */
415 int utf8agemax(const struct utf8data *data, const char *s)
416 {
417 	utf8leaf_t	*leaf;
418 	int		age = 0;
419 	int		leaf_age;
420 	unsigned char	hangul[UTF8HANGULLEAF];
421 
422 	if (!data)
423 		return -1;
424 
425 	while (*s) {
426 		leaf = utf8lookup(data, hangul, s);
427 		if (!leaf)
428 			return -1;
429 
430 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
431 		if (leaf_age <= data->maxage && leaf_age > age)
432 			age = leaf_age;
433 		s += utf8clen(s);
434 	}
435 	return age;
436 }
437 EXPORT_SYMBOL(utf8agemax);
438 
439 /*
440  * Minimum age of any character in s.
441  * Return -1 if s is not valid UTF-8 unicode.
442  * Return 0 if non-assigned code points are used.
443  */
444 int utf8agemin(const struct utf8data *data, const char *s)
445 {
446 	utf8leaf_t	*leaf;
447 	int		age;
448 	int		leaf_age;
449 	unsigned char	hangul[UTF8HANGULLEAF];
450 
451 	if (!data)
452 		return -1;
453 	age = data->maxage;
454 	while (*s) {
455 		leaf = utf8lookup(data, hangul, s);
456 		if (!leaf)
457 			return -1;
458 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
459 		if (leaf_age <= data->maxage && leaf_age < age)
460 			age = leaf_age;
461 		s += utf8clen(s);
462 	}
463 	return age;
464 }
465 EXPORT_SYMBOL(utf8agemin);
466 
467 /*
468  * Maximum age of any character in s, touch at most len bytes.
469  * Return -1 if s is not valid UTF-8 unicode.
470  */
471 int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
472 {
473 	utf8leaf_t	*leaf;
474 	int		age = 0;
475 	int		leaf_age;
476 	unsigned char	hangul[UTF8HANGULLEAF];
477 
478 	if (!data)
479 		return -1;
480 
481 	while (len && *s) {
482 		leaf = utf8nlookup(data, hangul, s, len);
483 		if (!leaf)
484 			return -1;
485 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
486 		if (leaf_age <= data->maxage && leaf_age > age)
487 			age = leaf_age;
488 		len -= utf8clen(s);
489 		s += utf8clen(s);
490 	}
491 	return age;
492 }
493 EXPORT_SYMBOL(utf8nagemax);
494 
495 /*
496  * Maximum age of any character in s, touch at most len bytes.
497  * Return -1 if s is not valid UTF-8 unicode.
498  */
499 int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
500 {
501 	utf8leaf_t	*leaf;
502 	int		leaf_age;
503 	int		age;
504 	unsigned char	hangul[UTF8HANGULLEAF];
505 
506 	if (!data)
507 		return -1;
508 	age = data->maxage;
509 	while (len && *s) {
510 		leaf = utf8nlookup(data, hangul, s, len);
511 		if (!leaf)
512 			return -1;
513 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
514 		if (leaf_age <= data->maxage && leaf_age < age)
515 			age = leaf_age;
516 		len -= utf8clen(s);
517 		s += utf8clen(s);
518 	}
519 	return age;
520 }
521 EXPORT_SYMBOL(utf8nagemin);
522 
523 /*
524  * Length of the normalization of s.
525  * Return -1 if s is not valid UTF-8 unicode.
526  *
527  * A string of Default_Ignorable_Code_Point has length 0.
528  */
529 ssize_t utf8len(const struct utf8data *data, const char *s)
530 {
531 	utf8leaf_t	*leaf;
532 	size_t		ret = 0;
533 	unsigned char	hangul[UTF8HANGULLEAF];
534 
535 	if (!data)
536 		return -1;
537 	while (*s) {
538 		leaf = utf8lookup(data, hangul, s);
539 		if (!leaf)
540 			return -1;
541 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
542 			ret += utf8clen(s);
543 		else if (LEAF_CCC(leaf) == DECOMPOSE)
544 			ret += strlen(LEAF_STR(leaf));
545 		else
546 			ret += utf8clen(s);
547 		s += utf8clen(s);
548 	}
549 	return ret;
550 }
551 EXPORT_SYMBOL(utf8len);
552 
553 /*
554  * Length of the normalization of s, touch at most len bytes.
555  * Return -1 if s is not valid UTF-8 unicode.
556  */
557 ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
558 {
559 	utf8leaf_t	*leaf;
560 	size_t		ret = 0;
561 	unsigned char	hangul[UTF8HANGULLEAF];
562 
563 	if (!data)
564 		return -1;
565 	while (len && *s) {
566 		leaf = utf8nlookup(data, hangul, s, len);
567 		if (!leaf)
568 			return -1;
569 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
570 			ret += utf8clen(s);
571 		else if (LEAF_CCC(leaf) == DECOMPOSE)
572 			ret += strlen(LEAF_STR(leaf));
573 		else
574 			ret += utf8clen(s);
575 		len -= utf8clen(s);
576 		s += utf8clen(s);
577 	}
578 	return ret;
579 }
580 EXPORT_SYMBOL(utf8nlen);
581 
582 /*
583  * Set up an utf8cursor for use by utf8byte().
584  *
585  *   u8c    : pointer to cursor.
586  *   data   : const struct utf8data to use for normalization.
587  *   s      : string.
588  *   len    : length of s.
589  *
590  * Returns -1 on error, 0 on success.
591  */
592 int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
593 		const char *s, size_t len)
594 {
595 	if (!data)
596 		return -1;
597 	if (!s)
598 		return -1;
599 	u8c->data = data;
600 	u8c->s = s;
601 	u8c->p = NULL;
602 	u8c->ss = NULL;
603 	u8c->sp = NULL;
604 	u8c->len = len;
605 	u8c->slen = 0;
606 	u8c->ccc = STOPPER;
607 	u8c->nccc = STOPPER;
608 	/* Check we didn't clobber the maximum length. */
609 	if (u8c->len != len)
610 		return -1;
611 	/* The first byte of s may not be an utf8 continuation. */
612 	if (len > 0 && (*s & 0xC0) == 0x80)
613 		return -1;
614 	return 0;
615 }
616 EXPORT_SYMBOL(utf8ncursor);
617 
/*
 * Set up an utf8cursor for use by utf8byte().
 *
 *   u8c    : pointer to cursor.
 *   data   : const struct utf8data to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with (unsigned int)-1 as the length, so
 * the end of the input is found through the NUL terminator.
 * NOTE(review): the unsigned int cast presumably matches the type of
 * u8c->len, which is declared outside this file -- confirm.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	return utf8ncursor(u8c, data, s, (unsigned int)-1);
}
EXPORT_SYMBOL(utf8cursor);
633 
/*
 * Get one byte from the normalized form of the string described by u8c.
 *
 * Returns the byte cast to an unsigned char on success, and -1 on failure.
 *
 * The cursor keeps track of the location in the string in u8c->s.
 * When a character is decomposed, the current location is stored in
 * u8c->p, and u8c->s is set to the start of the decomposition. Note
 * that bytes from a decomposition do not count against u8c->len.
 *
 * Characters are emitted if they match the current CCC in u8c->ccc.
 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 * and the function returns 0 in that case.
 *
 * Sorting by CCC is done by repeatedly scanning the string.  The
 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 * the start of the scan.  The first pass finds the lowest CCC to be
 * emitted and stores it in u8c->nccc, the second pass emits the
 * characters with this CCC and finds the next lowest CCC. This limits
 * the number of passes to 1 + the number of different CCCs in the
 * sequence being scanned.
 *
 * Therefore:
 *  u8c->p  != NULL -> a decomposition is being scanned.
 *  u8c->ss != NULL -> this is a repeating scan.
 *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
 */
int utf8byte(struct utf8cursor *u8c)
{
	utf8leaf_t *leaf;
	int ccc;

	for (;;) {
		/* Check for the end of a decomposed character. */
		if (u8c->p && *u8c->s == '\0') {
			/* Resume in the input, right after the decomposed character. */
			u8c->s = u8c->p;
			u8c->p = NULL;
		}

		/* Check for end-of-string. */
		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
			/* There is no next byte. */
			if (u8c->ccc == STOPPER)
				return 0;
			/* End-of-string during a scan counts as a stopper. */
			ccc = STOPPER;
			goto ccc_mismatch;
		} else if ((*u8c->s & 0xC0) == 0x80) {
			/* This is a continuation of the current character. */
			if (!u8c->p)
				u8c->len--;
			return (unsigned char)*u8c->s++;
		}

		/* Look up the data for the current character. */
		if (u8c->p) {
			/* Inside a decomposition: u8c->len does not apply. */
			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
		} else {
			leaf = utf8nlookup(u8c->data, u8c->hangul,
					   u8c->s, u8c->len);
		}

		/* No leaf found implies that the input is a binary blob. */
		if (!leaf)
			return -1;

		ccc = LEAF_CCC(leaf);
		/* Characters that are too new have CCC 0. */
		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
			ccc = STOPPER;
		} else if (ccc == DECOMPOSE) {
			/*
			 * Account for the whole input character now, remember
			 * where the input continues, and switch to reading the
			 * decomposition string.
			 */
			u8c->len -= utf8clen(u8c->s);
			u8c->p = u8c->s + utf8clen(u8c->s);
			u8c->s = LEAF_STR(leaf);
			/* Empty decomposition implies CCC 0. */
			if (*u8c->s == '\0') {
				if (u8c->ccc == STOPPER)
					continue;
				ccc = STOPPER;
				goto ccc_mismatch;
			}

			/* Use the CCC of the first decomposed character. */
			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
			if (!leaf)
				return -1;
			ccc = LEAF_CCC(leaf);
		}

		/*
		 * If this is not a stopper, then see if it updates
		 * the next canonical class to be emitted.
		 */
		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
			u8c->nccc = ccc;

		/*
		 * Return the current byte if this is the current
		 * combining class.
		 */
		if (ccc == u8c->ccc) {
			if (!u8c->p)
				u8c->len--;
			return (unsigned char)*u8c->s++;
		}

		/* Current combining class mismatch. */
ccc_mismatch:
		if (u8c->nccc == STOPPER) {
			/*
			 * Scan forward for the first canonical class
			 * to be emitted.  Save the position from
			 * which to restart.
			 */
			u8c->ccc = MINCCC - 1;
			u8c->nccc = ccc;
			u8c->sp = u8c->p;
			u8c->ss = u8c->s;
			u8c->slen = u8c->len;
			if (!u8c->p)
				u8c->len -= utf8clen(u8c->s);
			u8c->s += utf8clen(u8c->s);
		} else if (ccc != STOPPER) {
			/* Not a stopper, and not the ccc we're emitting. */
			if (!u8c->p)
				u8c->len -= utf8clen(u8c->s);
			u8c->s += utf8clen(u8c->s);
		} else if (u8c->nccc != MAXCCC + 1) {
			/* At a stopper, restart for next ccc. */
			u8c->ccc = u8c->nccc;
			/* MAXCCC + 1 means no further class has been seen yet. */
			u8c->nccc = MAXCCC + 1;
			u8c->s = u8c->ss;
			u8c->p = u8c->sp;
			u8c->len = u8c->slen;
		} else {
			/* All done, proceed from here. */
			u8c->ccc = STOPPER;
			u8c->nccc = STOPPER;
			u8c->sp = NULL;
			u8c->ss = NULL;
			u8c->slen = 0;
		}
	}
}
EXPORT_SYMBOL(utf8byte);
778 
779 const struct utf8data *utf8nfdi(unsigned int maxage)
780 {
781 	int i = ARRAY_SIZE(utf8nfdidata) - 1;
782 
783 	while (maxage < utf8nfdidata[i].maxage)
784 		i--;
785 	if (maxage > utf8nfdidata[i].maxage)
786 		return NULL;
787 	return &utf8nfdidata[i];
788 }
789 EXPORT_SYMBOL(utf8nfdi);
790 
791 const struct utf8data *utf8nfdicf(unsigned int maxage)
792 {
793 	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
794 
795 	while (maxage < utf8nfdicfdata[i].maxage)
796 		i--;
797 	if (maxage > utf8nfdicfdata[i].maxage)
798 		return NULL;
799 	return &utf8nfdicfdata[i];
800 }
801 EXPORT_SYMBOL(utf8nfdicf);
802