19f806850SThomas Gleixner /* SPDX-License-Identifier: GPL-2.0-only */ 244594c2fSOlaf Weber /* 344594c2fSOlaf Weber * Copyright (c) 2014 SGI. 444594c2fSOlaf Weber * All rights reserved. 544594c2fSOlaf Weber */ 644594c2fSOlaf Weber 744594c2fSOlaf Weber #ifndef UTF8NORM_H 844594c2fSOlaf Weber #define UTF8NORM_H 944594c2fSOlaf Weber 1044594c2fSOlaf Weber #include <linux/types.h> 1144594c2fSOlaf Weber #include <linux/export.h> 1244594c2fSOlaf Weber #include <linux/string.h> 1344594c2fSOlaf Weber #include <linux/module.h> 1449bd03ccSChristoph Hellwig #include <linux/unicode.h> 1544594c2fSOlaf Weber 1649bd03ccSChristoph Hellwig int utf8version_is_supported(unsigned int version); 1744594c2fSOlaf Weber 1844594c2fSOlaf Weber /* 1944594c2fSOlaf Weber * Look for the correct const struct utf8data for a unicode version. 2044594c2fSOlaf Weber * Returns NULL if the version requested is too new. 2144594c2fSOlaf Weber * 2244594c2fSOlaf Weber * Two normalization forms are supported: nfdi and nfdicf. 2344594c2fSOlaf Weber * 2444594c2fSOlaf Weber * nfdi: 2544594c2fSOlaf Weber * - Apply unicode normalization form NFD. 2644594c2fSOlaf Weber * - Remove any Default_Ignorable_Code_Point. 2744594c2fSOlaf Weber * 2844594c2fSOlaf Weber * nfdicf: 2944594c2fSOlaf Weber * - Apply unicode normalization form NFD. 3044594c2fSOlaf Weber * - Remove any Default_Ignorable_Code_Point. 3144594c2fSOlaf Weber * - Apply a full casefold (C + F). 3244594c2fSOlaf Weber */ 3344594c2fSOlaf Weber extern const struct utf8data *utf8nfdi(unsigned int maxage); 3444594c2fSOlaf Weber extern const struct utf8data *utf8nfdicf(unsigned int maxage); 3544594c2fSOlaf Weber 3644594c2fSOlaf Weber /* 3744594c2fSOlaf Weber * Determine the length of the normalized from of the string, 3844594c2fSOlaf Weber * excluding any terminating NULL byte. 3944594c2fSOlaf Weber * Returns 0 if only ignorable code points are present. 4044594c2fSOlaf Weber * Returns -1 if the input is not valid UTF-8. 4144594c2fSOlaf Weber */ 42*6ca99ce7SChristoph Hellwig ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, 43*6ca99ce7SChristoph Hellwig const char *s, size_t len); 4444594c2fSOlaf Weber 45a8384c68SOlaf Weber /* Needed in struct utf8cursor below. */ 46a8384c68SOlaf Weber #define UTF8HANGULLEAF (12) 47a8384c68SOlaf Weber 4844594c2fSOlaf Weber /* 4944594c2fSOlaf Weber * Cursor structure used by the normalizer. 5044594c2fSOlaf Weber */ 5144594c2fSOlaf Weber struct utf8cursor { 52*6ca99ce7SChristoph Hellwig const struct unicode_map *um; 53*6ca99ce7SChristoph Hellwig enum utf8_normalization n; 5444594c2fSOlaf Weber const char *s; 5544594c2fSOlaf Weber const char *p; 5644594c2fSOlaf Weber const char *ss; 5744594c2fSOlaf Weber const char *sp; 5844594c2fSOlaf Weber unsigned int len; 5944594c2fSOlaf Weber unsigned int slen; 6044594c2fSOlaf Weber short int ccc; 6144594c2fSOlaf Weber short int nccc; 62a8384c68SOlaf Weber unsigned char hangul[UTF8HANGULLEAF]; 6344594c2fSOlaf Weber }; 6444594c2fSOlaf Weber 6544594c2fSOlaf Weber /* 6644594c2fSOlaf Weber * Initialize a utf8cursor to normalize a string. 6744594c2fSOlaf Weber * Returns 0 on success. 6844594c2fSOlaf Weber * Returns -1 on failure. 6944594c2fSOlaf Weber */ 70*6ca99ce7SChristoph Hellwig int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, 71*6ca99ce7SChristoph Hellwig enum utf8_normalization n, const char *s, size_t len); 7244594c2fSOlaf Weber 7344594c2fSOlaf Weber /* 7444594c2fSOlaf Weber * Get the next byte in the normalization. 7544594c2fSOlaf Weber * Returns a value > 0 && < 256 on success. 7644594c2fSOlaf Weber * Returns 0 when the end of the normalization is reached. 7744594c2fSOlaf Weber * Returns -1 if the string being normalized is not valid UTF-8. 7844594c2fSOlaf Weber */ 7944594c2fSOlaf Weber extern int utf8byte(struct utf8cursor *u8c); 8044594c2fSOlaf Weber 8144594c2fSOlaf Weber #endif /* UTF8NORM_H */ 82