1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * charset conversion utils 4 * 5 * Copyright (c) 2017 Rob Clark 6 */ 7 8 #include <charset.h> 9 #include <capitalization.h> 10 #include <malloc.h> 11 12 static struct capitalization_table capitalization_table[] = 13 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION 14 UNICODE_CAPITALIZATION_TABLE; 15 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250 16 CP1250_CAPITALIZATION_TABLE; 17 #else 18 CP437_CAPITALIZATION_TABLE; 19 #endif 20 21 s32 utf8_get(const char **src) 22 { 23 s32 code = 0; 24 unsigned char c; 25 26 if (!src || !*src) 27 return -1; 28 if (!**src) 29 return 0; 30 c = **src; 31 if (c >= 0x80) { 32 ++*src; 33 if (!**src) 34 return -1; 35 /* 36 * We do not expect a continuation byte (0x80 - 0xbf). 37 * 0x80 is coded as 0xc2 0x80, so we cannot have less then 0xc2 38 * here. 39 * The highest code point is 0x10ffff which is coded as 40 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4. 41 */ 42 if (c < 0xc2 || code > 0xf4) 43 return -1; 44 if (c >= 0xe0) { 45 if (c >= 0xf0) { 46 /* 0xf0 - 0xf4 */ 47 c &= 0x07; 48 code = c << 18; 49 c = **src; 50 ++*src; 51 if (!**src) 52 return -1; 53 if (c < 0x80 || c > 0xbf) 54 return -1; 55 c &= 0x3f; 56 } else { 57 /* 0xe0 - 0xef */ 58 c &= 0x0f; 59 } 60 code += c << 12; 61 if ((code >= 0xD800 && code <= 0xDFFF) || 62 code >= 0x110000) 63 return -1; 64 c = **src; 65 ++*src; 66 if (!**src) 67 return -1; 68 if (c < 0x80 || c > 0xbf) 69 return -1; 70 } 71 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ 72 c &= 0x3f; 73 code += c << 6; 74 c = **src; 75 if (c < 0x80 || c > 0xbf) 76 return -1; 77 c &= 0x3f; 78 } 79 code += c; 80 ++*src; 81 return code; 82 } 83 84 int utf8_put(s32 code, char **dst) 85 { 86 if (!dst || !*dst) 87 return -1; 88 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 89 return -1; 90 if (code <= 0x007F) { 91 **dst = code; 92 } else { 93 if (code <= 0x07FF) { 94 **dst = code >> 6 | 0xC0; 95 } else { 96 if (code < 0x10000) { 97 **dst = code >> 12 | 0xE0; 98 } else { 99 **dst = code >> 18 | 0xF0; 100 ++*dst; 101 **dst = (code >> 12 & 0x3F) | 0x80; 102 } 103 ++*dst; 104 **dst = (code >> 6 & 0x3F) | 0x80; 105 } 106 ++*dst; 107 **dst = (code & 0x3F) | 0x80; 108 } 109 ++*dst; 110 return 0; 111 } 112 113 size_t utf8_utf16_strnlen(const char *src, size_t count) 114 { 115 size_t len = 0; 116 117 for (; *src && count; --count) { 118 s32 code = utf8_get(&src); 119 120 if (!code) 121 break; 122 if (code < 0) { 123 /* Reserve space for a replacement character */ 124 len += 1; 125 } else if (code < 0x10000) { 126 len += 1; 127 } else { 128 len += 2; 129 } 130 } 131 return len; 132 } 133 134 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) 135 { 136 if (!src || !dst || !*dst) 137 return -1; 138 139 for (; count && *src; --count) { 140 s32 code = utf8_get(&src); 141 142 if (code < 0) 143 code = '?'; 144 utf16_put(code, dst); 145 } 146 **dst = 0; 147 return 0; 148 } 149 150 s32 utf16_get(const u16 **src) 151 { 152 s32 code, code2; 153 154 if (!src || !*src) 155 return -1; 156 if (!**src) 157 return 0; 158 code = **src; 159 ++*src; 160 if (code >= 0xDC00 && code <= 0xDFFF) 161 return -1; 162 if (code >= 0xD800 && code <= 0xDBFF) { 163 if (!**src) 164 return -1; 165 code &= 0x3ff; 166 code <<= 10; 167 code += 0x10000; 168 code2 = **src; 169 ++*src; 170 if (code2 <= 0xDC00 || code2 >= 0xDFFF) 171 return -1; 172 code2 &= 0x3ff; 173 code += code2; 174 } 175 return code; 176 } 177 178 int utf16_put(s32 code, u16 **dst) 179 { 180 if (!dst || !*dst) 181 return -1; 182 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 183 return -1; 184 if (code < 0x10000) { 185 **dst = code; 186 } else { 187 code -= 0x10000; 188 **dst = code >> 10 | 0xD800; 189 ++*dst; 190 **dst = (code & 0x3ff) | 0xDC00; 191 } 192 ++*dst; 193 return 0; 194 } 195 196 size_t utf16_strnlen(const u16 *src, size_t count) 197 { 198 size_t len = 0; 199 200 for (; *src && count; --count) { 201 s32 code = utf16_get(&src); 202 203 if (!code) 204 break; 205 /* 206 * In case of an illegal sequence still reserve space for a 207 * replacement character. 208 */ 209 ++len; 210 } 211 return len; 212 } 213 214 size_t utf16_utf8_strnlen(const u16 *src, size_t count) 215 { 216 size_t len = 0; 217 218 for (; *src && count; --count) { 219 s32 code = utf16_get(&src); 220 221 if (!code) 222 break; 223 if (code < 0) 224 /* Reserve space for a replacement character */ 225 len += 1; 226 else if (code < 0x80) 227 len += 1; 228 else if (code < 0x800) 229 len += 2; 230 else if (code < 0x10000) 231 len += 3; 232 else 233 len += 4; 234 } 235 return len; 236 } 237 238 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) 239 { 240 if (!src || !dst || !*dst) 241 return -1; 242 243 for (; count && *src; --count) { 244 s32 code = utf16_get(&src); 245 246 if (code < 0) 247 code = '?'; 248 utf8_put(code, dst); 249 } 250 **dst = 0; 251 return 0; 252 } 253 254 s32 utf_to_lower(const s32 code) 255 { 256 struct capitalization_table *pos = capitalization_table; 257 s32 ret = code; 258 259 if (code <= 0x7f) { 260 if (code >= 'A' && code <= 'Z') 261 ret += 0x20; 262 return ret; 263 } 264 for (; pos->upper; ++pos) { 265 if (pos->upper == code) { 266 ret = pos->lower; 267 break; 268 } 269 } 270 return ret; 271 } 272 273 s32 utf_to_upper(const s32 code) 274 { 275 struct capitalization_table *pos = capitalization_table; 276 s32 ret = code; 277 278 if (code <= 0x7f) { 279 if (code >= 'a' && code <= 'z') 280 ret -= 0x20; 281 return ret; 282 } 283 for (; pos->lower; ++pos) { 284 if (pos->lower == code) { 285 ret = pos->upper; 286 break; 287 } 288 } 289 return ret; 290 } 291 292 size_t u16_strlen(const u16 *in) 293 { 294 size_t i; 295 for (i = 0; in[i]; i++); 296 return i; 297 } 298 299 size_t u16_strnlen(const u16 *in, size_t count) 300 { 301 size_t i; 302 for (i = 0; count-- && in[i]; i++); 303 return i; 304 } 305 306 /* Convert UTF-16 to UTF-8. */ 307 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size) 308 { 309 uint32_t code_high = 0; 310 311 while (size--) { 312 uint32_t code = *src++; 313 314 if (code_high) { 315 if (code >= 0xDC00 && code <= 0xDFFF) { 316 /* Surrogate pair. */ 317 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000; 318 319 *dest++ = (code >> 18) | 0xF0; 320 *dest++ = ((code >> 12) & 0x3F) | 0x80; 321 *dest++ = ((code >> 6) & 0x3F) | 0x80; 322 *dest++ = (code & 0x3F) | 0x80; 323 } else { 324 /* Error... */ 325 *dest++ = '?'; 326 /* *src may be valid. Don't eat it. */ 327 src--; 328 } 329 330 code_high = 0; 331 } else { 332 if (code <= 0x007F) { 333 *dest++ = code; 334 } else if (code <= 0x07FF) { 335 *dest++ = (code >> 6) | 0xC0; 336 *dest++ = (code & 0x3F) | 0x80; 337 } else if (code >= 0xD800 && code <= 0xDBFF) { 338 code_high = code; 339 continue; 340 } else if (code >= 0xDC00 && code <= 0xDFFF) { 341 /* Error... */ 342 *dest++ = '?'; 343 } else if (code < 0x10000) { 344 *dest++ = (code >> 12) | 0xE0; 345 *dest++ = ((code >> 6) & 0x3F) | 0x80; 346 *dest++ = (code & 0x3F) | 0x80; 347 } else { 348 *dest++ = (code >> 18) | 0xF0; 349 *dest++ = ((code >> 12) & 0x3F) | 0x80; 350 *dest++ = ((code >> 6) & 0x3F) | 0x80; 351 *dest++ = (code & 0x3F) | 0x80; 352 } 353 } 354 } 355 356 return dest; 357 } 358