1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * charset conversion utils 4 * 5 * Copyright (c) 2017 Rob Clark 6 */ 7 8 #include <common.h> 9 #include <charset.h> 10 #include <capitalization.h> 11 #include <malloc.h> 12 13 static struct capitalization_table capitalization_table[] = 14 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION 15 UNICODE_CAPITALIZATION_TABLE; 16 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250 17 CP1250_CAPITALIZATION_TABLE; 18 #else 19 CP437_CAPITALIZATION_TABLE; 20 #endif 21 22 /** 23 * get_code() - read Unicode code point from UTF-8 stream 24 * 25 * @read_u8: - stream reader 26 * @src: - string buffer passed to stream reader, optional 27 * Return: - Unicode code point 28 */ 29 static int get_code(u8 (*read_u8)(void *data), void *data) 30 { 31 s32 ch = 0; 32 33 ch = read_u8(data); 34 if (!ch) 35 return 0; 36 if (ch >= 0xc2 && ch <= 0xf4) { 37 int code = 0; 38 39 if (ch >= 0xe0) { 40 if (ch >= 0xf0) { 41 /* 0xf0 - 0xf4 */ 42 ch &= 0x07; 43 code = ch << 18; 44 ch = read_u8(data); 45 if (ch < 0x80 || ch > 0xbf) 46 goto error; 47 ch &= 0x3f; 48 } else { 49 /* 0xe0 - 0xef */ 50 ch &= 0x0f; 51 } 52 code += ch << 12; 53 if ((code >= 0xD800 && code <= 0xDFFF) || 54 code >= 0x110000) 55 goto error; 56 ch = read_u8(data); 57 if (ch < 0x80 || ch > 0xbf) 58 goto error; 59 } 60 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ 61 ch &= 0x3f; 62 code += ch << 6; 63 ch = read_u8(data); 64 if (ch < 0x80 || ch > 0xbf) 65 goto error; 66 ch &= 0x3f; 67 ch += code; 68 } else if (ch >= 0x80) { 69 goto error; 70 } 71 return ch; 72 error: 73 return '?'; 74 } 75 76 /** 77 * read_string() - read byte from character string 78 * 79 * @data: - pointer to string 80 * Return: - byte read 81 * 82 * The string pointer is incremented if it does not point to '\0'. 83 */ 84 static u8 read_string(void *data) 85 86 { 87 const char **src = (const char **)data; 88 u8 c; 89 90 if (!src || !*src || !**src) 91 return 0; 92 c = **src; 93 ++*src; 94 return c; 95 } 96 97 /** 98 * read_console() - read byte from console 99 * 100 * @src - not used, needed to match interface 101 * Return: - byte read 102 */ 103 static u8 read_console(void *data) 104 { 105 return getc(); 106 } 107 108 int console_read_unicode(s32 *code) 109 { 110 if (!tstc()) { 111 /* No input available */ 112 return 1; 113 } 114 115 /* Read Unicode code */ 116 *code = get_code(read_console, NULL); 117 return 0; 118 } 119 120 s32 utf8_get(const char **src) 121 { 122 return get_code(read_string, src); 123 } 124 125 int utf8_put(s32 code, char **dst) 126 { 127 if (!dst || !*dst) 128 return -1; 129 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 130 return -1; 131 if (code <= 0x007F) { 132 **dst = code; 133 } else { 134 if (code <= 0x07FF) { 135 **dst = code >> 6 | 0xC0; 136 } else { 137 if (code < 0x10000) { 138 **dst = code >> 12 | 0xE0; 139 } else { 140 **dst = code >> 18 | 0xF0; 141 ++*dst; 142 **dst = (code >> 12 & 0x3F) | 0x80; 143 } 144 ++*dst; 145 **dst = (code >> 6 & 0x3F) | 0x80; 146 } 147 ++*dst; 148 **dst = (code & 0x3F) | 0x80; 149 } 150 ++*dst; 151 return 0; 152 } 153 154 size_t utf8_utf16_strnlen(const char *src, size_t count) 155 { 156 size_t len = 0; 157 158 for (; *src && count; --count) { 159 s32 code = utf8_get(&src); 160 161 if (!code) 162 break; 163 if (code < 0) { 164 /* Reserve space for a replacement character */ 165 len += 1; 166 } else if (code < 0x10000) { 167 len += 1; 168 } else { 169 len += 2; 170 } 171 } 172 return len; 173 } 174 175 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) 176 { 177 if (!src || !dst || !*dst) 178 return -1; 179 180 for (; count && *src; --count) { 181 s32 code = utf8_get(&src); 182 183 if (code < 0) 184 code = '?'; 185 utf16_put(code, dst); 186 } 187 **dst = 0; 188 return 0; 189 } 190 191 s32 utf16_get(const u16 **src) 192 { 193 s32 code, code2; 194 195 if (!src || !*src) 196 return -1; 197 if (!**src) 198 return 0; 199 code = **src; 200 ++*src; 201 if (code >= 0xDC00 && code <= 0xDFFF) 202 return -1; 203 if (code >= 0xD800 && code <= 0xDBFF) { 204 if (!**src) 205 return -1; 206 code &= 0x3ff; 207 code <<= 10; 208 code += 0x10000; 209 code2 = **src; 210 ++*src; 211 if (code2 <= 0xDC00 || code2 >= 0xDFFF) 212 return -1; 213 code2 &= 0x3ff; 214 code += code2; 215 } 216 return code; 217 } 218 219 int utf16_put(s32 code, u16 **dst) 220 { 221 if (!dst || !*dst) 222 return -1; 223 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 224 return -1; 225 if (code < 0x10000) { 226 **dst = code; 227 } else { 228 code -= 0x10000; 229 **dst = code >> 10 | 0xD800; 230 ++*dst; 231 **dst = (code & 0x3ff) | 0xDC00; 232 } 233 ++*dst; 234 return 0; 235 } 236 237 size_t utf16_strnlen(const u16 *src, size_t count) 238 { 239 size_t len = 0; 240 241 for (; *src && count; --count) { 242 s32 code = utf16_get(&src); 243 244 if (!code) 245 break; 246 /* 247 * In case of an illegal sequence still reserve space for a 248 * replacement character. 249 */ 250 ++len; 251 } 252 return len; 253 } 254 255 size_t utf16_utf8_strnlen(const u16 *src, size_t count) 256 { 257 size_t len = 0; 258 259 for (; *src && count; --count) { 260 s32 code = utf16_get(&src); 261 262 if (!code) 263 break; 264 if (code < 0) 265 /* Reserve space for a replacement character */ 266 len += 1; 267 else if (code < 0x80) 268 len += 1; 269 else if (code < 0x800) 270 len += 2; 271 else if (code < 0x10000) 272 len += 3; 273 else 274 len += 4; 275 } 276 return len; 277 } 278 279 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) 280 { 281 if (!src || !dst || !*dst) 282 return -1; 283 284 for (; count && *src; --count) { 285 s32 code = utf16_get(&src); 286 287 if (code < 0) 288 code = '?'; 289 utf8_put(code, dst); 290 } 291 **dst = 0; 292 return 0; 293 } 294 295 s32 utf_to_lower(const s32 code) 296 { 297 struct capitalization_table *pos = capitalization_table; 298 s32 ret = code; 299 300 if (code <= 0x7f) { 301 if (code >= 'A' && code <= 'Z') 302 ret += 0x20; 303 return ret; 304 } 305 for (; pos->upper; ++pos) { 306 if (pos->upper == code) { 307 ret = pos->lower; 308 break; 309 } 310 } 311 return ret; 312 } 313 314 s32 utf_to_upper(const s32 code) 315 { 316 struct capitalization_table *pos = capitalization_table; 317 s32 ret = code; 318 319 if (code <= 0x7f) { 320 if (code >= 'a' && code <= 'z') 321 ret -= 0x20; 322 return ret; 323 } 324 for (; pos->lower; ++pos) { 325 if (pos->lower == code) { 326 ret = pos->upper; 327 break; 328 } 329 } 330 return ret; 331 } 332 333 size_t u16_strlen(const u16 *in) 334 { 335 size_t i; 336 for (i = 0; in[i]; i++); 337 return i; 338 } 339 340 size_t u16_strnlen(const u16 *in, size_t count) 341 { 342 size_t i; 343 for (i = 0; count-- && in[i]; i++); 344 return i; 345 } 346 347 /* Convert UTF-16 to UTF-8. */ 348 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size) 349 { 350 uint32_t code_high = 0; 351 352 while (size--) { 353 uint32_t code = *src++; 354 355 if (code_high) { 356 if (code >= 0xDC00 && code <= 0xDFFF) { 357 /* Surrogate pair. */ 358 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000; 359 360 *dest++ = (code >> 18) | 0xF0; 361 *dest++ = ((code >> 12) & 0x3F) | 0x80; 362 *dest++ = ((code >> 6) & 0x3F) | 0x80; 363 *dest++ = (code & 0x3F) | 0x80; 364 } else { 365 /* Error... */ 366 *dest++ = '?'; 367 /* *src may be valid. Don't eat it. */ 368 src--; 369 } 370 371 code_high = 0; 372 } else { 373 if (code <= 0x007F) { 374 *dest++ = code; 375 } else if (code <= 0x07FF) { 376 *dest++ = (code >> 6) | 0xC0; 377 *dest++ = (code & 0x3F) | 0x80; 378 } else if (code >= 0xD800 && code <= 0xDBFF) { 379 code_high = code; 380 continue; 381 } else if (code >= 0xDC00 && code <= 0xDFFF) { 382 /* Error... */ 383 *dest++ = '?'; 384 } else if (code < 0x10000) { 385 *dest++ = (code >> 12) | 0xE0; 386 *dest++ = ((code >> 6) & 0x3F) | 0x80; 387 *dest++ = (code & 0x3F) | 0x80; 388 } else { 389 *dest++ = (code >> 18) | 0xF0; 390 *dest++ = ((code >> 12) & 0x3F) | 0x80; 391 *dest++ = ((code >> 6) & 0x3F) | 0x80; 392 *dest++ = (code & 0x3F) | 0x80; 393 } 394 } 395 } 396 397 return dest; 398 } 399