1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * charset conversion utils 4 * 5 * Copyright (c) 2017 Rob Clark 6 */ 7 8 #include <common.h> 9 #include <charset.h> 10 #include <capitalization.h> 11 #include <malloc.h> 12 13 static struct capitalization_table capitalization_table[] = 14 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION 15 UNICODE_CAPITALIZATION_TABLE; 16 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250 17 CP1250_CAPITALIZATION_TABLE; 18 #else 19 CP437_CAPITALIZATION_TABLE; 20 #endif 21 22 /** 23 * get_code() - read Unicode code point from UTF-8 stream 24 * 25 * @read_u8: - stream reader 26 * @src: - string buffer passed to stream reader, optional 27 * Return: - Unicode code point 28 */ 29 static int get_code(u8 (*read_u8)(void *data), void *data) 30 { 31 s32 ch = 0; 32 33 ch = read_u8(data); 34 if (!ch) 35 return 0; 36 if (ch >= 0xc2 && ch <= 0xf4) { 37 int code = 0; 38 39 if (ch >= 0xe0) { 40 if (ch >= 0xf0) { 41 /* 0xf0 - 0xf4 */ 42 ch &= 0x07; 43 code = ch << 18; 44 ch = read_u8(data); 45 if (ch < 0x80 || ch > 0xbf) 46 goto error; 47 ch &= 0x3f; 48 } else { 49 /* 0xe0 - 0xef */ 50 ch &= 0x0f; 51 } 52 code += ch << 12; 53 if ((code >= 0xD800 && code <= 0xDFFF) || 54 code >= 0x110000) 55 goto error; 56 ch = read_u8(data); 57 if (ch < 0x80 || ch > 0xbf) 58 goto error; 59 } 60 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ 61 ch &= 0x3f; 62 code += ch << 6; 63 ch = read_u8(data); 64 if (ch < 0x80 || ch > 0xbf) 65 goto error; 66 ch &= 0x3f; 67 ch += code; 68 } else if (ch >= 0x80) { 69 goto error; 70 } 71 return ch; 72 error: 73 return '?'; 74 } 75 76 /** 77 * read_string() - read byte from character string 78 * 79 * @data: - pointer to string 80 * Return: - byte read 81 * 82 * The string pointer is incremented if it does not point to '\0'. 83 */ 84 static u8 read_string(void *data) 85 86 { 87 const char **src = (const char **)data; 88 u8 c; 89 90 if (!src || !*src || !**src) 91 return 0; 92 c = **src; 93 ++*src; 94 return c; 95 } 96 97 /** 98 * read_console() - read byte from console 99 * 100 * @data - not used, needed to match interface 101 * Return: - byte read or 0 on error 102 */ 103 static u8 read_console(void *data) 104 { 105 int ch; 106 107 ch = getc(); 108 if (ch < 0) 109 ch = 0; 110 return ch; 111 } 112 113 int console_read_unicode(s32 *code) 114 { 115 if (!tstc()) { 116 /* No input available */ 117 return 1; 118 } 119 120 /* Read Unicode code */ 121 *code = get_code(read_console, NULL); 122 return 0; 123 } 124 125 s32 utf8_get(const char **src) 126 { 127 return get_code(read_string, src); 128 } 129 130 int utf8_put(s32 code, char **dst) 131 { 132 if (!dst || !*dst) 133 return -1; 134 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 135 return -1; 136 if (code <= 0x007F) { 137 **dst = code; 138 } else { 139 if (code <= 0x07FF) { 140 **dst = code >> 6 | 0xC0; 141 } else { 142 if (code < 0x10000) { 143 **dst = code >> 12 | 0xE0; 144 } else { 145 **dst = code >> 18 | 0xF0; 146 ++*dst; 147 **dst = (code >> 12 & 0x3F) | 0x80; 148 } 149 ++*dst; 150 **dst = (code >> 6 & 0x3F) | 0x80; 151 } 152 ++*dst; 153 **dst = (code & 0x3F) | 0x80; 154 } 155 ++*dst; 156 return 0; 157 } 158 159 size_t utf8_utf16_strnlen(const char *src, size_t count) 160 { 161 size_t len = 0; 162 163 for (; *src && count; --count) { 164 s32 code = utf8_get(&src); 165 166 if (!code) 167 break; 168 if (code < 0) { 169 /* Reserve space for a replacement character */ 170 len += 1; 171 } else if (code < 0x10000) { 172 len += 1; 173 } else { 174 len += 2; 175 } 176 } 177 return len; 178 } 179 180 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) 181 { 182 if (!src || !dst || !*dst) 183 return -1; 184 185 for (; count && *src; --count) { 186 s32 code = utf8_get(&src); 187 188 if (code < 0) 189 code = '?'; 190 utf16_put(code, dst); 191 } 192 **dst = 0; 193 return 0; 194 } 195 196 s32 utf16_get(const u16 **src) 197 { 198 s32 code, code2; 199 200 if (!src || !*src) 201 return -1; 202 if (!**src) 203 return 0; 204 code = **src; 205 ++*src; 206 if (code >= 0xDC00 && code <= 0xDFFF) 207 return -1; 208 if (code >= 0xD800 && code <= 0xDBFF) { 209 if (!**src) 210 return -1; 211 code &= 0x3ff; 212 code <<= 10; 213 code += 0x10000; 214 code2 = **src; 215 ++*src; 216 if (code2 <= 0xDC00 || code2 >= 0xDFFF) 217 return -1; 218 code2 &= 0x3ff; 219 code += code2; 220 } 221 return code; 222 } 223 224 int utf16_put(s32 code, u16 **dst) 225 { 226 if (!dst || !*dst) 227 return -1; 228 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 229 return -1; 230 if (code < 0x10000) { 231 **dst = code; 232 } else { 233 code -= 0x10000; 234 **dst = code >> 10 | 0xD800; 235 ++*dst; 236 **dst = (code & 0x3ff) | 0xDC00; 237 } 238 ++*dst; 239 return 0; 240 } 241 242 size_t utf16_strnlen(const u16 *src, size_t count) 243 { 244 size_t len = 0; 245 246 for (; *src && count; --count) { 247 s32 code = utf16_get(&src); 248 249 if (!code) 250 break; 251 /* 252 * In case of an illegal sequence still reserve space for a 253 * replacement character. 254 */ 255 ++len; 256 } 257 return len; 258 } 259 260 size_t utf16_utf8_strnlen(const u16 *src, size_t count) 261 { 262 size_t len = 0; 263 264 for (; *src && count; --count) { 265 s32 code = utf16_get(&src); 266 267 if (!code) 268 break; 269 if (code < 0) 270 /* Reserve space for a replacement character */ 271 len += 1; 272 else if (code < 0x80) 273 len += 1; 274 else if (code < 0x800) 275 len += 2; 276 else if (code < 0x10000) 277 len += 3; 278 else 279 len += 4; 280 } 281 return len; 282 } 283 284 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) 285 { 286 if (!src || !dst || !*dst) 287 return -1; 288 289 for (; count && *src; --count) { 290 s32 code = utf16_get(&src); 291 292 if (code < 0) 293 code = '?'; 294 utf8_put(code, dst); 295 } 296 **dst = 0; 297 return 0; 298 } 299 300 s32 utf_to_lower(const s32 code) 301 { 302 struct capitalization_table *pos = capitalization_table; 303 s32 ret = code; 304 305 if (code <= 0x7f) { 306 if (code >= 'A' && code <= 'Z') 307 ret += 0x20; 308 return ret; 309 } 310 for (; pos->upper; ++pos) { 311 if (pos->upper == code) { 312 ret = pos->lower; 313 break; 314 } 315 } 316 return ret; 317 } 318 319 s32 utf_to_upper(const s32 code) 320 { 321 struct capitalization_table *pos = capitalization_table; 322 s32 ret = code; 323 324 if (code <= 0x7f) { 325 if (code >= 'a' && code <= 'z') 326 ret -= 0x20; 327 return ret; 328 } 329 for (; pos->lower; ++pos) { 330 if (pos->lower == code) { 331 ret = pos->upper; 332 break; 333 } 334 } 335 return ret; 336 } 337 338 size_t u16_strlen(const u16 *in) 339 { 340 size_t i; 341 for (i = 0; in[i]; i++); 342 return i; 343 } 344 345 size_t u16_strnlen(const u16 *in, size_t count) 346 { 347 size_t i; 348 for (i = 0; count-- && in[i]; i++); 349 return i; 350 } 351 352 /* Convert UTF-16 to UTF-8. */ 353 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size) 354 { 355 uint32_t code_high = 0; 356 357 while (size--) { 358 uint32_t code = *src++; 359 360 if (code_high) { 361 if (code >= 0xDC00 && code <= 0xDFFF) { 362 /* Surrogate pair. */ 363 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000; 364 365 *dest++ = (code >> 18) | 0xF0; 366 *dest++ = ((code >> 12) & 0x3F) | 0x80; 367 *dest++ = ((code >> 6) & 0x3F) | 0x80; 368 *dest++ = (code & 0x3F) | 0x80; 369 } else { 370 /* Error... */ 371 *dest++ = '?'; 372 /* *src may be valid. Don't eat it. */ 373 src--; 374 } 375 376 code_high = 0; 377 } else { 378 if (code <= 0x007F) { 379 *dest++ = code; 380 } else if (code <= 0x07FF) { 381 *dest++ = (code >> 6) | 0xC0; 382 *dest++ = (code & 0x3F) | 0x80; 383 } else if (code >= 0xD800 && code <= 0xDBFF) { 384 code_high = code; 385 continue; 386 } else if (code >= 0xDC00 && code <= 0xDFFF) { 387 /* Error... */ 388 *dest++ = '?'; 389 } else if (code < 0x10000) { 390 *dest++ = (code >> 12) | 0xE0; 391 *dest++ = ((code >> 6) & 0x3F) | 0x80; 392 *dest++ = (code & 0x3F) | 0x80; 393 } else { 394 *dest++ = (code >> 18) | 0xF0; 395 *dest++ = ((code >> 12) & 0x3F) | 0x80; 396 *dest++ = ((code >> 6) & 0x3F) | 0x80; 397 *dest++ = (code & 0x3F) | 0x80; 398 } 399 } 400 } 401 402 return dest; 403 } 404