1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * charset conversion utils 4 * 5 * Copyright (c) 2017 Rob Clark 6 */ 7 8 #include <charset.h> 9 #include <malloc.h> 10 11 s32 utf8_get(const char **src) 12 { 13 s32 code = 0; 14 unsigned char c; 15 16 if (!src || !*src) 17 return -1; 18 if (!**src) 19 return 0; 20 c = **src; 21 if (c >= 0x80) { 22 ++*src; 23 if (!**src) 24 return -1; 25 /* 26 * We do not expect a continuation byte (0x80 - 0xbf). 27 * 0x80 is coded as 0xc2 0x80, so we cannot have less then 0xc2 28 * here. 29 * The highest code point is 0x10ffff which is coded as 30 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4. 31 */ 32 if (c < 0xc2 || code > 0xf4) 33 return -1; 34 if (c >= 0xe0) { 35 if (c >= 0xf0) { 36 /* 0xf0 - 0xf4 */ 37 c &= 0x07; 38 code = c << 18; 39 c = **src; 40 ++*src; 41 if (!**src) 42 return -1; 43 if (c < 0x80 || c > 0xbf) 44 return -1; 45 c &= 0x3f; 46 } else { 47 /* 0xe0 - 0xef */ 48 c &= 0x0f; 49 } 50 code += c << 12; 51 if ((code >= 0xD800 && code <= 0xDFFF) || 52 code >= 0x110000) 53 return -1; 54 c = **src; 55 ++*src; 56 if (!**src) 57 return -1; 58 if (c < 0x80 || c > 0xbf) 59 return -1; 60 } 61 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ 62 c &= 0x3f; 63 code += c << 6; 64 c = **src; 65 if (c < 0x80 || c > 0xbf) 66 return -1; 67 c &= 0x3f; 68 } 69 code += c; 70 ++*src; 71 return code; 72 } 73 74 int utf8_put(s32 code, char **dst) 75 { 76 if (!dst || !*dst) 77 return -1; 78 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 79 return -1; 80 if (code <= 0x007F) { 81 **dst = code; 82 } else { 83 if (code <= 0x07FF) { 84 **dst = code >> 6 | 0xC0; 85 } else { 86 if (code < 0x10000) { 87 **dst = code >> 12 | 0xE0; 88 } else { 89 **dst = code >> 18 | 0xF0; 90 ++*dst; 91 **dst = (code >> 12 & 0x3F) | 0x80; 92 } 93 ++*dst; 94 **dst = (code >> 6 & 0x3F) | 0x80; 95 } 96 ++*dst; 97 **dst = (code & 0x3F) | 0x80; 98 } 99 ++*dst; 100 return 0; 101 } 102 103 size_t utf8_utf16_strnlen(const char *src, size_t count) 104 { 105 size_t len = 0; 106 107 for (; *src && count; --count) { 108 s32 code = utf8_get(&src); 109 110 if (!code) 111 break; 112 if (code < 0) { 113 /* Reserve space for a replacement character */ 114 len += 1; 115 } else if (code < 0x10000) { 116 len += 1; 117 } else { 118 len += 2; 119 } 120 } 121 return len; 122 } 123 124 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) 125 { 126 if (!src || !dst || !*dst) 127 return -1; 128 129 for (; count && *src; --count) { 130 s32 code = utf8_get(&src); 131 132 if (code < 0) 133 code = '?'; 134 utf16_put(code, dst); 135 } 136 **dst = 0; 137 return 0; 138 } 139 140 s32 utf16_get(const u16 **src) 141 { 142 s32 code, code2; 143 144 if (!src || !*src) 145 return -1; 146 if (!**src) 147 return 0; 148 code = **src; 149 ++*src; 150 if (code >= 0xDC00 && code <= 0xDFFF) 151 return -1; 152 if (code >= 0xD800 && code <= 0xDBFF) { 153 if (!**src) 154 return -1; 155 code &= 0x3ff; 156 code <<= 10; 157 code += 0x10000; 158 code2 = **src; 159 ++*src; 160 if (code2 <= 0xDC00 || code2 >= 0xDFFF) 161 return -1; 162 code2 &= 0x3ff; 163 code += code2; 164 } 165 return code; 166 } 167 168 int utf16_put(s32 code, u16 **dst) 169 { 170 if (!dst || !*dst) 171 return -1; 172 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 173 return -1; 174 if (code < 0x10000) { 175 **dst = code; 176 } else { 177 code -= 0x10000; 178 **dst = code >> 10 | 0xD800; 179 ++*dst; 180 **dst = (code & 0x3ff) | 0xDC00; 181 } 182 ++*dst; 183 return 0; 184 } 185 186 size_t utf16_strnlen(const u16 *src, size_t count) 187 { 188 size_t len = 0; 189 190 for (; *src && count; --count) { 191 s32 code = utf16_get(&src); 192 193 if (!code) 194 break; 195 /* 196 * In case of an illegal sequence still reserve space for a 197 * replacement character. 198 */ 199 ++len; 200 } 201 return len; 202 } 203 204 size_t utf16_utf8_strnlen(const u16 *src, size_t count) 205 { 206 size_t len = 0; 207 208 for (; *src && count; --count) { 209 s32 code = utf16_get(&src); 210 211 if (!code) 212 break; 213 if (code < 0) 214 /* Reserve space for a replacement character */ 215 len += 1; 216 else if (code < 0x80) 217 len += 1; 218 else if (code < 0x800) 219 len += 2; 220 else if (code < 0x10000) 221 len += 3; 222 else 223 len += 4; 224 } 225 return len; 226 } 227 228 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) 229 { 230 if (!src || !dst || !*dst) 231 return -1; 232 233 for (; count && *src; --count) { 234 s32 code = utf16_get(&src); 235 236 if (code < 0) 237 code = '?'; 238 utf8_put(code, dst); 239 } 240 **dst = 0; 241 return 0; 242 } 243 244 245 size_t u16_strlen(const u16 *in) 246 { 247 size_t i; 248 for (i = 0; in[i]; i++); 249 return i; 250 } 251 252 size_t u16_strnlen(const u16 *in, size_t count) 253 { 254 size_t i; 255 for (i = 0; count-- && in[i]; i++); 256 return i; 257 } 258 259 uint16_t *utf16_strcpy(uint16_t *dest, const uint16_t *src) 260 { 261 uint16_t *tmp = dest; 262 263 while ((*dest++ = *src++) != '\0') 264 /* nothing */; 265 return tmp; 266 267 } 268 269 uint16_t *utf16_strdup(const uint16_t *s) 270 { 271 uint16_t *new; 272 273 if (!s) 274 return NULL; 275 new = malloc((u16_strlen(s) + 1) * 2); 276 if (!new) 277 return NULL; 278 utf16_strcpy(new, s); 279 return new; 280 } 281 282 /* Convert UTF-16 to UTF-8. */ 283 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size) 284 { 285 uint32_t code_high = 0; 286 287 while (size--) { 288 uint32_t code = *src++; 289 290 if (code_high) { 291 if (code >= 0xDC00 && code <= 0xDFFF) { 292 /* Surrogate pair. */ 293 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000; 294 295 *dest++ = (code >> 18) | 0xF0; 296 *dest++ = ((code >> 12) & 0x3F) | 0x80; 297 *dest++ = ((code >> 6) & 0x3F) | 0x80; 298 *dest++ = (code & 0x3F) | 0x80; 299 } else { 300 /* Error... */ 301 *dest++ = '?'; 302 /* *src may be valid. Don't eat it. */ 303 src--; 304 } 305 306 code_high = 0; 307 } else { 308 if (code <= 0x007F) { 309 *dest++ = code; 310 } else if (code <= 0x07FF) { 311 *dest++ = (code >> 6) | 0xC0; 312 *dest++ = (code & 0x3F) | 0x80; 313 } else if (code >= 0xD800 && code <= 0xDBFF) { 314 code_high = code; 315 continue; 316 } else if (code >= 0xDC00 && code <= 0xDFFF) { 317 /* Error... */ 318 *dest++ = '?'; 319 } else if (code < 0x10000) { 320 *dest++ = (code >> 12) | 0xE0; 321 *dest++ = ((code >> 6) & 0x3F) | 0x80; 322 *dest++ = (code & 0x3F) | 0x80; 323 } else { 324 *dest++ = (code >> 18) | 0xF0; 325 *dest++ = ((code >> 12) & 0x3F) | 0x80; 326 *dest++ = ((code >> 6) & 0x3F) | 0x80; 327 *dest++ = (code & 0x3F) | 0x80; 328 } 329 } 330 } 331 332 return dest; 333 } 334 335 uint16_t *utf8_to_utf16(uint16_t *dest, const uint8_t *src, size_t size) 336 { 337 while (size--) { 338 int extension_bytes; 339 uint32_t code; 340 341 extension_bytes = 0; 342 if (*src <= 0x7f) { 343 code = *src++; 344 /* Exit on zero byte */ 345 if (!code) 346 size = 0; 347 } else if (*src <= 0xbf) { 348 /* Illegal code */ 349 code = '?'; 350 } else if (*src <= 0xdf) { 351 code = *src++ & 0x1f; 352 extension_bytes = 1; 353 } else if (*src <= 0xef) { 354 code = *src++ & 0x0f; 355 extension_bytes = 2; 356 } else if (*src <= 0xf7) { 357 code = *src++ & 0x07; 358 extension_bytes = 3; 359 } else { 360 /* Illegal code */ 361 code = '?'; 362 } 363 364 for (; extension_bytes && size; --size, --extension_bytes) { 365 if ((*src & 0xc0) == 0x80) { 366 code <<= 6; 367 code |= *src++ & 0x3f; 368 } else { 369 /* Illegal code */ 370 code = '?'; 371 ++src; 372 --size; 373 break; 374 } 375 } 376 377 if (code < 0x10000) { 378 *dest++ = code; 379 } else { 380 /* 381 * Simplified expression for 382 * (((code - 0x10000) >> 10) & 0x3ff) | 0xd800 383 */ 384 *dest++ = (code >> 10) + 0xd7c0; 385 *dest++ = (code & 0x3ff) | 0xdc00; 386 } 387 } 388 return dest; 389 } 390