1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * charset conversion utils 4 * 5 * Copyright (c) 2017 Rob Clark 6 */ 7 8 #include <charset.h> 9 #include <malloc.h> 10 11 s32 utf8_get(const char **src) 12 { 13 s32 code = 0; 14 unsigned char c; 15 16 if (!src || !*src) 17 return -1; 18 if (!**src) 19 return 0; 20 c = **src; 21 if (c >= 0x80) { 22 ++*src; 23 if (!**src) 24 return -1; 25 /* 26 * We do not expect a continuation byte (0x80 - 0xbf). 27 * 0x80 is coded as 0xc2 0x80, so we cannot have less then 0xc2 28 * here. 29 * The highest code point is 0x10ffff which is coded as 30 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4. 31 */ 32 if (c < 0xc2 || code > 0xf4) 33 return -1; 34 if (c >= 0xe0) { 35 if (c >= 0xf0) { 36 /* 0xf0 - 0xf4 */ 37 c &= 0x07; 38 code = c << 18; 39 c = **src; 40 ++*src; 41 if (!**src) 42 return -1; 43 if (c < 0x80 || c > 0xbf) 44 return -1; 45 c &= 0x3f; 46 } else { 47 /* 0xe0 - 0xef */ 48 c &= 0x0f; 49 } 50 code += c << 12; 51 if ((code >= 0xD800 && code <= 0xDFFF) || 52 code >= 0x110000) 53 return -1; 54 c = **src; 55 ++*src; 56 if (!**src) 57 return -1; 58 if (c < 0x80 || c > 0xbf) 59 return -1; 60 } 61 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ 62 c &= 0x3f; 63 code += c << 6; 64 c = **src; 65 if (c < 0x80 || c > 0xbf) 66 return -1; 67 c &= 0x3f; 68 } 69 code += c; 70 ++*src; 71 return code; 72 } 73 74 int utf8_put(s32 code, char **dst) 75 { 76 if (!dst || !*dst) 77 return -1; 78 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 79 return -1; 80 if (code <= 0x007F) { 81 **dst = code; 82 } else { 83 if (code <= 0x07FF) { 84 **dst = code >> 6 | 0xC0; 85 } else { 86 if (code < 0x10000) { 87 **dst = code >> 12 | 0xE0; 88 } else { 89 **dst = code >> 18 | 0xF0; 90 ++*dst; 91 **dst = (code >> 12 & 0x3F) | 0x80; 92 } 93 ++*dst; 94 **dst = (code >> 6 & 0x3F) | 0x80; 95 } 96 ++*dst; 97 **dst = (code & 0x3F) | 0x80; 98 } 99 ++*dst; 100 return 0; 101 } 102 103 size_t utf8_utf16_strnlen(const char *src, size_t count) 104 { 105 size_t len = 0; 106 107 for (; *src && count; --count) { 108 s32 code = utf8_get(&src); 109 110 if (!code) 111 break; 112 if (code < 0) { 113 /* Reserve space for a replacement character */ 114 len += 1; 115 } else if (code < 0x10000) { 116 len += 1; 117 } else { 118 len += 2; 119 } 120 } 121 return len; 122 } 123 124 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) 125 { 126 if (!src || !dst || !*dst) 127 return -1; 128 129 for (; count && *src; --count) { 130 s32 code = utf8_get(&src); 131 132 if (code < 0) 133 code = '?'; 134 utf16_put(code, dst); 135 } 136 **dst = 0; 137 return 0; 138 } 139 140 s32 utf16_get(const u16 **src) 141 { 142 s32 code, code2; 143 144 if (!src || !*src) 145 return -1; 146 if (!**src) 147 return 0; 148 code = **src; 149 ++*src; 150 if (code >= 0xDC00 && code <= 0xDFFF) 151 return -1; 152 if (code >= 0xD800 && code <= 0xDBFF) { 153 if (!**src) 154 return -1; 155 code &= 0x3ff; 156 code <<= 10; 157 code += 0x10000; 158 code2 = **src; 159 ++*src; 160 if (code2 <= 0xDC00 || code2 >= 0xDFFF) 161 return -1; 162 code2 &= 0x3ff; 163 code += code2; 164 } 165 return code; 166 } 167 168 int utf16_put(s32 code, u16 **dst) 169 { 170 if (!dst || !*dst) 171 return -1; 172 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) 173 return -1; 174 if (code < 0x10000) { 175 **dst = code; 176 } else { 177 code -= 0x10000; 178 **dst = code >> 10 | 0xD800; 179 ++*dst; 180 **dst = (code & 0x3ff) | 0xDC00; 181 } 182 ++*dst; 183 return 0; 184 } 185 186 size_t utf16_strnlen(const u16 *src, size_t count) 187 { 188 size_t len = 0; 189 190 for (; *src && count; --count) { 191 s32 code = utf16_get(&src); 192 193 if (!code) 194 break; 195 /* 196 * In case of an illegal sequence still reserve space for a 197 * replacement character. 198 */ 199 ++len; 200 } 201 return len; 202 } 203 204 size_t utf16_utf8_strnlen(const u16 *src, size_t count) 205 { 206 size_t len = 0; 207 208 for (; *src && count; --count) { 209 s32 code = utf16_get(&src); 210 211 if (!code) 212 break; 213 if (code < 0) 214 /* Reserve space for a replacement character */ 215 len += 1; 216 else if (code < 0x80) 217 len += 1; 218 else if (code < 0x800) 219 len += 2; 220 else if (code < 0x10000) 221 len += 3; 222 else 223 len += 4; 224 } 225 return len; 226 } 227 228 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) 229 { 230 if (!src || !dst || !*dst) 231 return -1; 232 233 for (; count && *src; --count) { 234 s32 code = utf16_get(&src); 235 236 if (code < 0) 237 code = '?'; 238 utf8_put(code, dst); 239 } 240 **dst = 0; 241 return 0; 242 } 243 244 245 size_t u16_strlen(const u16 *in) 246 { 247 size_t i; 248 for (i = 0; in[i]; i++); 249 return i; 250 } 251 252 size_t u16_strnlen(const u16 *in, size_t count) 253 { 254 size_t i; 255 for (i = 0; count-- && in[i]; i++); 256 return i; 257 } 258 259 /* Convert UTF-16 to UTF-8. */ 260 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size) 261 { 262 uint32_t code_high = 0; 263 264 while (size--) { 265 uint32_t code = *src++; 266 267 if (code_high) { 268 if (code >= 0xDC00 && code <= 0xDFFF) { 269 /* Surrogate pair. */ 270 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000; 271 272 *dest++ = (code >> 18) | 0xF0; 273 *dest++ = ((code >> 12) & 0x3F) | 0x80; 274 *dest++ = ((code >> 6) & 0x3F) | 0x80; 275 *dest++ = (code & 0x3F) | 0x80; 276 } else { 277 /* Error... */ 278 *dest++ = '?'; 279 /* *src may be valid. Don't eat it. */ 280 src--; 281 } 282 283 code_high = 0; 284 } else { 285 if (code <= 0x007F) { 286 *dest++ = code; 287 } else if (code <= 0x07FF) { 288 *dest++ = (code >> 6) | 0xC0; 289 *dest++ = (code & 0x3F) | 0x80; 290 } else if (code >= 0xD800 && code <= 0xDBFF) { 291 code_high = code; 292 continue; 293 } else if (code >= 0xDC00 && code <= 0xDFFF) { 294 /* Error... */ 295 *dest++ = '?'; 296 } else if (code < 0x10000) { 297 *dest++ = (code >> 12) | 0xE0; 298 *dest++ = ((code >> 6) & 0x3F) | 0x80; 299 *dest++ = (code & 0x3F) | 0x80; 300 } else { 301 *dest++ = (code >> 18) | 0xF0; 302 *dest++ = ((code >> 12) & 0x3F) | 0x80; 303 *dest++ = ((code >> 6) & 0x3F) | 0x80; 304 *dest++ = (code & 0x3F) | 0x80; 305 } 306 } 307 } 308 309 return dest; 310 } 311