1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * unicode.c 4 * 5 * PURPOSE 6 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 7 * Also handles filename mangling 8 * 9 * DESCRIPTION 10 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 11 * http://www.osta.org/ 12 * UTF-8 is explained in the IETF RFC XXXX. 13 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 14 * 15 */ 16 17 #include "udfdecl.h" 18 19 #include <linux/kernel.h> 20 #include <linux/string.h> /* for memset */ 21 #include <linux/nls.h> 22 #include <linux/crc-itu-t.h> 23 #include <linux/slab.h> 24 25 #include "udf_sb.h" 26 27 #define PLANE_SIZE 0x10000 28 #define UNICODE_MAX 0x10ffff 29 #define SURROGATE_MASK 0xfffff800 30 #define SURROGATE_PAIR 0x0000d800 31 #define SURROGATE_LOW 0x00000400 32 #define SURROGATE_CHAR_BITS 10 33 #define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1) 34 35 #define ILLEGAL_CHAR_MARK '_' 36 #define EXT_MARK '.' 37 #define CRC_MARK '#' 38 #define EXT_SIZE 5 39 /* Number of chars we need to store generated CRC to make filename unique */ 40 #define CRC_LEN 5 41 42 static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len, 43 int str_i_idx, int u_ch, unicode_t *ret) 44 { 45 unicode_t c; 46 int start_idx = str_i_idx; 47 48 /* Expand OSTA compressed Unicode to Unicode */ 49 c = str_i[str_i_idx++]; 50 if (u_ch > 1) 51 c = (c << 8) | str_i[str_i_idx++]; 52 if ((c & SURROGATE_MASK) == SURROGATE_PAIR) { 53 unicode_t next; 54 55 /* Trailing surrogate char */ 56 if (str_i_idx >= str_i_max_len) { 57 c = UNICODE_MAX + 1; 58 goto out; 59 } 60 61 /* Low surrogate must follow the high one... */ 62 if (c & SURROGATE_LOW) { 63 c = UNICODE_MAX + 1; 64 goto out; 65 } 66 67 WARN_ON_ONCE(u_ch != 2); 68 next = str_i[str_i_idx++] << 8; 69 next |= str_i[str_i_idx++]; 70 if ((next & SURROGATE_MASK) != SURROGATE_PAIR || 71 !(next & SURROGATE_LOW)) { 72 c = UNICODE_MAX + 1; 73 goto out; 74 } 75 76 c = PLANE_SIZE + 77 ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) + 78 (next & SURROGATE_CHAR_MASK); 79 } 80 out: 81 *ret = c; 82 return str_i_idx - start_idx; 83 } 84 85 86 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, 87 int *str_o_idx, 88 const uint8_t *str_i, int str_i_max_len, 89 int *str_i_idx, 90 int u_ch, int *needsCRC, 91 int (*conv_f)(wchar_t, unsigned char *, int), 92 int translate) 93 { 94 unicode_t c; 95 int illChar = 0; 96 int len, gotch = 0; 97 98 while (!gotch && *str_i_idx < str_i_max_len) { 99 if (*str_o_idx >= str_o_max_len) { 100 *needsCRC = 1; 101 return gotch; 102 } 103 104 len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch, 105 &c); 106 /* These chars cannot be converted. Replace them. */ 107 if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) || 108 (translate && c == '/')) { 109 illChar = 1; 110 if (!translate) 111 gotch = 1; 112 } else if (illChar) 113 break; 114 else 115 gotch = 1; 116 *str_i_idx += len; 117 } 118 if (illChar) { 119 *needsCRC = 1; 120 c = ILLEGAL_CHAR_MARK; 121 gotch = 1; 122 } 123 if (gotch) { 124 if (conv_f) { 125 len = conv_f(c, &str_o[*str_o_idx], 126 str_o_max_len - *str_o_idx); 127 } else { 128 len = utf32_to_utf8(c, &str_o[*str_o_idx], 129 str_o_max_len - *str_o_idx); 130 if (len < 0) 131 len = -ENAMETOOLONG; 132 } 133 /* Valid character? */ 134 if (len >= 0) 135 *str_o_idx += len; 136 else if (len == -ENAMETOOLONG) { 137 *needsCRC = 1; 138 gotch = 0; 139 } else { 140 str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK; 141 *needsCRC = 1; 142 } 143 } 144 return gotch; 145 } 146 147 static int udf_name_from_CS0(struct super_block *sb, 148 uint8_t *str_o, int str_max_len, 149 const uint8_t *ocu, int ocu_len, 150 int translate) 151 { 152 uint32_t c; 153 uint8_t cmp_id; 154 int idx, len; 155 int u_ch; 156 int needsCRC = 0; 157 int ext_i_len, ext_max_len; 158 int str_o_len = 0; /* Length of resulting output */ 159 int ext_o_len = 0; /* Extension output length */ 160 int ext_crc_len = 0; /* Extension output length if used with CRC */ 161 int i_ext = -1; /* Extension position in input buffer */ 162 int o_crc = 0; /* Rightmost possible output pos for CRC+ext */ 163 unsigned short valueCRC; 164 uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; 165 uint8_t crc[CRC_LEN]; 166 int (*conv_f)(wchar_t, unsigned char *, int); 167 168 if (str_max_len <= 0) 169 return 0; 170 171 if (ocu_len == 0) { 172 memset(str_o, 0, str_max_len); 173 return 0; 174 } 175 176 if (UDF_SB(sb)->s_nls_map) 177 conv_f = UDF_SB(sb)->s_nls_map->uni2char; 178 else 179 conv_f = NULL; 180 181 cmp_id = ocu[0]; 182 if (cmp_id != 8 && cmp_id != 16) { 183 memset(str_o, 0, str_max_len); 184 pr_err("unknown compression code (%u)\n", cmp_id); 185 return -EINVAL; 186 } 187 u_ch = cmp_id >> 3; 188 189 ocu++; 190 ocu_len--; 191 192 if (ocu_len % u_ch) { 193 pr_err("incorrect filename length (%d)\n", ocu_len + 1); 194 return -EINVAL; 195 } 196 197 if (translate) { 198 /* Look for extension */ 199 for (idx = ocu_len - u_ch, ext_i_len = 0; 200 (idx >= 0) && (ext_i_len < EXT_SIZE); 201 idx -= u_ch, ext_i_len++) { 202 c = ocu[idx]; 203 if (u_ch > 1) 204 c = (c << 8) | ocu[idx + 1]; 205 206 if (c == EXT_MARK) { 207 if (ext_i_len) 208 i_ext = idx; 209 break; 210 } 211 } 212 if (i_ext >= 0) { 213 /* Convert extension */ 214 ext_max_len = min_t(int, sizeof(ext), str_max_len); 215 ext[ext_o_len++] = EXT_MARK; 216 idx = i_ext + u_ch; 217 while (udf_name_conv_char(ext, ext_max_len, &ext_o_len, 218 ocu, ocu_len, &idx, 219 u_ch, &needsCRC, 220 conv_f, translate)) { 221 if ((ext_o_len + CRC_LEN) < str_max_len) 222 ext_crc_len = ext_o_len; 223 } 224 } 225 } 226 227 idx = 0; 228 while (1) { 229 if (translate && (idx == i_ext)) { 230 if (str_o_len > (str_max_len - ext_o_len)) 231 needsCRC = 1; 232 break; 233 } 234 235 if (!udf_name_conv_char(str_o, str_max_len, &str_o_len, 236 ocu, ocu_len, &idx, 237 u_ch, &needsCRC, conv_f, translate)) 238 break; 239 240 if (translate && 241 (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))) 242 o_crc = str_o_len; 243 } 244 245 if (translate) { 246 if (str_o_len > 0 && str_o_len <= 2 && str_o[0] == '.' && 247 (str_o_len == 1 || str_o[1] == '.')) 248 needsCRC = 1; 249 if (needsCRC) { 250 str_o_len = o_crc; 251 valueCRC = crc_itu_t(0, ocu, ocu_len); 252 crc[0] = CRC_MARK; 253 crc[1] = hex_asc_upper_hi(valueCRC >> 8); 254 crc[2] = hex_asc_upper_lo(valueCRC >> 8); 255 crc[3] = hex_asc_upper_hi(valueCRC); 256 crc[4] = hex_asc_upper_lo(valueCRC); 257 len = min_t(int, CRC_LEN, str_max_len - str_o_len); 258 memcpy(&str_o[str_o_len], crc, len); 259 str_o_len += len; 260 ext_o_len = ext_crc_len; 261 } 262 if (ext_o_len > 0) { 263 memcpy(&str_o[str_o_len], ext, ext_o_len); 264 str_o_len += ext_o_len; 265 } 266 } 267 268 return str_o_len; 269 } 270 271 static int udf_name_to_CS0(struct super_block *sb, 272 uint8_t *ocu, int ocu_max_len, 273 const uint8_t *str_i, int str_len) 274 { 275 int i, len; 276 unsigned int max_val; 277 int u_len, u_ch; 278 unicode_t uni_char; 279 int (*conv_f)(const unsigned char *, int, wchar_t *); 280 281 if (ocu_max_len <= 0) 282 return 0; 283 284 if (UDF_SB(sb)->s_nls_map) 285 conv_f = UDF_SB(sb)->s_nls_map->char2uni; 286 else 287 conv_f = NULL; 288 289 memset(ocu, 0, ocu_max_len); 290 ocu[0] = 8; 291 max_val = 0xff; 292 u_ch = 1; 293 294 try_again: 295 u_len = 1; 296 for (i = 0; i < str_len; i += len) { 297 /* Name didn't fit? */ 298 if (u_len + u_ch > ocu_max_len) 299 return 0; 300 if (conv_f) { 301 wchar_t wchar; 302 303 len = conv_f(&str_i[i], str_len - i, &wchar); 304 if (len > 0) 305 uni_char = wchar; 306 } else { 307 len = utf8_to_utf32(&str_i[i], str_len - i, 308 &uni_char); 309 } 310 /* Invalid character, deal with it */ 311 if (len <= 0 || uni_char > UNICODE_MAX) { 312 len = 1; 313 uni_char = '?'; 314 } 315 316 if (uni_char > max_val) { 317 unicode_t c; 318 319 if (max_val == 0xff) { 320 max_val = 0xffff; 321 ocu[0] = 0x10; 322 u_ch = 2; 323 goto try_again; 324 } 325 /* 326 * Use UTF-16 encoding for chars outside we 327 * cannot encode directly. 328 */ 329 if (u_len + 2 * u_ch > ocu_max_len) 330 return 0; 331 332 uni_char -= PLANE_SIZE; 333 c = SURROGATE_PAIR | 334 ((uni_char >> SURROGATE_CHAR_BITS) & 335 SURROGATE_CHAR_MASK); 336 ocu[u_len++] = (uint8_t)(c >> 8); 337 ocu[u_len++] = (uint8_t)(c & 0xff); 338 uni_char = SURROGATE_PAIR | SURROGATE_LOW | 339 (uni_char & SURROGATE_CHAR_MASK); 340 } 341 342 if (max_val == 0xffff) 343 ocu[u_len++] = (uint8_t)(uni_char >> 8); 344 ocu[u_len++] = (uint8_t)(uni_char & 0xff); 345 } 346 347 return u_len; 348 } 349 350 /* 351 * Convert CS0 dstring to output charset. Warning: This function may truncate 352 * input string if it is too long as it is used for informational strings only 353 * and it is better to truncate the string than to refuse mounting a media. 354 */ 355 int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, 356 const uint8_t *ocu_i, int i_len) 357 { 358 int s_len = 0; 359 360 if (i_len > 0) { 361 s_len = ocu_i[i_len - 1]; 362 if (s_len >= i_len) { 363 pr_warn("incorrect dstring lengths (%d/%d)," 364 " truncating\n", s_len, i_len); 365 s_len = i_len - 1; 366 /* 2-byte encoding? Need to round properly... */ 367 if (ocu_i[0] == 16) 368 s_len -= (s_len - 1) & 2; 369 } 370 } 371 372 return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0); 373 } 374 375 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, 376 uint8_t *dname, int dlen) 377 { 378 int ret; 379 380 if (!slen) 381 return -EIO; 382 383 if (dlen <= 0) 384 return 0; 385 386 ret = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1); 387 /* Zero length filename isn't valid... */ 388 if (ret == 0) 389 ret = -EINVAL; 390 return ret; 391 } 392 393 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, 394 uint8_t *dname, int dlen) 395 { 396 return udf_name_to_CS0(sb, dname, dlen, sname, slen); 397 } 398 399