1 /* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * COPYRIGHT 15 * This file is distributed under the terms of the GNU General Public 16 * License (GPL). Copies of the GPL can be obtained from: 17 * ftp://prep.ai.mit.edu/pub/gnu/GPL 18 * Each contributing author retains all rights to their own work. 19 */ 20 21 #include "udfdecl.h" 22 23 #include <linux/kernel.h> 24 #include <linux/string.h> /* for memset */ 25 #include <linux/nls.h> 26 #include <linux/crc-itu-t.h> 27 #include <linux/slab.h> 28 29 #include "udf_sb.h" 30 31 #define SURROGATE_MASK 0xfffff800 32 #define SURROGATE_PAIR 0x0000d800 33 34 static int udf_uni2char_utf8(wchar_t uni, 35 unsigned char *out, 36 int boundlen) 37 { 38 int u_len = 0; 39 40 if (boundlen <= 0) 41 return -ENAMETOOLONG; 42 43 if ((uni & SURROGATE_MASK) == SURROGATE_PAIR) 44 return -EINVAL; 45 46 if (uni < 0x80) { 47 out[u_len++] = (unsigned char)uni; 48 } else if (uni < 0x800) { 49 if (boundlen < 2) 50 return -ENAMETOOLONG; 51 out[u_len++] = (unsigned char)(0xc0 | (uni >> 6)); 52 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); 53 } else { 54 if (boundlen < 3) 55 return -ENAMETOOLONG; 56 out[u_len++] = (unsigned char)(0xe0 | (uni >> 12)); 57 out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f)); 58 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); 59 } 60 return u_len; 61 } 62 63 static int udf_char2uni_utf8(const unsigned char *in, 64 int boundlen, 65 wchar_t *uni) 66 { 67 unsigned int utf_char; 68 unsigned char c; 69 int utf_cnt, u_len; 70 71 utf_char = 0; 72 utf_cnt = 0; 73 for (u_len = 0; u_len < boundlen;) { 74 c = in[u_len++]; 75 76 /* Complete a multi-byte UTF-8 character */ 77 if (utf_cnt) { 78 utf_char = (utf_char << 6) | (c & 0x3f); 79 if (--utf_cnt) 80 continue; 81 } else { 82 /* Check for a multi-byte UTF-8 character */ 83 if (c & 0x80) { 84 /* Start a multi-byte UTF-8 character */ 85 if ((c & 0xe0) == 0xc0) { 86 utf_char = c & 0x1f; 87 utf_cnt = 1; 88 } else if ((c & 0xf0) == 0xe0) { 89 utf_char = c & 0x0f; 90 utf_cnt = 2; 91 } else if ((c & 0xf8) == 0xf0) { 92 utf_char = c & 0x07; 93 utf_cnt = 3; 94 } else if ((c & 0xfc) == 0xf8) { 95 utf_char = c & 0x03; 96 utf_cnt = 4; 97 } else if ((c & 0xfe) == 0xfc) { 98 utf_char = c & 0x01; 99 utf_cnt = 5; 100 } else { 101 utf_cnt = -1; 102 break; 103 } 104 continue; 105 } else { 106 /* Single byte UTF-8 character (most common) */ 107 utf_char = c; 108 } 109 } 110 *uni = utf_char; 111 break; 112 } 113 if (utf_cnt) { 114 *uni = '?'; 115 return -EINVAL; 116 } 117 return u_len; 118 } 119 120 #define ILLEGAL_CHAR_MARK '_' 121 #define EXT_MARK '.' 122 #define CRC_MARK '#' 123 #define EXT_SIZE 5 124 /* Number of chars we need to store generated CRC to make filename unique */ 125 #define CRC_LEN 5 126 127 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, 128 int *str_o_idx, 129 const uint8_t *str_i, int str_i_max_len, 130 int *str_i_idx, 131 int u_ch, int *needsCRC, 132 int (*conv_f)(wchar_t, unsigned char *, int), 133 int translate) 134 { 135 uint32_t c; 136 int illChar = 0; 137 int len, gotch = 0; 138 139 for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) { 140 if (*str_o_idx >= str_o_max_len) { 141 *needsCRC = 1; 142 return gotch; 143 } 144 145 /* Expand OSTA compressed Unicode to Unicode */ 146 c = str_i[*str_i_idx]; 147 if (u_ch > 1) 148 c = (c << 8) | str_i[*str_i_idx + 1]; 149 150 if (translate && (c == '/' || c == 0)) 151 illChar = 1; 152 else if (illChar) 153 break; 154 else 155 gotch = 1; 156 } 157 if (illChar) { 158 *needsCRC = 1; 159 c = ILLEGAL_CHAR_MARK; 160 gotch = 1; 161 } 162 if (gotch) { 163 len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx); 164 /* Valid character? */ 165 if (len >= 0) 166 *str_o_idx += len; 167 else if (len == -ENAMETOOLONG) { 168 *needsCRC = 1; 169 gotch = 0; 170 } else { 171 str_o[(*str_o_idx)++] = '?'; 172 *needsCRC = 1; 173 } 174 } 175 return gotch; 176 } 177 178 static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, 179 const uint8_t *ocu, int ocu_len, 180 int (*conv_f)(wchar_t, unsigned char *, int), 181 int translate) 182 { 183 uint32_t c; 184 uint8_t cmp_id; 185 int idx, len; 186 int u_ch; 187 int needsCRC = 0; 188 int ext_i_len, ext_max_len; 189 int str_o_len = 0; /* Length of resulting output */ 190 int ext_o_len = 0; /* Extension output length */ 191 int ext_crc_len = 0; /* Extension output length if used with CRC */ 192 int i_ext = -1; /* Extension position in input buffer */ 193 int o_crc = 0; /* Rightmost possible output pos for CRC+ext */ 194 unsigned short valueCRC; 195 uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; 196 uint8_t crc[CRC_LEN]; 197 198 if (str_max_len <= 0) 199 return 0; 200 201 if (ocu_len == 0) { 202 memset(str_o, 0, str_max_len); 203 return 0; 204 } 205 206 cmp_id = ocu[0]; 207 if (cmp_id != 8 && cmp_id != 16) { 208 memset(str_o, 0, str_max_len); 209 pr_err("unknown compression code (%u)\n", cmp_id); 210 return -EINVAL; 211 } 212 u_ch = cmp_id >> 3; 213 214 ocu++; 215 ocu_len--; 216 217 if (ocu_len % u_ch) { 218 pr_err("incorrect filename length (%d)\n", ocu_len + 1); 219 return -EINVAL; 220 } 221 222 if (translate) { 223 /* Look for extension */ 224 for (idx = ocu_len - u_ch, ext_i_len = 0; 225 (idx >= 0) && (ext_i_len < EXT_SIZE); 226 idx -= u_ch, ext_i_len++) { 227 c = ocu[idx]; 228 if (u_ch > 1) 229 c = (c << 8) | ocu[idx + 1]; 230 231 if (c == EXT_MARK) { 232 if (ext_i_len) 233 i_ext = idx; 234 break; 235 } 236 } 237 if (i_ext >= 0) { 238 /* Convert extension */ 239 ext_max_len = min_t(int, sizeof(ext), str_max_len); 240 ext[ext_o_len++] = EXT_MARK; 241 idx = i_ext + u_ch; 242 while (udf_name_conv_char(ext, ext_max_len, &ext_o_len, 243 ocu, ocu_len, &idx, 244 u_ch, &needsCRC, 245 conv_f, translate)) { 246 if ((ext_o_len + CRC_LEN) < str_max_len) 247 ext_crc_len = ext_o_len; 248 } 249 } 250 } 251 252 idx = 0; 253 while (1) { 254 if (translate && (idx == i_ext)) { 255 if (str_o_len > (str_max_len - ext_o_len)) 256 needsCRC = 1; 257 break; 258 } 259 260 if (!udf_name_conv_char(str_o, str_max_len, &str_o_len, 261 ocu, ocu_len, &idx, 262 u_ch, &needsCRC, conv_f, translate)) 263 break; 264 265 if (translate && 266 (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))) 267 o_crc = str_o_len; 268 } 269 270 if (translate) { 271 if (str_o_len <= 2 && str_o[0] == '.' && 272 (str_o_len == 1 || str_o[1] == '.')) 273 needsCRC = 1; 274 if (needsCRC) { 275 str_o_len = o_crc; 276 valueCRC = crc_itu_t(0, ocu, ocu_len); 277 crc[0] = CRC_MARK; 278 crc[1] = hex_asc_upper_hi(valueCRC >> 8); 279 crc[2] = hex_asc_upper_lo(valueCRC >> 8); 280 crc[3] = hex_asc_upper_hi(valueCRC); 281 crc[4] = hex_asc_upper_lo(valueCRC); 282 len = min_t(int, CRC_LEN, str_max_len - str_o_len); 283 memcpy(&str_o[str_o_len], crc, len); 284 str_o_len += len; 285 ext_o_len = ext_crc_len; 286 } 287 if (ext_o_len > 0) { 288 memcpy(&str_o[str_o_len], ext, ext_o_len); 289 str_o_len += ext_o_len; 290 } 291 } 292 293 return str_o_len; 294 } 295 296 static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len, 297 const uint8_t *str_i, int str_len, 298 int (*conv_f)(const unsigned char *, int, wchar_t *)) 299 { 300 int i, len; 301 unsigned int max_val; 302 wchar_t uni_char; 303 int u_len, u_ch; 304 305 if (ocu_max_len <= 0) 306 return 0; 307 308 memset(ocu, 0, ocu_max_len); 309 ocu[0] = 8; 310 max_val = 0xff; 311 u_ch = 1; 312 313 try_again: 314 u_len = 1; 315 for (i = 0; i < str_len; i++) { 316 /* Name didn't fit? */ 317 if (u_len + u_ch > ocu_max_len) 318 return 0; 319 len = conv_f(&str_i[i], str_len - i, &uni_char); 320 if (!len) 321 continue; 322 /* Invalid character, deal with it */ 323 if (len < 0) { 324 len = 1; 325 uni_char = '?'; 326 } 327 328 if (uni_char > max_val) { 329 max_val = 0xffff; 330 ocu[0] = 0x10; 331 u_ch = 2; 332 goto try_again; 333 } 334 335 if (max_val == 0xffff) 336 ocu[u_len++] = (uint8_t)(uni_char >> 8); 337 ocu[u_len++] = (uint8_t)(uni_char & 0xff); 338 i += len - 1; 339 } 340 341 return u_len; 342 } 343 344 int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, 345 const uint8_t *ocu_i, int i_len) 346 { 347 int s_len = 0; 348 349 if (i_len > 0) { 350 s_len = ocu_i[i_len - 1]; 351 if (s_len >= i_len) { 352 pr_err("incorrect dstring lengths (%d/%d)\n", 353 s_len, i_len); 354 return -EINVAL; 355 } 356 } 357 358 return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, 359 udf_uni2char_utf8, 0); 360 } 361 362 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, 363 uint8_t *dname, int dlen) 364 { 365 int (*conv_f)(wchar_t, unsigned char *, int); 366 int ret; 367 368 if (!slen) 369 return -EIO; 370 371 if (dlen <= 0) 372 return 0; 373 374 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 375 conv_f = udf_uni2char_utf8; 376 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 377 conv_f = UDF_SB(sb)->s_nls_map->uni2char; 378 } else 379 BUG(); 380 381 ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1); 382 /* Zero length filename isn't valid... */ 383 if (ret == 0) 384 ret = -EINVAL; 385 return ret; 386 } 387 388 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, 389 uint8_t *dname, int dlen) 390 { 391 int (*conv_f)(const unsigned char *, int, wchar_t *); 392 393 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 394 conv_f = udf_char2uni_utf8; 395 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 396 conv_f = UDF_SB(sb)->s_nls_map->char2uni; 397 } else 398 BUG(); 399 400 return udf_name_to_CS0(dname, dlen, sname, slen, conv_f); 401 } 402 403