1 /* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * COPYRIGHT 15 * This file is distributed under the terms of the GNU General Public 16 * License (GPL). Copies of the GPL can be obtained from: 17 * ftp://prep.ai.mit.edu/pub/gnu/GPL 18 * Each contributing author retains all rights to their own work. 19 */ 20 21 #include "udfdecl.h" 22 23 #include <linux/kernel.h> 24 #include <linux/string.h> /* for memset */ 25 #include <linux/nls.h> 26 #include <linux/udf_fs.h> 27 28 #include "udf_sb.h" 29 30 static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); 31 32 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) 33 { 34 if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) ) 35 return 0; 36 memset(dest, 0, sizeof(struct ustr)); 37 memcpy(dest->u_name, src, strlen); 38 dest->u_cmpID = 0x08; 39 dest->u_len = strlen; 40 return strlen; 41 } 42 43 /* 44 * udf_build_ustr 45 */ 46 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) 47 { 48 int usesize; 49 50 if ( (!dest) || (!ptr) || (!size) ) 51 return -1; 52 53 memset(dest, 0, sizeof(struct ustr)); 54 usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size; 55 dest->u_cmpID=ptr[0]; 56 dest->u_len=ptr[size-1]; 57 memcpy(dest->u_name, ptr+1, usesize-1); 58 return 0; 59 } 60 61 /* 62 * udf_build_ustr_exact 63 */ 64 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) 65 { 66 if ( (!dest) || (!ptr) || (!exactsize) ) 67 return -1; 68 69 memset(dest, 0, sizeof(struct ustr)); 70 dest->u_cmpID=ptr[0]; 71 dest->u_len=exactsize-1; 72 memcpy(dest->u_name, ptr+1, exactsize-1); 73 return 0; 74 } 75 76 /* 77 * udf_ocu_to_utf8 78 * 79 * PURPOSE 80 * Convert OSTA Compressed Unicode to the UTF-8 equivalent. 81 * 82 * DESCRIPTION 83 * This routine is only called by udf_filldir(). 84 * 85 * PRE-CONDITIONS 86 * utf Pointer to UTF-8 output buffer. 87 * ocu Pointer to OSTA Compressed Unicode input buffer 88 * of size UDF_NAME_LEN bytes. 89 * both of type "struct ustr *" 90 * 91 * POST-CONDITIONS 92 * <return> Zero on success. 93 * 94 * HISTORY 95 * November 12, 1997 - Andrew E. Mileski 96 * Written, tested, and released. 97 */ 98 int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) 99 { 100 uint8_t *ocu; 101 uint32_t c; 102 uint8_t cmp_id, ocu_len; 103 int i; 104 105 ocu = ocu_i->u_name; 106 107 ocu_len = ocu_i->u_len; 108 cmp_id = ocu_i->u_cmpID; 109 utf_o->u_len = 0; 110 111 if (ocu_len == 0) 112 { 113 memset(utf_o, 0, sizeof(struct ustr)); 114 utf_o->u_cmpID = 0; 115 utf_o->u_len = 0; 116 return 0; 117 } 118 119 if ((cmp_id != 8) && (cmp_id != 16)) 120 { 121 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); 122 return 0; 123 } 124 125 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) 126 { 127 128 /* Expand OSTA compressed Unicode to Unicode */ 129 c = ocu[i++]; 130 if (cmp_id == 16) 131 c = (c << 8) | ocu[i++]; 132 133 /* Compress Unicode to UTF-8 */ 134 if (c < 0x80U) 135 utf_o->u_name[utf_o->u_len++] = (uint8_t)c; 136 else if (c < 0x800U) 137 { 138 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); 139 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); 140 } 141 else 142 { 143 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); 144 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f)); 145 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); 146 } 147 } 148 utf_o->u_cmpID=8; 149 150 return utf_o->u_len; 151 } 152 153 /* 154 * 155 * udf_utf8_to_ocu 156 * 157 * PURPOSE 158 * Convert UTF-8 to the OSTA Compressed Unicode equivalent. 159 * 160 * DESCRIPTION 161 * This routine is only called by udf_lookup(). 162 * 163 * PRE-CONDITIONS 164 * ocu Pointer to OSTA Compressed Unicode output 165 * buffer of size UDF_NAME_LEN bytes. 166 * utf Pointer to UTF-8 input buffer. 167 * utf_len Length of UTF-8 input buffer in bytes. 168 * 169 * POST-CONDITIONS 170 * <return> Zero on success. 171 * 172 * HISTORY 173 * November 12, 1997 - Andrew E. Mileski 174 * Written, tested, and released. 175 */ 176 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) 177 { 178 unsigned c, i, max_val, utf_char; 179 int utf_cnt, u_len; 180 181 memset(ocu, 0, sizeof(dstring) * length); 182 ocu[0] = 8; 183 max_val = 0xffU; 184 185 try_again: 186 u_len = 0U; 187 utf_char = 0U; 188 utf_cnt = 0U; 189 for (i = 0U; i < utf->u_len; i++) 190 { 191 c = (uint8_t)utf->u_name[i]; 192 193 /* Complete a multi-byte UTF-8 character */ 194 if (utf_cnt) 195 { 196 utf_char = (utf_char << 6) | (c & 0x3fU); 197 if (--utf_cnt) 198 continue; 199 } 200 else 201 { 202 /* Check for a multi-byte UTF-8 character */ 203 if (c & 0x80U) 204 { 205 /* Start a multi-byte UTF-8 character */ 206 if ((c & 0xe0U) == 0xc0U) 207 { 208 utf_char = c & 0x1fU; 209 utf_cnt = 1; 210 } 211 else if ((c & 0xf0U) == 0xe0U) 212 { 213 utf_char = c & 0x0fU; 214 utf_cnt = 2; 215 } 216 else if ((c & 0xf8U) == 0xf0U) 217 { 218 utf_char = c & 0x07U; 219 utf_cnt = 3; 220 } 221 else if ((c & 0xfcU) == 0xf8U) 222 { 223 utf_char = c & 0x03U; 224 utf_cnt = 4; 225 } 226 else if ((c & 0xfeU) == 0xfcU) 227 { 228 utf_char = c & 0x01U; 229 utf_cnt = 5; 230 } 231 else 232 goto error_out; 233 continue; 234 } else 235 /* Single byte UTF-8 character (most common) */ 236 utf_char = c; 237 } 238 239 /* Choose no compression if necessary */ 240 if (utf_char > max_val) 241 { 242 if ( 0xffU == max_val ) 243 { 244 max_val = 0xffffU; 245 ocu[0] = (uint8_t)0x10U; 246 goto try_again; 247 } 248 goto error_out; 249 } 250 251 if (max_val == 0xffffU) 252 { 253 ocu[++u_len] = (uint8_t)(utf_char >> 8); 254 } 255 ocu[++u_len] = (uint8_t)(utf_char & 0xffU); 256 } 257 258 259 if (utf_cnt) 260 { 261 error_out: 262 ocu[++u_len] = '?'; 263 printk(KERN_DEBUG "udf: bad UTF-8 character\n"); 264 } 265 266 ocu[length - 1] = (uint8_t)u_len + 1; 267 return u_len + 1; 268 } 269 270 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i) 271 { 272 uint8_t *ocu; 273 uint32_t c; 274 uint8_t cmp_id, ocu_len; 275 int i; 276 277 ocu = ocu_i->u_name; 278 279 ocu_len = ocu_i->u_len; 280 cmp_id = ocu_i->u_cmpID; 281 utf_o->u_len = 0; 282 283 if (ocu_len == 0) 284 { 285 memset(utf_o, 0, sizeof(struct ustr)); 286 utf_o->u_cmpID = 0; 287 utf_o->u_len = 0; 288 return 0; 289 } 290 291 if ((cmp_id != 8) && (cmp_id != 16)) 292 { 293 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); 294 return 0; 295 } 296 297 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) 298 { 299 /* Expand OSTA compressed Unicode to Unicode */ 300 c = ocu[i++]; 301 if (cmp_id == 16) 302 c = (c << 8) | ocu[i++]; 303 304 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 305 UDF_NAME_LEN - utf_o->u_len); 306 } 307 utf_o->u_cmpID=8; 308 309 return utf_o->u_len; 310 } 311 312 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length) 313 { 314 unsigned len, i, max_val; 315 uint16_t uni_char; 316 int u_len; 317 318 memset(ocu, 0, sizeof(dstring) * length); 319 ocu[0] = 8; 320 max_val = 0xffU; 321 322 try_again: 323 u_len = 0U; 324 for (i = 0U; i < uni->u_len; i++) 325 { 326 len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char); 327 if (len <= 0) 328 continue; 329 330 if (uni_char > max_val) 331 { 332 max_val = 0xffffU; 333 ocu[0] = (uint8_t)0x10U; 334 goto try_again; 335 } 336 337 if (max_val == 0xffffU) 338 ocu[++u_len] = (uint8_t)(uni_char >> 8); 339 ocu[++u_len] = (uint8_t)(uni_char & 0xffU); 340 i += len - 1; 341 } 342 343 ocu[length - 1] = (uint8_t)u_len + 1; 344 return u_len + 1; 345 } 346 347 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen) 348 { 349 struct ustr filename, unifilename; 350 int len; 351 352 if (udf_build_ustr_exact(&unifilename, sname, flen)) 353 { 354 return 0; 355 } 356 357 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) 358 { 359 if (!udf_CS0toUTF8(&filename, &unifilename) ) 360 { 361 udf_debug("Failed in udf_get_filename: sname = %s\n", sname); 362 return 0; 363 } 364 } 365 else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 366 { 367 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) ) 368 { 369 udf_debug("Failed in udf_get_filename: sname = %s\n", sname); 370 return 0; 371 } 372 } 373 else 374 return 0; 375 376 if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, 377 unifilename.u_name, unifilename.u_len))) 378 { 379 return len; 380 } 381 return 0; 382 } 383 384 int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen) 385 { 386 struct ustr unifilename; 387 int namelen; 388 389 if ( !(udf_char_to_ustr(&unifilename, sname, flen)) ) 390 { 391 return 0; 392 } 393 394 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) 395 { 396 if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) ) 397 { 398 return 0; 399 } 400 } 401 else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 402 { 403 if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) ) 404 { 405 return 0; 406 } 407 } 408 else 409 return 0; 410 411 return namelen; 412 } 413 414 #define ILLEGAL_CHAR_MARK '_' 415 #define EXT_MARK '.' 416 #define CRC_MARK '#' 417 #define EXT_SIZE 5 418 419 static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen) 420 { 421 int index, newIndex = 0, needsCRC = 0; 422 int extIndex = 0, newExtIndex = 0, hasExt = 0; 423 unsigned short valueCRC; 424 uint8_t curr; 425 const uint8_t hexChar[] = "0123456789ABCDEF"; 426 427 if (udfName[0] == '.' && (udfLen == 1 || 428 (udfLen == 2 && udfName[1] == '.'))) 429 { 430 needsCRC = 1; 431 newIndex = udfLen; 432 memcpy(newName, udfName, udfLen); 433 } 434 else 435 { 436 for (index = 0; index < udfLen; index++) 437 { 438 curr = udfName[index]; 439 if (curr == '/' || curr == 0) 440 { 441 needsCRC = 1; 442 curr = ILLEGAL_CHAR_MARK; 443 while (index+1 < udfLen && (udfName[index+1] == '/' || 444 udfName[index+1] == 0)) 445 index++; 446 } 447 if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE) 448 { 449 if (udfLen == index + 1) 450 hasExt = 0; 451 else 452 { 453 hasExt = 1; 454 extIndex = index; 455 newExtIndex = newIndex; 456 } 457 } 458 if (newIndex < 256) 459 newName[newIndex++] = curr; 460 else 461 needsCRC = 1; 462 } 463 } 464 if (needsCRC) 465 { 466 uint8_t ext[EXT_SIZE]; 467 int localExtIndex = 0; 468 469 if (hasExt) 470 { 471 int maxFilenameLen; 472 for(index = 0; index<EXT_SIZE && extIndex + index +1 < udfLen; 473 index++ ) 474 { 475 curr = udfName[extIndex + index + 1]; 476 477 if (curr == '/' || curr == 0) 478 { 479 needsCRC = 1; 480 curr = ILLEGAL_CHAR_MARK; 481 while(extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE 482 && (udfName[extIndex + index + 2] == '/' || 483 udfName[extIndex + index + 2] == 0))) 484 index++; 485 } 486 ext[localExtIndex++] = curr; 487 } 488 maxFilenameLen = 250 - localExtIndex; 489 if (newIndex > maxFilenameLen) 490 newIndex = maxFilenameLen; 491 else 492 newIndex = newExtIndex; 493 } 494 else if (newIndex > 250) 495 newIndex = 250; 496 newName[newIndex++] = CRC_MARK; 497 valueCRC = udf_crc(fidName, fidNameLen, 0); 498 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; 499 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; 500 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; 501 newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; 502 503 if (hasExt) 504 { 505 newName[newIndex++] = EXT_MARK; 506 for (index = 0;index < localExtIndex ;index++ ) 507 newName[newIndex++] = ext[index]; 508 } 509 } 510 return newIndex; 511 } 512