1 /* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * CONTACTS 15 * E-mail regarding any portion of the Linux UDF file system should be 16 * directed to the development team's mailing list (run by majordomo): 17 * linux_udf@hpesjro.fc.hp.com 18 * 19 * COPYRIGHT 20 * This file is distributed under the terms of the GNU General Public 21 * License (GPL). Copies of the GPL can be obtained from: 22 * ftp://prep.ai.mit.edu/pub/gnu/GPL 23 * Each contributing author retains all rights to their own work. 24 */ 25 26 #include "udfdecl.h" 27 28 #include <linux/kernel.h> 29 #include <linux/string.h> /* for memset */ 30 #include <linux/nls.h> 31 #include <linux/udf_fs.h> 32 33 #include "udf_sb.h" 34 35 static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); 36 37 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) 38 { 39 if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) ) 40 return 0; 41 memset(dest, 0, sizeof(struct ustr)); 42 memcpy(dest->u_name, src, strlen); 43 dest->u_cmpID = 0x08; 44 dest->u_len = strlen; 45 return strlen; 46 } 47 48 /* 49 * udf_build_ustr 50 */ 51 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) 52 { 53 int usesize; 54 55 if ( (!dest) || (!ptr) || (!size) ) 56 return -1; 57 58 memset(dest, 0, sizeof(struct ustr)); 59 usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size; 60 dest->u_cmpID=ptr[0]; 61 dest->u_len=ptr[size-1]; 62 memcpy(dest->u_name, ptr+1, usesize-1); 63 return 0; 64 } 65 66 /* 67 * udf_build_ustr_exact 68 */ 69 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) 70 { 71 if ( (!dest) || (!ptr) || (!exactsize) ) 72 return -1; 73 74 memset(dest, 0, sizeof(struct ustr)); 75 dest->u_cmpID=ptr[0]; 76 dest->u_len=exactsize-1; 77 memcpy(dest->u_name, ptr+1, exactsize-1); 78 return 0; 79 } 80 81 /* 82 * udf_ocu_to_utf8 83 * 84 * PURPOSE 85 * Convert OSTA Compressed Unicode to the UTF-8 equivalent. 86 * 87 * DESCRIPTION 88 * This routine is only called by udf_filldir(). 89 * 90 * PRE-CONDITIONS 91 * utf Pointer to UTF-8 output buffer. 92 * ocu Pointer to OSTA Compressed Unicode input buffer 93 * of size UDF_NAME_LEN bytes. 94 * both of type "struct ustr *" 95 * 96 * POST-CONDITIONS 97 * <return> Zero on success. 98 * 99 * HISTORY 100 * November 12, 1997 - Andrew E. Mileski 101 * Written, tested, and released. 102 */ 103 int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) 104 { 105 uint8_t *ocu; 106 uint32_t c; 107 uint8_t cmp_id, ocu_len; 108 int i; 109 110 ocu = ocu_i->u_name; 111 112 ocu_len = ocu_i->u_len; 113 cmp_id = ocu_i->u_cmpID; 114 utf_o->u_len = 0; 115 116 if (ocu_len == 0) 117 { 118 memset(utf_o, 0, sizeof(struct ustr)); 119 utf_o->u_cmpID = 0; 120 utf_o->u_len = 0; 121 return 0; 122 } 123 124 if ((cmp_id != 8) && (cmp_id != 16)) 125 { 126 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); 127 return 0; 128 } 129 130 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) 131 { 132 133 /* Expand OSTA compressed Unicode to Unicode */ 134 c = ocu[i++]; 135 if (cmp_id == 16) 136 c = (c << 8) | ocu[i++]; 137 138 /* Compress Unicode to UTF-8 */ 139 if (c < 0x80U) 140 utf_o->u_name[utf_o->u_len++] = (uint8_t)c; 141 else if (c < 0x800U) 142 { 143 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); 144 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); 145 } 146 else 147 { 148 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); 149 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f)); 150 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); 151 } 152 } 153 utf_o->u_cmpID=8; 154 155 return utf_o->u_len; 156 } 157 158 /* 159 * 160 * udf_utf8_to_ocu 161 * 162 * PURPOSE 163 * Convert UTF-8 to the OSTA Compressed Unicode equivalent. 164 * 165 * DESCRIPTION 166 * This routine is only called by udf_lookup(). 167 * 168 * PRE-CONDITIONS 169 * ocu Pointer to OSTA Compressed Unicode output 170 * buffer of size UDF_NAME_LEN bytes. 171 * utf Pointer to UTF-8 input buffer. 172 * utf_len Length of UTF-8 input buffer in bytes. 173 * 174 * POST-CONDITIONS 175 * <return> Zero on success. 176 * 177 * HISTORY 178 * November 12, 1997 - Andrew E. Mileski 179 * Written, tested, and released. 180 */ 181 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) 182 { 183 unsigned c, i, max_val, utf_char; 184 int utf_cnt, u_len; 185 186 memset(ocu, 0, sizeof(dstring) * length); 187 ocu[0] = 8; 188 max_val = 0xffU; 189 190 try_again: 191 u_len = 0U; 192 utf_char = 0U; 193 utf_cnt = 0U; 194 for (i = 0U; i < utf->u_len; i++) 195 { 196 c = (uint8_t)utf->u_name[i]; 197 198 /* Complete a multi-byte UTF-8 character */ 199 if (utf_cnt) 200 { 201 utf_char = (utf_char << 6) | (c & 0x3fU); 202 if (--utf_cnt) 203 continue; 204 } 205 else 206 { 207 /* Check for a multi-byte UTF-8 character */ 208 if (c & 0x80U) 209 { 210 /* Start a multi-byte UTF-8 character */ 211 if ((c & 0xe0U) == 0xc0U) 212 { 213 utf_char = c & 0x1fU; 214 utf_cnt = 1; 215 } 216 else if ((c & 0xf0U) == 0xe0U) 217 { 218 utf_char = c & 0x0fU; 219 utf_cnt = 2; 220 } 221 else if ((c & 0xf8U) == 0xf0U) 222 { 223 utf_char = c & 0x07U; 224 utf_cnt = 3; 225 } 226 else if ((c & 0xfcU) == 0xf8U) 227 { 228 utf_char = c & 0x03U; 229 utf_cnt = 4; 230 } 231 else if ((c & 0xfeU) == 0xfcU) 232 { 233 utf_char = c & 0x01U; 234 utf_cnt = 5; 235 } 236 else 237 goto error_out; 238 continue; 239 } else 240 /* Single byte UTF-8 character (most common) */ 241 utf_char = c; 242 } 243 244 /* Choose no compression if necessary */ 245 if (utf_char > max_val) 246 { 247 if ( 0xffU == max_val ) 248 { 249 max_val = 0xffffU; 250 ocu[0] = (uint8_t)0x10U; 251 goto try_again; 252 } 253 goto error_out; 254 } 255 256 if (max_val == 0xffffU) 257 { 258 ocu[++u_len] = (uint8_t)(utf_char >> 8); 259 } 260 ocu[++u_len] = (uint8_t)(utf_char & 0xffU); 261 } 262 263 264 if (utf_cnt) 265 { 266 error_out: 267 ocu[++u_len] = '?'; 268 printk(KERN_DEBUG "udf: bad UTF-8 character\n"); 269 } 270 271 ocu[length - 1] = (uint8_t)u_len + 1; 272 return u_len + 1; 273 } 274 275 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i) 276 { 277 uint8_t *ocu; 278 uint32_t c; 279 uint8_t cmp_id, ocu_len; 280 int i; 281 282 ocu = ocu_i->u_name; 283 284 ocu_len = ocu_i->u_len; 285 cmp_id = ocu_i->u_cmpID; 286 utf_o->u_len = 0; 287 288 if (ocu_len == 0) 289 { 290 memset(utf_o, 0, sizeof(struct ustr)); 291 utf_o->u_cmpID = 0; 292 utf_o->u_len = 0; 293 return 0; 294 } 295 296 if ((cmp_id != 8) && (cmp_id != 16)) 297 { 298 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); 299 return 0; 300 } 301 302 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) 303 { 304 /* Expand OSTA compressed Unicode to Unicode */ 305 c = ocu[i++]; 306 if (cmp_id == 16) 307 c = (c << 8) | ocu[i++]; 308 309 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 310 UDF_NAME_LEN - utf_o->u_len); 311 } 312 utf_o->u_cmpID=8; 313 314 return utf_o->u_len; 315 } 316 317 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length) 318 { 319 unsigned len, i, max_val; 320 uint16_t uni_char; 321 int u_len; 322 323 memset(ocu, 0, sizeof(dstring) * length); 324 ocu[0] = 8; 325 max_val = 0xffU; 326 327 try_again: 328 u_len = 0U; 329 for (i = 0U; i < uni->u_len; i++) 330 { 331 len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char); 332 if (len <= 0) 333 continue; 334 335 if (uni_char > max_val) 336 { 337 max_val = 0xffffU; 338 ocu[0] = (uint8_t)0x10U; 339 goto try_again; 340 } 341 342 if (max_val == 0xffffU) 343 ocu[++u_len] = (uint8_t)(uni_char >> 8); 344 ocu[++u_len] = (uint8_t)(uni_char & 0xffU); 345 i += len - 1; 346 } 347 348 ocu[length - 1] = (uint8_t)u_len + 1; 349 return u_len + 1; 350 } 351 352 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen) 353 { 354 struct ustr filename, unifilename; 355 int len; 356 357 if (udf_build_ustr_exact(&unifilename, sname, flen)) 358 { 359 return 0; 360 } 361 362 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) 363 { 364 if (!udf_CS0toUTF8(&filename, &unifilename) ) 365 { 366 udf_debug("Failed in udf_get_filename: sname = %s\n", sname); 367 return 0; 368 } 369 } 370 else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 371 { 372 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) ) 373 { 374 udf_debug("Failed in udf_get_filename: sname = %s\n", sname); 375 return 0; 376 } 377 } 378 else 379 return 0; 380 381 if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, 382 unifilename.u_name, unifilename.u_len))) 383 { 384 return len; 385 } 386 return 0; 387 } 388 389 int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen) 390 { 391 struct ustr unifilename; 392 int namelen; 393 394 if ( !(udf_char_to_ustr(&unifilename, sname, flen)) ) 395 { 396 return 0; 397 } 398 399 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) 400 { 401 if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) ) 402 { 403 return 0; 404 } 405 } 406 else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 407 { 408 if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) ) 409 { 410 return 0; 411 } 412 } 413 else 414 return 0; 415 416 return namelen; 417 } 418 419 #define ILLEGAL_CHAR_MARK '_' 420 #define EXT_MARK '.' 421 #define CRC_MARK '#' 422 #define EXT_SIZE 5 423 424 static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen) 425 { 426 int index, newIndex = 0, needsCRC = 0; 427 int extIndex = 0, newExtIndex = 0, hasExt = 0; 428 unsigned short valueCRC; 429 uint8_t curr; 430 const uint8_t hexChar[] = "0123456789ABCDEF"; 431 432 if (udfName[0] == '.' && (udfLen == 1 || 433 (udfLen == 2 && udfName[1] == '.'))) 434 { 435 needsCRC = 1; 436 newIndex = udfLen; 437 memcpy(newName, udfName, udfLen); 438 } 439 else 440 { 441 for (index = 0; index < udfLen; index++) 442 { 443 curr = udfName[index]; 444 if (curr == '/' || curr == 0) 445 { 446 needsCRC = 1; 447 curr = ILLEGAL_CHAR_MARK; 448 while (index+1 < udfLen && (udfName[index+1] == '/' || 449 udfName[index+1] == 0)) 450 index++; 451 } 452 if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE) 453 { 454 if (udfLen == index + 1) 455 hasExt = 0; 456 else 457 { 458 hasExt = 1; 459 extIndex = index; 460 newExtIndex = newIndex; 461 } 462 } 463 if (newIndex < 256) 464 newName[newIndex++] = curr; 465 else 466 needsCRC = 1; 467 } 468 } 469 if (needsCRC) 470 { 471 uint8_t ext[EXT_SIZE]; 472 int localExtIndex = 0; 473 474 if (hasExt) 475 { 476 int maxFilenameLen; 477 for(index = 0; index<EXT_SIZE && extIndex + index +1 < udfLen; 478 index++ ) 479 { 480 curr = udfName[extIndex + index + 1]; 481 482 if (curr == '/' || curr == 0) 483 { 484 needsCRC = 1; 485 curr = ILLEGAL_CHAR_MARK; 486 while(extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE 487 && (udfName[extIndex + index + 2] == '/' || 488 udfName[extIndex + index + 2] == 0))) 489 index++; 490 } 491 ext[localExtIndex++] = curr; 492 } 493 maxFilenameLen = 250 - localExtIndex; 494 if (newIndex > maxFilenameLen) 495 newIndex = maxFilenameLen; 496 else 497 newIndex = newExtIndex; 498 } 499 else if (newIndex > 250) 500 newIndex = 250; 501 newName[newIndex++] = CRC_MARK; 502 valueCRC = udf_crc(fidName, fidNameLen, 0); 503 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; 504 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; 505 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; 506 newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; 507 508 if (hasExt) 509 { 510 newName[newIndex++] = EXT_MARK; 511 for (index = 0;index < localExtIndex ;index++ ) 512 newName[newIndex++] = ext[index]; 513 } 514 } 515 return newIndex; 516 } 517