1 /* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * COPYRIGHT 15 * This file is distributed under the terms of the GNU General Public 16 * License (GPL). Copies of the GPL can be obtained from: 17 * ftp://prep.ai.mit.edu/pub/gnu/GPL 18 * Each contributing author retains all rights to their own work. 19 */ 20 21 #include "udfdecl.h" 22 23 #include <linux/kernel.h> 24 #include <linux/string.h> /* for memset */ 25 #include <linux/nls.h> 26 #include <linux/crc-itu-t.h> 27 #include <linux/slab.h> 28 29 #include "udf_sb.h" 30 31 static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *, 32 int); 33 34 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) 35 { 36 if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2)) 37 return 0; 38 39 memset(dest, 0, sizeof(struct ustr)); 40 memcpy(dest->u_name, src, strlen); 41 dest->u_cmpID = 0x08; 42 dest->u_len = strlen; 43 44 return strlen; 45 } 46 47 /* 48 * udf_build_ustr 49 */ 50 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) 51 { 52 int usesize; 53 54 if (!dest || !ptr || !size) 55 return -1; 56 BUG_ON(size < 2); 57 58 usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name)); 59 usesize = min(usesize, size - 2); 60 dest->u_cmpID = ptr[0]; 61 dest->u_len = usesize; 62 memcpy(dest->u_name, ptr + 1, usesize); 63 memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize); 64 65 return 0; 66 } 67 68 /* 69 * udf_build_ustr_exact 70 */ 71 static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) 72 { 73 memset(dest, 0, sizeof(struct ustr)); 74 dest->u_cmpID = ptr[0]; 75 dest->u_len = exactsize - 1; 76 memcpy(dest->u_name, ptr + 1, exactsize - 1); 77 } 78 79 static int udf_uni2char_utf8(wchar_t uni, 80 unsigned char *out, 81 int boundlen) 82 { 83 int u_len = 0; 84 85 if (boundlen <= 0) 86 return -ENAMETOOLONG; 87 88 if (uni < 0x80) { 89 out[u_len++] = (unsigned char)uni; 90 } else if (uni < 0x800) { 91 if (boundlen < 2) 92 return -ENAMETOOLONG; 93 out[u_len++] = (unsigned char)(0xc0 | (uni >> 6)); 94 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); 95 } else { 96 if (boundlen < 3) 97 return -ENAMETOOLONG; 98 out[u_len++] = (unsigned char)(0xe0 | (uni >> 12)); 99 out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f)); 100 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); 101 } 102 return u_len; 103 } 104 105 static int udf_char2uni_utf8(const unsigned char *in, 106 int boundlen, 107 wchar_t *uni) 108 { 109 unsigned int utf_char; 110 unsigned char c; 111 int utf_cnt, u_len; 112 113 utf_char = 0; 114 utf_cnt = 0; 115 for (u_len = 0; u_len < boundlen;) { 116 c = in[u_len++]; 117 118 /* Complete a multi-byte UTF-8 character */ 119 if (utf_cnt) { 120 utf_char = (utf_char << 6) | (c & 0x3f); 121 if (--utf_cnt) 122 continue; 123 } else { 124 /* Check for a multi-byte UTF-8 character */ 125 if (c & 0x80) { 126 /* Start a multi-byte UTF-8 character */ 127 if ((c & 0xe0) == 0xc0) { 128 utf_char = c & 0x1f; 129 utf_cnt = 1; 130 } else if ((c & 0xf0) == 0xe0) { 131 utf_char = c & 0x0f; 132 utf_cnt = 2; 133 } else if ((c & 0xf8) == 0xf0) { 134 utf_char = c & 0x07; 135 utf_cnt = 3; 136 } else if ((c & 0xfc) == 0xf8) { 137 utf_char = c & 0x03; 138 utf_cnt = 4; 139 } else if ((c & 0xfe) == 0xfc) { 140 utf_char = c & 0x01; 141 utf_cnt = 5; 142 } else { 143 utf_cnt = -1; 144 break; 145 } 146 continue; 147 } else { 148 /* Single byte UTF-8 character (most common) */ 149 utf_char = c; 150 } 151 } 152 *uni = utf_char; 153 break; 154 } 155 if (utf_cnt) { 156 *uni = '?'; 157 return -EINVAL; 158 } 159 return u_len; 160 } 161 162 static int udf_name_from_CS0(struct ustr *utf_o, 163 const struct ustr *ocu_i, 164 int (*conv_f)(wchar_t, unsigned char *, int)) 165 { 166 const uint8_t *ocu; 167 uint8_t cmp_id, ocu_len; 168 int i, len; 169 170 171 ocu_len = ocu_i->u_len; 172 if (ocu_len == 0) { 173 memset(utf_o, 0, sizeof(struct ustr)); 174 return 0; 175 } 176 177 cmp_id = ocu_i->u_cmpID; 178 if (cmp_id != 8 && cmp_id != 16) { 179 memset(utf_o, 0, sizeof(struct ustr)); 180 pr_err("unknown compression code (%d) stri=%s\n", 181 cmp_id, ocu_i->u_name); 182 return -EINVAL; 183 } 184 185 ocu = ocu_i->u_name; 186 utf_o->u_len = 0; 187 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { 188 /* Expand OSTA compressed Unicode to Unicode */ 189 uint32_t c = ocu[i++]; 190 if (cmp_id == 16) 191 c = (c << 8) | ocu[i++]; 192 193 len = conv_f(c, &utf_o->u_name[utf_o->u_len], 194 UDF_NAME_LEN - 2 - utf_o->u_len); 195 /* Valid character? */ 196 if (len >= 0) 197 utf_o->u_len += len; 198 else if (len == -ENAMETOOLONG) 199 break; 200 else 201 utf_o->u_name[utf_o->u_len++] = '?'; 202 } 203 utf_o->u_cmpID = 8; 204 205 return utf_o->u_len; 206 } 207 208 static int udf_name_to_CS0(dstring *ocu, struct ustr *uni, int length, 209 int (*conv_f)(const unsigned char *, int, wchar_t *)) 210 { 211 int i, len; 212 unsigned int max_val; 213 wchar_t uni_char; 214 int u_len, u_ch; 215 216 memset(ocu, 0, sizeof(dstring) * length); 217 ocu[0] = 8; 218 max_val = 0xff; 219 u_ch = 1; 220 221 try_again: 222 u_len = 0; 223 for (i = 0; i < uni->u_len; i++) { 224 /* Name didn't fit? */ 225 if (u_len + 1 + u_ch >= length) 226 return 0; 227 len = conv_f(&uni->u_name[i], uni->u_len - i, &uni_char); 228 if (!len) 229 continue; 230 /* Invalid character, deal with it */ 231 if (len < 0) { 232 len = 1; 233 uni_char = '?'; 234 } 235 236 if (uni_char > max_val) { 237 max_val = 0xffff; 238 ocu[0] = 0x10; 239 u_ch = 2; 240 goto try_again; 241 } 242 243 if (max_val == 0xffff) 244 ocu[++u_len] = (uint8_t)(uni_char >> 8); 245 ocu[++u_len] = (uint8_t)(uni_char & 0xff); 246 i += len - 1; 247 } 248 249 ocu[length - 1] = (uint8_t)u_len + 1; 250 return u_len + 1; 251 } 252 253 int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) 254 { 255 return udf_name_from_CS0(utf_o, ocu_i, udf_uni2char_utf8); 256 } 257 258 int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, 259 uint8_t *dname, int dlen) 260 { 261 struct ustr *filename, *unifilename; 262 int (*conv_f)(wchar_t, unsigned char *, int); 263 int ret; 264 265 if (!slen) 266 return -EIO; 267 268 filename = kmalloc(sizeof(struct ustr), GFP_NOFS); 269 if (!filename) 270 return -ENOMEM; 271 272 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); 273 if (!unifilename) { 274 ret = -ENOMEM; 275 goto out1; 276 } 277 278 udf_build_ustr_exact(unifilename, sname, slen); 279 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 280 conv_f = udf_uni2char_utf8; 281 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 282 conv_f = UDF_SB(sb)->s_nls_map->uni2char; 283 } else 284 BUG(); 285 286 ret = udf_name_from_CS0(filename, unifilename, conv_f); 287 if (ret < 0) { 288 udf_debug("Failed in udf_get_filename: sname = %s\n", sname); 289 goto out2; 290 } 291 292 ret = udf_translate_to_linux(dname, dlen, 293 filename->u_name, filename->u_len, 294 unifilename->u_name, unifilename->u_len); 295 /* Zero length filename isn't valid... */ 296 if (ret == 0) 297 ret = -EINVAL; 298 out2: 299 kfree(unifilename); 300 out1: 301 kfree(filename); 302 return ret; 303 } 304 305 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, 306 uint8_t *dname, int dlen) 307 { 308 struct ustr unifilename; 309 int (*conv_f)(const unsigned char *, int, wchar_t *); 310 311 if (!udf_char_to_ustr(&unifilename, sname, slen)) 312 return 0; 313 314 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 315 conv_f = udf_char2uni_utf8; 316 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 317 conv_f = UDF_SB(sb)->s_nls_map->char2uni; 318 } else 319 BUG(); 320 321 return udf_name_to_CS0(dname, &unifilename, dlen, conv_f); 322 } 323 324 #define ILLEGAL_CHAR_MARK '_' 325 #define EXT_MARK '.' 326 #define CRC_MARK '#' 327 #define EXT_SIZE 5 328 /* Number of chars we need to store generated CRC to make filename unique */ 329 #define CRC_LEN 5 330 331 static int udf_translate_to_linux(uint8_t *newName, int newLen, 332 uint8_t *udfName, int udfLen, 333 uint8_t *fidName, int fidNameLen) 334 { 335 int index, newIndex = 0, needsCRC = 0; 336 int extIndex = 0, newExtIndex = 0, hasExt = 0; 337 unsigned short valueCRC; 338 uint8_t curr; 339 340 if (udfName[0] == '.' && 341 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { 342 needsCRC = 1; 343 newIndex = udfLen; 344 memcpy(newName, udfName, udfLen); 345 } else { 346 for (index = 0; index < udfLen; index++) { 347 curr = udfName[index]; 348 if (curr == '/' || curr == 0) { 349 needsCRC = 1; 350 curr = ILLEGAL_CHAR_MARK; 351 while (index + 1 < udfLen && 352 (udfName[index + 1] == '/' || 353 udfName[index + 1] == 0)) 354 index++; 355 } 356 if (curr == EXT_MARK && 357 (udfLen - index - 1) <= EXT_SIZE) { 358 if (udfLen == index + 1) 359 hasExt = 0; 360 else { 361 hasExt = 1; 362 extIndex = index; 363 newExtIndex = newIndex; 364 } 365 } 366 if (newIndex < newLen) 367 newName[newIndex++] = curr; 368 else 369 needsCRC = 1; 370 } 371 } 372 if (needsCRC) { 373 uint8_t ext[EXT_SIZE]; 374 int localExtIndex = 0; 375 376 if (hasExt) { 377 int maxFilenameLen; 378 for (index = 0; 379 index < EXT_SIZE && extIndex + index + 1 < udfLen; 380 index++) { 381 curr = udfName[extIndex + index + 1]; 382 383 if (curr == '/' || curr == 0) { 384 needsCRC = 1; 385 curr = ILLEGAL_CHAR_MARK; 386 while (extIndex + index + 2 < udfLen && 387 (index + 1 < EXT_SIZE && 388 (udfName[extIndex + index + 2] == '/' || 389 udfName[extIndex + index + 2] == 0))) 390 index++; 391 } 392 ext[localExtIndex++] = curr; 393 } 394 maxFilenameLen = newLen - CRC_LEN - localExtIndex; 395 if (newIndex > maxFilenameLen) 396 newIndex = maxFilenameLen; 397 else 398 newIndex = newExtIndex; 399 } else if (newIndex > newLen - CRC_LEN) 400 newIndex = newLen - CRC_LEN; 401 newName[newIndex++] = CRC_MARK; 402 valueCRC = crc_itu_t(0, fidName, fidNameLen); 403 newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8); 404 newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8); 405 newName[newIndex++] = hex_asc_upper_hi(valueCRC); 406 newName[newIndex++] = hex_asc_upper_lo(valueCRC); 407 408 if (hasExt) { 409 newName[newIndex++] = EXT_MARK; 410 for (index = 0; index < localExtIndex; index++) 411 newName[newIndex++] = ext[index]; 412 } 413 } 414 415 return newIndex; 416 } 417