xref: /openbmc/linux/fs/udf/unicode.c (revision 15aebd28)
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629
13  *
14  * COPYRIGHT
15  *	This file is distributed under the terms of the GNU General Public
16  *	License (GPL). Copies of the GPL can be obtained from:
17  *		ftp://prep.ai.mit.edu/pub/gnu/GPL
18  *	Each contributing author retains all rights to their own work.
19  */
20 
21 #include "udfdecl.h"
22 
23 #include <linux/kernel.h>
24 #include <linux/string.h>	/* for memset */
25 #include <linux/nls.h>
26 
27 #include "udf_sb.h"
28 
/* Forward declaration: the definition follows udf_get_filename(), its caller. */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
				  int udfLen, uint8_t *fidName,
				  int fidNameLen);
30 
31 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
32 {
33 	if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
34 		return 0;
35 
36 	memset(dest, 0, sizeof(struct ustr));
37 	memcpy(dest->u_name, src, strlen);
38 	dest->u_cmpID = 0x08;
39 	dest->u_len = strlen;
40 
41 	return strlen;
42 }
43 
44 /*
45  * udf_build_ustr
46  */
47 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
48 {
49 	int usesize;
50 
51 	if ((!dest) || (!ptr) || (!size))
52 		return -1;
53 
54 	memset(dest, 0, sizeof(struct ustr));
55 	usesize = (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
56 	dest->u_cmpID = ptr[0];
57 	dest->u_len = ptr[size - 1];
58 	memcpy(dest->u_name, ptr + 1, usesize - 1);
59 
60 	return 0;
61 }
62 
63 /*
64  * udf_build_ustr_exact
65  */
66 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
67 {
68 	if ((!dest) || (!ptr) || (!exactsize))
69 		return -1;
70 
71 	memset(dest, 0, sizeof(struct ustr));
72 	dest->u_cmpID = ptr[0];
73 	dest->u_len = exactsize - 1;
74 	memcpy(dest->u_name, ptr + 1, exactsize - 1);
75 
76 	return 0;
77 }
78 
79 /*
80  * udf_ocu_to_utf8
81  *
82  * PURPOSE
83  *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
84  *
85  * DESCRIPTION
86  *	This routine is only called by udf_filldir().
87  *
88  * PRE-CONDITIONS
89  *	utf			Pointer to UTF-8 output buffer.
90  *	ocu			Pointer to OSTA Compressed Unicode input buffer
91  *				of size UDF_NAME_LEN bytes.
92  * 				both of type "struct ustr *"
93  *
94  * POST-CONDITIONS
95  *	<return>		Zero on success.
96  *
97  * HISTORY
98  *	November 12, 1997 - Andrew E. Mileski
99  *	Written, tested, and released.
100  */
101 int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
102 {
103 	uint8_t *ocu;
104 	uint32_t c;
105 	uint8_t cmp_id, ocu_len;
106 	int i;
107 
108 	ocu = ocu_i->u_name;
109 
110 	ocu_len = ocu_i->u_len;
111 	cmp_id = ocu_i->u_cmpID;
112 	utf_o->u_len = 0;
113 
114 	if (ocu_len == 0) {
115 		memset(utf_o, 0, sizeof(struct ustr));
116 		utf_o->u_cmpID = 0;
117 		utf_o->u_len = 0;
118 		return 0;
119 	}
120 
121 	if ((cmp_id != 8) && (cmp_id != 16)) {
122 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
123 		       cmp_id, ocu_i->u_name);
124 		return 0;
125 	}
126 
127 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
128 
129 		/* Expand OSTA compressed Unicode to Unicode */
130 		c = ocu[i++];
131 		if (cmp_id == 16)
132 			c = (c << 8) | ocu[i++];
133 
134 		/* Compress Unicode to UTF-8 */
135 		if (c < 0x80U) {
136 			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
137 		} else if (c < 0x800U) {
138 			utf_o->u_name[utf_o->u_len++] =
139 						(uint8_t)(0xc0 | (c >> 6));
140 			utf_o->u_name[utf_o->u_len++] =
141 						(uint8_t)(0x80 | (c & 0x3f));
142 		} else {
143 			utf_o->u_name[utf_o->u_len++] =
144 						(uint8_t)(0xe0 | (c >> 12));
145 			utf_o->u_name[utf_o->u_len++] =
146 						(uint8_t)(0x80 |
147 							  ((c >> 6) & 0x3f));
148 			utf_o->u_name[utf_o->u_len++] =
149 						(uint8_t)(0x80 | (c & 0x3f));
150 		}
151 	}
152 	utf_o->u_cmpID = 8;
153 
154 	return utf_o->u_len;
155 }
156 
157 /*
158  *
159  * udf_utf8_to_ocu
160  *
161  * PURPOSE
162  *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
163  *
164  * DESCRIPTION
165  *	This routine is only called by udf_lookup().
166  *
167  * PRE-CONDITIONS
168  *	ocu			Pointer to OSTA Compressed Unicode output
169  *				buffer of size UDF_NAME_LEN bytes.
170  *	utf			Pointer to UTF-8 input buffer.
171  *	utf_len			Length of UTF-8 input buffer in bytes.
172  *
173  * POST-CONDITIONS
174  *	<return>		Zero on success.
175  *
176  * HISTORY
177  *	November 12, 1997 - Andrew E. Mileski
178  *	Written, tested, and released.
179  */
180 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
181 {
182 	unsigned c, i, max_val, utf_char;
183 	int utf_cnt, u_len;
184 
185 	memset(ocu, 0, sizeof(dstring) * length);
186 	ocu[0] = 8;
187 	max_val = 0xffU;
188 
189 try_again:
190 	u_len = 0U;
191 	utf_char = 0U;
192 	utf_cnt = 0U;
193 	for (i = 0U; i < utf->u_len; i++) {
194 		c = (uint8_t)utf->u_name[i];
195 
196 		/* Complete a multi-byte UTF-8 character */
197 		if (utf_cnt) {
198 			utf_char = (utf_char << 6) | (c & 0x3fU);
199 			if (--utf_cnt)
200 				continue;
201 		} else {
202 			/* Check for a multi-byte UTF-8 character */
203 			if (c & 0x80U) {
204 				/* Start a multi-byte UTF-8 character */
205 				if ((c & 0xe0U) == 0xc0U) {
206 					utf_char = c & 0x1fU;
207 					utf_cnt = 1;
208 				} else if ((c & 0xf0U) == 0xe0U) {
209 					utf_char = c & 0x0fU;
210 					utf_cnt = 2;
211 				} else if ((c & 0xf8U) == 0xf0U) {
212 					utf_char = c & 0x07U;
213 					utf_cnt = 3;
214 				} else if ((c & 0xfcU) == 0xf8U) {
215 					utf_char = c & 0x03U;
216 					utf_cnt = 4;
217 				} else if ((c & 0xfeU) == 0xfcU) {
218 					utf_char = c & 0x01U;
219 					utf_cnt = 5;
220 				} else {
221 					goto error_out;
222 				}
223 				continue;
224 			} else {
225 				/* Single byte UTF-8 character (most common) */
226 				utf_char = c;
227 			}
228 		}
229 
230 		/* Choose no compression if necessary */
231 		if (utf_char > max_val) {
232 			if (max_val == 0xffU) {
233 				max_val = 0xffffU;
234 				ocu[0] = (uint8_t)0x10U;
235 				goto try_again;
236 			}
237 			goto error_out;
238 		}
239 
240 		if (max_val == 0xffffU)
241 			ocu[++u_len] = (uint8_t)(utf_char >> 8);
242 		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
243 	}
244 
245 	if (utf_cnt) {
246 error_out:
247 		ocu[++u_len] = '?';
248 		printk(KERN_DEBUG "udf: bad UTF-8 character\n");
249 	}
250 
251 	ocu[length - 1] = (uint8_t)u_len + 1;
252 
253 	return u_len + 1;
254 }
255 
256 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
257 			struct ustr *ocu_i)
258 {
259 	uint8_t *ocu;
260 	uint32_t c;
261 	uint8_t cmp_id, ocu_len;
262 	int i;
263 
264 	ocu = ocu_i->u_name;
265 
266 	ocu_len = ocu_i->u_len;
267 	cmp_id = ocu_i->u_cmpID;
268 	utf_o->u_len = 0;
269 
270 	if (ocu_len == 0) {
271 		memset(utf_o, 0, sizeof(struct ustr));
272 		utf_o->u_cmpID = 0;
273 		utf_o->u_len = 0;
274 		return 0;
275 	}
276 
277 	if ((cmp_id != 8) && (cmp_id != 16)) {
278 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
279 		       cmp_id, ocu_i->u_name);
280 		return 0;
281 	}
282 
283 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
284 		/* Expand OSTA compressed Unicode to Unicode */
285 		c = ocu[i++];
286 		if (cmp_id == 16)
287 			c = (c << 8) | ocu[i++];
288 
289 		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
290 					      UDF_NAME_LEN - utf_o->u_len);
291 	}
292 	utf_o->u_cmpID = 8;
293 
294 	return utf_o->u_len;
295 }
296 
297 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
298 			int length)
299 {
300 	unsigned len, i, max_val;
301 	uint16_t uni_char;
302 	int u_len;
303 
304 	memset(ocu, 0, sizeof(dstring) * length);
305 	ocu[0] = 8;
306 	max_val = 0xffU;
307 
308 try_again:
309 	u_len = 0U;
310 	for (i = 0U; i < uni->u_len; i++) {
311 		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
312 		if (len <= 0)
313 			continue;
314 
315 		if (uni_char > max_val) {
316 			max_val = 0xffffU;
317 			ocu[0] = (uint8_t)0x10U;
318 			goto try_again;
319 		}
320 
321 		if (max_val == 0xffffU)
322 			ocu[++u_len] = (uint8_t)(uni_char >> 8);
323 		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
324 		i += len - 1;
325 	}
326 
327 	ocu[length - 1] = (uint8_t)u_len + 1;
328 	return u_len + 1;
329 }
330 
331 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
332 		     int flen)
333 {
334 	struct ustr filename, unifilename;
335 	int len;
336 
337 	if (udf_build_ustr_exact(&unifilename, sname, flen))
338 		return 0;
339 
340 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
341 		if (!udf_CS0toUTF8(&filename, &unifilename)) {
342 			udf_debug("Failed in udf_get_filename: sname = %s\n",
343 				  sname);
344 			return 0;
345 		}
346 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
347 		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
348 				  &unifilename)) {
349 			udf_debug("Failed in udf_get_filename: sname = %s\n",
350 				  sname);
351 			return 0;
352 		}
353 	} else
354 		return 0;
355 
356 	len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
357 				     unifilename.u_name, unifilename.u_len);
358 	if (len)
359 		return len;
360 
361 	return 0;
362 }
363 
364 int udf_put_filename(struct super_block *sb, const uint8_t *sname,
365 		     uint8_t *dname, int flen)
366 {
367 	struct ustr unifilename;
368 	int namelen;
369 
370 	if (!udf_char_to_ustr(&unifilename, sname, flen))
371 		return 0;
372 
373 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
374 		namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
375 		if (!namelen)
376 			return 0;
377 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
378 		namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
379 					&unifilename, UDF_NAME_LEN);
380 		if (!namelen)
381 			return 0;
382 	} else
383 		return 0;
384 
385 	return namelen;
386 }
387 
#define ILLEGAL_CHAR_MARK	'_'
#define EXT_MARK		'.'
#define CRC_MARK		'#'
#define EXT_SIZE 		5

/*
 * udf_translate_to_linux
 *
 * Copy udfName (udfLen bytes) into newName, making it usable as a name:
 *
 *  - each run of '/' and NUL bytes is replaced by one ILLEGAL_CHAR_MARK;
 *  - at most 256 bytes are copied.
 *
 * If anything had to be changed, or if the name is "." or "..", the result
 * is made unique: it is cut to at most 250 bytes and CRC_MARK plus four hex
 * digits of udf_crc(fidName, fidNameLen, 0) are appended. When an extension
 * of at most EXT_SIZE bytes was found, the suffix is placed in front of the
 * extension, which is appended again afterwards.
 *
 * Returns the number of bytes written to newName.
 */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
				  int udfLen, uint8_t *fidName,
				  int fidNameLen)
{
	static const char hex_digits[] = "0123456789ABCDEF";
	uint8_t ext_buf[EXT_SIZE];
	int ext_len = 0;
	int out_len = 0;
	int mangle = 0;
	int have_ext = 0;
	int ext_src = 0;	/* index of the extension mark in udfName */
	int ext_out = 0;	/* index of the extension mark in newName */
	int src, k, shift;
	unsigned short crc;
	uint8_t ch;

	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
		/* "." and "..": copied as they are, then given a CRC suffix */
		memcpy(newName, udfName, udfLen);
		out_len = udfLen;
		mangle = 1;
	} else {
		src = 0;
		while (src < udfLen) {
			ch = udfName[src];
			if (ch == '/' || ch == 0) {
				mangle = 1;
				ch = ILLEGAL_CHAR_MARK;
				/* swallow the rest of the illegal run */
				while (src + 1 < udfLen &&
				       (udfName[src + 1] == '/' ||
					udfName[src + 1] == 0))
					src++;
			}
			/* a mark followed by at most EXT_SIZE bytes */
			if (ch == EXT_MARK &&
			    (udfLen - src - 1) <= EXT_SIZE) {
				/* a trailing mark is not an extension */
				have_ext = (udfLen != src + 1);
				if (have_ext) {
					ext_src = src;
					ext_out = out_len;
				}
			}
			if (out_len < 256)
				newName[out_len++] = ch;
			else
				mangle = 1;
			src++;
		}
	}

	if (!mangle)
		return out_len;

	if (have_ext) {
		const uint8_t *tail = udfName + ext_src + 1;
		int tail_len = udfLen - ext_src - 1;
		int max_base;

		/* collect the extension, cleaning it like the name above */
		for (k = 0; k < EXT_SIZE && k < tail_len; k++) {
			ch = tail[k];
			if (ch == '/' || ch == 0) {
				ch = ILLEGAL_CHAR_MARK;
				while (k + 1 < tail_len &&
				       (k + 1 < EXT_SIZE &&
					(tail[k + 1] == '/' ||
					 tail[k + 1] == 0)))
					k++;
			}
			ext_buf[ext_len++] = ch;
		}

		max_base = 250 - ext_len;
		out_len = (out_len > max_base) ? max_base : ext_out;
	} else if (out_len > 250) {
		out_len = 250;
	}

	newName[out_len++] = CRC_MARK;
	crc = udf_crc(fidName, fidNameLen, 0);
	/* four hex digits, most significant nibble first */
	for (shift = 12; shift >= 0; shift -= 4)
		newName[out_len++] = hex_digits[(crc >> shift) & 0xf];

	if (have_ext) {
		newName[out_len++] = EXT_MARK;
		for (k = 0; k < ext_len; k++)
			newName[out_len++] = ext_buf[k];
	}

	return out_len;
}
480