xref: /openbmc/linux/fs/udf/unicode.c (revision 1da177e4)
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629 (which obsoletes RFC 2279).
 *		https://www.rfc-editor.org/rfc/rfc3629.txt
13  *
14  * CONTACTS
15  *	E-mail regarding any portion of the Linux UDF file system should be
16  *	directed to the development team's mailing list (run by majordomo):
17  *		linux_udf@hpesjro.fc.hp.com
18  *
19  * COPYRIGHT
20  *	This file is distributed under the terms of the GNU General Public
21  *	License (GPL). Copies of the GPL can be obtained from:
22  *		ftp://prep.ai.mit.edu/pub/gnu/GPL
23  *	Each contributing author retains all rights to their own work.
24  */
25 
26 #include "udfdecl.h"
27 
28 #include <linux/kernel.h>
29 #include <linux/string.h>	/* for memset */
30 #include <linux/nls.h>
31 #include <linux/udf_fs.h>
32 
33 #include "udf_sb.h"
34 
35 static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
36 
37 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
38 {
39 	if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) )
40 		return 0;
41 	memset(dest, 0, sizeof(struct ustr));
42 	memcpy(dest->u_name, src, strlen);
43 	dest->u_cmpID = 0x08;
44 	dest->u_len = strlen;
45 	return strlen;
46 }
47 
48 /*
49  * udf_build_ustr
50  */
51 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
52 {
53 	int usesize;
54 
55 	if ( (!dest) || (!ptr) || (!size) )
56 		return -1;
57 
58 	memset(dest, 0, sizeof(struct ustr));
59 	usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
60 	dest->u_cmpID=ptr[0];
61 	dest->u_len=ptr[size-1];
62 	memcpy(dest->u_name, ptr+1, usesize-1);
63 	return 0;
64 }
65 
66 /*
67  * udf_build_ustr_exact
68  */
69 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
70 {
71 	if ( (!dest) || (!ptr) || (!exactsize) )
72 		return -1;
73 
74 	memset(dest, 0, sizeof(struct ustr));
75 	dest->u_cmpID=ptr[0];
76 	dest->u_len=exactsize-1;
77 	memcpy(dest->u_name, ptr+1, exactsize-1);
78 	return 0;
79 }
80 
81 /*
82  * udf_ocu_to_utf8
83  *
84  * PURPOSE
85  *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
86  *
87  * DESCRIPTION
88  *	This routine is only called by udf_filldir().
89  *
90  * PRE-CONDITIONS
91  *	utf			Pointer to UTF-8 output buffer.
92  *	ocu			Pointer to OSTA Compressed Unicode input buffer
93  *				of size UDF_NAME_LEN bytes.
94  * 				both of type "struct ustr *"
95  *
96  * POST-CONDITIONS
97  *	<return>		Zero on success.
98  *
99  * HISTORY
100  *	November 12, 1997 - Andrew E. Mileski
101  *	Written, tested, and released.
102  */
103 int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
104 {
105 	uint8_t *ocu;
106 	uint32_t c;
107 	uint8_t cmp_id, ocu_len;
108 	int i;
109 
110 	ocu = ocu_i->u_name;
111 
112 	ocu_len = ocu_i->u_len;
113 	cmp_id = ocu_i->u_cmpID;
114 	utf_o->u_len = 0;
115 
116 	if (ocu_len == 0)
117 	{
118 		memset(utf_o, 0, sizeof(struct ustr));
119 		utf_o->u_cmpID = 0;
120 		utf_o->u_len = 0;
121 		return 0;
122 	}
123 
124 	if ((cmp_id != 8) && (cmp_id != 16))
125 	{
126 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
127 		return 0;
128 	}
129 
130 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
131 	{
132 
133 		/* Expand OSTA compressed Unicode to Unicode */
134 		c = ocu[i++];
135 		if (cmp_id == 16)
136 			c = (c << 8) | ocu[i++];
137 
138 		/* Compress Unicode to UTF-8 */
139 		if (c < 0x80U)
140 			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
141 		else if (c < 0x800U)
142 		{
143 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6));
144 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
145 		}
146 		else
147 		{
148 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12));
149 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
150 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
151 		}
152 	}
153 	utf_o->u_cmpID=8;
154 
155 	return utf_o->u_len;
156 }
157 
158 /*
159  *
160  * udf_utf8_to_ocu
161  *
162  * PURPOSE
163  *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
164  *
165  * DESCRIPTION
166  *	This routine is only called by udf_lookup().
167  *
168  * PRE-CONDITIONS
169  *	ocu			Pointer to OSTA Compressed Unicode output
170  *				buffer of size UDF_NAME_LEN bytes.
171  *	utf			Pointer to UTF-8 input buffer.
172  *	utf_len			Length of UTF-8 input buffer in bytes.
173  *
174  * POST-CONDITIONS
175  *	<return>		Zero on success.
176  *
177  * HISTORY
178  *	November 12, 1997 - Andrew E. Mileski
179  *	Written, tested, and released.
180  */
181 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
182 {
183 	unsigned c, i, max_val, utf_char;
184 	int utf_cnt, u_len;
185 
186 	memset(ocu, 0, sizeof(dstring) * length);
187 	ocu[0] = 8;
188 	max_val = 0xffU;
189 
190 try_again:
191 	u_len = 0U;
192 	utf_char = 0U;
193 	utf_cnt = 0U;
194 	for (i = 0U; i < utf->u_len; i++)
195 	{
196 		c = (uint8_t)utf->u_name[i];
197 
198 		/* Complete a multi-byte UTF-8 character */
199 		if (utf_cnt)
200 		{
201 			utf_char = (utf_char << 6) | (c & 0x3fU);
202 			if (--utf_cnt)
203 				continue;
204 		}
205 		else
206 		{
207 			/* Check for a multi-byte UTF-8 character */
208 			if (c & 0x80U)
209 			{
210 				/* Start a multi-byte UTF-8 character */
211 				if ((c & 0xe0U) == 0xc0U)
212 				{
213 					utf_char = c & 0x1fU;
214 					utf_cnt = 1;
215 				}
216 				else if ((c & 0xf0U) == 0xe0U)
217 				{
218 					utf_char = c & 0x0fU;
219 					utf_cnt = 2;
220 				}
221 				else if ((c & 0xf8U) == 0xf0U)
222 				{
223 					utf_char = c & 0x07U;
224 					utf_cnt = 3;
225 				}
226 				else if ((c & 0xfcU) == 0xf8U)
227 				{
228 					utf_char = c & 0x03U;
229 					utf_cnt = 4;
230 				}
231 				else if ((c & 0xfeU) == 0xfcU)
232 				{
233 					utf_char = c & 0x01U;
234 					utf_cnt = 5;
235 				}
236 				else
237 					goto error_out;
238 				continue;
239 			} else
240 				/* Single byte UTF-8 character (most common) */
241 				utf_char = c;
242 		}
243 
244 		/* Choose no compression if necessary */
245 		if (utf_char > max_val)
246 		{
247 			if ( 0xffU == max_val )
248 			{
249 				max_val = 0xffffU;
250 				ocu[0] = (uint8_t)0x10U;
251 				goto try_again;
252 			}
253 			goto error_out;
254 		}
255 
256 		if (max_val == 0xffffU)
257 		{
258 			ocu[++u_len] = (uint8_t)(utf_char >> 8);
259 		}
260 		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
261 	}
262 
263 
264 	if (utf_cnt)
265 	{
266 error_out:
267 		ocu[++u_len] = '?';
268 		printk(KERN_DEBUG "udf: bad UTF-8 character\n");
269 	}
270 
271 	ocu[length - 1] = (uint8_t)u_len + 1;
272 	return u_len + 1;
273 }
274 
275 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i)
276 {
277 	uint8_t *ocu;
278 	uint32_t c;
279 	uint8_t cmp_id, ocu_len;
280 	int i;
281 
282 	ocu = ocu_i->u_name;
283 
284 	ocu_len = ocu_i->u_len;
285 	cmp_id = ocu_i->u_cmpID;
286 	utf_o->u_len = 0;
287 
288 	if (ocu_len == 0)
289 	{
290 		memset(utf_o, 0, sizeof(struct ustr));
291 		utf_o->u_cmpID = 0;
292 		utf_o->u_len = 0;
293 		return 0;
294 	}
295 
296 	if ((cmp_id != 8) && (cmp_id != 16))
297 	{
298 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
299 		return 0;
300 	}
301 
302 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
303 	{
304 		/* Expand OSTA compressed Unicode to Unicode */
305 		c = ocu[i++];
306 		if (cmp_id == 16)
307 			c = (c << 8) | ocu[i++];
308 
309 		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
310 			UDF_NAME_LEN - utf_o->u_len);
311 	}
312 	utf_o->u_cmpID=8;
313 
314 	return utf_o->u_len;
315 }
316 
317 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length)
318 {
319 	unsigned len, i, max_val;
320 	uint16_t uni_char;
321 	int u_len;
322 
323 	memset(ocu, 0, sizeof(dstring) * length);
324 	ocu[0] = 8;
325 	max_val = 0xffU;
326 
327 try_again:
328 	u_len = 0U;
329 	for (i = 0U; i < uni->u_len; i++)
330 	{
331 		len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char);
332 		if (len <= 0)
333 			continue;
334 
335 		if (uni_char > max_val)
336 		{
337 			max_val = 0xffffU;
338 			ocu[0] = (uint8_t)0x10U;
339 			goto try_again;
340 		}
341 
342 		if (max_val == 0xffffU)
343 			ocu[++u_len] = (uint8_t)(uni_char >> 8);
344 		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
345 		i += len - 1;
346 	}
347 
348 	ocu[length - 1] = (uint8_t)u_len + 1;
349 	return u_len + 1;
350 }
351 
352 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen)
353 {
354 	struct ustr filename, unifilename;
355 	int len;
356 
357 	if (udf_build_ustr_exact(&unifilename, sname, flen))
358 	{
359 		return 0;
360 	}
361 
362 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
363 	{
364 		if (!udf_CS0toUTF8(&filename, &unifilename) )
365 		{
366 			udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
367 			return 0;
368 		}
369 	}
370 	else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
371 	{
372 		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) )
373 		{
374 			udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
375 			return 0;
376 		}
377 	}
378 	else
379 		return 0;
380 
381 	if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
382 		unifilename.u_name, unifilename.u_len)))
383 	{
384 		return len;
385 	}
386 	return 0;
387 }
388 
389 int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen)
390 {
391 	struct ustr unifilename;
392 	int namelen;
393 
394 	if ( !(udf_char_to_ustr(&unifilename, sname, flen)) )
395 	{
396 		return 0;
397 	}
398 
399 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
400 	{
401 		if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) )
402 		{
403 			return 0;
404 		}
405 	}
406 	else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
407 	{
408 		if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) )
409 		{
410 			return 0;
411 		}
412 	}
413 	else
414 		return 0;
415 
416 	return namelen;
417 }
418 
419 #define ILLEGAL_CHAR_MARK	'_'
420 #define EXT_MARK			'.'
421 #define CRC_MARK			'#'
422 #define EXT_SIZE			5
423 
424 static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen)
425 {
426 	int index, newIndex = 0, needsCRC = 0;
427 	int extIndex = 0, newExtIndex = 0, hasExt = 0;
428 	unsigned short valueCRC;
429 	uint8_t curr;
430 	const uint8_t hexChar[] = "0123456789ABCDEF";
431 
432 	if (udfName[0] == '.' && (udfLen == 1 ||
433 		(udfLen == 2 && udfName[1] == '.')))
434 	{
435 		needsCRC = 1;
436 		newIndex = udfLen;
437 		memcpy(newName, udfName, udfLen);
438 	}
439 	else
440 	{
441 		for (index = 0; index < udfLen; index++)
442 		{
443 			curr = udfName[index];
444 			if (curr == '/' || curr == 0)
445 			{
446 				needsCRC = 1;
447 				curr = ILLEGAL_CHAR_MARK;
448 				while (index+1 < udfLen && (udfName[index+1] == '/' ||
449 					udfName[index+1] == 0))
450 					index++;
451 			}
452 			if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE)
453 			{
454 				if (udfLen == index + 1)
455 					hasExt = 0;
456 				else
457 				{
458 					hasExt = 1;
459 					extIndex = index;
460 					newExtIndex = newIndex;
461 				}
462 			}
463 			if (newIndex < 256)
464 				newName[newIndex++] = curr;
465 			else
466 				needsCRC = 1;
467 		}
468 	}
469 	if (needsCRC)
470 	{
471 		uint8_t ext[EXT_SIZE];
472 		int localExtIndex = 0;
473 
474 		if (hasExt)
475 		{
476 			int maxFilenameLen;
477 			for(index = 0; index<EXT_SIZE && extIndex + index +1 < udfLen;
478 				index++ )
479 			{
480 				curr = udfName[extIndex + index + 1];
481 
482 				if (curr == '/' || curr == 0)
483 				{
484 					needsCRC = 1;
485 					curr = ILLEGAL_CHAR_MARK;
486 					while(extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE
487 						&& (udfName[extIndex + index + 2] == '/' ||
488 							udfName[extIndex + index + 2] == 0)))
489 						index++;
490 				}
491 				ext[localExtIndex++] = curr;
492 			}
493 			maxFilenameLen = 250 - localExtIndex;
494 			if (newIndex > maxFilenameLen)
495 				newIndex = maxFilenameLen;
496 			else
497 				newIndex = newExtIndex;
498 		}
499 		else if (newIndex > 250)
500 			newIndex = 250;
501 		newName[newIndex++] = CRC_MARK;
502 		valueCRC = udf_crc(fidName, fidNameLen, 0);
503 		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
504 		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
505 		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
506 		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
507 
508 		if (hasExt)
509 		{
510 			newName[newIndex++] = EXT_MARK;
511 			for (index = 0;index < localExtIndex ;index++ )
512 				newName[newIndex++] = ext[index];
513 		}
514 	}
515 	return newIndex;
516 }
517