xref: /openbmc/linux/fs/udf/unicode.c (revision 87c2ce3b)
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629 (which obsoletes RFC 2279).
 *		https://www.rfc-editor.org/rfc/rfc3629.txt
13  *
14  * COPYRIGHT
15  *	This file is distributed under the terms of the GNU General Public
16  *	License (GPL). Copies of the GPL can be obtained from:
17  *		ftp://prep.ai.mit.edu/pub/gnu/GPL
18  *	Each contributing author retains all rights to their own work.
19  */
20 
21 #include "udfdecl.h"
22 
23 #include <linux/kernel.h>
24 #include <linux/string.h>	/* for memset */
25 #include <linux/nls.h>
26 #include <linux/udf_fs.h>
27 
28 #include "udf_sb.h"
29 
30 static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
31 
32 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
33 {
34 	if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) )
35 		return 0;
36 	memset(dest, 0, sizeof(struct ustr));
37 	memcpy(dest->u_name, src, strlen);
38 	dest->u_cmpID = 0x08;
39 	dest->u_len = strlen;
40 	return strlen;
41 }
42 
43 /*
44  * udf_build_ustr
45  */
46 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
47 {
48 	int usesize;
49 
50 	if ( (!dest) || (!ptr) || (!size) )
51 		return -1;
52 
53 	memset(dest, 0, sizeof(struct ustr));
54 	usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
55 	dest->u_cmpID=ptr[0];
56 	dest->u_len=ptr[size-1];
57 	memcpy(dest->u_name, ptr+1, usesize-1);
58 	return 0;
59 }
60 
61 /*
62  * udf_build_ustr_exact
63  */
64 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
65 {
66 	if ( (!dest) || (!ptr) || (!exactsize) )
67 		return -1;
68 
69 	memset(dest, 0, sizeof(struct ustr));
70 	dest->u_cmpID=ptr[0];
71 	dest->u_len=exactsize-1;
72 	memcpy(dest->u_name, ptr+1, exactsize-1);
73 	return 0;
74 }
75 
76 /*
77  * udf_ocu_to_utf8
78  *
79  * PURPOSE
80  *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
81  *
82  * DESCRIPTION
83  *	This routine is only called by udf_filldir().
84  *
85  * PRE-CONDITIONS
86  *	utf			Pointer to UTF-8 output buffer.
87  *	ocu			Pointer to OSTA Compressed Unicode input buffer
88  *				of size UDF_NAME_LEN bytes.
89  * 				both of type "struct ustr *"
90  *
91  * POST-CONDITIONS
92  *	<return>		Zero on success.
93  *
94  * HISTORY
95  *	November 12, 1997 - Andrew E. Mileski
96  *	Written, tested, and released.
97  */
98 int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
99 {
100 	uint8_t *ocu;
101 	uint32_t c;
102 	uint8_t cmp_id, ocu_len;
103 	int i;
104 
105 	ocu = ocu_i->u_name;
106 
107 	ocu_len = ocu_i->u_len;
108 	cmp_id = ocu_i->u_cmpID;
109 	utf_o->u_len = 0;
110 
111 	if (ocu_len == 0)
112 	{
113 		memset(utf_o, 0, sizeof(struct ustr));
114 		utf_o->u_cmpID = 0;
115 		utf_o->u_len = 0;
116 		return 0;
117 	}
118 
119 	if ((cmp_id != 8) && (cmp_id != 16))
120 	{
121 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
122 		return 0;
123 	}
124 
125 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
126 	{
127 
128 		/* Expand OSTA compressed Unicode to Unicode */
129 		c = ocu[i++];
130 		if (cmp_id == 16)
131 			c = (c << 8) | ocu[i++];
132 
133 		/* Compress Unicode to UTF-8 */
134 		if (c < 0x80U)
135 			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
136 		else if (c < 0x800U)
137 		{
138 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6));
139 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
140 		}
141 		else
142 		{
143 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12));
144 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
145 			utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
146 		}
147 	}
148 	utf_o->u_cmpID=8;
149 
150 	return utf_o->u_len;
151 }
152 
153 /*
154  *
155  * udf_utf8_to_ocu
156  *
157  * PURPOSE
158  *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
159  *
160  * DESCRIPTION
161  *	This routine is only called by udf_lookup().
162  *
163  * PRE-CONDITIONS
164  *	ocu			Pointer to OSTA Compressed Unicode output
165  *				buffer of size UDF_NAME_LEN bytes.
166  *	utf			Pointer to UTF-8 input buffer.
167  *	utf_len			Length of UTF-8 input buffer in bytes.
168  *
169  * POST-CONDITIONS
170  *	<return>		Zero on success.
171  *
172  * HISTORY
173  *	November 12, 1997 - Andrew E. Mileski
174  *	Written, tested, and released.
175  */
176 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
177 {
178 	unsigned c, i, max_val, utf_char;
179 	int utf_cnt, u_len;
180 
181 	memset(ocu, 0, sizeof(dstring) * length);
182 	ocu[0] = 8;
183 	max_val = 0xffU;
184 
185 try_again:
186 	u_len = 0U;
187 	utf_char = 0U;
188 	utf_cnt = 0U;
189 	for (i = 0U; i < utf->u_len; i++)
190 	{
191 		c = (uint8_t)utf->u_name[i];
192 
193 		/* Complete a multi-byte UTF-8 character */
194 		if (utf_cnt)
195 		{
196 			utf_char = (utf_char << 6) | (c & 0x3fU);
197 			if (--utf_cnt)
198 				continue;
199 		}
200 		else
201 		{
202 			/* Check for a multi-byte UTF-8 character */
203 			if (c & 0x80U)
204 			{
205 				/* Start a multi-byte UTF-8 character */
206 				if ((c & 0xe0U) == 0xc0U)
207 				{
208 					utf_char = c & 0x1fU;
209 					utf_cnt = 1;
210 				}
211 				else if ((c & 0xf0U) == 0xe0U)
212 				{
213 					utf_char = c & 0x0fU;
214 					utf_cnt = 2;
215 				}
216 				else if ((c & 0xf8U) == 0xf0U)
217 				{
218 					utf_char = c & 0x07U;
219 					utf_cnt = 3;
220 				}
221 				else if ((c & 0xfcU) == 0xf8U)
222 				{
223 					utf_char = c & 0x03U;
224 					utf_cnt = 4;
225 				}
226 				else if ((c & 0xfeU) == 0xfcU)
227 				{
228 					utf_char = c & 0x01U;
229 					utf_cnt = 5;
230 				}
231 				else
232 					goto error_out;
233 				continue;
234 			} else
235 				/* Single byte UTF-8 character (most common) */
236 				utf_char = c;
237 		}
238 
239 		/* Choose no compression if necessary */
240 		if (utf_char > max_val)
241 		{
242 			if ( 0xffU == max_val )
243 			{
244 				max_val = 0xffffU;
245 				ocu[0] = (uint8_t)0x10U;
246 				goto try_again;
247 			}
248 			goto error_out;
249 		}
250 
251 		if (max_val == 0xffffU)
252 		{
253 			ocu[++u_len] = (uint8_t)(utf_char >> 8);
254 		}
255 		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
256 	}
257 
258 
259 	if (utf_cnt)
260 	{
261 error_out:
262 		ocu[++u_len] = '?';
263 		printk(KERN_DEBUG "udf: bad UTF-8 character\n");
264 	}
265 
266 	ocu[length - 1] = (uint8_t)u_len + 1;
267 	return u_len + 1;
268 }
269 
270 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i)
271 {
272 	uint8_t *ocu;
273 	uint32_t c;
274 	uint8_t cmp_id, ocu_len;
275 	int i;
276 
277 	ocu = ocu_i->u_name;
278 
279 	ocu_len = ocu_i->u_len;
280 	cmp_id = ocu_i->u_cmpID;
281 	utf_o->u_len = 0;
282 
283 	if (ocu_len == 0)
284 	{
285 		memset(utf_o, 0, sizeof(struct ustr));
286 		utf_o->u_cmpID = 0;
287 		utf_o->u_len = 0;
288 		return 0;
289 	}
290 
291 	if ((cmp_id != 8) && (cmp_id != 16))
292 	{
293 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
294 		return 0;
295 	}
296 
297 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
298 	{
299 		/* Expand OSTA compressed Unicode to Unicode */
300 		c = ocu[i++];
301 		if (cmp_id == 16)
302 			c = (c << 8) | ocu[i++];
303 
304 		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
305 			UDF_NAME_LEN - utf_o->u_len);
306 	}
307 	utf_o->u_cmpID=8;
308 
309 	return utf_o->u_len;
310 }
311 
312 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length)
313 {
314 	unsigned len, i, max_val;
315 	uint16_t uni_char;
316 	int u_len;
317 
318 	memset(ocu, 0, sizeof(dstring) * length);
319 	ocu[0] = 8;
320 	max_val = 0xffU;
321 
322 try_again:
323 	u_len = 0U;
324 	for (i = 0U; i < uni->u_len; i++)
325 	{
326 		len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char);
327 		if (len <= 0)
328 			continue;
329 
330 		if (uni_char > max_val)
331 		{
332 			max_val = 0xffffU;
333 			ocu[0] = (uint8_t)0x10U;
334 			goto try_again;
335 		}
336 
337 		if (max_val == 0xffffU)
338 			ocu[++u_len] = (uint8_t)(uni_char >> 8);
339 		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
340 		i += len - 1;
341 	}
342 
343 	ocu[length - 1] = (uint8_t)u_len + 1;
344 	return u_len + 1;
345 }
346 
347 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen)
348 {
349 	struct ustr filename, unifilename;
350 	int len;
351 
352 	if (udf_build_ustr_exact(&unifilename, sname, flen))
353 	{
354 		return 0;
355 	}
356 
357 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
358 	{
359 		if (!udf_CS0toUTF8(&filename, &unifilename) )
360 		{
361 			udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
362 			return 0;
363 		}
364 	}
365 	else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
366 	{
367 		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) )
368 		{
369 			udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
370 			return 0;
371 		}
372 	}
373 	else
374 		return 0;
375 
376 	if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
377 		unifilename.u_name, unifilename.u_len)))
378 	{
379 		return len;
380 	}
381 	return 0;
382 }
383 
384 int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen)
385 {
386 	struct ustr unifilename;
387 	int namelen;
388 
389 	if ( !(udf_char_to_ustr(&unifilename, sname, flen)) )
390 	{
391 		return 0;
392 	}
393 
394 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
395 	{
396 		if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) )
397 		{
398 			return 0;
399 		}
400 	}
401 	else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
402 	{
403 		if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) )
404 		{
405 			return 0;
406 		}
407 	}
408 	else
409 		return 0;
410 
411 	return namelen;
412 }
413 
414 #define ILLEGAL_CHAR_MARK	'_'
415 #define EXT_MARK			'.'
416 #define CRC_MARK			'#'
417 #define EXT_SIZE			5
418 
419 static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen)
420 {
421 	int index, newIndex = 0, needsCRC = 0;
422 	int extIndex = 0, newExtIndex = 0, hasExt = 0;
423 	unsigned short valueCRC;
424 	uint8_t curr;
425 	const uint8_t hexChar[] = "0123456789ABCDEF";
426 
427 	if (udfName[0] == '.' && (udfLen == 1 ||
428 		(udfLen == 2 && udfName[1] == '.')))
429 	{
430 		needsCRC = 1;
431 		newIndex = udfLen;
432 		memcpy(newName, udfName, udfLen);
433 	}
434 	else
435 	{
436 		for (index = 0; index < udfLen; index++)
437 		{
438 			curr = udfName[index];
439 			if (curr == '/' || curr == 0)
440 			{
441 				needsCRC = 1;
442 				curr = ILLEGAL_CHAR_MARK;
443 				while (index+1 < udfLen && (udfName[index+1] == '/' ||
444 					udfName[index+1] == 0))
445 					index++;
446 			}
447 			if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE)
448 			{
449 				if (udfLen == index + 1)
450 					hasExt = 0;
451 				else
452 				{
453 					hasExt = 1;
454 					extIndex = index;
455 					newExtIndex = newIndex;
456 				}
457 			}
458 			if (newIndex < 256)
459 				newName[newIndex++] = curr;
460 			else
461 				needsCRC = 1;
462 		}
463 	}
464 	if (needsCRC)
465 	{
466 		uint8_t ext[EXT_SIZE];
467 		int localExtIndex = 0;
468 
469 		if (hasExt)
470 		{
471 			int maxFilenameLen;
472 			for(index = 0; index<EXT_SIZE && extIndex + index +1 < udfLen;
473 				index++ )
474 			{
475 				curr = udfName[extIndex + index + 1];
476 
477 				if (curr == '/' || curr == 0)
478 				{
479 					needsCRC = 1;
480 					curr = ILLEGAL_CHAR_MARK;
481 					while(extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE
482 						&& (udfName[extIndex + index + 2] == '/' ||
483 							udfName[extIndex + index + 2] == 0)))
484 						index++;
485 				}
486 				ext[localExtIndex++] = curr;
487 			}
488 			maxFilenameLen = 250 - localExtIndex;
489 			if (newIndex > maxFilenameLen)
490 				newIndex = maxFilenameLen;
491 			else
492 				newIndex = newExtIndex;
493 		}
494 		else if (newIndex > 250)
495 			newIndex = 250;
496 		newName[newIndex++] = CRC_MARK;
497 		valueCRC = udf_crc(fidName, fidNameLen, 0);
498 		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
499 		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
500 		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
501 		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
502 
503 		if (hasExt)
504 		{
505 			newName[newIndex++] = EXT_MARK;
506 			for (index = 0;index < localExtIndex ;index++ )
507 				newName[newIndex++] = ext[index];
508 		}
509 	}
510 	return newIndex;
511 }
512