xref: /openbmc/linux/fs/udf/unicode.c (revision 9293fcfb)
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *		http://www.osta.org/
11  *	UTF-8 is explained in IETF RFC 3629.
12  *		https://www.rfc-editor.org/rfc/rfc3629
13  *
14  * COPYRIGHT
15  *	This file is distributed under the terms of the GNU General Public
16  *	License (GPL). Copies of the GPL can be obtained from:
17  *		ftp://prep.ai.mit.edu/pub/gnu/GPL
18  *	Each contributing author retains all rights to their own work.
19  */
20 
21 #include "udfdecl.h"
22 
23 #include <linux/kernel.h>
24 #include <linux/string.h>	/* for memset */
25 #include <linux/nls.h>
26 #include <linux/crc-itu-t.h>
27 #include <linux/slab.h>
28 
29 #include "udf_sb.h"
30 
/*
 * Forward declaration; the definition is at the end of this file.
 * Turns an already converted name (udfName) into a name that is legal on
 * Linux (newName); fidName is the raw on-disk name used for the CRC tag.
 */
static int udf_translate_to_linux(uint8_t *newName, int newLen,
				  const uint8_t *udfName, int udfLen,
				  const uint8_t *fidName, int fidNameLen);
33 
/*
 * Encode one wide character as UTF-8.
 *
 * @uni:      character to encode
 * @out:      destination buffer
 * @boundlen: number of bytes available in @out
 *
 * Returns the number of bytes stored in @out (1, 2 or 3).  If @boundlen is
 * too small for the encoding, nothing is written and -ENAMETOOLONG is
 * returned.
 */
static int udf_uni2char_utf8(wchar_t uni,
			     unsigned char *out,
			     int boundlen)
{
	int need;

	/* Work out the encoded length first so the bound is checked once. */
	if (uni < 0x80)
		need = 1;
	else if (uni < 0x800)
		need = 2;
	else
		need = 3;

	/* need >= 1, so this also rejects boundlen <= 0. */
	if (boundlen < need)
		return -ENAMETOOLONG;

	switch (need) {
	case 1:
		out[0] = (unsigned char)uni;
		break;
	case 2:
		out[0] = (unsigned char)(0xc0 | (uni >> 6));
		out[1] = (unsigned char)(0x80 | (uni & 0x3f));
		break;
	default:
		out[0] = (unsigned char)(0xe0 | (uni >> 12));
		out[1] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
		out[2] = (unsigned char)(0x80 | (uni & 0x3f));
		break;
	}

	return need;
}
59 
/*
 * Decode one UTF-8 sequence from @in into a single wide character.
 *
 * @in:       input bytes
 * @boundlen: number of bytes available at @in
 * @uni:      where the decoded character is stored
 *
 * Returns the number of input bytes consumed on success.  If the input is
 * not a well-formed sequence (invalid lead byte, a continuation byte that
 * is not of the form 10xxxxxx, or input that ends in the middle of a
 * sequence), or if the decoded value needs more than 16 bits, '?' is
 * stored in @uni and -EINVAL is returned.  With @boundlen <= 0 nothing is
 * decoded: 0 is returned and @uni is left untouched.
 */
static int udf_char2uni_utf8(const unsigned char *in,
			     int boundlen,
			     wchar_t *uni)
{
	unsigned int utf_char = 0;
	unsigned char c;
	int utf_cnt = 0;	/* continuation bytes still expected; -1 on error */
	int u_len;

	for (u_len = 0; u_len < boundlen;) {
		c = in[u_len++];

		if (utf_cnt) {
			/* Inside a multi-byte character: must be 10xxxxxx */
			if ((c & 0xc0) != 0x80) {
				utf_cnt = -1;
				break;
			}
			utf_char = (utf_char << 6) | (c & 0x3f);
			if (--utf_cnt)
				continue;
			/*
			 * The CS0 encoder in this file stores at most 16 bits
			 * per character; reject larger values instead of
			 * letting them be truncated to an unrelated character.
			 */
			if (utf_char > 0xffff) {
				utf_cnt = -1;
				break;
			}
		} else if (c & 0x80) {
			/* Lead byte of a multi-byte UTF-8 character */
			if ((c & 0xe0) == 0xc0) {
				utf_char = c & 0x1f;
				utf_cnt = 1;
			} else if ((c & 0xf0) == 0xe0) {
				utf_char = c & 0x0f;
				utf_cnt = 2;
			} else if ((c & 0xf8) == 0xf0) {
				utf_char = c & 0x07;
				utf_cnt = 3;
			} else if ((c & 0xfc) == 0xf8) {
				utf_char = c & 0x03;
				utf_cnt = 4;
			} else if ((c & 0xfe) == 0xfc) {
				utf_char = c & 0x01;
				utf_cnt = 5;
			} else {
				/* Stray continuation byte or 0xfe/0xff */
				utf_cnt = -1;
				break;
			}
			continue;
		} else {
			/* Single byte UTF-8 character (most common) */
			utf_char = c;
		}
		*uni = utf_char;
		break;
	}

	/* Non-zero here means an error or a sequence cut short by boundlen */
	if (utf_cnt) {
		*uni = '?';
		return -EINVAL;
	}
	return u_len;
}
116 
/*
 * Convert an OSTA compressed Unicode (CS0) string to the output charset.
 *
 * @str_o:       output buffer
 * @str_max_len: size of @str_o in bytes
 * @ocu:         CS0 input; ocu[0] is the compression id (8 or 16), the
 *               remaining bytes are 8-bit or big-endian 16-bit characters
 * @ocu_len:     length of @ocu in bytes, including the compression id
 * @conv_f:      converts one character into @str_o; returns the number of
 *               bytes written or a negative error
 *
 * Returns the number of bytes stored in @str_o (the result is not
 * NUL-terminated here).  Returns 0 if @str_max_len <= 0 or @ocu_len is 0.
 * Returns -EINVAL, with @str_o zeroed, if the compression id is unknown or
 * a 16-bit string does not consist of whole 16-bit characters.
 *
 * Conversion stops early when the output is full or @conv_f reports
 * -ENAMETOOLONG; any other @conv_f error emits '?' for that character.
 */
static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
			     const uint8_t *ocu, int ocu_len,
			     int (*conv_f)(wchar_t, unsigned char *, int))
{
	uint8_t cmp_id;
	int i, len;
	int str_o_len = 0;

	if (str_max_len <= 0)
		return 0;

	if (ocu_len == 0) {
		memset(str_o, 0, str_max_len);
		return 0;
	}

	cmp_id = ocu[0];
	if (cmp_id != 8 && cmp_id != 16) {
		memset(str_o, 0, str_max_len);
		/* ocu is raw CS0 data, not a C string: do not print it */
		pr_err("unknown compression code (%d)\n", cmp_id);
		return -EINVAL;
	}

	/*
	 * A 16-bit string with a dangling last byte would make the loop
	 * below read the low byte from beyond the end of the input.
	 */
	if (cmp_id == 16 && ((ocu_len - 1) & 1)) {
		memset(str_o, 0, str_max_len);
		pr_err("incorrect filename length (%d)\n", ocu_len);
		return -EINVAL;
	}

	for (i = 1; (i < ocu_len) && (str_o_len < str_max_len);) {
		/* Expand OSTA compressed Unicode to Unicode */
		uint32_t c = ocu[i++];

		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

		len = conv_f(c, &str_o[str_o_len], str_max_len - str_o_len);
		/* Valid character? */
		if (len >= 0)
			str_o_len += len;
		else if (len == -ENAMETOOLONG)
			break;
		else
			str_o[str_o_len++] = '?';
	}

	return str_o_len;
}
158 
/*
 * Convert a name in the input charset to OSTA compressed Unicode (CS0).
 *
 * @ocu:         output buffer; ocu[0] receives the compression id
 * @ocu_max_len: size of @ocu in bytes
 * @str_i:       input name
 * @str_len:     length of @str_i in bytes
 * @conv_f:      decodes one character from the input; returns the number of
 *               input bytes consumed, 0 to skip a byte, or a negative error
 *
 * The name is first encoded with 8 bits per character.  As soon as a
 * character above 0xff shows up, encoding restarts from the beginning with
 * 16 bits per character.  Input that @conv_f rejects is replaced by '?',
 * one input byte at a time.
 *
 * Returns the number of bytes used in @ocu including the compression id,
 * or 0 if @ocu_max_len <= 0 or the encoded name does not fit.
 */
static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
			   const uint8_t *str_i, int str_len,
			   int (*conv_f)(const unsigned char *, int, wchar_t *))
{
	unsigned int limit = 0xff;	/* largest value the current width holds */
	int unit = 1;			/* output bytes per character */
	int in_pos, out_pos, consumed;
	wchar_t wc;

	if (ocu_max_len <= 0)
		return 0;

	memset(ocu, 0, ocu_max_len);
	ocu[0] = 8;

restart:
	out_pos = 1;
	in_pos = 0;
	while (in_pos < str_len) {
		/* Name didn't fit? */
		if (out_pos + unit > ocu_max_len)
			return 0;

		consumed = conv_f(str_i + in_pos, str_len - in_pos, &wc);
		if (consumed == 0) {
			/* Nothing decoded: step over a single input byte */
			in_pos++;
			continue;
		}
		if (consumed < 0) {
			/* Invalid character: substitute and skip one byte */
			consumed = 1;
			wc = '?';
		}

		if (wc > limit) {
			/* Needs the wider encoding: start over in 16-bit mode */
			limit = 0xffff;
			ocu[0] = 0x10;
			unit = 2;
			goto restart;
		}

		if (limit == 0xffff)
			ocu[out_pos++] = (uint8_t)(wc >> 8);
		ocu[out_pos++] = (uint8_t)(wc & 0xff);
		in_pos += consumed;
	}

	return out_pos;
}
206 
207 int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len)
208 {
209 	return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len,
210 				 udf_uni2char_utf8);
211 }
212 
213 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
214 		     uint8_t *dname, int dlen)
215 {
216 	uint8_t *filename;
217 	int (*conv_f)(wchar_t, unsigned char *, int);
218 	int ret;
219 
220 	if (!slen)
221 		return -EIO;
222 
223 	if (dlen <= 0)
224 		return 0;
225 
226 	filename = kmalloc(dlen, GFP_NOFS);
227 	if (!filename)
228 		return -ENOMEM;
229 
230 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
231 		conv_f = udf_uni2char_utf8;
232 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
233 		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
234 	} else
235 		BUG();
236 
237 	ret = udf_name_from_CS0(filename, dlen, sname, slen, conv_f);
238 	if (ret < 0) {
239 		udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
240 		goto out2;
241 	}
242 
243 	ret = udf_translate_to_linux(dname, dlen, filename, dlen,
244 				     sname + 1, slen - 1);
245 	/* Zero length filename isn't valid... */
246 	if (ret == 0)
247 		ret = -EINVAL;
248 out2:
249 	kfree(filename);
250 	return ret;
251 }
252 
253 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
254 		     uint8_t *dname, int dlen)
255 {
256 	int (*conv_f)(const unsigned char *, int, wchar_t *);
257 
258 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
259 		conv_f = udf_char2uni_utf8;
260 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
261 		conv_f = UDF_SB(sb)->s_nls_map->char2uni;
262 	} else
263 		BUG();
264 
265 	return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
266 }
267 
/* Replacement for characters that are illegal in a Linux file name */
#define ILLEGAL_CHAR_MARK	'_'
/* Separator between base name and extension */
#define EXT_MARK		'.'
/* Introduces the CRC tag appended to mangled names */
#define CRC_MARK		'#'
/* Longest extension (not counting EXT_MARK) that is preserved */
#define EXT_SIZE		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5

/*
 * Turn a converted UDF name into a name that is legal on Linux.
 *
 * @newName:    output buffer
 * @newLen:     size of @newName in bytes
 * @udfName:    name already converted to the output charset
 * @udfLen:     length of @udfName in bytes
 * @fidName:    raw on-disk name, used only as input for the CRC
 * @fidNameLen: length of @fidName in bytes
 *
 * Runs of '/' and NUL are replaced by a single ILLEGAL_CHAR_MARK.  If the
 * name had to be changed or truncated, or is "." or "..", it is made unique
 * by appending CRC_MARK and four hex digits of the CRC of @fidName.  An
 * extension of up to EXT_SIZE characters is kept after the CRC tag when
 * there is room for it.
 *
 * Returns the length of the name stored in @newName (not NUL-terminated),
 * or -ENAMETOOLONG if a CRC tag is needed but @newName cannot even hold the
 * tag itself.
 */
static int udf_translate_to_linux(uint8_t *newName, int newLen,
				  const uint8_t *udfName, int udfLen,
				  const uint8_t *fidName, int fidNameLen)
{
	int index, newIndex = 0, needsCRC = 0;
	int extIndex = 0, newExtIndex = 0, hasExt = 0;
	unsigned short valueCRC;
	uint8_t curr;

	/* Length is tested first so an empty name is never dereferenced */
	if ((udfLen == 1 || (udfLen == 2 && udfName[1] == '.')) &&
	    udfName[0] == '.') {
		/* "." and ".." always get a CRC tag */
		needsCRC = 1;
		newIndex = udfLen;
		if (newIndex > newLen)
			newIndex = newLen;
		if (newIndex > 0)
			memcpy(newName, udfName, newIndex);
	} else {
		for (index = 0; index < udfLen; index++) {
			curr = udfName[index];
			if (curr == '/' || curr == 0) {
				needsCRC = 1;
				curr = ILLEGAL_CHAR_MARK;
				/* Collapse a run of illegal characters */
				while (index + 1 < udfLen &&
						(udfName[index + 1] == '/' ||
						 udfName[index + 1] == 0))
					index++;
			}
			/* The last EXT_MARK close enough to the end wins */
			if (curr == EXT_MARK &&
					(udfLen - index - 1) <= EXT_SIZE) {
				if (udfLen == index + 1)
					hasExt = 0;
				else {
					hasExt = 1;
					extIndex = index;
					newExtIndex = newIndex;
				}
			}
			if (newIndex < newLen)
				newName[newIndex++] = curr;
			else
				needsCRC = 1;
		}
	}
	if (needsCRC) {
		uint8_t ext[EXT_SIZE];
		int localExtIndex = 0;

		/* The CRC tag alone must fit, or there is nothing to return */
		if (newLen < CRC_LEN)
			return -ENAMETOOLONG;

		if (hasExt) {
			/* Re-collect the sanitised extension */
			for (index = 0;
			     index < EXT_SIZE && extIndex + index + 1 < udfLen;
			     index++) {
				curr = udfName[extIndex + index + 1];

				if (curr == '/' || curr == 0) {
					curr = ILLEGAL_CHAR_MARK;
					while (extIndex + index + 2 < udfLen &&
					      (index + 1 < EXT_SIZE &&
						(udfName[extIndex + index + 2] == '/' ||
						 udfName[extIndex + index + 2] == 0)))
						index++;
				}
				ext[localExtIndex++] = curr;
			}
			/* Drop the extension if tag + EXT_MARK + ext can't fit */
			if (newLen < CRC_LEN + 1 + localExtIndex)
				hasExt = 0;
		}

		if (hasExt) {
			/*
			 * Room left for the base name once the CRC tag, the
			 * EXT_MARK and the extension have been accounted for.
			 */
			int maxFilenameLen = newLen - CRC_LEN - 1 -
					     localExtIndex;

			/* Base name ends where the extension began, if that fits */
			newIndex = newExtIndex;
			if (newIndex > maxFilenameLen)
				newIndex = maxFilenameLen;
		} else if (newIndex > newLen - CRC_LEN)
			newIndex = newLen - CRC_LEN;

		newName[newIndex++] = CRC_MARK;
		valueCRC = crc_itu_t(0, fidName, fidNameLen);
		newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
		newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
		newName[newIndex++] = hex_asc_upper_hi(valueCRC);
		newName[newIndex++] = hex_asc_upper_lo(valueCRC);

		if (hasExt) {
			newName[newIndex++] = EXT_MARK;
			for (index = 0; index < localExtIndex; index++)
				newName[newIndex++] = ext[index];
		}
	}

	return newIndex;
}
361