xref: /openbmc/linux/fs/udf/unicode.c (revision b8a41c44)
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629
13  *
14  * COPYRIGHT
15  *	This file is distributed under the terms of the GNU General Public
16  *	License (GPL). Copies of the GPL can be obtained from:
17  *		ftp://prep.ai.mit.edu/pub/gnu/GPL
18  *	Each contributing author retains all rights to their own work.
19  */
20 
21 #include "udfdecl.h"
22 
23 #include <linux/kernel.h>
24 #include <linux/string.h>	/* for memset */
25 #include <linux/nls.h>
26 #include <linux/crc-itu-t.h>
27 #include <linux/slab.h>
28 
29 #include "udf_sb.h"
30 
31 #define UNICODE_MAX 0x10ffff
32 #define SURROGATE_MASK 0xfffff800
33 #define SURROGATE_PAIR 0x0000d800
34 
35 static int udf_uni2char_utf8(wchar_t uni,
36 			     unsigned char *out,
37 			     int boundlen)
38 {
39 	int u_len = 0;
40 
41 	if (boundlen <= 0)
42 		return -ENAMETOOLONG;
43 
44 	u_len = utf32_to_utf8(uni, out, boundlen);
45 	if (u_len < 0) {
46 		if (uni > UNICODE_MAX ||
47 		    (uni & SURROGATE_MASK) == SURROGATE_PAIR)
48 			return -EINVAL;
49 		return -ENAMETOOLONG;
50 	}
51 	return u_len;
52 }
53 
54 static int udf_char2uni_utf8(const unsigned char *in,
55 			     int boundlen,
56 			     wchar_t *uni)
57 {
58 	int u_len;
59 	unicode_t c;
60 
61 	u_len = utf8_to_utf32(in, boundlen, &c);
62 	if (u_len < 0) {
63 		*uni = '?';
64 		return -EINVAL;
65 	}
66 
67 	if (c > MAX_WCHAR_T)
68 		*uni = '?';
69 	else
70 		*uni = c;
71 	return u_len;
72 }
73 
/* Character emitted in place of a run of illegal input characters */
#define ILLEGAL_CHAR_MARK	'_'
/* Separator between a file name and its extension */
#define EXT_MARK		'.'
/* Character introducing the CRC appended to a mangled name */
#define CRC_MARK		'#'
/* Maximum number of input characters examined when looking for an extension */
#define EXT_SIZE		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5

/*
 * Convert the next character of an OSTA compressed Unicode string.
 *
 * @str_o:         output buffer
 * @str_o_max_len: size of @str_o in bytes
 * @str_o_idx:     current output position, advanced by the bytes written
 * @str_i:         input characters (compression ID already stripped)
 * @str_i_max_len: input length in bytes
 * @str_i_idx:     current input position, advanced past consumed characters
 * @u_ch:          bytes per input character (values above 1 are read as a
 *                 big-endian 16-bit unit)
 * @needsCRC:      set to 1 whenever the output no longer matches the input
 *                 exactly (illegal characters, lack of room, conversion
 *                 failure)
 * @conv_f:        converts one character into the output charset
 * @translate:     when non-zero, '/' and NUL are treated as illegal and a
 *                 whole run of them collapses into one ILLEGAL_CHAR_MARK
 *
 * Returns 1 if a character was produced, 0 if the input is exhausted or
 * the output has no room left.
 */
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
			      int *str_o_idx,
			      const uint8_t *str_i, int str_i_max_len,
			      int *str_i_idx,
			      int u_ch, int *needsCRC,
			      int (*conv_f)(wchar_t, unsigned char *, int),
			      int translate)
{
	uint32_t ch;
	int saw_illegal = 0;
	int have_char = 0;
	int written;

	while (!have_char && *str_i_idx < str_i_max_len) {
		/* Output already full: the name has to be shortened */
		if (*str_o_idx >= str_o_max_len) {
			*needsCRC = 1;
			return 0;
		}

		/* Expand OSTA compressed Unicode to Unicode */
		ch = str_i[*str_i_idx];
		if (u_ch > 1)
			ch = (ch << 8) | str_i[*str_i_idx + 1];

		if (translate && (ch == '/' || ch == 0)) {
			saw_illegal = 1;
		} else if (saw_illegal) {
			/*
			 * First legal character after an illegal run: leave
			 * it unconsumed for the next call.
			 */
			break;
		} else {
			have_char = 1;
		}
		*str_i_idx += u_ch;
	}

	if (saw_illegal) {
		*needsCRC = 1;
		ch = ILLEGAL_CHAR_MARK;
		have_char = 1;
	}
	if (!have_char)
		return 0;

	written = conv_f(ch, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
	if (written >= 0) {
		/* Valid character */
		*str_o_idx += written;
		return 1;
	}

	*needsCRC = 1;
	if (written == -ENAMETOOLONG)
		return 0;

	/* Character cannot be represented in the output charset */
	str_o[(*str_o_idx)++] = '?';
	return 1;
}
131 
132 static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
133 			     const uint8_t *ocu, int ocu_len,
134 			     int (*conv_f)(wchar_t, unsigned char *, int),
135 			     int translate)
136 {
137 	uint32_t c;
138 	uint8_t cmp_id;
139 	int idx, len;
140 	int u_ch;
141 	int needsCRC = 0;
142 	int ext_i_len, ext_max_len;
143 	int str_o_len = 0;	/* Length of resulting output */
144 	int ext_o_len = 0;	/* Extension output length */
145 	int ext_crc_len = 0;	/* Extension output length if used with CRC */
146 	int i_ext = -1;		/* Extension position in input buffer */
147 	int o_crc = 0;		/* Rightmost possible output pos for CRC+ext */
148 	unsigned short valueCRC;
149 	uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
150 	uint8_t crc[CRC_LEN];
151 
152 	if (str_max_len <= 0)
153 		return 0;
154 
155 	if (ocu_len == 0) {
156 		memset(str_o, 0, str_max_len);
157 		return 0;
158 	}
159 
160 	cmp_id = ocu[0];
161 	if (cmp_id != 8 && cmp_id != 16) {
162 		memset(str_o, 0, str_max_len);
163 		pr_err("unknown compression code (%u)\n", cmp_id);
164 		return -EINVAL;
165 	}
166 	u_ch = cmp_id >> 3;
167 
168 	ocu++;
169 	ocu_len--;
170 
171 	if (ocu_len % u_ch) {
172 		pr_err("incorrect filename length (%d)\n", ocu_len + 1);
173 		return -EINVAL;
174 	}
175 
176 	if (translate) {
177 		/* Look for extension */
178 		for (idx = ocu_len - u_ch, ext_i_len = 0;
179 		     (idx >= 0) && (ext_i_len < EXT_SIZE);
180 		     idx -= u_ch, ext_i_len++) {
181 			c = ocu[idx];
182 			if (u_ch > 1)
183 				c = (c << 8) | ocu[idx + 1];
184 
185 			if (c == EXT_MARK) {
186 				if (ext_i_len)
187 					i_ext = idx;
188 				break;
189 			}
190 		}
191 		if (i_ext >= 0) {
192 			/* Convert extension */
193 			ext_max_len = min_t(int, sizeof(ext), str_max_len);
194 			ext[ext_o_len++] = EXT_MARK;
195 			idx = i_ext + u_ch;
196 			while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
197 						  ocu, ocu_len, &idx,
198 						  u_ch, &needsCRC,
199 						  conv_f, translate)) {
200 				if ((ext_o_len + CRC_LEN) < str_max_len)
201 					ext_crc_len = ext_o_len;
202 			}
203 		}
204 	}
205 
206 	idx = 0;
207 	while (1) {
208 		if (translate && (idx == i_ext)) {
209 			if (str_o_len > (str_max_len - ext_o_len))
210 				needsCRC = 1;
211 			break;
212 		}
213 
214 		if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
215 					ocu, ocu_len, &idx,
216 					u_ch, &needsCRC, conv_f, translate))
217 			break;
218 
219 		if (translate &&
220 		    (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
221 			o_crc = str_o_len;
222 	}
223 
224 	if (translate) {
225 		if (str_o_len <= 2 && str_o[0] == '.' &&
226 		    (str_o_len == 1 || str_o[1] == '.'))
227 			needsCRC = 1;
228 		if (needsCRC) {
229 			str_o_len = o_crc;
230 			valueCRC = crc_itu_t(0, ocu, ocu_len);
231 			crc[0] = CRC_MARK;
232 			crc[1] = hex_asc_upper_hi(valueCRC >> 8);
233 			crc[2] = hex_asc_upper_lo(valueCRC >> 8);
234 			crc[3] = hex_asc_upper_hi(valueCRC);
235 			crc[4] = hex_asc_upper_lo(valueCRC);
236 			len = min_t(int, CRC_LEN, str_max_len - str_o_len);
237 			memcpy(&str_o[str_o_len], crc, len);
238 			str_o_len += len;
239 			ext_o_len = ext_crc_len;
240 		}
241 		if (ext_o_len > 0) {
242 			memcpy(&str_o[str_o_len], ext, ext_o_len);
243 			str_o_len += ext_o_len;
244 		}
245 	}
246 
247 	return str_o_len;
248 }
249 
/*
 * Encode a name as OSTA compressed Unicode (CS0).
 *
 * @ocu:         output buffer; zeroed first, byte 0 receives the
 *               compression ID (8 or 16)
 * @ocu_max_len: size of @ocu in bytes
 * @str_i:       input name in the source charset
 * @str_len:     length of @str_i in bytes
 * @conv_f:      decodes one source character into a wchar_t and returns
 *               the number of input bytes it used
 *
 * Encoding starts with one byte per character.  As soon as a character
 * above 0xff shows up, the whole name is re-encoded from the beginning
 * with two bytes (big-endian) per character.  Input that @conv_f rejects
 * is stored as '?' and consumes one input byte.
 *
 * Returns the number of bytes used in @ocu, or 0 if @ocu_max_len is not
 * positive or the name does not fit.
 */
static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
			   const uint8_t *str_i, int str_len,
			   int (*conv_f)(const unsigned char *, int, wchar_t *))
{
	unsigned int char_limit = 0xff;
	int bytes_per_char = 1;
	int out_pos;
	int in_pos;

	if (ocu_max_len <= 0)
		return 0;

	memset(ocu, 0, ocu_max_len);
	ocu[0] = 8;

restart:
	out_pos = 1;
	in_pos = 0;
	while (in_pos < str_len) {
		wchar_t uni_char;
		int consumed;

		/* Name didn't fit? */
		if (out_pos + bytes_per_char > ocu_max_len)
			return 0;

		consumed = conv_f(&str_i[in_pos], str_len - in_pos, &uni_char);
		if (consumed == 0) {
			/* Nothing decoded here; move on by one byte */
			in_pos++;
			continue;
		}
		/* Invalid character, deal with it */
		if (consumed < 0) {
			consumed = 1;
			uni_char = '?';
		}

		if (uni_char > char_limit) {
			/* Does not fit the current width: widen and redo */
			char_limit = 0xffff;
			ocu[0] = 0x10;
			bytes_per_char = 2;
			goto restart;
		}

		if (bytes_per_char == 2)
			ocu[out_pos++] = (uint8_t)(uni_char >> 8);
		ocu[out_pos++] = (uint8_t)(uni_char & 0xff);
		in_pos += consumed;
	}

	return out_pos;
}
297 
298 int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
299 		      const uint8_t *ocu_i, int i_len)
300 {
301 	int s_len = 0;
302 
303 	if (i_len > 0) {
304 		s_len = ocu_i[i_len - 1];
305 		if (s_len >= i_len) {
306 			pr_err("incorrect dstring lengths (%d/%d)\n",
307 			       s_len, i_len);
308 			return -EINVAL;
309 		}
310 	}
311 
312 	return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len,
313 				 udf_uni2char_utf8, 0);
314 }
315 
316 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
317 		     uint8_t *dname, int dlen)
318 {
319 	int (*conv_f)(wchar_t, unsigned char *, int);
320 	int ret;
321 
322 	if (!slen)
323 		return -EIO;
324 
325 	if (dlen <= 0)
326 		return 0;
327 
328 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
329 		conv_f = udf_uni2char_utf8;
330 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
331 		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
332 	} else
333 		BUG();
334 
335 	ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
336 	/* Zero length filename isn't valid... */
337 	if (ret == 0)
338 		ret = -EINVAL;
339 	return ret;
340 }
341 
342 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
343 		     uint8_t *dname, int dlen)
344 {
345 	int (*conv_f)(const unsigned char *, int, wchar_t *);
346 
347 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
348 		conv_f = udf_char2uni_utf8;
349 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
350 		conv_f = UDF_SB(sb)->s_nls_map->char2uni;
351 	} else
352 		BUG();
353 
354 	return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
355 }
356 
357