xref: /openbmc/linux/fs/udf/unicode.c (revision d504adc2)
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *		http://www.osta.org/
11  *	UTF-8 is explained in the IETF RFC 3629.
12  *		https://www.rfc-editor.org/rfc/rfc3629.txt
13  *
14  * COPYRIGHT
15  *	This file is distributed under the terms of the GNU General Public
16  *	License (GPL). Copies of the GPL can be obtained from:
17  *		ftp://prep.ai.mit.edu/pub/gnu/GPL
18  *	Each contributing author retains all rights to their own work.
19  */
20 
21 #include "udfdecl.h"
22 
23 #include <linux/kernel.h>
24 #include <linux/string.h>	/* for memset */
25 #include <linux/nls.h>
26 #include <linux/crc-itu-t.h>
27 #include <linux/slab.h>
28 
29 #include "udf_sb.h"
30 
/* Largest code point accepted before a failure is reported as -EINVAL */
#define UNICODE_MAX 0x10ffff
/* (value & SURROGATE_MASK) == SURROGATE_PAIR matches 0xd800..0xdfff */
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800

/*
 * Encode one Unicode character as UTF-8 into @out.
 *
 * @uni:      character to encode
 * @out:      destination buffer
 * @boundlen: space available in @out, in bytes
 *
 * Returns the non-negative value produced by utf32_to_utf8() (presumably
 * the number of bytes stored - confirm against <linux/nls.h>).  When the
 * encoder fails, -EINVAL is returned if @uni lies above UNICODE_MAX or in
 * the surrogate range; any other failure, as well as a non-positive
 * @boundlen, is reported as -ENAMETOOLONG.
 */
static int udf_uni2char_utf8(wchar_t uni,
			     unsigned char *out,
			     int boundlen)
{
	int written;

	if (boundlen <= 0)
		return -ENAMETOOLONG;

	written = utf32_to_utf8(uni, out, boundlen);
	if (written >= 0)
		return written;

	/* Encoder refused: tell a bad code point apart from lack of room */
	if (uni > UNICODE_MAX)
		return -EINVAL;
	if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
		return -EINVAL;

	return -ENAMETOOLONG;
}
53 
54 static int udf_char2uni_utf8(const unsigned char *in,
55 			     int boundlen,
56 			     wchar_t *uni)
57 {
58 	int u_len;
59 	unicode_t c;
60 
61 	u_len = utf8_to_utf32(in, boundlen, &c);
62 	if (u_len < 0) {
63 		*uni = '?';
64 		return -EINVAL;
65 	}
66 
67 	if (c > MAX_WCHAR_T)
68 		*uni = '?';
69 	else
70 		*uni = c;
71 	return u_len;
72 }
73 
/* Replacement emitted for a run of characters that are illegal in a name */
#define ILLEGAL_CHAR_MARK	'_'
/* Separator between a file name and its extension */
#define EXT_MARK		'.'
/* First character of the CRC suffix appended to mangled names */
#define CRC_MARK		'#'
/* Maximum number of input characters scanned when looking for an extension */
#define EXT_SIZE		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5

/*
 * Convert the next character of an OSTA compressed Unicode string.
 *
 * @str_o, @str_o_max_len, @str_o_idx: output buffer, its size and the
 *	current write position (advanced by the number of bytes stored)
 * @str_i, @str_i_max_len, @str_i_idx: input buffer, its size and the
 *	current read position (advanced past the consumed characters)
 * @u_ch:      bytes per input character (two bytes are combined
 *             big-endian when it is greater than 1)
 * @needsCRC:  set to 1 whenever the output no longer matches the input
 *             exactly (illegal or unconvertible character, no room left)
 * @conv_f:    converts one wide character into the output charset
 * @translate: when non-zero, '/' and NUL are treated as illegal and a
 *             whole run of them collapses into one ILLEGAL_CHAR_MARK
 *
 * Returns 1 when a character was stored in the output, 0 when the input
 * is exhausted or the output has no room left.
 */
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
			      int *str_o_idx,
			      const uint8_t *str_i, int str_i_max_len,
			      int *str_i_idx,
			      int u_ch, int *needsCRC,
			      int (*conv_f)(wchar_t, unsigned char *, int),
			      int translate)
{
	uint32_t uni = 0;
	int saw_illegal = 0;
	int have_char = 0;
	int written;

	while (!have_char && *str_i_idx < str_i_max_len) {
		const uint8_t *src;

		/* No room for even one more output byte */
		if (*str_o_idx >= str_o_max_len) {
			*needsCRC = 1;
			return 0;
		}

		/* Expand OSTA compressed Unicode to Unicode */
		src = &str_i[*str_i_idx];
		if (u_ch > 1)
			uni = ((uint32_t)src[0] << 8) | src[1];
		else
			uni = src[0];

		if (translate && (uni == '/' || uni == 0)) {
			saw_illegal = 1;
		} else if (saw_illegal) {
			/*
			 * End of an illegal run: leave the read position on
			 * this legal character so the next call picks it up.
			 */
			break;
		} else {
			have_char = 1;
		}

		*str_i_idx += u_ch;
	}

	if (saw_illegal) {
		*needsCRC = 1;
		uni = ILLEGAL_CHAR_MARK;
		have_char = 1;
	}

	if (!have_char)
		return 0;

	written = conv_f(uni, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
	if (written >= 0) {
		/* Valid character */
		*str_o_idx += written;
		return 1;
	}

	*needsCRC = 1;
	if (written == -ENAMETOOLONG)
		return 0;

	/* Unconvertible character: substitute a single '?' */
	str_o[*str_o_idx] = '?';
	*str_o_idx += 1;
	return 1;
}
131 
132 static int udf_name_from_CS0(struct super_block *sb,
133 			     uint8_t *str_o, int str_max_len,
134 			     const uint8_t *ocu, int ocu_len,
135 			     int translate)
136 {
137 	uint32_t c;
138 	uint8_t cmp_id;
139 	int idx, len;
140 	int u_ch;
141 	int needsCRC = 0;
142 	int ext_i_len, ext_max_len;
143 	int str_o_len = 0;	/* Length of resulting output */
144 	int ext_o_len = 0;	/* Extension output length */
145 	int ext_crc_len = 0;	/* Extension output length if used with CRC */
146 	int i_ext = -1;		/* Extension position in input buffer */
147 	int o_crc = 0;		/* Rightmost possible output pos for CRC+ext */
148 	unsigned short valueCRC;
149 	uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
150 	uint8_t crc[CRC_LEN];
151 	int (*conv_f)(wchar_t, unsigned char *, int);
152 
153 	if (str_max_len <= 0)
154 		return 0;
155 
156 	if (ocu_len == 0) {
157 		memset(str_o, 0, str_max_len);
158 		return 0;
159 	}
160 
161 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
162 		conv_f = udf_uni2char_utf8;
163 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
164 		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
165 	} else
166 		BUG();
167 
168 	cmp_id = ocu[0];
169 	if (cmp_id != 8 && cmp_id != 16) {
170 		memset(str_o, 0, str_max_len);
171 		pr_err("unknown compression code (%u)\n", cmp_id);
172 		return -EINVAL;
173 	}
174 	u_ch = cmp_id >> 3;
175 
176 	ocu++;
177 	ocu_len--;
178 
179 	if (ocu_len % u_ch) {
180 		pr_err("incorrect filename length (%d)\n", ocu_len + 1);
181 		return -EINVAL;
182 	}
183 
184 	if (translate) {
185 		/* Look for extension */
186 		for (idx = ocu_len - u_ch, ext_i_len = 0;
187 		     (idx >= 0) && (ext_i_len < EXT_SIZE);
188 		     idx -= u_ch, ext_i_len++) {
189 			c = ocu[idx];
190 			if (u_ch > 1)
191 				c = (c << 8) | ocu[idx + 1];
192 
193 			if (c == EXT_MARK) {
194 				if (ext_i_len)
195 					i_ext = idx;
196 				break;
197 			}
198 		}
199 		if (i_ext >= 0) {
200 			/* Convert extension */
201 			ext_max_len = min_t(int, sizeof(ext), str_max_len);
202 			ext[ext_o_len++] = EXT_MARK;
203 			idx = i_ext + u_ch;
204 			while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
205 						  ocu, ocu_len, &idx,
206 						  u_ch, &needsCRC,
207 						  conv_f, translate)) {
208 				if ((ext_o_len + CRC_LEN) < str_max_len)
209 					ext_crc_len = ext_o_len;
210 			}
211 		}
212 	}
213 
214 	idx = 0;
215 	while (1) {
216 		if (translate && (idx == i_ext)) {
217 			if (str_o_len > (str_max_len - ext_o_len))
218 				needsCRC = 1;
219 			break;
220 		}
221 
222 		if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
223 					ocu, ocu_len, &idx,
224 					u_ch, &needsCRC, conv_f, translate))
225 			break;
226 
227 		if (translate &&
228 		    (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
229 			o_crc = str_o_len;
230 	}
231 
232 	if (translate) {
233 		if (str_o_len <= 2 && str_o[0] == '.' &&
234 		    (str_o_len == 1 || str_o[1] == '.'))
235 			needsCRC = 1;
236 		if (needsCRC) {
237 			str_o_len = o_crc;
238 			valueCRC = crc_itu_t(0, ocu, ocu_len);
239 			crc[0] = CRC_MARK;
240 			crc[1] = hex_asc_upper_hi(valueCRC >> 8);
241 			crc[2] = hex_asc_upper_lo(valueCRC >> 8);
242 			crc[3] = hex_asc_upper_hi(valueCRC);
243 			crc[4] = hex_asc_upper_lo(valueCRC);
244 			len = min_t(int, CRC_LEN, str_max_len - str_o_len);
245 			memcpy(&str_o[str_o_len], crc, len);
246 			str_o_len += len;
247 			ext_o_len = ext_crc_len;
248 		}
249 		if (ext_o_len > 0) {
250 			memcpy(&str_o[str_o_len], ext, ext_o_len);
251 			str_o_len += ext_o_len;
252 		}
253 	}
254 
255 	return str_o_len;
256 }
257 
258 static int udf_name_to_CS0(struct super_block *sb,
259 			   uint8_t *ocu, int ocu_max_len,
260 			   const uint8_t *str_i, int str_len)
261 {
262 	int i, len;
263 	unsigned int max_val;
264 	wchar_t uni_char;
265 	int u_len, u_ch;
266 	int (*conv_f)(const unsigned char *, int, wchar_t *);
267 
268 	if (ocu_max_len <= 0)
269 		return 0;
270 
271 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
272 		conv_f = udf_char2uni_utf8;
273 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
274 		conv_f = UDF_SB(sb)->s_nls_map->char2uni;
275 	} else
276 		BUG();
277 
278 	memset(ocu, 0, ocu_max_len);
279 	ocu[0] = 8;
280 	max_val = 0xff;
281 	u_ch = 1;
282 
283 try_again:
284 	u_len = 1;
285 	for (i = 0; i < str_len; i++) {
286 		/* Name didn't fit? */
287 		if (u_len + u_ch > ocu_max_len)
288 			return 0;
289 		len = conv_f(&str_i[i], str_len - i, &uni_char);
290 		if (!len)
291 			continue;
292 		/* Invalid character, deal with it */
293 		if (len < 0) {
294 			len = 1;
295 			uni_char = '?';
296 		}
297 
298 		if (uni_char > max_val) {
299 			max_val = 0xffff;
300 			ocu[0] = 0x10;
301 			u_ch = 2;
302 			goto try_again;
303 		}
304 
305 		if (max_val == 0xffff)
306 			ocu[u_len++] = (uint8_t)(uni_char >> 8);
307 		ocu[u_len++] = (uint8_t)(uni_char & 0xff);
308 		i += len - 1;
309 	}
310 
311 	return u_len;
312 }
313 
/*
 * Convert a CS0 dstring field into the mount's charset, without name
 * mangling.
 *
 * @sb:    superblock
 * @utf_o: output buffer
 * @o_len: size of @utf_o in bytes
 * @ocu_i: dstring field; its last byte holds the used length
 * @i_len: size of the dstring field in bytes
 *
 * Returns the result of udf_name_from_CS0(), or -EINVAL when the stored
 * length is not smaller than the field size.
 */
int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
		      const uint8_t *ocu_i, int i_len)
{
	int used;

	/* An empty field is converted as a zero-length string */
	if (i_len <= 0)
		return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, 0, 0);

	used = ocu_i[i_len - 1];
	if (used >= i_len) {
		pr_err("incorrect dstring lengths (%d/%d)\n",
		       used, i_len);
		return -EINVAL;
	}

	return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, used, 0);
}
330 
/*
 * Convert an on-disk CS0 file name into a mangled name in the mount's
 * charset.
 *
 * @sb:    superblock
 * @sname: CS0 source name
 * @slen:  length of @sname in bytes
 * @dname: output buffer
 * @dlen:  size of @dname in bytes
 *
 * Returns the length of the converted name, -EIO when @slen is 0,
 * 0 when @dlen is not positive, -EINVAL when the conversion yields an
 * empty name, or the negative error from udf_name_from_CS0().
 */
int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
		     uint8_t *dname, int dlen)
{
	int converted;

	if (slen == 0)
		return -EIO;

	if (dlen <= 0)
		return 0;

	converted = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1);

	/* Zero length filename isn't valid... */
	return converted ? converted : -EINVAL;
}
348 
/*
 * Encode a file name from the mount's charset into CS0 for storing on
 * disk.
 *
 * @sb:    superblock
 * @sname: source name
 * @slen:  length of @sname in bytes
 * @dname: output buffer for the CS0 name
 * @dlen:  size of @dname in bytes
 *
 * Returns whatever udf_name_to_CS0() returns for these arguments.
 */
int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
		     uint8_t *dname, int dlen)
{
	int cs0_len = udf_name_to_CS0(sb, dname, dlen, sname, slen);

	return cs0_len;
}
354 
355