xref: /openbmc/linux/fs/ntfs/unistr.c (revision c83eeec79ff64f777cbd59a8bd15d0a3fe1f92c0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
4  *
5  * Copyright (c) 2001-2006 Anton Altaparmakov
6  */
7 
8 #include <linux/slab.h>
9 
10 #include "types.h"
11 #include "debug.h"
12 #include "ntfs.h"
13 
14 /*
15  * IMPORTANT
16  * =========
17  *
18  * All these routines assume that the Unicode characters are in little endian
19  * encoding inside the strings!!!
20  */
21 
22 /*
23  * This is used by the name collation functions to quickly determine what
24  * characters are (in)valid.
25  */
26 static const u8 legal_ansi_char_array[0x40] = {
27 	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
28 	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
29 
30 	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
31 	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
32 
33 	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
34 	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
35 
36 	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
37 	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
38 };
39 
40 /**
41  * ntfs_are_names_equal - compare two Unicode names for equality
42  * @s1:			name to compare to @s2
43  * @s1_len:		length in Unicode characters of @s1
44  * @s2:			name to compare to @s1
45  * @s2_len:		length in Unicode characters of @s2
46  * @ic:			ignore case bool
47  * @upcase:		upcase table (only if @ic == IGNORE_CASE)
48  * @upcase_size:	length in Unicode characters of @upcase (if present)
49  *
50  * Compare the names @s1 and @s2 and return 'true' (1) if the names are
51  * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
52  * the @upcase table is used to performa a case insensitive comparison.
53  */
54 bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
55 		const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
56 		const ntfschar *upcase, const u32 upcase_size)
57 {
58 	if (s1_len != s2_len)
59 		return false;
60 	if (ic == CASE_SENSITIVE)
61 		return !ntfs_ucsncmp(s1, s2, s1_len);
62 	return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
63 }
64 
65 /**
66  * ntfs_collate_names - collate two Unicode names
67  * @name1:	first Unicode name to compare
68  * @name2:	second Unicode name to compare
69  * @err_val:	if @name1 contains an invalid character return this value
70  * @ic:		either CASE_SENSITIVE or IGNORE_CASE
71  * @upcase:	upcase table (ignored if @ic is CASE_SENSITIVE)
72  * @upcase_len:	upcase table size (ignored if @ic is CASE_SENSITIVE)
73  *
74  * ntfs_collate_names collates two Unicode names and returns:
75  *
76  *  -1 if the first name collates before the second one,
77  *   0 if the names match,
78  *   1 if the second name collates before the first one, or
79  * @err_val if an invalid character is found in @name1 during the comparison.
80  *
81  * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
82  */
83 int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
84 		const ntfschar *name2, const u32 name2_len,
85 		const int err_val, const IGNORE_CASE_BOOL ic,
86 		const ntfschar *upcase, const u32 upcase_len)
87 {
88 	u32 cnt, min_len;
89 	u16 c1, c2;
90 
91 	min_len = name1_len;
92 	if (name1_len > name2_len)
93 		min_len = name2_len;
94 	for (cnt = 0; cnt < min_len; ++cnt) {
95 		c1 = le16_to_cpu(*name1++);
96 		c2 = le16_to_cpu(*name2++);
97 		if (ic) {
98 			if (c1 < upcase_len)
99 				c1 = le16_to_cpu(upcase[c1]);
100 			if (c2 < upcase_len)
101 				c2 = le16_to_cpu(upcase[c2]);
102 		}
103 		if (c1 < 64 && legal_ansi_char_array[c1] & 8)
104 			return err_val;
105 		if (c1 < c2)
106 			return -1;
107 		if (c1 > c2)
108 			return 1;
109 	}
110 	if (name1_len < name2_len)
111 		return -1;
112 	if (name1_len == name2_len)
113 		return 0;
114 	/* name1_len > name2_len */
115 	c1 = le16_to_cpu(*name1);
116 	if (c1 < 64 && legal_ansi_char_array[c1] & 8)
117 		return err_val;
118 	return 1;
119 }
120 
121 /**
122  * ntfs_ucsncmp - compare two little endian Unicode strings
123  * @s1:		first string
124  * @s2:		second string
125  * @n:		maximum unicode characters to compare
126  *
127  * Compare the first @n characters of the Unicode strings @s1 and @s2,
128  * The strings in little endian format and appropriate le16_to_cpu()
129  * conversion is performed on non-little endian machines.
130  *
131  * The function returns an integer less than, equal to, or greater than zero
132  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
133  * to be less than, to match, or be greater than @s2.
134  */
135 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
136 {
137 	u16 c1, c2;
138 	size_t i;
139 
140 	for (i = 0; i < n; ++i) {
141 		c1 = le16_to_cpu(s1[i]);
142 		c2 = le16_to_cpu(s2[i]);
143 		if (c1 < c2)
144 			return -1;
145 		if (c1 > c2)
146 			return 1;
147 		if (!c1)
148 			break;
149 	}
150 	return 0;
151 }
152 
153 /**
154  * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
155  * @s1:			first string
156  * @s2:			second string
157  * @n:			maximum unicode characters to compare
158  * @upcase:		upcase table
159  * @upcase_size:	upcase table size in Unicode characters
160  *
161  * Compare the first @n characters of the Unicode strings @s1 and @s2,
162  * ignoring case. The strings in little endian format and appropriate
163  * le16_to_cpu() conversion is performed on non-little endian machines.
164  *
165  * Each character is uppercased using the @upcase table before the comparison.
166  *
167  * The function returns an integer less than, equal to, or greater than zero
168  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
169  * to be less than, to match, or be greater than @s2.
170  */
171 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
172 		const ntfschar *upcase, const u32 upcase_size)
173 {
174 	size_t i;
175 	u16 c1, c2;
176 
177 	for (i = 0; i < n; ++i) {
178 		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
179 			c1 = le16_to_cpu(upcase[c1]);
180 		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
181 			c2 = le16_to_cpu(upcase[c2]);
182 		if (c1 < c2)
183 			return -1;
184 		if (c1 > c2)
185 			return 1;
186 		if (!c1)
187 			break;
188 	}
189 	return 0;
190 }
191 
192 void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
193 		const u32 upcase_len)
194 {
195 	u32 i;
196 	u16 u;
197 
198 	for (i = 0; i < name_len; i++)
199 		if ((u = le16_to_cpu(name[i])) < upcase_len)
200 			name[i] = upcase[u];
201 }
202 
203 void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
204 		const ntfschar *upcase, const u32 upcase_len)
205 {
206 	ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
207 			file_name_attr->file_name_length, upcase, upcase_len);
208 }
209 
210 int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
211 		FILE_NAME_ATTR *file_name_attr2,
212 		const int err_val, const IGNORE_CASE_BOOL ic,
213 		const ntfschar *upcase, const u32 upcase_len)
214 {
215 	return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
216 			file_name_attr1->file_name_length,
217 			(ntfschar*)&file_name_attr2->file_name,
218 			file_name_attr2->file_name_length,
219 			err_val, ic, upcase, upcase_len);
220 }
221 
222 /**
223  * ntfs_nlstoucs - convert NLS string to little endian Unicode string
224  * @vol:	ntfs volume which we are working with
225  * @ins:	input NLS string buffer
226  * @ins_len:	length of input string in bytes
227  * @outs:	on return contains the allocated output Unicode string buffer
228  *
229  * Convert the input string @ins, which is in whatever format the loaded NLS
230  * map dictates, into a little endian, 2-byte Unicode string.
231  *
232  * This function allocates the string and the caller is responsible for
233  * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
234  *
235  * On success the function returns the number of Unicode characters written to
236  * the output string *@outs (>= 0), not counting the terminating Unicode NULL
237  * character. *@outs is set to the allocated output string buffer.
238  *
239  * On error, a negative number corresponding to the error code is returned. In
240  * that case the output string is not allocated. Both *@outs and *@outs_len
241  * are then undefined.
242  *
243  * This might look a bit odd due to fast path optimization...
244  */
245 int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
246 		const int ins_len, ntfschar **outs)
247 {
248 	struct nls_table *nls = vol->nls_map;
249 	ntfschar *ucs;
250 	wchar_t wc;
251 	int i, o, wc_len;
252 
253 	/* We do not trust outside sources. */
254 	if (likely(ins)) {
255 		ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
256 		if (likely(ucs)) {
257 			for (i = o = 0; i < ins_len; i += wc_len) {
258 				wc_len = nls->char2uni(ins + i, ins_len - i,
259 						&wc);
260 				if (likely(wc_len >= 0 &&
261 						o < NTFS_MAX_NAME_LEN)) {
262 					if (likely(wc)) {
263 						ucs[o++] = cpu_to_le16(wc);
264 						continue;
265 					} /* else if (!wc) */
266 					break;
267 				} /* else if (wc_len < 0 ||
268 						o >= NTFS_MAX_NAME_LEN) */
269 				goto name_err;
270 			}
271 			ucs[o] = 0;
272 			*outs = ucs;
273 			return o;
274 		} /* else if (!ucs) */
275 		ntfs_error(vol->sb, "Failed to allocate buffer for converted "
276 				"name from ntfs_name_cache.");
277 		return -ENOMEM;
278 	} /* else if (!ins) */
279 	ntfs_error(vol->sb, "Received NULL pointer.");
280 	return -EINVAL;
281 name_err:
282 	kmem_cache_free(ntfs_name_cache, ucs);
283 	if (wc_len < 0) {
284 		ntfs_error(vol->sb, "Name using character set %s contains "
285 				"characters that cannot be converted to "
286 				"Unicode.", nls->charset);
287 		i = -EILSEQ;
288 	} else /* if (o >= NTFS_MAX_NAME_LEN) */ {
289 		ntfs_error(vol->sb, "Name is too long (maximum length for a "
290 				"name on NTFS is %d Unicode characters.",
291 				NTFS_MAX_NAME_LEN);
292 		i = -ENAMETOOLONG;
293 	}
294 	return i;
295 }
296 
297 /**
298  * ntfs_ucstonls - convert little endian Unicode string to NLS string
299  * @vol:	ntfs volume which we are working with
300  * @ins:	input Unicode string buffer
301  * @ins_len:	length of input string in Unicode characters
302  * @outs:	on return contains the (allocated) output NLS string buffer
303  * @outs_len:	length of output string buffer in bytes
304  *
305  * Convert the input little endian, 2-byte Unicode string @ins, of length
306  * @ins_len into the string format dictated by the loaded NLS.
307  *
308  * If *@outs is NULL, this function allocates the string and the caller is
309  * responsible for calling kfree(*@outs); when finished with it. In this case
310  * @outs_len is ignored and can be 0.
311  *
312  * On success the function returns the number of bytes written to the output
313  * string *@outs (>= 0), not counting the terminating NULL byte. If the output
314  * string buffer was allocated, *@outs is set to it.
315  *
316  * On error, a negative number corresponding to the error code is returned. In
317  * that case the output string is not allocated. The contents of *@outs are
318  * then undefined.
319  *
320  * This might look a bit odd due to fast path optimization...
321  */
322 int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
323 		const int ins_len, unsigned char **outs, int outs_len)
324 {
325 	struct nls_table *nls = vol->nls_map;
326 	unsigned char *ns;
327 	int i, o, ns_len, wc;
328 
329 	/* We don't trust outside sources. */
330 	if (ins) {
331 		ns = *outs;
332 		ns_len = outs_len;
333 		if (ns && !ns_len) {
334 			wc = -ENAMETOOLONG;
335 			goto conversion_err;
336 		}
337 		if (!ns) {
338 			ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
339 			ns = kmalloc(ns_len + 1, GFP_NOFS);
340 			if (!ns)
341 				goto mem_err_out;
342 		}
343 		for (i = o = 0; i < ins_len; i++) {
344 retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
345 					ns_len - o);
346 			if (wc > 0) {
347 				o += wc;
348 				continue;
349 			} else if (!wc)
350 				break;
351 			else if (wc == -ENAMETOOLONG && ns != *outs) {
352 				unsigned char *tc;
353 				/* Grow in multiples of 64 bytes. */
354 				tc = kmalloc((ns_len + 64) &
355 						~63, GFP_NOFS);
356 				if (tc) {
357 					memcpy(tc, ns, ns_len);
358 					ns_len = ((ns_len + 64) & ~63) - 1;
359 					kfree(ns);
360 					ns = tc;
361 					goto retry;
362 				} /* No memory so goto conversion_error; */
363 			} /* wc < 0, real error. */
364 			goto conversion_err;
365 		}
366 		ns[o] = 0;
367 		*outs = ns;
368 		return o;
369 	} /* else (!ins) */
370 	ntfs_error(vol->sb, "Received NULL pointer.");
371 	return -EINVAL;
372 conversion_err:
373 	ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
374 			"converted to character set %s.  You might want to "
375 			"try to use the mount option nls=utf8.", nls->charset);
376 	if (ns != *outs)
377 		kfree(ns);
378 	if (wc != -ENAMETOOLONG)
379 		wc = -EILSEQ;
380 	return wc;
381 mem_err_out:
382 	ntfs_error(vol->sb, "Failed to allocate name!");
383 	return -ENOMEM;
384 }
385