xref: /openbmc/u-boot/test/unicode_ut.c (revision f11a164b58860c3971e207a2e1cf1c033b9d0910)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Unit tests for Unicode functions
4  *
5  * Copyright (c) 2018 Heinrich Schuchardt <xypron.glpk@gmx.de>
6  */
7 
8 #include <common.h>
9 #include <charset.h>
10 #include <command.h>
11 #include <errno.h>
12 #include <test/test.h>
13 #include <test/suites.h>
14 #include <test/ut.h>
15 
/* Linker list entry for a Unicode test */
#define UNICODE_TEST(_name) UNIT_TEST(_name, 0, unicode_test)

/*
 * Test data: c1-c4 hold utf-16 strings, d1-d4 hold the utf-8 encoding of the
 * same letters. All strings are zero terminated.
 */

/* "U-Boot": six characters translating to one utf-8 byte each */
static const u16 c1[] = {'U', '-', 'B', 'o', 'o', 't', 0};
/* Eight characters, one of which (0xe1) translates to two utf-8 bytes */
static const u16 c2[] = {'k', 'a', 'f', 'b', 0xe1, 't', 'u', 'r', 0};
/* Three characters translating to three utf-8 bytes each */
static const u16 c3[] = {0x6f5c, 0x6c34, 0x8266, 0};
/* Three letters (surrogate pairs) translating to four utf-8 bytes each */
static const u16 c4[] = {
	0xd801, 0xdc8d,
	0xd801, 0xdc96,
	0xd801, 0xdc87,
	0,
};

/* Illegal utf-16 strings */
/* i1: low surrogate without a preceding high surrogate */
static const u16 i1[] = {'i', '1', 0xdc87, 'l', 0};
/* i2: two high surrogates in a row */
static const u16 i2[] = {'i', '2', 0xd801, 0xd801, 'l', 0};
/* i3: string ends after a high surrogate */
static const u16 i3[] = {'i', '3', 0xd801, 0};

/* "U-Boot": six characters translating to one utf-16 word each */
static const char d1[] = "U-Boot";
/* Eight characters translating to one utf-16 word each */
static const char d2[] = "kafb\xc3\xa1tur";
/* Three characters translating to one utf-16 word each */
static const char d3[] = "\xe6\xbd\x9c\xe6\xb0\xb4\xe8\x89\xa6";
/* Three letters translating to two utf-16 words each */
static const char d4[] = "\xf0\x90\x92\x8d\xf0\x90\x92\x96\xf0\x90\x92\x87";

/* Illegal utf-8 strings */
/* j1: continuation byte without a lead byte */
static const char j1[] = "j1\xa1l";
/* j2: two lead bytes in a row */
static const char j2[] = "j2\xc3\xc3l";
/* j3: truncated four byte sequence followed by a lone lead byte */
static const char j3[] = "j3\xf0\x90\xf0";
52 
53 static int ut_utf8_get(struct unit_test_state *uts)
54 {
55 	const char *s;
56 	s32 code;
57 	int i;
58 
59 	/* Check characters less than 0x800 */
60 	s = d2;
61 	for (i = 0; i < 8; ++i) {
62 		code = utf8_get((const char **)&s);
63 		/* c2 is the utf-8 encoding of d2 */
64 		ut_asserteq(c2[i], code);
65 		if (!code)
66 			break;
67 	}
68 	ut_asserteq_ptr(s, d2 + 9)
69 
70 	/* Check characters less than 0x10000 */
71 	s = d3;
72 	for (i = 0; i < 4; ++i) {
73 		code = utf8_get((const char **)&s);
74 		/* c3 is the utf-8 encoding of d3 */
75 		ut_asserteq(c3[i], code);
76 		if (!code)
77 			break;
78 	}
79 	ut_asserteq_ptr(s, d3 + 9)
80 
81 	/* Check character greater 0xffff */
82 	s = d4;
83 	code = utf8_get((const char **)&s);
84 	ut_asserteq(0x0001048d, code);
85 	ut_asserteq_ptr(s, d4 + 4);
86 
87 	return 0;
88 }
89 UNICODE_TEST(ut_utf8_get);
90 
/**
 * ut_utf8_put() - test utf8_put()
 *
 * Encodes code points of increasing utf-8 length into a buffer and checks the
 * number of bytes written and the bytes themselves. Finally checks that an
 * illegal code point is rejected.
 *
 * @uts:	unit test state
 * Return:	0 if all checks pass
 */
static int ut_utf8_put(struct unit_test_state *uts)
{
	/*
	 * The buffer is zero initialized and never cleared afterwards. Each
	 * case writes more bytes than the previous one, so the byte following
	 * the output is still zero and the buffer can be compared as a string.
	 */
	char buffer[8] = { 0, };
	char *pos;

	/* Commercial at, translates to one byte */
	pos = buffer;
	ut_assert(!utf8_put('@', &pos));
	ut_asserteq(1, pos - buffer);
	ut_asserteq('@', buffer[0]);
	ut_assert(!buffer[1]);

	/* Latin letter G with acute, translates to two bytes */
	pos = buffer;
	ut_assert(!utf8_put(0x1f4, &pos));
	ut_asserteq(2, pos - buffer);
	ut_asserteq_str("\xc7\xb4", buffer);

	/* Tagalog letter i, translates to three bytes */
	pos = buffer;
	ut_assert(!utf8_put(0x1701, &pos));
	ut_asserteq(3, pos - buffer);
	ut_asserteq_str("\xe1\x9c\x81", buffer);

	/* Hamster face, translates to four bytes */
	pos = buffer;
	ut_assert(!utf8_put(0x1f439, &pos));
	ut_asserteq(4, pos - buffer);
	ut_asserteq_str("\xf0\x9f\x90\xb9", buffer);

	/* Illegal code: 0xd888 lies in the surrogate range */
	pos = buffer;
	ut_asserteq(-1, utf8_put(0xd888, &pos));

	return 0;
}
UNICODE_TEST(ut_utf8_put);
128 
129 static int ut_utf8_utf16_strlen(struct unit_test_state *uts)
130 {
131 	ut_asserteq(6, utf8_utf16_strlen(d1));
132 	ut_asserteq(8, utf8_utf16_strlen(d2));
133 	ut_asserteq(3, utf8_utf16_strlen(d3));
134 	ut_asserteq(6, utf8_utf16_strlen(d4));
135 
136 	/* illegal utf-8 sequences */
137 	ut_asserteq(4, utf8_utf16_strlen(j1));
138 	ut_asserteq(5, utf8_utf16_strlen(j2));
139 	ut_asserteq(3, utf8_utf16_strlen(j3));
140 
141 	return 0;
142 }
143 UNICODE_TEST(ut_utf8_utf16_strlen);
144 
145 static int ut_utf8_utf16_strnlen(struct unit_test_state *uts)
146 {
147 	ut_asserteq(3, utf8_utf16_strnlen(d1, 3));
148 	ut_asserteq(6, utf8_utf16_strnlen(d1, 13));
149 	ut_asserteq(6, utf8_utf16_strnlen(d2, 6));
150 	ut_asserteq(2, utf8_utf16_strnlen(d3, 2));
151 	ut_asserteq(4, utf8_utf16_strnlen(d4, 2));
152 	ut_asserteq(6, utf8_utf16_strnlen(d4, 3));
153 
154 	/* illegal utf-8 sequences */
155 	ut_asserteq(4, utf8_utf16_strnlen(j1, 16));
156 	ut_asserteq(5, utf8_utf16_strnlen(j2, 16));
157 	ut_asserteq(3, utf8_utf16_strnlen(j3, 16));
158 
159 	return 0;
160 }
161 UNICODE_TEST(ut_utf8_utf16_strnlen);
162 
/**
 * ut_u16_strcmp() - Compare two u16 strings.
 *
 * The comparison stops at the first differing word, at a terminating zero
 * common to both strings, or after @count words, whichever comes first.
 * No word at index @count or beyond is read.
 *
 * @a1:		first string
 * @a2:		second string
 * @count:	maximum number of u16 to compare
 * Return:	-1 if a1 < a2, 0 if a1 == a2, 1 if a1 > a2
 */
static int ut_u16_strcmp(const u16 *a1, const u16 *a2, size_t count)
{
	size_t idx;

	/* Test the remaining count before touching the strings */
	for (idx = 0; idx < count; ++idx) {
		const u16 lhs = a1[idx];
		const u16 rhs = a2[idx];

		if (lhs != rhs)
			return lhs < rhs ? -1 : 1;
		/* Equal words: a zero here terminates both strings */
		if (!lhs)
			break;
	}
	return 0;
}
181 
182 static int ut_utf8_utf16_strcpy(struct unit_test_state *uts)
183 {
184 	u16 buf[16];
185 	u16 *pos;
186 
187 	pos = buf;
188 	utf8_utf16_strcpy(&pos, d1);
189 	ut_asserteq(6, pos - buf);
190 	ut_assert(!ut_u16_strcmp(buf, c1, SIZE_MAX));
191 
192 	pos = buf;
193 	utf8_utf16_strcpy(&pos, d2);
194 	ut_asserteq(8, pos - buf);
195 	ut_assert(!ut_u16_strcmp(buf, c2, SIZE_MAX));
196 
197 	pos = buf;
198 	utf8_utf16_strcpy(&pos, d3);
199 	ut_asserteq(3, pos - buf);
200 	ut_assert(!ut_u16_strcmp(buf, c3, SIZE_MAX));
201 
202 	pos = buf;
203 	utf8_utf16_strcpy(&pos, d4);
204 	ut_asserteq(6, pos - buf);
205 	ut_assert(!ut_u16_strcmp(buf, c4, SIZE_MAX));
206 
207 	/* Illegal utf-8 strings */
208 	pos = buf;
209 	utf8_utf16_strcpy(&pos, j1);
210 	ut_asserteq(4, pos - buf);
211 	ut_assert(!ut_u16_strcmp(buf, L"j1?l", SIZE_MAX));
212 
213 	pos = buf;
214 	utf8_utf16_strcpy(&pos, j2);
215 	ut_asserteq(5, pos - buf);
216 	ut_assert(!ut_u16_strcmp(buf, L"j2??l", SIZE_MAX));
217 
218 	pos = buf;
219 	utf8_utf16_strcpy(&pos, j3);
220 	ut_asserteq(3, pos - buf);
221 	ut_assert(!ut_u16_strcmp(buf, L"j3?", SIZE_MAX));
222 
223 	return 0;
224 }
225 UNICODE_TEST(ut_utf8_utf16_strcpy);
226 
227 int ut_utf8_utf16_strncpy(struct unit_test_state *uts)
228 {
229 	u16 buf[16];
230 	u16 *pos;
231 
232 	pos = buf;
233 	memset(buf, 0, sizeof(buf));
234 	utf8_utf16_strncpy(&pos, d1, 4);
235 	ut_asserteq(4, pos - buf);
236 	ut_assert(!buf[4]);
237 	ut_assert(!ut_u16_strcmp(buf, c1, 4));
238 
239 	pos = buf;
240 	memset(buf, 0, sizeof(buf));
241 	utf8_utf16_strncpy(&pos, d2, 10);
242 	ut_asserteq(8, pos - buf);
243 	ut_assert(buf[4]);
244 	ut_assert(!ut_u16_strcmp(buf, c2, SIZE_MAX));
245 
246 	pos = buf;
247 	memset(buf, 0, sizeof(buf));
248 	utf8_utf16_strncpy(&pos, d3, 2);
249 	ut_asserteq(2, pos - buf);
250 	ut_assert(!buf[2]);
251 	ut_assert(!ut_u16_strcmp(buf, c3, 2));
252 
253 	pos = buf;
254 	memset(buf, 0, sizeof(buf));
255 	utf8_utf16_strncpy(&pos, d4, 2);
256 	ut_asserteq(4, pos - buf);
257 	ut_assert(!buf[4]);
258 	ut_assert(!ut_u16_strcmp(buf, c4, 4));
259 
260 	pos = buf;
261 	memset(buf, 0, sizeof(buf));
262 	utf8_utf16_strncpy(&pos, d4, 10);
263 	ut_asserteq(6, pos - buf);
264 	ut_assert(buf[5]);
265 	ut_assert(!ut_u16_strcmp(buf, c4, SIZE_MAX));
266 
267 	return 0;
268 }
269 UNICODE_TEST(ut_utf8_utf16_strncpy);
270 
271 static int ut_utf16_get(struct unit_test_state *uts)
272 {
273 	const u16 *s;
274 	s32 code;
275 	int i;
276 
277 	/* Check characters less than 0x10000 */
278 	s = c2;
279 	for (i = 0; i < 9; ++i) {
280 		code = utf16_get((const u16 **)&s);
281 		ut_asserteq(c2[i], code);
282 		if (!code)
283 			break;
284 	}
285 	ut_asserteq_ptr(c2 + 8, s);
286 
287 	/* Check character greater 0xffff */
288 	s = c4;
289 	code = utf16_get((const u16 **)&s);
290 	ut_asserteq(0x0001048d, code);
291 	ut_asserteq_ptr(c4 + 2, s);
292 
293 	return 0;
294 }
295 UNICODE_TEST(ut_utf16_get);
296 
297 static int ut_utf16_put(struct unit_test_state *uts)
298 {
299 	u16 buffer[4] = { 0, };
300 	u16 *pos;
301 
302 	/* Commercial at, translates to one word */
303 	pos = buffer;
304 	ut_assert(!utf16_put('@', &pos));
305 	ut_asserteq(1, pos - buffer);
306 	ut_asserteq((u16)'@', buffer[0]);
307 	ut_assert(!buffer[1]);
308 
309 	/* Hamster face, translates to two words */
310 	pos = buffer;
311 	ut_assert(!utf16_put(0x1f439, &pos));
312 	ut_asserteq(2, pos - buffer);
313 	ut_asserteq((u16)0xd83d, buffer[0]);
314 	ut_asserteq((u16)0xdc39, buffer[1]);
315 	ut_assert(!buffer[2]);
316 
317 	/* Illegal code */
318 	pos = buffer;
319 	ut_asserteq(-1, utf16_put(0xd888, &pos));
320 
321 	return 0;
322 }
323 UNICODE_TEST(ut_utf16_put);
324 
325 int ut_utf16_strnlen(struct unit_test_state *uts)
326 {
327 	ut_asserteq(3, utf16_strnlen(c1, 3));
328 	ut_asserteq(6, utf16_strnlen(c1, 13));
329 	ut_asserteq(6, utf16_strnlen(c2, 6));
330 	ut_asserteq(2, utf16_strnlen(c3, 2));
331 	ut_asserteq(2, utf16_strnlen(c4, 2));
332 	ut_asserteq(3, utf16_strnlen(c4, 3));
333 
334 	/* illegal utf-16 word sequences */
335 	ut_asserteq(4, utf16_strnlen(i1, 16));
336 	ut_asserteq(4, utf16_strnlen(i2, 16));
337 	ut_asserteq(3, utf16_strnlen(i3, 16));
338 
339 	return 0;
340 }
341 UNICODE_TEST(ut_utf16_strnlen);
342 
343 int ut_utf16_utf8_strlen(struct unit_test_state *uts)
344 {
345 	ut_asserteq(6, utf16_utf8_strlen(c1));
346 	ut_asserteq(9, utf16_utf8_strlen(c2));
347 	ut_asserteq(9, utf16_utf8_strlen(c3));
348 	ut_asserteq(12, utf16_utf8_strlen(c4));
349 
350 	/* illegal utf-16 word sequences */
351 	ut_asserteq(4, utf16_utf8_strlen(i1));
352 	ut_asserteq(4, utf16_utf8_strlen(i2));
353 	ut_asserteq(3, utf16_utf8_strlen(i3));
354 
355 	return 0;
356 }
357 UNICODE_TEST(ut_utf16_utf8_strlen);
358 
359 int ut_utf16_utf8_strnlen(struct unit_test_state *uts)
360 {
361 	ut_asserteq(3, utf16_utf8_strnlen(c1, 3));
362 	ut_asserteq(6, utf16_utf8_strnlen(c1, 13));
363 	ut_asserteq(7, utf16_utf8_strnlen(c2, 6));
364 	ut_asserteq(6, utf16_utf8_strnlen(c3, 2));
365 	ut_asserteq(8, utf16_utf8_strnlen(c4, 2));
366 	ut_asserteq(12, utf16_utf8_strnlen(c4, 3));
367 	return 0;
368 }
369 UNICODE_TEST(ut_utf16_utf8_strnlen);
370 
371 int ut_utf16_utf8_strcpy(struct unit_test_state *uts)
372 {
373 	char buf[16];
374 	char *pos;
375 
376 	pos = buf;
377 	utf16_utf8_strcpy(&pos, c1);
378 	ut_asserteq(6, pos - buf);
379 	ut_asserteq_str(d1, buf);
380 
381 	pos = buf;
382 	utf16_utf8_strcpy(&pos, c2);
383 	ut_asserteq(9, pos - buf);
384 	ut_asserteq_str(d2, buf);
385 
386 	pos = buf;
387 	utf16_utf8_strcpy(&pos, c3);
388 	ut_asserteq(9, pos - buf);
389 	ut_asserteq_str(d3, buf);
390 
391 	pos = buf;
392 	utf16_utf8_strcpy(&pos, c4);
393 	ut_asserteq(12, pos - buf);
394 	ut_asserteq_str(d4, buf);
395 
396 	/* Illegal utf-16 strings */
397 	pos = buf;
398 	utf16_utf8_strcpy(&pos, i1);
399 	ut_asserteq(4, pos - buf);
400 	ut_asserteq_str("i1?l", buf);
401 
402 	pos = buf;
403 	utf16_utf8_strcpy(&pos, i2);
404 	ut_asserteq(4, pos - buf);
405 	ut_asserteq_str("i2?l", buf);
406 
407 	pos = buf;
408 	utf16_utf8_strcpy(&pos, i3);
409 	ut_asserteq(3, pos - buf);
410 	ut_asserteq_str("i3?", buf);
411 
412 	return 0;
413 }
414 UNICODE_TEST(ut_utf16_utf8_strcpy);
415 
416 int ut_utf16_utf8_strncpy(struct unit_test_state *uts)
417 {
418 	char buf[16];
419 	char *pos;
420 
421 	pos = buf;
422 	memset(buf, 0, sizeof(buf));
423 	utf16_utf8_strncpy(&pos, c1, 4);
424 	ut_asserteq(4, pos - buf);
425 	ut_assert(!buf[4]);
426 	ut_assert(!strncmp(buf, d1, 4));
427 
428 	pos = buf;
429 	memset(buf, 0, sizeof(buf));
430 	utf16_utf8_strncpy(&pos, c2, 10);
431 	ut_asserteq(9, pos - buf);
432 	ut_assert(buf[4]);
433 	ut_assert(!strncmp(buf, d2, SIZE_MAX));
434 
435 	pos = buf;
436 	memset(buf, 0, sizeof(buf));
437 	utf16_utf8_strncpy(&pos, c3, 2);
438 	ut_asserteq(6, pos - buf);
439 	ut_assert(!buf[6]);
440 	ut_assert(!strncmp(buf, d3, 6));
441 
442 	pos = buf;
443 	memset(buf, 0, sizeof(buf));
444 	utf16_utf8_strncpy(&pos, c4, 2);
445 	ut_asserteq(8, pos - buf);
446 	ut_assert(!buf[8]);
447 	ut_assert(!strncmp(buf, d4, 8));
448 
449 	pos = buf;
450 	memset(buf, 0, sizeof(buf));
451 	utf16_utf8_strncpy(&pos, c4, 10);
452 	ut_asserteq(12, pos - buf);
453 	ut_assert(buf[5]);
454 	ut_assert(!strncmp(buf, d4, SIZE_MAX));
455 
456 	return 0;
457 }
458 UNICODE_TEST(ut_utf16_utf8_strncpy);
459 
460 int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
461 {
462 	struct unit_test *tests = ll_entry_start(struct unit_test, unicode_test);
463 	const int n_ents = ll_entry_count(struct unit_test, unicode_test);
464 
465 	return cmd_ut_category("Unicode", tests, n_ents, argc, argv);
466 }
467