1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * syscall_numbering.c - test calling the x86-64 kernel with various
4  * valid and invalid system call numbers.
5  *
6  * Copyright (c) 2018 Andrew Lutomirski
7  */
8 
9 #define _GNU_SOURCE
10 
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <stdbool.h>
14 #include <errno.h>
15 #include <unistd.h>
16 #include <string.h>
17 #include <fcntl.h>
18 #include <limits.h>
19 #include <signal.h>
20 #include <sysexits.h>
21 
22 #include <sys/ptrace.h>
23 #include <sys/user.h>
24 #include <sys/wait.h>
25 #include <sys/mman.h>
26 
27 #include <linux/ptrace.h>
28 
/*
 * System call numbers used by the tests. They are spelled out here
 * rather than taken from libc headers, and are passed straight to
 * probe_syscall().
 */
/* Common system call numbers */
#define SYS_READ	  0
#define SYS_WRITE	  1
#define SYS_GETPID	 39
/* x64-only system call numbers */
#define X64_IOCTL	 16
#define X64_READV	 19
#define X64_WRITEV	 20
/* x32-only system call numbers (without X32_BIT) */
#define X32_IOCTL	514
#define X32_READV	515
#define X32_WRITEV	516

/* Flag OR-ed into a system call number to request the x32 variant */
#define X32_BIT 0x40000000
43 
/* Opened in main(); probe_syscall() passes it as the first syscall argument */
static int nullfd = -1;		/* File descriptor for /dev/null */
/* Assigned by test_x32(); selects which x32 test set test_syscall_numbering() runs */
static bool with_x32;		/* x32 supported on this kernel? */
46 
/*
 * The passes the traced test process runs through, in order. The
 * current pass is published in the shared block and tells
 * mess_with_syscall() in the tracer what to do at each probed
 * system call.
 */
enum ptrace_pass {
	PTP_NOTHING,	/* Tracer returns without reading the registers */
	PTP_GETREGS,	/* Tracer reads the registers but does not write them */
	PTP_WRITEBACK,	/* Tracer reads the registers and writes them back as-is */
	PTP_FUZZRET,	/* Tracer sets rax to MODIFIED_BY_PTRACE */
	PTP_FUZZHIGH,	/* As PTP_FUZZRET, and sets the top 32 bits of orig_rax */
	PTP_INTNUM,	/* As PTP_FUZZRET, and sign-extends the low 32 bits of orig_rax */
	PTP_DONE	/* End marker: no more passes; the tracer loop stops on it */
};
56 
/*
 * Human-readable description of each pass, indexed by enum ptrace_pass
 * and printed by syscall_numbering_tracee(). There is no entry for
 * PTP_DONE, which is never run as a pass.
 */
static const char * const ptrace_pass_name[] =
{
	[PTP_NOTHING]	= "just stop, no data read",
	[PTP_GETREGS]	= "only getregs",
	[PTP_WRITEBACK]	= "getregs, unmodified setregs",
	[PTP_FUZZRET]	= "modifying the default return",
	[PTP_FUZZHIGH]	= "clobbering the top 32 bits",
	[PTP_INTNUM]	= "sign-extending the syscall number",
};
66 
/*
 * Shared memory block between tracer and test
 *
 * main() maps this block MAP_SHARED before forking, so the tracer and
 * the traced test process both see the same counters and flags.
 */
struct shared {
	unsigned int nerr;	/* Total error count */
	unsigned int indent;	/* Message indentation level */
	enum ptrace_pass ptrace_pass;	/* Pass currently being run by the tracee */
	bool probing_syscall;	/* In probe_syscall() */
};
/* NULL until main() has mapped the block; offset() checks for that */
static volatile struct shared *sh;
77 
78 static inline unsigned int offset(void)
79 {
80 	unsigned int level = sh ? sh->indent : 0;
81 
82 	return 8 + level * 4;
83 }
84 
/*
 * msg() prints "[lvl]" left-justified and padded to offset() columns,
 * followed by the printf-style message. fmt must be a string literal,
 * as it is pasted onto the "%-*s" prefix.
 */
#define msg(lvl, fmt, ...) printf("%-*s" fmt, offset(), "[" #lvl "]", \
				  ## __VA_ARGS__)

/* Shorthands for the [RUN], [INFO] and [OK] message levels */
#define run(fmt, ...)  msg(RUN,  fmt, ## __VA_ARGS__)
#define info(fmt, ...) msg(INFO, fmt, ## __VA_ARGS__)
#define ok(fmt, ...)   msg(OK,   fmt, ## __VA_ARGS__)
91 
/*
 * fail() prints a [FAIL] message and bumps the error count in the
 * shared block. It dereferences sh, so it may only be used once the
 * shared block has been mapped.
 */
#define fail(fmt, ...)					\
	do {						\
		msg(FAIL, fmt, ## __VA_ARGS__);		\
		sh->nerr++;				\
       } while (0)
97 
/*
 * crit() reports a fatal setup problem: it prints a [FAIL] message
 * followed by a [SKIP] line at indentation level zero, then terminates
 * the process with EX_OSERR. It never returns.
 *
 * It can be invoked before the shared block exists (sh still NULL) or
 * right after mmap() failed (sh == MAP_FAILED). Neither value may be
 * dereferenced, so a failed mapping is turned back into NULL, which
 * offset() understands, and the indent reset is skipped.
 */
#define crit(fmt, ...)					\
	do {						\
		if (sh == MAP_FAILED)			\
			sh = NULL;			\
		if (sh)					\
			sh->indent = 0;			\
		msg(FAIL, fmt, ## __VA_ARGS__);		\
		msg(SKIP, "Unable to run test\n");	\
		exit(EX_OSERR);				\
	} while (0)
105 
/*
 * Sentinel for ptrace-modified return value: the tracer stores it in
 * rax during the register-modifying passes, and test_syscalls_common()
 * expects it back from system call -1 in those passes.
 */
#define MODIFIED_BY_PTRACE	-9999
108 
109 /*
110  * Directly invokes the given syscall with nullfd as the first argument
111  * and the rest zero. Avoids involving glibc wrappers in case they ever
112  * end up intercepting some system calls for some reason, or modify
113  * the system call number itself.
114  */
115 static long long probe_syscall(int msb, int lsb)
116 {
117 	register long long arg1 asm("rdi") = nullfd;
118 	register long long arg2 asm("rsi") = 0;
119 	register long long arg3 asm("rdx") = 0;
120 	register long long arg4 asm("r10") = 0;
121 	register long long arg5 asm("r8")  = 0;
122 	register long long arg6 asm("r9")  = 0;
123 	long long nr = ((long long)msb << 32) | (unsigned int)lsb;
124 	long long ret;
125 
126 	/*
127 	 * We pass in an extra copy of the extended system call number
128 	 * in %rbx, so we can examine it from the ptrace handler without
129 	 * worrying about it being possibly modified. This is to test
130 	 * the validity of struct user regs.orig_rax a.k.a.
131 	 * struct pt_regs.orig_ax.
132 	 */
133 	sh->probing_syscall = true;
134 	asm volatile("syscall"
135 		     : "=a" (ret)
136 		     : "a" (nr), "b" (nr),
137 		       "r" (arg1), "r" (arg2), "r" (arg3),
138 		       "r" (arg4), "r" (arg5), "r" (arg6)
139 		     : "rcx", "r11", "memory", "cc");
140 	sh->probing_syscall = false;
141 
142 	return ret;
143 }
144 
145 static const char *syscall_str(int msb, int start, int end)
146 {
147 	static char buf[64];
148 	const char * const type = (start & X32_BIT) ? "x32" : "x64";
149 	int lsb = start;
150 
151 	/*
152 	 * Improve readability by stripping the x32 bit, but round
153 	 * toward zero so we don't display -1 as -1073741825.
154 	 */
155 	if (lsb < 0)
156 		lsb |= X32_BIT;
157 	else
158 		lsb &= ~X32_BIT;
159 
160 	if (start == end)
161 		snprintf(buf, sizeof buf, "%s syscall %d:%d",
162 			 type, msb, lsb);
163 	else
164 		snprintf(buf, sizeof buf, "%s syscalls %d:%d..%d",
165 			 type, msb, lsb, lsb + (end-start));
166 
167 	return buf;
168 }
169 
170 static unsigned int _check_for(int msb, int start, int end, long long expect,
171 			       const char *expect_str)
172 {
173 	unsigned int err = 0;
174 
175 	sh->indent++;
176 	if (start != end)
177 		sh->indent++;
178 
179 	for (int nr = start; nr <= end; nr++) {
180 		long long ret = probe_syscall(msb, nr);
181 
182 		if (ret != expect) {
183 			fail("%s returned %lld, but it should have returned %s\n",
184 			       syscall_str(msb, nr, nr),
185 			       ret, expect_str);
186 			err++;
187 		}
188 	}
189 
190 	if (start != end)
191 		sh->indent--;
192 
193 	if (err) {
194 		if (start != end)
195 			fail("%s had %u failure%s\n",
196 			     syscall_str(msb, start, end),
197 			     err, err == 1 ? "s" : "");
198 	} else {
199 		ok("%s returned %s as expected\n",
200 		   syscall_str(msb, start, end), expect_str);
201 	}
202 
203 	sh->indent--;
204 
205 	return err;
206 }
207 
/*
 * Wrapper around _check_for() that also passes the expected value
 * stringified exactly as written at the call site, for use in messages.
 */
#define check_for(msb,start,end,expect) \
	_check_for(msb,start,end,expect,#expect)
210 
/*
 * Probe a single system call that is expected to return 0.
 * Returns true if the check failed.
 */
static bool check_zero(int msb, int nr)
{
	unsigned int failures = check_for(msb, nr, nr, 0);

	return failures != 0;
}
215 
/*
 * Probe a single system call that is expected to return -ENOSYS.
 * Returns true if the check failed.
 */
static bool check_enosys(int msb, int nr)
{
	unsigned int failures = check_for(msb, nr, nr, -ENOSYS);

	return failures != 0;
}
220 
221 /*
222  * Anyone diagnosing a failure will want to know whether the kernel
223  * supports x32. Tell them. This can also be used to conditionalize
224  * tests based on existence or nonexistence of x32.
225  */
226 static bool test_x32(void)
227 {
228 	long long ret;
229 	pid_t mypid = getpid();
230 
231 	run("Checking for x32 by calling x32 getpid()\n");
232 	ret = probe_syscall(0, SYS_GETPID | X32_BIT);
233 
234 	sh->indent++;
235 	if (ret == mypid) {
236 		info("x32 is supported\n");
237 		with_x32 = true;
238 	} else if (ret == -ENOSYS) {
239 		info("x32 is not supported\n");
240 		with_x32 = false;
241 	} else {
242 		fail("x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, (long long)mypid);
243 		with_x32 = false;
244 	}
245 	sh->indent--;
246 	return with_x32;
247 }
248 
249 static void test_syscalls_common(int msb)
250 {
251 	enum ptrace_pass pass = sh->ptrace_pass;
252 
253 	run("Checking some common syscalls as 64 bit\n");
254 	check_zero(msb, SYS_READ);
255 	check_zero(msb, SYS_WRITE);
256 
257 	run("Checking some 64-bit only syscalls as 64 bit\n");
258 	check_zero(msb, X64_READV);
259 	check_zero(msb, X64_WRITEV);
260 
261 	run("Checking out of range system calls\n");
262 	check_for(msb, -64, -2, -ENOSYS);
263 	if (pass >= PTP_FUZZRET)
264 		check_for(msb, -1, -1, MODIFIED_BY_PTRACE);
265 	else
266 		check_for(msb, -1, -1, -ENOSYS);
267 	check_for(msb, X32_BIT-64, X32_BIT-1, -ENOSYS);
268 	check_for(msb, -64-X32_BIT, -1-X32_BIT, -ENOSYS);
269 	check_for(msb, INT_MAX-64, INT_MAX-1, -ENOSYS);
270 }
271 
272 static void test_syscalls_with_x32(int msb)
273 {
274 	/*
275 	 * Syscalls 512-547 are "x32" syscalls.  They are
276 	 * intended to be called with the x32 (0x40000000) bit
277 	 * set.  Calling them without the x32 bit set is
278 	 * nonsense and should not work.
279 	 */
280 	run("Checking x32 syscalls as 64 bit\n");
281 	check_for(msb, 512, 547, -ENOSYS);
282 
283 	run("Checking some common syscalls as x32\n");
284 	check_zero(msb, SYS_READ   | X32_BIT);
285 	check_zero(msb, SYS_WRITE  | X32_BIT);
286 
287 	run("Checking some x32 syscalls as x32\n");
288 	check_zero(msb, X32_READV  | X32_BIT);
289 	check_zero(msb, X32_WRITEV | X32_BIT);
290 
291 	run("Checking some 64-bit syscalls as x32\n");
292 	check_enosys(msb, X64_IOCTL  | X32_BIT);
293 	check_enosys(msb, X64_READV  | X32_BIT);
294 	check_enosys(msb, X64_WRITEV | X32_BIT);
295 }
296 
297 static void test_syscalls_without_x32(int msb)
298 {
299 	run("Checking for absence of x32 system calls\n");
300 	check_for(msb, 0 | X32_BIT, 999 | X32_BIT, -ENOSYS);
301 }
302 
303 static void test_syscall_numbering(void)
304 {
305 	static const int msbs[] = {
306 		0, 1, -1, X32_BIT-1, X32_BIT, X32_BIT-1, -X32_BIT, INT_MAX,
307 		INT_MIN, INT_MIN+1
308 	};
309 
310 	sh->indent++;
311 
312 	/*
313 	 * The MSB is supposed to be ignored, so we loop over a few
314 	 * to test that out.
315 	 */
316 	for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) {
317 		int msb = msbs[i];
318 		run("Checking system calls with msb = %d (0x%x)\n",
319 		    msb, msb);
320 
321 		sh->indent++;
322 
323 		test_syscalls_common(msb);
324 		if (with_x32)
325 			test_syscalls_with_x32(msb);
326 		else
327 			test_syscalls_without_x32(msb);
328 
329 		sh->indent--;
330 	}
331 
332 	sh->indent--;
333 }
334 
335 static void syscall_numbering_tracee(void)
336 {
337 	enum ptrace_pass pass;
338 
339 	if (ptrace(PTRACE_TRACEME, 0, 0, 0)) {
340 		crit("Failed to request tracing\n");
341 		return;
342 	}
343 	raise(SIGSTOP);
344 
345 	for (sh->ptrace_pass = pass = PTP_NOTHING; pass < PTP_DONE;
346 	     sh->ptrace_pass = ++pass) {
347 		run("Running tests under ptrace: %s\n", ptrace_pass_name[pass]);
348 		test_syscall_numbering();
349 	}
350 }
351 
352 static void mess_with_syscall(pid_t testpid, enum ptrace_pass pass)
353 {
354 	struct user_regs_struct regs;
355 
356 	sh->probing_syscall = false; /* Do this on entry only */
357 
358 	/* For these, don't even getregs */
359 	if (pass == PTP_NOTHING || pass == PTP_DONE)
360 		return;
361 
362 	ptrace(PTRACE_GETREGS, testpid, NULL, &regs);
363 
364 	if (regs.orig_rax != regs.rbx) {
365 		fail("orig_rax %#llx doesn't match syscall number %#llx\n",
366 		     (unsigned long long)regs.orig_rax,
367 		     (unsigned long long)regs.rbx);
368 	}
369 
370 	switch (pass) {
371 	case PTP_GETREGS:
372 		/* Just read, no writeback */
373 		return;
374 	case PTP_WRITEBACK:
375 		/* Write back the same register state verbatim */
376 		break;
377 	case PTP_FUZZRET:
378 		regs.rax = MODIFIED_BY_PTRACE;
379 		break;
380 	case PTP_FUZZHIGH:
381 		regs.rax = MODIFIED_BY_PTRACE;
382 		regs.orig_rax = regs.orig_rax | 0xffffffff00000000ULL;
383 		break;
384 	case PTP_INTNUM:
385 		regs.rax = MODIFIED_BY_PTRACE;
386 		regs.orig_rax = (int)regs.orig_rax;
387 		break;
388 	default:
389 		crit("invalid ptrace_pass\n");
390 		break;
391 	}
392 
393 	ptrace(PTRACE_SETREGS, testpid, NULL, &regs);
394 }
395 
396 static void syscall_numbering_tracer(pid_t testpid)
397 {
398 	int wstatus;
399 
400 	do {
401 		pid_t wpid = waitpid(testpid, &wstatus, 0);
402 		if (wpid < 0 && errno != EINTR)
403 			break;
404 		if (wpid != testpid)
405 			continue;
406 		if (!WIFSTOPPED(wstatus))
407 			break;	/* Thread exited? */
408 
409 		if (sh->probing_syscall && WSTOPSIG(wstatus) == SIGTRAP)
410 			mess_with_syscall(testpid, sh->ptrace_pass);
411 	} while (sh->ptrace_pass != PTP_DONE &&
412 		 !ptrace(PTRACE_SYSCALL, testpid, NULL, NULL));
413 
414 	ptrace(PTRACE_DETACH, testpid, NULL, NULL);
415 
416 	/* Wait for the child process to terminate */
417 	while (waitpid(testpid, &wstatus, 0) != testpid || !WIFEXITED(wstatus))
418 		/* wait some more */;
419 }
420 
/*
 * Run the test set under ptrace: fork a child that runs the tests as
 * the tracee, while the calling process acts as the tracer and returns
 * once the tracer is finished. A failed fork() is fatal.
 */
static void test_traced_syscall_numbering(void)
{
	const pid_t child = fork();

	if (child == 0) {
		/* Child: run the tests, then leave without returning */
		syscall_numbering_tracee();
		_exit(0);
	}

	if (child < 0)
		crit("Unable to launch tracer process\n");

	/* Parent: child is the process to trace */
	syscall_numbering_tracer(child);
}
437 
438 int main(void)
439 {
440 	unsigned int nerr;
441 
442 	/*
443 	 * It is quite likely to get a segfault on a failure, so make
444 	 * sure the message gets out by setting stdout to nonbuffered.
445 	 */
446 	setvbuf(stdout, NULL, _IONBF, 0);
447 
448 	/*
449 	 * Harmless file descriptor to work on...
450 	 */
451 	nullfd = open("/dev/null", O_RDWR);
452 	if (nullfd < 0) {
453 		crit("Unable to open /dev/null: %s\n", strerror(errno));
454 	}
455 
456 	/*
457 	 * Set up a block of shared memory...
458 	 */
459 	sh = mmap(NULL, sysconf(_SC_PAGE_SIZE), PROT_READ|PROT_WRITE,
460 		  MAP_ANONYMOUS|MAP_SHARED, 0, 0);
461 	if (sh == MAP_FAILED) {
462 		crit("Unable to allocated shared memory block: %s\n",
463 		     strerror(errno));
464 	}
465 
466 	with_x32 = test_x32();
467 
468 	run("Running tests without ptrace...\n");
469 	test_syscall_numbering();
470 
471 	test_traced_syscall_numbering();
472 
473 	nerr = sh->nerr;
474 	if (!nerr) {
475 		ok("All system calls succeeded or failed as expected\n");
476 		return 0;
477 	} else {
478 		fail("A total of %u system call%s had incorrect behavior\n",
479 		     nerr, nerr != 1 ? "s" : "");
480 		return 1;
481 	}
482 }
483