// SPDX-License-Identifier: LGPL-2.1
#define _GNU_SOURCE
#include <assert.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <unistd.h>
#include <poll.h>
#include <sys/types.h>
#include <signal.h>
#include <errno.h>
#include <stddef.h>
#include <stdbool.h>	/* bool is used by the per-cpu buffer push/pop helpers below. */
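/*
 * Note: glibc 2.30 and later declare their own gettid() in <unistd.h>; on
 * such systems this static wrapper may clash with the libc declaration and
 * would need to be renamed.
 */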
static inline pid_t gettid(void)
{
	return syscall(__NR_gettid);
}

#define NR_INJECT	9
static int loop_cnt[NR_INJECT + 1];

static int loop_cnt_1 asm("asm_loop_cnt_1") __attribute__((used));
static int loop_cnt_2 asm("asm_loop_cnt_2") __attribute__((used));
static int loop_cnt_3 asm("asm_loop_cnt_3") __attribute__((used));
static int loop_cnt_4 asm("asm_loop_cnt_4") __attribute__((used));
static int loop_cnt_5 asm("asm_loop_cnt_5") __attribute__((used));
static int loop_cnt_6 asm("asm_loop_cnt_6") __attribute__((used));

static int opt_modulo, verbose;

static int opt_yield, opt_signal, opt_sleep,
		opt_disable_rseq, opt_threads = 200,
		opt_disable_mod = 0, opt_test = 's', opt_mb = 0;

#ifndef RSEQ_SKIP_FASTPATH
static long long opt_reps = 5000;
#else
static long long opt_reps = 100;
#endif

static __thread __attribute__((tls_model("initial-exec")))
unsigned int signals_delivered;

#ifndef BENCHMARK

static __thread __attribute__((tls_model("initial-exec"), unused))
unsigned int yield_mod_cnt, nr_abort;

#define printf_verbose(fmt, ...)			\
	do {						\
		if (verbose)				\
			printf(fmt, ## __VA_ARGS__);	\
	} while (0)

#if defined(__x86_64__) || defined(__i386__)

#define INJECT_ASM_REG	"eax"

#define RSEQ_INJECT_CLOBBER \
	, INJECT_ASM_REG

#ifdef __i386__

#define RSEQ_INJECT_ASM(n) \
	"mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
	"jz 333f\n\t" \
	"222:\n\t" \
	"dec %%" INJECT_ASM_REG "\n\t" \
	"jnz 222b\n\t" \
	"333:\n\t"

#elif defined(__x86_64__)

#define RSEQ_INJECT_ASM(n) \
	"lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG "\n\t" \
	"mov (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
	"test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
	"jz 333f\n\t" \
	"222:\n\t" \
	"dec %%" INJECT_ASM_REG "\n\t" \
	"jnz 222b\n\t" \
	"333:\n\t"

#else
#error "Unsupported architecture"
#endif

#elif defined(__s390__)

#define RSEQ_INJECT_INPUT \
	, [loop_cnt_1]"m"(loop_cnt[1]) \
	, [loop_cnt_2]"m"(loop_cnt[2]) \
	, [loop_cnt_3]"m"(loop_cnt[3]) \
	, [loop_cnt_4]"m"(loop_cnt[4]) \
	, [loop_cnt_5]"m"(loop_cnt[5]) \
	, [loop_cnt_6]"m"(loop_cnt[6])

#define INJECT_ASM_REG	"r12"

#define RSEQ_INJECT_CLOBBER \
	, INJECT_ASM_REG

#define RSEQ_INJECT_ASM(n) \
	"l %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
	"ltr %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG "\n\t" \
	"je 333f\n\t" \
	"222:\n\t" \
	"ahi %%" INJECT_ASM_REG ", -1\n\t" \
	"jnz 222b\n\t" \
	"333:\n\t"

#elif defined(__ARMEL__)

#define RSEQ_INJECT_INPUT \
	, [loop_cnt_1]"m"(loop_cnt[1]) \
	, [loop_cnt_2]"m"(loop_cnt[2]) \
	, [loop_cnt_3]"m"(loop_cnt[3]) \
	, [loop_cnt_4]"m"(loop_cnt[4]) \
	, [loop_cnt_5]"m"(loop_cnt[5]) \
	, [loop_cnt_6]"m"(loop_cnt[6])

#define INJECT_ASM_REG	"r4"

#define RSEQ_INJECT_CLOBBER \
	, INJECT_ASM_REG

#define RSEQ_INJECT_ASM(n) \
	"ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
	"cmp " INJECT_ASM_REG ", #0\n\t" \
	"beq 333f\n\t" \
	"222:\n\t" \
	"subs " INJECT_ASM_REG ", #1\n\t" \
	"bne 222b\n\t" \
	"333:\n\t"

#elif defined(__AARCH64EL__)

#define RSEQ_INJECT_INPUT \
	, [loop_cnt_1] "Qo" (loop_cnt[1]) \
	, [loop_cnt_2] "Qo" (loop_cnt[2]) \
	, [loop_cnt_3] "Qo" (loop_cnt[3]) \
	, [loop_cnt_4] "Qo" (loop_cnt[4]) \
	, [loop_cnt_5] "Qo" (loop_cnt[5]) \
	, [loop_cnt_6] "Qo" (loop_cnt[6])

#define INJECT_ASM_REG	RSEQ_ASM_TMP_REG32

#define RSEQ_INJECT_ASM(n) \
	"	ldr	" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n"	\
	"	cbz	" INJECT_ASM_REG ", 333f\n"			\
	"222:\n"							\
	"	sub	" INJECT_ASM_REG ", " INJECT_ASM_REG ", #1\n"	\
	"	cbnz	" INJECT_ASM_REG ", 222b\n"			\
	"333:\n"

#elif defined(__PPC__)

#define RSEQ_INJECT_INPUT \
	, [loop_cnt_1]"m"(loop_cnt[1]) \
	, [loop_cnt_2]"m"(loop_cnt[2]) \
	, [loop_cnt_3]"m"(loop_cnt[3]) \
	, [loop_cnt_4]"m"(loop_cnt[4]) \
	, [loop_cnt_5]"m"(loop_cnt[5]) \
	, [loop_cnt_6]"m"(loop_cnt[6])

#define INJECT_ASM_REG	"r18"

#define RSEQ_INJECT_CLOBBER \
	, INJECT_ASM_REG

#define RSEQ_INJECT_ASM(n) \
	"lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
	"cmpwi %%" INJECT_ASM_REG ", 0\n\t" \
	"beq 333f\n\t" \
	"222:\n\t" \
	"subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \
	"bne 222b\n\t" \
	"333:\n\t"

#elif defined(__mips__)

#define RSEQ_INJECT_INPUT \
	, [loop_cnt_1]"m"(loop_cnt[1]) \
	, [loop_cnt_2]"m"(loop_cnt[2]) \
	, [loop_cnt_3]"m"(loop_cnt[3]) \
	, [loop_cnt_4]"m"(loop_cnt[4]) \
	, [loop_cnt_5]"m"(loop_cnt[5]) \
	, [loop_cnt_6]"m"(loop_cnt[6])

#define INJECT_ASM_REG	"$5"

#define RSEQ_INJECT_CLOBBER \
	, INJECT_ASM_REG

#define RSEQ_INJECT_ASM(n) \
	"lw " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
	"beqz " INJECT_ASM_REG ", 333f\n\t" \
	"222:\n\t" \
	"addiu " INJECT_ASM_REG ", -1\n\t" \
	"bnez " INJECT_ASM_REG ", 222b\n\t" \
	"333:\n\t"

#else
#error unsupported target
#endif

#define RSEQ_INJECT_FAILED \
	nr_abort++;

#define RSEQ_INJECT_C(n) \
{ \
	int loc_i, loc_nr_loops = loop_cnt[n]; \
	\
	for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \
		rseq_barrier(); \
	} \
	if (loc_nr_loops == -1 && opt_modulo) { \
		if (yield_mod_cnt == opt_modulo - 1) { \
			if (opt_sleep > 0) \
				poll(NULL, 0, opt_sleep); \
			if (opt_yield) \
				sched_yield(); \
			if (opt_signal) \
				raise(SIGUSR1); \
			yield_mod_cnt = 0; \
		} else { \
			yield_mod_cnt++; \
		} \
	} \
}

#else

#define printf_verbose(fmt, ...)

#endif /* BENCHMARK */

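/*
 * The RSEQ_INJECT_ASM()/RSEQ_INJECT_C()/RSEQ_INJECT_FAILED hooks defined
 * above are consumed by rseq.h (included below) because they are defined
 * first: the rseq operations expand them at numbered injection points, so a
 * non-zero loop_cnt[n] inserts a busy-wait delay at point n, a value of -1
 * combined with -m triggers the yield/sleep/signal path in RSEQ_INJECT_C(),
 * and RSEQ_INJECT_FAILED counts aborts.
 */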
#include "rseq.h"

struct percpu_lock_entry {
	intptr_t v;
} __attribute__((aligned(128)));

struct percpu_lock {
	struct percpu_lock_entry c[CPU_SETSIZE];
};

struct test_data_entry {
	intptr_t count;
} __attribute__((aligned(128)));

struct spinlock_test_data {
	struct percpu_lock lock;
	struct test_data_entry c[CPU_SETSIZE];
};

struct spinlock_thread_test_data {
	struct spinlock_test_data *data;
	long long reps;
	int reg;
};

struct inc_test_data {
	struct test_data_entry c[CPU_SETSIZE];
};

struct inc_thread_test_data {
	struct inc_test_data *data;
	long long reps;
	int reg;
};

struct percpu_list_node {
	intptr_t data;
	struct percpu_list_node *next;
};

struct percpu_list_entry {
	struct percpu_list_node *head;
} __attribute__((aligned(128)));

struct percpu_list {
	struct percpu_list_entry c[CPU_SETSIZE];
};

#define BUFFER_ITEM_PER_CPU	100

struct percpu_buffer_node {
	intptr_t data;
};

struct percpu_buffer_entry {
	intptr_t offset;
	intptr_t buflen;
	struct percpu_buffer_node **array;
} __attribute__((aligned(128)));

struct percpu_buffer {
	struct percpu_buffer_entry c[CPU_SETSIZE];
};

#define MEMCPY_BUFFER_ITEM_PER_CPU	100

struct percpu_memcpy_buffer_node {
	intptr_t data1;
	uint64_t data2;
};

struct percpu_memcpy_buffer_entry {
	intptr_t offset;
	intptr_t buflen;
	struct percpu_memcpy_buffer_node *array;
} __attribute__((aligned(128)));

struct percpu_memcpy_buffer {
	struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
};

/* A simple percpu spinlock. Grabs lock on current cpu. */
static int rseq_this_cpu_lock(struct percpu_lock *lock)
{
	int cpu;

	for (;;) {
		int ret;

		cpu = rseq_cpu_start();
		ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
					 0, 1, cpu);
		if (rseq_likely(!ret))
			break;
		/* Retry if comparison fails or rseq aborts. */
	}
	/*
	 * Acquire semantic when taking lock after control dependency.
	 * Matches rseq_smp_store_release().
	 */
	rseq_smp_acquire__after_ctrl_dep();
	return cpu;
}

static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
{
	assert(lock->c[cpu].v == 1);
	/*
	 * Release lock, with release semantic. Matches
	 * rseq_smp_acquire__after_ctrl_dep().
	 */
	rseq_smp_store_release(&lock->c[cpu].v, 0);
}
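
/*
 * Minimal usage sketch for the per-cpu lock (illustrative only, not used by
 * the tests): lock whichever CPU the thread currently runs on, update data
 * owned by that CPU, then release. The counters array is hypothetical.
 */
static inline void __attribute__((unused))
percpu_lock_usage_sketch(struct percpu_lock *lock, intptr_t *counters)
{
	int cpu = rseq_this_cpu_lock(lock);

	counters[cpu]++;
	rseq_percpu_unlock(lock, cpu);
}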

void *test_percpu_spinlock_thread(void *arg)
{
	struct spinlock_thread_test_data *thread_data = arg;
	struct spinlock_test_data *data = thread_data->data;
	long long i, reps;

	if (!opt_disable_rseq && thread_data->reg &&
	    rseq_register_current_thread())
		abort();
	reps = thread_data->reps;
	for (i = 0; i < reps; i++) {
		int cpu = rseq_cpu_start();

		cpu = rseq_this_cpu_lock(&data->lock);
		data->c[cpu].count++;
		rseq_percpu_unlock(&data->lock, cpu);
#ifndef BENCHMARK
		if (i != 0 && (reps / 10) && !(i % (reps / 10)))
			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
#endif
	}
	printf_verbose("tid %d: number of rseq aborts: %d, signals delivered: %u\n",
		       (int) gettid(), nr_abort, signals_delivered);
	if (!opt_disable_rseq && thread_data->reg &&
	    rseq_unregister_current_thread())
		abort();
	return NULL;
}

/*
 * A simple test which implements a sharded counter using a per-cpu
 * lock.  Obviously real applications might prefer to simply use a
 * per-cpu increment; however, this is reasonable for a test and the
 * lock can be extended to synchronize more complicated operations.
 */
void test_percpu_spinlock(void)
{
	const int num_threads = opt_threads;
	int i, ret;
	uint64_t sum;
	pthread_t test_threads[num_threads];
	struct spinlock_test_data data;
	struct spinlock_thread_test_data thread_data[num_threads];

	memset(&data, 0, sizeof(data));
	for (i = 0; i < num_threads; i++) {
		thread_data[i].reps = opt_reps;
		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
			thread_data[i].reg = 1;
		else
			thread_data[i].reg = 0;
		thread_data[i].data = &data;
		ret = pthread_create(&test_threads[i], NULL,
				     test_percpu_spinlock_thread,
				     &thread_data[i]);
		if (ret) {
			errno = ret;
			perror("pthread_create");
			abort();
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_join(test_threads[i], NULL);
		if (ret) {
			errno = ret;
			perror("pthread_join");
			abort();
		}
	}

	sum = 0;
	for (i = 0; i < CPU_SETSIZE; i++)
		sum += data.c[i].count;

	assert(sum == (uint64_t)opt_reps * num_threads);
}

void *test_percpu_inc_thread(void *arg)
{
	struct inc_thread_test_data *thread_data = arg;
	struct inc_test_data *data = thread_data->data;
	long long i, reps;

	if (!opt_disable_rseq && thread_data->reg &&
	    rseq_register_current_thread())
		abort();
	reps = thread_data->reps;
	for (i = 0; i < reps; i++) {
		int ret;

		do {
			int cpu;

			cpu = rseq_cpu_start();
			ret = rseq_addv(&data->c[cpu].count, 1, cpu);
		} while (rseq_unlikely(ret));
#ifndef BENCHMARK
		if (i != 0 && (reps / 10) && !(i % (reps / 10)))
			printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
#endif
	}
	printf_verbose("tid %d: number of rseq aborts: %d, signals delivered: %u\n",
		       (int) gettid(), nr_abort, signals_delivered);
	if (!opt_disable_rseq && thread_data->reg &&
	    rseq_unregister_current_thread())
		abort();
	return NULL;
}

void test_percpu_inc(void)
{
	const int num_threads = opt_threads;
	int i, ret;
	uint64_t sum;
	pthread_t test_threads[num_threads];
	struct inc_test_data data;
	struct inc_thread_test_data thread_data[num_threads];

	memset(&data, 0, sizeof(data));
	for (i = 0; i < num_threads; i++) {
		thread_data[i].reps = opt_reps;
		if (opt_disable_mod <= 0 || (i % opt_disable_mod))
			thread_data[i].reg = 1;
		else
			thread_data[i].reg = 0;
		thread_data[i].data = &data;
		ret = pthread_create(&test_threads[i], NULL,
				     test_percpu_inc_thread,
				     &thread_data[i]);
		if (ret) {
			errno = ret;
			perror("pthread_create");
			abort();
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_join(test_threads[i], NULL);
		if (ret) {
			errno = ret;
			perror("pthread_join");
			abort();
		}
	}

	sum = 0;
	for (i = 0; i < CPU_SETSIZE; i++)
		sum += data.c[i].count;

	assert(sum == (uint64_t)opt_reps * num_threads);
}

void this_cpu_list_push(struct percpu_list *list,
			struct percpu_list_node *node,
			int *_cpu)
{
	int cpu;

	for (;;) {
		intptr_t *targetptr, newval, expect;
		int ret;

		cpu = rseq_cpu_start();
		/* Load list->c[cpu].head with single-copy atomicity. */
		expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
		newval = (intptr_t)node;
		targetptr = (intptr_t *)&list->c[cpu].head;
		node->next = (struct percpu_list_node *)expect;
		ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
		if (rseq_likely(!ret))
			break;
		/* Retry if comparison fails or rseq aborts. */
	}
	if (_cpu)
		*_cpu = cpu;
}

/*
 * Unlike a traditional lock-less linked list, the availability of an
 * rseq primitive allows us to implement pop without concerns over
 * ABA-type races.
 */
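/*
 * With a cmpxchg-based lockless pop, another thread could pop the head and
 * push it back (or push a recycled node with the same address) between the
 * load of head->next and the cmpxchg, which would then succeed with a stale
 * next pointer: the classic ABA problem. Here the load of the head, the
 * dereference of head->next and the store of the new head all run in one
 * rseq critical section bound to the owning CPU; preemption, migration or
 * signal delivery aborts and restarts the sequence, and each per-cpu list is
 * only modified from its owning CPU, so no such interleaving can occur.
 */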
struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
					   int *_cpu)
{
	struct percpu_list_node *node = NULL;
	int cpu;

	for (;;) {
		struct percpu_list_node *head;
		intptr_t *targetptr, expectnot, *load;
		off_t offset;
		int ret;

		cpu = rseq_cpu_start();
		targetptr = (intptr_t *)&list->c[cpu].head;
		expectnot = (intptr_t)NULL;
		offset = offsetof(struct percpu_list_node, next);
		load = (intptr_t *)&head;
		ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
						   offset, load, cpu);
		if (rseq_likely(!ret)) {
			node = head;
			break;
		}
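		/* A positive return means the head was NULL: the list is empty. */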
		if (ret > 0)
			break;
		/* Retry if rseq aborts. */
	}
	if (_cpu)
		*_cpu = cpu;
	return node;
}

/*
 * __percpu_list_pop is not safe against concurrent accesses. Should
 * only be used on lists that are not concurrently modified.
 */
struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
{
	struct percpu_list_node *node;

	node = list->c[cpu].head;
	if (!node)
		return NULL;
	list->c[cpu].head = node->next;
	return node;
}

void *test_percpu_list_thread(void *arg)
{
	long long i, reps;
	struct percpu_list *list = (struct percpu_list *)arg;

	if (!opt_disable_rseq && rseq_register_current_thread())
		abort();

	reps = opt_reps;
	for (i = 0; i < reps; i++) {
		struct percpu_list_node *node;

		node = this_cpu_list_pop(list, NULL);
		if (opt_yield)
			sched_yield();  /* encourage shuffling */
		if (node)
			this_cpu_list_push(list, node, NULL);
	}

	printf_verbose("tid %d: number of rseq aborts: %d, signals delivered: %u\n",
		       (int) gettid(), nr_abort, signals_delivered);
	if (!opt_disable_rseq && rseq_unregister_current_thread())
		abort();

	return NULL;
}

/* Simultaneous modification to a per-cpu linked list from many threads.  */
void test_percpu_list(void)
{
	const int num_threads = opt_threads;
	int i, j, ret;
	uint64_t sum = 0, expected_sum = 0;
	struct percpu_list list;
	pthread_t test_threads[num_threads];
	cpu_set_t allowed_cpus;

	memset(&list, 0, sizeof(list));

	/* Generate list entries for every usable cpu. */
	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
	for (i = 0; i < CPU_SETSIZE; i++) {
		if (!CPU_ISSET(i, &allowed_cpus))
			continue;
		for (j = 1; j <= 100; j++) {
			struct percpu_list_node *node;

			expected_sum += j;

			node = malloc(sizeof(*node));
			assert(node);
			node->data = j;
			node->next = list.c[i].head;
			list.c[i].head = node;
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_create(&test_threads[i], NULL,
				     test_percpu_list_thread, &list);
		if (ret) {
			errno = ret;
			perror("pthread_create");
			abort();
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_join(test_threads[i], NULL);
		if (ret) {
			errno = ret;
			perror("pthread_join");
			abort();
		}
	}

	for (i = 0; i < CPU_SETSIZE; i++) {
		struct percpu_list_node *node;

		if (!CPU_ISSET(i, &allowed_cpus))
			continue;

		while ((node = __percpu_list_pop(&list, i))) {
			sum += node->data;
			free(node);
		}
	}

	/*
	 * All entries should now be accounted for (unless some external
	 * actor is interfering with our allowed affinity while this
	 * test is running).
	 */
	assert(sum == expected_sum);
}

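/*
 * Push is a two-store rseq operation: the node pointer is written
 * speculatively into the free array slot, and the final store to offset
 * commits the critical section. If the sequence aborts before that final
 * store, the speculative store is harmless because offset still excludes
 * the slot. With -M, the _release variant makes the commit a store-release.
 */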
bool this_cpu_buffer_push(struct percpu_buffer *buffer,
			  struct percpu_buffer_node *node,
			  int *_cpu)
{
	bool result = false;
	int cpu;

	for (;;) {
		intptr_t *targetptr_spec, newval_spec;
		intptr_t *targetptr_final, newval_final;
		intptr_t offset;
		int ret;

		cpu = rseq_cpu_start();
		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
		if (offset == buffer->c[cpu].buflen)
			break;
		newval_spec = (intptr_t)node;
		targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
		newval_final = offset + 1;
		targetptr_final = &buffer->c[cpu].offset;
		if (opt_mb)
			ret = rseq_cmpeqv_trystorev_storev_release(
				targetptr_final, offset, targetptr_spec,
				newval_spec, newval_final, cpu);
		else
			ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
				offset, targetptr_spec, newval_spec,
				newval_final, cpu);
		if (rseq_likely(!ret)) {
			result = true;
			break;
		}
		/* Retry if comparison fails or rseq aborts. */
	}
	if (_cpu)
		*_cpu = cpu;
	return result;
}

struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
					       int *_cpu)
{
	struct percpu_buffer_node *head;
	int cpu;

	for (;;) {
		intptr_t *targetptr, newval;
		intptr_t offset;
		int ret;

		cpu = rseq_cpu_start();
		/* Load offset with single-copy atomicity. */
		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
		if (offset == 0) {
			head = NULL;
			break;
		}
		head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
		newval = offset - 1;
		targetptr = (intptr_t *)&buffer->c[cpu].offset;
		ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
			(intptr_t *)&buffer->c[cpu].array[offset - 1],
			(intptr_t)head, newval, cpu);
		if (rseq_likely(!ret))
			break;
		/* Retry if comparison fails or rseq aborts. */
	}
	if (_cpu)
		*_cpu = cpu;
	return head;
}

/*
 * __percpu_buffer_pop is not safe against concurrent accesses. Should
 * only be used on buffers that are not concurrently modified.
 */
struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
					       int cpu)
{
	struct percpu_buffer_node *head;
	intptr_t offset;

	offset = buffer->c[cpu].offset;
	if (offset == 0)
		return NULL;
	head = buffer->c[cpu].array[offset - 1];
	buffer->c[cpu].offset = offset - 1;
	return head;
}

void *test_percpu_buffer_thread(void *arg)
{
	long long i, reps;
	struct percpu_buffer *buffer = (struct percpu_buffer *)arg;

	if (!opt_disable_rseq && rseq_register_current_thread())
		abort();

	reps = opt_reps;
	for (i = 0; i < reps; i++) {
		struct percpu_buffer_node *node;

		node = this_cpu_buffer_pop(buffer, NULL);
		if (opt_yield)
			sched_yield();  /* encourage shuffling */
		if (node) {
			if (!this_cpu_buffer_push(buffer, node, NULL)) {
				/* Should increase buffer size. */
				abort();
			}
		}
	}

	printf_verbose("tid %d: number of rseq aborts: %d, signals delivered: %u\n",
		       (int) gettid(), nr_abort, signals_delivered);
	if (!opt_disable_rseq && rseq_unregister_current_thread())
		abort();

	return NULL;
}

/* Simultaneous modification to a per-cpu buffer from many threads.  */
void test_percpu_buffer(void)
{
	const int num_threads = opt_threads;
	int i, j, ret;
	uint64_t sum = 0, expected_sum = 0;
	struct percpu_buffer buffer;
	pthread_t test_threads[num_threads];
	cpu_set_t allowed_cpus;

	memset(&buffer, 0, sizeof(buffer));

	/* Generate buffer entries for every usable cpu. */
	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
	for (i = 0; i < CPU_SETSIZE; i++) {
		if (!CPU_ISSET(i, &allowed_cpus))
			continue;
		/* Worst case is every item ending up on the same CPU. */
		buffer.c[i].array =
			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
			       BUFFER_ITEM_PER_CPU);
		assert(buffer.c[i].array);
		buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
		for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
			struct percpu_buffer_node *node;

			expected_sum += j;

			/*
			 * We could theoretically put the word-sized
			 * "data" directly in the buffer. However, we
			 * want to model objects that would not fit
			 * within a single word, so allocate an object
			 * for each node.
			 */
			node = malloc(sizeof(*node));
			assert(node);
			node->data = j;
			buffer.c[i].array[j - 1] = node;
			buffer.c[i].offset++;
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_create(&test_threads[i], NULL,
				     test_percpu_buffer_thread, &buffer);
		if (ret) {
			errno = ret;
			perror("pthread_create");
			abort();
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_join(test_threads[i], NULL);
		if (ret) {
			errno = ret;
			perror("pthread_join");
			abort();
		}
	}

	for (i = 0; i < CPU_SETSIZE; i++) {
		struct percpu_buffer_node *node;

		if (!CPU_ISSET(i, &allowed_cpus))
			continue;

		while ((node = __percpu_buffer_pop(&buffer, i))) {
			sum += node->data;
			free(node);
		}
		free(buffer.c[i].array);
	}

	/*
	 * All entries should now be accounted for (unless some external
	 * actor is interfering with our allowed affinity while this
	 * test is running).
	 */
	assert(sum == expected_sum);
}

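/*
 * The memcpy variants move whole two-word items by value: the copy into the
 * target slot is the speculative step and the final store to offset is again
 * the commit point. As noted below, the copied length must stay under 4 kB,
 * which sizeof(struct percpu_memcpy_buffer_node) easily satisfies.
 */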
bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
				 struct percpu_memcpy_buffer_node item,
				 int *_cpu)
{
	bool result = false;
	int cpu;

	for (;;) {
		intptr_t *targetptr_final, newval_final, offset;
		char *destptr, *srcptr;
		size_t copylen;
		int ret;

		cpu = rseq_cpu_start();
		/* Load offset with single-copy atomicity. */
		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
		if (offset == buffer->c[cpu].buflen)
			break;
		destptr = (char *)&buffer->c[cpu].array[offset];
		srcptr = (char *)&item;
		/* copylen must be <= 4kB. */
		copylen = sizeof(item);
		newval_final = offset + 1;
		targetptr_final = &buffer->c[cpu].offset;
		if (opt_mb)
			ret = rseq_cmpeqv_trymemcpy_storev_release(
				targetptr_final, offset,
				destptr, srcptr, copylen,
				newval_final, cpu);
		else
			ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
				offset, destptr, srcptr, copylen,
				newval_final, cpu);
		if (rseq_likely(!ret)) {
			result = true;
			break;
		}
		/* Retry if comparison fails or rseq aborts. */
	}
	if (_cpu)
		*_cpu = cpu;
	return result;
}

bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
				struct percpu_memcpy_buffer_node *item,
				int *_cpu)
{
	bool result = false;
	int cpu;

	for (;;) {
		intptr_t *targetptr_final, newval_final, offset;
		char *destptr, *srcptr;
		size_t copylen;
		int ret;

		cpu = rseq_cpu_start();
		/* Load offset with single-copy atomicity. */
		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
		if (offset == 0)
			break;
		destptr = (char *)item;
		srcptr = (char *)&buffer->c[cpu].array[offset - 1];
		/* copylen must be <= 4kB. */
		copylen = sizeof(*item);
		newval_final = offset - 1;
		targetptr_final = &buffer->c[cpu].offset;
		ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
			offset, destptr, srcptr, copylen,
			newval_final, cpu);
		if (rseq_likely(!ret)) {
			result = true;
			break;
		}
		/* Retry if comparison fails or rseq aborts. */
	}
	if (_cpu)
		*_cpu = cpu;
	return result;
}

/*
 * __percpu_memcpy_buffer_pop is not safe against concurrent accesses. Should
 * only be used on buffers that are not concurrently modified.
 */
bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
				struct percpu_memcpy_buffer_node *item,
				int cpu)
{
	intptr_t offset;

	offset = buffer->c[cpu].offset;
	if (offset == 0)
		return false;
	memcpy(item, &buffer->c[cpu].array[offset - 1], sizeof(*item));
	buffer->c[cpu].offset = offset - 1;
	return true;
}

void *test_percpu_memcpy_buffer_thread(void *arg)
{
	long long i, reps;
	struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;

	if (!opt_disable_rseq && rseq_register_current_thread())
		abort();

	reps = opt_reps;
	for (i = 0; i < reps; i++) {
		struct percpu_memcpy_buffer_node item;
		bool result;

		result = this_cpu_memcpy_buffer_pop(buffer, &item, NULL);
		if (opt_yield)
			sched_yield();  /* encourage shuffling */
		if (result) {
			if (!this_cpu_memcpy_buffer_push(buffer, item, NULL)) {
				/* Should increase buffer size. */
				abort();
			}
		}
	}

	printf_verbose("tid %d: number of rseq aborts: %d, signals delivered: %u\n",
		       (int) gettid(), nr_abort, signals_delivered);
	if (!opt_disable_rseq && rseq_unregister_current_thread())
		abort();

	return NULL;
}

/* Simultaneous modification to a per-cpu buffer from many threads.  */
void test_percpu_memcpy_buffer(void)
{
	const int num_threads = opt_threads;
	int i, j, ret;
	uint64_t sum = 0, expected_sum = 0;
	struct percpu_memcpy_buffer buffer;
	pthread_t test_threads[num_threads];
	cpu_set_t allowed_cpus;

	memset(&buffer, 0, sizeof(buffer));

	/* Generate buffer entries for every usable cpu. */
	sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
	for (i = 0; i < CPU_SETSIZE; i++) {
		if (!CPU_ISSET(i, &allowed_cpus))
			continue;
		/* Worst case is every item ending up on the same CPU. */
		buffer.c[i].array =
			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
			       MEMCPY_BUFFER_ITEM_PER_CPU);
		assert(buffer.c[i].array);
		buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
		for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
			expected_sum += 2 * j + 1;

			/*
			 * Unlike the pointer-based buffer above, items
			 * here are stored in the array by value: each
			 * two-word node models an object larger than a
			 * single word, copied with memcpy inside the
			 * rseq critical section.
			 */
			buffer.c[i].array[j - 1].data1 = j;
			buffer.c[i].array[j - 1].data2 = j + 1;
			buffer.c[i].offset++;
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_create(&test_threads[i], NULL,
				     test_percpu_memcpy_buffer_thread,
				     &buffer);
		if (ret) {
			errno = ret;
			perror("pthread_create");
			abort();
		}
	}

	for (i = 0; i < num_threads; i++) {
		ret = pthread_join(test_threads[i], NULL);
		if (ret) {
			errno = ret;
			perror("pthread_join");
			abort();
		}
	}

	for (i = 0; i < CPU_SETSIZE; i++) {
		struct percpu_memcpy_buffer_node item;

		if (!CPU_ISSET(i, &allowed_cpus))
			continue;

		while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
			sum += item.data1;
			sum += item.data2;
		}
		free(buffer.c[i].array);
	}

	/*
	 * All entries should now be accounted for (unless some external
	 * actor is interfering with our allowed affinity while this
	 * test is running).
	 */
	assert(sum == expected_sum);
}

static void test_signal_interrupt_handler(int signo)
{
	signals_delivered++;
}

static int set_signal_handler(void)
{
	int ret = 0;
	struct sigaction sa;
	sigset_t sigset;

	ret = sigemptyset(&sigset);
	if (ret < 0) {
		perror("sigemptyset");
		return ret;
	}

	sa.sa_handler = test_signal_interrupt_handler;
	sa.sa_mask = sigset;
	sa.sa_flags = 0;
	ret = sigaction(SIGUSR1, &sa, NULL);
	if (ret < 0) {
		perror("sigaction");
		return ret;
	}

	printf_verbose("Signal handler set for SIGUSR1\n");

	return ret;
}

static void show_usage(int argc, char **argv)
{
	printf("Usage : %s <OPTIONS>\n",
		argv[0]);
	printf("OPTIONS:\n");
	printf("	[-1 loops] Number of loops for delay injection 1\n");
	printf("	[-2 loops] Number of loops for delay injection 2\n");
	printf("	[-3 loops] Number of loops for delay injection 3\n");
	printf("	[-4 loops] Number of loops for delay injection 4\n");
	printf("	[-5 loops] Number of loops for delay injection 5\n");
	printf("	[-6 loops] Number of loops for delay injection 6\n");
	printf("	[-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n");
	printf("	[-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n");
	printf("	[-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n");
	printf("	[-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n");
	printf("	[-y] Yield\n");
	printf("	[-k] Kill thread with signal\n");
	printf("	[-s S] S: =0: disabled (default), >0: sleep time (ms)\n");
	printf("	[-t N] Number of threads (default 200)\n");
	printf("	[-r N] Number of repetitions per thread (default 5000)\n");
	printf("	[-d] Disable rseq system call (no initialization)\n");
	printf("	[-D M] Disable rseq for each M threads\n");
	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
	printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
	printf("	[-v] Verbose output.\n");
	printf("	[-h] Show this help.\n");
	printf("\n");
}
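
/*
 * Example invocations (assuming the built binary is named param_test):
 *
 *   ./param_test -T s -t 16 -r 10000 -v    # spinlock test, verbose
 *   ./param_test -T l -y                   # list test with yields
 *   ./param_test -T m -M -7 -1 -m 10 -k    # memcpy test with barriers and
 *                                          # signal injection every 10th pass
 */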

int main(int argc, char **argv)
{
	int i;

	for (i = 1; i < argc; i++) {
		if (argv[i][0] != '-')
			continue;
		switch (argv[i][1]) {
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
			if (argc < i + 2) {
				show_usage(argc, argv);
				goto error;
			}
			loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]);
			i++;
			break;
		case 'm':
			if (argc < i + 2) {
				show_usage(argc, argv);
				goto error;
			}
			opt_modulo = atol(argv[i + 1]);
			if (opt_modulo < 0) {
				show_usage(argc, argv);
				goto error;
			}
			i++;
			break;
		case 's':
			if (argc < i + 2) {
				show_usage(argc, argv);
				goto error;
			}
			opt_sleep = atol(argv[i + 1]);
			if (opt_sleep < 0) {
				show_usage(argc, argv);
				goto error;
			}
			i++;
			break;
		case 'y':
			opt_yield = 1;
			break;
		case 'k':
			opt_signal = 1;
			break;
		case 'd':
			opt_disable_rseq = 1;
			break;
		case 'D':
			if (argc < i + 2) {
				show_usage(argc, argv);
				goto error;
			}
			opt_disable_mod = atol(argv[i + 1]);
			if (opt_disable_mod < 0) {
				show_usage(argc, argv);
				goto error;
			}
			i++;
			break;
		case 't':
			if (argc < i + 2) {
				show_usage(argc, argv);
				goto error;
			}
			opt_threads = atol(argv[i + 1]);
			if (opt_threads < 0) {
				show_usage(argc, argv);
				goto error;
			}
			i++;
			break;
		case 'r':
			if (argc < i + 2) {
				show_usage(argc, argv);
				goto error;
			}
			opt_reps = atoll(argv[i + 1]);
			if (opt_reps < 0) {
				show_usage(argc, argv);
				goto error;
			}
			i++;
			break;
		case 'h':
			show_usage(argc, argv);
			goto end;
		case 'T':
			if (argc < i + 2) {
				show_usage(argc, argv);
				goto error;
			}
			opt_test = *argv[i + 1];
			switch (opt_test) {
			case 's':
			case 'l':
			case 'i':
			case 'b':
			case 'm':
				break;
			default:
				show_usage(argc, argv);
				goto error;
			}
			i++;
			break;
		case 'v':
			verbose = 1;
			break;
		case 'M':
			opt_mb = 1;
			break;
		default:
			show_usage(argc, argv);
			goto error;
		}
	}

	loop_cnt_1 = loop_cnt[1];
	loop_cnt_2 = loop_cnt[2];
	loop_cnt_3 = loop_cnt[3];
	loop_cnt_4 = loop_cnt[4];
	loop_cnt_5 = loop_cnt[5];
	loop_cnt_6 = loop_cnt[6];

	if (set_signal_handler())
		goto error;

	if (!opt_disable_rseq && rseq_register_current_thread())
		goto error;
	switch (opt_test) {
	case 's':
		printf_verbose("spinlock\n");
		test_percpu_spinlock();
		break;
	case 'l':
		printf_verbose("linked list\n");
		test_percpu_list();
		break;
	case 'b':
		printf_verbose("buffer\n");
		test_percpu_buffer();
		break;
	case 'm':
		printf_verbose("memcpy buffer\n");
		test_percpu_memcpy_buffer();
		break;
	case 'i':
		printf_verbose("counter increment\n");
		test_percpu_inc();
		break;
	}
	if (!opt_disable_rseq && rseq_unregister_current_thread())
		abort();
end:
	return 0;

error:
	return -1;
}