1 /* SPDX-License-Identifier: MIT */
2 /* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
3 #include <assert.h>
4 #include <errno.h>
5 #include <error.h>
6 #include <fcntl.h>
7 #include <limits.h>
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 
15 #include <arpa/inet.h>
16 #include <linux/errqueue.h>
17 #include <linux/if_packet.h>
18 #include <linux/io_uring.h>
19 #include <linux/ipv6.h>
20 #include <linux/socket.h>
21 #include <linux/sockios.h>
22 #include <net/ethernet.h>
23 #include <net/if.h>
24 #include <netinet/in.h>
25 #include <netinet/ip.h>
26 #include <netinet/ip6.h>
27 #include <netinet/tcp.h>
28 #include <netinet/udp.h>
29 #include <sys/ioctl.h>
30 #include <sys/mman.h>
31 #include <sys/resource.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <sys/time.h>
35 #include <sys/types.h>
36 #include <sys/un.h>
37 #include <sys/wait.h>
38 
/*
 * user_data values used by do_tx() to tell completions apart:
 * NOTIF_TAG is the tag given to the registered notification slot,
 * NONZC_TAG is stamped on plain send requests and ZC_TAG on zerocopy ones.
 */
#define NOTIF_TAG 0xfffffffULL
#define NONZC_TAG 0
#define ZC_TAG 1
42 
/* Send modes; cfg_mode holds one of these (set numerically with -m). */
enum {
	/* every request is a plain IORING_OP_SEND */
	MODE_NONZC	= 0,
	/* zerocopy send from the (unregistered) payload buffer */
	MODE_ZC		= 1,
	/* zerocopy send that also sets IORING_RECVSEND_FIXED_BUF */
	MODE_ZC_FIXED	= 2,
	/* each request picks one of the three modes above via rand() % 3 */
	MODE_MIXED	= 3,
};
49 
50 static bool cfg_flush		= false;
51 static bool cfg_cork		= false;
52 static int  cfg_mode		= MODE_ZC_FIXED;
53 static int  cfg_nr_reqs		= 8;
54 static int  cfg_family		= PF_UNSPEC;
55 static int  cfg_payload_len;
56 static int  cfg_port		= 8000;
57 static int  cfg_runtime_ms	= 4200;
58 
59 static socklen_t cfg_alen;
60 static struct sockaddr_storage cfg_dst_addr;
61 
62 static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
63 
/*
 * Set of pointers describing a submission ring.
 * NOTE(review): nothing else in this file references this struct.
 */
struct io_sq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	unsigned int *flags;
	unsigned int *array;
};
72 
/*
 * Set of pointers describing a completion ring.
 * NOTE(review): nothing else in this file references this struct.
 */
struct io_cq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	struct io_uring_cqe *cqes;
};
80 
/* Userspace view of the submission queue. */
struct io_uring_sq {
	/* pointers into the mapped SQ ring, filled in by io_uring_mmap() */
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *kflags;
	unsigned int *kdropped;
	unsigned int *array;
	/* separately mapped array of submission entries */
	struct io_uring_sqe *sqes;

	/*
	 * Local indices: io_uring_get_sqe() advances sqe_tail,
	 * io_uring_submit() advances sqe_head.
	 */
	unsigned int sqe_head;
	unsigned int sqe_tail;

	/* byte length of the SQ ring mapping */
	size_t ring_sz;
};
96 
/* Userspace view of the completion queue. */
struct io_uring_cq {
	/* pointers into the mapped CQ ring, filled in by io_uring_mmap() */
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *koverflow;
	struct io_uring_cqe *cqes;

	/* byte length of the CQ ring mapping */
	size_t ring_sz;
};
107 
108 struct io_uring {
109 	struct io_uring_sq sq;
110 	struct io_uring_cq cq;
111 	int ring_fd;
112 };
113 
/*
 * Fallback io_uring syscall numbers, used only when the system headers
 * have not already defined them. Alpha gets its own set of numbers.
 */
#ifndef __NR_io_uring_setup
# ifdef __alpha__
#  define __NR_io_uring_setup		535
# else
#  define __NR_io_uring_setup		425
# endif
#endif

#ifndef __NR_io_uring_enter
# ifdef __alpha__
#  define __NR_io_uring_enter		536
# else
#  define __NR_io_uring_enter		426
# endif
#endif

#ifndef __NR_io_uring_register
# ifdef __alpha__
#  define __NR_io_uring_register	537
# else
#  define __NR_io_uring_register	427
# endif
#endif
135 
/*
 * Barriers used around accesses to the shared ring indices.
 * On x86 they expand to an empty asm with a "memory" clobber;
 * everywhere else they expand to __sync_synchronize().
 */
#if !defined(__x86_64) && !defined(__i386__)
# define read_barrier()		__sync_synchronize()
# define write_barrier()	__sync_synchronize()
#else
# define read_barrier()		__asm__ __volatile__("":::"memory")
# define write_barrier()	__asm__ __volatile__("":::"memory")
#endif
144 
/*
 * Raw io_uring_setup syscall wrapper.
 * Returns whatever syscall() returns, narrowed to int; on failure that is
 * -1 and the caller has to look at errno.
 */
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	int ring_fd = syscall(__NR_io_uring_setup, entries, p);

	return ring_fd;
}
149 
150 static int io_uring_enter(int fd, unsigned int to_submit,
151 			  unsigned int min_complete,
152 			  unsigned int flags, sigset_t *sig)
153 {
154 	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
155 			flags, sig, _NSIG / 8);
156 }
157 
158 static int io_uring_register_buffers(struct io_uring *ring,
159 				     const struct iovec *iovecs,
160 				     unsigned nr_iovecs)
161 {
162 	int ret;
163 
164 	ret = syscall(__NR_io_uring_register, ring->ring_fd,
165 		      IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
166 	return (ret < 0) ? -errno : ret;
167 }
168 
169 static int io_uring_register_notifications(struct io_uring *ring,
170 					   unsigned nr,
171 					   struct io_uring_notification_slot *slots)
172 {
173 	int ret;
174 	struct io_uring_notification_register r = {
175 		.nr_slots = nr,
176 		.data = (unsigned long)slots,
177 	};
178 
179 	ret = syscall(__NR_io_uring_register, ring->ring_fd,
180 		      IORING_REGISTER_NOTIFIERS, &r, sizeof(r));
181 	return (ret < 0) ? -errno : ret;
182 }
183 
184 static int io_uring_mmap(int fd, struct io_uring_params *p,
185 			 struct io_uring_sq *sq, struct io_uring_cq *cq)
186 {
187 	size_t size;
188 	void *ptr;
189 	int ret;
190 
191 	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
192 	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
193 		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
194 	if (ptr == MAP_FAILED)
195 		return -errno;
196 	sq->khead = ptr + p->sq_off.head;
197 	sq->ktail = ptr + p->sq_off.tail;
198 	sq->kring_mask = ptr + p->sq_off.ring_mask;
199 	sq->kring_entries = ptr + p->sq_off.ring_entries;
200 	sq->kflags = ptr + p->sq_off.flags;
201 	sq->kdropped = ptr + p->sq_off.dropped;
202 	sq->array = ptr + p->sq_off.array;
203 
204 	size = p->sq_entries * sizeof(struct io_uring_sqe);
205 	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
206 			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
207 	if (sq->sqes == MAP_FAILED) {
208 		ret = -errno;
209 err:
210 		munmap(sq->khead, sq->ring_sz);
211 		return ret;
212 	}
213 
214 	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
215 	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
216 			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
217 	if (ptr == MAP_FAILED) {
218 		ret = -errno;
219 		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
220 		goto err;
221 	}
222 	cq->khead = ptr + p->cq_off.head;
223 	cq->ktail = ptr + p->cq_off.tail;
224 	cq->kring_mask = ptr + p->cq_off.ring_mask;
225 	cq->kring_entries = ptr + p->cq_off.ring_entries;
226 	cq->koverflow = ptr + p->cq_off.overflow;
227 	cq->cqes = ptr + p->cq_off.cqes;
228 	return 0;
229 }
230 
231 static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
232 			       unsigned flags)
233 {
234 	struct io_uring_params p;
235 	int fd, ret;
236 
237 	memset(ring, 0, sizeof(*ring));
238 	memset(&p, 0, sizeof(p));
239 	p.flags = flags;
240 
241 	fd = io_uring_setup(entries, &p);
242 	if (fd < 0)
243 		return fd;
244 	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
245 	if (!ret)
246 		ring->ring_fd = fd;
247 	else
248 		close(fd);
249 	return ret;
250 }
251 
252 static int io_uring_submit(struct io_uring *ring)
253 {
254 	struct io_uring_sq *sq = &ring->sq;
255 	const unsigned mask = *sq->kring_mask;
256 	unsigned ktail, submitted, to_submit;
257 	int ret;
258 
259 	read_barrier();
260 	if (*sq->khead != *sq->ktail) {
261 		submitted = *sq->kring_entries;
262 		goto submit;
263 	}
264 	if (sq->sqe_head == sq->sqe_tail)
265 		return 0;
266 
267 	ktail = *sq->ktail;
268 	to_submit = sq->sqe_tail - sq->sqe_head;
269 	for (submitted = 0; submitted < to_submit; submitted++) {
270 		read_barrier();
271 		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
272 	}
273 	if (!submitted)
274 		return 0;
275 
276 	if (*sq->ktail != ktail) {
277 		write_barrier();
278 		*sq->ktail = ktail;
279 		write_barrier();
280 	}
281 submit:
282 	ret = io_uring_enter(ring->ring_fd, submitted, 0,
283 				IORING_ENTER_GETEVENTS, NULL);
284 	return ret < 0 ? -errno : ret;
285 }
286 
/*
 * Fill @sqe as an IORING_OP_SEND request that sends @len bytes from @buf on
 * @sockfd with the given send @flags. Every other field of @sqe is zeroed.
 */
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	memset(sqe, 0, sizeof *sqe);

	sqe->fd = sockfd;
	sqe->addr = (unsigned long) buf;
	sqe->len = len;
	sqe->msg_flags = (__u32) flags;
	sqe->opcode = (__u8) IORING_OP_SEND;
}
297 
298 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
299 				        const void *buf, size_t len, int flags,
300 				        unsigned slot_idx, unsigned zc_flags)
301 {
302 	io_uring_prep_send(sqe, sockfd, buf, len, flags);
303 	sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF;
304 	sqe->notification_idx = slot_idx;
305 	sqe->ioprio = zc_flags;
306 }
307 
308 static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
309 {
310 	struct io_uring_sq *sq = &ring->sq;
311 
312 	if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
313 		return NULL;
314 	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
315 }
316 
317 static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
318 {
319 	struct io_uring_cq *cq = &ring->cq;
320 	const unsigned mask = *cq->kring_mask;
321 	unsigned head = *cq->khead;
322 	int ret;
323 
324 	*cqe_ptr = NULL;
325 	do {
326 		read_barrier();
327 		if (head != *cq->ktail) {
328 			*cqe_ptr = &cq->cqes[head & mask];
329 			break;
330 		}
331 		ret = io_uring_enter(ring->ring_fd, 0, 1,
332 					IORING_ENTER_GETEVENTS, NULL);
333 		if (ret < 0)
334 			return -errno;
335 	} while (1);
336 
337 	return 0;
338 }
339 
340 static inline void io_uring_cqe_seen(struct io_uring *ring)
341 {
342 	*(&ring->cq)->khead += 1;
343 	write_barrier();
344 }
345 
/*
 * Current wall-clock time from gettimeofday(), in milliseconds.
 *
 * The conversion is done in unsigned long so that it wraps instead of
 * hitting signed overflow where tv_sec is a 32-bit type.
 */
static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return ((unsigned long) tv.tv_sec * 1000UL) +
	       ((unsigned long) tv.tv_usec / 1000UL);
}
353 
/* Set an int-valued socket option on @fd; exits with a message on failure. */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc != 0)
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
359 
360 static int do_setup_tx(int domain, int type, int protocol)
361 {
362 	int fd;
363 
364 	fd = socket(domain, type, protocol);
365 	if (fd == -1)
366 		error(1, errno, "socket t");
367 
368 	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
369 
370 	if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
371 		error(1, errno, "connect");
372 	return fd;
373 }
374 
375 static void do_tx(int domain, int type, int protocol)
376 {
377 	struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}};
378 	struct io_uring_sqe *sqe;
379 	struct io_uring_cqe *cqe;
380 	unsigned long packets = 0, bytes = 0;
381 	struct io_uring ring;
382 	struct iovec iov;
383 	uint64_t tstop;
384 	int i, fd, ret;
385 	int compl_cqes = 0;
386 
387 	fd = do_setup_tx(domain, type, protocol);
388 
389 	ret = io_uring_queue_init(512, &ring, 0);
390 	if (ret)
391 		error(1, ret, "io_uring: queue init");
392 
393 	ret = io_uring_register_notifications(&ring, 1, b);
394 	if (ret)
395 		error(1, ret, "io_uring: tx ctx registration");
396 
397 	iov.iov_base = payload;
398 	iov.iov_len = cfg_payload_len;
399 
400 	ret = io_uring_register_buffers(&ring, &iov, 1);
401 	if (ret)
402 		error(1, ret, "io_uring: buffer registration");
403 
404 	tstop = gettimeofday_ms() + cfg_runtime_ms;
405 	do {
406 		if (cfg_cork)
407 			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
408 
409 		for (i = 0; i < cfg_nr_reqs; i++) {
410 			unsigned zc_flags = 0;
411 			unsigned buf_idx = 0;
412 			unsigned slot_idx = 0;
413 			unsigned mode = cfg_mode;
414 			unsigned msg_flags = 0;
415 
416 			if (cfg_mode == MODE_MIXED)
417 				mode = rand() % 3;
418 
419 			sqe = io_uring_get_sqe(&ring);
420 
421 			if (mode == MODE_NONZC) {
422 				io_uring_prep_send(sqe, fd, payload,
423 						   cfg_payload_len, msg_flags);
424 				sqe->user_data = NONZC_TAG;
425 			} else {
426 				if (cfg_flush) {
427 					zc_flags |= IORING_RECVSEND_NOTIF_FLUSH;
428 					compl_cqes++;
429 				}
430 				io_uring_prep_sendzc(sqe, fd, payload,
431 						     cfg_payload_len,
432 						     msg_flags, slot_idx, zc_flags);
433 				if (mode == MODE_ZC_FIXED) {
434 					sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
435 					sqe->buf_index = buf_idx;
436 				}
437 				sqe->user_data = ZC_TAG;
438 			}
439 		}
440 
441 		ret = io_uring_submit(&ring);
442 		if (ret != cfg_nr_reqs)
443 			error(1, ret, "submit");
444 
445 		for (i = 0; i < cfg_nr_reqs; i++) {
446 			ret = io_uring_wait_cqe(&ring, &cqe);
447 			if (ret)
448 				error(1, ret, "wait cqe");
449 
450 			if (cqe->user_data == NOTIF_TAG) {
451 				compl_cqes--;
452 				i--;
453 			} else if (cqe->user_data != NONZC_TAG &&
454 				   cqe->user_data != ZC_TAG) {
455 				error(1, cqe->res, "invalid user_data");
456 			} else if (cqe->res <= 0 && cqe->res != -EAGAIN) {
457 				error(1, cqe->res, "send failed");
458 			} else {
459 				if (cqe->res > 0) {
460 					packets++;
461 					bytes += cqe->res;
462 				}
463 				/* failed requests don't flush */
464 				if (cfg_flush &&
465 				    cqe->res <= 0 &&
466 				    cqe->user_data == ZC_TAG)
467 					compl_cqes--;
468 			}
469 			io_uring_cqe_seen(&ring);
470 		}
471 		if (cfg_cork)
472 			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
473 	} while (gettimeofday_ms() < tstop);
474 
475 	if (close(fd))
476 		error(1, errno, "close");
477 
478 	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
479 			packets, bytes >> 20,
480 			packets / (cfg_runtime_ms / 1000),
481 			(bytes >> 20) / (cfg_runtime_ms / 1000));
482 
483 	while (compl_cqes) {
484 		ret = io_uring_wait_cqe(&ring, &cqe);
485 		if (ret)
486 			error(1, ret, "wait cqe");
487 		io_uring_cqe_seen(&ring);
488 		compl_cqes--;
489 	}
490 }
491 
492 static void do_test(int domain, int type, int protocol)
493 {
494 	int i;
495 
496 	for (i = 0; i < IP_MAXPACKET; i++)
497 		payload[i] = 'a' + (i % 26);
498 	do_tx(domain, type, protocol);
499 }
500 
/*
 * Print the command-line synopsis and exit with status 1.
 * The option list mirrors the getopt string in parse_opts() and the test
 * names accepted by main().
 */
static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s [-f] [-n<N>] [-c<0|1>] [-m<mode 0-3>] "
		    "[-p<port>] [-s<payload size>] (-4|-6) [-t<time s>] "
		    "-D<dst_ip> (tcp|udp)", filepath);
}
506 
507 static void parse_opts(int argc, char **argv)
508 {
509 	const int max_payload_len = sizeof(payload) -
510 				    sizeof(struct ipv6hdr) -
511 				    sizeof(struct tcphdr) -
512 				    40 /* max tcp options */;
513 	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
514 	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
515 	char *daddr = NULL;
516 	int c;
517 
518 	if (argc <= 1)
519 		usage(argv[0]);
520 	cfg_payload_len = max_payload_len;
521 
522 	while ((c = getopt(argc, argv, "46D:p:s:t:n:fc:m:")) != -1) {
523 		switch (c) {
524 		case '4':
525 			if (cfg_family != PF_UNSPEC)
526 				error(1, 0, "Pass one of -4 or -6");
527 			cfg_family = PF_INET;
528 			cfg_alen = sizeof(struct sockaddr_in);
529 			break;
530 		case '6':
531 			if (cfg_family != PF_UNSPEC)
532 				error(1, 0, "Pass one of -4 or -6");
533 			cfg_family = PF_INET6;
534 			cfg_alen = sizeof(struct sockaddr_in6);
535 			break;
536 		case 'D':
537 			daddr = optarg;
538 			break;
539 		case 'p':
540 			cfg_port = strtoul(optarg, NULL, 0);
541 			break;
542 		case 's':
543 			cfg_payload_len = strtoul(optarg, NULL, 0);
544 			break;
545 		case 't':
546 			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
547 			break;
548 		case 'n':
549 			cfg_nr_reqs = strtoul(optarg, NULL, 0);
550 			break;
551 		case 'f':
552 			cfg_flush = 1;
553 			break;
554 		case 'c':
555 			cfg_cork = strtol(optarg, NULL, 0);
556 			break;
557 		case 'm':
558 			cfg_mode = strtol(optarg, NULL, 0);
559 			break;
560 		}
561 	}
562 
563 	switch (cfg_family) {
564 	case PF_INET:
565 		memset(addr4, 0, sizeof(*addr4));
566 		addr4->sin_family = AF_INET;
567 		addr4->sin_port = htons(cfg_port);
568 		if (daddr &&
569 		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
570 			error(1, 0, "ipv4 parse error: %s", daddr);
571 		break;
572 	case PF_INET6:
573 		memset(addr6, 0, sizeof(*addr6));
574 		addr6->sin6_family = AF_INET6;
575 		addr6->sin6_port = htons(cfg_port);
576 		if (daddr &&
577 		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
578 			error(1, 0, "ipv6 parse error: %s", daddr);
579 		break;
580 	default:
581 		error(1, 0, "illegal domain");
582 	}
583 
584 	if (cfg_payload_len > max_payload_len)
585 		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
586 	if (cfg_mode == MODE_NONZC && cfg_flush)
587 		error(1, 0, "-f: only zerocopy modes support notifications");
588 	if (optind != argc - 1)
589 		usage(argv[0]);
590 }
591 
592 int main(int argc, char **argv)
593 {
594 	const char *cfg_test = argv[argc - 1];
595 
596 	parse_opts(argc, argv);
597 
598 	if (!strcmp(cfg_test, "tcp"))
599 		do_test(cfg_family, SOCK_STREAM, 0);
600 	else if (!strcmp(cfg_test, "udp"))
601 		do_test(cfg_family, SOCK_DGRAM, 0);
602 	else
603 		error(1, 0, "unknown cfg_test %s", cfg_test);
604 	return 0;
605 }
606