1 /* SPDX-License-Identifier: MIT */
2 /* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
3 #include <assert.h>
4 #include <errno.h>
5 #include <error.h>
6 #include <fcntl.h>
7 #include <limits.h>
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 
15 #include <arpa/inet.h>
16 #include <linux/errqueue.h>
17 #include <linux/if_packet.h>
18 #include <linux/io_uring.h>
19 #include <linux/ipv6.h>
20 #include <linux/socket.h>
21 #include <linux/sockios.h>
22 #include <net/ethernet.h>
23 #include <net/if.h>
24 #include <netinet/in.h>
25 #include <netinet/ip.h>
26 #include <netinet/ip6.h>
27 #include <netinet/tcp.h>
28 #include <netinet/udp.h>
29 #include <sys/ioctl.h>
30 #include <sys/mman.h>
31 #include <sys/resource.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <sys/time.h>
35 #include <sys/types.h>
36 #include <sys/un.h>
37 #include <sys/wait.h>
38 
/*
 * user_data tags stamped on submitted requests so completions can be
 * attributed: NONZC_TAG for plain sends, ZC_TAG for zerocopy sends.
 * NOTE(review): NOTIF_TAG is not referenced anywhere in the visible code.
 */
#define NOTIF_TAG 0xfffffffULL
#define NONZC_TAG 0
#define ZC_TAG 1
42 
/* Send modes selectable with -m; see the request preparation in do_tx(). */
enum {
	MODE_NONZC	= 0,	/* regular IORING_OP_SEND */
	MODE_ZC		= 1,	/* IORING_OP_SEND_ZC from the plain payload pointer */
	MODE_ZC_FIXED	= 2,	/* IORING_OP_SEND_ZC using the registered buffer */
	MODE_MIXED	= 3,	/* pick one of the three modes above at random per request */
};
49 
/* Run-time configuration; defaults below are overridden by parse_opts(). */
static bool cfg_cork		= false;	/* toggle UDP_CORK around each batch */
static int  cfg_mode		= MODE_ZC_FIXED;	/* one of the MODE_* values */
static int  cfg_nr_reqs		= 8;		/* requests submitted per batch */
static int  cfg_family		= PF_UNSPEC;	/* set by -4 / -6 */
static int  cfg_payload_len;			/* bytes sent per request */
static int  cfg_port		= 8000;		/* destination port */
static int  cfg_runtime_ms	= 4200;		/* how long do_tx() keeps sending */

/* Destination address and its length, filled in by parse_opts(). */
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;

/* Send buffer, 4096-byte aligned; do_tx() also registers it with the ring. */
static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
62 
/*
 * Pointer view of a submission ring.
 * NOTE(review): not referenced by any code visible in this file; the
 * functions below use struct io_uring_sq instead.
 */
struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};
71 
/*
 * Pointer view of a completion ring.
 * NOTE(review): not referenced by any code visible in this file; the
 * functions below use struct io_uring_cq instead.
 */
struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};
79 
/*
 * Submission queue state. The k* pointers and array point into the SQ ring
 * mapping set up by io_uring_mmap(); sqes points to the separately mapped
 * SQE array.
 */
struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;
	struct io_uring_sqe *sqes;

	/*
	 * Application-side cursors: io_uring_get_sqe() advances sqe_tail,
	 * io_uring_submit() advances sqe_head as entries are published.
	 */
	unsigned sqe_head;
	unsigned sqe_tail;

	/* Length in bytes of the SQ ring mapping. */
	size_t ring_sz;
};
95 
/*
 * Completion queue state. All pointers point into the CQ ring mapping set
 * up by io_uring_mmap().
 */
struct io_uring_cq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	/* Length in bytes of the CQ ring mapping. */
	size_t ring_sz;
};
106 
/* One io_uring instance: both queues plus the fd from io_uring_setup(). */
struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};
112 
/*
 * Fallback io_uring syscall numbers, used only when the system headers do
 * not already define them. Alpha gets its own set of numbers; every other
 * architecture uses 425-427.
 */
#ifdef __alpha__
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		535
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		536
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	537
# endif
#else /* !__alpha__ */
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		425
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		426
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	427
# endif
#endif
134 
/*
 * Barriers used around accesses to the ring memory shared with the kernel.
 * On x86 both expand to an empty asm statement with a "memory" clobber
 * (a compiler-only barrier); elsewhere they use __sync_synchronize().
 */
#if defined(__x86_64) || defined(__i386__)
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#else

#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#endif
143 
io_uring_setup(unsigned int entries,struct io_uring_params * p)144 static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
145 {
146 	return syscall(__NR_io_uring_setup, entries, p);
147 }
148 
/*
 * Thin wrapper around the io_uring_enter syscall.
 *
 * The size of the signal mask is always passed as _NSIG / 8 bytes. Returns
 * the raw syscall() result (-1 with errno set on failure).
 */
static int io_uring_enter(int fd, unsigned int to_submit,
			  unsigned int min_complete,
			  unsigned int flags, sigset_t *sig)
{
	const int sigset_bytes = _NSIG / 8;
	long rc;

	rc = syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		     flags, sig, sigset_bytes);
	return (int) rc;
}
156 
io_uring_register_buffers(struct io_uring * ring,const struct iovec * iovecs,unsigned nr_iovecs)157 static int io_uring_register_buffers(struct io_uring *ring,
158 				     const struct iovec *iovecs,
159 				     unsigned nr_iovecs)
160 {
161 	int ret;
162 
163 	ret = syscall(__NR_io_uring_register, ring->ring_fd,
164 		      IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
165 	return (ret < 0) ? -errno : ret;
166 }
167 
io_uring_mmap(int fd,struct io_uring_params * p,struct io_uring_sq * sq,struct io_uring_cq * cq)168 static int io_uring_mmap(int fd, struct io_uring_params *p,
169 			 struct io_uring_sq *sq, struct io_uring_cq *cq)
170 {
171 	size_t size;
172 	void *ptr;
173 	int ret;
174 
175 	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
176 	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
177 		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
178 	if (ptr == MAP_FAILED)
179 		return -errno;
180 	sq->khead = ptr + p->sq_off.head;
181 	sq->ktail = ptr + p->sq_off.tail;
182 	sq->kring_mask = ptr + p->sq_off.ring_mask;
183 	sq->kring_entries = ptr + p->sq_off.ring_entries;
184 	sq->kflags = ptr + p->sq_off.flags;
185 	sq->kdropped = ptr + p->sq_off.dropped;
186 	sq->array = ptr + p->sq_off.array;
187 
188 	size = p->sq_entries * sizeof(struct io_uring_sqe);
189 	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
190 			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
191 	if (sq->sqes == MAP_FAILED) {
192 		ret = -errno;
193 err:
194 		munmap(sq->khead, sq->ring_sz);
195 		return ret;
196 	}
197 
198 	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
199 	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
200 			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
201 	if (ptr == MAP_FAILED) {
202 		ret = -errno;
203 		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
204 		goto err;
205 	}
206 	cq->khead = ptr + p->cq_off.head;
207 	cq->ktail = ptr + p->cq_off.tail;
208 	cq->kring_mask = ptr + p->cq_off.ring_mask;
209 	cq->kring_entries = ptr + p->cq_off.ring_entries;
210 	cq->koverflow = ptr + p->cq_off.overflow;
211 	cq->cqes = ptr + p->cq_off.cqes;
212 	return 0;
213 }
214 
io_uring_queue_init(unsigned entries,struct io_uring * ring,unsigned flags)215 static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
216 			       unsigned flags)
217 {
218 	struct io_uring_params p;
219 	int fd, ret;
220 
221 	memset(ring, 0, sizeof(*ring));
222 	memset(&p, 0, sizeof(p));
223 	p.flags = flags;
224 
225 	fd = io_uring_setup(entries, &p);
226 	if (fd < 0)
227 		return fd;
228 	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
229 	if (!ret)
230 		ring->ring_fd = fd;
231 	else
232 		close(fd);
233 	return ret;
234 }
235 
/*
 * Publish all SQEs handed out by io_uring_get_sqe() since the last call
 * and enter the kernel to submit them.
 *
 * Returns 0 when there is nothing to submit, -errno if io_uring_enter()
 * fails, and otherwise the value returned by io_uring_enter().
 */
static int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned mask = *sq->kring_mask;
	unsigned ktail, submitted, to_submit;
	int ret;

	read_barrier();
	/*
	 * Entries published earlier are still pending in the ring: do not
	 * queue anything new, just enter the kernel asking it to take up to
	 * a full ring worth of entries.
	 */
	if (*sq->khead != *sq->ktail) {
		submitted = *sq->kring_entries;
		goto submit;
	}
	/* No SQEs were prepared since the last submit. */
	if (sq->sqe_head == sq->sqe_tail)
		return 0;

	/* Fill the index array with the SQE slots prepared since last time. */
	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	/* Store the new tail, with write barriers before and after the store. */
	if (*sq->ktail != ktail) {
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
				IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}
270 
io_uring_prep_send(struct io_uring_sqe * sqe,int sockfd,const void * buf,size_t len,int flags)271 static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
272 				      const void *buf, size_t len, int flags)
273 {
274 	memset(sqe, 0, sizeof(*sqe));
275 	sqe->opcode = (__u8) IORING_OP_SEND;
276 	sqe->fd = sockfd;
277 	sqe->addr = (unsigned long) buf;
278 	sqe->len = len;
279 	sqe->msg_flags = (__u32) flags;
280 }
281 
io_uring_prep_sendzc(struct io_uring_sqe * sqe,int sockfd,const void * buf,size_t len,int flags,unsigned zc_flags)282 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
283 				        const void *buf, size_t len, int flags,
284 				        unsigned zc_flags)
285 {
286 	io_uring_prep_send(sqe, sockfd, buf, len, flags);
287 	sqe->opcode = (__u8) IORING_OP_SEND_ZC;
288 	sqe->ioprio = zc_flags;
289 }
290 
io_uring_get_sqe(struct io_uring * ring)291 static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
292 {
293 	struct io_uring_sq *sq = &ring->sq;
294 
295 	if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
296 		return NULL;
297 	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
298 }
299 
io_uring_wait_cqe(struct io_uring * ring,struct io_uring_cqe ** cqe_ptr)300 static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
301 {
302 	struct io_uring_cq *cq = &ring->cq;
303 	const unsigned mask = *cq->kring_mask;
304 	unsigned head = *cq->khead;
305 	int ret;
306 
307 	*cqe_ptr = NULL;
308 	do {
309 		read_barrier();
310 		if (head != *cq->ktail) {
311 			*cqe_ptr = &cq->cqes[head & mask];
312 			break;
313 		}
314 		ret = io_uring_enter(ring->ring_fd, 0, 1,
315 					IORING_ENTER_GETEVENTS, NULL);
316 		if (ret < 0)
317 			return -errno;
318 	} while (1);
319 
320 	return 0;
321 }
322 
io_uring_cqe_seen(struct io_uring * ring)323 static inline void io_uring_cqe_seen(struct io_uring *ring)
324 {
325 	*(&ring->cq)->khead += 1;
326 	write_barrier();
327 }
328 
/* Current wall-clock time from gettimeofday(), expressed in milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return now.tv_sec * 1000 + now.tv_usec / 1000;
}
336 
/*
 * Set an integer socket option, terminating the program with an error
 * message if setsockopt() fails.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc != 0)
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
342 
do_setup_tx(int domain,int type,int protocol)343 static int do_setup_tx(int domain, int type, int protocol)
344 {
345 	int fd;
346 
347 	fd = socket(domain, type, protocol);
348 	if (fd == -1)
349 		error(1, errno, "socket t");
350 
351 	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
352 
353 	if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
354 		error(1, errno, "connect");
355 	return fd;
356 }
357 
do_tx(int domain,int type,int protocol)358 static void do_tx(int domain, int type, int protocol)
359 {
360 	struct io_uring_sqe *sqe;
361 	struct io_uring_cqe *cqe;
362 	unsigned long packets = 0, bytes = 0;
363 	struct io_uring ring;
364 	struct iovec iov;
365 	uint64_t tstop;
366 	int i, fd, ret;
367 	int compl_cqes = 0;
368 
369 	fd = do_setup_tx(domain, type, protocol);
370 
371 	ret = io_uring_queue_init(512, &ring, 0);
372 	if (ret)
373 		error(1, ret, "io_uring: queue init");
374 
375 	iov.iov_base = payload;
376 	iov.iov_len = cfg_payload_len;
377 
378 	ret = io_uring_register_buffers(&ring, &iov, 1);
379 	if (ret)
380 		error(1, ret, "io_uring: buffer registration");
381 
382 	tstop = gettimeofday_ms() + cfg_runtime_ms;
383 	do {
384 		if (cfg_cork)
385 			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
386 
387 		for (i = 0; i < cfg_nr_reqs; i++) {
388 			unsigned zc_flags = 0;
389 			unsigned buf_idx = 0;
390 			unsigned mode = cfg_mode;
391 			unsigned msg_flags = MSG_WAITALL;
392 
393 			if (cfg_mode == MODE_MIXED)
394 				mode = rand() % 3;
395 
396 			sqe = io_uring_get_sqe(&ring);
397 
398 			if (mode == MODE_NONZC) {
399 				io_uring_prep_send(sqe, fd, payload,
400 						   cfg_payload_len, msg_flags);
401 				sqe->user_data = NONZC_TAG;
402 			} else {
403 				io_uring_prep_sendzc(sqe, fd, payload,
404 						     cfg_payload_len,
405 						     msg_flags, zc_flags);
406 				if (mode == MODE_ZC_FIXED) {
407 					sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
408 					sqe->buf_index = buf_idx;
409 				}
410 				sqe->user_data = ZC_TAG;
411 			}
412 		}
413 
414 		ret = io_uring_submit(&ring);
415 		if (ret != cfg_nr_reqs)
416 			error(1, ret, "submit");
417 
418 		if (cfg_cork)
419 			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
420 		for (i = 0; i < cfg_nr_reqs; i++) {
421 			ret = io_uring_wait_cqe(&ring, &cqe);
422 			if (ret)
423 				error(1, ret, "wait cqe");
424 
425 			if (cqe->user_data != NONZC_TAG &&
426 			    cqe->user_data != ZC_TAG)
427 				error(1, -EINVAL, "invalid cqe->user_data");
428 
429 			if (cqe->flags & IORING_CQE_F_NOTIF) {
430 				if (cqe->flags & IORING_CQE_F_MORE)
431 					error(1, -EINVAL, "invalid notif flags");
432 				if (compl_cqes <= 0)
433 					error(1, -EINVAL, "notification mismatch");
434 				compl_cqes--;
435 				i--;
436 				io_uring_cqe_seen(&ring);
437 				continue;
438 			}
439 			if (cqe->flags & IORING_CQE_F_MORE) {
440 				if (cqe->user_data != ZC_TAG)
441 					error(1, cqe->res, "unexpected F_MORE");
442 				compl_cqes++;
443 			}
444 			if (cqe->res >= 0) {
445 				packets++;
446 				bytes += cqe->res;
447 			} else if (cqe->res != -EAGAIN) {
448 				error(1, cqe->res, "send failed");
449 			}
450 			io_uring_cqe_seen(&ring);
451 		}
452 	} while (gettimeofday_ms() < tstop);
453 
454 	while (compl_cqes) {
455 		ret = io_uring_wait_cqe(&ring, &cqe);
456 		if (ret)
457 			error(1, ret, "wait cqe");
458 		if (cqe->flags & IORING_CQE_F_MORE)
459 			error(1, -EINVAL, "invalid notif flags");
460 		if (!(cqe->flags & IORING_CQE_F_NOTIF))
461 			error(1, -EINVAL, "missing notif flag");
462 
463 		io_uring_cqe_seen(&ring);
464 		compl_cqes--;
465 	}
466 
467 	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
468 			packets, bytes >> 20,
469 			packets / (cfg_runtime_ms / 1000),
470 			(bytes >> 20) / (cfg_runtime_ms / 1000));
471 
472 	if (close(fd))
473 		error(1, errno, "close");
474 }
475 
do_test(int domain,int type,int protocol)476 static void do_test(int domain, int type, int protocol)
477 {
478 	int i;
479 
480 	for (i = 0; i < IP_MAXPACKET; i++)
481 		payload[i] = 'a' + (i % 26);
482 	do_tx(domain, type, protocol);
483 }
484 
/*
 * Print the command line synopsis for @filepath and exit with status 1.
 *
 * The protocol name is shown last because it is the single positional
 * argument, and every option accepted by parse_opts() is listed,
 * including -c.
 */
static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s (-4|-6) -D<dst_ip> [-s<payload size>] "
		    "[-t<time s>] [-n<batch>] [-p<port>] [-m<mode>] "
		    "[-c<cork>] (udp|tcp)", filepath);
}
490 
parse_opts(int argc,char ** argv)491 static void parse_opts(int argc, char **argv)
492 {
493 	const int max_payload_len = sizeof(payload) -
494 				    sizeof(struct ipv6hdr) -
495 				    sizeof(struct tcphdr) -
496 				    40 /* max tcp options */;
497 	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
498 	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
499 	char *daddr = NULL;
500 	int c;
501 
502 	if (argc <= 1)
503 		usage(argv[0]);
504 	cfg_payload_len = max_payload_len;
505 
506 	while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) {
507 		switch (c) {
508 		case '4':
509 			if (cfg_family != PF_UNSPEC)
510 				error(1, 0, "Pass one of -4 or -6");
511 			cfg_family = PF_INET;
512 			cfg_alen = sizeof(struct sockaddr_in);
513 			break;
514 		case '6':
515 			if (cfg_family != PF_UNSPEC)
516 				error(1, 0, "Pass one of -4 or -6");
517 			cfg_family = PF_INET6;
518 			cfg_alen = sizeof(struct sockaddr_in6);
519 			break;
520 		case 'D':
521 			daddr = optarg;
522 			break;
523 		case 'p':
524 			cfg_port = strtoul(optarg, NULL, 0);
525 			break;
526 		case 's':
527 			cfg_payload_len = strtoul(optarg, NULL, 0);
528 			break;
529 		case 't':
530 			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
531 			break;
532 		case 'n':
533 			cfg_nr_reqs = strtoul(optarg, NULL, 0);
534 			break;
535 		case 'c':
536 			cfg_cork = strtol(optarg, NULL, 0);
537 			break;
538 		case 'm':
539 			cfg_mode = strtol(optarg, NULL, 0);
540 			break;
541 		}
542 	}
543 
544 	switch (cfg_family) {
545 	case PF_INET:
546 		memset(addr4, 0, sizeof(*addr4));
547 		addr4->sin_family = AF_INET;
548 		addr4->sin_port = htons(cfg_port);
549 		if (daddr &&
550 		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
551 			error(1, 0, "ipv4 parse error: %s", daddr);
552 		break;
553 	case PF_INET6:
554 		memset(addr6, 0, sizeof(*addr6));
555 		addr6->sin6_family = AF_INET6;
556 		addr6->sin6_port = htons(cfg_port);
557 		if (daddr &&
558 		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
559 			error(1, 0, "ipv6 parse error: %s", daddr);
560 		break;
561 	default:
562 		error(1, 0, "illegal domain");
563 	}
564 
565 	if (cfg_payload_len > max_payload_len)
566 		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
567 	if (optind != argc - 1)
568 		usage(argv[0]);
569 }
570 
main(int argc,char ** argv)571 int main(int argc, char **argv)
572 {
573 	const char *cfg_test = argv[argc - 1];
574 
575 	parse_opts(argc, argv);
576 
577 	if (!strcmp(cfg_test, "tcp"))
578 		do_test(cfg_family, SOCK_STREAM, 0);
579 	else if (!strcmp(cfg_test, "udp"))
580 		do_test(cfg_family, SOCK_DGRAM, 0);
581 	else
582 		error(1, 0, "unknown cfg_test %s", cfg_test);
583 	return 0;
584 }
585