1 /* SPDX-License-Identifier: MIT */
2 /* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
3 #include <assert.h>
4 #include <errno.h>
5 #include <error.h>
6 #include <fcntl.h>
7 #include <limits.h>
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14
15 #include <arpa/inet.h>
16 #include <linux/errqueue.h>
17 #include <linux/if_packet.h>
18 #include <linux/io_uring.h>
19 #include <linux/ipv6.h>
20 #include <linux/socket.h>
21 #include <linux/sockios.h>
22 #include <net/ethernet.h>
23 #include <net/if.h>
24 #include <netinet/in.h>
25 #include <netinet/ip.h>
26 #include <netinet/ip6.h>
27 #include <netinet/tcp.h>
28 #include <netinet/udp.h>
29 #include <sys/ioctl.h>
30 #include <sys/mman.h>
31 #include <sys/resource.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <sys/time.h>
35 #include <sys/types.h>
36 #include <sys/un.h>
37 #include <sys/wait.h>
38
/*
 * user_data tags stored in submitted SQEs and checked on the matching CQEs.
 * NOTIF_TAG is not referenced in the visible part of this file.
 */
#define NOTIF_TAG 0x0fffffffULL
#define NONZC_TAG 0	/* set on IORING_OP_SEND requests */
#define ZC_TAG 1	/* set on IORING_OP_SEND_ZC requests */
42
/* Send modes selectable with -m; the numeric values are the option values. */
enum {
	MODE_NONZC = 0,	/* IORING_OP_SEND */
	MODE_ZC,	/* IORING_OP_SEND_ZC */
	MODE_ZC_FIXED,	/* IORING_OP_SEND_ZC using the registered buffer */
	MODE_MIXED,	/* one of the three modes above, chosen per request */
};
49
50 static bool cfg_cork = false;
51 static int cfg_mode = MODE_ZC_FIXED;
52 static int cfg_nr_reqs = 8;
53 static int cfg_family = PF_UNSPEC;
54 static int cfg_payload_len;
55 static int cfg_port = 8000;
56 static int cfg_runtime_ms = 4200;
57
58 static socklen_t cfg_alen;
59 static struct sockaddr_storage cfg_dst_addr;
60
61 static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
62
/*
 * Set of pointers describing a submission ring.
 * NOTE(review): no code in the visible part of this file uses this struct.
 */
struct io_sq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	unsigned int *flags;
	unsigned int *array;
};
71
/*
 * Set of pointers describing a completion ring.
 * NOTE(review): no code in the visible part of this file uses this struct.
 */
struct io_cq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	struct io_uring_cqe *cqes;
};
79
/*
 * Submission queue state.
 *
 * The k* pointers and 'array' are set by io_uring_mmap() to locations inside
 * the SQ ring mapping; 'sqes' is the separately mapped SQE array.
 */
struct io_uring_sq {
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *kflags;
	unsigned int *kdropped;
	unsigned int *array;
	struct io_uring_sqe *sqes;

	/*
	 * Local counters: io_uring_get_sqe() advances sqe_tail and
	 * io_uring_submit() advances sqe_head.
	 */
	unsigned int sqe_head;
	unsigned int sqe_tail;

	/* length in bytes of the SQ ring mapping */
	size_t ring_sz;
};
95
/*
 * Completion queue state.
 *
 * All pointers are set by io_uring_mmap() to locations inside the CQ ring
 * mapping.
 */
struct io_uring_cq {
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *koverflow;
	struct io_uring_cqe *cqes;

	/* length in bytes of the CQ ring mapping */
	size_t ring_sz;
};
106
107 struct io_uring {
108 struct io_uring_sq sq;
109 struct io_uring_cq cq;
110 int ring_fd;
111 };
112
/*
 * Fallback syscall numbers for headers that do not provide them.
 * Alpha uses its own numbering; the other value is used everywhere else.
 */
#ifndef __NR_io_uring_setup
# ifdef __alpha__
#  define __NR_io_uring_setup		535
# else
#  define __NR_io_uring_setup		425
# endif
#endif

#ifndef __NR_io_uring_enter
# ifdef __alpha__
#  define __NR_io_uring_enter		536
# else
#  define __NR_io_uring_enter		426
# endif
#endif

#ifndef __NR_io_uring_register
# ifdef __alpha__
#  define __NR_io_uring_register	537
# else
#  define __NR_io_uring_register	427
# endif
#endif
134
/*
 * Barriers used around accesses to the shared ring memory.
 * On x86 a compiler-only barrier is used; every other architecture gets
 * __sync_synchronize().
 */
#if !defined(__x86_64) && !defined(__i386__)
#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#else
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#endif
143
/*
 * Raw io_uring_setup(2) wrapper.
 *
 * Returns whatever syscall() returns: the new ring descriptor, or -1 with
 * errno set.
 */
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	int ring_fd = syscall(__NR_io_uring_setup, entries, p);

	return ring_fd;
}
148
/*
 * Raw io_uring_enter(2) wrapper.
 *
 * The signal mask size passed to the kernel is always _NSIG / 8.
 * Returns whatever syscall() returns (-1 with errno set on failure).
 */
static int io_uring_enter(int fd, unsigned int to_submit,
			  unsigned int min_complete,
			  unsigned int flags, sigset_t *sig)
{
	int rc;

	rc = syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		     flags, sig, _NSIG / 8);
	return rc;
}
156
io_uring_register_buffers(struct io_uring * ring,const struct iovec * iovecs,unsigned nr_iovecs)157 static int io_uring_register_buffers(struct io_uring *ring,
158 const struct iovec *iovecs,
159 unsigned nr_iovecs)
160 {
161 int ret;
162
163 ret = syscall(__NR_io_uring_register, ring->ring_fd,
164 IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
165 return (ret < 0) ? -errno : ret;
166 }
167
io_uring_mmap(int fd,struct io_uring_params * p,struct io_uring_sq * sq,struct io_uring_cq * cq)168 static int io_uring_mmap(int fd, struct io_uring_params *p,
169 struct io_uring_sq *sq, struct io_uring_cq *cq)
170 {
171 size_t size;
172 void *ptr;
173 int ret;
174
175 sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
176 ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
177 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
178 if (ptr == MAP_FAILED)
179 return -errno;
180 sq->khead = ptr + p->sq_off.head;
181 sq->ktail = ptr + p->sq_off.tail;
182 sq->kring_mask = ptr + p->sq_off.ring_mask;
183 sq->kring_entries = ptr + p->sq_off.ring_entries;
184 sq->kflags = ptr + p->sq_off.flags;
185 sq->kdropped = ptr + p->sq_off.dropped;
186 sq->array = ptr + p->sq_off.array;
187
188 size = p->sq_entries * sizeof(struct io_uring_sqe);
189 sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
190 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
191 if (sq->sqes == MAP_FAILED) {
192 ret = -errno;
193 err:
194 munmap(sq->khead, sq->ring_sz);
195 return ret;
196 }
197
198 cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
199 ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
200 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
201 if (ptr == MAP_FAILED) {
202 ret = -errno;
203 munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
204 goto err;
205 }
206 cq->khead = ptr + p->cq_off.head;
207 cq->ktail = ptr + p->cq_off.tail;
208 cq->kring_mask = ptr + p->cq_off.ring_mask;
209 cq->kring_entries = ptr + p->cq_off.ring_entries;
210 cq->koverflow = ptr + p->cq_off.overflow;
211 cq->cqes = ptr + p->cq_off.cqes;
212 return 0;
213 }
214
io_uring_queue_init(unsigned entries,struct io_uring * ring,unsigned flags)215 static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
216 unsigned flags)
217 {
218 struct io_uring_params p;
219 int fd, ret;
220
221 memset(ring, 0, sizeof(*ring));
222 memset(&p, 0, sizeof(p));
223 p.flags = flags;
224
225 fd = io_uring_setup(entries, &p);
226 if (fd < 0)
227 return fd;
228 ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
229 if (!ret)
230 ring->ring_fd = fd;
231 else
232 close(fd);
233 return ret;
234 }
235
io_uring_submit(struct io_uring * ring)236 static int io_uring_submit(struct io_uring *ring)
237 {
238 struct io_uring_sq *sq = &ring->sq;
239 const unsigned mask = *sq->kring_mask;
240 unsigned ktail, submitted, to_submit;
241 int ret;
242
243 read_barrier();
244 if (*sq->khead != *sq->ktail) {
245 submitted = *sq->kring_entries;
246 goto submit;
247 }
248 if (sq->sqe_head == sq->sqe_tail)
249 return 0;
250
251 ktail = *sq->ktail;
252 to_submit = sq->sqe_tail - sq->sqe_head;
253 for (submitted = 0; submitted < to_submit; submitted++) {
254 read_barrier();
255 sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
256 }
257 if (!submitted)
258 return 0;
259
260 if (*sq->ktail != ktail) {
261 write_barrier();
262 *sq->ktail = ktail;
263 write_barrier();
264 }
265 submit:
266 ret = io_uring_enter(ring->ring_fd, submitted, 0,
267 IORING_ENTER_GETEVENTS, NULL);
268 return ret < 0 ? -errno : ret;
269 }
270
/*
 * Fill 'sqe' with an IORING_OP_SEND request that sends 'len' bytes from
 * 'buf' on 'sockfd' using send flags 'flags'. All other SQE fields are
 * zeroed.
 */
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	/* start from an all-zero SQE so unset fields have a defined value */
	memset(sqe, 0, sizeof(*sqe));

	sqe->fd = sockfd;
	sqe->len = len;
	sqe->addr = (uintptr_t) buf;
	sqe->msg_flags = (__u32) flags;
	sqe->opcode = (__u8) IORING_OP_SEND;
}
281
io_uring_prep_sendzc(struct io_uring_sqe * sqe,int sockfd,const void * buf,size_t len,int flags,unsigned zc_flags)282 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
283 const void *buf, size_t len, int flags,
284 unsigned zc_flags)
285 {
286 io_uring_prep_send(sqe, sockfd, buf, len, flags);
287 sqe->opcode = (__u8) IORING_OP_SEND_ZC;
288 sqe->ioprio = zc_flags;
289 }
290
io_uring_get_sqe(struct io_uring * ring)291 static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
292 {
293 struct io_uring_sq *sq = &ring->sq;
294
295 if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
296 return NULL;
297 return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
298 }
299
io_uring_wait_cqe(struct io_uring * ring,struct io_uring_cqe ** cqe_ptr)300 static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
301 {
302 struct io_uring_cq *cq = &ring->cq;
303 const unsigned mask = *cq->kring_mask;
304 unsigned head = *cq->khead;
305 int ret;
306
307 *cqe_ptr = NULL;
308 do {
309 read_barrier();
310 if (head != *cq->ktail) {
311 *cqe_ptr = &cq->cqes[head & mask];
312 break;
313 }
314 ret = io_uring_enter(ring->ring_fd, 0, 1,
315 IORING_ENTER_GETEVENTS, NULL);
316 if (ret < 0)
317 return -errno;
318 } while (1);
319
320 return 0;
321 }
322
io_uring_cqe_seen(struct io_uring * ring)323 static inline void io_uring_cqe_seen(struct io_uring *ring)
324 {
325 *(&ring->cq)->khead += 1;
326 write_barrier();
327 }
328
/* Return the current gettimeofday() time converted to milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);

	/* whole seconds scaled up to ms plus microseconds scaled down to ms */
	return now.tv_sec * 1000 + now.tv_usec / 1000;
}
336
/*
 * Set an integer socket option; on failure print a diagnostic and exit with
 * status 1.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc == 0)
		return;

	error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
342
do_setup_tx(int domain,int type,int protocol)343 static int do_setup_tx(int domain, int type, int protocol)
344 {
345 int fd;
346
347 fd = socket(domain, type, protocol);
348 if (fd == -1)
349 error(1, errno, "socket t");
350
351 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
352
353 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
354 error(1, errno, "connect");
355 return fd;
356 }
357
do_tx(int domain,int type,int protocol)358 static void do_tx(int domain, int type, int protocol)
359 {
360 struct io_uring_sqe *sqe;
361 struct io_uring_cqe *cqe;
362 unsigned long packets = 0, bytes = 0;
363 struct io_uring ring;
364 struct iovec iov;
365 uint64_t tstop;
366 int i, fd, ret;
367 int compl_cqes = 0;
368
369 fd = do_setup_tx(domain, type, protocol);
370
371 ret = io_uring_queue_init(512, &ring, 0);
372 if (ret)
373 error(1, ret, "io_uring: queue init");
374
375 iov.iov_base = payload;
376 iov.iov_len = cfg_payload_len;
377
378 ret = io_uring_register_buffers(&ring, &iov, 1);
379 if (ret)
380 error(1, ret, "io_uring: buffer registration");
381
382 tstop = gettimeofday_ms() + cfg_runtime_ms;
383 do {
384 if (cfg_cork)
385 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
386
387 for (i = 0; i < cfg_nr_reqs; i++) {
388 unsigned zc_flags = 0;
389 unsigned buf_idx = 0;
390 unsigned mode = cfg_mode;
391 unsigned msg_flags = MSG_WAITALL;
392
393 if (cfg_mode == MODE_MIXED)
394 mode = rand() % 3;
395
396 sqe = io_uring_get_sqe(&ring);
397
398 if (mode == MODE_NONZC) {
399 io_uring_prep_send(sqe, fd, payload,
400 cfg_payload_len, msg_flags);
401 sqe->user_data = NONZC_TAG;
402 } else {
403 io_uring_prep_sendzc(sqe, fd, payload,
404 cfg_payload_len,
405 msg_flags, zc_flags);
406 if (mode == MODE_ZC_FIXED) {
407 sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
408 sqe->buf_index = buf_idx;
409 }
410 sqe->user_data = ZC_TAG;
411 }
412 }
413
414 ret = io_uring_submit(&ring);
415 if (ret != cfg_nr_reqs)
416 error(1, ret, "submit");
417
418 if (cfg_cork)
419 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
420 for (i = 0; i < cfg_nr_reqs; i++) {
421 ret = io_uring_wait_cqe(&ring, &cqe);
422 if (ret)
423 error(1, ret, "wait cqe");
424
425 if (cqe->user_data != NONZC_TAG &&
426 cqe->user_data != ZC_TAG)
427 error(1, -EINVAL, "invalid cqe->user_data");
428
429 if (cqe->flags & IORING_CQE_F_NOTIF) {
430 if (cqe->flags & IORING_CQE_F_MORE)
431 error(1, -EINVAL, "invalid notif flags");
432 if (compl_cqes <= 0)
433 error(1, -EINVAL, "notification mismatch");
434 compl_cqes--;
435 i--;
436 io_uring_cqe_seen(&ring);
437 continue;
438 }
439 if (cqe->flags & IORING_CQE_F_MORE) {
440 if (cqe->user_data != ZC_TAG)
441 error(1, cqe->res, "unexpected F_MORE");
442 compl_cqes++;
443 }
444 if (cqe->res >= 0) {
445 packets++;
446 bytes += cqe->res;
447 } else if (cqe->res != -EAGAIN) {
448 error(1, cqe->res, "send failed");
449 }
450 io_uring_cqe_seen(&ring);
451 }
452 } while (gettimeofday_ms() < tstop);
453
454 while (compl_cqes) {
455 ret = io_uring_wait_cqe(&ring, &cqe);
456 if (ret)
457 error(1, ret, "wait cqe");
458 if (cqe->flags & IORING_CQE_F_MORE)
459 error(1, -EINVAL, "invalid notif flags");
460 if (!(cqe->flags & IORING_CQE_F_NOTIF))
461 error(1, -EINVAL, "missing notif flag");
462
463 io_uring_cqe_seen(&ring);
464 compl_cqes--;
465 }
466
467 fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
468 packets, bytes >> 20,
469 packets / (cfg_runtime_ms / 1000),
470 (bytes >> 20) / (cfg_runtime_ms / 1000));
471
472 if (close(fd))
473 error(1, errno, "close");
474 }
475
do_test(int domain,int type,int protocol)476 static void do_test(int domain, int type, int protocol)
477 {
478 int i;
479
480 for (i = 0; i < IP_MAXPACKET; i++)
481 payload[i] = 'a' + (i % 26);
482 do_tx(domain, type, protocol);
483 }
484
/*
 * Print the command line synopsis and exit with status 1.
 *
 * The test name is listed last because parse_opts() requires exactly one
 * non-option argument, and -c is listed because the option string accepts it.
 */
static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s (-4|-6) -D<dst_ip> [-s<payload size>] "
		    "[-t<time s>] [-n<batch>] [-p<port>] [-m<mode>] "
		    "[-c<cork>] (udp|tcp)", filepath);
}
490
parse_opts(int argc,char ** argv)491 static void parse_opts(int argc, char **argv)
492 {
493 const int max_payload_len = sizeof(payload) -
494 sizeof(struct ipv6hdr) -
495 sizeof(struct tcphdr) -
496 40 /* max tcp options */;
497 struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
498 struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
499 char *daddr = NULL;
500 int c;
501
502 if (argc <= 1)
503 usage(argv[0]);
504 cfg_payload_len = max_payload_len;
505
506 while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) {
507 switch (c) {
508 case '4':
509 if (cfg_family != PF_UNSPEC)
510 error(1, 0, "Pass one of -4 or -6");
511 cfg_family = PF_INET;
512 cfg_alen = sizeof(struct sockaddr_in);
513 break;
514 case '6':
515 if (cfg_family != PF_UNSPEC)
516 error(1, 0, "Pass one of -4 or -6");
517 cfg_family = PF_INET6;
518 cfg_alen = sizeof(struct sockaddr_in6);
519 break;
520 case 'D':
521 daddr = optarg;
522 break;
523 case 'p':
524 cfg_port = strtoul(optarg, NULL, 0);
525 break;
526 case 's':
527 cfg_payload_len = strtoul(optarg, NULL, 0);
528 break;
529 case 't':
530 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
531 break;
532 case 'n':
533 cfg_nr_reqs = strtoul(optarg, NULL, 0);
534 break;
535 case 'c':
536 cfg_cork = strtol(optarg, NULL, 0);
537 break;
538 case 'm':
539 cfg_mode = strtol(optarg, NULL, 0);
540 break;
541 }
542 }
543
544 switch (cfg_family) {
545 case PF_INET:
546 memset(addr4, 0, sizeof(*addr4));
547 addr4->sin_family = AF_INET;
548 addr4->sin_port = htons(cfg_port);
549 if (daddr &&
550 inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
551 error(1, 0, "ipv4 parse error: %s", daddr);
552 break;
553 case PF_INET6:
554 memset(addr6, 0, sizeof(*addr6));
555 addr6->sin6_family = AF_INET6;
556 addr6->sin6_port = htons(cfg_port);
557 if (daddr &&
558 inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
559 error(1, 0, "ipv6 parse error: %s", daddr);
560 break;
561 default:
562 error(1, 0, "illegal domain");
563 }
564
565 if (cfg_payload_len > max_payload_len)
566 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
567 if (optind != argc - 1)
568 usage(argv[0]);
569 }
570
main(int argc,char ** argv)571 int main(int argc, char **argv)
572 {
573 const char *cfg_test = argv[argc - 1];
574
575 parse_opts(argc, argv);
576
577 if (!strcmp(cfg_test, "tcp"))
578 do_test(cfg_family, SOCK_STREAM, 0);
579 else if (!strcmp(cfg_test, "udp"))
580 do_test(cfg_family, SOCK_DGRAM, 0);
581 else
582 error(1, 0, "unknown cfg_test %s", cfg_test);
583 return 0;
584 }
585