/* SPDX-License-Identifier: MIT */
/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
#include <assert.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <linux/errqueue.h>
#include <linux/if_packet.h>
#include <linux/io_uring.h>
#include <linux/ipv6.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>

#define NOTIF_TAG 0xfffffffULL
#define NONZC_TAG 0
#define ZC_TAG 1

enum {
	MODE_NONZC	= 0,
	MODE_ZC		= 1,
	MODE_ZC_FIXED	= 2,
	MODE_MIXED	= 3,
};

static bool cfg_flush		= false;
static bool cfg_cork		= false;
static int cfg_mode		= MODE_ZC_FIXED;
static int cfg_nr_reqs		= 8;
static int cfg_family		= PF_UNSPEC;
static int cfg_payload_len;
static int cfg_port		= 8000;
static int cfg_runtime_ms	= 4200;

static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;

static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));

struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};

struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;
	struct io_uring_sqe *sqes;

	unsigned sqe_head;
	unsigned sqe_tail;

	size_t ring_sz;
};

struct io_uring_cq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;
};

struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};

#ifdef __alpha__
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		535
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		536
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	537
# endif
#else /* !__alpha__ */
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup		425
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter		426
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register	427
# endif
#endif

#if defined(__x86_64) || defined(__i386__)
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#else
#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#endif
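
/*
 * Raw syscall wrappers: the ring setup, submission and completion paths
 * below are hand-rolled so the test does not depend on liburing.
 * io_uring_enter() always passes the kernel-ABI sigset size (_NSIG / 8)
 * even though no signal mask is supplied here.
 */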
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}

static int io_uring_enter(int fd, unsigned int to_submit,
			  unsigned int min_complete,
			  unsigned int flags, sigset_t *sig)
{
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		       flags, sig, _NSIG / 8);
}

static int io_uring_register_buffers(struct io_uring *ring,
				     const struct iovec *iovecs,
				     unsigned nr_iovecs)
{
	int ret;

	ret = syscall(__NR_io_uring_register, ring->ring_fd,
		      IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
	return (ret < 0) ? -errno : ret;
}

static int io_uring_register_notifications(struct io_uring *ring,
					   unsigned nr,
					   struct io_uring_notification_slot *slots)
{
	int ret;
	struct io_uring_notification_register r = {
		.nr_slots = nr,
		.data = (unsigned long)slots,
	};

	ret = syscall(__NR_io_uring_register, ring->ring_fd,
		      IORING_REGISTER_NOTIFIERS, &r, sizeof(r));
	return (ret < 0) ? -errno : ret;
}

static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	void *ptr;
	int ret;

	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED)
		return -errno;
	sq->khead = ptr + p->sq_off.head;
	sq->ktail = ptr + p->sq_off.tail;
	sq->kring_mask = ptr + p->sq_off.ring_mask;
	sq->kring_entries = ptr + p->sq_off.ring_entries;
	sq->kflags = ptr + p->sq_off.flags;
	sq->kdropped = ptr + p->sq_off.dropped;
	sq->array = ptr + p->sq_off.array;

	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		munmap(sq->khead, sq->ring_sz);
		return ret;
	}

	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		ret = -errno;
		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
		goto err;
	}
	cq->khead = ptr + p->cq_off.head;
	cq->ktail = ptr + p->cq_off.tail;
	cq->kring_mask = ptr + p->cq_off.ring_mask;
	cq->kring_entries = ptr + p->cq_off.ring_entries;
	cq->koverflow = ptr + p->cq_off.overflow;
	cq->cqes = ptr + p->cq_off.cqes;
	return 0;
}

static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
			       unsigned flags)
{
	struct io_uring_params p;
	int fd, ret;

	memset(ring, 0, sizeof(*ring));
	memset(&p, 0, sizeof(p));
	p.flags = flags;

	fd = io_uring_setup(entries, &p);
	if (fd < 0)
		return fd;
	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
	if (!ret)
		ring->ring_fd = fd;
	else
		close(fd);
	return ret;
}
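
/*
 * Manual submission: indexes of the SQEs handed out by io_uring_get_sqe()
 * are copied into the shared SQ array, then the new tail is published
 * between write barriers so the kernel never observes the tail update
 * before the array entries, and finally io_uring_enter() is called. If
 * the kernel still has unconsumed entries (khead != ktail), we simply
 * re-enter with the full ring size as the submit count.
 */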
static int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned mask = *sq->kring_mask;
	unsigned ktail, submitted, to_submit;
	int ret;

	read_barrier();
	if (*sq->khead != *sq->ktail) {
		submitted = *sq->kring_entries;
		goto submit;
	}
	if (sq->sqe_head == sq->sqe_tail)
		return 0;

	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	if (*sq->ktail != ktail) {
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
			     IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}

static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = (__u8) IORING_OP_SEND;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long) buf;
	sqe->len = len;
	sqe->msg_flags = (__u32) flags;
}

static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
					const void *buf, size_t len, int flags,
					unsigned slot_idx, unsigned zc_flags)
{
	io_uring_prep_send(sqe, sockfd, buf, len, flags);
	sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF;
	sqe->notification_idx = slot_idx;
	sqe->ioprio = zc_flags;
}

static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;

	if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
		return NULL;
	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
}

static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned mask = *cq->kring_mask;
	unsigned head = *cq->khead;
	int ret;

	*cqe_ptr = NULL;
	do {
		read_barrier();
		if (head != *cq->ktail) {
			*cqe_ptr = &cq->cqes[head & mask];
			break;
		}
		ret = io_uring_enter(ring->ring_fd, 0, 1,
				     IORING_ENTER_GETEVENTS, NULL);
		if (ret < 0)
			return -errno;
	} while (1);

	return 0;
}

static inline void io_uring_cqe_seen(struct io_uring *ring)
{
	*(&ring->cq)->khead += 1;
	write_barrier();
}

static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}

static void do_setsockopt(int fd, int level, int optname, int val)
{
	if (setsockopt(fd, level, optname, &val, sizeof(val)))
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}

static int do_setup_tx(int domain, int type, int protocol)
{
	int fd;

	fd = socket(domain, type, protocol);
	if (fd == -1)
		error(1, errno, "socket t");

	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);

	if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
		error(1, errno, "connect");
	return fd;
}
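
/*
 * Transmit loop: queues cfg_nr_reqs sends per iteration, submits them in
 * one io_uring_enter(), then reaps one CQE per request. With -f
 * (cfg_flush) each zerocopy send also flushes its notification slot, so
 * the kernel posts an extra buffer-release CQE tagged NOTIF_TAG;
 * compl_cqes counts those still outstanding, and they are drained both
 * inside the loop and after the socket is closed.
 */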
static void do_tx(int domain, int type, int protocol)
{
	struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned long packets = 0, bytes = 0;
	struct io_uring ring;
	struct iovec iov;
	uint64_t tstop;
	int i, fd, ret;
	int compl_cqes = 0;

	fd = do_setup_tx(domain, type, protocol);

	ret = io_uring_queue_init(512, &ring, 0);
	if (ret)
		error(1, ret, "io_uring: queue init");

	ret = io_uring_register_notifications(&ring, 1, b);
	if (ret)
		error(1, ret, "io_uring: tx ctx registration");

	iov.iov_base = payload;
	iov.iov_len = cfg_payload_len;

	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret)
		error(1, ret, "io_uring: buffer registration");

	tstop = gettimeofday_ms() + cfg_runtime_ms;
	do {
		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);

		for (i = 0; i < cfg_nr_reqs; i++) {
			unsigned zc_flags = 0;
			unsigned buf_idx = 0;
			unsigned slot_idx = 0;
			unsigned mode = cfg_mode;
			unsigned msg_flags = 0;

			if (cfg_mode == MODE_MIXED)
				mode = rand() % 3;

			sqe = io_uring_get_sqe(&ring);

			if (mode == MODE_NONZC) {
				io_uring_prep_send(sqe, fd, payload,
						   cfg_payload_len, msg_flags);
				sqe->user_data = NONZC_TAG;
			} else {
				if (cfg_flush) {
					zc_flags |= IORING_RECVSEND_NOTIF_FLUSH;
					compl_cqes++;
				}
				io_uring_prep_sendzc(sqe, fd, payload,
						     cfg_payload_len,
						     msg_flags, slot_idx, zc_flags);
				if (mode == MODE_ZC_FIXED) {
					sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
					sqe->buf_index = buf_idx;
				}
				sqe->user_data = ZC_TAG;
			}
		}

		ret = io_uring_submit(&ring);
		if (ret != cfg_nr_reqs)
			error(1, ret, "submit");

		for (i = 0; i < cfg_nr_reqs; i++) {
			ret = io_uring_wait_cqe(&ring, &cqe);
			if (ret)
				error(1, ret, "wait cqe");

			if (cqe->user_data == NOTIF_TAG) {
				compl_cqes--;
				i--;
			} else if (cqe->user_data != NONZC_TAG &&
				   cqe->user_data != ZC_TAG) {
				error(1, cqe->res, "invalid user_data");
			} else if (cqe->res <= 0 && cqe->res != -EAGAIN) {
				error(1, cqe->res, "send failed");
			} else {
				if (cqe->res > 0) {
					packets++;
					bytes += cqe->res;
				}
				/* failed requests don't flush */
				if (cfg_flush &&
				    cqe->res <= 0 &&
				    cqe->user_data == ZC_TAG)
					compl_cqes--;
			}
			io_uring_cqe_seen(&ring);
		}
		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
	} while (gettimeofday_ms() < tstop);

	if (close(fd))
		error(1, errno, "close");

	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
		packets, bytes >> 20,
		packets / (cfg_runtime_ms / 1000),
		(bytes >> 20) / (cfg_runtime_ms / 1000));

	while (compl_cqes) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret)
			error(1, ret, "wait cqe");
		io_uring_cqe_seen(&ring);
		compl_cqes--;
	}
}

static void do_test(int domain, int type, int protocol)
{
	int i;

	for (i = 0; i < IP_MAXPACKET; i++)
		payload[i] = 'a' + (i % 26);
	do_tx(domain, type, protocol);
}
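
/*
 * Illustrative invocations (binary name and destination address are
 * examples, not part of this file):
 *
 *   # ~10s of zerocopy sends from the registered (fixed) buffer,
 *   # the default mode:
 *   ./io_uring_zerocopy_tx -4 -D 192.0.2.1 -t 10 udp
 *
 *   # plain copying sends for comparison, 1200-byte payloads:
 *   ./io_uring_zerocopy_tx -4 -D 192.0.2.1 -t 10 -m 0 -s 1200 udp
 */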
static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s [-f] [-c<cork>] [-n<N>] [-m<mode>] [-p<port>] "
		    "[-s<payload size>] (-4|-6) [-t<time s>] "
		    "-D<dst_ip> (tcp|udp)", filepath);
}

static void parse_opts(int argc, char **argv)
{
	const int max_payload_len = sizeof(payload) -
				    sizeof(struct ipv6hdr) -
				    sizeof(struct tcphdr) -
				    40 /* max tcp options */;
	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
	char *daddr = NULL;
	int c;

	if (argc <= 1)
		usage(argv[0]);
	cfg_payload_len = max_payload_len;

	while ((c = getopt(argc, argv, "46D:p:s:t:n:fc:m:")) != -1) {
		switch (c) {
		case '4':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET;
			cfg_alen = sizeof(struct sockaddr_in);
			break;
		case '6':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET6;
			cfg_alen = sizeof(struct sockaddr_in6);
			break;
		case 'D':
			daddr = optarg;
			break;
		case 'p':
			cfg_port = strtoul(optarg, NULL, 0);
			break;
		case 's':
			cfg_payload_len = strtoul(optarg, NULL, 0);
			break;
		case 't':
			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
			break;
		case 'n':
			cfg_nr_reqs = strtoul(optarg, NULL, 0);
			break;
		case 'f':
			cfg_flush = 1;
			break;
		case 'c':
			cfg_cork = strtol(optarg, NULL, 0);
			break;
		case 'm':
			cfg_mode = strtol(optarg, NULL, 0);
			break;
		}
	}

	switch (cfg_family) {
	case PF_INET:
		memset(addr4, 0, sizeof(*addr4));
		addr4->sin_family = AF_INET;
		addr4->sin_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
			error(1, 0, "ipv4 parse error: %s", daddr);
		break;
	case PF_INET6:
		memset(addr6, 0, sizeof(*addr6));
		addr6->sin6_family = AF_INET6;
		addr6->sin6_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
			error(1, 0, "ipv6 parse error: %s", daddr);
		break;
	default:
		error(1, 0, "illegal domain");
	}

	if (cfg_payload_len > max_payload_len)
		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
	if (cfg_mode == MODE_NONZC && cfg_flush)
		error(1, 0, "-f: only zerocopy modes support notifications");
	if (optind != argc - 1)
		usage(argv[0]);
}

int main(int argc, char **argv)
{
	const char *cfg_test = argv[argc - 1];

	parse_opts(argc, argv);

	if (!strcmp(cfg_test, "tcp"))
		do_test(cfg_family, SOCK_STREAM, 0);
	else if (!strcmp(cfg_test, "udp"))
		do_test(cfg_family, SOCK_DGRAM, 0);
	else
		error(1, 0, "unknown cfg_test %s", cfg_test);
	return 0;
}