1 /* SPDX-License-Identifier: MIT */ 2 /* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */ 3 #include <assert.h> 4 #include <errno.h> 5 #include <error.h> 6 #include <fcntl.h> 7 #include <limits.h> 8 #include <stdbool.h> 9 #include <stdint.h> 10 #include <stdio.h> 11 #include <stdlib.h> 12 #include <string.h> 13 #include <unistd.h> 14 15 #include <arpa/inet.h> 16 #include <linux/errqueue.h> 17 #include <linux/if_packet.h> 18 #include <linux/io_uring.h> 19 #include <linux/ipv6.h> 20 #include <linux/socket.h> 21 #include <linux/sockios.h> 22 #include <net/ethernet.h> 23 #include <net/if.h> 24 #include <netinet/in.h> 25 #include <netinet/ip.h> 26 #include <netinet/ip6.h> 27 #include <netinet/tcp.h> 28 #include <netinet/udp.h> 29 #include <sys/ioctl.h> 30 #include <sys/mman.h> 31 #include <sys/resource.h> 32 #include <sys/socket.h> 33 #include <sys/stat.h> 34 #include <sys/time.h> 35 #include <sys/types.h> 36 #include <sys/un.h> 37 #include <sys/wait.h> 38 39 #define NOTIF_TAG 0xfffffffULL 40 #define NONZC_TAG 0 41 #define ZC_TAG 1 42 43 enum { 44 MODE_NONZC = 0, 45 MODE_ZC = 1, 46 MODE_ZC_FIXED = 2, 47 MODE_MIXED = 3, 48 }; 49 50 static bool cfg_cork = false; 51 static int cfg_mode = MODE_ZC_FIXED; 52 static int cfg_nr_reqs = 8; 53 static int cfg_family = PF_UNSPEC; 54 static int cfg_payload_len; 55 static int cfg_port = 8000; 56 static int cfg_runtime_ms = 4200; 57 58 static socklen_t cfg_alen; 59 static struct sockaddr_storage cfg_dst_addr; 60 61 static char payload[IP_MAXPACKET] __attribute__((aligned(4096))); 62 63 struct io_sq_ring { 64 unsigned *head; 65 unsigned *tail; 66 unsigned *ring_mask; 67 unsigned *ring_entries; 68 unsigned *flags; 69 unsigned *array; 70 }; 71 72 struct io_cq_ring { 73 unsigned *head; 74 unsigned *tail; 75 unsigned *ring_mask; 76 unsigned *ring_entries; 77 struct io_uring_cqe *cqes; 78 }; 79 80 struct io_uring_sq { 81 unsigned *khead; 82 unsigned *ktail; 83 unsigned *kring_mask; 84 unsigned *kring_entries; 85 unsigned *kflags; 86 unsigned *kdropped; 87 unsigned *array; 88 struct io_uring_sqe *sqes; 89 90 unsigned sqe_head; 91 unsigned sqe_tail; 92 93 size_t ring_sz; 94 }; 95 96 struct io_uring_cq { 97 unsigned *khead; 98 unsigned *ktail; 99 unsigned *kring_mask; 100 unsigned *kring_entries; 101 unsigned *koverflow; 102 struct io_uring_cqe *cqes; 103 104 size_t ring_sz; 105 }; 106 107 struct io_uring { 108 struct io_uring_sq sq; 109 struct io_uring_cq cq; 110 int ring_fd; 111 }; 112 113 #ifdef __alpha__ 114 # ifndef __NR_io_uring_setup 115 # define __NR_io_uring_setup 535 116 # endif 117 # ifndef __NR_io_uring_enter 118 # define __NR_io_uring_enter 536 119 # endif 120 # ifndef __NR_io_uring_register 121 # define __NR_io_uring_register 537 122 # endif 123 #else /* !__alpha__ */ 124 # ifndef __NR_io_uring_setup 125 # define __NR_io_uring_setup 425 126 # endif 127 # ifndef __NR_io_uring_enter 128 # define __NR_io_uring_enter 426 129 # endif 130 # ifndef __NR_io_uring_register 131 # define __NR_io_uring_register 427 132 # endif 133 #endif 134 135 #if defined(__x86_64) || defined(__i386__) 136 #define read_barrier() __asm__ __volatile__("":::"memory") 137 #define write_barrier() __asm__ __volatile__("":::"memory") 138 #else 139 140 #define read_barrier() __sync_synchronize() 141 #define write_barrier() __sync_synchronize() 142 #endif 143 144 static int io_uring_setup(unsigned int entries, struct io_uring_params *p) 145 { 146 return syscall(__NR_io_uring_setup, entries, p); 147 } 148 149 static int io_uring_enter(int fd, unsigned int to_submit, 150 unsigned int min_complete, 151 unsigned int flags, sigset_t *sig) 152 { 153 return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, 154 flags, sig, _NSIG / 8); 155 } 156 157 static int io_uring_register_buffers(struct io_uring *ring, 158 const struct iovec *iovecs, 159 unsigned nr_iovecs) 160 { 161 int ret; 162 163 ret = syscall(__NR_io_uring_register, ring->ring_fd, 164 IORING_REGISTER_BUFFERS, iovecs, nr_iovecs); 165 return (ret < 0) ? -errno : ret; 166 } 167 168 static int io_uring_mmap(int fd, struct io_uring_params *p, 169 struct io_uring_sq *sq, struct io_uring_cq *cq) 170 { 171 size_t size; 172 void *ptr; 173 int ret; 174 175 sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); 176 ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, 177 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); 178 if (ptr == MAP_FAILED) 179 return -errno; 180 sq->khead = ptr + p->sq_off.head; 181 sq->ktail = ptr + p->sq_off.tail; 182 sq->kring_mask = ptr + p->sq_off.ring_mask; 183 sq->kring_entries = ptr + p->sq_off.ring_entries; 184 sq->kflags = ptr + p->sq_off.flags; 185 sq->kdropped = ptr + p->sq_off.dropped; 186 sq->array = ptr + p->sq_off.array; 187 188 size = p->sq_entries * sizeof(struct io_uring_sqe); 189 sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, 190 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); 191 if (sq->sqes == MAP_FAILED) { 192 ret = -errno; 193 err: 194 munmap(sq->khead, sq->ring_sz); 195 return ret; 196 } 197 198 cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); 199 ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, 200 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); 201 if (ptr == MAP_FAILED) { 202 ret = -errno; 203 munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); 204 goto err; 205 } 206 cq->khead = ptr + p->cq_off.head; 207 cq->ktail = ptr + p->cq_off.tail; 208 cq->kring_mask = ptr + p->cq_off.ring_mask; 209 cq->kring_entries = ptr + p->cq_off.ring_entries; 210 cq->koverflow = ptr + p->cq_off.overflow; 211 cq->cqes = ptr + p->cq_off.cqes; 212 return 0; 213 } 214 215 static int io_uring_queue_init(unsigned entries, struct io_uring *ring, 216 unsigned flags) 217 { 218 struct io_uring_params p; 219 int fd, ret; 220 221 memset(ring, 0, sizeof(*ring)); 222 memset(&p, 0, sizeof(p)); 223 p.flags = flags; 224 225 fd = io_uring_setup(entries, &p); 226 if (fd < 0) 227 return fd; 228 ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq); 229 if (!ret) 230 ring->ring_fd = fd; 231 else 232 close(fd); 233 return ret; 234 } 235 236 static int io_uring_submit(struct io_uring *ring) 237 { 238 struct io_uring_sq *sq = &ring->sq; 239 const unsigned mask = *sq->kring_mask; 240 unsigned ktail, submitted, to_submit; 241 int ret; 242 243 read_barrier(); 244 if (*sq->khead != *sq->ktail) { 245 submitted = *sq->kring_entries; 246 goto submit; 247 } 248 if (sq->sqe_head == sq->sqe_tail) 249 return 0; 250 251 ktail = *sq->ktail; 252 to_submit = sq->sqe_tail - sq->sqe_head; 253 for (submitted = 0; submitted < to_submit; submitted++) { 254 read_barrier(); 255 sq->array[ktail++ & mask] = sq->sqe_head++ & mask; 256 } 257 if (!submitted) 258 return 0; 259 260 if (*sq->ktail != ktail) { 261 write_barrier(); 262 *sq->ktail = ktail; 263 write_barrier(); 264 } 265 submit: 266 ret = io_uring_enter(ring->ring_fd, submitted, 0, 267 IORING_ENTER_GETEVENTS, NULL); 268 return ret < 0 ? -errno : ret; 269 } 270 271 static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd, 272 const void *buf, size_t len, int flags) 273 { 274 memset(sqe, 0, sizeof(*sqe)); 275 sqe->opcode = (__u8) IORING_OP_SEND; 276 sqe->fd = sockfd; 277 sqe->addr = (unsigned long) buf; 278 sqe->len = len; 279 sqe->msg_flags = (__u32) flags; 280 } 281 282 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd, 283 const void *buf, size_t len, int flags, 284 unsigned zc_flags) 285 { 286 io_uring_prep_send(sqe, sockfd, buf, len, flags); 287 sqe->opcode = (__u8) IORING_OP_SEND_ZC; 288 sqe->ioprio = zc_flags; 289 } 290 291 static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) 292 { 293 struct io_uring_sq *sq = &ring->sq; 294 295 if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries) 296 return NULL; 297 return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask]; 298 } 299 300 static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 301 { 302 struct io_uring_cq *cq = &ring->cq; 303 const unsigned mask = *cq->kring_mask; 304 unsigned head = *cq->khead; 305 int ret; 306 307 *cqe_ptr = NULL; 308 do { 309 read_barrier(); 310 if (head != *cq->ktail) { 311 *cqe_ptr = &cq->cqes[head & mask]; 312 break; 313 } 314 ret = io_uring_enter(ring->ring_fd, 0, 1, 315 IORING_ENTER_GETEVENTS, NULL); 316 if (ret < 0) 317 return -errno; 318 } while (1); 319 320 return 0; 321 } 322 323 static inline void io_uring_cqe_seen(struct io_uring *ring) 324 { 325 *(&ring->cq)->khead += 1; 326 write_barrier(); 327 } 328 329 static unsigned long gettimeofday_ms(void) 330 { 331 struct timeval tv; 332 333 gettimeofday(&tv, NULL); 334 return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); 335 } 336 337 static void do_setsockopt(int fd, int level, int optname, int val) 338 { 339 if (setsockopt(fd, level, optname, &val, sizeof(val))) 340 error(1, errno, "setsockopt %d.%d: %d", level, optname, val); 341 } 342 343 static int do_setup_tx(int domain, int type, int protocol) 344 { 345 int fd; 346 347 fd = socket(domain, type, protocol); 348 if (fd == -1) 349 error(1, errno, "socket t"); 350 351 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); 352 353 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) 354 error(1, errno, "connect"); 355 return fd; 356 } 357 358 static void do_tx(int domain, int type, int protocol) 359 { 360 struct io_uring_sqe *sqe; 361 struct io_uring_cqe *cqe; 362 unsigned long packets = 0, bytes = 0; 363 struct io_uring ring; 364 struct iovec iov; 365 uint64_t tstop; 366 int i, fd, ret; 367 int compl_cqes = 0; 368 369 fd = do_setup_tx(domain, type, protocol); 370 371 ret = io_uring_queue_init(512, &ring, 0); 372 if (ret) 373 error(1, ret, "io_uring: queue init"); 374 375 iov.iov_base = payload; 376 iov.iov_len = cfg_payload_len; 377 378 ret = io_uring_register_buffers(&ring, &iov, 1); 379 if (ret) 380 error(1, ret, "io_uring: buffer registration"); 381 382 tstop = gettimeofday_ms() + cfg_runtime_ms; 383 do { 384 if (cfg_cork) 385 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1); 386 387 for (i = 0; i < cfg_nr_reqs; i++) { 388 unsigned zc_flags = 0; 389 unsigned buf_idx = 0; 390 unsigned mode = cfg_mode; 391 unsigned msg_flags = MSG_WAITALL; 392 393 if (cfg_mode == MODE_MIXED) 394 mode = rand() % 3; 395 396 sqe = io_uring_get_sqe(&ring); 397 398 if (mode == MODE_NONZC) { 399 io_uring_prep_send(sqe, fd, payload, 400 cfg_payload_len, msg_flags); 401 sqe->user_data = NONZC_TAG; 402 } else { 403 compl_cqes++; 404 io_uring_prep_sendzc(sqe, fd, payload, 405 cfg_payload_len, 406 msg_flags, zc_flags); 407 if (mode == MODE_ZC_FIXED) { 408 sqe->ioprio |= IORING_RECVSEND_FIXED_BUF; 409 sqe->buf_index = buf_idx; 410 } 411 sqe->user_data = ZC_TAG; 412 } 413 } 414 415 ret = io_uring_submit(&ring); 416 if (ret != cfg_nr_reqs) 417 error(1, ret, "submit"); 418 419 if (cfg_cork) 420 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); 421 for (i = 0; i < cfg_nr_reqs; i++) { 422 ret = io_uring_wait_cqe(&ring, &cqe); 423 if (ret) 424 error(1, ret, "wait cqe"); 425 426 if (cqe->user_data != NONZC_TAG && 427 cqe->user_data != ZC_TAG) 428 error(1, -EINVAL, "invalid cqe->user_data"); 429 430 if (cqe->flags & IORING_CQE_F_NOTIF) { 431 if (cqe->flags & IORING_CQE_F_MORE) 432 error(1, -EINVAL, "invalid notif flags"); 433 compl_cqes--; 434 i--; 435 } else if (cqe->res <= 0) { 436 if (cqe->flags & IORING_CQE_F_MORE) 437 error(1, cqe->res, "more with a failed send"); 438 error(1, cqe->res, "send failed"); 439 } else { 440 if (cqe->user_data == ZC_TAG && 441 !(cqe->flags & IORING_CQE_F_MORE)) 442 error(1, cqe->res, "missing more flag"); 443 packets++; 444 bytes += cqe->res; 445 } 446 io_uring_cqe_seen(&ring); 447 } 448 } while (gettimeofday_ms() < tstop); 449 450 while (compl_cqes) { 451 ret = io_uring_wait_cqe(&ring, &cqe); 452 if (ret) 453 error(1, ret, "wait cqe"); 454 if (cqe->flags & IORING_CQE_F_MORE) 455 error(1, -EINVAL, "invalid notif flags"); 456 if (!(cqe->flags & IORING_CQE_F_NOTIF)) 457 error(1, -EINVAL, "missing notif flag"); 458 459 io_uring_cqe_seen(&ring); 460 compl_cqes--; 461 } 462 463 fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n", 464 packets, bytes >> 20, 465 packets / (cfg_runtime_ms / 1000), 466 (bytes >> 20) / (cfg_runtime_ms / 1000)); 467 468 if (close(fd)) 469 error(1, errno, "close"); 470 } 471 472 static void do_test(int domain, int type, int protocol) 473 { 474 int i; 475 476 for (i = 0; i < IP_MAXPACKET; i++) 477 payload[i] = 'a' + (i % 26); 478 do_tx(domain, type, protocol); 479 } 480 481 static void usage(const char *filepath) 482 { 483 error(1, 0, "Usage: %s (-4|-6) (udp|tcp) -D<dst_ip> [-s<payload size>] " 484 "[-t<time s>] [-n<batch>] [-p<port>] [-m<mode>]", filepath); 485 } 486 487 static void parse_opts(int argc, char **argv) 488 { 489 const int max_payload_len = sizeof(payload) - 490 sizeof(struct ipv6hdr) - 491 sizeof(struct tcphdr) - 492 40 /* max tcp options */; 493 struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr; 494 struct sockaddr_in *addr4 = (void *) &cfg_dst_addr; 495 char *daddr = NULL; 496 int c; 497 498 if (argc <= 1) 499 usage(argv[0]); 500 cfg_payload_len = max_payload_len; 501 502 while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) { 503 switch (c) { 504 case '4': 505 if (cfg_family != PF_UNSPEC) 506 error(1, 0, "Pass one of -4 or -6"); 507 cfg_family = PF_INET; 508 cfg_alen = sizeof(struct sockaddr_in); 509 break; 510 case '6': 511 if (cfg_family != PF_UNSPEC) 512 error(1, 0, "Pass one of -4 or -6"); 513 cfg_family = PF_INET6; 514 cfg_alen = sizeof(struct sockaddr_in6); 515 break; 516 case 'D': 517 daddr = optarg; 518 break; 519 case 'p': 520 cfg_port = strtoul(optarg, NULL, 0); 521 break; 522 case 's': 523 cfg_payload_len = strtoul(optarg, NULL, 0); 524 break; 525 case 't': 526 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000; 527 break; 528 case 'n': 529 cfg_nr_reqs = strtoul(optarg, NULL, 0); 530 break; 531 case 'c': 532 cfg_cork = strtol(optarg, NULL, 0); 533 break; 534 case 'm': 535 cfg_mode = strtol(optarg, NULL, 0); 536 break; 537 } 538 } 539 540 switch (cfg_family) { 541 case PF_INET: 542 memset(addr4, 0, sizeof(*addr4)); 543 addr4->sin_family = AF_INET; 544 addr4->sin_port = htons(cfg_port); 545 if (daddr && 546 inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1) 547 error(1, 0, "ipv4 parse error: %s", daddr); 548 break; 549 case PF_INET6: 550 memset(addr6, 0, sizeof(*addr6)); 551 addr6->sin6_family = AF_INET6; 552 addr6->sin6_port = htons(cfg_port); 553 if (daddr && 554 inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1) 555 error(1, 0, "ipv6 parse error: %s", daddr); 556 break; 557 default: 558 error(1, 0, "illegal domain"); 559 } 560 561 if (cfg_payload_len > max_payload_len) 562 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len); 563 if (optind != argc - 1) 564 usage(argv[0]); 565 } 566 567 int main(int argc, char **argv) 568 { 569 const char *cfg_test = argv[argc - 1]; 570 571 parse_opts(argc, argv); 572 573 if (!strcmp(cfg_test, "tcp")) 574 do_test(cfg_family, SOCK_STREAM, 0); 575 else if (!strcmp(cfg_test, "udp")) 576 do_test(cfg_family, SOCK_DGRAM, 0); 577 else 578 error(1, 0, "unknown cfg_test %s", cfg_test); 579 return 0; 580 } 581