1 /* Evaluate MSG_ZEROCOPY 2 * 3 * Send traffic between two processes over one of the supported 4 * protocols and modes: 5 * 6 * PF_INET/PF_INET6 7 * - SOCK_STREAM 8 * - SOCK_DGRAM 9 * - SOCK_DGRAM with UDP_CORK 10 * - SOCK_RAW 11 * - SOCK_RAW with IP_HDRINCL 12 * 13 * PF_PACKET 14 * - SOCK_DGRAM 15 * - SOCK_RAW 16 * 17 * Start this program on two connected hosts, one in send mode and 18 * the other with option '-r' to put it in receiver mode. 19 * 20 * If zerocopy mode ('-z') is enabled, the sender will verify that 21 * the kernel queues completions on the error queue for all zerocopy 22 * transfers. 23 */ 24 25 #define _GNU_SOURCE 26 27 #include <arpa/inet.h> 28 #include <error.h> 29 #include <errno.h> 30 #include <limits.h> 31 #include <linux/errqueue.h> 32 #include <linux/if_packet.h> 33 #include <linux/ipv6.h> 34 #include <linux/socket.h> 35 #include <linux/sockios.h> 36 #include <net/ethernet.h> 37 #include <net/if.h> 38 #include <netinet/ip.h> 39 #include <netinet/ip6.h> 40 #include <netinet/tcp.h> 41 #include <netinet/udp.h> 42 #include <poll.h> 43 #include <sched.h> 44 #include <stdbool.h> 45 #include <stdio.h> 46 #include <stdint.h> 47 #include <stdlib.h> 48 #include <string.h> 49 #include <sys/ioctl.h> 50 #include <sys/socket.h> 51 #include <sys/stat.h> 52 #include <sys/time.h> 53 #include <sys/types.h> 54 #include <sys/wait.h> 55 #include <unistd.h> 56 57 #ifndef SO_EE_ORIGIN_ZEROCOPY 58 #define SO_EE_ORIGIN_ZEROCOPY 5 59 #endif 60 61 #ifndef SO_ZEROCOPY 62 #define SO_ZEROCOPY 60 63 #endif 64 65 #ifndef SO_EE_CODE_ZEROCOPY_COPIED 66 #define SO_EE_CODE_ZEROCOPY_COPIED 1 67 #endif 68 69 #ifndef MSG_ZEROCOPY 70 #define MSG_ZEROCOPY 0x4000000 71 #endif 72 73 static int cfg_cork; 74 static bool cfg_cork_mixed; 75 static int cfg_cpu = -1; /* default: pin to last cpu */ 76 static int cfg_family = PF_UNSPEC; 77 static int cfg_ifindex = 1; 78 static int cfg_payload_len; 79 static int cfg_port = 8000; 80 static bool cfg_rx; 81 static int cfg_runtime_ms = 4200; 82 static int cfg_verbose; 83 static int cfg_waittime_ms = 500; 84 static bool cfg_zerocopy; 85 86 static socklen_t cfg_alen; 87 static struct sockaddr_storage cfg_dst_addr; 88 static struct sockaddr_storage cfg_src_addr; 89 90 static char payload[IP_MAXPACKET]; 91 static long packets, bytes, completions, expected_completions; 92 static int zerocopied = -1; 93 static uint32_t next_completion; 94 95 static unsigned long gettimeofday_ms(void) 96 { 97 struct timeval tv; 98 99 gettimeofday(&tv, NULL); 100 return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); 101 } 102 103 static uint16_t get_ip_csum(const uint16_t *start, int num_words) 104 { 105 unsigned long sum = 0; 106 int i; 107 108 for (i = 0; i < num_words; i++) 109 sum += start[i]; 110 111 while (sum >> 16) 112 sum = (sum & 0xFFFF) + (sum >> 16); 113 114 return ~sum; 115 } 116 117 static int do_setcpu(int cpu) 118 { 119 cpu_set_t mask; 120 121 CPU_ZERO(&mask); 122 CPU_SET(cpu, &mask); 123 if (sched_setaffinity(0, sizeof(mask), &mask)) 124 error(1, 0, "setaffinity %d", cpu); 125 126 if (cfg_verbose) 127 fprintf(stderr, "cpu: %u\n", cpu); 128 129 return 0; 130 } 131 132 static void do_setsockopt(int fd, int level, int optname, int val) 133 { 134 if (setsockopt(fd, level, optname, &val, sizeof(val))) 135 error(1, errno, "setsockopt %d.%d: %d", level, optname, val); 136 } 137 138 static int do_poll(int fd, int events) 139 { 140 struct pollfd pfd; 141 int ret; 142 143 pfd.events = events; 144 pfd.revents = 0; 145 pfd.fd = fd; 146 147 ret = poll(&pfd, 1, cfg_waittime_ms); 148 if (ret == -1) 149 error(1, errno, "poll"); 150 151 return ret && (pfd.revents & events); 152 } 153 154 static int do_accept(int fd) 155 { 156 int fda = fd; 157 158 fd = accept(fda, NULL, NULL); 159 if (fd == -1) 160 error(1, errno, "accept"); 161 if (close(fda)) 162 error(1, errno, "close listen sock"); 163 164 return fd; 165 } 166 167 static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy) 168 { 169 int ret, len, i, flags; 170 171 len = 0; 172 for (i = 0; i < msg->msg_iovlen; i++) 173 len += msg->msg_iov[i].iov_len; 174 175 flags = MSG_DONTWAIT; 176 if (do_zerocopy) 177 flags |= MSG_ZEROCOPY; 178 179 ret = sendmsg(fd, msg, flags); 180 if (ret == -1 && errno == EAGAIN) 181 return false; 182 if (ret == -1) 183 error(1, errno, "send"); 184 if (cfg_verbose && ret != len) 185 fprintf(stderr, "send: ret=%u != %u\n", ret, len); 186 187 if (len) { 188 packets++; 189 bytes += ret; 190 if (do_zerocopy && ret) 191 expected_completions++; 192 } 193 194 return true; 195 } 196 197 static void do_sendmsg_corked(int fd, struct msghdr *msg) 198 { 199 bool do_zerocopy = cfg_zerocopy; 200 int i, payload_len, extra_len; 201 202 /* split up the packet. for non-multiple, make first buffer longer */ 203 payload_len = cfg_payload_len / cfg_cork; 204 extra_len = cfg_payload_len - (cfg_cork * payload_len); 205 206 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1); 207 208 for (i = 0; i < cfg_cork; i++) { 209 210 /* in mixed-frags mode, alternate zerocopy and copy frags 211 * start with non-zerocopy, to ensure attach later works 212 */ 213 if (cfg_cork_mixed) 214 do_zerocopy = (i & 1); 215 216 msg->msg_iov[0].iov_len = payload_len + extra_len; 217 extra_len = 0; 218 219 do_sendmsg(fd, msg, do_zerocopy); 220 } 221 222 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); 223 } 224 225 static int setup_iph(struct iphdr *iph, uint16_t payload_len) 226 { 227 struct sockaddr_in *daddr = (void *) &cfg_dst_addr; 228 struct sockaddr_in *saddr = (void *) &cfg_src_addr; 229 230 memset(iph, 0, sizeof(*iph)); 231 232 iph->version = 4; 233 iph->tos = 0; 234 iph->ihl = 5; 235 iph->ttl = 2; 236 iph->saddr = saddr->sin_addr.s_addr; 237 iph->daddr = daddr->sin_addr.s_addr; 238 iph->protocol = IPPROTO_EGP; 239 iph->tot_len = htons(sizeof(*iph) + payload_len); 240 iph->check = get_ip_csum((void *) iph, iph->ihl << 1); 241 242 return sizeof(*iph); 243 } 244 245 static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len) 246 { 247 struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr; 248 struct sockaddr_in6 *saddr = (void *) &cfg_src_addr; 249 250 memset(ip6h, 0, sizeof(*ip6h)); 251 252 ip6h->version = 6; 253 ip6h->payload_len = htons(payload_len); 254 ip6h->nexthdr = IPPROTO_EGP; 255 ip6h->hop_limit = 2; 256 ip6h->saddr = saddr->sin6_addr; 257 ip6h->daddr = daddr->sin6_addr; 258 259 return sizeof(*ip6h); 260 } 261 262 263 static void setup_sockaddr(int domain, const char *str_addr, 264 struct sockaddr_storage *sockaddr) 265 { 266 struct sockaddr_in6 *addr6 = (void *) sockaddr; 267 struct sockaddr_in *addr4 = (void *) sockaddr; 268 269 switch (domain) { 270 case PF_INET: 271 memset(addr4, 0, sizeof(*addr4)); 272 addr4->sin_family = AF_INET; 273 addr4->sin_port = htons(cfg_port); 274 if (str_addr && 275 inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) 276 error(1, 0, "ipv4 parse error: %s", str_addr); 277 break; 278 case PF_INET6: 279 memset(addr6, 0, sizeof(*addr6)); 280 addr6->sin6_family = AF_INET6; 281 addr6->sin6_port = htons(cfg_port); 282 if (str_addr && 283 inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) 284 error(1, 0, "ipv6 parse error: %s", str_addr); 285 break; 286 default: 287 error(1, 0, "illegal domain"); 288 } 289 } 290 291 static int do_setup_tx(int domain, int type, int protocol) 292 { 293 int fd; 294 295 fd = socket(domain, type, protocol); 296 if (fd == -1) 297 error(1, errno, "socket t"); 298 299 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); 300 if (cfg_zerocopy) 301 do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1); 302 303 if (domain != PF_PACKET) 304 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) 305 error(1, errno, "connect"); 306 307 return fd; 308 } 309 310 static bool do_recv_completion(int fd) 311 { 312 struct sock_extended_err *serr; 313 struct msghdr msg = {}; 314 struct cmsghdr *cm; 315 uint32_t hi, lo, range; 316 int ret, zerocopy; 317 char control[100]; 318 319 msg.msg_control = control; 320 msg.msg_controllen = sizeof(control); 321 322 ret = recvmsg(fd, &msg, MSG_ERRQUEUE); 323 if (ret == -1 && errno == EAGAIN) 324 return false; 325 if (ret == -1) 326 error(1, errno, "recvmsg notification"); 327 if (msg.msg_flags & MSG_CTRUNC) 328 error(1, errno, "recvmsg notification: truncated"); 329 330 cm = CMSG_FIRSTHDR(&msg); 331 if (!cm) 332 error(1, 0, "cmsg: no cmsg"); 333 if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || 334 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) || 335 (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP))) 336 error(1, 0, "serr: wrong type: %d.%d", 337 cm->cmsg_level, cm->cmsg_type); 338 339 serr = (void *) CMSG_DATA(cm); 340 if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) 341 error(1, 0, "serr: wrong origin: %u", serr->ee_origin); 342 if (serr->ee_errno != 0) 343 error(1, 0, "serr: wrong error code: %u", serr->ee_errno); 344 345 hi = serr->ee_data; 346 lo = serr->ee_info; 347 range = hi - lo + 1; 348 349 /* Detect notification gaps. These should not happen often, if at all. 350 * Gaps can occur due to drops, reordering and retransmissions. 351 */ 352 if (lo != next_completion) 353 fprintf(stderr, "gap: %u..%u does not append to %u\n", 354 lo, hi, next_completion); 355 next_completion = hi + 1; 356 357 zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED); 358 if (zerocopied == -1) 359 zerocopied = zerocopy; 360 else if (zerocopied != zerocopy) { 361 fprintf(stderr, "serr: inconsistent\n"); 362 zerocopied = zerocopy; 363 } 364 365 if (cfg_verbose >= 2) 366 fprintf(stderr, "completed: %u (h=%u l=%u)\n", 367 range, hi, lo); 368 369 completions += range; 370 return true; 371 } 372 373 /* Read all outstanding messages on the errqueue */ 374 static void do_recv_completions(int fd) 375 { 376 while (do_recv_completion(fd)) {} 377 } 378 379 /* Wait for all remaining completions on the errqueue */ 380 static void do_recv_remaining_completions(int fd) 381 { 382 int64_t tstop = gettimeofday_ms() + cfg_waittime_ms; 383 384 while (completions < expected_completions && 385 gettimeofday_ms() < tstop) { 386 if (do_poll(fd, POLLERR)) 387 do_recv_completions(fd); 388 } 389 390 if (completions < expected_completions) 391 fprintf(stderr, "missing notifications: %lu < %lu\n", 392 completions, expected_completions); 393 } 394 395 static void do_tx(int domain, int type, int protocol) 396 { 397 struct iovec iov[3] = { {0} }; 398 struct sockaddr_ll laddr; 399 struct msghdr msg = {0}; 400 struct ethhdr eth; 401 union { 402 struct ipv6hdr ip6h; 403 struct iphdr iph; 404 } nh; 405 uint64_t tstop; 406 int fd; 407 408 fd = do_setup_tx(domain, type, protocol); 409 410 if (domain == PF_PACKET) { 411 uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6; 412 413 /* sock_raw passes ll header as data */ 414 if (type == SOCK_RAW) { 415 memset(eth.h_dest, 0x06, ETH_ALEN); 416 memset(eth.h_source, 0x02, ETH_ALEN); 417 eth.h_proto = htons(proto); 418 iov[0].iov_base = ð 419 iov[0].iov_len = sizeof(eth); 420 msg.msg_iovlen++; 421 } 422 423 /* both sock_raw and sock_dgram expect name */ 424 memset(&laddr, 0, sizeof(laddr)); 425 laddr.sll_family = AF_PACKET; 426 laddr.sll_ifindex = cfg_ifindex; 427 laddr.sll_protocol = htons(proto); 428 laddr.sll_halen = ETH_ALEN; 429 430 memset(laddr.sll_addr, 0x06, ETH_ALEN); 431 432 msg.msg_name = &laddr; 433 msg.msg_namelen = sizeof(laddr); 434 } 435 436 /* packet and raw sockets with hdrincl must pass network header */ 437 if (domain == PF_PACKET || protocol == IPPROTO_RAW) { 438 if (cfg_family == PF_INET) 439 iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len); 440 else 441 iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len); 442 443 iov[1].iov_base = (void *) &nh; 444 msg.msg_iovlen++; 445 } 446 447 iov[2].iov_base = payload; 448 iov[2].iov_len = cfg_payload_len; 449 msg.msg_iovlen++; 450 msg.msg_iov = &iov[3 - msg.msg_iovlen]; 451 452 tstop = gettimeofday_ms() + cfg_runtime_ms; 453 do { 454 if (cfg_cork) 455 do_sendmsg_corked(fd, &msg); 456 else 457 do_sendmsg(fd, &msg, cfg_zerocopy); 458 459 while (!do_poll(fd, POLLOUT)) { 460 if (cfg_zerocopy) 461 do_recv_completions(fd); 462 } 463 464 } while (gettimeofday_ms() < tstop); 465 466 if (cfg_zerocopy) 467 do_recv_remaining_completions(fd); 468 469 if (close(fd)) 470 error(1, errno, "close"); 471 472 fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n", 473 packets, bytes >> 20, completions, 474 zerocopied == 1 ? 'y' : 'n'); 475 } 476 477 static int do_setup_rx(int domain, int type, int protocol) 478 { 479 int fd; 480 481 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW, 482 * to recv the only copy of the packet, not a clone 483 */ 484 if (domain == PF_PACKET) 485 error(1, 0, "Use PF_INET/SOCK_RAW to read"); 486 487 if (type == SOCK_RAW && protocol == IPPROTO_RAW) 488 error(1, 0, "IPPROTO_RAW: not supported on Rx"); 489 490 fd = socket(domain, type, protocol); 491 if (fd == -1) 492 error(1, errno, "socket r"); 493 494 do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21); 495 do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16); 496 do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1); 497 498 if (bind(fd, (void *) &cfg_dst_addr, cfg_alen)) 499 error(1, errno, "bind"); 500 501 if (type == SOCK_STREAM) { 502 if (listen(fd, 1)) 503 error(1, errno, "listen"); 504 fd = do_accept(fd); 505 } 506 507 return fd; 508 } 509 510 /* Flush all outstanding bytes for the tcp receive queue */ 511 static void do_flush_tcp(int fd) 512 { 513 int ret; 514 515 /* MSG_TRUNC flushes up to len bytes */ 516 ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT); 517 if (ret == -1 && errno == EAGAIN) 518 return; 519 if (ret == -1) 520 error(1, errno, "flush"); 521 if (!ret) 522 return; 523 524 packets++; 525 bytes += ret; 526 } 527 528 /* Flush all outstanding datagrams. Verify first few bytes of each. */ 529 static void do_flush_datagram(int fd, int type) 530 { 531 int ret, off = 0; 532 char buf[64]; 533 534 /* MSG_TRUNC will return full datagram length */ 535 ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC); 536 if (ret == -1 && errno == EAGAIN) 537 return; 538 539 /* raw ipv4 return with header, raw ipv6 without */ 540 if (cfg_family == PF_INET && type == SOCK_RAW) { 541 off += sizeof(struct iphdr); 542 ret -= sizeof(struct iphdr); 543 } 544 545 if (ret == -1) 546 error(1, errno, "recv"); 547 if (ret != cfg_payload_len) 548 error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); 549 if (ret > sizeof(buf) - off) 550 ret = sizeof(buf) - off; 551 if (memcmp(buf + off, payload, ret)) 552 error(1, 0, "recv: data mismatch"); 553 554 packets++; 555 bytes += cfg_payload_len; 556 } 557 558 static void do_rx(int domain, int type, int protocol) 559 { 560 uint64_t tstop; 561 int fd; 562 563 fd = do_setup_rx(domain, type, protocol); 564 565 tstop = gettimeofday_ms() + cfg_runtime_ms; 566 do { 567 if (type == SOCK_STREAM) 568 do_flush_tcp(fd); 569 else 570 do_flush_datagram(fd, type); 571 572 do_poll(fd, POLLIN); 573 574 } while (gettimeofday_ms() < tstop); 575 576 if (close(fd)) 577 error(1, errno, "close"); 578 579 fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20); 580 } 581 582 static void do_test(int domain, int type, int protocol) 583 { 584 int i; 585 586 if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM)) 587 error(1, 0, "can only cork udp sockets"); 588 589 do_setcpu(cfg_cpu); 590 591 for (i = 0; i < IP_MAXPACKET; i++) 592 payload[i] = 'a' + (i % 26); 593 594 if (cfg_rx) 595 do_rx(domain, type, protocol); 596 else 597 do_tx(domain, type, protocol); 598 } 599 600 static void usage(const char *filepath) 601 { 602 error(1, 0, "Usage: %s [options] <test>", filepath); 603 } 604 605 static void parse_opts(int argc, char **argv) 606 { 607 const int max_payload_len = sizeof(payload) - 608 sizeof(struct ipv6hdr) - 609 sizeof(struct tcphdr) - 610 40 /* max tcp options */; 611 int c; 612 char *daddr = NULL, *saddr = NULL; 613 614 cfg_payload_len = max_payload_len; 615 616 while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) { 617 switch (c) { 618 case '4': 619 if (cfg_family != PF_UNSPEC) 620 error(1, 0, "Pass one of -4 or -6"); 621 cfg_family = PF_INET; 622 cfg_alen = sizeof(struct sockaddr_in); 623 break; 624 case '6': 625 if (cfg_family != PF_UNSPEC) 626 error(1, 0, "Pass one of -4 or -6"); 627 cfg_family = PF_INET6; 628 cfg_alen = sizeof(struct sockaddr_in6); 629 break; 630 case 'c': 631 cfg_cork = strtol(optarg, NULL, 0); 632 break; 633 case 'C': 634 cfg_cpu = strtol(optarg, NULL, 0); 635 break; 636 case 'D': 637 daddr = optarg; 638 break; 639 case 'i': 640 cfg_ifindex = if_nametoindex(optarg); 641 if (cfg_ifindex == 0) 642 error(1, errno, "invalid iface: %s", optarg); 643 break; 644 case 'm': 645 cfg_cork_mixed = true; 646 break; 647 case 'p': 648 cfg_port = strtoul(optarg, NULL, 0); 649 break; 650 case 'r': 651 cfg_rx = true; 652 break; 653 case 's': 654 cfg_payload_len = strtoul(optarg, NULL, 0); 655 break; 656 case 'S': 657 saddr = optarg; 658 break; 659 case 't': 660 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000; 661 break; 662 case 'v': 663 cfg_verbose++; 664 break; 665 case 'z': 666 cfg_zerocopy = true; 667 break; 668 } 669 } 670 setup_sockaddr(cfg_family, daddr, &cfg_dst_addr); 671 setup_sockaddr(cfg_family, saddr, &cfg_src_addr); 672 673 if (cfg_payload_len > max_payload_len) 674 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len); 675 if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork)) 676 error(1, 0, "-m: cork_mixed requires corking and zerocopy"); 677 678 if (optind != argc - 1) 679 usage(argv[0]); 680 } 681 682 int main(int argc, char **argv) 683 { 684 const char *cfg_test; 685 686 parse_opts(argc, argv); 687 688 cfg_test = argv[argc - 1]; 689 690 if (!strcmp(cfg_test, "packet")) 691 do_test(PF_PACKET, SOCK_RAW, 0); 692 else if (!strcmp(cfg_test, "packet_dgram")) 693 do_test(PF_PACKET, SOCK_DGRAM, 0); 694 else if (!strcmp(cfg_test, "raw")) 695 do_test(cfg_family, SOCK_RAW, IPPROTO_EGP); 696 else if (!strcmp(cfg_test, "raw_hdrincl")) 697 do_test(cfg_family, SOCK_RAW, IPPROTO_RAW); 698 else if (!strcmp(cfg_test, "tcp")) 699 do_test(cfg_family, SOCK_STREAM, 0); 700 else if (!strcmp(cfg_test, "udp")) 701 do_test(cfg_family, SOCK_DGRAM, 0); 702 else 703 error(1, 0, "unknown cfg_test %s", cfg_test); 704 705 return 0; 706 } 707