1 /* Evaluate MSG_ZEROCOPY
2  *
3  * Send traffic between two processes over one of the supported
4  * protocols and modes:
5  *
6  * PF_INET/PF_INET6
7  * - SOCK_STREAM
8  * - SOCK_DGRAM
9  * - SOCK_DGRAM with UDP_CORK
10  * - SOCK_RAW
11  * - SOCK_RAW with IP_HDRINCL
12  *
13  * PF_PACKET
14  * - SOCK_DGRAM
15  * - SOCK_RAW
16  *
17  * PF_RDS
18  * - SOCK_SEQPACKET
19  *
20  * Start this program on two connected hosts, one in send mode and
21  * the other with option '-r' to put it in receiver mode.
22  *
23  * If zerocopy mode ('-z') is enabled, the sender will verify that
24  * the kernel queues completions on the error queue for all zerocopy
25  * transfers.
26  */
27 
28 #define _GNU_SOURCE
29 
30 #include <arpa/inet.h>
31 #include <error.h>
32 #include <errno.h>
33 #include <limits.h>
34 #include <linux/errqueue.h>
35 #include <linux/if_packet.h>
36 #include <linux/ipv6.h>
37 #include <linux/socket.h>
38 #include <linux/sockios.h>
39 #include <net/ethernet.h>
40 #include <net/if.h>
41 #include <netinet/ip.h>
42 #include <netinet/ip6.h>
43 #include <netinet/tcp.h>
44 #include <netinet/udp.h>
45 #include <poll.h>
46 #include <sched.h>
47 #include <stdbool.h>
48 #include <stdio.h>
49 #include <stdint.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <sys/ioctl.h>
53 #include <sys/socket.h>
54 #include <sys/stat.h>
55 #include <sys/time.h>
56 #include <sys/types.h>
57 #include <sys/wait.h>
58 #include <unistd.h>
59 #include <linux/rds.h>
60 
/* Fallback definitions of the zerocopy related constants. Each one is
 * only defined here when the included headers did not already provide it.
 */
#ifndef SO_EE_ORIGIN_ZEROCOPY
#define SO_EE_ORIGIN_ZEROCOPY		5
#endif

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif

#ifndef SO_EE_CODE_ZEROCOPY_COPIED
#define SO_EE_CODE_ZEROCOPY_COPIED	1
#endif

#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif
76 
/* Run configuration, filled in by parse_opts() */
static int  cfg_cork;			/* -c: sendmsg calls per corked datagram, 0 disables */
static bool cfg_cork_mixed;		/* -m: alternate copy and zerocopy frags when corking */
static int  cfg_cpu		= -1;		/* default: pin to last cpu */
static int  cfg_family		= PF_UNSPEC;	/* -4 / -6 */
static int  cfg_ifindex		= 1;		/* -i: interface used for PF_PACKET tx */
static int  cfg_payload_len;		/* -s: payload bytes per send */
static int  cfg_port		= 8000;		/* -p */
static bool cfg_rx;			/* -r: run as receiver */
static int  cfg_runtime_ms	= 4200;		/* -t: given in seconds, stored in ms plus 200 */
static int  cfg_verbose;		/* -v: may be repeated */
static int  cfg_waittime_ms	= 500;		/* timeout of a single poll() */
static int  cfg_notification_limit = 32;	/* -l: sends between completion reads */
static bool cfg_zerocopy;		/* -z: send with MSG_ZEROCOPY */

/* Addresses built by setup_sockaddr(); cfg_alen is the length passed
 * to bind() and connect().
 */
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static struct sockaddr_storage cfg_src_addr;

/* Payload buffer (filled in do_test()) and running counters */
static char payload[IP_MAXPACKET];
static long packets, bytes, completions, expected_completions;
static int  zerocopied = -1;		/* -1 until the first completion is seen */
static uint32_t next_completion;	/* first id expected in the next completion */
static uint32_t sends_since_notify;	/* sends since completions were last read */
100 
/* Return the current gettimeofday() time converted to milliseconds. */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;
	unsigned long ms;

	gettimeofday(&now, NULL);

	ms = now.tv_sec * 1000;
	ms += now.tv_usec / 1000;

	return ms;
}
108 
/* Compute a 16-bit one's-complement checksum.
 *
 * @start:     first 16-bit word to sum
 * @num_words: number of 16-bit words to sum
 *
 * Returns the complement of the folded sum, truncated to 16 bits.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	const uint16_t *word = start;
	unsigned long acc = 0;
	int remaining;

	for (remaining = num_words; remaining > 0; remaining--)
		acc += *word++;

	/* fold everything above bit 15 back into the low 16 bits */
	for (; acc >> 16; )
		acc = (acc >> 16) + (acc & 0xFFFF);

	return ~acc;
}
122 
do_setcpu(int cpu)123 static int do_setcpu(int cpu)
124 {
125 	cpu_set_t mask;
126 
127 	CPU_ZERO(&mask);
128 	CPU_SET(cpu, &mask);
129 	if (sched_setaffinity(0, sizeof(mask), &mask))
130 		fprintf(stderr, "cpu: unable to pin, may increase variance.\n");
131 	else if (cfg_verbose)
132 		fprintf(stderr, "cpu: %u\n", cpu);
133 
134 	return 0;
135 }
136 
/* Set an int-valued socket option. Exits the program with a message
 * naming level, option and value if setsockopt() fails.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc == 0)
		return;

	error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
142 
do_poll(int fd,int events)143 static int do_poll(int fd, int events)
144 {
145 	struct pollfd pfd;
146 	int ret;
147 
148 	pfd.events = events;
149 	pfd.revents = 0;
150 	pfd.fd = fd;
151 
152 	ret = poll(&pfd, 1, cfg_waittime_ms);
153 	if (ret == -1)
154 		error(1, errno, "poll");
155 
156 	return ret && (pfd.revents & events);
157 }
158 
/* Accept one connection on listening socket @fd, then close @fd.
 *
 * Returns the connected socket. Any failure terminates the program.
 */
static int do_accept(int fd)
{
	int conn;

	conn = accept(fd, NULL, NULL);
	if (conn == -1)
		error(1, errno, "accept");

	/* only a single connection is served: drop the listener */
	if (close(fd) != 0)
		error(1, errno, "close listen sock");

	return conn;
}
171 
/* Write an RDS zerocopy cookie control message into @msg.
 *
 * @msg:    message whose msg_control buffer receives the cmsg; the
 *          buffer must be set up by the caller and hold at least
 *          CMSG_LEN(sizeof(cookie)) bytes
 * @cookie: value stored as the cmsg payload
 *
 * A missing or too small control buffer terminates the program.
 */
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
{
	struct cmsghdr *cm;

	/* these are caller errors, not syscall failures: errno carries no
	 * information here, so do not report it
	 */
	if (!msg->msg_control)
		error(1, 0, "NULL cookie");
	if (msg->msg_controllen < CMSG_LEN(sizeof(cookie)))
		error(1, 0, "cookie buffer too small");

	cm = (void *)msg->msg_control;
	cm->cmsg_len = CMSG_LEN(sizeof(cookie));
	cm->cmsg_level = SOL_RDS;
	cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
	memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
}
184 
do_sendmsg(int fd,struct msghdr * msg,bool do_zerocopy,int domain)185 static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
186 {
187 	int ret, len, i, flags;
188 	static uint32_t cookie;
189 	char ckbuf[CMSG_SPACE(sizeof(cookie))];
190 
191 	len = 0;
192 	for (i = 0; i < msg->msg_iovlen; i++)
193 		len += msg->msg_iov[i].iov_len;
194 
195 	flags = MSG_DONTWAIT;
196 	if (do_zerocopy) {
197 		flags |= MSG_ZEROCOPY;
198 		if (domain == PF_RDS) {
199 			memset(&msg->msg_control, 0, sizeof(msg->msg_control));
200 			msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
201 			msg->msg_control = (struct cmsghdr *)ckbuf;
202 			add_zcopy_cookie(msg, ++cookie);
203 		}
204 	}
205 
206 	ret = sendmsg(fd, msg, flags);
207 	if (ret == -1 && errno == EAGAIN)
208 		return false;
209 	if (ret == -1)
210 		error(1, errno, "send");
211 	if (cfg_verbose && ret != len)
212 		fprintf(stderr, "send: ret=%u != %u\n", ret, len);
213 	sends_since_notify++;
214 
215 	if (len) {
216 		packets++;
217 		bytes += ret;
218 		if (do_zerocopy && ret)
219 			expected_completions++;
220 	}
221 	if (do_zerocopy && domain == PF_RDS) {
222 		msg->msg_control = NULL;
223 		msg->msg_controllen = 0;
224 	}
225 
226 	return true;
227 }
228 
do_sendmsg_corked(int fd,struct msghdr * msg)229 static void do_sendmsg_corked(int fd, struct msghdr *msg)
230 {
231 	bool do_zerocopy = cfg_zerocopy;
232 	int i, payload_len, extra_len;
233 
234 	/* split up the packet. for non-multiple, make first buffer longer */
235 	payload_len = cfg_payload_len / cfg_cork;
236 	extra_len = cfg_payload_len - (cfg_cork * payload_len);
237 
238 	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
239 
240 	for (i = 0; i < cfg_cork; i++) {
241 
242 		/* in mixed-frags mode, alternate zerocopy and copy frags
243 		 * start with non-zerocopy, to ensure attach later works
244 		 */
245 		if (cfg_cork_mixed)
246 			do_zerocopy = (i & 1);
247 
248 		msg->msg_iov[0].iov_len = payload_len + extra_len;
249 		extra_len = 0;
250 
251 		do_sendmsg(fd, msg, do_zerocopy,
252 			   (cfg_dst_addr.ss_family == AF_INET ?
253 			    PF_INET : PF_INET6));
254 	}
255 
256 	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
257 }
258 
setup_iph(struct iphdr * iph,uint16_t payload_len)259 static int setup_iph(struct iphdr *iph, uint16_t payload_len)
260 {
261 	struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
262 	struct sockaddr_in *saddr = (void *) &cfg_src_addr;
263 
264 	memset(iph, 0, sizeof(*iph));
265 
266 	iph->version	= 4;
267 	iph->tos	= 0;
268 	iph->ihl	= 5;
269 	iph->ttl	= 2;
270 	iph->saddr	= saddr->sin_addr.s_addr;
271 	iph->daddr	= daddr->sin_addr.s_addr;
272 	iph->protocol	= IPPROTO_EGP;
273 	iph->tot_len	= htons(sizeof(*iph) + payload_len);
274 	iph->check	= get_ip_csum((void *) iph, iph->ihl << 1);
275 
276 	return sizeof(*iph);
277 }
278 
setup_ip6h(struct ipv6hdr * ip6h,uint16_t payload_len)279 static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
280 {
281 	struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
282 	struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
283 
284 	memset(ip6h, 0, sizeof(*ip6h));
285 
286 	ip6h->version		= 6;
287 	ip6h->payload_len	= htons(payload_len);
288 	ip6h->nexthdr		= IPPROTO_EGP;
289 	ip6h->hop_limit		= 2;
290 	ip6h->saddr		= saddr->sin6_addr;
291 	ip6h->daddr		= daddr->sin6_addr;
292 
293 	return sizeof(*ip6h);
294 }
295 
296 
setup_sockaddr(int domain,const char * str_addr,struct sockaddr_storage * sockaddr)297 static void setup_sockaddr(int domain, const char *str_addr,
298 			   struct sockaddr_storage *sockaddr)
299 {
300 	struct sockaddr_in6 *addr6 = (void *) sockaddr;
301 	struct sockaddr_in *addr4 = (void *) sockaddr;
302 
303 	switch (domain) {
304 	case PF_INET:
305 		memset(addr4, 0, sizeof(*addr4));
306 		addr4->sin_family = AF_INET;
307 		addr4->sin_port = htons(cfg_port);
308 		if (str_addr &&
309 		    inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
310 			error(1, 0, "ipv4 parse error: %s", str_addr);
311 		break;
312 	case PF_INET6:
313 		memset(addr6, 0, sizeof(*addr6));
314 		addr6->sin6_family = AF_INET6;
315 		addr6->sin6_port = htons(cfg_port);
316 		if (str_addr &&
317 		    inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
318 			error(1, 0, "ipv6 parse error: %s", str_addr);
319 		break;
320 	default:
321 		error(1, 0, "illegal domain");
322 	}
323 }
324 
do_setup_tx(int domain,int type,int protocol)325 static int do_setup_tx(int domain, int type, int protocol)
326 {
327 	int fd;
328 
329 	fd = socket(domain, type, protocol);
330 	if (fd == -1)
331 		error(1, errno, "socket t");
332 
333 	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
334 	if (cfg_zerocopy)
335 		do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
336 
337 	if (domain != PF_PACKET && domain != PF_RDS)
338 		if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
339 			error(1, errno, "connect");
340 
341 	if (domain == PF_RDS) {
342 		if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
343 			error(1, errno, "bind");
344 	}
345 
346 	return fd;
347 }
348 
do_process_zerocopy_cookies(struct rds_zcopy_cookies * ck)349 static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
350 {
351 	int i;
352 
353 	if (ck->num > RDS_MAX_ZCOOKIES)
354 		error(1, 0, "Returned %d cookies, max expected %d\n",
355 		      ck->num, RDS_MAX_ZCOOKIES);
356 	for (i = 0; i < ck->num; i++)
357 		if (cfg_verbose >= 2)
358 			fprintf(stderr, "%d\n", ck->cookies[i]);
359 	return ck->num;
360 }
361 
do_recvmsg_completion(int fd)362 static bool do_recvmsg_completion(int fd)
363 {
364 	char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
365 	struct rds_zcopy_cookies *ck;
366 	struct cmsghdr *cmsg;
367 	struct msghdr msg;
368 	bool ret = false;
369 
370 	memset(&msg, 0, sizeof(msg));
371 	msg.msg_control = cmsgbuf;
372 	msg.msg_controllen = sizeof(cmsgbuf);
373 
374 	if (recvmsg(fd, &msg, MSG_DONTWAIT))
375 		return ret;
376 
377 	if (msg.msg_flags & MSG_CTRUNC)
378 		error(1, errno, "recvmsg notification: truncated");
379 
380 	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
381 		if (cmsg->cmsg_level == SOL_RDS &&
382 		    cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
383 
384 			ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
385 			completions += do_process_zerocopy_cookies(ck);
386 			ret = true;
387 			break;
388 		}
389 		error(0, 0, "ignoring cmsg at level %d type %d\n",
390 			    cmsg->cmsg_level, cmsg->cmsg_type);
391 	}
392 	return ret;
393 }
394 
do_recv_completion(int fd,int domain)395 static bool do_recv_completion(int fd, int domain)
396 {
397 	struct sock_extended_err *serr;
398 	struct msghdr msg = {};
399 	struct cmsghdr *cm;
400 	uint32_t hi, lo, range;
401 	int ret, zerocopy;
402 	char control[100];
403 
404 	if (domain == PF_RDS)
405 		return do_recvmsg_completion(fd);
406 
407 	msg.msg_control = control;
408 	msg.msg_controllen = sizeof(control);
409 
410 	ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
411 	if (ret == -1 && errno == EAGAIN)
412 		return false;
413 	if (ret == -1)
414 		error(1, errno, "recvmsg notification");
415 	if (msg.msg_flags & MSG_CTRUNC)
416 		error(1, errno, "recvmsg notification: truncated");
417 
418 	cm = CMSG_FIRSTHDR(&msg);
419 	if (!cm)
420 		error(1, 0, "cmsg: no cmsg");
421 	if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
422 	      (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
423 	      (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
424 		error(1, 0, "serr: wrong type: %d.%d",
425 		      cm->cmsg_level, cm->cmsg_type);
426 
427 	serr = (void *) CMSG_DATA(cm);
428 
429 	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
430 		error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
431 	if (serr->ee_errno != 0)
432 		error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
433 
434 	hi = serr->ee_data;
435 	lo = serr->ee_info;
436 	range = hi - lo + 1;
437 
438 	/* Detect notification gaps. These should not happen often, if at all.
439 	 * Gaps can occur due to drops, reordering and retransmissions.
440 	 */
441 	if (cfg_verbose && lo != next_completion)
442 		fprintf(stderr, "gap: %u..%u does not append to %u\n",
443 			lo, hi, next_completion);
444 	next_completion = hi + 1;
445 
446 	zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
447 	if (zerocopied == -1)
448 		zerocopied = zerocopy;
449 	else if (zerocopied != zerocopy) {
450 		fprintf(stderr, "serr: inconsistent\n");
451 		zerocopied = zerocopy;
452 	}
453 
454 	if (cfg_verbose >= 2)
455 		fprintf(stderr, "completed: %u (h=%u l=%u)\n",
456 			range, hi, lo);
457 
458 	completions += range;
459 	return true;
460 }
461 
462 /* Read all outstanding messages on the errqueue */
do_recv_completions(int fd,int domain)463 static void do_recv_completions(int fd, int domain)
464 {
465 	while (do_recv_completion(fd, domain)) {}
466 	sends_since_notify = 0;
467 }
468 
469 /* Wait for all remaining completions on the errqueue */
do_recv_remaining_completions(int fd,int domain)470 static void do_recv_remaining_completions(int fd, int domain)
471 {
472 	int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
473 
474 	while (completions < expected_completions &&
475 	       gettimeofday_ms() < tstop) {
476 		if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR))
477 			do_recv_completions(fd, domain);
478 	}
479 
480 	if (completions < expected_completions)
481 		fprintf(stderr, "missing notifications: %lu < %lu\n",
482 			completions, expected_completions);
483 }
484 
do_tx(int domain,int type,int protocol)485 static void do_tx(int domain, int type, int protocol)
486 {
487 	struct iovec iov[3] = { {0} };
488 	struct sockaddr_ll laddr;
489 	struct msghdr msg = {0};
490 	struct ethhdr eth;
491 	union {
492 		struct ipv6hdr ip6h;
493 		struct iphdr iph;
494 	} nh;
495 	uint64_t tstop;
496 	int fd;
497 
498 	fd = do_setup_tx(domain, type, protocol);
499 
500 	if (domain == PF_PACKET) {
501 		uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
502 
503 		/* sock_raw passes ll header as data */
504 		if (type == SOCK_RAW) {
505 			memset(eth.h_dest, 0x06, ETH_ALEN);
506 			memset(eth.h_source, 0x02, ETH_ALEN);
507 			eth.h_proto = htons(proto);
508 			iov[0].iov_base = &eth;
509 			iov[0].iov_len = sizeof(eth);
510 			msg.msg_iovlen++;
511 		}
512 
513 		/* both sock_raw and sock_dgram expect name */
514 		memset(&laddr, 0, sizeof(laddr));
515 		laddr.sll_family	= AF_PACKET;
516 		laddr.sll_ifindex	= cfg_ifindex;
517 		laddr.sll_protocol	= htons(proto);
518 		laddr.sll_halen		= ETH_ALEN;
519 
520 		memset(laddr.sll_addr, 0x06, ETH_ALEN);
521 
522 		msg.msg_name		= &laddr;
523 		msg.msg_namelen		= sizeof(laddr);
524 	}
525 
526 	/* packet and raw sockets with hdrincl must pass network header */
527 	if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
528 		if (cfg_family == PF_INET)
529 			iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
530 		else
531 			iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
532 
533 		iov[1].iov_base = (void *) &nh;
534 		msg.msg_iovlen++;
535 	}
536 
537 	if (domain == PF_RDS) {
538 		msg.msg_name = &cfg_dst_addr;
539 		msg.msg_namelen =  (cfg_dst_addr.ss_family == AF_INET ?
540 				    sizeof(struct sockaddr_in) :
541 				    sizeof(struct sockaddr_in6));
542 	}
543 
544 	iov[2].iov_base = payload;
545 	iov[2].iov_len = cfg_payload_len;
546 	msg.msg_iovlen++;
547 	msg.msg_iov = &iov[3 - msg.msg_iovlen];
548 
549 	tstop = gettimeofday_ms() + cfg_runtime_ms;
550 	do {
551 		if (cfg_cork)
552 			do_sendmsg_corked(fd, &msg);
553 		else
554 			do_sendmsg(fd, &msg, cfg_zerocopy, domain);
555 
556 		if (cfg_zerocopy && sends_since_notify >= cfg_notification_limit)
557 			do_recv_completions(fd, domain);
558 
559 		while (!do_poll(fd, POLLOUT)) {
560 			if (cfg_zerocopy)
561 				do_recv_completions(fd, domain);
562 		}
563 
564 	} while (gettimeofday_ms() < tstop);
565 
566 	if (cfg_zerocopy)
567 		do_recv_remaining_completions(fd, domain);
568 
569 	if (close(fd))
570 		error(1, errno, "close");
571 
572 	fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
573 		packets, bytes >> 20, completions,
574 		zerocopied == 1 ? 'y' : 'n');
575 }
576 
do_setup_rx(int domain,int type,int protocol)577 static int do_setup_rx(int domain, int type, int protocol)
578 {
579 	int fd;
580 
581 	/* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
582 	 * to recv the only copy of the packet, not a clone
583 	 */
584 	if (domain == PF_PACKET)
585 		error(1, 0, "Use PF_INET/SOCK_RAW to read");
586 
587 	if (type == SOCK_RAW && protocol == IPPROTO_RAW)
588 		error(1, 0, "IPPROTO_RAW: not supported on Rx");
589 
590 	fd = socket(domain, type, protocol);
591 	if (fd == -1)
592 		error(1, errno, "socket r");
593 
594 	do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
595 	do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
596 	do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
597 
598 	if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
599 		error(1, errno, "bind");
600 
601 	if (type == SOCK_STREAM) {
602 		if (listen(fd, 1))
603 			error(1, errno, "listen");
604 		fd = do_accept(fd);
605 	}
606 
607 	return fd;
608 }
609 
610 /* Flush all outstanding bytes for the tcp receive queue */
do_flush_tcp(int fd)611 static void do_flush_tcp(int fd)
612 {
613 	int ret;
614 
615 	/* MSG_TRUNC flushes up to len bytes */
616 	ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
617 	if (ret == -1 && errno == EAGAIN)
618 		return;
619 	if (ret == -1)
620 		error(1, errno, "flush");
621 	if (!ret)
622 		return;
623 
624 	packets++;
625 	bytes += ret;
626 }
627 
628 /* Flush all outstanding datagrams. Verify first few bytes of each. */
do_flush_datagram(int fd,int type)629 static void do_flush_datagram(int fd, int type)
630 {
631 	int ret, off = 0;
632 	char buf[64];
633 
634 	/* MSG_TRUNC will return full datagram length */
635 	ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
636 	if (ret == -1 && errno == EAGAIN)
637 		return;
638 
639 	/* raw ipv4 return with header, raw ipv6 without */
640 	if (cfg_family == PF_INET && type == SOCK_RAW) {
641 		off += sizeof(struct iphdr);
642 		ret -= sizeof(struct iphdr);
643 	}
644 
645 	if (ret == -1)
646 		error(1, errno, "recv");
647 	if (ret != cfg_payload_len)
648 		error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
649 	if (ret > sizeof(buf) - off)
650 		ret = sizeof(buf) - off;
651 	if (memcmp(buf + off, payload, ret))
652 		error(1, 0, "recv: data mismatch");
653 
654 	packets++;
655 	bytes += cfg_payload_len;
656 }
657 
do_rx(int domain,int type,int protocol)658 static void do_rx(int domain, int type, int protocol)
659 {
660 	const int cfg_receiver_wait_ms = 400;
661 	uint64_t tstop;
662 	int fd;
663 
664 	fd = do_setup_rx(domain, type, protocol);
665 
666 	tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
667 	do {
668 		if (type == SOCK_STREAM)
669 			do_flush_tcp(fd);
670 		else
671 			do_flush_datagram(fd, type);
672 
673 		do_poll(fd, POLLIN);
674 
675 	} while (gettimeofday_ms() < tstop);
676 
677 	if (close(fd))
678 		error(1, errno, "close");
679 
680 	fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
681 }
682 
do_test(int domain,int type,int protocol)683 static void do_test(int domain, int type, int protocol)
684 {
685 	int i;
686 
687 	if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
688 		error(1, 0, "can only cork udp sockets");
689 
690 	do_setcpu(cfg_cpu);
691 
692 	for (i = 0; i < IP_MAXPACKET; i++)
693 		payload[i] = 'a' + (i % 26);
694 
695 	if (cfg_rx)
696 		do_rx(domain, type, protocol);
697 	else
698 		do_tx(domain, type, protocol);
699 }
700 
/* Print a one-line usage message for program @filepath and exit with
 * status 1: error() does not return when its status argument is nonzero.
 */
static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s [options] <test>", filepath);
}
705 
parse_opts(int argc,char ** argv)706 static void parse_opts(int argc, char **argv)
707 {
708 	const int max_payload_len = sizeof(payload) -
709 				    sizeof(struct ipv6hdr) -
710 				    sizeof(struct tcphdr) -
711 				    40 /* max tcp options */;
712 	int c;
713 	char *daddr = NULL, *saddr = NULL;
714 	char *cfg_test;
715 
716 	cfg_payload_len = max_payload_len;
717 
718 	while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) {
719 		switch (c) {
720 		case '4':
721 			if (cfg_family != PF_UNSPEC)
722 				error(1, 0, "Pass one of -4 or -6");
723 			cfg_family = PF_INET;
724 			cfg_alen = sizeof(struct sockaddr_in);
725 			break;
726 		case '6':
727 			if (cfg_family != PF_UNSPEC)
728 				error(1, 0, "Pass one of -4 or -6");
729 			cfg_family = PF_INET6;
730 			cfg_alen = sizeof(struct sockaddr_in6);
731 			break;
732 		case 'c':
733 			cfg_cork = strtol(optarg, NULL, 0);
734 			break;
735 		case 'C':
736 			cfg_cpu = strtol(optarg, NULL, 0);
737 			break;
738 		case 'D':
739 			daddr = optarg;
740 			break;
741 		case 'i':
742 			cfg_ifindex = if_nametoindex(optarg);
743 			if (cfg_ifindex == 0)
744 				error(1, errno, "invalid iface: %s", optarg);
745 			break;
746 		case 'l':
747 			cfg_notification_limit = strtoul(optarg, NULL, 0);
748 			break;
749 		case 'm':
750 			cfg_cork_mixed = true;
751 			break;
752 		case 'p':
753 			cfg_port = strtoul(optarg, NULL, 0);
754 			break;
755 		case 'r':
756 			cfg_rx = true;
757 			break;
758 		case 's':
759 			cfg_payload_len = strtoul(optarg, NULL, 0);
760 			break;
761 		case 'S':
762 			saddr = optarg;
763 			break;
764 		case 't':
765 			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
766 			break;
767 		case 'v':
768 			cfg_verbose++;
769 			break;
770 		case 'z':
771 			cfg_zerocopy = true;
772 			break;
773 		}
774 	}
775 
776 	cfg_test = argv[argc - 1];
777 	if (strcmp(cfg_test, "rds") == 0) {
778 		if (!daddr)
779 			error(1, 0, "-D <server addr> required for PF_RDS\n");
780 		if (!cfg_rx && !saddr)
781 			error(1, 0, "-S <client addr> required for PF_RDS\n");
782 	}
783 	setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
784 	setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
785 
786 	if (cfg_payload_len > max_payload_len)
787 		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
788 	if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
789 		error(1, 0, "-m: cork_mixed requires corking and zerocopy");
790 
791 	if (optind != argc - 1)
792 		usage(argv[0]);
793 }
794 
main(int argc,char ** argv)795 int main(int argc, char **argv)
796 {
797 	const char *cfg_test;
798 
799 	parse_opts(argc, argv);
800 
801 	cfg_test = argv[argc - 1];
802 
803 	if (!strcmp(cfg_test, "packet"))
804 		do_test(PF_PACKET, SOCK_RAW, 0);
805 	else if (!strcmp(cfg_test, "packet_dgram"))
806 		do_test(PF_PACKET, SOCK_DGRAM, 0);
807 	else if (!strcmp(cfg_test, "raw"))
808 		do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
809 	else if (!strcmp(cfg_test, "raw_hdrincl"))
810 		do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
811 	else if (!strcmp(cfg_test, "tcp"))
812 		do_test(cfg_family, SOCK_STREAM, 0);
813 	else if (!strcmp(cfg_test, "udp"))
814 		do_test(cfg_family, SOCK_DGRAM, 0);
815 	else if (!strcmp(cfg_test, "rds"))
816 		do_test(PF_RDS, SOCK_SEQPACKET, 0);
817 	else
818 		error(1, 0, "unknown cfg_test %s", cfg_test);
819 
820 	return 0;
821 }
822