1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Check if we can migrate child sockets.
4  *
5  *   1. call listen() for 4 server sockets.
6  *   2. call connect() for 25 client sockets.
7  *   3. call listen() for 1 server socket. (migration target)
8  *   4. update a map to migrate all child sockets
9  *        to the last server socket (migrate_map[cookie] = 4)
10  *   5. call shutdown() for first 4 server sockets
11  *        and migrate the requests in the accept queue
12  *        to the last server socket.
13  *   6. call listen() for the second server socket.
14  *   7. call shutdown() for the last server
15  *        and migrate the requests in the accept queue
16  *        to the second server socket.
17  *   8. call listen() for the last server.
18  *   9. call shutdown() for the second server
19  *        and migrate the requests in the accept queue
20  *        to the last server socket.
21  *  10. call accept() for the last server socket.
22  *
23  * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
24  */
25 
26 #include <bpf/bpf.h>
27 #include <bpf/libbpf.h>
28 
29 #include "test_progs.h"
30 #include "test_migrate_reuseport.skel.h"
31 #include "network_helpers.h"
32 
/* Fallback definition for build environments whose headers do not
 * provide TCP_FASTOPEN_CONNECT.
 */
#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30
#endif

/* Interface index passed to bpf_program__attach_xdp() by drop_ack().
 * NOTE(review): presumably the loopback device - confirm on the target.
 */
#define IFINDEX_LO 1

/* NR_SERVERS listeners are created; the one at index MIGRATED_TO (the
 * last) starts listening only after the clients have connected and is
 * the socket count_requests() accepts from.
 */
#define NR_SERVERS 5
#define NR_CLIENTS (NR_SERVERS * 5)
#define MIGRATED_TO (NR_SERVERS - 1)

/* fastopenq->max_qlen and sk->sk_max_ack_backlog */
#define QLEN (NR_CLIENTS * 5)

/* Payload written by every client and compared by count_requests().
 * MSGLEN counts the explicit "\0" inside MSG, so the terminator is
 * part of the MSGLEN bytes that are sent.
 */
#define MSG "Hello World\0"
#define MSGLEN 12
48 
49 static struct migrate_reuseport_test_case {
50 	const char *name;
51 	__s64 servers[NR_SERVERS];
52 	__s64 clients[NR_CLIENTS];
53 	struct sockaddr_storage addr;
54 	socklen_t addrlen;
55 	int family;
56 	int state;
57 	bool drop_ack;
58 	bool expire_synack_timer;
59 	bool fastopen;
60 	struct bpf_link *link;
61 } test_cases[] = {
62 	{
63 		.name = "IPv4 TCP_ESTABLISHED  inet_csk_listen_stop",
64 		.family = AF_INET,
65 		.state = BPF_TCP_ESTABLISHED,
66 		.drop_ack = false,
67 		.expire_synack_timer = false,
68 		.fastopen = false,
69 	},
70 	{
71 		.name = "IPv4 TCP_SYN_RECV     inet_csk_listen_stop",
72 		.family = AF_INET,
73 		.state = BPF_TCP_SYN_RECV,
74 		.drop_ack = true,
75 		.expire_synack_timer = false,
76 		.fastopen = true,
77 	},
78 	{
79 		.name = "IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler",
80 		.family = AF_INET,
81 		.state = BPF_TCP_NEW_SYN_RECV,
82 		.drop_ack = true,
83 		.expire_synack_timer = true,
84 		.fastopen = false,
85 	},
86 	{
87 		.name = "IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance",
88 		.family = AF_INET,
89 		.state = BPF_TCP_NEW_SYN_RECV,
90 		.drop_ack = true,
91 		.expire_synack_timer = false,
92 		.fastopen = false,
93 	},
94 	{
95 		.name = "IPv6 TCP_ESTABLISHED  inet_csk_listen_stop",
96 		.family = AF_INET6,
97 		.state = BPF_TCP_ESTABLISHED,
98 		.drop_ack = false,
99 		.expire_synack_timer = false,
100 		.fastopen = false,
101 	},
102 	{
103 		.name = "IPv6 TCP_SYN_RECV     inet_csk_listen_stop",
104 		.family = AF_INET6,
105 		.state = BPF_TCP_SYN_RECV,
106 		.drop_ack = true,
107 		.expire_synack_timer = false,
108 		.fastopen = true,
109 	},
110 	{
111 		.name = "IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler",
112 		.family = AF_INET6,
113 		.state = BPF_TCP_NEW_SYN_RECV,
114 		.drop_ack = true,
115 		.expire_synack_timer = true,
116 		.fastopen = false,
117 	},
118 	{
119 		.name = "IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance",
120 		.family = AF_INET6,
121 		.state = BPF_TCP_NEW_SYN_RECV,
122 		.drop_ack = true,
123 		.expire_synack_timer = false,
124 		.fastopen = false,
125 	}
126 };
127 
/* Mark the first @len slots of @fds as unused (-1). */
static void init_fds(__s64 fds[], int len)
{
	int idx = len;

	while (idx > 0)
		fds[--idx] = -1;
}
135 
/* Close every slot of @fds that is in use (not -1) and mark it unused. */
static void close_fds(__s64 fds[], int len)
{
	int idx = 0;

	while (idx < len) {
		__s64 *slot = &fds[idx++];

		if (*slot == -1)
			continue;

		close(*slot);
		*slot = -1;
	}
}
147 
/* Save-and-set or restore the net.ipv4.tcp_fastopen sysctl.
 *
 * @buf:       storage for the previous sysctl contents
 * @size:      capacity of @buf (setup mode only)
 * @saved_len: number of bytes held in @buf; written in setup mode,
 *             read in restore mode
 * @restore:   false - remember the current value, then write "519";
 *             true  - write the remembered value back
 *
 * Returns 0 on success and -1 on failure.
 */
static int setup_fastopen(char *buf, int size, int *saved_len, bool restore)
{
	int fd, written, ret = 0;

	fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR);
	if (!ASSERT_NEQ(fd, -1, "open"))
		return -1;

	if (restore) {
		written = write(fd, buf, *saved_len);
		if (!ASSERT_EQ(written, *saved_len, "write - restore"))
			ret = -1;

		goto out;
	}

	*saved_len = read(fd, buf, size);
	if (!ASSERT_GE(*saved_len, 1, "read")) {
		ret = -1;
		goto out;
	}

	/* Rewind so that the new value is written at offset 0 */
	ret = lseek(fd, 0, SEEK_SET);
	if (!ASSERT_OK(ret, "lseek"))
		goto out;

	/* (TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE |
	 *  TFO_CLIENT_NO_COOKIE | TFO_SERVER_COOKIE_NOT_REQD)
	 */
	written = write(fd, "519", 3);
	if (!ASSERT_EQ(written, 3, "write - setup"))
		ret = -1;

out:
	close(fd);

	return ret;
}
184 
185 static int drop_ack(struct migrate_reuseport_test_case *test_case,
186 		    struct test_migrate_reuseport *skel)
187 {
188 	if (test_case->family == AF_INET)
189 		skel->bss->server_port = ((struct sockaddr_in *)
190 					  &test_case->addr)->sin_port;
191 	else
192 		skel->bss->server_port = ((struct sockaddr_in6 *)
193 					  &test_case->addr)->sin6_port;
194 
195 	test_case->link = bpf_program__attach_xdp(skel->progs.drop_ack,
196 						  IFINDEX_LO);
197 	if (!ASSERT_OK_PTR(test_case->link, "bpf_program__attach_xdp"))
198 		return -1;
199 
200 	return 0;
201 }
202 
203 static int pass_ack(struct migrate_reuseport_test_case *test_case)
204 {
205 	int err;
206 
207 	err = bpf_link__destroy(test_case->link);
208 	if (!ASSERT_OK(err, "bpf_link__destroy"))
209 		return -1;
210 
211 	test_case->link = NULL;
212 
213 	return 0;
214 }
215 
216 static int start_servers(struct migrate_reuseport_test_case *test_case,
217 			 struct test_migrate_reuseport *skel)
218 {
219 	int i, err, prog_fd, reuseport = 1, qlen = QLEN;
220 
221 	prog_fd = bpf_program__fd(skel->progs.migrate_reuseport);
222 
223 	make_sockaddr(test_case->family,
224 		      test_case->family == AF_INET ? "127.0.0.1" : "::1", 0,
225 		      &test_case->addr, &test_case->addrlen);
226 
227 	for (i = 0; i < NR_SERVERS; i++) {
228 		test_case->servers[i] = socket(test_case->family, SOCK_STREAM,
229 					       IPPROTO_TCP);
230 		if (!ASSERT_NEQ(test_case->servers[i], -1, "socket"))
231 			return -1;
232 
233 		err = setsockopt(test_case->servers[i], SOL_SOCKET,
234 				 SO_REUSEPORT, &reuseport, sizeof(reuseport));
235 		if (!ASSERT_OK(err, "setsockopt - SO_REUSEPORT"))
236 			return -1;
237 
238 		err = bind(test_case->servers[i],
239 			   (struct sockaddr *)&test_case->addr,
240 			   test_case->addrlen);
241 		if (!ASSERT_OK(err, "bind"))
242 			return -1;
243 
244 		if (i == 0) {
245 			err = setsockopt(test_case->servers[i], SOL_SOCKET,
246 					 SO_ATTACH_REUSEPORT_EBPF,
247 					 &prog_fd, sizeof(prog_fd));
248 			if (!ASSERT_OK(err,
249 				       "setsockopt - SO_ATTACH_REUSEPORT_EBPF"))
250 				return -1;
251 
252 			err = getsockname(test_case->servers[i],
253 					  (struct sockaddr *)&test_case->addr,
254 					  &test_case->addrlen);
255 			if (!ASSERT_OK(err, "getsockname"))
256 				return -1;
257 		}
258 
259 		if (test_case->fastopen) {
260 			err = setsockopt(test_case->servers[i],
261 					 SOL_TCP, TCP_FASTOPEN,
262 					 &qlen, sizeof(qlen));
263 			if (!ASSERT_OK(err, "setsockopt - TCP_FASTOPEN"))
264 				return -1;
265 		}
266 
267 		/* All requests will be tied to the first four listeners */
268 		if (i != MIGRATED_TO) {
269 			err = listen(test_case->servers[i], qlen);
270 			if (!ASSERT_OK(err, "listen"))
271 				return -1;
272 		}
273 	}
274 
275 	return 0;
276 }
277 
278 static int start_clients(struct migrate_reuseport_test_case *test_case)
279 {
280 	char buf[MSGLEN] = MSG;
281 	int i, err;
282 
283 	for (i = 0; i < NR_CLIENTS; i++) {
284 		test_case->clients[i] = socket(test_case->family, SOCK_STREAM,
285 					       IPPROTO_TCP);
286 		if (!ASSERT_NEQ(test_case->clients[i], -1, "socket"))
287 			return -1;
288 
289 		/* The attached XDP program drops only the final ACK, so
290 		 * clients will transition to TCP_ESTABLISHED immediately.
291 		 */
292 		err = settimeo(test_case->clients[i], 100);
293 		if (!ASSERT_OK(err, "settimeo"))
294 			return -1;
295 
296 		if (test_case->fastopen) {
297 			int fastopen = 1;
298 
299 			err = setsockopt(test_case->clients[i], IPPROTO_TCP,
300 					 TCP_FASTOPEN_CONNECT, &fastopen,
301 					 sizeof(fastopen));
302 			if (!ASSERT_OK(err,
303 				       "setsockopt - TCP_FASTOPEN_CONNECT"))
304 				return -1;
305 		}
306 
307 		err = connect(test_case->clients[i],
308 			      (struct sockaddr *)&test_case->addr,
309 			      test_case->addrlen);
310 		if (!ASSERT_OK(err, "connect"))
311 			return -1;
312 
313 		err = write(test_case->clients[i], buf, MSGLEN);
314 		if (!ASSERT_EQ(err, MSGLEN, "write"))
315 			return -1;
316 	}
317 
318 	return 0;
319 }
320 
321 static int update_maps(struct migrate_reuseport_test_case *test_case,
322 		       struct test_migrate_reuseport *skel)
323 {
324 	int i, err, migrated_to = MIGRATED_TO;
325 	int reuseport_map_fd, migrate_map_fd;
326 	__u64 value;
327 
328 	reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map);
329 	migrate_map_fd = bpf_map__fd(skel->maps.migrate_map);
330 
331 	for (i = 0; i < NR_SERVERS; i++) {
332 		value = (__u64)test_case->servers[i];
333 		err = bpf_map_update_elem(reuseport_map_fd, &i, &value,
334 					  BPF_NOEXIST);
335 		if (!ASSERT_OK(err, "bpf_map_update_elem - reuseport_map"))
336 			return -1;
337 
338 		err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value);
339 		if (!ASSERT_OK(err, "bpf_map_lookup_elem - reuseport_map"))
340 			return -1;
341 
342 		err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to,
343 					  BPF_NOEXIST);
344 		if (!ASSERT_OK(err, "bpf_map_update_elem - migrate_map"))
345 			return -1;
346 	}
347 
348 	return 0;
349 }
350 
351 static int migrate_dance(struct migrate_reuseport_test_case *test_case)
352 {
353 	int i, err;
354 
355 	/* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests
356 	 * to the last listener based on eBPF.
357 	 */
358 	for (i = 0; i < MIGRATED_TO; i++) {
359 		err = shutdown(test_case->servers[i], SHUT_RDWR);
360 		if (!ASSERT_OK(err, "shutdown"))
361 			return -1;
362 	}
363 
364 	/* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */
365 	if (test_case->state == BPF_TCP_NEW_SYN_RECV)
366 		return 0;
367 
368 	/* Note that we use the second listener instead of the
369 	 * first one here.
370 	 *
371 	 * The fist listener is bind()ed with port 0 and,
372 	 * SOCK_BINDPORT_LOCK is not set to sk_userlocks, so
373 	 * calling listen() again will bind() the first listener
374 	 * on a new ephemeral port and detach it from the existing
375 	 * reuseport group.  (See: __inet_bind(), tcp_set_state())
376 	 *
377 	 * OTOH, the second one is bind()ed with a specific port,
378 	 * and SOCK_BINDPORT_LOCK is set. Thus, re-listen() will
379 	 * resurrect the listener on the existing reuseport group.
380 	 */
381 	err = listen(test_case->servers[1], QLEN);
382 	if (!ASSERT_OK(err, "listen"))
383 		return -1;
384 
385 	/* Migrate from the last listener to the second one.
386 	 *
387 	 * All listeners were detached out of the reuseport_map,
388 	 * so migration will be done by kernel random pick from here.
389 	 */
390 	err = shutdown(test_case->servers[MIGRATED_TO], SHUT_RDWR);
391 	if (!ASSERT_OK(err, "shutdown"))
392 		return -1;
393 
394 	/* Back to the existing reuseport group */
395 	err = listen(test_case->servers[MIGRATED_TO], QLEN);
396 	if (!ASSERT_OK(err, "listen"))
397 		return -1;
398 
399 	/* Migrate back to the last one from the second one */
400 	err = shutdown(test_case->servers[1], SHUT_RDWR);
401 	if (!ASSERT_OK(err, "shutdown"))
402 		return -1;
403 
404 	return 0;
405 }
406 
407 static void count_requests(struct migrate_reuseport_test_case *test_case,
408 			   struct test_migrate_reuseport *skel)
409 {
410 	struct sockaddr_storage addr;
411 	socklen_t len = sizeof(addr);
412 	int err, cnt = 0, client;
413 	char buf[MSGLEN];
414 
415 	err = settimeo(test_case->servers[MIGRATED_TO], 4000);
416 	if (!ASSERT_OK(err, "settimeo"))
417 		goto out;
418 
419 	for (; cnt < NR_CLIENTS; cnt++) {
420 		client = accept(test_case->servers[MIGRATED_TO],
421 				(struct sockaddr *)&addr, &len);
422 		if (!ASSERT_NEQ(client, -1, "accept"))
423 			goto out;
424 
425 		memset(buf, 0, MSGLEN);
426 		read(client, &buf, MSGLEN);
427 		close(client);
428 
429 		if (!ASSERT_STREQ(buf, MSG, "read"))
430 			goto out;
431 	}
432 
433 out:
434 	ASSERT_EQ(cnt, NR_CLIENTS, "count in userspace");
435 
436 	switch (test_case->state) {
437 	case BPF_TCP_ESTABLISHED:
438 		cnt = skel->bss->migrated_at_close;
439 		break;
440 	case BPF_TCP_SYN_RECV:
441 		cnt = skel->bss->migrated_at_close_fastopen;
442 		break;
443 	case BPF_TCP_NEW_SYN_RECV:
444 		if (test_case->expire_synack_timer)
445 			cnt = skel->bss->migrated_at_send_synack;
446 		else
447 			cnt = skel->bss->migrated_at_recv_ack;
448 		break;
449 	default:
450 		cnt = 0;
451 	}
452 
453 	ASSERT_EQ(cnt, NR_CLIENTS, "count in BPF prog");
454 }
455 
456 static void run_test(struct migrate_reuseport_test_case *test_case,
457 		     struct test_migrate_reuseport *skel)
458 {
459 	int err, saved_len;
460 	char buf[16];
461 
462 	skel->bss->migrated_at_close = 0;
463 	skel->bss->migrated_at_close_fastopen = 0;
464 	skel->bss->migrated_at_send_synack = 0;
465 	skel->bss->migrated_at_recv_ack = 0;
466 
467 	init_fds(test_case->servers, NR_SERVERS);
468 	init_fds(test_case->clients, NR_CLIENTS);
469 
470 	if (test_case->fastopen) {
471 		memset(buf, 0, sizeof(buf));
472 
473 		err = setup_fastopen(buf, sizeof(buf), &saved_len, false);
474 		if (!ASSERT_OK(err, "setup_fastopen - setup"))
475 			return;
476 	}
477 
478 	err = start_servers(test_case, skel);
479 	if (!ASSERT_OK(err, "start_servers"))
480 		goto close_servers;
481 
482 	if (test_case->drop_ack) {
483 		/* Drop the final ACK of the 3-way handshake and stick the
484 		 * in-flight requests on TCP_SYN_RECV or TCP_NEW_SYN_RECV.
485 		 */
486 		err = drop_ack(test_case, skel);
487 		if (!ASSERT_OK(err, "drop_ack"))
488 			goto close_servers;
489 	}
490 
491 	/* Tie requests to the first four listners */
492 	err = start_clients(test_case);
493 	if (!ASSERT_OK(err, "start_clients"))
494 		goto close_clients;
495 
496 	err = listen(test_case->servers[MIGRATED_TO], QLEN);
497 	if (!ASSERT_OK(err, "listen"))
498 		goto close_clients;
499 
500 	err = update_maps(test_case, skel);
501 	if (!ASSERT_OK(err, "fill_maps"))
502 		goto close_clients;
503 
504 	/* Migrate the requests in the accept queue only.
505 	 * TCP_NEW_SYN_RECV requests are not migrated at this point.
506 	 */
507 	err = migrate_dance(test_case);
508 	if (!ASSERT_OK(err, "migrate_dance"))
509 		goto close_clients;
510 
511 	if (test_case->expire_synack_timer) {
512 		/* Wait for SYN+ACK timers to expire so that
513 		 * reqsk_timer_handler() migrates TCP_NEW_SYN_RECV requests.
514 		 */
515 		sleep(1);
516 	}
517 
518 	if (test_case->link) {
519 		/* Resume 3WHS and migrate TCP_NEW_SYN_RECV requests */
520 		err = pass_ack(test_case);
521 		if (!ASSERT_OK(err, "pass_ack"))
522 			goto close_clients;
523 	}
524 
525 	count_requests(test_case, skel);
526 
527 close_clients:
528 	close_fds(test_case->clients, NR_CLIENTS);
529 
530 	if (test_case->link) {
531 		err = pass_ack(test_case);
532 		ASSERT_OK(err, "pass_ack - clean up");
533 	}
534 
535 close_servers:
536 	close_fds(test_case->servers, NR_SERVERS);
537 
538 	if (test_case->fastopen) {
539 		err = setup_fastopen(buf, sizeof(buf), &saved_len, true);
540 		ASSERT_OK(err, "setup_fastopen - restore");
541 	}
542 }
543 
544 void serial_test_migrate_reuseport(void)
545 {
546 	struct test_migrate_reuseport *skel;
547 	int i;
548 
549 	skel = test_migrate_reuseport__open_and_load();
550 	if (!ASSERT_OK_PTR(skel, "open_and_load"))
551 		return;
552 
553 	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
554 		test__start_subtest(test_cases[i].name);
555 		run_test(&test_cases[i], skel);
556 	}
557 
558 	test_migrate_reuseport__destroy(skel);
559 }
560