1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Check if we can migrate child sockets. 4 * 5 * 1. call listen() for 4 server sockets. 6 * 2. call connect() for 25 client sockets. 7 * 3. call listen() for 1 server socket. (migration target) 8 * 4. update a map to migrate all child sockets 9 * to the last server socket (migrate_map[cookie] = 4) 10 * 5. call shutdown() for first 4 server sockets 11 * and migrate the requests in the accept queue 12 * to the last server socket. 13 * 6. call listen() for the second server socket. 14 * 7. call shutdown() for the last server 15 * and migrate the requests in the accept queue 16 * to the second server socket. 17 * 8. call listen() for the last server. 18 * 9. call shutdown() for the second server 19 * and migrate the requests in the accept queue 20 * to the last server socket. 21 * 10. call accept() for the last server socket. 22 * 23 * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp> 24 */ 25 26 #include <bpf/bpf.h> 27 #include <bpf/libbpf.h> 28 29 #include "test_progs.h" 30 #include "test_migrate_reuseport.skel.h" 31 #include "network_helpers.h" 32 33 #ifndef TCP_FASTOPEN_CONNECT 34 #define TCP_FASTOPEN_CONNECT 30 35 #endif 36 37 #define IFINDEX_LO 1 38 39 #define NR_SERVERS 5 40 #define NR_CLIENTS (NR_SERVERS * 5) 41 #define MIGRATED_TO (NR_SERVERS - 1) 42 43 /* fastopenq->max_qlen and sk->sk_max_ack_backlog */ 44 #define QLEN (NR_CLIENTS * 5) 45 46 #define MSG "Hello World\0" 47 #define MSGLEN 12 48 49 static struct migrate_reuseport_test_case { 50 const char *name; 51 __s64 servers[NR_SERVERS]; 52 __s64 clients[NR_CLIENTS]; 53 struct sockaddr_storage addr; 54 socklen_t addrlen; 55 int family; 56 int state; 57 bool drop_ack; 58 bool expire_synack_timer; 59 bool fastopen; 60 struct bpf_link *link; 61 } test_cases[] = { 62 { 63 .name = "IPv4 TCP_ESTABLISHED inet_csk_listen_stop", 64 .family = AF_INET, 65 .state = BPF_TCP_ESTABLISHED, 66 .drop_ack = false, 67 .expire_synack_timer = false, 68 .fastopen = false, 69 }, 70 { 71 .name = "IPv4 TCP_SYN_RECV inet_csk_listen_stop", 72 .family = AF_INET, 73 .state = BPF_TCP_SYN_RECV, 74 .drop_ack = true, 75 .expire_synack_timer = false, 76 .fastopen = true, 77 }, 78 { 79 .name = "IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler", 80 .family = AF_INET, 81 .state = BPF_TCP_NEW_SYN_RECV, 82 .drop_ack = true, 83 .expire_synack_timer = true, 84 .fastopen = false, 85 }, 86 { 87 .name = "IPv4 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", 88 .family = AF_INET, 89 .state = BPF_TCP_NEW_SYN_RECV, 90 .drop_ack = true, 91 .expire_synack_timer = false, 92 .fastopen = false, 93 }, 94 { 95 .name = "IPv6 TCP_ESTABLISHED inet_csk_listen_stop", 96 .family = AF_INET6, 97 .state = BPF_TCP_ESTABLISHED, 98 .drop_ack = false, 99 .expire_synack_timer = false, 100 .fastopen = false, 101 }, 102 { 103 .name = "IPv6 TCP_SYN_RECV inet_csk_listen_stop", 104 .family = AF_INET6, 105 .state = BPF_TCP_SYN_RECV, 106 .drop_ack = true, 107 .expire_synack_timer = false, 108 .fastopen = true, 109 }, 110 { 111 .name = "IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler", 112 .family = AF_INET6, 113 .state = BPF_TCP_NEW_SYN_RECV, 114 .drop_ack = true, 115 .expire_synack_timer = true, 116 .fastopen = false, 117 }, 118 { 119 .name = "IPv6 TCP_NEW_SYN_RECV inet_csk_complete_hashdance", 120 .family = AF_INET6, 121 .state = BPF_TCP_NEW_SYN_RECV, 122 .drop_ack = true, 123 .expire_synack_timer = false, 124 .fastopen = false, 125 } 126 }; 127 128 static void init_fds(__s64 fds[], int len) 129 { 130 int i; 131 132 for (i = 0; i < len; i++) 133 fds[i] = -1; 134 } 135 136 static void close_fds(__s64 fds[], int len) 137 { 138 int i; 139 140 for (i = 0; i < len; i++) { 141 if (fds[i] != -1) { 142 close(fds[i]); 143 fds[i] = -1; 144 } 145 } 146 } 147 148 static int setup_fastopen(char *buf, int size, int *saved_len, bool restore) 149 { 150 int err = 0, fd, len; 151 152 fd = open("/proc/sys/net/ipv4/tcp_fastopen", O_RDWR); 153 if (!ASSERT_NEQ(fd, -1, "open")) 154 return -1; 155 156 if (restore) { 157 len = write(fd, buf, *saved_len); 158 if (!ASSERT_EQ(len, *saved_len, "write - restore")) 159 err = -1; 160 } else { 161 *saved_len = read(fd, buf, size); 162 if (!ASSERT_GE(*saved_len, 1, "read")) { 163 err = -1; 164 goto close; 165 } 166 167 err = lseek(fd, 0, SEEK_SET); 168 if (!ASSERT_OK(err, "lseek")) 169 goto close; 170 171 /* (TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE | 172 * TFO_CLIENT_NO_COOKIE | TFO_SERVER_COOKIE_NOT_REQD) 173 */ 174 len = write(fd, "519", 3); 175 if (!ASSERT_EQ(len, 3, "write - setup")) 176 err = -1; 177 } 178 179 close: 180 close(fd); 181 182 return err; 183 } 184 185 static int drop_ack(struct migrate_reuseport_test_case *test_case, 186 struct test_migrate_reuseport *skel) 187 { 188 if (test_case->family == AF_INET) 189 skel->bss->server_port = ((struct sockaddr_in *) 190 &test_case->addr)->sin_port; 191 else 192 skel->bss->server_port = ((struct sockaddr_in6 *) 193 &test_case->addr)->sin6_port; 194 195 test_case->link = bpf_program__attach_xdp(skel->progs.drop_ack, 196 IFINDEX_LO); 197 if (!ASSERT_OK_PTR(test_case->link, "bpf_program__attach_xdp")) 198 return -1; 199 200 return 0; 201 } 202 203 static int pass_ack(struct migrate_reuseport_test_case *test_case) 204 { 205 int err; 206 207 err = bpf_link__detach(test_case->link); 208 if (!ASSERT_OK(err, "bpf_link__detach")) 209 return -1; 210 211 test_case->link = NULL; 212 213 return 0; 214 } 215 216 static int start_servers(struct migrate_reuseport_test_case *test_case, 217 struct test_migrate_reuseport *skel) 218 { 219 int i, err, prog_fd, reuseport = 1, qlen = QLEN; 220 221 prog_fd = bpf_program__fd(skel->progs.migrate_reuseport); 222 223 make_sockaddr(test_case->family, 224 test_case->family == AF_INET ? "127.0.0.1" : "::1", 0, 225 &test_case->addr, &test_case->addrlen); 226 227 for (i = 0; i < NR_SERVERS; i++) { 228 test_case->servers[i] = socket(test_case->family, SOCK_STREAM, 229 IPPROTO_TCP); 230 if (!ASSERT_NEQ(test_case->servers[i], -1, "socket")) 231 return -1; 232 233 err = setsockopt(test_case->servers[i], SOL_SOCKET, 234 SO_REUSEPORT, &reuseport, sizeof(reuseport)); 235 if (!ASSERT_OK(err, "setsockopt - SO_REUSEPORT")) 236 return -1; 237 238 err = bind(test_case->servers[i], 239 (struct sockaddr *)&test_case->addr, 240 test_case->addrlen); 241 if (!ASSERT_OK(err, "bind")) 242 return -1; 243 244 if (i == 0) { 245 err = setsockopt(test_case->servers[i], SOL_SOCKET, 246 SO_ATTACH_REUSEPORT_EBPF, 247 &prog_fd, sizeof(prog_fd)); 248 if (!ASSERT_OK(err, 249 "setsockopt - SO_ATTACH_REUSEPORT_EBPF")) 250 return -1; 251 252 err = getsockname(test_case->servers[i], 253 (struct sockaddr *)&test_case->addr, 254 &test_case->addrlen); 255 if (!ASSERT_OK(err, "getsockname")) 256 return -1; 257 } 258 259 if (test_case->fastopen) { 260 err = setsockopt(test_case->servers[i], 261 SOL_TCP, TCP_FASTOPEN, 262 &qlen, sizeof(qlen)); 263 if (!ASSERT_OK(err, "setsockopt - TCP_FASTOPEN")) 264 return -1; 265 } 266 267 /* All requests will be tied to the first four listeners */ 268 if (i != MIGRATED_TO) { 269 err = listen(test_case->servers[i], qlen); 270 if (!ASSERT_OK(err, "listen")) 271 return -1; 272 } 273 } 274 275 return 0; 276 } 277 278 static int start_clients(struct migrate_reuseport_test_case *test_case) 279 { 280 char buf[MSGLEN] = MSG; 281 int i, err; 282 283 for (i = 0; i < NR_CLIENTS; i++) { 284 test_case->clients[i] = socket(test_case->family, SOCK_STREAM, 285 IPPROTO_TCP); 286 if (!ASSERT_NEQ(test_case->clients[i], -1, "socket")) 287 return -1; 288 289 /* The attached XDP program drops only the final ACK, so 290 * clients will transition to TCP_ESTABLISHED immediately. 291 */ 292 err = settimeo(test_case->clients[i], 100); 293 if (!ASSERT_OK(err, "settimeo")) 294 return -1; 295 296 if (test_case->fastopen) { 297 int fastopen = 1; 298 299 err = setsockopt(test_case->clients[i], IPPROTO_TCP, 300 TCP_FASTOPEN_CONNECT, &fastopen, 301 sizeof(fastopen)); 302 if (!ASSERT_OK(err, 303 "setsockopt - TCP_FASTOPEN_CONNECT")) 304 return -1; 305 } 306 307 err = connect(test_case->clients[i], 308 (struct sockaddr *)&test_case->addr, 309 test_case->addrlen); 310 if (!ASSERT_OK(err, "connect")) 311 return -1; 312 313 err = write(test_case->clients[i], buf, MSGLEN); 314 if (!ASSERT_EQ(err, MSGLEN, "write")) 315 return -1; 316 } 317 318 return 0; 319 } 320 321 static int update_maps(struct migrate_reuseport_test_case *test_case, 322 struct test_migrate_reuseport *skel) 323 { 324 int i, err, migrated_to = MIGRATED_TO; 325 int reuseport_map_fd, migrate_map_fd; 326 __u64 value; 327 328 reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map); 329 migrate_map_fd = bpf_map__fd(skel->maps.migrate_map); 330 331 for (i = 0; i < NR_SERVERS; i++) { 332 value = (__u64)test_case->servers[i]; 333 err = bpf_map_update_elem(reuseport_map_fd, &i, &value, 334 BPF_NOEXIST); 335 if (!ASSERT_OK(err, "bpf_map_update_elem - reuseport_map")) 336 return -1; 337 338 err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value); 339 if (!ASSERT_OK(err, "bpf_map_lookup_elem - reuseport_map")) 340 return -1; 341 342 err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to, 343 BPF_NOEXIST); 344 if (!ASSERT_OK(err, "bpf_map_update_elem - migrate_map")) 345 return -1; 346 } 347 348 return 0; 349 } 350 351 static int migrate_dance(struct migrate_reuseport_test_case *test_case) 352 { 353 int i, err; 354 355 /* Migrate TCP_ESTABLISHED and TCP_SYN_RECV requests 356 * to the last listener based on eBPF. 357 */ 358 for (i = 0; i < MIGRATED_TO; i++) { 359 err = shutdown(test_case->servers[i], SHUT_RDWR); 360 if (!ASSERT_OK(err, "shutdown")) 361 return -1; 362 } 363 364 /* No dance for TCP_NEW_SYN_RECV to migrate based on eBPF */ 365 if (test_case->state == BPF_TCP_NEW_SYN_RECV) 366 return 0; 367 368 /* Note that we use the second listener instead of the 369 * first one here. 370 * 371 * The fist listener is bind()ed with port 0 and, 372 * SOCK_BINDPORT_LOCK is not set to sk_userlocks, so 373 * calling listen() again will bind() the first listener 374 * on a new ephemeral port and detach it from the existing 375 * reuseport group. (See: __inet_bind(), tcp_set_state()) 376 * 377 * OTOH, the second one is bind()ed with a specific port, 378 * and SOCK_BINDPORT_LOCK is set. Thus, re-listen() will 379 * resurrect the listener on the existing reuseport group. 380 */ 381 err = listen(test_case->servers[1], QLEN); 382 if (!ASSERT_OK(err, "listen")) 383 return -1; 384 385 /* Migrate from the last listener to the second one. 386 * 387 * All listeners were detached out of the reuseport_map, 388 * so migration will be done by kernel random pick from here. 389 */ 390 err = shutdown(test_case->servers[MIGRATED_TO], SHUT_RDWR); 391 if (!ASSERT_OK(err, "shutdown")) 392 return -1; 393 394 /* Back to the existing reuseport group */ 395 err = listen(test_case->servers[MIGRATED_TO], QLEN); 396 if (!ASSERT_OK(err, "listen")) 397 return -1; 398 399 /* Migrate back to the last one from the second one */ 400 err = shutdown(test_case->servers[1], SHUT_RDWR); 401 if (!ASSERT_OK(err, "shutdown")) 402 return -1; 403 404 return 0; 405 } 406 407 static void count_requests(struct migrate_reuseport_test_case *test_case, 408 struct test_migrate_reuseport *skel) 409 { 410 struct sockaddr_storage addr; 411 socklen_t len = sizeof(addr); 412 int err, cnt = 0, client; 413 char buf[MSGLEN]; 414 415 err = settimeo(test_case->servers[MIGRATED_TO], 4000); 416 if (!ASSERT_OK(err, "settimeo")) 417 goto out; 418 419 for (; cnt < NR_CLIENTS; cnt++) { 420 client = accept(test_case->servers[MIGRATED_TO], 421 (struct sockaddr *)&addr, &len); 422 if (!ASSERT_NEQ(client, -1, "accept")) 423 goto out; 424 425 memset(buf, 0, MSGLEN); 426 read(client, &buf, MSGLEN); 427 close(client); 428 429 if (!ASSERT_STREQ(buf, MSG, "read")) 430 goto out; 431 } 432 433 out: 434 ASSERT_EQ(cnt, NR_CLIENTS, "count in userspace"); 435 436 switch (test_case->state) { 437 case BPF_TCP_ESTABLISHED: 438 cnt = skel->bss->migrated_at_close; 439 break; 440 case BPF_TCP_SYN_RECV: 441 cnt = skel->bss->migrated_at_close_fastopen; 442 break; 443 case BPF_TCP_NEW_SYN_RECV: 444 if (test_case->expire_synack_timer) 445 cnt = skel->bss->migrated_at_send_synack; 446 else 447 cnt = skel->bss->migrated_at_recv_ack; 448 break; 449 default: 450 cnt = 0; 451 } 452 453 ASSERT_EQ(cnt, NR_CLIENTS, "count in BPF prog"); 454 } 455 456 static void run_test(struct migrate_reuseport_test_case *test_case, 457 struct test_migrate_reuseport *skel) 458 { 459 int err, saved_len; 460 char buf[16]; 461 462 skel->bss->migrated_at_close = 0; 463 skel->bss->migrated_at_close_fastopen = 0; 464 skel->bss->migrated_at_send_synack = 0; 465 skel->bss->migrated_at_recv_ack = 0; 466 467 init_fds(test_case->servers, NR_SERVERS); 468 init_fds(test_case->clients, NR_CLIENTS); 469 470 if (test_case->fastopen) { 471 memset(buf, 0, sizeof(buf)); 472 473 err = setup_fastopen(buf, sizeof(buf), &saved_len, false); 474 if (!ASSERT_OK(err, "setup_fastopen - setup")) 475 return; 476 } 477 478 err = start_servers(test_case, skel); 479 if (!ASSERT_OK(err, "start_servers")) 480 goto close_servers; 481 482 if (test_case->drop_ack) { 483 /* Drop the final ACK of the 3-way handshake and stick the 484 * in-flight requests on TCP_SYN_RECV or TCP_NEW_SYN_RECV. 485 */ 486 err = drop_ack(test_case, skel); 487 if (!ASSERT_OK(err, "drop_ack")) 488 goto close_servers; 489 } 490 491 /* Tie requests to the first four listners */ 492 err = start_clients(test_case); 493 if (!ASSERT_OK(err, "start_clients")) 494 goto close_clients; 495 496 err = listen(test_case->servers[MIGRATED_TO], QLEN); 497 if (!ASSERT_OK(err, "listen")) 498 goto close_clients; 499 500 err = update_maps(test_case, skel); 501 if (!ASSERT_OK(err, "fill_maps")) 502 goto close_clients; 503 504 /* Migrate the requests in the accept queue only. 505 * TCP_NEW_SYN_RECV requests are not migrated at this point. 506 */ 507 err = migrate_dance(test_case); 508 if (!ASSERT_OK(err, "migrate_dance")) 509 goto close_clients; 510 511 if (test_case->expire_synack_timer) { 512 /* Wait for SYN+ACK timers to expire so that 513 * reqsk_timer_handler() migrates TCP_NEW_SYN_RECV requests. 514 */ 515 sleep(1); 516 } 517 518 if (test_case->link) { 519 /* Resume 3WHS and migrate TCP_NEW_SYN_RECV requests */ 520 err = pass_ack(test_case); 521 if (!ASSERT_OK(err, "pass_ack")) 522 goto close_clients; 523 } 524 525 count_requests(test_case, skel); 526 527 close_clients: 528 close_fds(test_case->clients, NR_CLIENTS); 529 530 if (test_case->link) { 531 err = pass_ack(test_case); 532 ASSERT_OK(err, "pass_ack - clean up"); 533 } 534 535 close_servers: 536 close_fds(test_case->servers, NR_SERVERS); 537 538 if (test_case->fastopen) { 539 err = setup_fastopen(buf, sizeof(buf), &saved_len, true); 540 ASSERT_OK(err, "setup_fastopen - restore"); 541 } 542 } 543 544 void test_migrate_reuseport(void) 545 { 546 struct test_migrate_reuseport *skel; 547 int i; 548 549 skel = test_migrate_reuseport__open_and_load(); 550 if (!ASSERT_OK_PTR(skel, "open_and_load")) 551 return; 552 553 for (i = 0; i < ARRAY_SIZE(test_cases); i++) { 554 test__start_subtest(test_cases[i].name); 555 run_test(&test_cases[i], skel); 556 } 557 558 test_migrate_reuseport__destroy(skel); 559 } 560