// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha2.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       READ_ONCE(msk->pm.accept_subflow);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}

static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	return 0;
}

/* Init mptcp request socket.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent.
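 * A failed MP_CAPABLE setup is not fatal: the capable flag is simply left
 * clear and the connection proceeds as plain TCP.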
 */
static int subflow_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int ret;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	ret = __subflow_init_req(req, sk_listener);
	if (ret)
		return 0;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return 0;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err, retries = 4;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;

	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);

		/* Can't fall back to TCP in this case.
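		 * The peer explicitly asked to join an existing connection,
		 * so an unknown token or a failed local address id lookup can
		 * only be answered with a reset.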
		 */
		if (!subflow_req->msk)
			return -EPERM;

		if (unlikely(req->syncookie)) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}

int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int err;

	err = __subflow_init_req(req, sk_listener);
	if (err)
		return err;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable && mp_opt.mp_join)
		return -EINVAL;

	if (mp_opt.mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		if (mptcp_can_accept_new_subflow(subflow_req->msk))
			subflow_req->mp_join = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb);
	return NULL;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb);
	return NULL;
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	/* must hold: tcp_done() could drop last reference on parent */
	sock_hold(sk);

	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
	    schedule_work(&mptcp_sk(sk)->work))
		return; /* worker will put sk for us */

	sock_put(sk);
}

static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!mp_opt.mp_capable) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!mp_opt.mp_join)
			goto do_reset;

		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
	}
	return;

do_reset:
	mptcp_subflow_reset(sk);
}

struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will be in
	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_ESTABLISHED) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_destroy_common(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}

static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* record the newly created socket as the first msk
			 * subflow, but don't link it yet into conn_list
			 */
			WRITE_ONCE(mptcp_sk(new_msk)->first, child);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
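	/* the child socket has already been disposed of above; answer the
	 * incoming ACK with a reset so the peer tears down the failed
	 * subflow as well
	 */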
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};

static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping covers only past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkts to the receive
			 * queue, that is the only 0len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				if (updated && schedule_work(&msk->work))
					sock_hold((struct sock *)msk);

				return MAPPING_DATA_FIN;
			}
		} else {
			u64 data_fin_seq = mpext->data_seq + data_len - 1;

			/* If mpext->data_seq is a 32-bit value, data_fin_seq
			 * must also be limited to 32 bits.
			 */
			if (!mpext->dsn64)
				data_fin_seq &= GENMASK_ULL(31, 0);

			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
				 data_fin_seq, mpext->dsn64);
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}
	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       u64 limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	u32 incr;

	incr = limit >= skb->len ? skb->len + fin : limit;

	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
		 subflow->map_subflow_seq);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
	tcp_sk(ssk)->copied_seq += incr;
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}

static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (!skb_peek(&ssk->sk_receive_queue))
		subflow->data_avail = 0;
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack) {
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			break;
		} else if (after64(ack_seq, old_ack)) {
			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
			break;
		}

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission
		 */
		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	subflow->data_avail = 0;
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when the skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = __mptcp_space(sk);
	*full_space = tcp_full_space(sk);
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *ssk)
{
	/* we take action in __mptcp_clean_una() */
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = loc->ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->remote_id = remote_id;
	subflow->request_join = 1;
	subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	mptcp_info2sockaddr(remote, &addr);

	mptcp_add_pending_subflow(msk, subflow);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed_unlink;

	return err;

failed_unlink:
	spin_lock_bh(&msk->join_list_lock);
	list_del(&subflow->node);
	spin_unlock_bh(&msk->join_list_lock);

failed:
	subflow->disposable = 1;
	sock_release(sf);
	return err;
}

static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
				*child_skcd = &child->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = parent->sk_memcg;

		mem_cgroup_sk_free(child);
		if (memcg && css_tryget(&memcg->css))
			child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

		cgroup_sk_free(child_skcd);
		*child_skcd = *parent_skcd;
		cgroup_sk_clone(child_skcd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* the newly created socket has to be in the same cgroup as its parent */
	mptcp_attach_cgroup(sk, sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);
		sock_put(sk);
	}

	if (release)
		kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->remote_id = subflow_req->remote_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

void __init mptcp_subflow_init(void)
{
	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}