// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha2.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       READ_ONCE(msk->pm.accept_subflow);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}

static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);
}

/* Init mptcp request socket.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent.
 */
static int subflow_check_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return 0;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err, retries = 4;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;

	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);

		/* Can't fall back to TCP in this case. */
		if (!subflow_req->msk)
			return -EPERM;

		if (unlikely(req->syncookie)) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}

int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int err;

	subflow_init_req(req, sk_listener);
	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable && mp_opt.mp_join)
		return -EINVAL;

	if (mp_opt.mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		if (mptcp_can_accept_new_subflow(subflow_req->msk))
			subflow_req->mp_join = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb);
	return NULL;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb);
	return NULL;
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	/* must hold: tcp_done() could drop last reference on parent */
	sock_hold(sk);

	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
	    schedule_work(&mptcp_sk(sk)->work))
		return; /* worker will put sk for us */

	sock_put(sk);
}

static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!mp_opt.mp_capable) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!mp_opt.mp_join)
			goto do_reset;

		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			goto do_reset;
		}

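		/* the peer's truncated HMAC checked out: compute the full
		 * HMAC to be echoed back in the third ACK of the MP_JOIN
		 * handshake
		 */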
		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
	}
	return;

do_reset:
	mptcp_subflow_reset(sk);
}

struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if the new mptcp socket isn't accepted, it is freed
	 * from the tcp listener socket's request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will be in
	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
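	 * Avoid both by moving the socket to TCP_CLOSE and orphaning it
	 * here before releasing it.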
	 */
	if (sk->sk_state == TCP_ESTABLISHED) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_destroy_common(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}

static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* record the newly created socket as the first msk
			 * subflow, but don't link it yet into conn_list
			 */
			WRITE_ONCE(mptcp_sk(new_msk)->first, child);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};

static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN packets to the
			 * receive queue, those are the only 0-len packets
			 * ever expected here, and we can admit no mapping
			 * only for 0-len packets
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
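				 * The DATA_FIN itself has already been
				 * recorded by the mptcp_update_rcv_data_fin()
				 * call above.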
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				if (updated && schedule_work(&msk->work))
					sock_hold((struct sock *)msk);

				return MAPPING_DATA_FIN;
			}
		} else {
			u64 data_fin_seq = mpext->data_seq + data_len - 1;

			/* If mpext->data_seq is a 32-bit value, data_fin_seq
			 * must also be limited to 32 bits.
			 */
			if (!mpext->dsn64)
				data_fin_seq &= GENMASK_ULL(31, 0);

			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
				 data_fin_seq, mpext->dsn64);
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}
	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb's data is fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       u64 limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	u32 incr;

	incr = limit >= skb->len ? skb->len + fin : limit;

	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
		 subflow->map_subflow_seq);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
	tcp_sk(ssk)->copied_seq += incr;
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}

static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (!skb_peek(&ssk->sk_receive_queue))
		subflow->data_avail = 0;
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack) {
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			break;
		} else if (after64(ack_seq, old_ack)) {
			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
			break;
		}

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission
		 */
		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	subflow->data_avail = 0;
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = __mptcp_space(sk);
	*full_space = tcp_full_space(sk);
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *ssk)
{
	/* we take action in __mptcp_clean_una() */
}

void __mptcp_error_report(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		int err = sock_error(ssk);

		if (!err)
			continue;

		/* only propagate errors on fallen-back sockets or
		 * on MPC connect
		 */
		if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
			continue;

		inet_sk_state_store(sk, inet_sk_state_load(ssk));
		sk->sk_err = -err;

		/* This barrier is coupled with smp_rmb() in mptcp_poll() */
		smp_wmb();
		sk->sk_error_report(sk);
		break;
	}
}

static void subflow_error_report(struct sock *ssk)
{
	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

	mptcp_data_lock(sk);
	if (!sock_owned_by_user(sk))
		__mptcp_error_report(sk);
	else
		set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags);
	mptcp_data_unlock(sk);
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = loc->ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->remote_id = remote_id;
	subflow->request_join = 1;
	subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	mptcp_info2sockaddr(remote, &addr);

	mptcp_add_pending_subflow(msk, subflow);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed_unlink;

	return err;

failed_unlink:
	spin_lock_bh(&msk->join_list_lock);
	list_del(&subflow->node);
	spin_unlock_bh(&msk->join_list_lock);

failed:
	subflow->disposable = 1;
	sock_release(sf);
	return err;
}

static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
				*child_skcd = &child->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = parent->sk_memcg;

		mem_cgroup_sk_free(child);
		if (memcg && css_tryget(&memcg->css))
			child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

		cgroup_sk_free(child_skcd);
		*child_skcd = *parent_skcd;
		cgroup_sk_clone(child_skcd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* the newly created socket has to be in the same cgroup as its parent */
	mptcp_attach_cgroup(sk, sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	ctx->tcp_error_report = sk->sk_error_report;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
	sk->sk_error_report = subflow_error_report;
out:
	return err;
}

static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);
		sock_put(sk);
	}

	if (release)
		kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->tcp_error_report = old_ctx->tcp_error_report;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->remote_id = subflow_req->remote_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

void __init mptcp_subflow_init(void)
{
	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}