// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

#define MPTCP_SAME_STATE TCP_MAX_STATES

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

struct mptcp_skb_cb {
	u32 offset;
};

#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))

static struct percpu_counter mptcp_sockets_allocated;

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
	if (!msk->subflow || READ_ONCE(msk->can_ack))
		return NULL;

	return msk->subflow;
}

static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
{
	return msk->first && !sk_is_mptcp(msk->first);
}

static struct socket *mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (sock->sk != sk)
		return NULL;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return sock;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		sock->ops = &inet6_stream_ops;
		return sock;
#endif
	}

	return NULL;
}

static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	struct socket *sock;

	sock_owned_by_me((const struct sock *)msk);

	sock = mptcp_is_tcpsk((struct sock *)msk);
	if (unlikely(sock))
		return sock;

	if (likely(!__mptcp_needs_tcp_fallback(msk)))
		return NULL;

	return msk->subflow;
}

static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
{
	return !msk->first;
}

static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	ssock = __mptcp_tcp_fallback(msk);
	if (unlikely(ssock))
		return ssock;

	ssock = __mptcp_nmpc_socket(msk);
	if (ssock)
		goto set_state;

	if (!__mptcp_can_create_subflow(msk))
		return ERR_PTR(-EINVAL);

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return ERR_PTR(err);

	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	subflow->request_mptcp = 1;

set_state:
	if (state != MPTCP_SAME_STATE)
		inet_sk_state_store(sk, state);
	return ssock;
}

static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb,
			     unsigned int offset, size_t copy_len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_orphan(skb);
	msk->ack_seq += copy_len;

	tail = skb_peek_tail(&sk->sk_receive_queue);
	if (offset == 0 && tail) {
		bool fragstolen;
		int delta;

		if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
			kfree_skb_partial(skb, fragstolen);
			atomic_add(delta, &sk->sk_rmem_alloc);
			sk_mem_charge(sk, delta);
			return;
		}
	}

	skb_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	MPTCP_SKB_CB(skb)->offset = offset;
}

/* both sockets must be locked */
static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
				    struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);

	/* revalidate data sequence number.
	 *
	 * mptcp_subflow_data_available() is usually called
	 * without msk lock. It's unlikely (but possible)
	 * that msk->ack_seq has been advanced since the last
	 * call found in-sequence data.
	 */
	if (likely(dsn == msk->ack_seq))
		return true;

	subflow->data_avail = 0;
	return mptcp_subflow_data_available(ssk);
}

static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	bool done = false;

	if (!mptcp_subflow_dsn_valid(msk, ssk)) {
		*bytes = 0;
		return false;
	}

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);

		if (rcvbuf > sk->sk_rcvbuf)
			sk->sk_rcvbuf = rcvbuf;
	}

	tp = tcp_sk(ssk);
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb)
			break;

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			__mptcp_move_skb(msk, ssk, skb, offset, len);
			seq += len;
			moved += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
			done = true;
			break;
		}
	} while (more_data_avail);

	*bytes = moved;

	return done;
}

/* In most cases we will be able to lock the mptcp socket. If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	if (READ_ONCE(sk->sk_lock.owned))
		return false;

	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
		return false;

	/* must re-check after taking the lock */
	if (!READ_ONCE(sk->sk_lock.owned))
		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);

	spin_unlock_bh(&sk->sk_lock.slock);

	return moved > 0;
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	set_bit(MPTCP_DATA_READY, &msk->flags);

	if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
	    move_skbs_to_msk(msk, ssk))
		goto wake;

	/* don't schedule if mptcp sk is (still) over limit */
	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
		goto wake;

	/* mptcp socket is owned, release_cb should retry */
	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
			      &sk->sk_tsq_flags)) {
		sock_hold(sk);

		/* need to try again, it's possible release_cb() has already
		 * been called after the test_and_set_bit() above.
		 */
		move_skbs_to_msk(msk, ssk);
	}
wake:
	sk->sk_data_ready(sk);
}

static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}

static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
	long tout = ssk && inet_csk(ssk)->icsk_pending ?
		    inet_csk(ssk)->icsk_timeout - jiffies : 0;

	if (tout <= 0)
		tout = mptcp_sk(sk)->timer_ival;
	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}

static bool mptcp_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;

	/* should never be called with mptcp level timer cleared */
	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
	if (WARN_ON_ONCE(!tout))
		tout = TCP_RTO_MIN;
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}

void mptcp_data_acked(struct sock *sk)
{
	mptcp_reset_timer(sk);

	if (!sk_stream_is_writeable(sk) &&
	    schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
}

void mptcp_subflow_eof(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
	    schedule_work(&msk->work))
		sock_hold(sk);
}

static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int receivers = 0;

	mptcp_for_each_subflow(msk, subflow)
		receivers += !subflow->rx_eof;

	if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* hopefully temporary hack: propagate shutdown status
		 * to msk, when all subflows agree on it
		 */
		sk->sk_shutdown |= RCV_SHUTDOWN;

		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);
		sk->sk_data_ready(sk);
	}
}

static void mptcp_stop_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	mptcp_sk(sk)->timer_ival = 0;
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
	const struct sock *sk = (const struct sock *)msk;

	if (!msk->cached_ext)
		msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);

	return !!msk->cached_ext;
}

static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;

	sock_owned_by_me(sk);

	mptcp_for_each_subflow(msk, subflow) {
		if (subflow->data_avail)
			return mptcp_subflow_tcp_sock(subflow);
	}

	return NULL;
}

static bool mptcp_skb_can_collapse_to(u64 write_seq,
				      const struct sk_buff *skb,
				      const struct mptcp_ext *mpext)
{
	if (!tcp_skb_can_collapse_to(skb))
		return false;

	/* can collapse only if MPTCP level sequence is in order */
	return mpext && mpext->data_seq + mpext->data_len == write_seq;
}

static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
				       const struct page_frag *pfrag,
				       const struct mptcp_data_frag *df)
{
	return df && pfrag->page == df->page &&
		df->data_seq + df->data_len == msk->write_seq;
}

static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_mem_uncharge(sk, len);
	sk_wmem_queued_add(sk, -len);
}

static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
	int len = dfrag->data_len + dfrag->overhead;

	list_del(&dfrag->list);
	dfrag_uncharge(sk, len);
	put_page(dfrag->page);
}

static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	u64 snd_una = atomic64_read(&msk->snd_una);
	bool cleaned = false;

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		u64 delta = dfrag->data_seq + dfrag->data_len - snd_una;

		dfrag->data_seq += delta;
		dfrag->data_len -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

	if (cleaned) {
		sk_mem_reclaim_partial(sk);

		/* Only wake up writers if a subflow is ready */
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
			sk_stream_write_space(sk);
	}
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
					pfrag, sk->sk_allocation)))
		return true;

	sk->sk_prot->enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}

static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
		      int orig_offset)
{
	int offset = ALIGN(orig_offset, sizeof(long));
	struct mptcp_data_frag *dfrag;

	dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
	dfrag->data_len = 0;
	dfrag->data_seq = msk->write_seq;
	dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
	dfrag->page = pfrag->page;

	return dfrag;
}

static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct msghdr *msg, struct mptcp_data_frag *dfrag,
			      long *timeo, int *pmss_now,
			      int *ps_goal)
{
	int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
	bool dfrag_collapsed, can_collapse = false;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_ext *mpext = NULL;
	bool retransmission = !!dfrag;
	struct sk_buff *skb, *tail;
	struct page_frag *pfrag;
	struct page *page;
	u64 *write_seq;
	size_t psize;

	/* use the mptcp page cache so that we can easily move the data
	 * from one substream to another, but do per subflow memory accounting.
	 * Note: pfrag is used only if !retransmission, but the compiler is
	 * fooled into a warning if we don't init it here.
	 */
	pfrag = sk_page_frag(sk);
	if (!retransmission) {
		write_seq = &msk->write_seq;
		page = pfrag->page;
	} else {
		write_seq = &dfrag->data_seq;
		page = dfrag->page;
	}

	/* compute copy limit */
	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
	*pmss_now = mss_now;
	*ps_goal = size_goal;
	avail_size = size_goal;
	skb = tcp_write_queue_tail(ssk);
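	/* If the subflow write queue tail already carries an in-order MPTCP
	 * mapping, the new data can be appended ("collapsed") to it;
	 * otherwise the tail is marked EOR so TCP never merges skbs across
	 * different mappings.
	 */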
	if (skb) {
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);

		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most one new skb.
		 * Explicitly tells TCP internals to avoid collapsing on later
		 * queue management operation, to avoid breaking the ext <->
		 * SSN association set here
		 */
		can_collapse = (size_goal - skb->len > 0) &&
			       mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
		if (!can_collapse)
			TCP_SKB_CB(skb)->eor = 1;
		else
			avail_size = size_goal - skb->len;
	}

	if (!retransmission) {
		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_rtx_tail(sk);
		offset = pfrag->offset;
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
			offset = dfrag->offset;
			frag_truesize = dfrag->overhead;
		}
		psize = min_t(size_t, pfrag->size - offset, avail_size);

		/* Copy to page */
		pr_debug("left=%zu", msg_data_left(msg));
		psize = copy_page_from_iter(pfrag->page, offset,
					    min_t(size_t, msg_data_left(msg),
						  psize),
					    &msg->msg_iter);
		pr_debug("left=%zu", msg_data_left(msg));
		if (!psize)
			return -EINVAL;

		if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
			return -ENOMEM;
	} else {
		offset = dfrag->offset;
		psize = min_t(size_t, dfrag->data_len, avail_size);
	}

	/* tell the TCP stack to delay the push so that we can safely
	 * access the skb after the sendpages call
	 */
	ret = do_tcp_sendpages(ssk, page, offset, psize,
			       msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
	if (ret <= 0)
		return ret;

	frag_truesize += ret;
	if (!retransmission) {
		if (unlikely(ret < psize))
			iov_iter_revert(&msg->msg_iter, psize - ret);

		/* send successful, keep track of sent data for mptcp-level
		 * retransmission
		 */
		dfrag->data_len += ret;
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			sk_wmem_queued_add(sk, frag_truesize);
		} else {
			sk_wmem_queued_add(sk, ret);
		}

		/* charge data on mptcp rtx queue to the master socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk->sk_forward_alloc -= frag_truesize;
	}

	/* if the tail skb extension is still the cached one, collapsing
	 * really happened. Note: we can't check for 'same skb' as the sk_buff
	 * hdr on tail can be transmitted, freed and re-allocated by the
	 * do_tcp_sendpages() call
	 */
	tail = tcp_write_queue_tail(ssk);
	if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
		WARN_ON_ONCE(!can_collapse);
		mpext->data_len += ret;
		goto out;
	}

	skb = tcp_write_queue_tail(ssk);
	mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
	msk->cached_ext = NULL;

	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = *write_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = ret;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

out:
	if (!retransmission)
		pfrag->offset += frag_truesize;
	*write_seq += ret;
	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

	return ret;
}

static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
{
	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

	/* enables sk->write_space() callbacks */
	set_bit(SOCK_NOSPACE, &sock->flags);
}

static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	if (!mptcp_ext_cache_refill(msk))
		return NULL;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!sk_stream_memory_free(ssk)) {
			struct socket *sock = ssk->sk_socket;

			if (sock)
				mptcp_nospace(msk, sock);

			return NULL;
		}

		if (subflow->backup) {
			if (!backup)
				backup = ssk;

			continue;
		}

		return ssk;
	}

	return backup;
}

static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
	struct socket *sock;

	if (likely(sk_stream_is_writeable(ssk)))
		return;

	sock = READ_ONCE(ssk->sk_socket);
	if (sock)
		mptcp_nospace(msk, sock);
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int mss_now = 0, size_goal = 0, ret = 0;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	struct socket *ssock;
	size_t copied = 0;
	struct sock *ssk;
	bool tx_ok;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	lock_sock(sk);

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto out;
	}

fallback:
	ssock = __mptcp_tcp_fallback(msk);
	if (unlikely(ssock)) {
		release_sock(sk);
		pr_debug("fallback passthrough");
		ret = sock_sendmsg(ssock, msg);
		return ret >= 0 ? ret + copied : (copied ? copied : ret);
	}

	pfrag = sk_page_frag(sk);
restart:
	mptcp_clean_una(sk);

wait_for_sndbuf:
	__mptcp_flush_join_list(msk);
	ssk = mptcp_subflow_get_send(msk);
	while (!sk_stream_memory_free(sk) ||
	       !ssk ||
	       !mptcp_page_frag_refill(ssk, pfrag)) {
		if (ssk) {
			/* make sure retransmit timer is
			 * running before we wait for memory.
			 *
			 * The retransmit timer might be needed
			 * to make the peer send an up-to-date
			 * MPTCP Ack.
			 */
			mptcp_set_timeout(sk, ssk);
			if (!mptcp_timer_pending(sk))
				mptcp_reset_timer(sk);
		}

		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;

		mptcp_clean_una(sk);

		ssk = mptcp_subflow_get_send(msk);
		if (list_empty(&msk->conn_list)) {
			ret = -ENOTCONN;
			goto out;
		}
	}

	pr_debug("conn_list->subflow=%p", ssk);

	lock_sock(ssk);
	tx_ok = msg_data_left(msg);
	while (tx_ok) {
		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
					 &size_goal);
		if (ret < 0) {
			if (ret == -EAGAIN && timeo > 0) {
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto restart;
			}
			break;
		}
		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
			/* Can happen for passive sockets:
			 * 3WHS negotiated MPTCP, but first packet after is
			 * plain TCP (e.g. due to middlebox filtering unknown
			 * options).
			 *
			 * Fall back to TCP.
			 */
			release_sock(ssk);
			goto fallback;
		}

		copied += ret;

		tx_ok = msg_data_left(msg);
		if (!tx_ok)
			break;

		if (!sk_stream_memory_free(ssk) ||
		    !mptcp_page_frag_refill(ssk, pfrag) ||
		    !mptcp_ext_cache_refill(msk)) {
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_set_timeout(sk, ssk);
			release_sock(ssk);
			goto restart;
		}

		/* memory is charged to mptcp level socket as well, i.e.
		 * if msg is very large, mptcp socket may run out of buffer
		 * space. mptcp_clean_una() will release data that has
		 * been acked at mptcp level in the mean time, so there is
		 * a good chance we can continue sending data right away.
		 *
		 * Normally, when the tcp subflow can accept more data, then
		 * so can the MPTCP socket. However, we need to cope with
		 * peers that might lag behind in their MPTCP-level
		 * acknowledgements, i.e. data might have been acked at
		 * tcp level only. So, we must also check the MPTCP socket
		 * limits before we send more data.
		 */
		if (unlikely(!sk_stream_memory_free(sk))) {
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_clean_una(sk);
			if (!sk_stream_memory_free(sk)) {
				/* can't send more for now, need to wait for
				 * MPTCP-level ACKs from peer.
				 *
				 * Wakeup will happen via mptcp_clean_una().
				 */
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto wait_for_sndbuf;
			}
		}
	}

	mptcp_set_timeout(sk, ssk);
	if (copied) {
		ret = copied;
		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

		/* start the timer, if it's not pending */
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
	}

	ssk_check_wmem(msk, ssk);
	release_sock(ssk);
out:
	release_sock(sk);
	return ret;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}

static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}

static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
	unsigned int moved = 0;
	bool done;

	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);

		if (!ssk)
			break;

		lock_sock(ssk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		release_sock(ssk);
	} while (!done);

	return moved > 0;
}

static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;
	int copied = 0;
	int target;
	long timeo;

	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	lock_sock(sk);
	ssock = __mptcp_tcp_fallback(msk);
	if (unlikely(ssock)) {
fallback:
		release_sock(sk);
		pr_debug("fallback-read subflow=%p",
			 mptcp_subflow_ctx(ssock->sk));
		copied = sock_recvmsg(ssock, msg, flags);
		return copied;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	__mptcp_flush_join_list(msk);

	while (len > (size_t)copied) {
		int bytes_read;

		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
		if (unlikely(bytes_read < 0)) {
			if (!copied)
				copied = bytes_read;
			goto out_err;
		}

		copied += bytes_read;

		if (skb_queue_empty(&sk->sk_receive_queue) &&
		    __mptcp_move_skbs(msk))
			continue;

		/* only the master socket status is relevant here. The exit
		 * conditions mirror closely tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
				mptcp_check_for_eof(msk);

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		mptcp_wait_data(sk, &timeo);
		ssock = __mptcp_tcp_fallback(msk);
		if (unlikely(ssock))
			goto fallback;
	}

	if (skb_queue_empty(&sk->sk_receive_queue)) {
		/* entire backlog drained, clear DATA_READY. */
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might have gotten new data
		 * after last __mptcp_move_skbs() returned false.
		 */
		if (unlikely(__mptcp_move_skbs(msk)))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
		/* data to read but mptcp_wait_data() cleared DATA_READY */
		set_bit(MPTCP_DATA_READY, &msk->flags);
	}
out_err:
	release_sock(sk);
	return copied;
}

static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (atomic64_read(&msk->snd_una) == msk->write_seq) {
		mptcp_stop_timer(sk);
	} else {
		set_bit(MPTCP_WORK_RTX, &msk->flags);
		if (schedule_work(&msk->work))
			sock_hold(sk);
	}
}

static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		mptcp_retransmit_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Find an idle subflow. Return NULL if there is unacked data at tcp
 * level.
 *
 * A backup subflow is returned only if that is the only kind available.
 */
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		/* still data outstanding at TCP level? Don't retransmit. */
		if (!tcp_write_queue_empty(ssk))
			return NULL;

		if (subflow->backup) {
			if (!backup)
				backup = ssk;
			continue;
		}

		return ssk;
	}

	return backup;
}

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 */
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
			      struct mptcp_subflow_context *subflow,
			      long timeout)
{
	struct socket *sock = READ_ONCE(ssk->sk_socket);

	list_del(&subflow->node);

	if (sock && sock != sk->sk_socket) {
		/* outgoing subflow */
		sock_release(sock);
	} else {
		/* incoming subflow */
		tcp_close(ssk, timeout);
	}
}

static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}

static void mptcp_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
	int orig_len, orig_offset, mss_now = 0, size_goal = 0;
	struct mptcp_data_frag *dfrag;
	u64 orig_write_seq;
	size_t copied = 0;
	struct msghdr msg;
	long timeo = 0;

	lock_sock(sk);
	mptcp_clean_una(sk);
	__mptcp_flush_join_list(msk);
	__mptcp_move_skbs(msk);

	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
		mptcp_check_for_eof(msk);

	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		goto unlock;

	dfrag = mptcp_rtx_head(sk);
	if (!dfrag)
		goto unlock;

	if (!mptcp_ext_cache_refill(msk))
		goto reset_unlock;

	ssk = mptcp_subflow_get_retrans(msk);
	if (!ssk)
		goto reset_unlock;

	lock_sock(ssk);

	msg.msg_flags = MSG_DONTWAIT;
	orig_len = dfrag->data_len;
	orig_offset = dfrag->offset;
	orig_write_seq = dfrag->data_seq;
	while (dfrag->data_len > 0) {
		int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
					     &mss_now, &size_goal);
		if (ret < 0)
			break;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
		copied += ret;
		dfrag->data_len -= ret;
		dfrag->offset += ret;

		if (!mptcp_ext_cache_refill(msk))
			break;
	}
	if (copied)
		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

	dfrag->data_seq = orig_write_seq;
	dfrag->offset = orig_offset;
	dfrag->data_len = orig_len;

	mptcp_set_timeout(sk, ssk);
	release_sock(ssk);

reset_unlock:
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);

unlock:
	release_sock(sk);
	sock_put(sk);
}

static int __mptcp_init_sock(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	spin_lock_init(&msk->join_list_lock);

	INIT_LIST_HEAD(&msk->conn_list);
	INIT_LIST_HEAD(&msk->join_list);
	INIT_LIST_HEAD(&msk->rtx_queue);
	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
	INIT_WORK(&msk->work, mptcp_worker);

	msk->first = NULL;
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;

	mptcp_pm_data_init(msk);

	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);

	return 0;
}

static int mptcp_init_sock(struct sock *sk)
{
	struct net *net = sock_net(sk);
	int ret;

	if (!mptcp_is_enabled(net))
		return -ENOPROTOOPT;

	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
		return -ENOMEM;

	ret = __mptcp_init_sock(sk);
	if (ret)
		return ret;

	sk_sockets_allocated_inc(sk);
	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];

	return 0;
}

static void __mptcp_clear_xmit(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;

	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
		dfrag_clear(sk, dfrag);
}

static void mptcp_cancel_work(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (cancel_work_sync(&msk->work))
		sock_put(sk);
}

static void mptcp_subflow_shutdown(struct sock *ssk, int how,
				   bool data_fin_tx_enable, u64 data_fin_tx_seq)
{
	lock_sock(ssk);

	switch (ssk->sk_state) {
	case TCP_LISTEN:
		if (!(how & RCV_SHUTDOWN))
			break;
		/* fall through */
	case TCP_SYN_SENT:
		tcp_disconnect(ssk, O_NONBLOCK);
		break;
	default:
		if (data_fin_tx_enable) {
			struct mptcp_subflow_context *subflow;

			subflow = mptcp_subflow_ctx(ssk);
			subflow->data_fin_tx_seq = data_fin_tx_seq;
			subflow->data_fin_tx_enable = 1;
		}

		ssk->sk_shutdown |= how;
		tcp_shutdown(ssk, how);
		break;
	}

	/* Wake up anyone sleeping in poll. */
	ssk->sk_state_change(ssk);
	release_sock(ssk);
}

/* Called with msk lock held, releases such lock before returning */
static void mptcp_close(struct sock *sk, long timeout)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);
	u64 data_fin_tx_seq;

	lock_sock(sk);

	inet_sk_state_store(sk, TCP_CLOSE);

	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
	 */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
	list_splice_init(&msk->conn_list, &conn_list);

	data_fin_tx_seq = msk->write_seq;

	__mptcp_clear_xmit(sk);

	release_sock(sk);

	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		subflow->data_fin_tx_seq = data_fin_tx_seq;
		subflow->data_fin_tx_enable = 1;
		__mptcp_close_ssk(sk, ssk, subflow, timeout);
	}

	mptcp_cancel_work(sk);
	mptcp_pm_close(msk);

	__skb_queue_purge(&sk->sk_receive_queue);

	sk_common_release(sk);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
	struct ipv6_pinfo *msk6 = inet6_sk(msk);

	msk->sk_v6_daddr = ssk->sk_v6_daddr;
	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

	if (msk6 && ssk6) {
		msk6->saddr = ssk6->saddr;
		msk6->flow_label = ssk6->flow_label;
	}
#endif

	inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
	inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
	inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
	inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
	inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

static int mptcp_disconnect(struct sock *sk, int flags)
{
	/* Should never be called.
	 * inet_stream_connect() calls ->disconnect, but that
	 * refers to the subflow socket, not the mptcp one.
	 */
	WARN_ON_ONCE(1);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif

struct sock *mptcp_sk_clone(const struct sock *sk,
			    const struct mptcp_options_received *mp_opt,
			    struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
	struct mptcp_sock *msk;
	u64 ack_seq;

	if (!nsk)
		return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (nsk->sk_family == AF_INET6)
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;

	if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) {
		nsk->sk_state = TCP_CLOSE;
		bh_unlock_sock(nsk);

		/* we can't call into mptcp_close() here - possible BH context
		 * free the sock directly.
		 * sk_clone_lock() sets nsk refcnt to two, hence call sk_free()
		 * too.
		 */
		sk_common_release(nsk);
		sk_free(nsk);
		return NULL;
	}

	msk->write_seq = subflow_req->idsn + 1;
	atomic64_set(&msk->snd_una, msk->write_seq);
	if (mp_opt->mp_capable) {
		msk->can_ack = true;
		msk->remote_key = mp_opt->sndr_key;
		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
		ack_seq++;
		msk->ack_seq = ack_seq;
	}

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);
	bh_unlock_sock(nsk);

	/* keep a single reference */
	__sock_put(nsk);
	return nsk;
}

static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
				 bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *listener;
	struct sock *newsk;

	listener = __mptcp_nmpc_socket(msk);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
	}

	pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
	newsk = inet_csk_accept(listener->sk, flags, err, kern);
	if (!newsk)
		return NULL;

	pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));

	if (sk_is_mptcp(newsk)) {
		struct mptcp_subflow_context *subflow;
		struct sock *new_mptcp_sock;
		struct sock *ssk = newsk;

		subflow = mptcp_subflow_ctx(newsk);
		new_mptcp_sock = subflow->conn;

		/* is_mptcp should be false if subflow->conn is missing, see
		 * subflow_syn_recv_sock()
		 */
		if (WARN_ON_ONCE(!new_mptcp_sock)) {
			tcp_sk(newsk)->is_mptcp = 0;
			return newsk;
		}

		/* acquire the 2nd reference for the owning socket */
		sock_hold(new_mptcp_sock);

		local_bh_disable();
		bh_lock_sock(new_mptcp_sock);
		msk = mptcp_sk(new_mptcp_sock);
		msk->first = newsk;

		newsk = new_mptcp_sock;
		mptcp_copy_inaddrs(newsk, ssk);
		list_add(&subflow->node, &msk->conn_list);
		inet_sk_state_store(newsk, TCP_ESTABLISHED);

		bh_unlock_sock(new_mptcp_sock);

		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
		local_bh_enable();
	} else {
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

	return newsk;
}

static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_token_destroy(msk->token);
	if (msk->cached_ext)
		__skb_ext_put(msk->cached_ext);

	sk_sockets_allocated_dec(sk);
}

static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssock = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssock)
		return tcp_setsockopt(ssock->sk, level, optname, optval,
				      optlen);

	return -EOPNOTSUPP;
}

static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of getsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssock = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssock)
		return tcp_getsockopt(ssock->sk, level, optname, optval,
				      option);

	return -EOPNOTSUPP;
}

#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
			    TCPF_WRITE_TIMER_DEFERRED)

/* this is very similar to tcp_release_cb(), but we must handle a
 * different set of events
 */
static void mptcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & MPTCP_DEFERRED_ALL))
			return;
		nflags = flags & ~MPTCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	sock_release_ownership(sk);

	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		struct mptcp_sock *msk = mptcp_sk(sk);
		struct sock *ssk;

		ssk = mptcp_subflow_recv_lookup(msk);
		if (!ssk || !schedule_work(&msk->work))
			__sock_put(sk);
	}

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		mptcp_retransmit_handler(sk);
		__sock_put(sk);
	}
}

static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	ssock = __mptcp_nmpc_socket(msk);
	pr_debug("msk=%p, subflow=%p", msk, ssock);
	if (WARN_ON_ONCE(!ssock))
		return -EINVAL;

	return inet_csk_get_port(ssock->sk, snum);
}

void mptcp_finish_connect(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk;
	struct sock *sk;
	u64 ack_seq;

	subflow = mptcp_subflow_ctx(ssk);
	sk = subflow->conn;
	msk = mptcp_sk(sk);

	if (!subflow->mp_capable) {
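		/* the MP_CAPABLE handshake did not complete on this subflow:
		 * the connection falls back to plain TCP, just account for it
		 */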
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
		return;
	}

	pr_debug("msk=%p, token=%u", sk, subflow->token);

	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
	ack_seq++;
	subflow->map_seq = ack_seq;
	subflow->map_subflow_seq = 1;
	subflow->rel_write_seq = 1;

	/* the socket is not connected yet, no msk/subflow ops can access/race
	 * accessing the fields below
	 */
	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->local_key, subflow->local_key);
	WRITE_ONCE(msk->token, subflow->token);
	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
	WRITE_ONCE(msk->ack_seq, ack_seq);
	WRITE_ONCE(msk->can_ack, 1);
	atomic64_set(&msk->snd_una, msk->write_seq);

	mptcp_pm_new_connection(msk, 0);
}

static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	rcu_assign_pointer(sk->sk_wq, &parent->wq);
	sk_set_socket(sk, parent);
	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	write_unlock_bh(&sk->sk_callback_lock);
}

bool mptcp_finish_join(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *parent = (void *)msk;
	struct socket *parent_sock;
	bool ret;

	pr_debug("msk=%p, subflow=%p", msk, subflow);

	/* mptcp socket already closing? */
	if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
		return false;

	if (!msk->pm.server_side)
		return true;

	if (!mptcp_pm_allow_new_subflow(msk))
		return false;

	/* active connections are already on conn_list, and we can't acquire
	 * msk lock here.
	 * use the join list lock as synchronization point and double-check
	 * msk status to avoid racing with mptcp_close()
	 */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
		list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);
	if (!ret)
		return false;

	/* attach to the msk socket only after we are sure it will deal with
	 * us at close time
	 */
	parent_sock = READ_ONCE(parent->sk_socket);
	if (parent_sock && !sk->sk_socket)
		mptcp_sock_graft(sk, parent_sock);
	subflow->map_seq = msk->ack_seq;
	return true;
}

static bool mptcp_memory_free(const struct sock *sk, int wake)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
}

static struct proto mptcp_prot = {
	.name		= "MPTCP",
	.owner		= THIS_MODULE,
	.init		= mptcp_init_sock,
	.disconnect	= mptcp_disconnect,
	.close		= mptcp_close,
	.accept		= mptcp_accept,
	.setsockopt	= mptcp_setsockopt,
	.getsockopt	= mptcp_getsockopt,
	.shutdown	= tcp_shutdown,
	.destroy	= mptcp_destroy,
	.sendmsg	= mptcp_sendmsg,
	.recvmsg	= mptcp_recvmsg,
	.release_cb	= mptcp_release_cb,
	.hash		= inet_hash,
	.unhash		= inet_unhash,
	.get_port	= mptcp_get_port,
	.sockets_allocated	= &mptcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.stream_memory_free	= mptcp_memory_free,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_mem	= sysctl_tcp_mem,
	.obj_size	= sizeof(struct mptcp_sock),
	.no_autobind	= true,
};

static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
	if (IS_ERR(ssock)) {
		err = PTR_ERR(ssock);
		goto unlock;
	}

	err = ssock->ops->bind(ssock, uaddr, addr_len);
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
				int addr_len, int flags)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	if (sock->state != SS_UNCONNECTED && msk->subflow) {
		/* pending connection or invalid state, let existing subflow
		 * cope with that
		 */
		ssock = msk->subflow;
		goto do_connect;
	}

	ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
	if (IS_ERR(ssock)) {
		err = PTR_ERR(ssock);
		goto unlock;
	}

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
		mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
#endif

do_connect:
	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
	sock->state = ssock->state;

	/* on successful connect, the msk state will be moved to established by
	 * subflow_finish_connect()
	 */
	if (!err || err == -EINPROGRESS)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);
	else
		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr,
			    int peer)
{
	if (sock->sk->sk_prot == &tcp_prot) {
		/* we are being invoked from __sys_accept4, after
		 * mptcp_accept() has just accepted a non-mp-capable
		 * flow: sk is a tcp_sk, not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
	}

	return inet_getname(sock, uaddr, peer);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr,
			    int peer)
{
	if (sock->sk->sk_prot == &tcpv6_prot) {
		/* we are being invoked from __sys_accept4 after
		 * mptcp_accept() has accepted a non-mp-capable
		 * subflow: sk is a tcp_sk, not mptcp.
		 *
		 * Hand the socket over to tcp so all further
		 * socket ops bypass mptcp.
		 */
		sock->ops = &inet6_stream_ops;
	}

	return inet6_getname(sock, uaddr, peer);
}
#endif

static int mptcp_listen(struct socket *sock, int backlog)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	ssock = __mptcp_socket_create(msk, TCP_LISTEN);
	if (IS_ERR(ssock)) {
		err = PTR_ERR(ssock);
		goto unlock;
	}

	sock_set_flag(sock->sk, SOCK_RCU_FREE);

	err = ssock->ops->listen(ssock, backlog);
	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static bool is_tcp_proto(const struct proto *p)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	return p == &tcp_prot || p == &tcpv6_prot;
#else
	return p == &tcp_prot;
#endif
}

static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	if (sock->sk->sk_state != TCP_LISTEN)
		goto unlock_fail;

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock)
		goto unlock_fail;

	sock_hold(ssock->sk);
	release_sock(sock->sk);

	err = ssock->ops->accept(sock, newsock, flags, kern);
	if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) {
		struct mptcp_sock *msk = mptcp_sk(newsock->sk);
		struct mptcp_subflow_context *subflow;

		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
		 */
		__mptcp_flush_join_list(msk);
		list_for_each_entry(subflow, &msk->conn_list, node) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			if (!ssk->sk_socket)
				mptcp_sock_graft(ssk, newsock);
		}
	}

	sock_put(ssock->sk);
	return err;

unlock_fail:
	release_sock(sock->sk);
	return -EINVAL;
}

static __poll_t mptcp_poll(struct file *file, struct socket *sock,
			   struct poll_table_struct *wait)
{
	struct sock *sk = sock->sk;
	struct mptcp_sock *msk;
	struct socket *ssock;
	__poll_t mask = 0;

	msk = mptcp_sk(sk);
	lock_sock(sk);
	ssock = __mptcp_tcp_fallback(msk);
	if (!ssock)
		ssock = __mptcp_nmpc_socket(msk);
	if (ssock) {
		mask = ssock->ops->poll(file, ssock, wait);
		release_sock(sk);
		return mask;
	}

	release_sock(sk);
	sock_poll_wait(file, sock, wait);
	lock_sock(sk);

	if (test_bit(MPTCP_DATA_READY, &msk->flags))
		mask = EPOLLIN | EPOLLRDNORM;
	if (sk_stream_is_writeable(sk) &&
	    test_bit(MPTCP_SEND_SPACE, &msk->flags))
		mask |= EPOLLOUT | EPOLLWRNORM;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	release_sock(sk);

	return mask;
}

static int mptcp_shutdown(struct socket *sock, int how)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	struct socket *ssock;
	int ret = 0;

	pr_debug("msk=%p, how=%d", msk, how);

	lock_sock(sock->sk);
	ssock = __mptcp_tcp_fallback(msk);
	if (ssock) {
		release_sock(sock->sk);
		return inet_shutdown(ssock, how);
	}

	if (how == SHUT_WR || how == SHUT_RDWR)
		inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);

	how++;

	if ((how & ~SHUTDOWN_MASK) || !how) {
		ret = -EINVAL;
		goto out_unlock;
	}

	if (sock->state == SS_CONNECTING) {
		if ((1 << sock->sk->sk_state) &
		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
			sock->state = SS_DISCONNECTING;
		else
			sock->state = SS_CONNECTED;
	}

	__mptcp_flush_join_list(msk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

		mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq);
	}

out_unlock:
	release_sock(sock->sk);

	return ret;
}

static const struct proto_ops mptcp_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = mptcp_v4_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};

static struct inet_protosw mptcp_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_prot,
	.ops		= &mptcp_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

void mptcp_proto_init(void)
{
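	/* MPTCP re-uses the TCP socket hash tables: inet_hash() and
	 * inet_unhash() in mptcp_prot operate on tcp_prot's hashinfo.
	 */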
	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
		panic("Failed to allocate MPTCP pcpu counter\n");

	mptcp_subflow_init();
	mptcp_pm_init();

	if (proto_register(&mptcp_prot, 1) != 0)
		panic("Failed to register MPTCP proto.\n");

	inet_register_protosw(&mptcp_protosw);

	BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static const struct proto_ops mptcp_v6_stream_ops = {
	.family		   = PF_INET6,
	.owner		   = THIS_MODULE,
	.release	   = inet6_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = mptcp_v6_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet6_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet6_sendmsg,
	.recvmsg	   = inet6_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};

static struct proto mptcp_v6_prot;

static void mptcp_v6_destroy(struct sock *sk)
{
	mptcp_destroy(sk);
	inet6_destroy_sock(sk);
}

static struct inet_protosw mptcp_v6_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_v6_prot,
	.ops		= &mptcp_v6_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

int mptcp_proto_v6_init(void)
{
	int err;

	mptcp_v6_prot = mptcp_prot;
	strcpy(mptcp_v6_prot.name, "MPTCPv6");
	mptcp_v6_prot.slab = NULL;
	mptcp_v6_prot.destroy = mptcp_v6_destroy;
	mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);

	err = proto_register(&mptcp_v6_prot, 1);
	if (err)
		return err;

	err = inet6_register_protosw(&mptcp_v6_protosw);
	if (err)
		proto_unregister(&mptcp_v6_prot);

	return err;
}
#endif