1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #define pr_fmt(fmt) "MPTCP: " fmt 8 9 #include <linux/kernel.h> 10 #include <linux/module.h> 11 #include <linux/netdevice.h> 12 #include <linux/sched/signal.h> 13 #include <linux/atomic.h> 14 #include <net/sock.h> 15 #include <net/inet_common.h> 16 #include <net/inet_hashtables.h> 17 #include <net/protocol.h> 18 #include <net/tcp.h> 19 #include <net/tcp_states.h> 20 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 21 #include <net/transp_v6.h> 22 #endif 23 #include <net/mptcp.h> 24 #include <net/xfrm.h> 25 #include <asm/ioctls.h> 26 #include "protocol.h" 27 #include "mib.h" 28 29 #define CREATE_TRACE_POINTS 30 #include <trace/events/mptcp.h> 31 32 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 33 struct mptcp6_sock { 34 struct mptcp_sock msk; 35 struct ipv6_pinfo np; 36 }; 37 #endif 38 39 struct mptcp_skb_cb { 40 u64 map_seq; 41 u64 end_seq; 42 u32 offset; 43 u8 has_rxtstamp:1; 44 }; 45 46 #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) 47 48 enum { 49 MPTCP_CMSG_TS = BIT(0), 50 MPTCP_CMSG_INQ = BIT(1), 51 }; 52 53 static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; 54 55 static void __mptcp_destroy_sock(struct sock *sk); 56 static void __mptcp_check_send_data_fin(struct sock *sk); 57 58 DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); 59 static struct net_device mptcp_napi_dev; 60 61 /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not 62 * completed yet or has failed, return the subflow socket. 63 * Otherwise return NULL. 64 */ 65 struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) 66 { 67 if (!msk->subflow || READ_ONCE(msk->can_ack)) 68 return NULL; 69 70 return msk->subflow; 71 } 72 73 /* Returns end sequence number of the receiver's advertised window */ 74 static u64 mptcp_wnd_end(const struct mptcp_sock *msk) 75 { 76 return READ_ONCE(msk->wnd_end); 77 } 78 79 static bool mptcp_is_tcpsk(struct sock *sk) 80 { 81 struct socket *sock = sk->sk_socket; 82 83 if (unlikely(sk->sk_prot == &tcp_prot)) { 84 /* we are being invoked after mptcp_accept() has 85 * accepted a non-mp-capable flow: sk is a tcp_sk, 86 * not an mptcp one. 87 * 88 * Hand the socket over to tcp so all further socket ops 89 * bypass mptcp. 
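		 *
		 * For example, a non-MPTCP client that completed a plain TCP
		 * handshake against an MPTCP listener still gets a working
		 * socket from accept(); after the ops switch below, roughly
		 * speaking, every read()/write()/poll() on it is served by
		 * plain TCP, with no MPTCP-level (DSS) processing involved.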
90 */ 91 sock->ops = &inet_stream_ops; 92 return true; 93 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 94 } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { 95 sock->ops = &inet6_stream_ops; 96 return true; 97 #endif 98 } 99 100 return false; 101 } 102 103 static int __mptcp_socket_create(struct mptcp_sock *msk) 104 { 105 struct mptcp_subflow_context *subflow; 106 struct sock *sk = (struct sock *)msk; 107 struct socket *ssock; 108 int err; 109 110 err = mptcp_subflow_create_socket(sk, &ssock); 111 if (err) 112 return err; 113 114 msk->first = ssock->sk; 115 msk->subflow = ssock; 116 subflow = mptcp_subflow_ctx(ssock->sk); 117 list_add(&subflow->node, &msk->conn_list); 118 sock_hold(ssock->sk); 119 subflow->request_mptcp = 1; 120 121 /* This is the first subflow, always with id 0 */ 122 subflow->local_id_valid = 1; 123 mptcp_sock_graft(msk->first, sk->sk_socket); 124 125 return 0; 126 } 127 128 static void mptcp_drop(struct sock *sk, struct sk_buff *skb) 129 { 130 sk_drops_add(sk, skb); 131 __kfree_skb(skb); 132 } 133 134 static void mptcp_rmem_charge(struct sock *sk, int size) 135 { 136 mptcp_sk(sk)->rmem_fwd_alloc -= size; 137 } 138 139 static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, 140 struct sk_buff *from) 141 { 142 bool fragstolen; 143 int delta; 144 145 if (MPTCP_SKB_CB(from)->offset || 146 !skb_try_coalesce(to, from, &fragstolen, &delta)) 147 return false; 148 149 pr_debug("colesced seq %llx into %llx new len %d new end seq %llx", 150 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, 151 to->len, MPTCP_SKB_CB(from)->end_seq); 152 MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; 153 kfree_skb_partial(from, fragstolen); 154 atomic_add(delta, &sk->sk_rmem_alloc); 155 mptcp_rmem_charge(sk, delta); 156 return true; 157 } 158 159 static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, 160 struct sk_buff *from) 161 { 162 if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) 163 return false; 164 165 return mptcp_try_coalesce((struct sock *)msk, to, from); 166 } 167 168 static void __mptcp_rmem_reclaim(struct sock *sk, int amount) 169 { 170 amount >>= PAGE_SHIFT; 171 mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT; 172 __sk_mem_reduce_allocated(sk, amount); 173 } 174 175 static void mptcp_rmem_uncharge(struct sock *sk, int size) 176 { 177 struct mptcp_sock *msk = mptcp_sk(sk); 178 int reclaimable; 179 180 msk->rmem_fwd_alloc += size; 181 reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); 182 183 /* see sk_mem_uncharge() for the rationale behind the following schema */ 184 if (unlikely(reclaimable >= PAGE_SIZE)) 185 __mptcp_rmem_reclaim(sk, reclaimable); 186 } 187 188 static void mptcp_rfree(struct sk_buff *skb) 189 { 190 unsigned int len = skb->truesize; 191 struct sock *sk = skb->sk; 192 193 atomic_sub(len, &sk->sk_rmem_alloc); 194 mptcp_rmem_uncharge(sk, len); 195 } 196 197 static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) 198 { 199 skb_orphan(skb); 200 skb->sk = sk; 201 skb->destructor = mptcp_rfree; 202 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 203 mptcp_rmem_charge(sk, skb->truesize); 204 } 205 206 /* "inspired" by tcp_data_queue_ofo(), main differences: 207 * - use mptcp seqs 208 * - don't cope with sacks 209 */ 210 static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) 211 { 212 struct sock *sk = (struct sock *)msk; 213 struct rb_node **p, *parent; 214 u64 seq, end_seq, max_seq; 215 struct sk_buff *skb1; 216 217 seq = MPTCP_SKB_CB(skb)->map_seq; 218 end_seq = 
MPTCP_SKB_CB(skb)->end_seq; 219 max_seq = atomic64_read(&msk->rcv_wnd_sent); 220 221 pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, 222 RB_EMPTY_ROOT(&msk->out_of_order_queue)); 223 if (after64(end_seq, max_seq)) { 224 /* out of window */ 225 mptcp_drop(sk, skb); 226 pr_debug("oow by %lld, rcv_wnd_sent %llu\n", 227 (unsigned long long)end_seq - (unsigned long)max_seq, 228 (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); 229 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); 230 return; 231 } 232 233 p = &msk->out_of_order_queue.rb_node; 234 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); 235 if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { 236 rb_link_node(&skb->rbnode, NULL, p); 237 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); 238 msk->ooo_last_skb = skb; 239 goto end; 240 } 241 242 /* with 2 subflows, adding at end of ooo queue is quite likely 243 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 244 */ 245 if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { 246 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 247 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 248 return; 249 } 250 251 /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ 252 if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { 253 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 254 parent = &msk->ooo_last_skb->rbnode; 255 p = &parent->rb_right; 256 goto insert; 257 } 258 259 /* Find place to insert this segment. Handle overlaps on the way. */ 260 parent = NULL; 261 while (*p) { 262 parent = *p; 263 skb1 = rb_to_skb(parent); 264 if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 265 p = &parent->rb_left; 266 continue; 267 } 268 if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { 269 if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { 270 /* All the bits are present. Drop. */ 271 mptcp_drop(sk, skb); 272 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 273 return; 274 } 275 if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 276 /* partial overlap: 277 * | skb | 278 * | skb1 | 279 * continue traversing 280 */ 281 } else { 282 /* skb's seq == skb1's seq and skb covers skb1. 283 * Replace skb1 with skb. 284 */ 285 rb_replace_node(&skb1->rbnode, &skb->rbnode, 286 &msk->out_of_order_queue); 287 mptcp_drop(sk, skb1); 288 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 289 goto merge_right; 290 } 291 } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { 292 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 293 return; 294 } 295 p = &parent->rb_right; 296 } 297 298 insert: 299 /* Insert segment into RB tree. */ 300 rb_link_node(&skb->rbnode, parent, p); 301 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); 302 303 merge_right: 304 /* Remove other segments covered by skb. */ 305 while ((skb1 = skb_rb_next(skb)) != NULL) { 306 if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) 307 break; 308 rb_erase(&skb1->rbnode, &msk->out_of_order_queue); 309 mptcp_drop(sk, skb1); 310 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 311 } 312 /* If there is no skb after us, we are the last_skb ! 
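	 * e.g. if skb covers [100, 300) and the previous tail covered
	 * [200, 280), the loop above has just erased that tail as fully
	 * overlapped, so skb may now be the right-most segment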
*/ 313 if (!skb1) 314 msk->ooo_last_skb = skb; 315 316 end: 317 skb_condense(skb); 318 mptcp_set_owner_r(skb, sk); 319 } 320 321 static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) 322 { 323 struct mptcp_sock *msk = mptcp_sk(sk); 324 int amt, amount; 325 326 if (size <= msk->rmem_fwd_alloc) 327 return true; 328 329 size -= msk->rmem_fwd_alloc; 330 amt = sk_mem_pages(size); 331 amount = amt << PAGE_SHIFT; 332 if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) 333 return false; 334 335 msk->rmem_fwd_alloc += amount; 336 return true; 337 } 338 339 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, 340 struct sk_buff *skb, unsigned int offset, 341 size_t copy_len) 342 { 343 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 344 struct sock *sk = (struct sock *)msk; 345 struct sk_buff *tail; 346 bool has_rxtstamp; 347 348 __skb_unlink(skb, &ssk->sk_receive_queue); 349 350 skb_ext_reset(skb); 351 skb_orphan(skb); 352 353 /* try to fetch required memory from subflow */ 354 if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) 355 goto drop; 356 357 has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 358 359 /* the skb map_seq accounts for the skb offset: 360 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq 361 * value 362 */ 363 MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); 364 MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; 365 MPTCP_SKB_CB(skb)->offset = offset; 366 MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; 367 368 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 369 /* in sequence */ 370 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); 371 tail = skb_peek_tail(&sk->sk_receive_queue); 372 if (tail && mptcp_try_coalesce(sk, tail, skb)) 373 return true; 374 375 mptcp_set_owner_r(skb, sk); 376 __skb_queue_tail(&sk->sk_receive_queue, skb); 377 return true; 378 } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { 379 mptcp_data_queue_ofo(msk, skb); 380 return false; 381 } 382 383 /* old data, keep it simple and drop the whole pkt, sender 384 * will retransmit as needed, if needed. 
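	 * e.g. msk->ack_seq has already reached 1000 while this skb maps
	 * [500, 900): every byte in it was received before, so the whole
	 * skb is accounted as duplicate data and freed below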
385 */ 386 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 387 drop: 388 mptcp_drop(sk, skb); 389 return false; 390 } 391 392 static void mptcp_stop_timer(struct sock *sk) 393 { 394 struct inet_connection_sock *icsk = inet_csk(sk); 395 396 sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 397 mptcp_sk(sk)->timer_ival = 0; 398 } 399 400 static void mptcp_close_wake_up(struct sock *sk) 401 { 402 if (sock_flag(sk, SOCK_DEAD)) 403 return; 404 405 sk->sk_state_change(sk); 406 if (sk->sk_shutdown == SHUTDOWN_MASK || 407 sk->sk_state == TCP_CLOSE) 408 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 409 else 410 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 411 } 412 413 static bool mptcp_pending_data_fin_ack(struct sock *sk) 414 { 415 struct mptcp_sock *msk = mptcp_sk(sk); 416 417 return !__mptcp_check_fallback(msk) && 418 ((1 << sk->sk_state) & 419 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && 420 msk->write_seq == READ_ONCE(msk->snd_una); 421 } 422 423 static void mptcp_check_data_fin_ack(struct sock *sk) 424 { 425 struct mptcp_sock *msk = mptcp_sk(sk); 426 427 /* Look for an acknowledged DATA_FIN */ 428 if (mptcp_pending_data_fin_ack(sk)) { 429 WRITE_ONCE(msk->snd_data_fin_enable, 0); 430 431 switch (sk->sk_state) { 432 case TCP_FIN_WAIT1: 433 inet_sk_state_store(sk, TCP_FIN_WAIT2); 434 break; 435 case TCP_CLOSING: 436 case TCP_LAST_ACK: 437 inet_sk_state_store(sk, TCP_CLOSE); 438 break; 439 } 440 441 mptcp_close_wake_up(sk); 442 } 443 } 444 445 static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) 446 { 447 struct mptcp_sock *msk = mptcp_sk(sk); 448 449 if (READ_ONCE(msk->rcv_data_fin) && 450 ((1 << sk->sk_state) & 451 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { 452 u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); 453 454 if (msk->ack_seq == rcv_data_fin_seq) { 455 if (seq) 456 *seq = rcv_data_fin_seq; 457 458 return true; 459 } 460 } 461 462 return false; 463 } 464 465 static void mptcp_set_datafin_timeout(const struct sock *sk) 466 { 467 struct inet_connection_sock *icsk = inet_csk(sk); 468 u32 retransmits; 469 470 retransmits = min_t(u32, icsk->icsk_retransmits, 471 ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); 472 473 mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; 474 } 475 476 static void __mptcp_set_timeout(struct sock *sk, long tout) 477 { 478 mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; 479 } 480 481 static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) 482 { 483 const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 484 485 return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 
486 inet_csk(ssk)->icsk_timeout - jiffies : 0; 487 } 488 489 static void mptcp_set_timeout(struct sock *sk) 490 { 491 struct mptcp_subflow_context *subflow; 492 long tout = 0; 493 494 mptcp_for_each_subflow(mptcp_sk(sk), subflow) 495 tout = max(tout, mptcp_timeout_from_subflow(subflow)); 496 __mptcp_set_timeout(sk, tout); 497 } 498 499 static inline bool tcp_can_send_ack(const struct sock *ssk) 500 { 501 return !((1 << inet_sk_state_load(ssk)) & 502 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); 503 } 504 505 void mptcp_subflow_send_ack(struct sock *ssk) 506 { 507 bool slow; 508 509 slow = lock_sock_fast(ssk); 510 if (tcp_can_send_ack(ssk)) 511 tcp_send_ack(ssk); 512 unlock_sock_fast(ssk, slow); 513 } 514 515 static void mptcp_send_ack(struct mptcp_sock *msk) 516 { 517 struct mptcp_subflow_context *subflow; 518 519 mptcp_for_each_subflow(msk, subflow) 520 mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); 521 } 522 523 static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) 524 { 525 bool slow; 526 527 slow = lock_sock_fast(ssk); 528 if (tcp_can_send_ack(ssk)) 529 tcp_cleanup_rbuf(ssk, 1); 530 unlock_sock_fast(ssk, slow); 531 } 532 533 static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) 534 { 535 const struct inet_connection_sock *icsk = inet_csk(ssk); 536 u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); 537 const struct tcp_sock *tp = tcp_sk(ssk); 538 539 return (ack_pending & ICSK_ACK_SCHED) && 540 ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > 541 READ_ONCE(icsk->icsk_ack.rcv_mss)) || 542 (rx_empty && ack_pending & 543 (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); 544 } 545 546 static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) 547 { 548 int old_space = READ_ONCE(msk->old_wspace); 549 struct mptcp_subflow_context *subflow; 550 struct sock *sk = (struct sock *)msk; 551 int space = __mptcp_space(sk); 552 bool cleanup, rx_empty; 553 554 cleanup = (space > 0) && (space >= (old_space << 1)); 555 rx_empty = !__mptcp_rmem(sk); 556 557 mptcp_for_each_subflow(msk, subflow) { 558 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 559 560 if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) 561 mptcp_subflow_cleanup_rbuf(ssk); 562 } 563 } 564 565 static bool mptcp_check_data_fin(struct sock *sk) 566 { 567 struct mptcp_sock *msk = mptcp_sk(sk); 568 u64 rcv_data_fin_seq; 569 bool ret = false; 570 571 if (__mptcp_check_fallback(msk)) 572 return ret; 573 574 /* Need to ack a DATA_FIN received from a peer while this side 575 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. 576 * msk->rcv_data_fin was set when parsing the incoming options 577 * at the subflow level and the msk lock was not held, so this 578 * is the first opportunity to act on the DATA_FIN and change 579 * the msk state. 580 * 581 * If we are caught up to the sequence number of the incoming 582 * DATA_FIN, send the DATA_ACK now and do state transition. If 583 * not caught up, do nothing and let the recv code send DATA_ACK 584 * when catching up. 
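	 *
	 * For example, if the peer's DATA_FIN sits at data sequence X and
	 * msk->ack_seq has already reached X, the code below bumps ack_seq
	 * to X + 1, sends the DATA_ACK and moves an ESTABLISHED msk to
	 * CLOSE_WAIT; if ack_seq is still behind X, nothing happens here
	 * and the receive path will re-trigger this check once the gap is
	 * filled.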
585 */ 586 587 if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { 588 WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); 589 WRITE_ONCE(msk->rcv_data_fin, 0); 590 591 sk->sk_shutdown |= RCV_SHUTDOWN; 592 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 593 594 switch (sk->sk_state) { 595 case TCP_ESTABLISHED: 596 inet_sk_state_store(sk, TCP_CLOSE_WAIT); 597 break; 598 case TCP_FIN_WAIT1: 599 inet_sk_state_store(sk, TCP_CLOSING); 600 break; 601 case TCP_FIN_WAIT2: 602 inet_sk_state_store(sk, TCP_CLOSE); 603 break; 604 default: 605 /* Other states not expected */ 606 WARN_ON_ONCE(1); 607 break; 608 } 609 610 ret = true; 611 mptcp_send_ack(msk); 612 mptcp_close_wake_up(sk); 613 } 614 return ret; 615 } 616 617 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, 618 struct sock *ssk, 619 unsigned int *bytes) 620 { 621 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 622 struct sock *sk = (struct sock *)msk; 623 unsigned int moved = 0; 624 bool more_data_avail; 625 struct tcp_sock *tp; 626 bool done = false; 627 int sk_rbuf; 628 629 sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 630 631 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 632 int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 633 634 if (unlikely(ssk_rbuf > sk_rbuf)) { 635 WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); 636 sk_rbuf = ssk_rbuf; 637 } 638 } 639 640 pr_debug("msk=%p ssk=%p", msk, ssk); 641 tp = tcp_sk(ssk); 642 do { 643 u32 map_remaining, offset; 644 u32 seq = tp->copied_seq; 645 struct sk_buff *skb; 646 bool fin; 647 648 /* try to move as much data as available */ 649 map_remaining = subflow->map_data_len - 650 mptcp_subflow_get_map_offset(subflow); 651 652 skb = skb_peek(&ssk->sk_receive_queue); 653 if (!skb) { 654 /* if no data is found, a racing workqueue/recvmsg 655 * already processed the new data, stop here or we 656 * can enter an infinite loop 657 */ 658 if (!moved) 659 done = true; 660 break; 661 } 662 663 if (__mptcp_check_fallback(msk)) { 664 /* if we are running under the workqueue, TCP could have 665 * collapsed skbs between dummy map creation and now 666 * be sure to adjust the size 667 */ 668 map_remaining = skb->len; 669 subflow->map_data_len = skb->len; 670 } 671 672 offset = seq - TCP_SKB_CB(skb)->seq; 673 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 674 if (fin) { 675 done = true; 676 seq++; 677 } 678 679 if (offset < skb->len) { 680 size_t len = skb->len - offset; 681 682 if (tp->urg_data) 683 done = true; 684 685 if (__mptcp_move_skb(msk, ssk, skb, offset, len)) 686 moved += len; 687 seq += len; 688 689 if (WARN_ON_ONCE(map_remaining < len)) 690 break; 691 } else { 692 WARN_ON_ONCE(!fin); 693 sk_eat_skb(ssk, skb); 694 done = true; 695 } 696 697 WRITE_ONCE(tp->copied_seq, seq); 698 more_data_avail = mptcp_subflow_data_available(ssk); 699 700 if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { 701 done = true; 702 break; 703 } 704 } while (more_data_avail); 705 706 *bytes += moved; 707 return done; 708 } 709 710 static bool __mptcp_ofo_queue(struct mptcp_sock *msk) 711 { 712 struct sock *sk = (struct sock *)msk; 713 struct sk_buff *skb, *tail; 714 bool moved = false; 715 struct rb_node *p; 716 u64 end_seq; 717 718 p = rb_first(&msk->out_of_order_queue); 719 pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); 720 while (p) { 721 skb = rb_to_skb(p); 722 if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) 723 break; 724 725 p = rb_next(p); 726 rb_erase(&skb->rbnode, &msk->out_of_order_queue); 727 728 if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, 729 msk->ack_seq))) { 
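			/* This ooo skb ends at or before msk->ack_seq, i.e. every
			 * byte in it was already received in sequence (e.g.
			 * ack_seq == 1000 and the skb maps [900, 1000)): account
			 * it as duplicate data and free it.
			 */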
			mptcp_drop(sk, skb);
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
			continue;
		}

		end_seq = MPTCP_SKB_CB(skb)->end_seq;
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
			int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;

			/* skip overlapping data, if any */
			pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
				 delta);
			MPTCP_SKB_CB(skb)->offset += delta;
			MPTCP_SKB_CB(skb)->map_seq += delta;
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		}
		msk->ack_seq = end_seq;
		moved = true;
	}
	return moved;
}

/* In most cases we will be able to lock the mptcp socket. If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
	__mptcp_ofo_queue(msk);
	if (unlikely(ssk->sk_err)) {
		if (!sock_owned_by_user(sk))
			__mptcp_error_report(sk);
		else
			__set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
	}

	/* If the moves have caught up with the DATA_FIN sequence number
	 * it's time to ack the DATA_FIN and change socket state, but
	 * this is not a good place to change state. Let the workqueue
	 * do it.
	 */
	if (mptcp_pending_data_fin(sk, NULL))
		mptcp_schedule_work(sk);
	return moved > 0;
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(sk);
	int sk_rbuf, ssk_rbuf;

	/* The peer can send data while we are shutting down this
	 * subflow at msk destruction time, but we must avoid enqueuing
	 * more data to the msk receive queue
	 */
	if (unlikely(subflow->disposable))
		return;

	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
	if (unlikely(ssk_rbuf > sk_rbuf))
		sk_rbuf = ssk_rbuf;

	/* over limit?
can't append more skbs to msk, Also, no need to wake-up*/ 800 if (__mptcp_rmem(sk) > sk_rbuf) { 801 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 802 return; 803 } 804 805 /* Wake-up the reader only for in-sequence data */ 806 mptcp_data_lock(sk); 807 if (move_skbs_to_msk(msk, ssk)) 808 sk->sk_data_ready(sk); 809 810 mptcp_data_unlock(sk); 811 } 812 813 static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) 814 { 815 struct sock *sk = (struct sock *)msk; 816 817 if (sk->sk_state != TCP_ESTABLISHED) 818 return false; 819 820 /* attach to msk socket only after we are sure we will deal with it 821 * at close time 822 */ 823 if (sk->sk_socket && !ssk->sk_socket) 824 mptcp_sock_graft(ssk, sk->sk_socket); 825 826 mptcp_propagate_sndbuf((struct sock *)msk, ssk); 827 mptcp_sockopt_sync_locked(msk, ssk); 828 return true; 829 } 830 831 static void __mptcp_flush_join_list(struct sock *sk) 832 { 833 struct mptcp_subflow_context *tmp, *subflow; 834 struct mptcp_sock *msk = mptcp_sk(sk); 835 836 list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { 837 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 838 bool slow = lock_sock_fast(ssk); 839 840 list_move_tail(&subflow->node, &msk->conn_list); 841 if (!__mptcp_finish_join(msk, ssk)) 842 mptcp_subflow_reset(ssk); 843 unlock_sock_fast(ssk, slow); 844 } 845 } 846 847 static bool mptcp_timer_pending(struct sock *sk) 848 { 849 return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); 850 } 851 852 static void mptcp_reset_timer(struct sock *sk) 853 { 854 struct inet_connection_sock *icsk = inet_csk(sk); 855 unsigned long tout; 856 857 /* prevent rescheduling on close */ 858 if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) 859 return; 860 861 tout = mptcp_sk(sk)->timer_ival; 862 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); 863 } 864 865 bool mptcp_schedule_work(struct sock *sk) 866 { 867 if (inet_sk_state_load(sk) != TCP_CLOSE && 868 schedule_work(&mptcp_sk(sk)->work)) { 869 /* each subflow already holds a reference to the sk, and the 870 * workqueue is invoked by a subflow, so sk can't go away here. 
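	 * The sock_hold() below keeps the msk alive for as long as the
	 * queued work item exists; the matching sock_put() is expected to
	 * be done by the work callback once it has finished running.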
871 */ 872 sock_hold(sk); 873 return true; 874 } 875 return false; 876 } 877 878 void mptcp_subflow_eof(struct sock *sk) 879 { 880 if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) 881 mptcp_schedule_work(sk); 882 } 883 884 static void mptcp_check_for_eof(struct mptcp_sock *msk) 885 { 886 struct mptcp_subflow_context *subflow; 887 struct sock *sk = (struct sock *)msk; 888 int receivers = 0; 889 890 mptcp_for_each_subflow(msk, subflow) 891 receivers += !subflow->rx_eof; 892 if (receivers) 893 return; 894 895 if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { 896 /* hopefully temporary hack: propagate shutdown status 897 * to msk, when all subflows agree on it 898 */ 899 sk->sk_shutdown |= RCV_SHUTDOWN; 900 901 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 902 sk->sk_data_ready(sk); 903 } 904 905 switch (sk->sk_state) { 906 case TCP_ESTABLISHED: 907 inet_sk_state_store(sk, TCP_CLOSE_WAIT); 908 break; 909 case TCP_FIN_WAIT1: 910 inet_sk_state_store(sk, TCP_CLOSING); 911 break; 912 case TCP_FIN_WAIT2: 913 inet_sk_state_store(sk, TCP_CLOSE); 914 break; 915 default: 916 return; 917 } 918 mptcp_close_wake_up(sk); 919 } 920 921 static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) 922 { 923 struct mptcp_subflow_context *subflow; 924 struct sock *sk = (struct sock *)msk; 925 926 sock_owned_by_me(sk); 927 928 mptcp_for_each_subflow(msk, subflow) { 929 if (READ_ONCE(subflow->data_avail)) 930 return mptcp_subflow_tcp_sock(subflow); 931 } 932 933 return NULL; 934 } 935 936 static bool mptcp_skb_can_collapse_to(u64 write_seq, 937 const struct sk_buff *skb, 938 const struct mptcp_ext *mpext) 939 { 940 if (!tcp_skb_can_collapse_to(skb)) 941 return false; 942 943 /* can collapse only if MPTCP level sequence is in order and this 944 * mapping has not been xmitted yet 945 */ 946 return mpext && mpext->data_seq + mpext->data_len == write_seq && 947 !mpext->frozen; 948 } 949 950 /* we can append data to the given data frag if: 951 * - there is space available in the backing page_frag 952 * - the data frag tail matches the current page_frag free offset 953 * - the data frag end sequence number matches the current write seq 954 */ 955 static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, 956 const struct page_frag *pfrag, 957 const struct mptcp_data_frag *df) 958 { 959 return df && pfrag->page == df->page && 960 pfrag->size - pfrag->offset > 0 && 961 pfrag->offset == (df->offset + df->data_len) && 962 df->data_seq + df->data_len == msk->write_seq; 963 } 964 965 static void dfrag_uncharge(struct sock *sk, int len) 966 { 967 sk_mem_uncharge(sk, len); 968 sk_wmem_queued_add(sk, -len); 969 } 970 971 static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) 972 { 973 int len = dfrag->data_len + dfrag->overhead; 974 975 list_del(&dfrag->list); 976 dfrag_uncharge(sk, len); 977 put_page(dfrag->page); 978 } 979 980 static void __mptcp_clean_una(struct sock *sk) 981 { 982 struct mptcp_sock *msk = mptcp_sk(sk); 983 struct mptcp_data_frag *dtmp, *dfrag; 984 u64 snd_una; 985 986 /* on fallback we just need to ignore snd_una, as this is really 987 * plain TCP 988 */ 989 if (__mptcp_check_fallback(msk)) 990 msk->snd_una = READ_ONCE(msk->snd_nxt); 991 992 snd_una = msk->snd_una; 993 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { 994 if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) 995 break; 996 997 if (unlikely(dfrag == msk->first_pending)) { 998 /* in recovery mode can see ack after the current snd head */ 999 if 
(WARN_ON_ONCE(!msk->recovery)) 1000 break; 1001 1002 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1003 } 1004 1005 dfrag_clear(sk, dfrag); 1006 } 1007 1008 dfrag = mptcp_rtx_head(sk); 1009 if (dfrag && after64(snd_una, dfrag->data_seq)) { 1010 u64 delta = snd_una - dfrag->data_seq; 1011 1012 /* prevent wrap around in recovery mode */ 1013 if (unlikely(delta > dfrag->already_sent)) { 1014 if (WARN_ON_ONCE(!msk->recovery)) 1015 goto out; 1016 if (WARN_ON_ONCE(delta > dfrag->data_len)) 1017 goto out; 1018 dfrag->already_sent += delta - dfrag->already_sent; 1019 } 1020 1021 dfrag->data_seq += delta; 1022 dfrag->offset += delta; 1023 dfrag->data_len -= delta; 1024 dfrag->already_sent -= delta; 1025 1026 dfrag_uncharge(sk, delta); 1027 } 1028 1029 /* all retransmitted data acked, recovery completed */ 1030 if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) 1031 msk->recovery = false; 1032 1033 out: 1034 if (snd_una == READ_ONCE(msk->snd_nxt) && 1035 snd_una == READ_ONCE(msk->write_seq)) { 1036 if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) 1037 mptcp_stop_timer(sk); 1038 } else { 1039 mptcp_reset_timer(sk); 1040 } 1041 } 1042 1043 static void __mptcp_clean_una_wakeup(struct sock *sk) 1044 { 1045 lockdep_assert_held_once(&sk->sk_lock.slock); 1046 1047 __mptcp_clean_una(sk); 1048 mptcp_write_space(sk); 1049 } 1050 1051 static void mptcp_clean_una_wakeup(struct sock *sk) 1052 { 1053 mptcp_data_lock(sk); 1054 __mptcp_clean_una_wakeup(sk); 1055 mptcp_data_unlock(sk); 1056 } 1057 1058 static void mptcp_enter_memory_pressure(struct sock *sk) 1059 { 1060 struct mptcp_subflow_context *subflow; 1061 struct mptcp_sock *msk = mptcp_sk(sk); 1062 bool first = true; 1063 1064 sk_stream_moderate_sndbuf(sk); 1065 mptcp_for_each_subflow(msk, subflow) { 1066 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 1067 1068 if (first) 1069 tcp_enter_memory_pressure(ssk); 1070 sk_stream_moderate_sndbuf(ssk); 1071 first = false; 1072 } 1073 } 1074 1075 /* ensure we get enough memory for the frag hdr, beyond some minimal amount of 1076 * data 1077 */ 1078 static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 1079 { 1080 if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), 1081 pfrag, sk->sk_allocation))) 1082 return true; 1083 1084 mptcp_enter_memory_pressure(sk); 1085 return false; 1086 } 1087 1088 static struct mptcp_data_frag * 1089 mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, 1090 int orig_offset) 1091 { 1092 int offset = ALIGN(orig_offset, sizeof(long)); 1093 struct mptcp_data_frag *dfrag; 1094 1095 dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); 1096 dfrag->data_len = 0; 1097 dfrag->data_seq = msk->write_seq; 1098 dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); 1099 dfrag->offset = offset + sizeof(struct mptcp_data_frag); 1100 dfrag->already_sent = 0; 1101 dfrag->page = pfrag->page; 1102 1103 return dfrag; 1104 } 1105 1106 struct mptcp_sendmsg_info { 1107 int mss_now; 1108 int size_goal; 1109 u16 limit; 1110 u16 sent; 1111 unsigned int flags; 1112 bool data_lock_held; 1113 }; 1114 1115 static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, 1116 u64 data_seq, int avail_size) 1117 { 1118 u64 window_end = mptcp_wnd_end(msk); 1119 u64 mptcp_snd_wnd; 1120 1121 if (__mptcp_check_fallback(msk)) 1122 return avail_size; 1123 1124 mptcp_snd_wnd = window_end - data_seq; 1125 avail_size = min_t(unsigned int, mptcp_snd_wnd, 
avail_size); 1126 1127 if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { 1128 tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); 1129 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); 1130 } 1131 1132 return avail_size; 1133 } 1134 1135 static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) 1136 { 1137 struct skb_ext *mpext = __skb_ext_alloc(gfp); 1138 1139 if (!mpext) 1140 return false; 1141 __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); 1142 return true; 1143 } 1144 1145 static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) 1146 { 1147 struct sk_buff *skb; 1148 1149 skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); 1150 if (likely(skb)) { 1151 if (likely(__mptcp_add_ext(skb, gfp))) { 1152 skb_reserve(skb, MAX_TCP_HEADER); 1153 skb->ip_summed = CHECKSUM_PARTIAL; 1154 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); 1155 return skb; 1156 } 1157 __kfree_skb(skb); 1158 } else { 1159 mptcp_enter_memory_pressure(sk); 1160 } 1161 return NULL; 1162 } 1163 1164 static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) 1165 { 1166 struct sk_buff *skb; 1167 1168 skb = __mptcp_do_alloc_tx_skb(sk, gfp); 1169 if (!skb) 1170 return NULL; 1171 1172 if (likely(sk_wmem_schedule(ssk, skb->truesize))) { 1173 tcp_skb_entail(ssk, skb); 1174 return skb; 1175 } 1176 tcp_skb_tsorted_anchor_cleanup(skb); 1177 kfree_skb(skb); 1178 return NULL; 1179 } 1180 1181 static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) 1182 { 1183 gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; 1184 1185 return __mptcp_alloc_tx_skb(sk, ssk, gfp); 1186 } 1187 1188 /* note: this always recompute the csum on the whole skb, even 1189 * if we just appended a single frag. More status info needed 1190 */ 1191 static void mptcp_update_data_checksum(struct sk_buff *skb, int added) 1192 { 1193 struct mptcp_ext *mpext = mptcp_get_ext(skb); 1194 __wsum csum = ~csum_unfold(mpext->csum); 1195 int offset = skb->len - added; 1196 1197 mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); 1198 } 1199 1200 static void mptcp_update_infinite_map(struct mptcp_sock *msk, 1201 struct sock *ssk, 1202 struct mptcp_ext *mpext) 1203 { 1204 if (!mpext) 1205 return; 1206 1207 mpext->infinite_map = 1; 1208 mpext->data_len = 0; 1209 1210 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); 1211 mptcp_subflow_ctx(ssk)->send_infinite_map = 0; 1212 pr_fallback(msk); 1213 mptcp_do_fallback(ssk); 1214 } 1215 1216 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, 1217 struct mptcp_data_frag *dfrag, 1218 struct mptcp_sendmsg_info *info) 1219 { 1220 u64 data_seq = dfrag->data_seq + info->sent; 1221 int offset = dfrag->offset + info->sent; 1222 struct mptcp_sock *msk = mptcp_sk(sk); 1223 bool zero_window_probe = false; 1224 struct mptcp_ext *mpext = NULL; 1225 bool can_coalesce = false; 1226 bool reuse_skb = true; 1227 struct sk_buff *skb; 1228 size_t copy; 1229 int i; 1230 1231 pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", 1232 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); 1233 1234 if (WARN_ON_ONCE(info->sent > info->limit || 1235 info->limit > dfrag->data_len)) 1236 return 0; 1237 1238 /* compute send limit */ 1239 info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); 1240 copy = info->size_goal; 1241 1242 skb = tcp_write_queue_tail(ssk); 1243 if (skb && copy > skb->len) { 1244 /* Limit the write to the size available in the 1245 * current skb, if any, 
so that we create at most a new skb. 1246 * Explicitly tells TCP internals to avoid collapsing on later 1247 * queue management operation, to avoid breaking the ext <-> 1248 * SSN association set here 1249 */ 1250 mpext = skb_ext_find(skb, SKB_EXT_MPTCP); 1251 if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { 1252 TCP_SKB_CB(skb)->eor = 1; 1253 goto alloc_skb; 1254 } 1255 1256 i = skb_shinfo(skb)->nr_frags; 1257 can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); 1258 if (!can_coalesce && i >= sysctl_max_skb_frags) { 1259 tcp_mark_push(tcp_sk(ssk), skb); 1260 goto alloc_skb; 1261 } 1262 1263 copy -= skb->len; 1264 } else { 1265 alloc_skb: 1266 skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); 1267 if (!skb) 1268 return -ENOMEM; 1269 1270 i = skb_shinfo(skb)->nr_frags; 1271 reuse_skb = false; 1272 mpext = skb_ext_find(skb, SKB_EXT_MPTCP); 1273 } 1274 1275 /* Zero window and all data acked? Probe. */ 1276 copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); 1277 if (copy == 0) { 1278 u64 snd_una = READ_ONCE(msk->snd_una); 1279 1280 if (snd_una != msk->snd_nxt) { 1281 tcp_remove_empty_skb(ssk); 1282 return 0; 1283 } 1284 1285 zero_window_probe = true; 1286 data_seq = snd_una - 1; 1287 copy = 1; 1288 1289 /* all mptcp-level data is acked, no skbs should be present into the 1290 * ssk write queue 1291 */ 1292 WARN_ON_ONCE(reuse_skb); 1293 } 1294 1295 copy = min_t(size_t, copy, info->limit - info->sent); 1296 if (!sk_wmem_schedule(ssk, copy)) { 1297 tcp_remove_empty_skb(ssk); 1298 return -ENOMEM; 1299 } 1300 1301 if (can_coalesce) { 1302 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1303 } else { 1304 get_page(dfrag->page); 1305 skb_fill_page_desc(skb, i, dfrag->page, offset, copy); 1306 } 1307 1308 skb->len += copy; 1309 skb->data_len += copy; 1310 skb->truesize += copy; 1311 sk_wmem_queued_add(ssk, copy); 1312 sk_mem_charge(ssk, copy); 1313 WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); 1314 TCP_SKB_CB(skb)->end_seq += copy; 1315 tcp_skb_pcount_set(skb, 0); 1316 1317 /* on skb reuse we just need to update the DSS len */ 1318 if (reuse_skb) { 1319 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 1320 mpext->data_len += copy; 1321 WARN_ON_ONCE(zero_window_probe); 1322 goto out; 1323 } 1324 1325 memset(mpext, 0, sizeof(*mpext)); 1326 mpext->data_seq = data_seq; 1327 mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; 1328 mpext->data_len = copy; 1329 mpext->use_map = 1; 1330 mpext->dsn64 = 1; 1331 1332 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", 1333 mpext->data_seq, mpext->subflow_seq, mpext->data_len, 1334 mpext->dsn64); 1335 1336 if (zero_window_probe) { 1337 mptcp_subflow_ctx(ssk)->rel_write_seq += copy; 1338 mpext->frozen = 1; 1339 if (READ_ONCE(msk->csum_enabled)) 1340 mptcp_update_data_checksum(skb, copy); 1341 tcp_push_pending_frames(ssk); 1342 return 0; 1343 } 1344 out: 1345 if (READ_ONCE(msk->csum_enabled)) 1346 mptcp_update_data_checksum(skb, copy); 1347 if (mptcp_subflow_ctx(ssk)->send_infinite_map) 1348 mptcp_update_infinite_map(msk, ssk, mpext); 1349 trace_mptcp_sendmsg_frag(mpext); 1350 mptcp_subflow_ctx(ssk)->rel_write_seq += copy; 1351 return copy; 1352 } 1353 1354 #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ 1355 sizeof(struct tcphdr) - \ 1356 MAX_TCP_OPTION_SPACE - \ 1357 sizeof(struct ipv6hdr) - \ 1358 sizeof(struct frag_hdr)) 1359 1360 struct subflow_send_info { 1361 struct sock *ssk; 1362 u64 linger_time; 1363 }; 1364 1365 void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) 1366 
{
	if (!subflow->stale)
		return;

	subflow->stale = 0;
	MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER);
}

bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
	if (unlikely(subflow->stale)) {
		u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp);

		if (subflow->stale_rcv_tstamp == rcv_tstamp)
			return false;

		mptcp_subflow_set_active(subflow);
	}
	return __mptcp_subflow_active(subflow);
}

#define SSK_MODE_ACTIVE	0
#define SSK_MODE_BACKUP	1
#define SSK_MODE_MAX	2

/* implement the mptcp packet scheduler;
 * returns the subflow that will transmit the next DSS
 * additionally updates the rtx timeout
 */
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct subflow_send_info send_info[SSK_MODE_MAX];
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 pace, burst, wmem;
	int i, nr_active = 0;
	struct sock *ssk;
	u64 linger_time;
	long tout = 0;

	sock_owned_by_me(sk);

	if (__mptcp_check_fallback(msk)) {
		if (!msk->first)
			return NULL;
		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
	}

	/* re-use last subflow, if the burst allows that */
	if (msk->last_snd && msk->snd_burst > 0 &&
	    sk_stream_memory_free(msk->last_snd) &&
	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
		mptcp_set_timeout(sk);
		return msk->last_snd;
	}

	/* pick the subflow with the lower wmem/wspace ratio */
	for (i = 0; i < SSK_MODE_MAX; ++i) {
		send_info[i].ssk = NULL;
		send_info[i].linger_time = -1;
	}

	mptcp_for_each_subflow(msk, subflow) {
		trace_mptcp_subflow_get_send(subflow);
		ssk = mptcp_subflow_tcp_sock(subflow);
		if (!mptcp_subflow_active(subflow))
			continue;

		tout = max(tout, mptcp_timeout_from_subflow(subflow));
		nr_active += !subflow->backup;
		pace = subflow->avg_pacing_rate;
		if (unlikely(!pace)) {
			/* init pacing rate from socket */
			subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
			pace = subflow->avg_pacing_rate;
			if (!pace)
				continue;
		}

		linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
		if (linger_time < send_info[subflow->backup].linger_time) {
			send_info[subflow->backup].ssk = ssk;
			send_info[subflow->backup].linger_time = linger_time;
		}
	}
	__mptcp_set_timeout(sk, tout);

	/* pick the best backup if no other subflow is active */
	if (!nr_active)
		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;

	/* According to the blest algorithm, to avoid HoL blocking for the
	 * faster flow, we need to:
	 * - estimate the faster flow linger time
	 * - use the above to estimate the amount of bytes transferred
	 *   by the faster flow
	 * - check that the amount of queued data is greater than the above,
	 *   otherwise do not use the picked, slower, subflow
	 * We select the subflow with the shorter estimated time to flush
	 * the queued mem, which basically ensures the above. We just need
	 * to check that the subflow has a non-empty cwin.
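	 *
	 * Rough example: a subflow with 64KB queued at a 1MB/s pacing rate
	 * needs ~64ms to drain, while one with 16KB queued at 500KB/s needs
	 * only ~32ms; the latter is preferred even though its raw pacing
	 * rate is lower.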
1467 */ 1468 ssk = send_info[SSK_MODE_ACTIVE].ssk; 1469 if (!ssk || !sk_stream_memory_free(ssk)) 1470 return NULL; 1471 1472 burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); 1473 wmem = READ_ONCE(ssk->sk_wmem_queued); 1474 if (!burst) { 1475 msk->last_snd = NULL; 1476 return ssk; 1477 } 1478 1479 subflow = mptcp_subflow_ctx(ssk); 1480 subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + 1481 READ_ONCE(ssk->sk_pacing_rate) * burst, 1482 burst + wmem); 1483 msk->last_snd = ssk; 1484 msk->snd_burst = burst; 1485 return ssk; 1486 } 1487 1488 static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) 1489 { 1490 tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); 1491 release_sock(ssk); 1492 } 1493 1494 static void mptcp_update_post_push(struct mptcp_sock *msk, 1495 struct mptcp_data_frag *dfrag, 1496 u32 sent) 1497 { 1498 u64 snd_nxt_new = dfrag->data_seq; 1499 1500 dfrag->already_sent += sent; 1501 1502 msk->snd_burst -= sent; 1503 1504 snd_nxt_new += dfrag->already_sent; 1505 1506 /* snd_nxt_new can be smaller than snd_nxt in case mptcp 1507 * is recovering after a failover. In that event, this re-sends 1508 * old segments. 1509 * 1510 * Thus compute snd_nxt_new candidate based on 1511 * the dfrag->data_seq that was sent and the data 1512 * that has been handed to the subflow for transmission 1513 * and skip update in case it was old dfrag. 1514 */ 1515 if (likely(after64(snd_nxt_new, msk->snd_nxt))) 1516 msk->snd_nxt = snd_nxt_new; 1517 } 1518 1519 void mptcp_check_and_set_pending(struct sock *sk) 1520 { 1521 if (mptcp_send_head(sk)) 1522 mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); 1523 } 1524 1525 void __mptcp_push_pending(struct sock *sk, unsigned int flags) 1526 { 1527 struct sock *prev_ssk = NULL, *ssk = NULL; 1528 struct mptcp_sock *msk = mptcp_sk(sk); 1529 struct mptcp_sendmsg_info info = { 1530 .flags = flags, 1531 }; 1532 struct mptcp_data_frag *dfrag; 1533 int len, copied = 0; 1534 1535 while ((dfrag = mptcp_send_head(sk))) { 1536 info.sent = dfrag->already_sent; 1537 info.limit = dfrag->data_len; 1538 len = dfrag->data_len - dfrag->already_sent; 1539 while (len > 0) { 1540 int ret = 0; 1541 1542 prev_ssk = ssk; 1543 ssk = mptcp_subflow_get_send(msk); 1544 1545 /* First check. 
If the ssk has changed since
			 * the last round, release prev_ssk
			 */
			if (ssk != prev_ssk && prev_ssk)
				mptcp_push_release(prev_ssk, &info);
			if (!ssk)
				goto out;

			/* Need to lock the new subflow only if different
			 * from the previous one, otherwise we are still
			 * holding the relevant lock
			 */
			if (ssk != prev_ssk)
				lock_sock(ssk);

			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
			if (ret <= 0) {
				mptcp_push_release(ssk, &info);
				goto out;
			}

			info.sent += ret;
			copied += ret;
			len -= ret;

			mptcp_update_post_push(msk, dfrag, ret);
		}
		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
	}

	/* at this point we still hold the socket lock for the last subflow we used */
	if (ssk)
		mptcp_push_release(ssk, &info);

out:
	/* ensure the rtx timer is running */
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);
	if (copied)
		__mptcp_check_send_data_fin(sk);
}

static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_sendmsg_info info = {
		.data_lock_held = true,
	};
	struct mptcp_data_frag *dfrag;
	struct sock *xmit_ssk;
	int len, copied = 0;
	bool first = true;

	info.flags = 0;
	while ((dfrag = mptcp_send_head(sk))) {
		info.sent = dfrag->already_sent;
		info.limit = dfrag->data_len;
		len = dfrag->data_len - dfrag->already_sent;
		while (len > 0) {
			int ret = 0;

			/* the caller already invoked the packet scheduler,
			 * check for a different subflow usage only after
			 * spooling the first chunk of data
			 */
			xmit_ssk = first ?
ssk : mptcp_subflow_get_send(mptcp_sk(sk)); 1611 if (!xmit_ssk) 1612 goto out; 1613 if (xmit_ssk != ssk) { 1614 mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), 1615 MPTCP_DELEGATE_SEND); 1616 goto out; 1617 } 1618 1619 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 1620 if (ret <= 0) 1621 goto out; 1622 1623 info.sent += ret; 1624 copied += ret; 1625 len -= ret; 1626 first = false; 1627 1628 mptcp_update_post_push(msk, dfrag, ret); 1629 } 1630 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1631 } 1632 1633 out: 1634 /* __mptcp_alloc_tx_skb could have released some wmem and we are 1635 * not going to flush it via release_sock() 1636 */ 1637 if (copied) { 1638 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 1639 info.size_goal); 1640 if (!mptcp_timer_pending(sk)) 1641 mptcp_reset_timer(sk); 1642 1643 if (msk->snd_data_fin_enable && 1644 msk->snd_nxt + 1 == msk->write_seq) 1645 mptcp_schedule_work(sk); 1646 } 1647 } 1648 1649 static void mptcp_set_nospace(struct sock *sk) 1650 { 1651 /* enable autotune */ 1652 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1653 1654 /* will be cleared on avail space */ 1655 set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); 1656 } 1657 1658 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) 1659 { 1660 struct mptcp_sock *msk = mptcp_sk(sk); 1661 struct page_frag *pfrag; 1662 size_t copied = 0; 1663 int ret = 0; 1664 long timeo; 1665 1666 /* we don't support FASTOPEN yet */ 1667 if (msg->msg_flags & MSG_FASTOPEN) 1668 return -EOPNOTSUPP; 1669 1670 /* silently ignore everything else */ 1671 msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; 1672 1673 lock_sock(sk); 1674 1675 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1676 1677 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { 1678 ret = sk_stream_wait_connect(sk, &timeo); 1679 if (ret) 1680 goto out; 1681 } 1682 1683 pfrag = sk_page_frag(sk); 1684 1685 while (msg_data_left(msg)) { 1686 int total_ts, frag_truesize = 0; 1687 struct mptcp_data_frag *dfrag; 1688 bool dfrag_collapsed; 1689 size_t psize, offset; 1690 1691 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { 1692 ret = -EPIPE; 1693 goto out; 1694 } 1695 1696 /* reuse tail pfrag, if possible, or carve a new one from the 1697 * page allocator 1698 */ 1699 dfrag = mptcp_pending_tail(sk); 1700 dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); 1701 if (!dfrag_collapsed) { 1702 if (!sk_stream_memory_free(sk)) 1703 goto wait_for_memory; 1704 1705 if (!mptcp_page_frag_refill(sk, pfrag)) 1706 goto wait_for_memory; 1707 1708 dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); 1709 frag_truesize = dfrag->overhead; 1710 } 1711 1712 /* we do not bound vs wspace, to allow a single packet. 
1713 * memory accounting will prevent execessive memory usage 1714 * anyway 1715 */ 1716 offset = dfrag->offset + dfrag->data_len; 1717 psize = pfrag->size - offset; 1718 psize = min_t(size_t, psize, msg_data_left(msg)); 1719 total_ts = psize + frag_truesize; 1720 1721 if (!sk_wmem_schedule(sk, total_ts)) 1722 goto wait_for_memory; 1723 1724 if (copy_page_from_iter(dfrag->page, offset, psize, 1725 &msg->msg_iter) != psize) { 1726 ret = -EFAULT; 1727 goto out; 1728 } 1729 1730 /* data successfully copied into the write queue */ 1731 sk->sk_forward_alloc -= total_ts; 1732 copied += psize; 1733 dfrag->data_len += psize; 1734 frag_truesize += psize; 1735 pfrag->offset += frag_truesize; 1736 WRITE_ONCE(msk->write_seq, msk->write_seq + psize); 1737 1738 /* charge data on mptcp pending queue to the msk socket 1739 * Note: we charge such data both to sk and ssk 1740 */ 1741 sk_wmem_queued_add(sk, frag_truesize); 1742 if (!dfrag_collapsed) { 1743 get_page(dfrag->page); 1744 list_add_tail(&dfrag->list, &msk->rtx_queue); 1745 if (!msk->first_pending) 1746 WRITE_ONCE(msk->first_pending, dfrag); 1747 } 1748 pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk, 1749 dfrag->data_seq, dfrag->data_len, dfrag->already_sent, 1750 !dfrag_collapsed); 1751 1752 continue; 1753 1754 wait_for_memory: 1755 mptcp_set_nospace(sk); 1756 __mptcp_push_pending(sk, msg->msg_flags); 1757 ret = sk_stream_wait_memory(sk, &timeo); 1758 if (ret) 1759 goto out; 1760 } 1761 1762 if (copied) 1763 __mptcp_push_pending(sk, msg->msg_flags); 1764 1765 out: 1766 release_sock(sk); 1767 return copied ? : ret; 1768 } 1769 1770 static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, 1771 struct msghdr *msg, 1772 size_t len, int flags, 1773 struct scm_timestamping_internal *tss, 1774 int *cmsg_flags) 1775 { 1776 struct sk_buff *skb, *tmp; 1777 int copied = 0; 1778 1779 skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { 1780 u32 offset = MPTCP_SKB_CB(skb)->offset; 1781 u32 data_len = skb->len - offset; 1782 u32 count = min_t(size_t, len - copied, data_len); 1783 int err; 1784 1785 if (!(flags & MSG_TRUNC)) { 1786 err = skb_copy_datagram_msg(skb, offset, msg, count); 1787 if (unlikely(err < 0)) { 1788 if (!copied) 1789 return err; 1790 break; 1791 } 1792 } 1793 1794 if (MPTCP_SKB_CB(skb)->has_rxtstamp) { 1795 tcp_update_recv_tstamps(skb, tss); 1796 *cmsg_flags |= MPTCP_CMSG_TS; 1797 } 1798 1799 copied += count; 1800 1801 if (count < data_len) { 1802 if (!(flags & MSG_PEEK)) { 1803 MPTCP_SKB_CB(skb)->offset += count; 1804 MPTCP_SKB_CB(skb)->map_seq += count; 1805 } 1806 break; 1807 } 1808 1809 if (!(flags & MSG_PEEK)) { 1810 /* we will bulk release the skb memory later */ 1811 skb->destructor = NULL; 1812 WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); 1813 __skb_unlink(skb, &msk->receive_queue); 1814 __kfree_skb(skb); 1815 } 1816 1817 if (copied >= len) 1818 break; 1819 } 1820 1821 return copied; 1822 } 1823 1824 /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. 1825 * 1826 * Only difference: Use highest rtt estimate of the subflows in use. 
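 *
 * The idea is that sampling over the largest subflow RTT keeps the
 * measurement window long enough for the slowest path: e.g. with one
 * subflow at 10ms and another at 50ms, a 10ms window would undersample
 * the data arriving over the slower path and the receive buffer could
 * end up sized too small for it.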
1827 */ 1828 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) 1829 { 1830 struct mptcp_subflow_context *subflow; 1831 struct sock *sk = (struct sock *)msk; 1832 u32 time, advmss = 1; 1833 u64 rtt_us, mstamp; 1834 1835 sock_owned_by_me(sk); 1836 1837 if (copied <= 0) 1838 return; 1839 1840 msk->rcvq_space.copied += copied; 1841 1842 mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); 1843 time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); 1844 1845 rtt_us = msk->rcvq_space.rtt_us; 1846 if (rtt_us && time < (rtt_us >> 3)) 1847 return; 1848 1849 rtt_us = 0; 1850 mptcp_for_each_subflow(msk, subflow) { 1851 const struct tcp_sock *tp; 1852 u64 sf_rtt_us; 1853 u32 sf_advmss; 1854 1855 tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); 1856 1857 sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); 1858 sf_advmss = READ_ONCE(tp->advmss); 1859 1860 rtt_us = max(sf_rtt_us, rtt_us); 1861 advmss = max(sf_advmss, advmss); 1862 } 1863 1864 msk->rcvq_space.rtt_us = rtt_us; 1865 if (time < (rtt_us >> 3) || rtt_us == 0) 1866 return; 1867 1868 if (msk->rcvq_space.copied <= msk->rcvq_space.space) 1869 goto new_measure; 1870 1871 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && 1872 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 1873 int rcvmem, rcvbuf; 1874 u64 rcvwin, grow; 1875 1876 rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; 1877 1878 grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); 1879 1880 do_div(grow, msk->rcvq_space.space); 1881 rcvwin += (grow << 1); 1882 1883 rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); 1884 while (tcp_win_from_space(sk, rcvmem) < advmss) 1885 rcvmem += 128; 1886 1887 do_div(rcvwin, advmss); 1888 rcvbuf = min_t(u64, rcvwin * rcvmem, 1889 sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); 1890 1891 if (rcvbuf > sk->sk_rcvbuf) { 1892 u32 window_clamp; 1893 1894 window_clamp = tcp_win_from_space(sk, rcvbuf); 1895 WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 1896 1897 /* Make subflows follow along. If we do not do this, we 1898 * get drops at subflow level if skbs can't be moved to 1899 * the mptcp rx queue fast enough (announced rcv_win can 1900 * exceed ssk->sk_rcvbuf). 
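			 * E.g. if the msk rcvbuf grows to a few MB while a subflow
			 * keeps the (typically ~128KB) TCP default, the window we
			 * announce at the MPTCP level allows bursts that the subflow
			 * socket could not hold, hence the propagation below.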
1901 */ 1902 mptcp_for_each_subflow(msk, subflow) { 1903 struct sock *ssk; 1904 bool slow; 1905 1906 ssk = mptcp_subflow_tcp_sock(subflow); 1907 slow = lock_sock_fast(ssk); 1908 WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); 1909 tcp_sk(ssk)->window_clamp = window_clamp; 1910 tcp_cleanup_rbuf(ssk, 1); 1911 unlock_sock_fast(ssk, slow); 1912 } 1913 } 1914 } 1915 1916 msk->rcvq_space.space = msk->rcvq_space.copied; 1917 new_measure: 1918 msk->rcvq_space.copied = 0; 1919 msk->rcvq_space.time = mstamp; 1920 } 1921 1922 static void __mptcp_update_rmem(struct sock *sk) 1923 { 1924 struct mptcp_sock *msk = mptcp_sk(sk); 1925 1926 if (!msk->rmem_released) 1927 return; 1928 1929 atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); 1930 mptcp_rmem_uncharge(sk, msk->rmem_released); 1931 WRITE_ONCE(msk->rmem_released, 0); 1932 } 1933 1934 static void __mptcp_splice_receive_queue(struct sock *sk) 1935 { 1936 struct mptcp_sock *msk = mptcp_sk(sk); 1937 1938 skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); 1939 } 1940 1941 static bool __mptcp_move_skbs(struct mptcp_sock *msk) 1942 { 1943 struct sock *sk = (struct sock *)msk; 1944 unsigned int moved = 0; 1945 bool ret, done; 1946 1947 do { 1948 struct sock *ssk = mptcp_subflow_recv_lookup(msk); 1949 bool slowpath; 1950 1951 /* we can have data pending in the subflows only if the msk 1952 * receive buffer was full at subflow_data_ready() time, 1953 * that is an unlikely slow path. 1954 */ 1955 if (likely(!ssk)) 1956 break; 1957 1958 slowpath = lock_sock_fast(ssk); 1959 mptcp_data_lock(sk); 1960 __mptcp_update_rmem(sk); 1961 done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 1962 mptcp_data_unlock(sk); 1963 1964 if (unlikely(ssk->sk_err)) 1965 __mptcp_error_report(sk); 1966 unlock_sock_fast(ssk, slowpath); 1967 } while (!done); 1968 1969 /* acquire the data lock only if some input data is pending */ 1970 ret = moved > 0; 1971 if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || 1972 !skb_queue_empty_lockless(&sk->sk_receive_queue)) { 1973 mptcp_data_lock(sk); 1974 __mptcp_update_rmem(sk); 1975 ret |= __mptcp_ofo_queue(msk); 1976 __mptcp_splice_receive_queue(sk); 1977 mptcp_data_unlock(sk); 1978 } 1979 if (ret) 1980 mptcp_check_data_fin((struct sock *)msk); 1981 return !skb_queue_empty(&msk->receive_queue); 1982 } 1983 1984 static unsigned int mptcp_inq_hint(const struct sock *sk) 1985 { 1986 const struct mptcp_sock *msk = mptcp_sk(sk); 1987 const struct sk_buff *skb; 1988 1989 skb = skb_peek(&msk->receive_queue); 1990 if (skb) { 1991 u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; 1992 1993 if (hint_val >= INT_MAX) 1994 return INT_MAX; 1995 1996 return (unsigned int)hint_val; 1997 } 1998 1999 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) 2000 return 1; 2001 2002 return 0; 2003 } 2004 2005 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 2006 int flags, int *addr_len) 2007 { 2008 struct mptcp_sock *msk = mptcp_sk(sk); 2009 struct scm_timestamping_internal tss; 2010 int copied = 0, cmsg_flags = 0; 2011 int target; 2012 long timeo; 2013 2014 /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ 2015 if (unlikely(flags & MSG_ERRQUEUE)) 2016 return inet_recv_error(sk, msg, len, addr_len); 2017 2018 lock_sock(sk); 2019 if (unlikely(sk->sk_state == TCP_LISTEN)) { 2020 copied = -ENOTCONN; 2021 goto out_err; 2022 } 2023 2024 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2025 2026 len = min_t(size_t, len, INT_MAX); 2027 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 2028 2029 if 
(unlikely(msk->recvmsg_inq)) 2030 cmsg_flags = MPTCP_CMSG_INQ; 2031 2032 while (copied < len) { 2033 int bytes_read; 2034 2035 bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); 2036 if (unlikely(bytes_read < 0)) { 2037 if (!copied) 2038 copied = bytes_read; 2039 goto out_err; 2040 } 2041 2042 copied += bytes_read; 2043 2044 /* be sure to advertise window change */ 2045 mptcp_cleanup_rbuf(msk); 2046 2047 if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) 2048 continue; 2049 2050 /* only the master socket status is relevant here. The exit 2051 * conditions mirror closely tcp_recvmsg() 2052 */ 2053 if (copied >= target) 2054 break; 2055 2056 if (copied) { 2057 if (sk->sk_err || 2058 sk->sk_state == TCP_CLOSE || 2059 (sk->sk_shutdown & RCV_SHUTDOWN) || 2060 !timeo || 2061 signal_pending(current)) 2062 break; 2063 } else { 2064 if (sk->sk_err) { 2065 copied = sock_error(sk); 2066 break; 2067 } 2068 2069 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) 2070 mptcp_check_for_eof(msk); 2071 2072 if (sk->sk_shutdown & RCV_SHUTDOWN) { 2073 /* race breaker: the shutdown could be after the 2074 * previous receive queue check 2075 */ 2076 if (__mptcp_move_skbs(msk)) 2077 continue; 2078 break; 2079 } 2080 2081 if (sk->sk_state == TCP_CLOSE) { 2082 copied = -ENOTCONN; 2083 break; 2084 } 2085 2086 if (!timeo) { 2087 copied = -EAGAIN; 2088 break; 2089 } 2090 2091 if (signal_pending(current)) { 2092 copied = sock_intr_errno(timeo); 2093 break; 2094 } 2095 } 2096 2097 pr_debug("block timeout %ld", timeo); 2098 sk_wait_data(sk, &timeo, NULL); 2099 } 2100 2101 out_err: 2102 if (cmsg_flags && copied >= 0) { 2103 if (cmsg_flags & MPTCP_CMSG_TS) 2104 tcp_recv_timestamp(msg, sk, &tss); 2105 2106 if (cmsg_flags & MPTCP_CMSG_INQ) { 2107 unsigned int inq = mptcp_inq_hint(sk); 2108 2109 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); 2110 } 2111 } 2112 2113 pr_debug("msk=%p rx queue empty=%d:%d copied=%d", 2114 msk, skb_queue_empty_lockless(&sk->sk_receive_queue), 2115 skb_queue_empty(&msk->receive_queue), copied); 2116 if (!(flags & MSG_PEEK)) 2117 mptcp_rcv_space_adjust(msk, copied); 2118 2119 release_sock(sk); 2120 return copied; 2121 } 2122 2123 static void mptcp_retransmit_timer(struct timer_list *t) 2124 { 2125 struct inet_connection_sock *icsk = from_timer(icsk, t, 2126 icsk_retransmit_timer); 2127 struct sock *sk = &icsk->icsk_inet.sk; 2128 struct mptcp_sock *msk = mptcp_sk(sk); 2129 2130 bh_lock_sock(sk); 2131 if (!sock_owned_by_user(sk)) { 2132 /* we need a process context to retransmit */ 2133 if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) 2134 mptcp_schedule_work(sk); 2135 } else { 2136 /* delegate our work to tcp_release_cb() */ 2137 __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); 2138 } 2139 bh_unlock_sock(sk); 2140 sock_put(sk); 2141 } 2142 2143 static void mptcp_timeout_timer(struct timer_list *t) 2144 { 2145 struct sock *sk = from_timer(sk, t, sk_timer); 2146 2147 mptcp_schedule_work(sk); 2148 sock_put(sk); 2149 } 2150 2151 /* Find an idle subflow. Return NULL if there is unacked data at tcp 2152 * level. 2153 * 2154 * A backup subflow is returned only if that is the only kind available. 
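 * The backup is actually used only when every subflow with data still outstanding at the TCP level looks stale, see the min_stale_count handling below.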
2155 */ 2156 static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) 2157 { 2158 struct sock *backup = NULL, *pick = NULL; 2159 struct mptcp_subflow_context *subflow; 2160 int min_stale_count = INT_MAX; 2161 2162 sock_owned_by_me((const struct sock *)msk); 2163 2164 if (__mptcp_check_fallback(msk)) 2165 return NULL; 2166 2167 mptcp_for_each_subflow(msk, subflow) { 2168 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2169 2170 if (!__mptcp_subflow_active(subflow)) 2171 continue; 2172 2173 /* still data outstanding at TCP level? skip this */ 2174 if (!tcp_rtx_and_write_queues_empty(ssk)) { 2175 mptcp_pm_subflow_chk_stale(msk, ssk); 2176 min_stale_count = min_t(int, min_stale_count, subflow->stale_count); 2177 continue; 2178 } 2179 2180 if (subflow->backup) { 2181 if (!backup) 2182 backup = ssk; 2183 continue; 2184 } 2185 2186 if (!pick) 2187 pick = ssk; 2188 } 2189 2190 if (pick) 2191 return pick; 2192 2193 /* use backup only if there is no progress anywhere */ 2194 return min_stale_count > 1 ? backup : NULL; 2195 } 2196 2197 static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) 2198 { 2199 if (msk->subflow) { 2200 iput(SOCK_INODE(msk->subflow)); 2201 msk->subflow = NULL; 2202 } 2203 } 2204 2205 bool __mptcp_retransmit_pending_data(struct sock *sk) 2206 { 2207 struct mptcp_data_frag *cur, *rtx_head; 2208 struct mptcp_sock *msk = mptcp_sk(sk); 2209 2210 if (__mptcp_check_fallback(mptcp_sk(sk))) 2211 return false; 2212 2213 if (tcp_rtx_and_write_queues_empty(sk)) 2214 return false; 2215 2216 /* the closing socket has some data untransmitted and/or unacked: 2217 * some data in the mptcp rtx queue has not really been transmitted yet. 2218 * keep it simple and re-inject the whole mptcp level rtx queue 2219 */ 2220 mptcp_data_lock(sk); 2221 __mptcp_clean_una_wakeup(sk); 2222 rtx_head = mptcp_rtx_head(sk); 2223 if (!rtx_head) { 2224 mptcp_data_unlock(sk); 2225 return false; 2226 } 2227 2228 msk->recovery_snd_nxt = msk->snd_nxt; 2229 msk->recovery = true; 2230 mptcp_data_unlock(sk); 2231 2232 msk->first_pending = rtx_head; 2233 msk->snd_burst = 0; 2234 2235 /* be sure to clear the "sent status" on all re-injected fragments */ 2236 list_for_each_entry(cur, &msk->rtx_queue, list) { 2237 if (!cur->already_sent) 2238 break; 2239 cur->already_sent = 0; 2240 } 2241 2242 return true; 2243 } 2244 2245 /* flags for __mptcp_close_ssk() */ 2246 #define MPTCP_CF_PUSH BIT(1) 2247 #define MPTCP_CF_FASTCLOSE BIT(2) 2248 2249 /* subflow sockets can be either outgoing (connect) or incoming 2250 * (accept). 2251 * 2252 * Outgoing subflows use in-kernel sockets. 2253 * Incoming subflows do not have their own 'struct socket' allocated, 2254 * so we need to use tcp_close() after detaching them from the mptcp 2255 * parent socket.
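 * The initial subflow is special: while it is still linked as msk->subflow it is only disconnected and its context reset, so that the in-kernel socket can be reused, see the dispose_it handling below.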
2256 */ 2257 static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, 2258 struct mptcp_subflow_context *subflow, 2259 unsigned int flags) 2260 { 2261 struct mptcp_sock *msk = mptcp_sk(sk); 2262 bool need_push, dispose_it; 2263 2264 dispose_it = !msk->subflow || ssk != msk->subflow->sk; 2265 if (dispose_it) 2266 list_del(&subflow->node); 2267 2268 lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 2269 2270 if (flags & MPTCP_CF_FASTCLOSE) 2271 subflow->send_fastclose = 1; 2272 2273 need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); 2274 if (!dispose_it) { 2275 tcp_disconnect(ssk, 0); 2276 msk->subflow->state = SS_UNCONNECTED; 2277 mptcp_subflow_ctx_reset(subflow); 2278 release_sock(ssk); 2279 2280 goto out; 2281 } 2282 2283 /* if we are invoked by the msk cleanup code, the subflow is 2284 * already orphaned 2285 */ 2286 if (ssk->sk_socket) 2287 sock_orphan(ssk); 2288 2289 subflow->disposable = 1; 2290 2291 /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops 2292 * the ssk has been already destroyed, we just need to release the 2293 * reference owned by msk; 2294 */ 2295 if (!inet_csk(ssk)->icsk_ulp_ops) { 2296 kfree_rcu(subflow, rcu); 2297 } else { 2298 /* otherwise tcp will dispose of the ssk and subflow ctx */ 2299 if (ssk->sk_state == TCP_LISTEN) { 2300 tcp_set_state(ssk, TCP_CLOSE); 2301 mptcp_subflow_queue_clean(ssk); 2302 inet_csk_listen_stop(ssk); 2303 } 2304 __tcp_close(ssk, 0); 2305 2306 /* close acquired an extra ref */ 2307 __sock_put(ssk); 2308 } 2309 release_sock(ssk); 2310 2311 sock_put(ssk); 2312 2313 if (ssk == msk->first) 2314 msk->first = NULL; 2315 2316 out: 2317 if (ssk == msk->last_snd) 2318 msk->last_snd = NULL; 2319 2320 if (need_push) 2321 __mptcp_push_pending(sk, 0); 2322 } 2323 2324 void mptcp_close_ssk(struct sock *sk, struct sock *ssk, 2325 struct mptcp_subflow_context *subflow) 2326 { 2327 if (sk->sk_state == TCP_ESTABLISHED) 2328 mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); 2329 2330 /* subflow aborted before reaching the fully_established status 2331 * attempt the creation of the next subflow 2332 */ 2333 mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow); 2334 2335 __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); 2336 } 2337 2338 static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) 2339 { 2340 return 0; 2341 } 2342 2343 static void __mptcp_close_subflow(struct mptcp_sock *msk) 2344 { 2345 struct mptcp_subflow_context *subflow, *tmp; 2346 2347 might_sleep(); 2348 2349 list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { 2350 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2351 2352 if (inet_sk_state_load(ssk) != TCP_CLOSE) 2353 continue; 2354 2355 /* 'subflow_data_ready' will re-sched once rx queue is empty */ 2356 if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) 2357 continue; 2358 2359 mptcp_close_ssk((struct sock *)msk, ssk, subflow); 2360 } 2361 } 2362 2363 static bool mptcp_check_close_timeout(const struct sock *sk) 2364 { 2365 s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; 2366 struct mptcp_subflow_context *subflow; 2367 2368 if (delta >= TCP_TIMEWAIT_LEN) 2369 return true; 2370 2371 /* if all subflows are in closed status don't bother with additional 2372 * timeout 2373 */ 2374 mptcp_for_each_subflow(mptcp_sk(sk), subflow) { 2375 if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) != 2376 TCP_CLOSE) 2377 return false; 2378 } 2379 return true; 2380 } 2381 2382 static void mptcp_check_fastclose(struct mptcp_sock *msk) 2383 { 
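	/* the peer sent an MP_FASTCLOSE option (msk->rcv_fastclose): reset every
	 * subflow and move the msk straight to TCP_CLOSE
	 */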
2384 struct mptcp_subflow_context *subflow, *tmp; 2385 struct sock *sk = &msk->sk.icsk_inet.sk; 2386 2387 if (likely(!READ_ONCE(msk->rcv_fastclose))) 2388 return; 2389 2390 mptcp_token_destroy(msk); 2391 2392 list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { 2393 struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); 2394 bool slow; 2395 2396 slow = lock_sock_fast(tcp_sk); 2397 if (tcp_sk->sk_state != TCP_CLOSE) { 2398 tcp_send_active_reset(tcp_sk, GFP_ATOMIC); 2399 tcp_set_state(tcp_sk, TCP_CLOSE); 2400 } 2401 unlock_sock_fast(tcp_sk, slow); 2402 } 2403 2404 inet_sk_state_store(sk, TCP_CLOSE); 2405 sk->sk_shutdown = SHUTDOWN_MASK; 2406 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 2407 set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); 2408 2409 mptcp_close_wake_up(sk); 2410 } 2411 2412 static void __mptcp_retrans(struct sock *sk) 2413 { 2414 struct mptcp_sock *msk = mptcp_sk(sk); 2415 struct mptcp_sendmsg_info info = {}; 2416 struct mptcp_data_frag *dfrag; 2417 size_t copied = 0; 2418 struct sock *ssk; 2419 int ret; 2420 2421 mptcp_clean_una_wakeup(sk); 2422 2423 /* first check ssk: need to kick "stale" logic */ 2424 ssk = mptcp_subflow_get_retrans(msk); 2425 dfrag = mptcp_rtx_head(sk); 2426 if (!dfrag) { 2427 if (mptcp_data_fin_enabled(msk)) { 2428 struct inet_connection_sock *icsk = inet_csk(sk); 2429 2430 icsk->icsk_retransmits++; 2431 mptcp_set_datafin_timeout(sk); 2432 mptcp_send_ack(msk); 2433 2434 goto reset_timer; 2435 } 2436 2437 if (!mptcp_send_head(sk)) 2438 return; 2439 2440 goto reset_timer; 2441 } 2442 2443 if (!ssk) 2444 goto reset_timer; 2445 2446 lock_sock(ssk); 2447 2448 /* limit retransmission to the bytes already sent on some subflows */ 2449 info.sent = 0; 2450 info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; 2451 while (info.sent < info.limit) { 2452 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 2453 if (ret <= 0) 2454 break; 2455 2456 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); 2457 copied += ret; 2458 info.sent += ret; 2459 } 2460 if (copied) { 2461 dfrag->already_sent = max(dfrag->already_sent, info.sent); 2462 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 2463 info.size_goal); 2464 WRITE_ONCE(msk->allow_infinite_fallback, false); 2465 } 2466 2467 release_sock(ssk); 2468 2469 reset_timer: 2470 mptcp_check_and_set_pending(sk); 2471 2472 if (!mptcp_timer_pending(sk)) 2473 mptcp_reset_timer(sk); 2474 } 2475 2476 /* schedule the timeout timer for the relevant event: either close timeout 2477 * or mp_fail timeout. The close timeout takes precedence over the mp_fail one 2478 */ 2479 void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout) 2480 { 2481 struct sock *sk = (struct sock *)msk; 2482 unsigned long timeout, close_timeout; 2483 2484 if (!fail_tout && !sock_flag(sk, SOCK_DEAD)) 2485 return; 2486 2487 close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN; 2488 2489 /* the close timeout takes precedence over the fail one, and here at least one of 2490 * them is active 2491 */ 2492 timeout = sock_flag(sk, SOCK_DEAD) ?
close_timeout : fail_tout; 2493 2494 sk_reset_timer(sk, &sk->sk_timer, timeout); 2495 } 2496 2497 static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) 2498 { 2499 struct sock *ssk = msk->first; 2500 bool slow; 2501 2502 if (!ssk) 2503 return; 2504 2505 pr_debug("MP_FAIL doesn't respond, reset the subflow"); 2506 2507 slow = lock_sock_fast(ssk); 2508 mptcp_subflow_reset(ssk); 2509 WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); 2510 unlock_sock_fast(ssk, slow); 2511 2512 mptcp_reset_timeout(msk, 0); 2513 } 2514 2515 static void mptcp_worker(struct work_struct *work) 2516 { 2517 struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); 2518 struct sock *sk = &msk->sk.icsk_inet.sk; 2519 unsigned long fail_tout; 2520 int state; 2521 2522 lock_sock(sk); 2523 state = sk->sk_state; 2524 if (unlikely(state == TCP_CLOSE)) 2525 goto unlock; 2526 2527 mptcp_check_data_fin_ack(sk); 2528 2529 mptcp_check_fastclose(msk); 2530 2531 mptcp_pm_nl_work(msk); 2532 2533 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) 2534 mptcp_check_for_eof(msk); 2535 2536 __mptcp_check_send_data_fin(sk); 2537 mptcp_check_data_fin(sk); 2538 2539 /* There is no point in keeping around an orphaned sk that has timed out or 2540 * is closed, but we need the msk around to reply to incoming DATA_FIN, 2541 * even if it is orphaned and in FIN_WAIT2 state 2542 */ 2543 if (sock_flag(sk, SOCK_DEAD) && 2544 (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { 2545 inet_sk_state_store(sk, TCP_CLOSE); 2546 __mptcp_destroy_sock(sk); 2547 goto unlock; 2548 } 2549 2550 if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 2551 __mptcp_close_subflow(msk); 2552 2553 if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) 2554 __mptcp_retrans(sk); 2555 2556 fail_tout = msk->first ?
READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; 2557 if (fail_tout && time_after(jiffies, fail_tout)) 2558 mptcp_mp_fail_no_response(msk); 2559 2560 unlock: 2561 release_sock(sk); 2562 sock_put(sk); 2563 } 2564 2565 static int __mptcp_init_sock(struct sock *sk) 2566 { 2567 struct mptcp_sock *msk = mptcp_sk(sk); 2568 2569 INIT_LIST_HEAD(&msk->conn_list); 2570 INIT_LIST_HEAD(&msk->join_list); 2571 INIT_LIST_HEAD(&msk->rtx_queue); 2572 INIT_WORK(&msk->work, mptcp_worker); 2573 __skb_queue_head_init(&msk->receive_queue); 2574 msk->out_of_order_queue = RB_ROOT; 2575 msk->first_pending = NULL; 2576 msk->rmem_fwd_alloc = 0; 2577 WRITE_ONCE(msk->rmem_released, 0); 2578 msk->timer_ival = TCP_RTO_MIN; 2579 2580 msk->first = NULL; 2581 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; 2582 WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); 2583 WRITE_ONCE(msk->allow_infinite_fallback, true); 2584 msk->recovery = false; 2585 2586 mptcp_pm_data_init(msk); 2587 2588 /* re-use the csk retrans timer for MPTCP-level retrans */ 2589 timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); 2590 timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); 2591 2592 return 0; 2593 } 2594 2595 static void mptcp_ca_reset(struct sock *sk) 2596 { 2597 struct inet_connection_sock *icsk = inet_csk(sk); 2598 2599 tcp_assign_congestion_control(sk); 2600 strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); 2601 2602 /* no need to keep a reference to the ops, the name will suffice */ 2603 tcp_cleanup_congestion_control(sk); 2604 icsk->icsk_ca_ops = NULL; 2605 } 2606 2607 static int mptcp_init_sock(struct sock *sk) 2608 { 2609 struct net *net = sock_net(sk); 2610 int ret; 2611 2612 ret = __mptcp_init_sock(sk); 2613 if (ret) 2614 return ret; 2615 2616 if (!mptcp_is_enabled(net)) 2617 return -ENOPROTOOPT; 2618 2619 if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) 2620 return -ENOMEM; 2621 2622 ret = __mptcp_socket_create(mptcp_sk(sk)); 2623 if (ret) 2624 return ret; 2625 2626 /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will 2627 * propagate the correct value 2628 */ 2629 mptcp_ca_reset(sk); 2630 2631 sk_sockets_allocated_inc(sk); 2632 sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; 2633 sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; 2634 2635 return 0; 2636 } 2637 2638 static void __mptcp_clear_xmit(struct sock *sk) 2639 { 2640 struct mptcp_sock *msk = mptcp_sk(sk); 2641 struct mptcp_data_frag *dtmp, *dfrag; 2642 2643 WRITE_ONCE(msk->first_pending, NULL); 2644 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) 2645 dfrag_clear(sk, dfrag); 2646 } 2647 2648 static void mptcp_cancel_work(struct sock *sk) 2649 { 2650 struct mptcp_sock *msk = mptcp_sk(sk); 2651 2652 if (cancel_work_sync(&msk->work)) 2653 __sock_put(sk); 2654 } 2655 2656 void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) 2657 { 2658 lock_sock(ssk); 2659 2660 switch (ssk->sk_state) { 2661 case TCP_LISTEN: 2662 if (!(how & RCV_SHUTDOWN)) 2663 break; 2664 fallthrough; 2665 case TCP_SYN_SENT: 2666 tcp_disconnect(ssk, O_NONBLOCK); 2667 break; 2668 default: 2669 if (__mptcp_check_fallback(mptcp_sk(sk))) { 2670 pr_debug("Fallback"); 2671 ssk->sk_shutdown |= how; 2672 tcp_shutdown(ssk, how); 2673 } else { 2674 pr_debug("Sending DATA_FIN on subflow %p", ssk); 2675 tcp_send_ack(ssk); 2676 if (!mptcp_timer_pending(sk)) 2677 mptcp_reset_timer(sk); 2678 } 2679 break; 2680 } 2681 2682 release_sock(ssk); 2683 } 2684 2685 static const unsigned char 
new_state[16] = { 2686 /* current state: new state: action: */ 2687 [0 /* (Invalid) */] = TCP_CLOSE, 2688 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 2689 [TCP_SYN_SENT] = TCP_CLOSE, 2690 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 2691 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, 2692 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, 2693 [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ 2694 [TCP_CLOSE] = TCP_CLOSE, 2695 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, 2696 [TCP_LAST_ACK] = TCP_LAST_ACK, 2697 [TCP_LISTEN] = TCP_CLOSE, 2698 [TCP_CLOSING] = TCP_CLOSING, 2699 [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ 2700 }; 2701 2702 static int mptcp_close_state(struct sock *sk) 2703 { 2704 int next = (int)new_state[sk->sk_state]; 2705 int ns = next & TCP_STATE_MASK; 2706 2707 inet_sk_state_store(sk, ns); 2708 2709 return next & TCP_ACTION_FIN; 2710 } 2711 2712 static void __mptcp_check_send_data_fin(struct sock *sk) 2713 { 2714 struct mptcp_subflow_context *subflow; 2715 struct mptcp_sock *msk = mptcp_sk(sk); 2716 2717 pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", 2718 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), 2719 msk->snd_nxt, msk->write_seq); 2720 2721 /* there is still data to push to the subflows, or we are not really 2722 * shutting down yet: skip this 2723 */ 2724 if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || 2725 mptcp_send_head(sk)) 2726 return; 2727 2728 WRITE_ONCE(msk->snd_nxt, msk->write_seq); 2729 2730 /* fallback socket will not get data_fin/ack, can move to the next 2731 * state now 2732 */ 2733 if (__mptcp_check_fallback(msk)) { 2734 WRITE_ONCE(msk->snd_una, msk->write_seq); 2735 if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { 2736 inet_sk_state_store(sk, TCP_CLOSE); 2737 mptcp_close_wake_up(sk); 2738 } else if (sk->sk_state == TCP_FIN_WAIT1) { 2739 inet_sk_state_store(sk, TCP_FIN_WAIT2); 2740 } 2741 } 2742 2743 mptcp_for_each_subflow(msk, subflow) { 2744 struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); 2745 2746 mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); 2747 } 2748 } 2749 2750 static void __mptcp_wr_shutdown(struct sock *sk) 2751 { 2752 struct mptcp_sock *msk = mptcp_sk(sk); 2753 2754 pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", 2755 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, 2756 !!mptcp_send_head(sk)); 2757 2758 /* will be ignored by fallback sockets */ 2759 WRITE_ONCE(msk->write_seq, msk->write_seq + 1); 2760 WRITE_ONCE(msk->snd_data_fin_enable, 1); 2761 2762 __mptcp_check_send_data_fin(sk); 2763 } 2764 2765 static void __mptcp_destroy_sock(struct sock *sk) 2766 { 2767 struct mptcp_subflow_context *subflow, *tmp; 2768 struct mptcp_sock *msk = mptcp_sk(sk); 2769 LIST_HEAD(conn_list); 2770 2771 pr_debug("msk=%p", msk); 2772 2773 might_sleep(); 2774 2775 /* join list will eventually be flushed (with rst) at sock lock release time */ 2776 list_splice_init(&msk->conn_list, &conn_list); 2777 2778 mptcp_stop_timer(sk); 2779 sk_stop_timer(sk, &sk->sk_timer); 2780 msk->pm.status = 0; 2781 2782 /* clears msk->subflow, allowing the following loop to close 2783 * even the initial subflow 2784 */ 2785 mptcp_dispose_initial_subflow(msk); 2786 list_for_each_entry_safe(subflow, tmp, &conn_list, node) { 2787 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2788 __mptcp_close_ssk(sk, ssk, subflow, 0); 2789 } 2790 2791 sk->sk_prot->destroy(sk); 2792 2793 WARN_ON_ONCE(msk->rmem_fwd_alloc); 2794 WARN_ON_ONCE(msk->rmem_released); 2795
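	/* mptcp_destroy_common() has already purged the receive queues and moved
	 * the pending fwd allocated memory back to the sk, the warnings above
	 * catch accounting leaks
	 */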
sk_stream_kill_queues(sk); 2796 xfrm_sk_free_policy(sk); 2797 2798 sk_refcnt_debug_release(sk); 2799 sock_put(sk); 2800 } 2801 2802 static void mptcp_close(struct sock *sk, long timeout) 2803 { 2804 struct mptcp_subflow_context *subflow; 2805 struct mptcp_sock *msk = mptcp_sk(sk); 2806 bool do_cancel_work = false; 2807 2808 lock_sock(sk); 2809 sk->sk_shutdown = SHUTDOWN_MASK; 2810 2811 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { 2812 inet_sk_state_store(sk, TCP_CLOSE); 2813 goto cleanup; 2814 } 2815 2816 if (mptcp_close_state(sk)) 2817 __mptcp_wr_shutdown(sk); 2818 2819 sk_stream_wait_close(sk, timeout); 2820 2821 cleanup: 2822 /* orphan all the subflows */ 2823 inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; 2824 mptcp_for_each_subflow(msk, subflow) { 2825 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2826 bool slow = lock_sock_fast_nested(ssk); 2827 2828 /* since the close timeout takes precedence on the fail one, 2829 * cancel the latter 2830 */ 2831 if (ssk == msk->first) 2832 subflow->fail_tout = 0; 2833 2834 sock_orphan(ssk); 2835 unlock_sock_fast(ssk, slow); 2836 } 2837 sock_orphan(sk); 2838 2839 sock_hold(sk); 2840 pr_debug("msk=%p state=%d", sk, sk->sk_state); 2841 if (mptcp_sk(sk)->token) 2842 mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); 2843 2844 if (sk->sk_state == TCP_CLOSE) { 2845 __mptcp_destroy_sock(sk); 2846 do_cancel_work = true; 2847 } else { 2848 mptcp_reset_timeout(msk, 0); 2849 } 2850 release_sock(sk); 2851 if (do_cancel_work) 2852 mptcp_cancel_work(sk); 2853 2854 sock_put(sk); 2855 } 2856 2857 static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) 2858 { 2859 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2860 const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); 2861 struct ipv6_pinfo *msk6 = inet6_sk(msk); 2862 2863 msk->sk_v6_daddr = ssk->sk_v6_daddr; 2864 msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; 2865 2866 if (msk6 && ssk6) { 2867 msk6->saddr = ssk6->saddr; 2868 msk6->flow_label = ssk6->flow_label; 2869 } 2870 #endif 2871 2872 inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; 2873 inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; 2874 inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; 2875 inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; 2876 inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; 2877 inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; 2878 } 2879 2880 static int mptcp_disconnect(struct sock *sk, int flags) 2881 { 2882 struct mptcp_subflow_context *subflow; 2883 struct mptcp_sock *msk = mptcp_sk(sk); 2884 2885 inet_sk_state_store(sk, TCP_CLOSE); 2886 2887 mptcp_for_each_subflow(msk, subflow) { 2888 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2889 2890 __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE); 2891 } 2892 2893 mptcp_stop_timer(sk); 2894 sk_stop_timer(sk, &sk->sk_timer); 2895 2896 if (mptcp_sk(sk)->token) 2897 mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); 2898 2899 mptcp_destroy_common(msk); 2900 msk->last_snd = NULL; 2901 WRITE_ONCE(msk->flags, 0); 2902 msk->cb_flags = 0; 2903 msk->push_pending = 0; 2904 msk->recovery = false; 2905 msk->can_ack = false; 2906 msk->fully_established = false; 2907 msk->rcv_data_fin = false; 2908 msk->snd_data_fin_enable = false; 2909 msk->rcv_fastclose = false; 2910 msk->use_64bit_ack = false; 2911 WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); 2912 mptcp_pm_data_reset(msk); 2913 mptcp_ca_reset(sk); 2914 2915 sk->sk_shutdown = 0; 2916 sk_error_report(sk); 2917 return 0; 2918 } 2919 2920 #if 
IS_ENABLED(CONFIG_MPTCP_IPV6) 2921 static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) 2922 { 2923 unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); 2924 2925 return (struct ipv6_pinfo *)(((u8 *)sk) + offset); 2926 } 2927 #endif 2928 2929 struct sock *mptcp_sk_clone(const struct sock *sk, 2930 const struct mptcp_options_received *mp_opt, 2931 struct request_sock *req) 2932 { 2933 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 2934 struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); 2935 struct mptcp_sock *msk; 2936 u64 ack_seq; 2937 2938 if (!nsk) 2939 return NULL; 2940 2941 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2942 if (nsk->sk_family == AF_INET6) 2943 inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); 2944 #endif 2945 2946 __mptcp_init_sock(nsk); 2947 2948 msk = mptcp_sk(nsk); 2949 msk->local_key = subflow_req->local_key; 2950 msk->token = subflow_req->token; 2951 msk->subflow = NULL; 2952 WRITE_ONCE(msk->fully_established, false); 2953 if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) 2954 WRITE_ONCE(msk->csum_enabled, true); 2955 2956 msk->write_seq = subflow_req->idsn + 1; 2957 msk->snd_nxt = msk->write_seq; 2958 msk->snd_una = msk->write_seq; 2959 msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; 2960 msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; 2961 2962 if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) { 2963 msk->can_ack = true; 2964 msk->remote_key = mp_opt->sndr_key; 2965 mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); 2966 ack_seq++; 2967 WRITE_ONCE(msk->ack_seq, ack_seq); 2968 atomic64_set(&msk->rcv_wnd_sent, ack_seq); 2969 } 2970 2971 sock_reset_flag(nsk, SOCK_RCU_FREE); 2972 /* will be fully established after successful MPC subflow creation */ 2973 inet_sk_state_store(nsk, TCP_SYN_RECV); 2974 2975 security_inet_csk_clone(nsk, req); 2976 bh_unlock_sock(nsk); 2977 2978 /* keep a single reference */ 2979 __sock_put(nsk); 2980 return nsk; 2981 } 2982 2983 void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) 2984 { 2985 const struct tcp_sock *tp = tcp_sk(ssk); 2986 2987 msk->rcvq_space.copied = 0; 2988 msk->rcvq_space.rtt_us = 0; 2989 2990 msk->rcvq_space.time = tp->tcp_mstamp; 2991 2992 /* initial rcv_space offering made to peer */ 2993 msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, 2994 TCP_INIT_CWND * tp->advmss); 2995 if (msk->rcvq_space.space == 0) 2996 msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; 2997 2998 WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); 2999 } 3000 3001 static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, 3002 bool kern) 3003 { 3004 struct mptcp_sock *msk = mptcp_sk(sk); 3005 struct socket *listener; 3006 struct sock *newsk; 3007 3008 listener = __mptcp_nmpc_socket(msk); 3009 if (WARN_ON_ONCE(!listener)) { 3010 *err = -EINVAL; 3011 return NULL; 3012 } 3013 3014 pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); 3015 newsk = inet_csk_accept(listener->sk, flags, err, kern); 3016 if (!newsk) 3017 return NULL; 3018 3019 pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); 3020 if (sk_is_mptcp(newsk)) { 3021 struct mptcp_subflow_context *subflow; 3022 struct sock *new_mptcp_sock; 3023 3024 subflow = mptcp_subflow_ctx(newsk); 3025 new_mptcp_sock = subflow->conn; 3026 3027 /* is_mptcp should be false if subflow->conn is missing, see 3028 * subflow_syn_recv_sock() 3029 */ 3030 if (WARN_ON_ONCE(!new_mptcp_sock)) { 3031 tcp_sk(newsk)->is_mptcp = 0; 3032 goto out; 3033 } 3034 3035 /* acquire the 2nd reference for 
the owning socket */ 3036 sock_hold(new_mptcp_sock); 3037 newsk = new_mptcp_sock; 3038 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); 3039 } else { 3040 MPTCP_INC_STATS(sock_net(sk), 3041 MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); 3042 } 3043 3044 out: 3045 newsk->sk_kern_sock = kern; 3046 return newsk; 3047 } 3048 3049 void mptcp_destroy_common(struct mptcp_sock *msk) 3050 { 3051 struct sock *sk = (struct sock *)msk; 3052 3053 __mptcp_clear_xmit(sk); 3054 3055 /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ 3056 mptcp_data_lock(sk); 3057 skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); 3058 __skb_queue_purge(&sk->sk_receive_queue); 3059 skb_rbtree_purge(&msk->out_of_order_queue); 3060 mptcp_data_unlock(sk); 3061 3062 /* move all the rx fwd alloc into sk_forward_alloc; the sk_mem_reclaim_final 3063 * call in inet_sock_destruct() will dispose of it 3064 */ 3065 sk->sk_forward_alloc += msk->rmem_fwd_alloc; 3066 msk->rmem_fwd_alloc = 0; 3067 mptcp_token_destroy(msk); 3068 mptcp_pm_free_anno_list(msk); 3069 mptcp_free_local_addr_list(msk); 3070 } 3071 3072 static void mptcp_destroy(struct sock *sk) 3073 { 3074 struct mptcp_sock *msk = mptcp_sk(sk); 3075 3076 mptcp_destroy_common(msk); 3077 sk_sockets_allocated_dec(sk); 3078 } 3079 3080 void __mptcp_data_acked(struct sock *sk) 3081 { 3082 if (!sock_owned_by_user(sk)) 3083 __mptcp_clean_una(sk); 3084 else 3085 __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); 3086 3087 if (mptcp_pending_data_fin_ack(sk)) 3088 mptcp_schedule_work(sk); 3089 } 3090 3091 void __mptcp_check_push(struct sock *sk, struct sock *ssk) 3092 { 3093 if (!mptcp_send_head(sk)) 3094 return; 3095 3096 if (!sock_owned_by_user(sk)) { 3097 struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk)); 3098 3099 if (xmit_ssk == ssk) 3100 __mptcp_subflow_push_pending(sk, ssk); 3101 else if (xmit_ssk) 3102 mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND); 3103 } else { 3104 __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); 3105 } 3106 } 3107 3108 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ 3109 BIT(MPTCP_RETRANSMIT) | \ 3110 BIT(MPTCP_FLUSH_JOIN_LIST)) 3111 3112 /* processes deferred events and flushes wmem */ 3113 static void mptcp_release_cb(struct sock *sk) 3114 __must_hold(&sk->sk_lock.slock) 3115 { 3116 struct mptcp_sock *msk = mptcp_sk(sk); 3117 3118 for (;;) { 3119 unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | 3120 msk->push_pending; 3121 if (!flags) 3122 break; 3123 3124 /* the following actions acquire the subflow socket lock 3125 * 3126 * 1) can't be invoked in atomic scope 3127 * 2) must avoid ABBA deadlock with msk socket spinlock: the RX 3128 * datapath acquires the msk socket spinlock while holding 3129 * the subflow socket lock 3130 */ 3131 msk->push_pending = 0; 3132 msk->cb_flags &= ~flags; 3133 spin_unlock_bh(&sk->sk_lock.slock); 3134 if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) 3135 __mptcp_flush_join_list(sk); 3136 if (flags & BIT(MPTCP_PUSH_PENDING)) 3137 __mptcp_push_pending(sk, 0); 3138 if (flags & BIT(MPTCP_RETRANSMIT)) 3139 __mptcp_retrans(sk); 3140 3141 cond_resched(); 3142 spin_lock_bh(&sk->sk_lock.slock); 3143 } 3144 3145 if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) 3146 __mptcp_clean_una_wakeup(sk); 3147 if (unlikely(msk->cb_flags)) { 3148 /* be sure to set the current sk state before taking actions 3149 * depending on sk_state, that is processing MPTCP_ERROR_REPORT 3150 */ 3151 if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) 3152
__mptcp_set_connected(sk); 3153 if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) 3154 __mptcp_error_report(sk); 3155 if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) 3156 msk->last_snd = NULL; 3157 } 3158 3159 __mptcp_update_rmem(sk); 3160 } 3161 3162 /* MP_JOIN client subflow must wait for 4th ack before sending any data: 3163 * TCP can't schedule delack timer before the subflow is fully established. 3164 * MPTCP uses the delack timer to do 3rd ack retransmissions 3165 */ 3166 static void schedule_3rdack_retransmission(struct sock *ssk) 3167 { 3168 struct inet_connection_sock *icsk = inet_csk(ssk); 3169 struct tcp_sock *tp = tcp_sk(ssk); 3170 unsigned long timeout; 3171 3172 if (mptcp_subflow_ctx(ssk)->fully_established) 3173 return; 3174 3175 /* reschedule with a timeout above RTT, as we must look only for drop */ 3176 if (tp->srtt_us) 3177 timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); 3178 else 3179 timeout = TCP_TIMEOUT_INIT; 3180 timeout += jiffies; 3181 3182 WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); 3183 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 3184 icsk->icsk_ack.timeout = timeout; 3185 sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); 3186 } 3187 3188 void mptcp_subflow_process_delegated(struct sock *ssk) 3189 { 3190 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 3191 struct sock *sk = subflow->conn; 3192 3193 if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) { 3194 mptcp_data_lock(sk); 3195 if (!sock_owned_by_user(sk)) 3196 __mptcp_subflow_push_pending(sk, ssk); 3197 else 3198 __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); 3199 mptcp_data_unlock(sk); 3200 mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND); 3201 } 3202 if (test_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status)) { 3203 schedule_3rdack_retransmission(ssk); 3204 mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_ACK); 3205 } 3206 } 3207 3208 static int mptcp_hash(struct sock *sk) 3209 { 3210 /* should never be called, 3211 * we hash the TCP subflows not the master socket 3212 */ 3213 WARN_ON_ONCE(1); 3214 return 0; 3215 } 3216 3217 static void mptcp_unhash(struct sock *sk) 3218 { 3219 /* called from sk_common_release(), but nothing to do here */ 3220 } 3221 3222 static int mptcp_get_port(struct sock *sk, unsigned short snum) 3223 { 3224 struct mptcp_sock *msk = mptcp_sk(sk); 3225 struct socket *ssock; 3226 3227 ssock = __mptcp_nmpc_socket(msk); 3228 pr_debug("msk=%p, subflow=%p", msk, ssock); 3229 if (WARN_ON_ONCE(!ssock)) 3230 return -EINVAL; 3231 3232 return inet_csk_get_port(ssock->sk, snum); 3233 } 3234 3235 void mptcp_finish_connect(struct sock *ssk) 3236 { 3237 struct mptcp_subflow_context *subflow; 3238 struct mptcp_sock *msk; 3239 struct sock *sk; 3240 u64 ack_seq; 3241 3242 subflow = mptcp_subflow_ctx(ssk); 3243 sk = subflow->conn; 3244 msk = mptcp_sk(sk); 3245 3246 pr_debug("msk=%p, token=%u", sk, subflow->token); 3247 3248 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); 3249 ack_seq++; 3250 subflow->map_seq = ack_seq; 3251 subflow->map_subflow_seq = 1; 3252 3253 /* the socket is not connected yet, no msk/subflow ops can access/race 3254 * accessing the field below 3255 */ 3256 WRITE_ONCE(msk->remote_key, subflow->remote_key); 3257 WRITE_ONCE(msk->local_key, subflow->local_key); 3258 WRITE_ONCE(msk->write_seq, subflow->idsn + 1); 3259 WRITE_ONCE(msk->snd_nxt, msk->write_seq); 3260 WRITE_ONCE(msk->ack_seq, ack_seq); 3261 WRITE_ONCE(msk->can_ack, 1); 3262 
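	/* nothing has been transmitted at the MPTCP level yet, snd_una can start at write_seq */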
WRITE_ONCE(msk->snd_una, msk->write_seq); 3263 atomic64_set(&msk->rcv_wnd_sent, ack_seq); 3264 3265 mptcp_pm_new_connection(msk, ssk, 0); 3266 3267 mptcp_rcv_space_init(msk, ssk); 3268 } 3269 3270 void mptcp_sock_graft(struct sock *sk, struct socket *parent) 3271 { 3272 write_lock_bh(&sk->sk_callback_lock); 3273 rcu_assign_pointer(sk->sk_wq, &parent->wq); 3274 sk_set_socket(sk, parent); 3275 sk->sk_uid = SOCK_INODE(parent)->i_uid; 3276 write_unlock_bh(&sk->sk_callback_lock); 3277 } 3278 3279 bool mptcp_finish_join(struct sock *ssk) 3280 { 3281 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 3282 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 3283 struct sock *parent = (void *)msk; 3284 bool ret = true; 3285 3286 pr_debug("msk=%p, subflow=%p", msk, subflow); 3287 3288 /* mptcp socket already closing? */ 3289 if (!mptcp_is_fully_established(parent)) { 3290 subflow->reset_reason = MPTCP_RST_EMPTCP; 3291 return false; 3292 } 3293 3294 if (!list_empty(&subflow->node)) 3295 goto out; 3296 3297 if (!mptcp_pm_allow_new_subflow(msk)) 3298 goto err_prohibited; 3299 3300 /* active connections are already on conn_list. 3301 * If we can't acquire msk socket lock here, let the release callback 3302 * handle it 3303 */ 3304 mptcp_data_lock(parent); 3305 if (!sock_owned_by_user(parent)) { 3306 ret = __mptcp_finish_join(msk, ssk); 3307 if (ret) { 3308 sock_hold(ssk); 3309 list_add_tail(&subflow->node, &msk->conn_list); 3310 } 3311 } else { 3312 sock_hold(ssk); 3313 list_add_tail(&subflow->node, &msk->join_list); 3314 __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); 3315 } 3316 mptcp_data_unlock(parent); 3317 3318 if (!ret) { 3319 err_prohibited: 3320 subflow->reset_reason = MPTCP_RST_EPROHIBIT; 3321 return false; 3322 } 3323 3324 subflow->map_seq = READ_ONCE(msk->ack_seq); 3325 WRITE_ONCE(msk->allow_infinite_fallback, false); 3326 3327 out: 3328 mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); 3329 return true; 3330 } 3331 3332 static void mptcp_shutdown(struct sock *sk, int how) 3333 { 3334 pr_debug("sk=%p, how=%d", sk, how); 3335 3336 if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) 3337 __mptcp_wr_shutdown(sk); 3338 } 3339 3340 static int mptcp_forward_alloc_get(const struct sock *sk) 3341 { 3342 return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc; 3343 } 3344 3345 static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) 3346 { 3347 const struct sock *sk = (void *)msk; 3348 u64 delta; 3349 3350 if (sk->sk_state == TCP_LISTEN) 3351 return -EINVAL; 3352 3353 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 3354 return 0; 3355 3356 delta = msk->write_seq - v; 3357 if (__mptcp_check_fallback(msk) && msk->first) { 3358 struct tcp_sock *tp = tcp_sk(msk->first); 3359 3360 /* the first subflow is disconnected after close - see 3361 * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq 3362 * so ignore that status, too. 
3363 */ 3364 if (!((1 << msk->first->sk_state) & 3365 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) 3366 delta += READ_ONCE(tp->write_seq) - tp->snd_una; 3367 } 3368 if (delta > INT_MAX) 3369 delta = INT_MAX; 3370 3371 return (int)delta; 3372 } 3373 3374 static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 3375 { 3376 struct mptcp_sock *msk = mptcp_sk(sk); 3377 bool slow; 3378 int answ; 3379 3380 switch (cmd) { 3381 case SIOCINQ: 3382 if (sk->sk_state == TCP_LISTEN) 3383 return -EINVAL; 3384 3385 lock_sock(sk); 3386 __mptcp_move_skbs(msk); 3387 answ = mptcp_inq_hint(sk); 3388 release_sock(sk); 3389 break; 3390 case SIOCOUTQ: 3391 slow = lock_sock_fast(sk); 3392 answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); 3393 unlock_sock_fast(sk, slow); 3394 break; 3395 case SIOCOUTQNSD: 3396 slow = lock_sock_fast(sk); 3397 answ = mptcp_ioctl_outq(msk, msk->snd_nxt); 3398 unlock_sock_fast(sk, slow); 3399 break; 3400 default: 3401 return -ENOIOCTLCMD; 3402 } 3403 3404 return put_user(answ, (int __user *)arg); 3405 } 3406 3407 static struct proto mptcp_prot = { 3408 .name = "MPTCP", 3409 .owner = THIS_MODULE, 3410 .init = mptcp_init_sock, 3411 .disconnect = mptcp_disconnect, 3412 .close = mptcp_close, 3413 .accept = mptcp_accept, 3414 .setsockopt = mptcp_setsockopt, 3415 .getsockopt = mptcp_getsockopt, 3416 .shutdown = mptcp_shutdown, 3417 .destroy = mptcp_destroy, 3418 .sendmsg = mptcp_sendmsg, 3419 .ioctl = mptcp_ioctl, 3420 .recvmsg = mptcp_recvmsg, 3421 .release_cb = mptcp_release_cb, 3422 .hash = mptcp_hash, 3423 .unhash = mptcp_unhash, 3424 .get_port = mptcp_get_port, 3425 .forward_alloc_get = mptcp_forward_alloc_get, 3426 .sockets_allocated = &mptcp_sockets_allocated, 3427 3428 .memory_allocated = &tcp_memory_allocated, 3429 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3430 3431 .memory_pressure = &tcp_memory_pressure, 3432 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3433 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3434 .sysctl_mem = sysctl_tcp_mem, 3435 .obj_size = sizeof(struct mptcp_sock), 3436 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3437 .no_autobind = true, 3438 }; 3439 3440 static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 3441 { 3442 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3443 struct socket *ssock; 3444 int err; 3445 3446 lock_sock(sock->sk); 3447 ssock = __mptcp_nmpc_socket(msk); 3448 if (!ssock) { 3449 err = -EINVAL; 3450 goto unlock; 3451 } 3452 3453 err = ssock->ops->bind(ssock, uaddr, addr_len); 3454 if (!err) 3455 mptcp_copy_inaddrs(sock->sk, ssock->sk); 3456 3457 unlock: 3458 release_sock(sock->sk); 3459 return err; 3460 } 3461 3462 static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, 3463 struct mptcp_subflow_context *subflow) 3464 { 3465 subflow->request_mptcp = 0; 3466 __mptcp_do_fallback(msk); 3467 } 3468 3469 static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, 3470 int addr_len, int flags) 3471 { 3472 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3473 struct mptcp_subflow_context *subflow; 3474 struct socket *ssock; 3475 int err = -EINVAL; 3476 3477 lock_sock(sock->sk); 3478 if (uaddr) { 3479 if (addr_len < sizeof(uaddr->sa_family)) 3480 goto unlock; 3481 3482 if (uaddr->sa_family == AF_UNSPEC) { 3483 err = mptcp_disconnect(sock->sk, flags); 3484 sock->state = err ? 
SS_DISCONNECTING : SS_UNCONNECTED; 3485 goto unlock; 3486 } 3487 } 3488 3489 if (sock->state != SS_UNCONNECTED && msk->subflow) { 3490 /* pending connection or invalid state, let existing subflow 3491 * cope with that 3492 */ 3493 ssock = msk->subflow; 3494 goto do_connect; 3495 } 3496 3497 ssock = __mptcp_nmpc_socket(msk); 3498 if (!ssock) 3499 goto unlock; 3500 3501 mptcp_token_destroy(msk); 3502 inet_sk_state_store(sock->sk, TCP_SYN_SENT); 3503 subflow = mptcp_subflow_ctx(ssock->sk); 3504 #ifdef CONFIG_TCP_MD5SIG 3505 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 3506 * TCP option space. 3507 */ 3508 if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) 3509 mptcp_subflow_early_fallback(msk, subflow); 3510 #endif 3511 if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { 3512 MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); 3513 mptcp_subflow_early_fallback(msk, subflow); 3514 } 3515 if (likely(!__mptcp_check_fallback(msk))) 3516 MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); 3517 3518 do_connect: 3519 err = ssock->ops->connect(ssock, uaddr, addr_len, flags); 3520 sock->state = ssock->state; 3521 3522 /* on successful connect, the msk state will be moved to established by 3523 * subflow_finish_connect() 3524 */ 3525 if (!err || err == -EINPROGRESS) 3526 mptcp_copy_inaddrs(sock->sk, ssock->sk); 3527 else 3528 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 3529 3530 unlock: 3531 release_sock(sock->sk); 3532 return err; 3533 } 3534 3535 static int mptcp_listen(struct socket *sock, int backlog) 3536 { 3537 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3538 struct socket *ssock; 3539 int err; 3540 3541 pr_debug("msk=%p", msk); 3542 3543 lock_sock(sock->sk); 3544 ssock = __mptcp_nmpc_socket(msk); 3545 if (!ssock) { 3546 err = -EINVAL; 3547 goto unlock; 3548 } 3549 3550 mptcp_token_destroy(msk); 3551 inet_sk_state_store(sock->sk, TCP_LISTEN); 3552 sock_set_flag(sock->sk, SOCK_RCU_FREE); 3553 3554 err = ssock->ops->listen(ssock, backlog); 3555 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 3556 if (!err) 3557 mptcp_copy_inaddrs(sock->sk, ssock->sk); 3558 3559 unlock: 3560 release_sock(sock->sk); 3561 return err; 3562 } 3563 3564 static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, 3565 int flags, bool kern) 3566 { 3567 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3568 struct socket *ssock; 3569 int err; 3570 3571 pr_debug("msk=%p", msk); 3572 3573 ssock = __mptcp_nmpc_socket(msk); 3574 if (!ssock) 3575 return -EINVAL; 3576 3577 err = ssock->ops->accept(sock, newsock, flags, kern); 3578 if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { 3579 struct mptcp_sock *msk = mptcp_sk(newsock->sk); 3580 struct mptcp_subflow_context *subflow; 3581 struct sock *newsk = newsock->sk; 3582 3583 lock_sock(newsk); 3584 3585 /* PM/worker can now acquire the first subflow socket 3586 * lock without racing with listener queue cleanup, 3587 * we can notify it, if needed. 3588 * 3589 * Even if remote has reset the initial subflow by now 3590 * the refcnt is still at least one. 
3591 */ 3592 subflow = mptcp_subflow_ctx(msk->first); 3593 list_add(&subflow->node, &msk->conn_list); 3594 sock_hold(msk->first); 3595 if (mptcp_is_fully_established(newsk)) 3596 mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); 3597 3598 mptcp_copy_inaddrs(newsk, msk->first); 3599 mptcp_rcv_space_init(msk, msk->first); 3600 mptcp_propagate_sndbuf(newsk, msk->first); 3601 3602 /* set ssk->sk_socket of accept()ed flows to mptcp socket. 3603 * This is needed so NOSPACE flag can be set from tcp stack. 3604 */ 3605 mptcp_for_each_subflow(msk, subflow) { 3606 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 3607 3608 if (!ssk->sk_socket) 3609 mptcp_sock_graft(ssk, newsock); 3610 } 3611 release_sock(newsk); 3612 } 3613 3614 return err; 3615 } 3616 3617 static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 3618 { 3619 /* Concurrent splices from sk_receive_queue into receive_queue will 3620 * always show at least one non-empty queue when checked in this order. 3621 */ 3622 if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && 3623 skb_queue_empty_lockless(&msk->receive_queue)) 3624 return 0; 3625 3626 return EPOLLIN | EPOLLRDNORM; 3627 } 3628 3629 static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) 3630 { 3631 struct sock *sk = (struct sock *)msk; 3632 3633 if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) 3634 return EPOLLOUT | EPOLLWRNORM; 3635 3636 if (sk_stream_is_writeable(sk)) 3637 return EPOLLOUT | EPOLLWRNORM; 3638 3639 mptcp_set_nospace(sk); 3640 smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ 3641 if (sk_stream_is_writeable(sk)) 3642 return EPOLLOUT | EPOLLWRNORM; 3643 3644 return 0; 3645 } 3646 3647 static __poll_t mptcp_poll(struct file *file, struct socket *sock, 3648 struct poll_table_struct *wait) 3649 { 3650 struct sock *sk = sock->sk; 3651 struct mptcp_sock *msk; 3652 __poll_t mask = 0; 3653 int state; 3654 3655 msk = mptcp_sk(sk); 3656 sock_poll_wait(file, sock, wait); 3657 3658 state = inet_sk_state_load(sk); 3659 pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); 3660 if (state == TCP_LISTEN) { 3661 if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk)) 3662 return 0; 3663 3664 return inet_csk_listen_poll(msk->subflow->sk); 3665 } 3666 3667 if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { 3668 mask |= mptcp_check_readable(msk); 3669 mask |= mptcp_check_writeable(msk); 3670 } 3671 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) 3672 mask |= EPOLLHUP; 3673 if (sk->sk_shutdown & RCV_SHUTDOWN) 3674 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 3675 3676 /* This barrier is coupled with smp_wmb() in tcp_reset() */ 3677 smp_rmb(); 3678 if (sk->sk_err) 3679 mask |= EPOLLERR; 3680 3681 return mask; 3682 } 3683 3684 static const struct proto_ops mptcp_stream_ops = { 3685 .family = PF_INET, 3686 .owner = THIS_MODULE, 3687 .release = inet_release, 3688 .bind = mptcp_bind, 3689 .connect = mptcp_stream_connect, 3690 .socketpair = sock_no_socketpair, 3691 .accept = mptcp_stream_accept, 3692 .getname = inet_getname, 3693 .poll = mptcp_poll, 3694 .ioctl = inet_ioctl, 3695 .gettstamp = sock_gettstamp, 3696 .listen = mptcp_listen, 3697 .shutdown = inet_shutdown, 3698 .setsockopt = sock_common_setsockopt, 3699 .getsockopt = sock_common_getsockopt, 3700 .sendmsg = inet_sendmsg, 3701 .recvmsg = inet_recvmsg, 3702 .mmap = sock_no_mmap, 3703 .sendpage = inet_sendpage, 3704 }; 3705 3706 static struct inet_protosw mptcp_protosw = { 3707 .type = SOCK_STREAM, 3708 .protocol = IPPROTO_MPTCP, 3709 .prot = &mptcp_prot, 
3710 .ops = &mptcp_stream_ops, 3711 .flags = INET_PROTOSW_ICSK, 3712 }; 3713 3714 static int mptcp_napi_poll(struct napi_struct *napi, int budget) 3715 { 3716 struct mptcp_delegated_action *delegated; 3717 struct mptcp_subflow_context *subflow; 3718 int work_done = 0; 3719 3720 delegated = container_of(napi, struct mptcp_delegated_action, napi); 3721 while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { 3722 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 3723 3724 bh_lock_sock_nested(ssk); 3725 if (!sock_owned_by_user(ssk) && 3726 mptcp_subflow_has_delegated_action(subflow)) 3727 mptcp_subflow_process_delegated(ssk); 3728 /* ... elsewhere tcp_release_cb_override already processed 3729 * the action or will do at next release_sock(). 3730 * In both case must dequeue the subflow here - on the same 3731 * CPU that scheduled it. 3732 */ 3733 bh_unlock_sock(ssk); 3734 sock_put(ssk); 3735 3736 if (++work_done == budget) 3737 return budget; 3738 } 3739 3740 /* always provide a 0 'work_done' argument, so that napi_complete_done 3741 * will not try accessing the NULL napi->dev ptr 3742 */ 3743 napi_complete_done(napi, 0); 3744 return work_done; 3745 } 3746 3747 void __init mptcp_proto_init(void) 3748 { 3749 struct mptcp_delegated_action *delegated; 3750 int cpu; 3751 3752 mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; 3753 3754 if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) 3755 panic("Failed to allocate MPTCP pcpu counter\n"); 3756 3757 init_dummy_netdev(&mptcp_napi_dev); 3758 for_each_possible_cpu(cpu) { 3759 delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); 3760 INIT_LIST_HEAD(&delegated->head); 3761 netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, 3762 mptcp_napi_poll); 3763 napi_enable(&delegated->napi); 3764 } 3765 3766 mptcp_subflow_init(); 3767 mptcp_pm_init(); 3768 mptcp_token_init(); 3769 3770 if (proto_register(&mptcp_prot, 1) != 0) 3771 panic("Failed to register MPTCP proto.\n"); 3772 3773 inet_register_protosw(&mptcp_protosw); 3774 3775 BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); 3776 } 3777 3778 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 3779 static const struct proto_ops mptcp_v6_stream_ops = { 3780 .family = PF_INET6, 3781 .owner = THIS_MODULE, 3782 .release = inet6_release, 3783 .bind = mptcp_bind, 3784 .connect = mptcp_stream_connect, 3785 .socketpair = sock_no_socketpair, 3786 .accept = mptcp_stream_accept, 3787 .getname = inet6_getname, 3788 .poll = mptcp_poll, 3789 .ioctl = inet6_ioctl, 3790 .gettstamp = sock_gettstamp, 3791 .listen = mptcp_listen, 3792 .shutdown = inet_shutdown, 3793 .setsockopt = sock_common_setsockopt, 3794 .getsockopt = sock_common_getsockopt, 3795 .sendmsg = inet6_sendmsg, 3796 .recvmsg = inet6_recvmsg, 3797 .mmap = sock_no_mmap, 3798 .sendpage = inet_sendpage, 3799 #ifdef CONFIG_COMPAT 3800 .compat_ioctl = inet6_compat_ioctl, 3801 #endif 3802 }; 3803 3804 static struct proto mptcp_v6_prot; 3805 3806 static void mptcp_v6_destroy(struct sock *sk) 3807 { 3808 mptcp_destroy(sk); 3809 inet6_destroy_sock(sk); 3810 } 3811 3812 static struct inet_protosw mptcp_v6_protosw = { 3813 .type = SOCK_STREAM, 3814 .protocol = IPPROTO_MPTCP, 3815 .prot = &mptcp_v6_prot, 3816 .ops = &mptcp_v6_stream_ops, 3817 .flags = INET_PROTOSW_ICSK, 3818 }; 3819 3820 int __init mptcp_proto_v6_init(void) 3821 { 3822 int err; 3823 3824 mptcp_v6_prot = mptcp_prot; 3825 strcpy(mptcp_v6_prot.name, "MPTCPv6"); 3826 mptcp_v6_prot.slab = NULL; 3827 mptcp_v6_prot.destroy = mptcp_v6_destroy; 3828 mptcp_v6_prot.obj_size = 
sizeof(struct mptcp6_sock); 3829 3830 err = proto_register(&mptcp_v6_prot, 1); 3831 if (err) 3832 return err; 3833 3834 err = inet6_register_protosw(&mptcp_v6_protosw); 3835 if (err) 3836 proto_unregister(&mptcp_v6_prot); 3837 3838 return err; 3839 } 3840 #endif 3841
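/* Illustrative user-space sketch (not part of the kernel build): a rough view
 * of how an application ends up in the paths implemented above. Creating a
 * SOCK_STREAM socket with IPPROTO_MPTCP resolves to mptcp_prot and
 * mptcp_stream_ops; enabling TCP_INQ makes mptcp_recvmsg() attach the value
 * computed by mptcp_inq_hint() as a TCP_CM_INQ cmsg, and SIOCINQ is served by
 * mptcp_ioctl(). 'peer' is a placeholder sockaddr_in, error handling is
 * omitted, and the snippet assumes recent kernel/libc headers providing
 * IPPROTO_MPTCP, TCP_INQ/TCP_CM_INQ (<linux/tcp.h>) and SIOCINQ
 * (<linux/sockios.h>).
 *
 *	int s = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
 *	int one = 1, pending = 0;
 *	char buf[4096], cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm;
 *
 *	connect(s, (struct sockaddr *)&peer, sizeof(peer));
 *	setsockopt(s, SOL_TCP, TCP_INQ, &one, sizeof(one));
 *	recvmsg(s, &msg, 0);
 *	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *		if (cm->cmsg_level == SOL_TCP && cm->cmsg_type == TCP_CM_INQ)
 *			memcpy(&pending, CMSG_DATA(cm), sizeof(pending));
 *	ioctl(s, SIOCINQ, &pending);
 */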