1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #define pr_fmt(fmt) "MPTCP: " fmt 8 9 #include <linux/kernel.h> 10 #include <linux/module.h> 11 #include <linux/netdevice.h> 12 #include <linux/sched/signal.h> 13 #include <linux/atomic.h> 14 #include <net/sock.h> 15 #include <net/inet_common.h> 16 #include <net/inet_hashtables.h> 17 #include <net/protocol.h> 18 #include <net/tcp.h> 19 #include <net/tcp_states.h> 20 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 21 #include <net/transp_v6.h> 22 #endif 23 #include <net/mptcp.h> 24 #include <net/xfrm.h> 25 #include "protocol.h" 26 #include "mib.h" 27 28 #define CREATE_TRACE_POINTS 29 #include <trace/events/mptcp.h> 30 31 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 32 struct mptcp6_sock { 33 struct mptcp_sock msk; 34 struct ipv6_pinfo np; 35 }; 36 #endif 37 38 struct mptcp_skb_cb { 39 u64 map_seq; 40 u64 end_seq; 41 u32 offset; 42 u8 has_rxtstamp:1; 43 }; 44 45 #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) 46 47 enum { 48 MPTCP_CMSG_TS = BIT(0), 49 }; 50 51 static struct percpu_counter mptcp_sockets_allocated; 52 53 static void __mptcp_destroy_sock(struct sock *sk); 54 static void __mptcp_check_send_data_fin(struct sock *sk); 55 56 DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); 57 static struct net_device mptcp_napi_dev; 58 59 /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not 60 * completed yet or has failed, return the subflow socket. 61 * Otherwise return NULL. 62 */ 63 struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) 64 { 65 if (!msk->subflow || READ_ONCE(msk->can_ack)) 66 return NULL; 67 68 return msk->subflow; 69 } 70 71 /* Returns end sequence number of the receiver's advertised window */ 72 static u64 mptcp_wnd_end(const struct mptcp_sock *msk) 73 { 74 return READ_ONCE(msk->wnd_end); 75 } 76 77 static bool mptcp_is_tcpsk(struct sock *sk) 78 { 79 struct socket *sock = sk->sk_socket; 80 81 if (unlikely(sk->sk_prot == &tcp_prot)) { 82 /* we are being invoked after mptcp_accept() has 83 * accepted a non-mp-capable flow: sk is a tcp_sk, 84 * not an mptcp one. 85 * 86 * Hand the socket over to tcp so all further socket ops 87 * bypass mptcp. 
88 */ 89 sock->ops = &inet_stream_ops; 90 return true; 91 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 92 } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { 93 sock->ops = &inet6_stream_ops; 94 return true; 95 #endif 96 } 97 98 return false; 99 } 100 101 static int __mptcp_socket_create(struct mptcp_sock *msk) 102 { 103 struct mptcp_subflow_context *subflow; 104 struct sock *sk = (struct sock *)msk; 105 struct socket *ssock; 106 int err; 107 108 err = mptcp_subflow_create_socket(sk, &ssock); 109 if (err) 110 return err; 111 112 msk->first = ssock->sk; 113 msk->subflow = ssock; 114 subflow = mptcp_subflow_ctx(ssock->sk); 115 list_add(&subflow->node, &msk->conn_list); 116 sock_hold(ssock->sk); 117 subflow->request_mptcp = 1; 118 mptcp_sock_graft(msk->first, sk->sk_socket); 119 120 return 0; 121 } 122 123 static void mptcp_drop(struct sock *sk, struct sk_buff *skb) 124 { 125 sk_drops_add(sk, skb); 126 __kfree_skb(skb); 127 } 128 129 static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, 130 struct sk_buff *from) 131 { 132 bool fragstolen; 133 int delta; 134 135 if (MPTCP_SKB_CB(from)->offset || 136 !skb_try_coalesce(to, from, &fragstolen, &delta)) 137 return false; 138 139 pr_debug("colesced seq %llx into %llx new len %d new end seq %llx", 140 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, 141 to->len, MPTCP_SKB_CB(from)->end_seq); 142 MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; 143 kfree_skb_partial(from, fragstolen); 144 atomic_add(delta, &sk->sk_rmem_alloc); 145 sk_mem_charge(sk, delta); 146 return true; 147 } 148 149 static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, 150 struct sk_buff *from) 151 { 152 if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) 153 return false; 154 155 return mptcp_try_coalesce((struct sock *)msk, to, from); 156 } 157 158 /* "inspired" by tcp_data_queue_ofo(), main differences: 159 * - use mptcp seqs 160 * - don't cope with sacks 161 */ 162 static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) 163 { 164 struct sock *sk = (struct sock *)msk; 165 struct rb_node **p, *parent; 166 u64 seq, end_seq, max_seq; 167 struct sk_buff *skb1; 168 169 seq = MPTCP_SKB_CB(skb)->map_seq; 170 end_seq = MPTCP_SKB_CB(skb)->end_seq; 171 max_seq = READ_ONCE(msk->rcv_wnd_sent); 172 173 pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, 174 RB_EMPTY_ROOT(&msk->out_of_order_queue)); 175 if (after64(end_seq, max_seq)) { 176 /* out of window */ 177 mptcp_drop(sk, skb); 178 pr_debug("oow by %lld, rcv_wnd_sent %llu\n", 179 (unsigned long long)end_seq - (unsigned long)max_seq, 180 (unsigned long long)msk->rcv_wnd_sent); 181 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); 182 return; 183 } 184 185 p = &msk->out_of_order_queue.rb_node; 186 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); 187 if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { 188 rb_link_node(&skb->rbnode, NULL, p); 189 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); 190 msk->ooo_last_skb = skb; 191 goto end; 192 } 193 194 /* with 2 subflows, adding at end of ooo queue is quite likely 195 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 
196 */ 197 if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { 198 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 199 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 200 return; 201 } 202 203 /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ 204 if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { 205 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); 206 parent = &msk->ooo_last_skb->rbnode; 207 p = &parent->rb_right; 208 goto insert; 209 } 210 211 /* Find place to insert this segment. Handle overlaps on the way. */ 212 parent = NULL; 213 while (*p) { 214 parent = *p; 215 skb1 = rb_to_skb(parent); 216 if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 217 p = &parent->rb_left; 218 continue; 219 } 220 if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { 221 if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { 222 /* All the bits are present. Drop. */ 223 mptcp_drop(sk, skb); 224 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 225 return; 226 } 227 if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { 228 /* partial overlap: 229 * | skb | 230 * | skb1 | 231 * continue traversing 232 */ 233 } else { 234 /* skb's seq == skb1's seq and skb covers skb1. 235 * Replace skb1 with skb. 236 */ 237 rb_replace_node(&skb1->rbnode, &skb->rbnode, 238 &msk->out_of_order_queue); 239 mptcp_drop(sk, skb1); 240 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 241 goto merge_right; 242 } 243 } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { 244 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); 245 return; 246 } 247 p = &parent->rb_right; 248 } 249 250 insert: 251 /* Insert segment into RB tree. */ 252 rb_link_node(&skb->rbnode, parent, p); 253 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); 254 255 merge_right: 256 /* Remove other segments covered by skb. */ 257 while ((skb1 = skb_rb_next(skb)) != NULL) { 258 if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) 259 break; 260 rb_erase(&skb1->rbnode, &msk->out_of_order_queue); 261 mptcp_drop(sk, skb1); 262 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 263 } 264 /* If there is no skb after us, we are the last_skb ! 
*/ 265 if (!skb1) 266 msk->ooo_last_skb = skb; 267 268 end: 269 skb_condense(skb); 270 skb_set_owner_r(skb, sk); 271 } 272 273 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, 274 struct sk_buff *skb, unsigned int offset, 275 size_t copy_len) 276 { 277 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 278 struct sock *sk = (struct sock *)msk; 279 struct sk_buff *tail; 280 bool has_rxtstamp; 281 282 __skb_unlink(skb, &ssk->sk_receive_queue); 283 284 skb_ext_reset(skb); 285 skb_orphan(skb); 286 287 /* try to fetch required memory from subflow */ 288 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 289 int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT; 290 291 if (ssk->sk_forward_alloc < amount) 292 goto drop; 293 294 ssk->sk_forward_alloc -= amount; 295 sk->sk_forward_alloc += amount; 296 } 297 298 has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 299 300 /* the skb map_seq accounts for the skb offset: 301 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq 302 * value 303 */ 304 MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); 305 MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; 306 MPTCP_SKB_CB(skb)->offset = offset; 307 MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; 308 309 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 310 /* in sequence */ 311 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); 312 tail = skb_peek_tail(&sk->sk_receive_queue); 313 if (tail && mptcp_try_coalesce(sk, tail, skb)) 314 return true; 315 316 skb_set_owner_r(skb, sk); 317 __skb_queue_tail(&sk->sk_receive_queue, skb); 318 return true; 319 } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { 320 mptcp_data_queue_ofo(msk, skb); 321 return false; 322 } 323 324 /* old data, keep it simple and drop the whole pkt, sender 325 * will retransmit as needed, if needed. 
326 */ 327 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 328 drop: 329 mptcp_drop(sk, skb); 330 return false; 331 } 332 333 static void mptcp_stop_timer(struct sock *sk) 334 { 335 struct inet_connection_sock *icsk = inet_csk(sk); 336 337 sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 338 mptcp_sk(sk)->timer_ival = 0; 339 } 340 341 static void mptcp_close_wake_up(struct sock *sk) 342 { 343 if (sock_flag(sk, SOCK_DEAD)) 344 return; 345 346 sk->sk_state_change(sk); 347 if (sk->sk_shutdown == SHUTDOWN_MASK || 348 sk->sk_state == TCP_CLOSE) 349 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 350 else 351 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 352 } 353 354 static bool mptcp_pending_data_fin_ack(struct sock *sk) 355 { 356 struct mptcp_sock *msk = mptcp_sk(sk); 357 358 return !__mptcp_check_fallback(msk) && 359 ((1 << sk->sk_state) & 360 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && 361 msk->write_seq == READ_ONCE(msk->snd_una); 362 } 363 364 static void mptcp_check_data_fin_ack(struct sock *sk) 365 { 366 struct mptcp_sock *msk = mptcp_sk(sk); 367 368 /* Look for an acknowledged DATA_FIN */ 369 if (mptcp_pending_data_fin_ack(sk)) { 370 WRITE_ONCE(msk->snd_data_fin_enable, 0); 371 372 switch (sk->sk_state) { 373 case TCP_FIN_WAIT1: 374 inet_sk_state_store(sk, TCP_FIN_WAIT2); 375 break; 376 case TCP_CLOSING: 377 case TCP_LAST_ACK: 378 inet_sk_state_store(sk, TCP_CLOSE); 379 break; 380 } 381 382 mptcp_close_wake_up(sk); 383 } 384 } 385 386 static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) 387 { 388 struct mptcp_sock *msk = mptcp_sk(sk); 389 390 if (READ_ONCE(msk->rcv_data_fin) && 391 ((1 << sk->sk_state) & 392 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { 393 u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); 394 395 if (msk->ack_seq == rcv_data_fin_seq) { 396 if (seq) 397 *seq = rcv_data_fin_seq; 398 399 return true; 400 } 401 } 402 403 return false; 404 } 405 406 static void mptcp_set_datafin_timeout(const struct sock *sk) 407 { 408 struct inet_connection_sock *icsk = inet_csk(sk); 409 410 mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, 411 TCP_RTO_MIN << icsk->icsk_retransmits); 412 } 413 414 static void __mptcp_set_timeout(struct sock *sk, long tout) 415 { 416 mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; 417 } 418 419 static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) 420 { 421 const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 422 423 return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 
424 inet_csk(ssk)->icsk_timeout - jiffies : 0; 425 } 426 427 static void mptcp_set_timeout(struct sock *sk) 428 { 429 struct mptcp_subflow_context *subflow; 430 long tout = 0; 431 432 mptcp_for_each_subflow(mptcp_sk(sk), subflow) 433 tout = max(tout, mptcp_timeout_from_subflow(subflow)); 434 __mptcp_set_timeout(sk, tout); 435 } 436 437 static bool tcp_can_send_ack(const struct sock *ssk) 438 { 439 return !((1 << inet_sk_state_load(ssk)) & 440 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); 441 } 442 443 void mptcp_subflow_send_ack(struct sock *ssk) 444 { 445 bool slow; 446 447 slow = lock_sock_fast(ssk); 448 if (tcp_can_send_ack(ssk)) 449 tcp_send_ack(ssk); 450 unlock_sock_fast(ssk, slow); 451 } 452 453 static void mptcp_send_ack(struct mptcp_sock *msk) 454 { 455 struct mptcp_subflow_context *subflow; 456 457 mptcp_for_each_subflow(msk, subflow) 458 mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); 459 } 460 461 static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) 462 { 463 bool slow; 464 465 slow = lock_sock_fast(ssk); 466 if (tcp_can_send_ack(ssk)) 467 tcp_cleanup_rbuf(ssk, 1); 468 unlock_sock_fast(ssk, slow); 469 } 470 471 static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) 472 { 473 const struct inet_connection_sock *icsk = inet_csk(ssk); 474 u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); 475 const struct tcp_sock *tp = tcp_sk(ssk); 476 477 return (ack_pending & ICSK_ACK_SCHED) && 478 ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > 479 READ_ONCE(icsk->icsk_ack.rcv_mss)) || 480 (rx_empty && ack_pending & 481 (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); 482 } 483 484 static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) 485 { 486 int old_space = READ_ONCE(msk->old_wspace); 487 struct mptcp_subflow_context *subflow; 488 struct sock *sk = (struct sock *)msk; 489 int space = __mptcp_space(sk); 490 bool cleanup, rx_empty; 491 492 cleanup = (space > 0) && (space >= (old_space << 1)); 493 rx_empty = !__mptcp_rmem(sk); 494 495 mptcp_for_each_subflow(msk, subflow) { 496 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 497 498 if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) 499 mptcp_subflow_cleanup_rbuf(ssk); 500 } 501 } 502 503 static bool mptcp_check_data_fin(struct sock *sk) 504 { 505 struct mptcp_sock *msk = mptcp_sk(sk); 506 u64 rcv_data_fin_seq; 507 bool ret = false; 508 509 if (__mptcp_check_fallback(msk)) 510 return ret; 511 512 /* Need to ack a DATA_FIN received from a peer while this side 513 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. 514 * msk->rcv_data_fin was set when parsing the incoming options 515 * at the subflow level and the msk lock was not held, so this 516 * is the first opportunity to act on the DATA_FIN and change 517 * the msk state. 518 * 519 * If we are caught up to the sequence number of the incoming 520 * DATA_FIN, send the DATA_ACK now and do state transition. If 521 * not caught up, do nothing and let the recv code send DATA_ACK 522 * when catching up. 
523 */ 524 525 if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { 526 WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); 527 WRITE_ONCE(msk->rcv_data_fin, 0); 528 529 sk->sk_shutdown |= RCV_SHUTDOWN; 530 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 531 set_bit(MPTCP_DATA_READY, &msk->flags); 532 533 switch (sk->sk_state) { 534 case TCP_ESTABLISHED: 535 inet_sk_state_store(sk, TCP_CLOSE_WAIT); 536 break; 537 case TCP_FIN_WAIT1: 538 inet_sk_state_store(sk, TCP_CLOSING); 539 break; 540 case TCP_FIN_WAIT2: 541 inet_sk_state_store(sk, TCP_CLOSE); 542 break; 543 default: 544 /* Other states not expected */ 545 WARN_ON_ONCE(1); 546 break; 547 } 548 549 ret = true; 550 mptcp_send_ack(msk); 551 mptcp_close_wake_up(sk); 552 } 553 return ret; 554 } 555 556 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, 557 struct sock *ssk, 558 unsigned int *bytes) 559 { 560 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 561 struct sock *sk = (struct sock *)msk; 562 unsigned int moved = 0; 563 bool more_data_avail; 564 struct tcp_sock *tp; 565 bool done = false; 566 int sk_rbuf; 567 568 sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 569 570 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 571 int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 572 573 if (unlikely(ssk_rbuf > sk_rbuf)) { 574 WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); 575 sk_rbuf = ssk_rbuf; 576 } 577 } 578 579 pr_debug("msk=%p ssk=%p", msk, ssk); 580 tp = tcp_sk(ssk); 581 do { 582 u32 map_remaining, offset; 583 u32 seq = tp->copied_seq; 584 struct sk_buff *skb; 585 bool fin; 586 587 /* try to move as much data as available */ 588 map_remaining = subflow->map_data_len - 589 mptcp_subflow_get_map_offset(subflow); 590 591 skb = skb_peek(&ssk->sk_receive_queue); 592 if (!skb) { 593 /* if no data is found, a racing workqueue/recvmsg 594 * already processed the new data, stop here or we 595 * can enter an infinite loop 596 */ 597 if (!moved) 598 done = true; 599 break; 600 } 601 602 if (__mptcp_check_fallback(msk)) { 603 /* if we are running under the workqueue, TCP could have 604 * collapsed skbs between dummy map creation and now 605 * be sure to adjust the size 606 */ 607 map_remaining = skb->len; 608 subflow->map_data_len = skb->len; 609 } 610 611 offset = seq - TCP_SKB_CB(skb)->seq; 612 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 613 if (fin) { 614 done = true; 615 seq++; 616 } 617 618 if (offset < skb->len) { 619 size_t len = skb->len - offset; 620 621 if (tp->urg_data) 622 done = true; 623 624 if (__mptcp_move_skb(msk, ssk, skb, offset, len)) 625 moved += len; 626 seq += len; 627 628 if (WARN_ON_ONCE(map_remaining < len)) 629 break; 630 } else { 631 WARN_ON_ONCE(!fin); 632 sk_eat_skb(ssk, skb); 633 done = true; 634 } 635 636 WRITE_ONCE(tp->copied_seq, seq); 637 more_data_avail = mptcp_subflow_data_available(ssk); 638 639 if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { 640 done = true; 641 break; 642 } 643 } while (more_data_avail); 644 645 *bytes += moved; 646 return done; 647 } 648 649 static bool __mptcp_ofo_queue(struct mptcp_sock *msk) 650 { 651 struct sock *sk = (struct sock *)msk; 652 struct sk_buff *skb, *tail; 653 bool moved = false; 654 struct rb_node *p; 655 u64 end_seq; 656 657 p = rb_first(&msk->out_of_order_queue); 658 pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); 659 while (p) { 660 skb = rb_to_skb(p); 661 if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) 662 break; 663 664 p = rb_next(p); 665 rb_erase(&skb->rbnode, &msk->out_of_order_queue); 666 667 if 
(unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, 668 msk->ack_seq))) { 669 mptcp_drop(sk, skb); 670 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 671 continue; 672 } 673 674 end_seq = MPTCP_SKB_CB(skb)->end_seq; 675 tail = skb_peek_tail(&sk->sk_receive_queue); 676 if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { 677 int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; 678 679 /* skip overlapping data, if any */ 680 pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d", 681 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, 682 delta); 683 MPTCP_SKB_CB(skb)->offset += delta; 684 __skb_queue_tail(&sk->sk_receive_queue, skb); 685 } 686 msk->ack_seq = end_seq; 687 moved = true; 688 } 689 return moved; 690 } 691 692 /* In most cases we will be able to lock the mptcp socket. If its already 693 * owned, we need to defer to the work queue to avoid ABBA deadlock. 694 */ 695 static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) 696 { 697 struct sock *sk = (struct sock *)msk; 698 unsigned int moved = 0; 699 700 __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 701 __mptcp_ofo_queue(msk); 702 if (unlikely(ssk->sk_err)) { 703 if (!sock_owned_by_user(sk)) 704 __mptcp_error_report(sk); 705 else 706 set_bit(MPTCP_ERROR_REPORT, &msk->flags); 707 } 708 709 /* If the moves have caught up with the DATA_FIN sequence number 710 * it's time to ack the DATA_FIN and change socket state, but 711 * this is not a good place to change state. Let the workqueue 712 * do it. 713 */ 714 if (mptcp_pending_data_fin(sk, NULL)) 715 mptcp_schedule_work(sk); 716 return moved > 0; 717 } 718 719 void mptcp_data_ready(struct sock *sk, struct sock *ssk) 720 { 721 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 722 struct mptcp_sock *msk = mptcp_sk(sk); 723 int sk_rbuf, ssk_rbuf; 724 725 /* The peer can send data while we are shutting down this 726 * subflow at msk destruction time, but we must avoid enqueuing 727 * more data to the msk receive queue 728 */ 729 if (unlikely(subflow->disposable)) 730 return; 731 732 ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 733 sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 734 if (unlikely(ssk_rbuf > sk_rbuf)) 735 sk_rbuf = ssk_rbuf; 736 737 /* over limit? 
can't append more skbs to msk, Also, no need to wake-up*/ 738 if (__mptcp_rmem(sk) > sk_rbuf) { 739 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 740 return; 741 } 742 743 /* Wake-up the reader only for in-sequence data */ 744 mptcp_data_lock(sk); 745 if (move_skbs_to_msk(msk, ssk)) { 746 set_bit(MPTCP_DATA_READY, &msk->flags); 747 sk->sk_data_ready(sk); 748 } 749 mptcp_data_unlock(sk); 750 } 751 752 static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) 753 { 754 struct mptcp_subflow_context *subflow; 755 bool ret = false; 756 757 if (likely(list_empty(&msk->join_list))) 758 return false; 759 760 spin_lock_bh(&msk->join_list_lock); 761 list_for_each_entry(subflow, &msk->join_list, node) { 762 u32 sseq = READ_ONCE(subflow->setsockopt_seq); 763 764 mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); 765 if (READ_ONCE(msk->setsockopt_seq) != sseq) 766 ret = true; 767 } 768 list_splice_tail_init(&msk->join_list, &msk->conn_list); 769 spin_unlock_bh(&msk->join_list_lock); 770 771 return ret; 772 } 773 774 void __mptcp_flush_join_list(struct mptcp_sock *msk) 775 { 776 if (likely(!mptcp_do_flush_join_list(msk))) 777 return; 778 779 if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) 780 mptcp_schedule_work((struct sock *)msk); 781 } 782 783 static void mptcp_flush_join_list(struct mptcp_sock *msk) 784 { 785 bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); 786 787 might_sleep(); 788 789 if (!mptcp_do_flush_join_list(msk) && !sync_needed) 790 return; 791 792 mptcp_sockopt_sync_all(msk); 793 } 794 795 static bool mptcp_timer_pending(struct sock *sk) 796 { 797 return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); 798 } 799 800 static void mptcp_reset_timer(struct sock *sk) 801 { 802 struct inet_connection_sock *icsk = inet_csk(sk); 803 unsigned long tout; 804 805 /* prevent rescheduling on close */ 806 if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) 807 return; 808 809 tout = mptcp_sk(sk)->timer_ival; 810 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); 811 } 812 813 bool mptcp_schedule_work(struct sock *sk) 814 { 815 if (inet_sk_state_load(sk) != TCP_CLOSE && 816 schedule_work(&mptcp_sk(sk)->work)) { 817 /* each subflow already holds a reference to the sk, and the 818 * workqueue is invoked by a subflow, so sk can't go away here. 
819 */ 820 sock_hold(sk); 821 return true; 822 } 823 return false; 824 } 825 826 void mptcp_subflow_eof(struct sock *sk) 827 { 828 if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) 829 mptcp_schedule_work(sk); 830 } 831 832 static void mptcp_check_for_eof(struct mptcp_sock *msk) 833 { 834 struct mptcp_subflow_context *subflow; 835 struct sock *sk = (struct sock *)msk; 836 int receivers = 0; 837 838 mptcp_for_each_subflow(msk, subflow) 839 receivers += !subflow->rx_eof; 840 if (receivers) 841 return; 842 843 if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { 844 /* hopefully temporary hack: propagate shutdown status 845 * to msk, when all subflows agree on it 846 */ 847 sk->sk_shutdown |= RCV_SHUTDOWN; 848 849 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ 850 set_bit(MPTCP_DATA_READY, &msk->flags); 851 sk->sk_data_ready(sk); 852 } 853 854 switch (sk->sk_state) { 855 case TCP_ESTABLISHED: 856 inet_sk_state_store(sk, TCP_CLOSE_WAIT); 857 break; 858 case TCP_FIN_WAIT1: 859 inet_sk_state_store(sk, TCP_CLOSING); 860 break; 861 case TCP_FIN_WAIT2: 862 inet_sk_state_store(sk, TCP_CLOSE); 863 break; 864 default: 865 return; 866 } 867 mptcp_close_wake_up(sk); 868 } 869 870 static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) 871 { 872 struct mptcp_subflow_context *subflow; 873 struct sock *sk = (struct sock *)msk; 874 875 sock_owned_by_me(sk); 876 877 mptcp_for_each_subflow(msk, subflow) { 878 if (READ_ONCE(subflow->data_avail)) 879 return mptcp_subflow_tcp_sock(subflow); 880 } 881 882 return NULL; 883 } 884 885 static bool mptcp_skb_can_collapse_to(u64 write_seq, 886 const struct sk_buff *skb, 887 const struct mptcp_ext *mpext) 888 { 889 if (!tcp_skb_can_collapse_to(skb)) 890 return false; 891 892 /* can collapse only if MPTCP level sequence is in order and this 893 * mapping has not been xmitted yet 894 */ 895 return mpext && mpext->data_seq + mpext->data_len == write_seq && 896 !mpext->frozen; 897 } 898 899 /* we can append data to the given data frag if: 900 * - there is space available in the backing page_frag 901 * - the data frag tail matches the current page_frag free offset 902 * - the data frag end sequence number matches the current write seq 903 */ 904 static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, 905 const struct page_frag *pfrag, 906 const struct mptcp_data_frag *df) 907 { 908 return df && pfrag->page == df->page && 909 pfrag->size - pfrag->offset > 0 && 910 pfrag->offset == (df->offset + df->data_len) && 911 df->data_seq + df->data_len == msk->write_seq; 912 } 913 914 static int mptcp_wmem_with_overhead(int size) 915 { 916 return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); 917 } 918 919 static void __mptcp_wmem_reserve(struct sock *sk, int size) 920 { 921 int amount = mptcp_wmem_with_overhead(size); 922 struct mptcp_sock *msk = mptcp_sk(sk); 923 924 WARN_ON_ONCE(msk->wmem_reserved); 925 if (WARN_ON_ONCE(amount < 0)) 926 amount = 0; 927 928 if (amount <= sk->sk_forward_alloc) 929 goto reserve; 930 931 /* under memory pressure try to reserve at most a single page 932 * otherwise try to reserve the full estimate and fallback 933 * to a single page before entering the error path 934 */ 935 if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) || 936 !sk_wmem_schedule(sk, amount)) { 937 if (amount <= PAGE_SIZE) 938 goto nomem; 939 940 amount = PAGE_SIZE; 941 if (!sk_wmem_schedule(sk, amount)) 942 goto nomem; 943 } 944 945 reserve: 946 msk->wmem_reserved = amount; 947 sk->sk_forward_alloc -= amount; 948 
return; 949 950 nomem: 951 /* we will wait for memory on next allocation */ 952 msk->wmem_reserved = -1; 953 } 954 955 static void __mptcp_update_wmem(struct sock *sk) 956 { 957 struct mptcp_sock *msk = mptcp_sk(sk); 958 959 lockdep_assert_held_once(&sk->sk_lock.slock); 960 961 if (!msk->wmem_reserved) 962 return; 963 964 if (msk->wmem_reserved < 0) 965 msk->wmem_reserved = 0; 966 if (msk->wmem_reserved > 0) { 967 sk->sk_forward_alloc += msk->wmem_reserved; 968 msk->wmem_reserved = 0; 969 } 970 } 971 972 static bool mptcp_wmem_alloc(struct sock *sk, int size) 973 { 974 struct mptcp_sock *msk = mptcp_sk(sk); 975 976 /* check for pre-existing error condition */ 977 if (msk->wmem_reserved < 0) 978 return false; 979 980 if (msk->wmem_reserved >= size) 981 goto account; 982 983 mptcp_data_lock(sk); 984 if (!sk_wmem_schedule(sk, size)) { 985 mptcp_data_unlock(sk); 986 return false; 987 } 988 989 sk->sk_forward_alloc -= size; 990 msk->wmem_reserved += size; 991 mptcp_data_unlock(sk); 992 993 account: 994 msk->wmem_reserved -= size; 995 return true; 996 } 997 998 static void mptcp_wmem_uncharge(struct sock *sk, int size) 999 { 1000 struct mptcp_sock *msk = mptcp_sk(sk); 1001 1002 if (msk->wmem_reserved < 0) 1003 msk->wmem_reserved = 0; 1004 msk->wmem_reserved += size; 1005 } 1006 1007 static void __mptcp_mem_reclaim_partial(struct sock *sk) 1008 { 1009 lockdep_assert_held_once(&sk->sk_lock.slock); 1010 __mptcp_update_wmem(sk); 1011 sk_mem_reclaim_partial(sk); 1012 } 1013 1014 static void mptcp_mem_reclaim_partial(struct sock *sk) 1015 { 1016 struct mptcp_sock *msk = mptcp_sk(sk); 1017 1018 /* if we are experiencing a transint allocation error, 1019 * the forward allocation memory has been already 1020 * released 1021 */ 1022 if (msk->wmem_reserved < 0) 1023 return; 1024 1025 mptcp_data_lock(sk); 1026 sk->sk_forward_alloc += msk->wmem_reserved; 1027 sk_mem_reclaim_partial(sk); 1028 msk->wmem_reserved = sk->sk_forward_alloc; 1029 sk->sk_forward_alloc = 0; 1030 mptcp_data_unlock(sk); 1031 } 1032 1033 static void dfrag_uncharge(struct sock *sk, int len) 1034 { 1035 sk_mem_uncharge(sk, len); 1036 sk_wmem_queued_add(sk, -len); 1037 } 1038 1039 static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) 1040 { 1041 int len = dfrag->data_len + dfrag->overhead; 1042 1043 list_del(&dfrag->list); 1044 dfrag_uncharge(sk, len); 1045 put_page(dfrag->page); 1046 } 1047 1048 static void __mptcp_clean_una(struct sock *sk) 1049 { 1050 struct mptcp_sock *msk = mptcp_sk(sk); 1051 struct mptcp_data_frag *dtmp, *dfrag; 1052 bool cleaned = false; 1053 u64 snd_una; 1054 1055 /* on fallback we just need to ignore snd_una, as this is really 1056 * plain TCP 1057 */ 1058 if (__mptcp_check_fallback(msk)) 1059 msk->snd_una = READ_ONCE(msk->snd_nxt); 1060 1061 snd_una = msk->snd_una; 1062 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { 1063 if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) 1064 break; 1065 1066 if (unlikely(dfrag == msk->first_pending)) { 1067 /* in recovery mode can see ack after the current snd head */ 1068 if (WARN_ON_ONCE(!msk->recovery)) 1069 break; 1070 1071 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1072 } 1073 1074 dfrag_clear(sk, dfrag); 1075 cleaned = true; 1076 } 1077 1078 dfrag = mptcp_rtx_head(sk); 1079 if (dfrag && after64(snd_una, dfrag->data_seq)) { 1080 u64 delta = snd_una - dfrag->data_seq; 1081 1082 /* prevent wrap around in recovery mode */ 1083 if (unlikely(delta > dfrag->already_sent)) { 1084 if (WARN_ON_ONCE(!msk->recovery)) 1085 goto out; 1086 
if (WARN_ON_ONCE(delta > dfrag->data_len)) 1087 goto out; 1088 dfrag->already_sent += delta - dfrag->already_sent; 1089 } 1090 1091 dfrag->data_seq += delta; 1092 dfrag->offset += delta; 1093 dfrag->data_len -= delta; 1094 dfrag->already_sent -= delta; 1095 1096 dfrag_uncharge(sk, delta); 1097 cleaned = true; 1098 } 1099 1100 /* all retransmitted data acked, recovery completed */ 1101 if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) 1102 msk->recovery = false; 1103 1104 out: 1105 if (cleaned && tcp_under_memory_pressure(sk)) 1106 __mptcp_mem_reclaim_partial(sk); 1107 1108 if (snd_una == READ_ONCE(msk->snd_nxt) && 1109 snd_una == READ_ONCE(msk->write_seq)) { 1110 if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) 1111 mptcp_stop_timer(sk); 1112 } else { 1113 mptcp_reset_timer(sk); 1114 } 1115 } 1116 1117 static void __mptcp_clean_una_wakeup(struct sock *sk) 1118 { 1119 lockdep_assert_held_once(&sk->sk_lock.slock); 1120 1121 __mptcp_clean_una(sk); 1122 mptcp_write_space(sk); 1123 } 1124 1125 static void mptcp_clean_una_wakeup(struct sock *sk) 1126 { 1127 mptcp_data_lock(sk); 1128 __mptcp_clean_una_wakeup(sk); 1129 mptcp_data_unlock(sk); 1130 } 1131 1132 static void mptcp_enter_memory_pressure(struct sock *sk) 1133 { 1134 struct mptcp_subflow_context *subflow; 1135 struct mptcp_sock *msk = mptcp_sk(sk); 1136 bool first = true; 1137 1138 sk_stream_moderate_sndbuf(sk); 1139 mptcp_for_each_subflow(msk, subflow) { 1140 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 1141 1142 if (first) 1143 tcp_enter_memory_pressure(ssk); 1144 sk_stream_moderate_sndbuf(ssk); 1145 first = false; 1146 } 1147 } 1148 1149 /* ensure we get enough memory for the frag hdr, beyond some minimal amount of 1150 * data 1151 */ 1152 static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 1153 { 1154 if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), 1155 pfrag, sk->sk_allocation))) 1156 return true; 1157 1158 mptcp_enter_memory_pressure(sk); 1159 return false; 1160 } 1161 1162 static struct mptcp_data_frag * 1163 mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, 1164 int orig_offset) 1165 { 1166 int offset = ALIGN(orig_offset, sizeof(long)); 1167 struct mptcp_data_frag *dfrag; 1168 1169 dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); 1170 dfrag->data_len = 0; 1171 dfrag->data_seq = msk->write_seq; 1172 dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); 1173 dfrag->offset = offset + sizeof(struct mptcp_data_frag); 1174 dfrag->already_sent = 0; 1175 dfrag->page = pfrag->page; 1176 1177 return dfrag; 1178 } 1179 1180 struct mptcp_sendmsg_info { 1181 int mss_now; 1182 int size_goal; 1183 u16 limit; 1184 u16 sent; 1185 unsigned int flags; 1186 bool data_lock_held; 1187 }; 1188 1189 static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, 1190 int avail_size) 1191 { 1192 u64 window_end = mptcp_wnd_end(msk); 1193 1194 if (__mptcp_check_fallback(msk)) 1195 return avail_size; 1196 1197 if (!before64(data_seq + avail_size, window_end)) { 1198 u64 allowed_size = window_end - data_seq; 1199 1200 return min_t(unsigned int, allowed_size, avail_size); 1201 } 1202 1203 return avail_size; 1204 } 1205 1206 static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) 1207 { 1208 struct skb_ext *mpext = __skb_ext_alloc(gfp); 1209 1210 if (!mpext) 1211 return false; 1212 __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); 1213 return true; 1214 } 1215 1216 static struct sk_buff 
*__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) 1217 { 1218 struct sk_buff *skb; 1219 1220 skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); 1221 if (likely(skb)) { 1222 if (likely(__mptcp_add_ext(skb, gfp))) { 1223 skb_reserve(skb, MAX_TCP_HEADER); 1224 skb->reserved_tailroom = skb->end - skb->tail; 1225 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); 1226 return skb; 1227 } 1228 __kfree_skb(skb); 1229 } else { 1230 mptcp_enter_memory_pressure(sk); 1231 } 1232 return NULL; 1233 } 1234 1235 static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) 1236 { 1237 struct sk_buff *skb; 1238 1239 skb = __mptcp_do_alloc_tx_skb(sk, gfp); 1240 if (!skb) 1241 return NULL; 1242 1243 if (likely(sk_wmem_schedule(ssk, skb->truesize))) { 1244 tcp_skb_entail(ssk, skb); 1245 return skb; 1246 } 1247 kfree_skb(skb); 1248 return NULL; 1249 } 1250 1251 static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) 1252 { 1253 gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; 1254 1255 if (unlikely(tcp_under_memory_pressure(sk))) { 1256 if (data_lock_held) 1257 __mptcp_mem_reclaim_partial(sk); 1258 else 1259 mptcp_mem_reclaim_partial(sk); 1260 } 1261 return __mptcp_alloc_tx_skb(sk, ssk, gfp); 1262 } 1263 1264 /* note: this always recompute the csum on the whole skb, even 1265 * if we just appended a single frag. More status info needed 1266 */ 1267 static void mptcp_update_data_checksum(struct sk_buff *skb, int added) 1268 { 1269 struct mptcp_ext *mpext = mptcp_get_ext(skb); 1270 __wsum csum = ~csum_unfold(mpext->csum); 1271 int offset = skb->len - added; 1272 1273 mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); 1274 } 1275 1276 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, 1277 struct mptcp_data_frag *dfrag, 1278 struct mptcp_sendmsg_info *info) 1279 { 1280 u64 data_seq = dfrag->data_seq + info->sent; 1281 int offset = dfrag->offset + info->sent; 1282 struct mptcp_sock *msk = mptcp_sk(sk); 1283 bool zero_window_probe = false; 1284 struct mptcp_ext *mpext = NULL; 1285 bool can_coalesce = false; 1286 bool reuse_skb = true; 1287 struct sk_buff *skb; 1288 size_t copy; 1289 int i; 1290 1291 pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", 1292 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); 1293 1294 if (WARN_ON_ONCE(info->sent > info->limit || 1295 info->limit > dfrag->data_len)) 1296 return 0; 1297 1298 /* compute send limit */ 1299 info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); 1300 copy = info->size_goal; 1301 1302 skb = tcp_write_queue_tail(ssk); 1303 if (skb && copy > skb->len) { 1304 /* Limit the write to the size available in the 1305 * current skb, if any, so that we create at most a new skb. 
1306 * Explicitly tells TCP internals to avoid collapsing on later 1307 * queue management operation, to avoid breaking the ext <-> 1308 * SSN association set here 1309 */ 1310 mpext = skb_ext_find(skb, SKB_EXT_MPTCP); 1311 if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { 1312 TCP_SKB_CB(skb)->eor = 1; 1313 goto alloc_skb; 1314 } 1315 1316 i = skb_shinfo(skb)->nr_frags; 1317 can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); 1318 if (!can_coalesce && i >= sysctl_max_skb_frags) { 1319 tcp_mark_push(tcp_sk(ssk), skb); 1320 goto alloc_skb; 1321 } 1322 1323 copy -= skb->len; 1324 } else { 1325 alloc_skb: 1326 skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); 1327 if (!skb) 1328 return -ENOMEM; 1329 1330 i = skb_shinfo(skb)->nr_frags; 1331 reuse_skb = false; 1332 mpext = skb_ext_find(skb, SKB_EXT_MPTCP); 1333 } 1334 1335 /* Zero window and all data acked? Probe. */ 1336 copy = mptcp_check_allowed_size(msk, data_seq, copy); 1337 if (copy == 0) { 1338 u64 snd_una = READ_ONCE(msk->snd_una); 1339 1340 if (snd_una != msk->snd_nxt) { 1341 tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk)); 1342 return 0; 1343 } 1344 1345 zero_window_probe = true; 1346 data_seq = snd_una - 1; 1347 copy = 1; 1348 1349 /* all mptcp-level data is acked, no skbs should be present into the 1350 * ssk write queue 1351 */ 1352 WARN_ON_ONCE(reuse_skb); 1353 } 1354 1355 copy = min_t(size_t, copy, info->limit - info->sent); 1356 if (!sk_wmem_schedule(ssk, copy)) { 1357 tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk)); 1358 return -ENOMEM; 1359 } 1360 1361 if (can_coalesce) { 1362 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1363 } else { 1364 get_page(dfrag->page); 1365 skb_fill_page_desc(skb, i, dfrag->page, offset, copy); 1366 } 1367 1368 skb->len += copy; 1369 skb->data_len += copy; 1370 skb->truesize += copy; 1371 sk_wmem_queued_add(ssk, copy); 1372 sk_mem_charge(ssk, copy); 1373 skb->ip_summed = CHECKSUM_PARTIAL; 1374 WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); 1375 TCP_SKB_CB(skb)->end_seq += copy; 1376 tcp_skb_pcount_set(skb, 0); 1377 1378 /* on skb reuse we just need to update the DSS len */ 1379 if (reuse_skb) { 1380 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 1381 mpext->data_len += copy; 1382 WARN_ON_ONCE(zero_window_probe); 1383 goto out; 1384 } 1385 1386 memset(mpext, 0, sizeof(*mpext)); 1387 mpext->data_seq = data_seq; 1388 mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; 1389 mpext->data_len = copy; 1390 mpext->use_map = 1; 1391 mpext->dsn64 = 1; 1392 1393 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", 1394 mpext->data_seq, mpext->subflow_seq, mpext->data_len, 1395 mpext->dsn64); 1396 1397 if (zero_window_probe) { 1398 mptcp_subflow_ctx(ssk)->rel_write_seq += copy; 1399 mpext->frozen = 1; 1400 if (READ_ONCE(msk->csum_enabled)) 1401 mptcp_update_data_checksum(skb, copy); 1402 tcp_push_pending_frames(ssk); 1403 return 0; 1404 } 1405 out: 1406 if (READ_ONCE(msk->csum_enabled)) 1407 mptcp_update_data_checksum(skb, copy); 1408 mptcp_subflow_ctx(ssk)->rel_write_seq += copy; 1409 return copy; 1410 } 1411 1412 #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ 1413 sizeof(struct tcphdr) - \ 1414 MAX_TCP_OPTION_SPACE - \ 1415 sizeof(struct ipv6hdr) - \ 1416 sizeof(struct frag_hdr)) 1417 1418 struct subflow_send_info { 1419 struct sock *ssk; 1420 u64 ratio; 1421 }; 1422 1423 void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) 1424 { 1425 if (!subflow->stale) 1426 return; 1427 1428 subflow->stale = 0; 1429 
MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); 1430 } 1431 1432 bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) 1433 { 1434 if (unlikely(subflow->stale)) { 1435 u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); 1436 1437 if (subflow->stale_rcv_tstamp == rcv_tstamp) 1438 return false; 1439 1440 mptcp_subflow_set_active(subflow); 1441 } 1442 return __mptcp_subflow_active(subflow); 1443 } 1444 1445 /* implement the mptcp packet scheduler; 1446 * returns the subflow that will transmit the next DSS 1447 * additionally updates the rtx timeout 1448 */ 1449 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) 1450 { 1451 struct subflow_send_info send_info[2]; 1452 struct mptcp_subflow_context *subflow; 1453 struct sock *sk = (struct sock *)msk; 1454 int i, nr_active = 0; 1455 struct sock *ssk; 1456 long tout = 0; 1457 u64 ratio; 1458 u32 pace; 1459 1460 sock_owned_by_me(sk); 1461 1462 if (__mptcp_check_fallback(msk)) { 1463 if (!msk->first) 1464 return NULL; 1465 return sk_stream_memory_free(msk->first) ? msk->first : NULL; 1466 } 1467 1468 /* re-use last subflow, if the burst allow that */ 1469 if (msk->last_snd && msk->snd_burst > 0 && 1470 sk_stream_memory_free(msk->last_snd) && 1471 mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { 1472 mptcp_set_timeout(sk); 1473 return msk->last_snd; 1474 } 1475 1476 /* pick the subflow with the lower wmem/wspace ratio */ 1477 for (i = 0; i < 2; ++i) { 1478 send_info[i].ssk = NULL; 1479 send_info[i].ratio = -1; 1480 } 1481 mptcp_for_each_subflow(msk, subflow) { 1482 trace_mptcp_subflow_get_send(subflow); 1483 ssk = mptcp_subflow_tcp_sock(subflow); 1484 if (!mptcp_subflow_active(subflow)) 1485 continue; 1486 1487 tout = max(tout, mptcp_timeout_from_subflow(subflow)); 1488 nr_active += !subflow->backup; 1489 if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) 1490 continue; 1491 1492 pace = READ_ONCE(ssk->sk_pacing_rate); 1493 if (!pace) 1494 continue; 1495 1496 ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, 1497 pace); 1498 if (ratio < send_info[subflow->backup].ratio) { 1499 send_info[subflow->backup].ssk = ssk; 1500 send_info[subflow->backup].ratio = ratio; 1501 } 1502 } 1503 __mptcp_set_timeout(sk, tout); 1504 1505 /* pick the best backup if no other subflow is active */ 1506 if (!nr_active) 1507 send_info[0].ssk = send_info[1].ssk; 1508 1509 if (send_info[0].ssk) { 1510 msk->last_snd = send_info[0].ssk; 1511 msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, 1512 tcp_sk(msk->last_snd)->snd_wnd); 1513 return msk->last_snd; 1514 } 1515 1516 return NULL; 1517 } 1518 1519 static void mptcp_push_release(struct sock *sk, struct sock *ssk, 1520 struct mptcp_sendmsg_info *info) 1521 { 1522 tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); 1523 release_sock(ssk); 1524 } 1525 1526 static void mptcp_update_post_push(struct mptcp_sock *msk, 1527 struct mptcp_data_frag *dfrag, 1528 u32 sent) 1529 { 1530 u64 snd_nxt_new = dfrag->data_seq; 1531 1532 dfrag->already_sent += sent; 1533 1534 msk->snd_burst -= sent; 1535 1536 snd_nxt_new += dfrag->already_sent; 1537 1538 /* snd_nxt_new can be smaller than snd_nxt in case mptcp 1539 * is recovering after a failover. In that event, this re-sends 1540 * old segments. 
1541 * 1542 * Thus compute snd_nxt_new candidate based on 1543 * the dfrag->data_seq that was sent and the data 1544 * that has been handed to the subflow for transmission 1545 * and skip update in case it was old dfrag. 1546 */ 1547 if (likely(after64(snd_nxt_new, msk->snd_nxt))) 1548 msk->snd_nxt = snd_nxt_new; 1549 } 1550 1551 static void mptcp_check_and_set_pending(struct sock *sk) 1552 { 1553 if (mptcp_send_head(sk) && 1554 !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) 1555 set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); 1556 } 1557 1558 void __mptcp_push_pending(struct sock *sk, unsigned int flags) 1559 { 1560 struct sock *prev_ssk = NULL, *ssk = NULL; 1561 struct mptcp_sock *msk = mptcp_sk(sk); 1562 struct mptcp_sendmsg_info info = { 1563 .flags = flags, 1564 }; 1565 struct mptcp_data_frag *dfrag; 1566 int len, copied = 0; 1567 1568 while ((dfrag = mptcp_send_head(sk))) { 1569 info.sent = dfrag->already_sent; 1570 info.limit = dfrag->data_len; 1571 len = dfrag->data_len - dfrag->already_sent; 1572 while (len > 0) { 1573 int ret = 0; 1574 1575 prev_ssk = ssk; 1576 mptcp_flush_join_list(msk); 1577 ssk = mptcp_subflow_get_send(msk); 1578 1579 /* First check. If the ssk has changed since 1580 * the last round, release prev_ssk 1581 */ 1582 if (ssk != prev_ssk && prev_ssk) 1583 mptcp_push_release(sk, prev_ssk, &info); 1584 if (!ssk) 1585 goto out; 1586 1587 /* Need to lock the new subflow only if different 1588 * from the previous one, otherwise we are still 1589 * helding the relevant lock 1590 */ 1591 if (ssk != prev_ssk) 1592 lock_sock(ssk); 1593 1594 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 1595 if (ret <= 0) { 1596 mptcp_push_release(sk, ssk, &info); 1597 goto out; 1598 } 1599 1600 info.sent += ret; 1601 copied += ret; 1602 len -= ret; 1603 1604 mptcp_update_post_push(msk, dfrag, ret); 1605 } 1606 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1607 } 1608 1609 /* at this point we held the socket lock for the last subflow we used */ 1610 if (ssk) 1611 mptcp_push_release(sk, ssk, &info); 1612 1613 out: 1614 /* ensure the rtx timer is running */ 1615 if (!mptcp_timer_pending(sk)) 1616 mptcp_reset_timer(sk); 1617 if (copied) 1618 __mptcp_check_send_data_fin(sk); 1619 } 1620 1621 static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) 1622 { 1623 struct mptcp_sock *msk = mptcp_sk(sk); 1624 struct mptcp_sendmsg_info info = { 1625 .data_lock_held = true, 1626 }; 1627 struct mptcp_data_frag *dfrag; 1628 struct sock *xmit_ssk; 1629 int len, copied = 0; 1630 bool first = true; 1631 1632 info.flags = 0; 1633 while ((dfrag = mptcp_send_head(sk))) { 1634 info.sent = dfrag->already_sent; 1635 info.limit = dfrag->data_len; 1636 len = dfrag->data_len - dfrag->already_sent; 1637 while (len > 0) { 1638 int ret = 0; 1639 1640 /* the caller already invoked the packet scheduler, 1641 * check for a different subflow usage only after 1642 * spooling the first chunk of data 1643 */ 1644 xmit_ssk = first ? 
ssk : mptcp_subflow_get_send(mptcp_sk(sk)); 1645 if (!xmit_ssk) 1646 goto out; 1647 if (xmit_ssk != ssk) { 1648 mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk)); 1649 goto out; 1650 } 1651 1652 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 1653 if (ret <= 0) 1654 goto out; 1655 1656 info.sent += ret; 1657 copied += ret; 1658 len -= ret; 1659 first = false; 1660 1661 mptcp_update_post_push(msk, dfrag, ret); 1662 } 1663 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1664 } 1665 1666 out: 1667 /* __mptcp_alloc_tx_skb could have released some wmem and we are 1668 * not going to flush it via release_sock() 1669 */ 1670 __mptcp_update_wmem(sk); 1671 if (copied) { 1672 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 1673 info.size_goal); 1674 if (!mptcp_timer_pending(sk)) 1675 mptcp_reset_timer(sk); 1676 1677 if (msk->snd_data_fin_enable && 1678 msk->snd_nxt + 1 == msk->write_seq) 1679 mptcp_schedule_work(sk); 1680 } 1681 } 1682 1683 static void mptcp_set_nospace(struct sock *sk) 1684 { 1685 /* enable autotune */ 1686 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1687 1688 /* will be cleared on avail space */ 1689 set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); 1690 } 1691 1692 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) 1693 { 1694 struct mptcp_sock *msk = mptcp_sk(sk); 1695 struct page_frag *pfrag; 1696 size_t copied = 0; 1697 int ret = 0; 1698 long timeo; 1699 1700 /* we don't support FASTOPEN yet */ 1701 if (msg->msg_flags & MSG_FASTOPEN) 1702 return -EOPNOTSUPP; 1703 1704 /* silently ignore everything else */ 1705 msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; 1706 1707 mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); 1708 1709 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1710 1711 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { 1712 ret = sk_stream_wait_connect(sk, &timeo); 1713 if (ret) 1714 goto out; 1715 } 1716 1717 pfrag = sk_page_frag(sk); 1718 1719 while (msg_data_left(msg)) { 1720 int total_ts, frag_truesize = 0; 1721 struct mptcp_data_frag *dfrag; 1722 bool dfrag_collapsed; 1723 size_t psize, offset; 1724 1725 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { 1726 ret = -EPIPE; 1727 goto out; 1728 } 1729 1730 /* reuse tail pfrag, if possible, or carve a new one from the 1731 * page allocator 1732 */ 1733 dfrag = mptcp_pending_tail(sk); 1734 dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); 1735 if (!dfrag_collapsed) { 1736 if (!sk_stream_memory_free(sk)) 1737 goto wait_for_memory; 1738 1739 if (!mptcp_page_frag_refill(sk, pfrag)) 1740 goto wait_for_memory; 1741 1742 dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); 1743 frag_truesize = dfrag->overhead; 1744 } 1745 1746 /* we do not bound vs wspace, to allow a single packet. 
1747 * memory accounting will prevent execessive memory usage 1748 * anyway 1749 */ 1750 offset = dfrag->offset + dfrag->data_len; 1751 psize = pfrag->size - offset; 1752 psize = min_t(size_t, psize, msg_data_left(msg)); 1753 total_ts = psize + frag_truesize; 1754 1755 if (!mptcp_wmem_alloc(sk, total_ts)) 1756 goto wait_for_memory; 1757 1758 if (copy_page_from_iter(dfrag->page, offset, psize, 1759 &msg->msg_iter) != psize) { 1760 mptcp_wmem_uncharge(sk, psize + frag_truesize); 1761 ret = -EFAULT; 1762 goto out; 1763 } 1764 1765 /* data successfully copied into the write queue */ 1766 copied += psize; 1767 dfrag->data_len += psize; 1768 frag_truesize += psize; 1769 pfrag->offset += frag_truesize; 1770 WRITE_ONCE(msk->write_seq, msk->write_seq + psize); 1771 1772 /* charge data on mptcp pending queue to the msk socket 1773 * Note: we charge such data both to sk and ssk 1774 */ 1775 sk_wmem_queued_add(sk, frag_truesize); 1776 if (!dfrag_collapsed) { 1777 get_page(dfrag->page); 1778 list_add_tail(&dfrag->list, &msk->rtx_queue); 1779 if (!msk->first_pending) 1780 WRITE_ONCE(msk->first_pending, dfrag); 1781 } 1782 pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk, 1783 dfrag->data_seq, dfrag->data_len, dfrag->already_sent, 1784 !dfrag_collapsed); 1785 1786 continue; 1787 1788 wait_for_memory: 1789 mptcp_set_nospace(sk); 1790 __mptcp_push_pending(sk, msg->msg_flags); 1791 ret = sk_stream_wait_memory(sk, &timeo); 1792 if (ret) 1793 goto out; 1794 } 1795 1796 if (copied) 1797 __mptcp_push_pending(sk, msg->msg_flags); 1798 1799 out: 1800 release_sock(sk); 1801 return copied ? : ret; 1802 } 1803 1804 static void mptcp_wait_data(struct sock *sk, long *timeo) 1805 { 1806 DEFINE_WAIT_FUNC(wait, woken_wake_function); 1807 struct mptcp_sock *msk = mptcp_sk(sk); 1808 1809 add_wait_queue(sk_sleep(sk), &wait); 1810 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 1811 1812 sk_wait_event(sk, timeo, 1813 test_bit(MPTCP_DATA_READY, &msk->flags), &wait); 1814 1815 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 1816 remove_wait_queue(sk_sleep(sk), &wait); 1817 } 1818 1819 static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, 1820 struct msghdr *msg, 1821 size_t len, int flags, 1822 struct scm_timestamping_internal *tss, 1823 int *cmsg_flags) 1824 { 1825 struct sk_buff *skb, *tmp; 1826 int copied = 0; 1827 1828 skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { 1829 u32 offset = MPTCP_SKB_CB(skb)->offset; 1830 u32 data_len = skb->len - offset; 1831 u32 count = min_t(size_t, len - copied, data_len); 1832 int err; 1833 1834 if (!(flags & MSG_TRUNC)) { 1835 err = skb_copy_datagram_msg(skb, offset, msg, count); 1836 if (unlikely(err < 0)) { 1837 if (!copied) 1838 return err; 1839 break; 1840 } 1841 } 1842 1843 if (MPTCP_SKB_CB(skb)->has_rxtstamp) { 1844 tcp_update_recv_tstamps(skb, tss); 1845 *cmsg_flags |= MPTCP_CMSG_TS; 1846 } 1847 1848 copied += count; 1849 1850 if (count < data_len) { 1851 if (!(flags & MSG_PEEK)) 1852 MPTCP_SKB_CB(skb)->offset += count; 1853 break; 1854 } 1855 1856 if (!(flags & MSG_PEEK)) { 1857 /* we will bulk release the skb memory later */ 1858 skb->destructor = NULL; 1859 WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); 1860 __skb_unlink(skb, &msk->receive_queue); 1861 __kfree_skb(skb); 1862 } 1863 1864 if (copied >= len) 1865 break; 1866 } 1867 1868 return copied; 1869 } 1870 1871 /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. 1872 * 1873 * Only difference: Use highest rtt estimate of the subflows in use. 
1874 */ 1875 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) 1876 { 1877 struct mptcp_subflow_context *subflow; 1878 struct sock *sk = (struct sock *)msk; 1879 u32 time, advmss = 1; 1880 u64 rtt_us, mstamp; 1881 1882 sock_owned_by_me(sk); 1883 1884 if (copied <= 0) 1885 return; 1886 1887 msk->rcvq_space.copied += copied; 1888 1889 mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); 1890 time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); 1891 1892 rtt_us = msk->rcvq_space.rtt_us; 1893 if (rtt_us && time < (rtt_us >> 3)) 1894 return; 1895 1896 rtt_us = 0; 1897 mptcp_for_each_subflow(msk, subflow) { 1898 const struct tcp_sock *tp; 1899 u64 sf_rtt_us; 1900 u32 sf_advmss; 1901 1902 tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); 1903 1904 sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); 1905 sf_advmss = READ_ONCE(tp->advmss); 1906 1907 rtt_us = max(sf_rtt_us, rtt_us); 1908 advmss = max(sf_advmss, advmss); 1909 } 1910 1911 msk->rcvq_space.rtt_us = rtt_us; 1912 if (time < (rtt_us >> 3) || rtt_us == 0) 1913 return; 1914 1915 if (msk->rcvq_space.copied <= msk->rcvq_space.space) 1916 goto new_measure; 1917 1918 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && 1919 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 1920 int rcvmem, rcvbuf; 1921 u64 rcvwin, grow; 1922 1923 rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; 1924 1925 grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); 1926 1927 do_div(grow, msk->rcvq_space.space); 1928 rcvwin += (grow << 1); 1929 1930 rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); 1931 while (tcp_win_from_space(sk, rcvmem) < advmss) 1932 rcvmem += 128; 1933 1934 do_div(rcvwin, advmss); 1935 rcvbuf = min_t(u64, rcvwin * rcvmem, 1936 sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); 1937 1938 if (rcvbuf > sk->sk_rcvbuf) { 1939 u32 window_clamp; 1940 1941 window_clamp = tcp_win_from_space(sk, rcvbuf); 1942 WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 1943 1944 /* Make subflows follow along. If we do not do this, we 1945 * get drops at subflow level if skbs can't be moved to 1946 * the mptcp rx queue fast enough (announced rcv_win can 1947 * exceed ssk->sk_rcvbuf). 
1948 */ 1949 mptcp_for_each_subflow(msk, subflow) { 1950 struct sock *ssk; 1951 bool slow; 1952 1953 ssk = mptcp_subflow_tcp_sock(subflow); 1954 slow = lock_sock_fast(ssk); 1955 WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); 1956 tcp_sk(ssk)->window_clamp = window_clamp; 1957 tcp_cleanup_rbuf(ssk, 1); 1958 unlock_sock_fast(ssk, slow); 1959 } 1960 } 1961 } 1962 1963 msk->rcvq_space.space = msk->rcvq_space.copied; 1964 new_measure: 1965 msk->rcvq_space.copied = 0; 1966 msk->rcvq_space.time = mstamp; 1967 } 1968 1969 static void __mptcp_update_rmem(struct sock *sk) 1970 { 1971 struct mptcp_sock *msk = mptcp_sk(sk); 1972 1973 if (!msk->rmem_released) 1974 return; 1975 1976 atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); 1977 sk_mem_uncharge(sk, msk->rmem_released); 1978 WRITE_ONCE(msk->rmem_released, 0); 1979 } 1980 1981 static void __mptcp_splice_receive_queue(struct sock *sk) 1982 { 1983 struct mptcp_sock *msk = mptcp_sk(sk); 1984 1985 skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); 1986 } 1987 1988 static bool __mptcp_move_skbs(struct mptcp_sock *msk) 1989 { 1990 struct sock *sk = (struct sock *)msk; 1991 unsigned int moved = 0; 1992 bool ret, done; 1993 1994 mptcp_flush_join_list(msk); 1995 do { 1996 struct sock *ssk = mptcp_subflow_recv_lookup(msk); 1997 bool slowpath; 1998 1999 /* we can have data pending in the subflows only if the msk 2000 * receive buffer was full at subflow_data_ready() time, 2001 * that is an unlikely slow path. 2002 */ 2003 if (likely(!ssk)) 2004 break; 2005 2006 slowpath = lock_sock_fast(ssk); 2007 mptcp_data_lock(sk); 2008 __mptcp_update_rmem(sk); 2009 done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 2010 mptcp_data_unlock(sk); 2011 2012 if (unlikely(ssk->sk_err)) 2013 __mptcp_error_report(sk); 2014 unlock_sock_fast(ssk, slowpath); 2015 } while (!done); 2016 2017 /* acquire the data lock only if some input data is pending */ 2018 ret = moved > 0; 2019 if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || 2020 !skb_queue_empty_lockless(&sk->sk_receive_queue)) { 2021 mptcp_data_lock(sk); 2022 __mptcp_update_rmem(sk); 2023 ret |= __mptcp_ofo_queue(msk); 2024 __mptcp_splice_receive_queue(sk); 2025 mptcp_data_unlock(sk); 2026 } 2027 if (ret) 2028 mptcp_check_data_fin((struct sock *)msk); 2029 return !skb_queue_empty(&msk->receive_queue); 2030 } 2031 2032 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 2033 int nonblock, int flags, int *addr_len) 2034 { 2035 struct mptcp_sock *msk = mptcp_sk(sk); 2036 struct scm_timestamping_internal tss; 2037 int copied = 0, cmsg_flags = 0; 2038 int target; 2039 long timeo; 2040 2041 /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ 2042 if (unlikely(flags & MSG_ERRQUEUE)) 2043 return inet_recv_error(sk, msg, len, addr_len); 2044 2045 mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); 2046 if (unlikely(sk->sk_state == TCP_LISTEN)) { 2047 copied = -ENOTCONN; 2048 goto out_err; 2049 } 2050 2051 timeo = sock_rcvtimeo(sk, nonblock); 2052 2053 len = min_t(size_t, len, INT_MAX); 2054 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 2055 2056 while (copied < len) { 2057 int bytes_read; 2058 2059 bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); 2060 if (unlikely(bytes_read < 0)) { 2061 if (!copied) 2062 copied = bytes_read; 2063 goto out_err; 2064 } 2065 2066 copied += bytes_read; 2067 2068 /* be sure to advertise window change */ 2069 mptcp_cleanup_rbuf(msk); 2070 2071 if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) 
2072 continue; 2073 2074 /* only the master socket status is relevant here. The exit 2075 * conditions mirror closely tcp_recvmsg() 2076 */ 2077 if (copied >= target) 2078 break; 2079 2080 if (copied) { 2081 if (sk->sk_err || 2082 sk->sk_state == TCP_CLOSE || 2083 (sk->sk_shutdown & RCV_SHUTDOWN) || 2084 !timeo || 2085 signal_pending(current)) 2086 break; 2087 } else { 2088 if (sk->sk_err) { 2089 copied = sock_error(sk); 2090 break; 2091 } 2092 2093 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) 2094 mptcp_check_for_eof(msk); 2095 2096 if (sk->sk_shutdown & RCV_SHUTDOWN) { 2097 /* race breaker: the shutdown could be after the 2098 * previous receive queue check 2099 */ 2100 if (__mptcp_move_skbs(msk)) 2101 continue; 2102 break; 2103 } 2104 2105 if (sk->sk_state == TCP_CLOSE) { 2106 copied = -ENOTCONN; 2107 break; 2108 } 2109 2110 if (!timeo) { 2111 copied = -EAGAIN; 2112 break; 2113 } 2114 2115 if (signal_pending(current)) { 2116 copied = sock_intr_errno(timeo); 2117 break; 2118 } 2119 } 2120 2121 pr_debug("block timeout %ld", timeo); 2122 mptcp_wait_data(sk, &timeo); 2123 } 2124 2125 if (skb_queue_empty_lockless(&sk->sk_receive_queue) && 2126 skb_queue_empty(&msk->receive_queue)) { 2127 /* entire backlog drained, clear DATA_READY. */ 2128 clear_bit(MPTCP_DATA_READY, &msk->flags); 2129 2130 /* .. race-breaker: ssk might have gotten new data 2131 * after last __mptcp_move_skbs() returned false. 2132 */ 2133 if (unlikely(__mptcp_move_skbs(msk))) 2134 set_bit(MPTCP_DATA_READY, &msk->flags); 2135 } 2136 2137 out_err: 2138 if (cmsg_flags && copied >= 0) { 2139 if (cmsg_flags & MPTCP_CMSG_TS) 2140 tcp_recv_timestamp(msg, sk, &tss); 2141 } 2142 2143 pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", 2144 msk, test_bit(MPTCP_DATA_READY, &msk->flags), 2145 skb_queue_empty_lockless(&sk->sk_receive_queue), copied); 2146 if (!(flags & MSG_PEEK)) 2147 mptcp_rcv_space_adjust(msk, copied); 2148 2149 release_sock(sk); 2150 return copied; 2151 } 2152 2153 static void mptcp_retransmit_timer(struct timer_list *t) 2154 { 2155 struct inet_connection_sock *icsk = from_timer(icsk, t, 2156 icsk_retransmit_timer); 2157 struct sock *sk = &icsk->icsk_inet.sk; 2158 struct mptcp_sock *msk = mptcp_sk(sk); 2159 2160 bh_lock_sock(sk); 2161 if (!sock_owned_by_user(sk)) { 2162 /* we need a process context to retransmit */ 2163 if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) 2164 mptcp_schedule_work(sk); 2165 } else { 2166 /* delegate our work to tcp_release_cb() */ 2167 set_bit(MPTCP_RETRANSMIT, &msk->flags); 2168 } 2169 bh_unlock_sock(sk); 2170 sock_put(sk); 2171 } 2172 2173 static void mptcp_timeout_timer(struct timer_list *t) 2174 { 2175 struct sock *sk = from_timer(sk, t, sk_timer); 2176 2177 mptcp_schedule_work(sk); 2178 sock_put(sk); 2179 } 2180 2181 /* Find an idle subflow. Return NULL if there is unacked data at tcp 2182 * level. 2183 * 2184 * A backup subflow is returned only if that is the only kind available. 2185 */ 2186 static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) 2187 { 2188 struct sock *backup = NULL, *pick = NULL; 2189 struct mptcp_subflow_context *subflow; 2190 int min_stale_count = INT_MAX; 2191 2192 sock_owned_by_me((const struct sock *)msk); 2193 2194 if (__mptcp_check_fallback(msk)) 2195 return NULL; 2196 2197 mptcp_for_each_subflow(msk, subflow) { 2198 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2199 2200 if (!__mptcp_subflow_active(subflow)) 2201 continue; 2202 2203 /* still data outstanding at TCP level? 
		 * skip this */
		if (!tcp_rtx_and_write_queues_empty(ssk)) {
			mptcp_pm_subflow_chk_stale(msk, ssk);
			min_stale_count = min_t(int, min_stale_count, subflow->stale_count);
			continue;
		}

		if (subflow->backup) {
			if (!backup)
				backup = ssk;
			continue;
		}

		if (!pick)
			pick = ssk;
	}

	if (pick)
		return pick;

	/* use a backup subflow only if there is no progress anywhere */
	return min_stale_count > 1 ? backup : NULL;
}

static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
{
	if (msk->subflow) {
		iput(SOCK_INODE(msk->subflow));
		msk->subflow = NULL;
	}
}

bool __mptcp_retransmit_pending_data(struct sock *sk)
{
	struct mptcp_data_frag *cur, *rtx_head;
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (__mptcp_check_fallback(mptcp_sk(sk)))
		return false;

	if (tcp_rtx_and_write_queues_empty(sk))
		return false;

	/* the closing socket has some data untransmitted and/or unacked:
	 * some data in the mptcp rtx queue has not really been transmitted yet.
	 * keep it simple and re-inject the whole mptcp level rtx queue
	 */
	mptcp_data_lock(sk);
	__mptcp_clean_una_wakeup(sk);
	rtx_head = mptcp_rtx_head(sk);
	if (!rtx_head) {
		mptcp_data_unlock(sk);
		return false;
	}

	msk->recovery_snd_nxt = msk->snd_nxt;
	msk->recovery = true;
	mptcp_data_unlock(sk);

	msk->first_pending = rtx_head;
	msk->snd_burst = 0;

	/* be sure to clear the "sent status" on all re-injected fragments */
	list_for_each_entry(cur, &msk->rtx_queue, list) {
		if (!cur->already_sent)
			break;
		cur->already_sent = 0;
	}

	return true;
}

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
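 * __mptcp_close_ssk() below copes with both flavours: if the TCP stack
 * already disposed of the subflow context via tcp_done()/tcp_cleanup_ulp(),
 * the subflow context is freed and only the msk-owned reference on the ssk
 * is dropped, otherwise __tcp_close() tears the subflow down.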
2282 */ 2283 static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, 2284 struct mptcp_subflow_context *subflow) 2285 { 2286 struct mptcp_sock *msk = mptcp_sk(sk); 2287 bool need_push; 2288 2289 list_del(&subflow->node); 2290 2291 lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 2292 2293 /* if we are invoked by the msk cleanup code, the subflow is 2294 * already orphaned 2295 */ 2296 if (ssk->sk_socket) 2297 sock_orphan(ssk); 2298 2299 need_push = __mptcp_retransmit_pending_data(sk); 2300 subflow->disposable = 1; 2301 2302 /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops 2303 * the ssk has been already destroyed, we just need to release the 2304 * reference owned by msk; 2305 */ 2306 if (!inet_csk(ssk)->icsk_ulp_ops) { 2307 kfree_rcu(subflow, rcu); 2308 } else { 2309 /* otherwise tcp will dispose of the ssk and subflow ctx */ 2310 __tcp_close(ssk, 0); 2311 2312 /* close acquired an extra ref */ 2313 __sock_put(ssk); 2314 } 2315 release_sock(ssk); 2316 2317 sock_put(ssk); 2318 2319 if (ssk == msk->last_snd) 2320 msk->last_snd = NULL; 2321 2322 if (ssk == msk->first) 2323 msk->first = NULL; 2324 2325 if (msk->subflow && ssk == msk->subflow->sk) 2326 mptcp_dispose_initial_subflow(msk); 2327 2328 if (need_push) 2329 __mptcp_push_pending(sk, 0); 2330 } 2331 2332 void mptcp_close_ssk(struct sock *sk, struct sock *ssk, 2333 struct mptcp_subflow_context *subflow) 2334 { 2335 if (sk->sk_state == TCP_ESTABLISHED) 2336 mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); 2337 __mptcp_close_ssk(sk, ssk, subflow); 2338 } 2339 2340 static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) 2341 { 2342 return 0; 2343 } 2344 2345 static void __mptcp_close_subflow(struct mptcp_sock *msk) 2346 { 2347 struct mptcp_subflow_context *subflow, *tmp; 2348 2349 might_sleep(); 2350 2351 list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { 2352 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2353 2354 if (inet_sk_state_load(ssk) != TCP_CLOSE) 2355 continue; 2356 2357 /* 'subflow_data_ready' will re-sched once rx queue is empty */ 2358 if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) 2359 continue; 2360 2361 mptcp_close_ssk((struct sock *)msk, ssk, subflow); 2362 } 2363 } 2364 2365 static bool mptcp_check_close_timeout(const struct sock *sk) 2366 { 2367 s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; 2368 struct mptcp_subflow_context *subflow; 2369 2370 if (delta >= TCP_TIMEWAIT_LEN) 2371 return true; 2372 2373 /* if all subflows are in closed status don't bother with additional 2374 * timeout 2375 */ 2376 mptcp_for_each_subflow(mptcp_sk(sk), subflow) { 2377 if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) != 2378 TCP_CLOSE) 2379 return false; 2380 } 2381 return true; 2382 } 2383 2384 static void mptcp_check_fastclose(struct mptcp_sock *msk) 2385 { 2386 struct mptcp_subflow_context *subflow, *tmp; 2387 struct sock *sk = &msk->sk.icsk_inet.sk; 2388 2389 if (likely(!READ_ONCE(msk->rcv_fastclose))) 2390 return; 2391 2392 mptcp_token_destroy(msk); 2393 2394 list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { 2395 struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); 2396 bool slow; 2397 2398 slow = lock_sock_fast(tcp_sk); 2399 if (tcp_sk->sk_state != TCP_CLOSE) { 2400 tcp_send_active_reset(tcp_sk, GFP_ATOMIC); 2401 tcp_set_state(tcp_sk, TCP_CLOSE); 2402 } 2403 unlock_sock_fast(tcp_sk, slow); 2404 } 2405 2406 inet_sk_state_store(sk, TCP_CLOSE); 2407 sk->sk_shutdown = SHUTDOWN_MASK; 2408 smp_mb__before_atomic(); /* 
SHUTDOWN must be visible first */ 2409 set_bit(MPTCP_DATA_READY, &msk->flags); 2410 set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); 2411 2412 mptcp_close_wake_up(sk); 2413 } 2414 2415 static void __mptcp_retrans(struct sock *sk) 2416 { 2417 struct mptcp_sock *msk = mptcp_sk(sk); 2418 struct mptcp_sendmsg_info info = {}; 2419 struct mptcp_data_frag *dfrag; 2420 size_t copied = 0; 2421 struct sock *ssk; 2422 int ret; 2423 2424 mptcp_clean_una_wakeup(sk); 2425 2426 /* first check ssk: need to kick "stale" logic */ 2427 ssk = mptcp_subflow_get_retrans(msk); 2428 dfrag = mptcp_rtx_head(sk); 2429 if (!dfrag) { 2430 if (mptcp_data_fin_enabled(msk)) { 2431 struct inet_connection_sock *icsk = inet_csk(sk); 2432 2433 icsk->icsk_retransmits++; 2434 mptcp_set_datafin_timeout(sk); 2435 mptcp_send_ack(msk); 2436 2437 goto reset_timer; 2438 } 2439 2440 if (!mptcp_send_head(sk)) 2441 return; 2442 2443 goto reset_timer; 2444 } 2445 2446 if (!ssk) 2447 goto reset_timer; 2448 2449 lock_sock(ssk); 2450 2451 /* limit retransmission to the bytes already sent on some subflows */ 2452 info.sent = 0; 2453 info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; 2454 while (info.sent < info.limit) { 2455 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 2456 if (ret <= 0) 2457 break; 2458 2459 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); 2460 copied += ret; 2461 info.sent += ret; 2462 } 2463 if (copied) { 2464 dfrag->already_sent = max(dfrag->already_sent, info.sent); 2465 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 2466 info.size_goal); 2467 } 2468 2469 release_sock(ssk); 2470 2471 reset_timer: 2472 mptcp_check_and_set_pending(sk); 2473 2474 if (!mptcp_timer_pending(sk)) 2475 mptcp_reset_timer(sk); 2476 } 2477 2478 static void mptcp_worker(struct work_struct *work) 2479 { 2480 struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); 2481 struct sock *sk = &msk->sk.icsk_inet.sk; 2482 int state; 2483 2484 lock_sock(sk); 2485 state = sk->sk_state; 2486 if (unlikely(state == TCP_CLOSE)) 2487 goto unlock; 2488 2489 mptcp_check_data_fin_ack(sk); 2490 mptcp_flush_join_list(msk); 2491 2492 mptcp_check_fastclose(msk); 2493 2494 if (msk->pm.status) 2495 mptcp_pm_nl_work(msk); 2496 2497 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) 2498 mptcp_check_for_eof(msk); 2499 2500 __mptcp_check_send_data_fin(sk); 2501 mptcp_check_data_fin(sk); 2502 2503 /* There is no point in keeping around an orphaned sk timedout or 2504 * closed, but we need the msk around to reply to incoming DATA_FIN, 2505 * even if it is orphaned and in FIN_WAIT2 state 2506 */ 2507 if (sock_flag(sk, SOCK_DEAD) && 2508 (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { 2509 inet_sk_state_store(sk, TCP_CLOSE); 2510 __mptcp_destroy_sock(sk); 2511 goto unlock; 2512 } 2513 2514 if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 2515 __mptcp_close_subflow(msk); 2516 2517 if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) 2518 __mptcp_retrans(sk); 2519 2520 unlock: 2521 release_sock(sk); 2522 sock_put(sk); 2523 } 2524 2525 static int __mptcp_init_sock(struct sock *sk) 2526 { 2527 struct mptcp_sock *msk = mptcp_sk(sk); 2528 2529 spin_lock_init(&msk->join_list_lock); 2530 2531 INIT_LIST_HEAD(&msk->conn_list); 2532 INIT_LIST_HEAD(&msk->join_list); 2533 INIT_LIST_HEAD(&msk->rtx_queue); 2534 INIT_WORK(&msk->work, mptcp_worker); 2535 __skb_queue_head_init(&msk->receive_queue); 2536 msk->out_of_order_queue = RB_ROOT; 2537 msk->first_pending = NULL; 2538 msk->wmem_reserved = 0; 2539 
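	/* rmem_released accounts rx memory whose uncharge has been deferred;
	 * it is folded back into the msk memory accounting by
	 * __mptcp_update_rmem(), with the msk data lock held (see
	 * __mptcp_move_skbs()).
	 */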
WRITE_ONCE(msk->rmem_released, 0); 2540 msk->timer_ival = TCP_RTO_MIN; 2541 2542 msk->first = NULL; 2543 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; 2544 WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); 2545 msk->recovery = false; 2546 2547 mptcp_pm_data_init(msk); 2548 2549 /* re-use the csk retrans timer for MPTCP-level retrans */ 2550 timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); 2551 timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); 2552 2553 return 0; 2554 } 2555 2556 static int mptcp_init_sock(struct sock *sk) 2557 { 2558 struct inet_connection_sock *icsk = inet_csk(sk); 2559 struct net *net = sock_net(sk); 2560 int ret; 2561 2562 ret = __mptcp_init_sock(sk); 2563 if (ret) 2564 return ret; 2565 2566 if (!mptcp_is_enabled(net)) 2567 return -ENOPROTOOPT; 2568 2569 if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) 2570 return -ENOMEM; 2571 2572 ret = __mptcp_socket_create(mptcp_sk(sk)); 2573 if (ret) 2574 return ret; 2575 2576 /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will 2577 * propagate the correct value 2578 */ 2579 tcp_assign_congestion_control(sk); 2580 strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); 2581 2582 /* no need to keep a reference to the ops, the name will suffice */ 2583 tcp_cleanup_congestion_control(sk); 2584 icsk->icsk_ca_ops = NULL; 2585 2586 sk_sockets_allocated_inc(sk); 2587 sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; 2588 sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; 2589 2590 return 0; 2591 } 2592 2593 static void __mptcp_clear_xmit(struct sock *sk) 2594 { 2595 struct mptcp_sock *msk = mptcp_sk(sk); 2596 struct mptcp_data_frag *dtmp, *dfrag; 2597 2598 WRITE_ONCE(msk->first_pending, NULL); 2599 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) 2600 dfrag_clear(sk, dfrag); 2601 } 2602 2603 static void mptcp_cancel_work(struct sock *sk) 2604 { 2605 struct mptcp_sock *msk = mptcp_sk(sk); 2606 2607 if (cancel_work_sync(&msk->work)) 2608 __sock_put(sk); 2609 } 2610 2611 void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) 2612 { 2613 lock_sock(ssk); 2614 2615 switch (ssk->sk_state) { 2616 case TCP_LISTEN: 2617 if (!(how & RCV_SHUTDOWN)) 2618 break; 2619 fallthrough; 2620 case TCP_SYN_SENT: 2621 tcp_disconnect(ssk, O_NONBLOCK); 2622 break; 2623 default: 2624 if (__mptcp_check_fallback(mptcp_sk(sk))) { 2625 pr_debug("Fallback"); 2626 ssk->sk_shutdown |= how; 2627 tcp_shutdown(ssk, how); 2628 } else { 2629 pr_debug("Sending DATA_FIN on subflow %p", ssk); 2630 tcp_send_ack(ssk); 2631 if (!mptcp_timer_pending(sk)) 2632 mptcp_reset_timer(sk); 2633 } 2634 break; 2635 } 2636 2637 release_sock(ssk); 2638 } 2639 2640 static const unsigned char new_state[16] = { 2641 /* current state: new state: action: */ 2642 [0 /* (Invalid) */] = TCP_CLOSE, 2643 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 2644 [TCP_SYN_SENT] = TCP_CLOSE, 2645 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, 2646 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, 2647 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, 2648 [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ 2649 [TCP_CLOSE] = TCP_CLOSE, 2650 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, 2651 [TCP_LAST_ACK] = TCP_LAST_ACK, 2652 [TCP_LISTEN] = TCP_CLOSE, 2653 [TCP_CLOSING] = TCP_CLOSING, 2654 [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! 
 */
};

static int mptcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	inet_sk_state_store(sk, ns);

	return next & TCP_ACTION_FIN;
}

static void __mptcp_check_send_data_fin(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);

	pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
		 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
		 msk->snd_nxt, msk->write_seq);

	/* skip this if we are not really shutting down yet, or if some data
	 * still needs to be enqueued on the subflows
	 */
	if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
	    mptcp_send_head(sk))
		return;

	WRITE_ONCE(msk->snd_nxt, msk->write_seq);

	/* fallback socket will not get data_fin/ack, can move to the next
	 * state now
	 */
	if (__mptcp_check_fallback(msk)) {
		if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
			inet_sk_state_store(sk, TCP_CLOSE);
			mptcp_close_wake_up(sk);
		} else if (sk->sk_state == TCP_FIN_WAIT1) {
			inet_sk_state_store(sk, TCP_FIN_WAIT2);
		}
	}

	mptcp_flush_join_list(msk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

		mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
	}
}

static void __mptcp_wr_shutdown(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
		 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
		 !!mptcp_send_head(sk));

	/* will be ignored by fallback sockets */
	WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
	WRITE_ONCE(msk->snd_data_fin_enable, 1);

	__mptcp_check_send_data_fin(sk);
}

static void __mptcp_destroy_sock(struct sock *sk)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);

	pr_debug("msk=%p", msk);

	might_sleep();

	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
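	 * Any join racing with us serializes on that lock and will observe a
	 * non-ESTABLISHED msk state, refusing the new subflow.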
2732 */ 2733 spin_lock_bh(&msk->join_list_lock); 2734 list_splice_tail_init(&msk->join_list, &msk->conn_list); 2735 spin_unlock_bh(&msk->join_list_lock); 2736 list_splice_init(&msk->conn_list, &conn_list); 2737 2738 sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); 2739 sk_stop_timer(sk, &sk->sk_timer); 2740 msk->pm.status = 0; 2741 2742 list_for_each_entry_safe(subflow, tmp, &conn_list, node) { 2743 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2744 __mptcp_close_ssk(sk, ssk, subflow); 2745 } 2746 2747 sk->sk_prot->destroy(sk); 2748 2749 WARN_ON_ONCE(msk->wmem_reserved); 2750 WARN_ON_ONCE(msk->rmem_released); 2751 sk_stream_kill_queues(sk); 2752 xfrm_sk_free_policy(sk); 2753 2754 sk_refcnt_debug_release(sk); 2755 mptcp_dispose_initial_subflow(msk); 2756 sock_put(sk); 2757 } 2758 2759 static void mptcp_close(struct sock *sk, long timeout) 2760 { 2761 struct mptcp_subflow_context *subflow; 2762 bool do_cancel_work = false; 2763 2764 lock_sock(sk); 2765 sk->sk_shutdown = SHUTDOWN_MASK; 2766 2767 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { 2768 inet_sk_state_store(sk, TCP_CLOSE); 2769 goto cleanup; 2770 } 2771 2772 if (mptcp_close_state(sk)) 2773 __mptcp_wr_shutdown(sk); 2774 2775 sk_stream_wait_close(sk, timeout); 2776 2777 cleanup: 2778 /* orphan all the subflows */ 2779 inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; 2780 mptcp_for_each_subflow(mptcp_sk(sk), subflow) { 2781 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2782 bool slow = lock_sock_fast_nested(ssk); 2783 2784 sock_orphan(ssk); 2785 unlock_sock_fast(ssk, slow); 2786 } 2787 sock_orphan(sk); 2788 2789 sock_hold(sk); 2790 pr_debug("msk=%p state=%d", sk, sk->sk_state); 2791 if (sk->sk_state == TCP_CLOSE) { 2792 __mptcp_destroy_sock(sk); 2793 do_cancel_work = true; 2794 } else { 2795 sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); 2796 } 2797 release_sock(sk); 2798 if (do_cancel_work) 2799 mptcp_cancel_work(sk); 2800 2801 if (mptcp_sk(sk)->token) 2802 mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); 2803 2804 sock_put(sk); 2805 } 2806 2807 static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) 2808 { 2809 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2810 const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); 2811 struct ipv6_pinfo *msk6 = inet6_sk(msk); 2812 2813 msk->sk_v6_daddr = ssk->sk_v6_daddr; 2814 msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; 2815 2816 if (msk6 && ssk6) { 2817 msk6->saddr = ssk6->saddr; 2818 msk6->flow_label = ssk6->flow_label; 2819 } 2820 #endif 2821 2822 inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; 2823 inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; 2824 inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; 2825 inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; 2826 inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; 2827 inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; 2828 } 2829 2830 static int mptcp_disconnect(struct sock *sk, int flags) 2831 { 2832 struct mptcp_subflow_context *subflow; 2833 struct mptcp_sock *msk = mptcp_sk(sk); 2834 2835 mptcp_do_flush_join_list(msk); 2836 2837 mptcp_for_each_subflow(msk, subflow) { 2838 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2839 2840 lock_sock(ssk); 2841 tcp_disconnect(ssk, flags); 2842 release_sock(ssk); 2843 } 2844 return 0; 2845 } 2846 2847 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2848 static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) 2849 { 2850 unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); 2851 2852 return (struct 
ipv6_pinfo *)(((u8 *)sk) + offset); 2853 } 2854 #endif 2855 2856 struct sock *mptcp_sk_clone(const struct sock *sk, 2857 const struct mptcp_options_received *mp_opt, 2858 struct request_sock *req) 2859 { 2860 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 2861 struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); 2862 struct mptcp_sock *msk; 2863 u64 ack_seq; 2864 2865 if (!nsk) 2866 return NULL; 2867 2868 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2869 if (nsk->sk_family == AF_INET6) 2870 inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); 2871 #endif 2872 2873 __mptcp_init_sock(nsk); 2874 2875 msk = mptcp_sk(nsk); 2876 msk->local_key = subflow_req->local_key; 2877 msk->token = subflow_req->token; 2878 msk->subflow = NULL; 2879 WRITE_ONCE(msk->fully_established, false); 2880 if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) 2881 WRITE_ONCE(msk->csum_enabled, true); 2882 2883 msk->write_seq = subflow_req->idsn + 1; 2884 msk->snd_nxt = msk->write_seq; 2885 msk->snd_una = msk->write_seq; 2886 msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; 2887 msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; 2888 2889 if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) { 2890 msk->can_ack = true; 2891 msk->remote_key = mp_opt->sndr_key; 2892 mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); 2893 ack_seq++; 2894 WRITE_ONCE(msk->ack_seq, ack_seq); 2895 WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); 2896 } 2897 2898 sock_reset_flag(nsk, SOCK_RCU_FREE); 2899 /* will be fully established after successful MPC subflow creation */ 2900 inet_sk_state_store(nsk, TCP_SYN_RECV); 2901 2902 security_inet_csk_clone(nsk, req); 2903 bh_unlock_sock(nsk); 2904 2905 /* keep a single reference */ 2906 __sock_put(nsk); 2907 return nsk; 2908 } 2909 2910 void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) 2911 { 2912 const struct tcp_sock *tp = tcp_sk(ssk); 2913 2914 msk->rcvq_space.copied = 0; 2915 msk->rcvq_space.rtt_us = 0; 2916 2917 msk->rcvq_space.time = tp->tcp_mstamp; 2918 2919 /* initial rcv_space offering made to peer */ 2920 msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, 2921 TCP_INIT_CWND * tp->advmss); 2922 if (msk->rcvq_space.space == 0) 2923 msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; 2924 2925 WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); 2926 } 2927 2928 static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, 2929 bool kern) 2930 { 2931 struct mptcp_sock *msk = mptcp_sk(sk); 2932 struct socket *listener; 2933 struct sock *newsk; 2934 2935 listener = __mptcp_nmpc_socket(msk); 2936 if (WARN_ON_ONCE(!listener)) { 2937 *err = -EINVAL; 2938 return NULL; 2939 } 2940 2941 pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); 2942 newsk = inet_csk_accept(listener->sk, flags, err, kern); 2943 if (!newsk) 2944 return NULL; 2945 2946 pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); 2947 if (sk_is_mptcp(newsk)) { 2948 struct mptcp_subflow_context *subflow; 2949 struct sock *new_mptcp_sock; 2950 2951 subflow = mptcp_subflow_ctx(newsk); 2952 new_mptcp_sock = subflow->conn; 2953 2954 /* is_mptcp should be false if subflow->conn is missing, see 2955 * subflow_syn_recv_sock() 2956 */ 2957 if (WARN_ON_ONCE(!new_mptcp_sock)) { 2958 tcp_sk(newsk)->is_mptcp = 0; 2959 return newsk; 2960 } 2961 2962 /* acquire the 2nd reference for the owning socket */ 2963 sock_hold(new_mptcp_sock); 2964 newsk = new_mptcp_sock; 2965 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); 2966 } else { 2967 MPTCP_INC_STATS(sock_net(sk), 2968 
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

	return newsk;
}

void mptcp_destroy_common(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;

	__mptcp_clear_xmit(sk);

	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);

	skb_rbtree_purge(&msk->out_of_order_queue);
	mptcp_token_destroy(msk);
	mptcp_pm_free_anno_list(msk);
}

static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_destroy_common(msk);
	sk_sockets_allocated_dec(sk);
}

void __mptcp_data_acked(struct sock *sk)
{
	if (!sock_owned_by_user(sk))
		__mptcp_clean_una(sk);
	else
		set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);

	if (mptcp_pending_data_fin_ack(sk))
		mptcp_schedule_work(sk);
}

void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
	if (!mptcp_send_head(sk))
		return;

	if (!sock_owned_by_user(sk)) {
		struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));

		if (xmit_ssk == ssk)
			__mptcp_subflow_push_pending(sk, ssk);
		else if (xmit_ssk)
			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
	} else {
		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
	}
}

/* processes deferred events and flushes wmem */
static void mptcp_release_cb(struct sock *sk)
{
	for (;;) {
		unsigned long flags = 0;

		if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
			flags |= BIT(MPTCP_PUSH_PENDING);
		if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags))
			flags |= BIT(MPTCP_RETRANSMIT);
		if (!flags)
			break;

		/* the following actions acquire the subflow socket lock
		 *
		 * 1) can't be invoked in atomic scope
		 * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
		 *    datapath acquires the msk socket spinlock while holding
		 *    the subflow socket lock
		 */

		spin_unlock_bh(&sk->sk_lock.slock);
		if (flags & BIT(MPTCP_PUSH_PENDING))
			__mptcp_push_pending(sk, 0);
		if (flags & BIT(MPTCP_RETRANSMIT))
			__mptcp_retrans(sk);

		cond_resched();
		spin_lock_bh(&sk->sk_lock.slock);
	}

	/* be sure to set the current sk state before taking actions
	 * depending on sk_state
	 */
	if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags))
		__mptcp_set_connected(sk);
	if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
		__mptcp_clean_una_wakeup(sk);
	if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
		__mptcp_error_report(sk);

	/* push_pending may touch wmem_reserved, ensure we do the cleanup
	 * later
	 */
	__mptcp_update_wmem(sk);
	__mptcp_update_rmem(sk);
}

void mptcp_subflow_process_delegated(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	mptcp_data_lock(sk);
	if (!sock_owned_by_user(sk))
		__mptcp_subflow_push_pending(sk, ssk);
	else
		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
	mptcp_data_unlock(sk);
	mptcp_subflow_delegated_done(subflow);
}

static int mptcp_hash(struct sock *sk)
{
	/* should never be called,
	 * we hash the TCP subflows not the master socket
	 */
WARN_ON_ONCE(1); 3092 return 0; 3093 } 3094 3095 static void mptcp_unhash(struct sock *sk) 3096 { 3097 /* called from sk_common_release(), but nothing to do here */ 3098 } 3099 3100 static int mptcp_get_port(struct sock *sk, unsigned short snum) 3101 { 3102 struct mptcp_sock *msk = mptcp_sk(sk); 3103 struct socket *ssock; 3104 3105 ssock = __mptcp_nmpc_socket(msk); 3106 pr_debug("msk=%p, subflow=%p", msk, ssock); 3107 if (WARN_ON_ONCE(!ssock)) 3108 return -EINVAL; 3109 3110 return inet_csk_get_port(ssock->sk, snum); 3111 } 3112 3113 void mptcp_finish_connect(struct sock *ssk) 3114 { 3115 struct mptcp_subflow_context *subflow; 3116 struct mptcp_sock *msk; 3117 struct sock *sk; 3118 u64 ack_seq; 3119 3120 subflow = mptcp_subflow_ctx(ssk); 3121 sk = subflow->conn; 3122 msk = mptcp_sk(sk); 3123 3124 pr_debug("msk=%p, token=%u", sk, subflow->token); 3125 3126 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); 3127 ack_seq++; 3128 subflow->map_seq = ack_seq; 3129 subflow->map_subflow_seq = 1; 3130 3131 /* the socket is not connected yet, no msk/subflow ops can access/race 3132 * accessing the field below 3133 */ 3134 WRITE_ONCE(msk->remote_key, subflow->remote_key); 3135 WRITE_ONCE(msk->local_key, subflow->local_key); 3136 WRITE_ONCE(msk->write_seq, subflow->idsn + 1); 3137 WRITE_ONCE(msk->snd_nxt, msk->write_seq); 3138 WRITE_ONCE(msk->ack_seq, ack_seq); 3139 WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); 3140 WRITE_ONCE(msk->can_ack, 1); 3141 WRITE_ONCE(msk->snd_una, msk->write_seq); 3142 3143 mptcp_pm_new_connection(msk, ssk, 0); 3144 3145 mptcp_rcv_space_init(msk, ssk); 3146 } 3147 3148 void mptcp_sock_graft(struct sock *sk, struct socket *parent) 3149 { 3150 write_lock_bh(&sk->sk_callback_lock); 3151 rcu_assign_pointer(sk->sk_wq, &parent->wq); 3152 sk_set_socket(sk, parent); 3153 sk->sk_uid = SOCK_INODE(parent)->i_uid; 3154 write_unlock_bh(&sk->sk_callback_lock); 3155 } 3156 3157 bool mptcp_finish_join(struct sock *ssk) 3158 { 3159 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 3160 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 3161 struct sock *parent = (void *)msk; 3162 struct socket *parent_sock; 3163 bool ret; 3164 3165 pr_debug("msk=%p, subflow=%p", msk, subflow); 3166 3167 /* mptcp socket already closing? */ 3168 if (!mptcp_is_fully_established(parent)) { 3169 subflow->reset_reason = MPTCP_RST_EMPTCP; 3170 return false; 3171 } 3172 3173 if (!msk->pm.server_side) 3174 goto out; 3175 3176 if (!mptcp_pm_allow_new_subflow(msk)) { 3177 subflow->reset_reason = MPTCP_RST_EPROHIBIT; 3178 return false; 3179 } 3180 3181 /* active connections are already on conn_list, and we can't acquire 3182 * msk lock here. 
3183 * use the join list lock as synchronization point and double-check 3184 * msk status to avoid racing with __mptcp_destroy_sock() 3185 */ 3186 spin_lock_bh(&msk->join_list_lock); 3187 ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; 3188 if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { 3189 list_add_tail(&subflow->node, &msk->join_list); 3190 sock_hold(ssk); 3191 } 3192 spin_unlock_bh(&msk->join_list_lock); 3193 if (!ret) { 3194 subflow->reset_reason = MPTCP_RST_EPROHIBIT; 3195 return false; 3196 } 3197 3198 /* attach to msk socket only after we are sure he will deal with us 3199 * at close time 3200 */ 3201 parent_sock = READ_ONCE(parent->sk_socket); 3202 if (parent_sock && !ssk->sk_socket) 3203 mptcp_sock_graft(ssk, parent_sock); 3204 subflow->map_seq = READ_ONCE(msk->ack_seq); 3205 out: 3206 mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); 3207 return true; 3208 } 3209 3210 static void mptcp_shutdown(struct sock *sk, int how) 3211 { 3212 pr_debug("sk=%p, how=%d", sk, how); 3213 3214 if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) 3215 __mptcp_wr_shutdown(sk); 3216 } 3217 3218 static struct proto mptcp_prot = { 3219 .name = "MPTCP", 3220 .owner = THIS_MODULE, 3221 .init = mptcp_init_sock, 3222 .disconnect = mptcp_disconnect, 3223 .close = mptcp_close, 3224 .accept = mptcp_accept, 3225 .setsockopt = mptcp_setsockopt, 3226 .getsockopt = mptcp_getsockopt, 3227 .shutdown = mptcp_shutdown, 3228 .destroy = mptcp_destroy, 3229 .sendmsg = mptcp_sendmsg, 3230 .recvmsg = mptcp_recvmsg, 3231 .release_cb = mptcp_release_cb, 3232 .hash = mptcp_hash, 3233 .unhash = mptcp_unhash, 3234 .get_port = mptcp_get_port, 3235 .sockets_allocated = &mptcp_sockets_allocated, 3236 .memory_allocated = &tcp_memory_allocated, 3237 .memory_pressure = &tcp_memory_pressure, 3238 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3239 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3240 .sysctl_mem = sysctl_tcp_mem, 3241 .obj_size = sizeof(struct mptcp_sock), 3242 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3243 .no_autobind = true, 3244 }; 3245 3246 static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 3247 { 3248 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3249 struct socket *ssock; 3250 int err; 3251 3252 lock_sock(sock->sk); 3253 ssock = __mptcp_nmpc_socket(msk); 3254 if (!ssock) { 3255 err = -EINVAL; 3256 goto unlock; 3257 } 3258 3259 err = ssock->ops->bind(ssock, uaddr, addr_len); 3260 if (!err) 3261 mptcp_copy_inaddrs(sock->sk, ssock->sk); 3262 3263 unlock: 3264 release_sock(sock->sk); 3265 return err; 3266 } 3267 3268 static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, 3269 struct mptcp_subflow_context *subflow) 3270 { 3271 subflow->request_mptcp = 0; 3272 __mptcp_do_fallback(msk); 3273 } 3274 3275 static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, 3276 int addr_len, int flags) 3277 { 3278 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3279 struct mptcp_subflow_context *subflow; 3280 struct socket *ssock; 3281 int err; 3282 3283 lock_sock(sock->sk); 3284 if (sock->state != SS_UNCONNECTED && msk->subflow) { 3285 /* pending connection or invalid state, let existing subflow 3286 * cope with that 3287 */ 3288 ssock = msk->subflow; 3289 goto do_connect; 3290 } 3291 3292 ssock = __mptcp_nmpc_socket(msk); 3293 if (!ssock) { 3294 err = -EINVAL; 3295 goto unlock; 3296 } 3297 3298 mptcp_token_destroy(msk); 3299 inet_sk_state_store(sock->sk, TCP_SYN_SENT); 3300 subflow = mptcp_subflow_ctx(ssock->sk); 
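	/* If MPTCP cannot be used for this connect attempt (MD5SIG enabled on
	 * the subflow, or no MPTCP token could be allocated), the checks below
	 * clear request_mptcp via mptcp_subflow_early_fallback() so the SYN
	 * goes out as plain TCP.
	 */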
3301 #ifdef CONFIG_TCP_MD5SIG 3302 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 3303 * TCP option space. 3304 */ 3305 if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) 3306 mptcp_subflow_early_fallback(msk, subflow); 3307 #endif 3308 if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { 3309 MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); 3310 mptcp_subflow_early_fallback(msk, subflow); 3311 } 3312 if (likely(!__mptcp_check_fallback(msk))) 3313 MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); 3314 3315 do_connect: 3316 err = ssock->ops->connect(ssock, uaddr, addr_len, flags); 3317 sock->state = ssock->state; 3318 3319 /* on successful connect, the msk state will be moved to established by 3320 * subflow_finish_connect() 3321 */ 3322 if (!err || err == -EINPROGRESS) 3323 mptcp_copy_inaddrs(sock->sk, ssock->sk); 3324 else 3325 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 3326 3327 unlock: 3328 release_sock(sock->sk); 3329 return err; 3330 } 3331 3332 static int mptcp_listen(struct socket *sock, int backlog) 3333 { 3334 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3335 struct socket *ssock; 3336 int err; 3337 3338 pr_debug("msk=%p", msk); 3339 3340 lock_sock(sock->sk); 3341 ssock = __mptcp_nmpc_socket(msk); 3342 if (!ssock) { 3343 err = -EINVAL; 3344 goto unlock; 3345 } 3346 3347 mptcp_token_destroy(msk); 3348 inet_sk_state_store(sock->sk, TCP_LISTEN); 3349 sock_set_flag(sock->sk, SOCK_RCU_FREE); 3350 3351 err = ssock->ops->listen(ssock, backlog); 3352 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 3353 if (!err) 3354 mptcp_copy_inaddrs(sock->sk, ssock->sk); 3355 3356 unlock: 3357 release_sock(sock->sk); 3358 return err; 3359 } 3360 3361 static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, 3362 int flags, bool kern) 3363 { 3364 struct mptcp_sock *msk = mptcp_sk(sock->sk); 3365 struct socket *ssock; 3366 int err; 3367 3368 pr_debug("msk=%p", msk); 3369 3370 lock_sock(sock->sk); 3371 if (sock->sk->sk_state != TCP_LISTEN) 3372 goto unlock_fail; 3373 3374 ssock = __mptcp_nmpc_socket(msk); 3375 if (!ssock) 3376 goto unlock_fail; 3377 3378 clear_bit(MPTCP_DATA_READY, &msk->flags); 3379 sock_hold(ssock->sk); 3380 release_sock(sock->sk); 3381 3382 err = ssock->ops->accept(sock, newsock, flags, kern); 3383 if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { 3384 struct mptcp_sock *msk = mptcp_sk(newsock->sk); 3385 struct mptcp_subflow_context *subflow; 3386 struct sock *newsk = newsock->sk; 3387 3388 lock_sock(newsk); 3389 3390 /* PM/worker can now acquire the first subflow socket 3391 * lock without racing with listener queue cleanup, 3392 * we can notify it, if needed. 3393 * 3394 * Even if remote has reset the initial subflow by now 3395 * the refcnt is still at least one. 3396 */ 3397 subflow = mptcp_subflow_ctx(msk->first); 3398 list_add(&subflow->node, &msk->conn_list); 3399 sock_hold(msk->first); 3400 if (mptcp_is_fully_established(newsk)) 3401 mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); 3402 3403 mptcp_copy_inaddrs(newsk, msk->first); 3404 mptcp_rcv_space_init(msk, msk->first); 3405 mptcp_propagate_sndbuf(newsk, msk->first); 3406 3407 /* set ssk->sk_socket of accept()ed flows to mptcp socket. 3408 * This is needed so NOSPACE flag can be set from tcp stack. 
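		 * mptcp_sock_graft() also points each subflow's sk_wq at the
		 * owning socket's wait queue, so tcp-level wakeups reach the
		 * accept()ed mptcp socket.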
3409 */ 3410 mptcp_flush_join_list(msk); 3411 mptcp_for_each_subflow(msk, subflow) { 3412 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 3413 3414 if (!ssk->sk_socket) 3415 mptcp_sock_graft(ssk, newsock); 3416 } 3417 release_sock(newsk); 3418 } 3419 3420 if (inet_csk_listen_poll(ssock->sk)) 3421 set_bit(MPTCP_DATA_READY, &msk->flags); 3422 sock_put(ssock->sk); 3423 return err; 3424 3425 unlock_fail: 3426 release_sock(sock->sk); 3427 return -EINVAL; 3428 } 3429 3430 static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 3431 { 3432 return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 3433 0; 3434 } 3435 3436 static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) 3437 { 3438 struct sock *sk = (struct sock *)msk; 3439 3440 if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) 3441 return EPOLLOUT | EPOLLWRNORM; 3442 3443 if (sk_stream_is_writeable(sk)) 3444 return EPOLLOUT | EPOLLWRNORM; 3445 3446 mptcp_set_nospace(sk); 3447 smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ 3448 if (sk_stream_is_writeable(sk)) 3449 return EPOLLOUT | EPOLLWRNORM; 3450 3451 return 0; 3452 } 3453 3454 static __poll_t mptcp_poll(struct file *file, struct socket *sock, 3455 struct poll_table_struct *wait) 3456 { 3457 struct sock *sk = sock->sk; 3458 struct mptcp_sock *msk; 3459 __poll_t mask = 0; 3460 int state; 3461 3462 msk = mptcp_sk(sk); 3463 sock_poll_wait(file, sock, wait); 3464 3465 state = inet_sk_state_load(sk); 3466 pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); 3467 if (state == TCP_LISTEN) 3468 return mptcp_check_readable(msk); 3469 3470 if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { 3471 mask |= mptcp_check_readable(msk); 3472 mask |= mptcp_check_writeable(msk); 3473 } 3474 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) 3475 mask |= EPOLLHUP; 3476 if (sk->sk_shutdown & RCV_SHUTDOWN) 3477 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 3478 3479 /* This barrier is coupled with smp_wmb() in tcp_reset() */ 3480 smp_rmb(); 3481 if (sk->sk_err) 3482 mask |= EPOLLERR; 3483 3484 return mask; 3485 } 3486 3487 static const struct proto_ops mptcp_stream_ops = { 3488 .family = PF_INET, 3489 .owner = THIS_MODULE, 3490 .release = inet_release, 3491 .bind = mptcp_bind, 3492 .connect = mptcp_stream_connect, 3493 .socketpair = sock_no_socketpair, 3494 .accept = mptcp_stream_accept, 3495 .getname = inet_getname, 3496 .poll = mptcp_poll, 3497 .ioctl = inet_ioctl, 3498 .gettstamp = sock_gettstamp, 3499 .listen = mptcp_listen, 3500 .shutdown = inet_shutdown, 3501 .setsockopt = sock_common_setsockopt, 3502 .getsockopt = sock_common_getsockopt, 3503 .sendmsg = inet_sendmsg, 3504 .recvmsg = inet_recvmsg, 3505 .mmap = sock_no_mmap, 3506 .sendpage = inet_sendpage, 3507 }; 3508 3509 static struct inet_protosw mptcp_protosw = { 3510 .type = SOCK_STREAM, 3511 .protocol = IPPROTO_MPTCP, 3512 .prot = &mptcp_prot, 3513 .ops = &mptcp_stream_ops, 3514 .flags = INET_PROTOSW_ICSK, 3515 }; 3516 3517 static int mptcp_napi_poll(struct napi_struct *napi, int budget) 3518 { 3519 struct mptcp_delegated_action *delegated; 3520 struct mptcp_subflow_context *subflow; 3521 int work_done = 0; 3522 3523 delegated = container_of(napi, struct mptcp_delegated_action, napi); 3524 while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { 3525 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 3526 3527 bh_lock_sock_nested(ssk); 3528 if (!sock_owned_by_user(ssk) && 3529 mptcp_subflow_has_delegated_action(subflow)) 3530 mptcp_subflow_process_delegated(ssk); 
3531 /* ... elsewhere tcp_release_cb_override already processed 3532 * the action or will do at next release_sock(). 3533 * In both case must dequeue the subflow here - on the same 3534 * CPU that scheduled it. 3535 */ 3536 bh_unlock_sock(ssk); 3537 sock_put(ssk); 3538 3539 if (++work_done == budget) 3540 return budget; 3541 } 3542 3543 /* always provide a 0 'work_done' argument, so that napi_complete_done 3544 * will not try accessing the NULL napi->dev ptr 3545 */ 3546 napi_complete_done(napi, 0); 3547 return work_done; 3548 } 3549 3550 void __init mptcp_proto_init(void) 3551 { 3552 struct mptcp_delegated_action *delegated; 3553 int cpu; 3554 3555 mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; 3556 3557 if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) 3558 panic("Failed to allocate MPTCP pcpu counter\n"); 3559 3560 init_dummy_netdev(&mptcp_napi_dev); 3561 for_each_possible_cpu(cpu) { 3562 delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); 3563 INIT_LIST_HEAD(&delegated->head); 3564 netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, 3565 NAPI_POLL_WEIGHT); 3566 napi_enable(&delegated->napi); 3567 } 3568 3569 mptcp_subflow_init(); 3570 mptcp_pm_init(); 3571 mptcp_token_init(); 3572 3573 if (proto_register(&mptcp_prot, 1) != 0) 3574 panic("Failed to register MPTCP proto.\n"); 3575 3576 inet_register_protosw(&mptcp_protosw); 3577 3578 BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); 3579 } 3580 3581 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 3582 static const struct proto_ops mptcp_v6_stream_ops = { 3583 .family = PF_INET6, 3584 .owner = THIS_MODULE, 3585 .release = inet6_release, 3586 .bind = mptcp_bind, 3587 .connect = mptcp_stream_connect, 3588 .socketpair = sock_no_socketpair, 3589 .accept = mptcp_stream_accept, 3590 .getname = inet6_getname, 3591 .poll = mptcp_poll, 3592 .ioctl = inet6_ioctl, 3593 .gettstamp = sock_gettstamp, 3594 .listen = mptcp_listen, 3595 .shutdown = inet_shutdown, 3596 .setsockopt = sock_common_setsockopt, 3597 .getsockopt = sock_common_getsockopt, 3598 .sendmsg = inet6_sendmsg, 3599 .recvmsg = inet6_recvmsg, 3600 .mmap = sock_no_mmap, 3601 .sendpage = inet_sendpage, 3602 #ifdef CONFIG_COMPAT 3603 .compat_ioctl = inet6_compat_ioctl, 3604 #endif 3605 }; 3606 3607 static struct proto mptcp_v6_prot; 3608 3609 static void mptcp_v6_destroy(struct sock *sk) 3610 { 3611 mptcp_destroy(sk); 3612 inet6_destroy_sock(sk); 3613 } 3614 3615 static struct inet_protosw mptcp_v6_protosw = { 3616 .type = SOCK_STREAM, 3617 .protocol = IPPROTO_MPTCP, 3618 .prot = &mptcp_v6_prot, 3619 .ops = &mptcp_v6_stream_ops, 3620 .flags = INET_PROTOSW_ICSK, 3621 }; 3622 3623 int __init mptcp_proto_v6_init(void) 3624 { 3625 int err; 3626 3627 mptcp_v6_prot = mptcp_prot; 3628 strcpy(mptcp_v6_prot.name, "MPTCPv6"); 3629 mptcp_v6_prot.slab = NULL; 3630 mptcp_v6_prot.destroy = mptcp_v6_destroy; 3631 mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); 3632 3633 err = proto_register(&mptcp_v6_prot, 1); 3634 if (err) 3635 return err; 3636 3637 err = inet6_register_protosw(&mptcp_v6_protosw); 3638 if (err) 3639 proto_unregister(&mptcp_v6_prot); 3640 3641 return err; 3642 } 3643 #endif 3644