// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

#define MPTCP_SAME_STATE TCP_MAX_STATES

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
        struct mptcp_sock msk;
        struct ipv6_pinfo np;
};
#endif

struct mptcp_skb_cb {
        u32 offset;
};

#define MPTCP_SKB_CB(__skb)     ((struct mptcp_skb_cb *)&((__skb)->cb[0]))

static struct percpu_counter mptcp_sockets_allocated;

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
        if (!msk->subflow || READ_ONCE(msk->can_ack))
                return NULL;

        return msk->subflow;
}

static bool mptcp_is_tcpsk(struct sock *sk)
{
        struct socket *sock = sk->sk_socket;

        if (unlikely(sk->sk_prot == &tcp_prot)) {
                /* we are being invoked after mptcp_accept() has
                 * accepted a non-mp-capable flow: sk is a tcp_sk,
                 * not an mptcp one.
                 *
                 * Hand the socket over to tcp so all further socket ops
                 * bypass mptcp.
                 */
                sock->ops = &inet_stream_ops;
                return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
                sock->ops = &inet6_stream_ops;
                return true;
#endif
        }

        return false;
}

static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
        sock_owned_by_me((const struct sock *)msk);

        if (likely(!__mptcp_check_fallback(msk)))
                return NULL;

        return msk->first;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;
        struct socket *ssock;
        int err;

        err = mptcp_subflow_create_socket(sk, &ssock);
        if (err)
                return err;

        msk->first = ssock->sk;
        msk->subflow = ssock;
        subflow = mptcp_subflow_ctx(ssock->sk);
        list_add(&subflow->node, &msk->conn_list);
        subflow->request_mptcp = 1;

        /* accept() will wait on the first subflow sk_wq, and we always wake up
         * via msk->sk_socket
         */
        RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

        return 0;
}

static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
                             struct sk_buff *skb,
                             unsigned int offset, size_t copy_len)
{
        struct sock *sk = (struct sock *)msk;
        struct sk_buff *tail;

        __skb_unlink(skb, &ssk->sk_receive_queue);

        skb_ext_reset(skb);
        skb_orphan(skb);
        msk->ack_seq += copy_len;

        tail = skb_peek_tail(&sk->sk_receive_queue);
        if (offset == 0 && tail) {
                bool fragstolen;
                int delta;

                if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
                        kfree_skb_partial(skb, fragstolen);
                        atomic_add(delta, &sk->sk_rmem_alloc);
                        sk_mem_charge(sk, delta);
                        return;
                }
        }

        skb_set_owner_r(skb, sk);
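        /* the skb is queued on the MPTCP-level receive queue below;
         * MPTCP_SKB_CB(skb)->offset records where the not-yet-delivered
         * data starts inside it, and __mptcp_recvmsg_mskq() copies from
         * that offset, advancing it on partial reads.
         */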
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        MPTCP_SKB_CB(skb)->offset = offset;
}

/* both sockets must be locked */
static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
                                    struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);

        /* revalidate data sequence number.
         *
         * mptcp_subflow_data_available() is usually called
         * without msk lock.  It's unlikely (but possible)
         * that msk->ack_seq has been advanced since the last
         * call found in-sequence data.
         */
        if (likely(dsn == msk->ack_seq))
                return true;

        subflow->data_avail = 0;
        return mptcp_subflow_data_available(ssk);
}

static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        if (READ_ONCE(msk->rcv_data_fin) &&
            ((1 << sk->sk_state) &
             (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
                u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);

                if (msk->ack_seq == rcv_data_fin_seq) {
                        if (seq)
                                *seq = rcv_data_fin_seq;

                        return true;
                }
        }

        return false;
}

static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
        long tout = ssk && inet_csk(ssk)->icsk_pending ?
                    inet_csk(ssk)->icsk_timeout - jiffies : 0;

        if (tout <= 0)
                tout = mptcp_sk(sk)->timer_ival;
        mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}

static void mptcp_check_data_fin(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        u64 rcv_data_fin_seq;

        if (__mptcp_check_fallback(msk) || !msk->first)
                return;

        /* Need to ack a DATA_FIN received from a peer while this side
         * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
         * msk->rcv_data_fin was set when parsing the incoming options
         * at the subflow level and the msk lock was not held, so this
         * is the first opportunity to act on the DATA_FIN and change
         * the msk state.
         *
         * If we are caught up to the sequence number of the incoming
         * DATA_FIN, send the DATA_ACK now and do state transition.  If
         * not caught up, do nothing and let the recv code send DATA_ACK
         * when catching up.
         */

        if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
                struct mptcp_subflow_context *subflow;

                msk->ack_seq++;
                WRITE_ONCE(msk->rcv_data_fin, 0);

                sk->sk_shutdown |= RCV_SHUTDOWN;

                switch (sk->sk_state) {
                case TCP_ESTABLISHED:
                        inet_sk_state_store(sk, TCP_CLOSE_WAIT);
                        break;
                case TCP_FIN_WAIT1:
                        inet_sk_state_store(sk, TCP_CLOSING);
                        break;
                case TCP_FIN_WAIT2:
                        inet_sk_state_store(sk, TCP_CLOSE);
                        // @@ Close subflows now?
                        break;
                default:
                        /* Other states not expected */
                        WARN_ON_ONCE(1);
                        break;
                }

                mptcp_set_timeout(sk, NULL);
                mptcp_for_each_subflow(msk, subflow) {
                        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                        lock_sock(ssk);
                        tcp_send_ack(ssk);
                        release_sock(ssk);
                }

                sk->sk_state_change(sk);

                if (sk->sk_shutdown == SHUTDOWN_MASK ||
                    sk->sk_state == TCP_CLOSE)
                        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
                else
                        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        }
}

static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
                                           struct sock *ssk,
                                           unsigned int *bytes)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct sock *sk = (struct sock *)msk;
        unsigned int moved = 0;
        bool more_data_avail;
        struct tcp_sock *tp;
        bool done = false;

        if (!mptcp_subflow_dsn_valid(msk, ssk)) {
                *bytes = 0;
                return false;
        }

        tp = tcp_sk(ssk);
        do {
                u32 map_remaining, offset;
                u32 seq = tp->copied_seq;
                struct sk_buff *skb;
                bool fin;

                /* try to move as much data as available */
                map_remaining = subflow->map_data_len -
                                mptcp_subflow_get_map_offset(subflow);

                skb = skb_peek(&ssk->sk_receive_queue);
                if (!skb)
                        break;

                if (__mptcp_check_fallback(msk)) {
                        /* if we are running under the workqueue, TCP could have
                         * collapsed skbs between dummy map creation and now
                         * be sure to adjust the size
                         */
                        map_remaining = skb->len;
                        subflow->map_data_len = skb->len;
                }

                offset = seq - TCP_SKB_CB(skb)->seq;
                fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
                if (fin) {
                        done = true;
                        seq++;
                }

                if (offset < skb->len) {
                        size_t len = skb->len - offset;

                        if (tp->urg_data)
                                done = true;

                        __mptcp_move_skb(msk, ssk, skb, offset, len);
                        seq += len;
                        moved += len;

                        if (WARN_ON_ONCE(map_remaining < len))
                                break;
                } else {
                        WARN_ON_ONCE(!fin);
                        sk_eat_skb(ssk, skb);
                        done = true;
                }

                WRITE_ONCE(tp->copied_seq, seq);
                more_data_avail = mptcp_subflow_data_available(ssk);

                if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
                        done = true;
                        break;
                }
        } while (more_data_avail);

        *bytes = moved;

        return done;
}

/* In most cases we will be able to lock the mptcp socket.  If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
        struct sock *sk = (struct sock *)msk;
        unsigned int moved = 0;

        if (READ_ONCE(sk->sk_lock.owned))
                return false;

        if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
                return false;

        /* must re-check after taking the lock */
        if (!READ_ONCE(sk->sk_lock.owned))
                __mptcp_move_skbs_from_subflow(msk, ssk, &moved);

        spin_unlock_bh(&sk->sk_lock.slock);

        return moved > 0;
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        set_bit(MPTCP_DATA_READY, &msk->flags);

        if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
            move_skbs_to_msk(msk, ssk))
                goto wake;

        /* don't schedule if mptcp sk is (still) over limit */
        if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
                goto wake;

        /* mptcp socket is owned, release_cb should retry */
        if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
                              &sk->sk_tsq_flags)) {
                sock_hold(sk);

                /* need to try again, it's possible release_cb() has already
                 * been called after the test_and_set_bit() above.
                 */
                move_skbs_to_msk(msk, ssk);
        }
wake:
        sk->sk_data_ready(sk);
}

static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
        if (likely(list_empty(&msk->join_list)))
                return;

        spin_lock_bh(&msk->join_list_lock);
        list_splice_tail_init(&msk->join_list, &msk->conn_list);
        spin_unlock_bh(&msk->join_list_lock);
}

static bool mptcp_timer_pending(struct sock *sk)
{
        return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        unsigned long tout;

        /* should never be called with mptcp level timer cleared */
        tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
        if (WARN_ON_ONCE(!tout))
                tout = TCP_RTO_MIN;
        sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}

void mptcp_data_acked(struct sock *sk)
{
        mptcp_reset_timer(sk);

        if (!sk_stream_is_writeable(sk) &&
            schedule_work(&mptcp_sk(sk)->work))
                sock_hold(sk);
}

void mptcp_subflow_eof(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
            schedule_work(&msk->work))
                sock_hold(sk);
}

static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;
        int receivers = 0;

        mptcp_for_each_subflow(msk, subflow)
                receivers += !subflow->rx_eof;

        if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
                /* hopefully temporary hack: propagate shutdown status
                 * to msk, when all subflows agree on it
                 */
                sk->sk_shutdown |= RCV_SHUTDOWN;

                smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
                set_bit(MPTCP_DATA_READY, &msk->flags);
                sk->sk_data_ready(sk);
        }
}

static void mptcp_stop_timer(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
        mptcp_sk(sk)->timer_ival = 0;
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
        const struct sock *sk = (const struct sock *)msk;

        if (!msk->cached_ext)
                msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);

        return !!msk->cached_ext;
}

static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;

        sock_owned_by_me(sk);

        mptcp_for_each_subflow(msk, subflow) {
                if (subflow->data_avail)
                        return mptcp_subflow_tcp_sock(subflow);
        }

        return NULL;
}

static bool mptcp_skb_can_collapse_to(u64 write_seq,
                                      const struct sk_buff *skb,
                                      const struct mptcp_ext *mpext)
{
        if (!tcp_skb_can_collapse_to(skb))
                return false;

        /* can collapse only if MPTCP level sequence is in order */
        return mpext && mpext->data_seq + mpext->data_len == write_seq;
}

static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
                                       const struct page_frag *pfrag,
                                       const struct mptcp_data_frag *df)
{
        return df && pfrag->page == df->page &&
                df->data_seq + df->data_len == msk->write_seq;
}

static void dfrag_uncharge(struct sock *sk, int len)
{
        sk_mem_uncharge(sk, len);
        sk_wmem_queued_add(sk, -len);
}

static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
        int len = dfrag->data_len + dfrag->overhead;

        list_del(&dfrag->list);
        dfrag_uncharge(sk, len);
        put_page(dfrag->page);
}

static void mptcp_clean_una(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_data_frag *dtmp, *dfrag;
        bool cleaned = false;
        u64 snd_una;

        /* on fallback we just need to ignore snd_una, as this is really
         * plain TCP
         */
        if (__mptcp_check_fallback(msk))
                atomic64_set(&msk->snd_una, msk->write_seq);
        snd_una = atomic64_read(&msk->snd_una);

        list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
                if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
                        break;

                dfrag_clear(sk, dfrag);
                cleaned = true;
        }

        dfrag = mptcp_rtx_head(sk);
        if (dfrag && after64(snd_una, dfrag->data_seq)) {
                u64 delta = snd_una - dfrag->data_seq;

                if (WARN_ON_ONCE(delta > dfrag->data_len))
                        goto out;

                dfrag->data_seq += delta;
                dfrag->offset += delta;
                dfrag->data_len -= delta;

                dfrag_uncharge(sk, delta);
                cleaned = true;
        }

out:
        if (cleaned) {
                sk_mem_reclaim_partial(sk);

                /* Only wake up writers if a subflow is ready */
                if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
                        sk_stream_write_space(sk);
        }
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
        if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
                                        pfrag, sk->sk_allocation)))
                return true;

        sk->sk_prot->enter_memory_pressure(sk);
        sk_stream_moderate_sndbuf(sk);
        return false;
}

static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
                      int orig_offset)
{
        int offset = ALIGN(orig_offset, sizeof(long));
        struct mptcp_data_frag *dfrag;

        dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
        dfrag->data_len = 0;
        dfrag->data_seq = msk->write_seq;
        dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
        dfrag->offset = offset + sizeof(struct mptcp_data_frag);
        dfrag->page = pfrag->page;

        return dfrag;
}

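/* Push data from @msg onto subflow @ssk or, when @dfrag is non-NULL,
 * re-inject the given fragment from the MPTCP-level rtx queue instead.
 * Freshly sent data is also charged to that queue so it can later be
 * retransmitted, possibly on a different subflow; mptcp_sendmsg() calls
 * this with both the msk and the subflow socket locked.
 */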
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
                              struct msghdr *msg, struct mptcp_data_frag *dfrag,
                              long *timeo, int *pmss_now,
                              int *ps_goal)
{
        int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
        bool dfrag_collapsed, can_collapse = false;
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_ext *mpext = NULL;
        bool retransmission = !!dfrag;
        struct sk_buff *skb, *tail;
        struct page_frag *pfrag;
        struct page *page;
        u64 *write_seq;
        size_t psize;

        /* use the mptcp page cache so that we can easily move the data
         * from one substream to another, but do per subflow memory accounting
         * Note: pfrag is used only when !retransmission, but the compiler is
         * fooled into a warning if we don't init here
         */
        pfrag = sk_page_frag(sk);
        if (!retransmission) {
                write_seq = &msk->write_seq;
                page = pfrag->page;
        } else {
                write_seq = &dfrag->data_seq;
                page = dfrag->page;
        }

        /* compute copy limit */
        mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
        *pmss_now = mss_now;
        *ps_goal = size_goal;
        avail_size = size_goal;
        skb = tcp_write_queue_tail(ssk);
        if (skb) {
                mpext = skb_ext_find(skb, SKB_EXT_MPTCP);

                /* Limit the write to the size available in the
                 * current skb, if any, so that we create at most a new skb.
                 * Explicitly tells TCP internals to avoid collapsing on later
                 * queue management operation, to avoid breaking the ext <->
                 * SSN association set here
                 */
                can_collapse = (size_goal - skb->len > 0) &&
                               mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
                if (!can_collapse)
                        TCP_SKB_CB(skb)->eor = 1;
                else
                        avail_size = size_goal - skb->len;
        }

        if (!retransmission) {
                /* reuse tail pfrag, if possible, or carve a new one from the
                 * page allocator
                 */
                dfrag = mptcp_rtx_tail(sk);
                offset = pfrag->offset;
                dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
                if (!dfrag_collapsed) {
                        dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
                        offset = dfrag->offset;
                        frag_truesize = dfrag->overhead;
                }
                psize = min_t(size_t, pfrag->size - offset, avail_size);

                /* Copy to page */
                pr_debug("left=%zu", msg_data_left(msg));
                psize = copy_page_from_iter(pfrag->page, offset,
                                            min_t(size_t, msg_data_left(msg),
                                                  psize),
                                            &msg->msg_iter);
                pr_debug("left=%zu", msg_data_left(msg));
                if (!psize)
                        return -EINVAL;

                if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
                        return -ENOMEM;
        } else {
                offset = dfrag->offset;
                psize = min_t(size_t, dfrag->data_len, avail_size);
        }

        /* tell the TCP stack to delay the push so that we can safely
         * access the skb after the sendpages call
         */
        ret = do_tcp_sendpages(ssk, page, offset, psize,
                               msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
        if (ret <= 0)
                return ret;

        frag_truesize += ret;
        if (!retransmission) {
                if (unlikely(ret < psize))
                        iov_iter_revert(&msg->msg_iter, psize - ret);

                /* send successful, keep track of sent data for mptcp-level
                 * retransmission
                 */
                dfrag->data_len += ret;
                if (!dfrag_collapsed) {
                        get_page(dfrag->page);
                        list_add_tail(&dfrag->list, &msk->rtx_queue);
                        sk_wmem_queued_add(sk, frag_truesize);
                } else {
                        sk_wmem_queued_add(sk, ret);
                }

                /* charge data on mptcp rtx queue to the master socket
                 * Note: we charge such data both to sk and ssk
                 */
                sk->sk_forward_alloc -= frag_truesize;
        }

        /* if the tail skb extension is still the cached one, collapsing
         * really happened. Note: we can't check for 'same skb' as the sk_buff
         * hdr on tail can be transmitted, freed and re-allocated by the
         * do_tcp_sendpages() call
         */
        tail = tcp_write_queue_tail(ssk);
        if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
                WARN_ON_ONCE(!can_collapse);
                mpext->data_len += ret;
                goto out;
        }

        skb = tcp_write_queue_tail(ssk);
        mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
        msk->cached_ext = NULL;

        memset(mpext, 0, sizeof(*mpext));
        mpext->data_seq = *write_seq;
        mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
        mpext->data_len = ret;
        mpext->use_map = 1;
        mpext->dsn64 = 1;

        pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
                 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
                 mpext->dsn64);

out:
        if (!retransmission)
                pfrag->offset += frag_truesize;
        *write_seq += ret;
        mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

        return ret;
}

static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
{
        clear_bit(MPTCP_SEND_SPACE, &msk->flags);
        smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

        /* enables sk->write_space() callbacks */
        set_bit(SOCK_NOSPACE, &sock->flags);
}

static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;
        struct sock *backup = NULL;

        sock_owned_by_me((const struct sock *)msk);

        if (!mptcp_ext_cache_refill(msk))
                return NULL;

        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                if (!sk_stream_memory_free(ssk)) {
                        struct socket *sock = ssk->sk_socket;

                        if (sock)
                                mptcp_nospace(msk, sock);

                        return NULL;
                }

                if (subflow->backup) {
                        if (!backup)
                                backup = ssk;

                        continue;
                }

                return ssk;
        }

        return backup;
}

static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
        struct socket *sock;

        if (likely(sk_stream_is_writeable(ssk)))
                return;

        sock = READ_ONCE(ssk->sk_socket);
        if (sock)
                mptcp_nospace(msk, sock);
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        int mss_now = 0, size_goal = 0, ret = 0;
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct page_frag *pfrag;
        size_t copied = 0;
        struct sock *ssk;
        bool tx_ok;
        long timeo;

        if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
                return -EOPNOTSUPP;

        lock_sock(sk);

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
                ret = sk_stream_wait_connect(sk, &timeo);
                if (ret)
                        goto out;
        }

        pfrag = sk_page_frag(sk);
restart:
        mptcp_clean_una(sk);

        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
                ret = -EPIPE;
                goto out;
        }

wait_for_sndbuf:
        __mptcp_flush_join_list(msk);
        ssk = mptcp_subflow_get_send(msk);
        while (!sk_stream_memory_free(sk) ||
               !ssk ||
               !mptcp_page_frag_refill(ssk, pfrag)) {
                if (ssk) {
                        /* make sure retransmit timer is
                         * running before we wait for memory.
                         *
                         * The retransmit timer might be needed
                         * to make the peer send an up-to-date
                         * MPTCP Ack.
                         */
                        mptcp_set_timeout(sk, ssk);
                        if (!mptcp_timer_pending(sk))
                                mptcp_reset_timer(sk);
                }

                ret = sk_stream_wait_memory(sk, &timeo);
                if (ret)
                        goto out;

                mptcp_clean_una(sk);

                ssk = mptcp_subflow_get_send(msk);
                if (list_empty(&msk->conn_list)) {
                        ret = -ENOTCONN;
                        goto out;
                }
        }

        pr_debug("conn_list->subflow=%p", ssk);

        lock_sock(ssk);
        tx_ok = msg_data_left(msg);
        while (tx_ok) {
                ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
                                         &size_goal);
                if (ret < 0) {
                        if (ret == -EAGAIN && timeo > 0) {
                                mptcp_set_timeout(sk, ssk);
                                release_sock(ssk);
                                goto restart;
                        }
                        break;
                }

                copied += ret;

                tx_ok = msg_data_left(msg);
                if (!tx_ok)
                        break;

                if (!sk_stream_memory_free(ssk) ||
                    !mptcp_page_frag_refill(ssk, pfrag) ||
                    !mptcp_ext_cache_refill(msk)) {
                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                        tcp_push(ssk, msg->msg_flags, mss_now,
                                 tcp_sk(ssk)->nonagle, size_goal);
                        mptcp_set_timeout(sk, ssk);
                        release_sock(ssk);
                        goto restart;
                }

                /* memory is charged to mptcp level socket as well, i.e.
                 * if msg is very large, mptcp socket may run out of buffer
                 * space.  mptcp_clean_una() will release data that has
                 * been acked at mptcp level in the mean time, so there is
                 * a good chance we can continue sending data right away.
                 *
                 * Normally, when the tcp subflow can accept more data, then
                 * so can the MPTCP socket. However, we need to cope with
                 * peers that might lag behind in their MPTCP-level
                 * acknowledgements, i.e. data might have been acked at
                 * tcp level only. So, we must also check the MPTCP socket
                 * limits before we send more data.
                 */
                if (unlikely(!sk_stream_memory_free(sk))) {
                        tcp_push(ssk, msg->msg_flags, mss_now,
                                 tcp_sk(ssk)->nonagle, size_goal);
                        mptcp_clean_una(sk);
                        if (!sk_stream_memory_free(sk)) {
                                /* can't send more for now, need to wait for
                                 * MPTCP-level ACKs from peer.
                                 *
                                 * Wakeup will happen via mptcp_clean_una().
                                 */
                                mptcp_set_timeout(sk, ssk);
                                release_sock(ssk);
                                goto wait_for_sndbuf;
                        }
                }
        }

        mptcp_set_timeout(sk, ssk);
        if (copied) {
                ret = copied;
                tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
                         size_goal);

                /* start the timer, if it's not pending */
                if (!mptcp_timer_pending(sk))
                        mptcp_reset_timer(sk);
        }

        ssk_check_wmem(msk, ssk);
        release_sock(ssk);
out:
        release_sock(sk);
        return ret;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        struct mptcp_sock *msk = mptcp_sk(sk);

        add_wait_queue(sk_sleep(sk), &wait);
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

        sk_wait_event(sk, timeo,
                      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
}

static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
                                struct msghdr *msg,
                                size_t len)
{
        struct sock *sk = (struct sock *)msk;
        struct sk_buff *skb;
        int copied = 0;

        while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
                u32 offset = MPTCP_SKB_CB(skb)->offset;
                u32 data_len = skb->len - offset;
                u32 count = min_t(size_t, len - copied, data_len);
                int err;

                err = skb_copy_datagram_msg(skb, offset, msg, count);
                if (unlikely(err < 0)) {
                        if (!copied)
                                return err;
                        break;
                }

                copied += count;

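                /* skb only partially consumed: remember how far we got
                 * and stop, the remainder stays on the receive queue for
                 * the next read
                 */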
                if (count < data_len) {
                        MPTCP_SKB_CB(skb)->offset += count;
                        break;
                }

                __skb_unlink(skb, &sk->sk_receive_queue);
                __kfree_skb(skb);

                if (copied >= len)
                        break;
        }

        return copied;
}

/* receive buffer autotuning.  See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;
        u32 time, advmss = 1;
        u64 rtt_us, mstamp;

        sock_owned_by_me(sk);

        if (copied <= 0)
                return;

        msk->rcvq_space.copied += copied;

        mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
        time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

        rtt_us = msk->rcvq_space.rtt_us;
        if (rtt_us && time < (rtt_us >> 3))
                return;

        rtt_us = 0;
        mptcp_for_each_subflow(msk, subflow) {
                const struct tcp_sock *tp;
                u64 sf_rtt_us;
                u32 sf_advmss;

                tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

                sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
                sf_advmss = READ_ONCE(tp->advmss);

                rtt_us = max(sf_rtt_us, rtt_us);
                advmss = max(sf_advmss, advmss);
        }

        msk->rcvq_space.rtt_us = rtt_us;
        if (time < (rtt_us >> 3) || rtt_us == 0)
                return;

        if (msk->rcvq_space.copied <= msk->rcvq_space.space)
                goto new_measure;

        if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
                int rcvmem, rcvbuf;
                u64 rcvwin, grow;

                rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

                grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

                do_div(grow, msk->rcvq_space.space);
                rcvwin += (grow << 1);

                rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
                while (tcp_win_from_space(sk, rcvmem) < advmss)
                        rcvmem += 128;

                do_div(rcvwin, advmss);
                rcvbuf = min_t(u64, rcvwin * rcvmem,
                               sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

                if (rcvbuf > sk->sk_rcvbuf) {
                        u32 window_clamp;

                        window_clamp = tcp_win_from_space(sk, rcvbuf);
                        WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

                        /* Make subflows follow along.  If we do not do this, we
                         * get drops at subflow level if skbs can't be moved to
                         * the mptcp rx queue fast enough (announced rcv_win can
                         * exceed ssk->sk_rcvbuf).
                         */
                        mptcp_for_each_subflow(msk, subflow) {
                                struct sock *ssk;

                                ssk = mptcp_subflow_tcp_sock(subflow);
                                WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
                                tcp_sk(ssk)->window_clamp = window_clamp;
                        }
                }
        }

        msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
        msk->rcvq_space.copied = 0;
        msk->rcvq_space.time = mstamp;
}

static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
        unsigned int moved = 0;
        bool done;

        do {
                struct sock *ssk = mptcp_subflow_recv_lookup(msk);

                if (!ssk)
                        break;

                lock_sock(ssk);
                done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
                release_sock(ssk);
        } while (!done);

        return moved > 0;
}

static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                         int nonblock, int flags, int *addr_len)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        int copied = 0;
        int target;
        long timeo;

        if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
                return -EOPNOTSUPP;

        lock_sock(sk);
        timeo = sock_rcvtimeo(sk, nonblock);

        len = min_t(size_t, len, INT_MAX);
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
        __mptcp_flush_join_list(msk);

        while (len > (size_t)copied) {
                int bytes_read;

                bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
                if (unlikely(bytes_read < 0)) {
                        if (!copied)
                                copied = bytes_read;
                        goto out_err;
                }

                copied += bytes_read;

                if (skb_queue_empty(&sk->sk_receive_queue) &&
                    __mptcp_move_skbs(msk))
                        continue;

                /* only the master socket status is relevant here. The exit
                 * conditions mirror closely tcp_recvmsg()
                 */
                if (copied >= target)
                        break;

                if (copied) {
                        if (sk->sk_err ||
                            sk->sk_state == TCP_CLOSE ||
                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
                            !timeo ||
                            signal_pending(current))
                                break;
                } else {
                        if (sk->sk_err) {
                                copied = sock_error(sk);
                                break;
                        }

                        if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
                                mptcp_check_for_eof(msk);

                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                break;

                        if (sk->sk_state == TCP_CLOSE) {
                                copied = -ENOTCONN;
                                break;
                        }

                        if (!timeo) {
                                copied = -EAGAIN;
                                break;
                        }

                        if (signal_pending(current)) {
                                copied = sock_intr_errno(timeo);
                                break;
                        }
                }

                pr_debug("block timeout %ld", timeo);
                mptcp_wait_data(sk, &timeo);
        }

        if (skb_queue_empty(&sk->sk_receive_queue)) {
                /* entire backlog drained, clear DATA_READY. */
                clear_bit(MPTCP_DATA_READY, &msk->flags);

                /* .. race-breaker: ssk might have gotten new data
                 * after last __mptcp_move_skbs() returned false.
12067a6a6cbcSPaolo Abeni */ 12076771bfd9SFlorian Westphal if (unlikely(__mptcp_move_skbs(msk))) 12086771bfd9SFlorian Westphal set_bit(MPTCP_DATA_READY, &msk->flags); 12096771bfd9SFlorian Westphal } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { 12106771bfd9SFlorian Westphal /* data to read but mptcp_wait_data() cleared DATA_READY */ 12117a6a6cbcSPaolo Abeni set_bit(MPTCP_DATA_READY, &msk->flags); 12127a6a6cbcSPaolo Abeni } 12136771bfd9SFlorian Westphal out_err: 1214a6b118feSFlorian Westphal mptcp_rcv_space_adjust(msk, copied); 1215a6b118feSFlorian Westphal 1216cec37a6eSPeter Krystad release_sock(sk); 1217cec37a6eSPeter Krystad return copied; 1218cec37a6eSPeter Krystad } 1219cec37a6eSPeter Krystad 1220b51f9b80SPaolo Abeni static void mptcp_retransmit_handler(struct sock *sk) 1221b51f9b80SPaolo Abeni { 1222b51f9b80SPaolo Abeni struct mptcp_sock *msk = mptcp_sk(sk); 1223b51f9b80SPaolo Abeni 12243b1d6210SPaolo Abeni if (atomic64_read(&msk->snd_una) == msk->write_seq) { 1225b51f9b80SPaolo Abeni mptcp_stop_timer(sk); 12263b1d6210SPaolo Abeni } else { 12273b1d6210SPaolo Abeni set_bit(MPTCP_WORK_RTX, &msk->flags); 12283b1d6210SPaolo Abeni if (schedule_work(&msk->work)) 12293b1d6210SPaolo Abeni sock_hold(sk); 12303b1d6210SPaolo Abeni } 1231b51f9b80SPaolo Abeni } 1232b51f9b80SPaolo Abeni 1233b51f9b80SPaolo Abeni static void mptcp_retransmit_timer(struct timer_list *t) 1234b51f9b80SPaolo Abeni { 1235b51f9b80SPaolo Abeni struct inet_connection_sock *icsk = from_timer(icsk, t, 1236b51f9b80SPaolo Abeni icsk_retransmit_timer); 1237b51f9b80SPaolo Abeni struct sock *sk = &icsk->icsk_inet.sk; 1238b51f9b80SPaolo Abeni 1239b51f9b80SPaolo Abeni bh_lock_sock(sk); 1240b51f9b80SPaolo Abeni if (!sock_owned_by_user(sk)) { 1241b51f9b80SPaolo Abeni mptcp_retransmit_handler(sk); 1242b51f9b80SPaolo Abeni } else { 1243b51f9b80SPaolo Abeni /* delegate our work to tcp_release_cb() */ 1244b51f9b80SPaolo Abeni if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, 1245b51f9b80SPaolo Abeni &sk->sk_tsq_flags)) 1246b51f9b80SPaolo Abeni sock_hold(sk); 1247b51f9b80SPaolo Abeni } 1248b51f9b80SPaolo Abeni bh_unlock_sock(sk); 1249b51f9b80SPaolo Abeni sock_put(sk); 1250b51f9b80SPaolo Abeni } 1251b51f9b80SPaolo Abeni 12523b1d6210SPaolo Abeni /* Find an idle subflow. Return NULL if there is unacked data at tcp 12533b1d6210SPaolo Abeni * level. 12543b1d6210SPaolo Abeni * 12553b1d6210SPaolo Abeni * A backup subflow is returned only if that is the only kind available. 12563b1d6210SPaolo Abeni */ 12573b1d6210SPaolo Abeni static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) 12583b1d6210SPaolo Abeni { 12593b1d6210SPaolo Abeni struct mptcp_subflow_context *subflow; 12603b1d6210SPaolo Abeni struct sock *backup = NULL; 12613b1d6210SPaolo Abeni 12623b1d6210SPaolo Abeni sock_owned_by_me((const struct sock *)msk); 12633b1d6210SPaolo Abeni 12643b1d6210SPaolo Abeni mptcp_for_each_subflow(msk, subflow) { 12653b1d6210SPaolo Abeni struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 12663b1d6210SPaolo Abeni 12673b1d6210SPaolo Abeni /* still data outstanding at TCP level? Don't retransmit. 
*/ 12683b1d6210SPaolo Abeni if (!tcp_write_queue_empty(ssk)) 12693b1d6210SPaolo Abeni return NULL; 12703b1d6210SPaolo Abeni 12713b1d6210SPaolo Abeni if (subflow->backup) { 12723b1d6210SPaolo Abeni if (!backup) 12733b1d6210SPaolo Abeni backup = ssk; 12743b1d6210SPaolo Abeni continue; 12753b1d6210SPaolo Abeni } 12763b1d6210SPaolo Abeni 12773b1d6210SPaolo Abeni return ssk; 12783b1d6210SPaolo Abeni } 12793b1d6210SPaolo Abeni 12803b1d6210SPaolo Abeni return backup; 12813b1d6210SPaolo Abeni } 12823b1d6210SPaolo Abeni 1283cec37a6eSPeter Krystad /* subflow sockets can be either outgoing (connect) or incoming 1284cec37a6eSPeter Krystad * (accept). 1285cec37a6eSPeter Krystad * 1286cec37a6eSPeter Krystad * Outgoing subflows use in-kernel sockets. 1287cec37a6eSPeter Krystad * Incoming subflows do not have their own 'struct socket' allocated, 1288cec37a6eSPeter Krystad * so we need to use tcp_close() after detaching them from the mptcp 1289cec37a6eSPeter Krystad * parent socket. 1290cec37a6eSPeter Krystad */ 1291cec37a6eSPeter Krystad static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, 1292cec37a6eSPeter Krystad struct mptcp_subflow_context *subflow, 1293cec37a6eSPeter Krystad long timeout) 1294cec37a6eSPeter Krystad { 1295cec37a6eSPeter Krystad struct socket *sock = READ_ONCE(ssk->sk_socket); 1296cec37a6eSPeter Krystad 1297cec37a6eSPeter Krystad list_del(&subflow->node); 1298cec37a6eSPeter Krystad 1299cec37a6eSPeter Krystad if (sock && sock != sk->sk_socket) { 1300cec37a6eSPeter Krystad /* outgoing subflow */ 1301cec37a6eSPeter Krystad sock_release(sock); 1302cec37a6eSPeter Krystad } else { 1303cec37a6eSPeter Krystad /* incoming subflow */ 1304cec37a6eSPeter Krystad tcp_close(ssk, timeout); 1305cec37a6eSPeter Krystad } 1306f870fa0bSMat Martineau } 1307f870fa0bSMat Martineau 1308dc24f8b4SPaolo Abeni static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) 1309dc24f8b4SPaolo Abeni { 1310dc24f8b4SPaolo Abeni return 0; 1311dc24f8b4SPaolo Abeni } 1312dc24f8b4SPaolo Abeni 1313b416268bSFlorian Westphal static void pm_work(struct mptcp_sock *msk) 1314b416268bSFlorian Westphal { 1315b416268bSFlorian Westphal struct mptcp_pm_data *pm = &msk->pm; 1316b416268bSFlorian Westphal 1317b416268bSFlorian Westphal spin_lock_bh(&msk->pm.lock); 1318b416268bSFlorian Westphal 1319b416268bSFlorian Westphal pr_debug("msk=%p status=%x", msk, pm->status); 1320b416268bSFlorian Westphal if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { 1321b416268bSFlorian Westphal pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); 1322b416268bSFlorian Westphal mptcp_pm_nl_add_addr_received(msk); 1323b416268bSFlorian Westphal } 1324b416268bSFlorian Westphal if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { 1325b416268bSFlorian Westphal pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); 1326b416268bSFlorian Westphal mptcp_pm_nl_fully_established(msk); 1327b416268bSFlorian Westphal } 1328b416268bSFlorian Westphal if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { 1329b416268bSFlorian Westphal pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); 1330b416268bSFlorian Westphal mptcp_pm_nl_subflow_established(msk); 1331b416268bSFlorian Westphal } 1332b416268bSFlorian Westphal 1333b416268bSFlorian Westphal spin_unlock_bh(&msk->pm.lock); 1334b416268bSFlorian Westphal } 1335b416268bSFlorian Westphal 133680992017SPaolo Abeni static void mptcp_worker(struct work_struct *work) 133780992017SPaolo Abeni { 133880992017SPaolo Abeni struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); 13393b1d6210SPaolo Abeni struct sock *ssk, *sk = 
&msk->sk.icsk_inet.sk; 1340149f7c71SFlorian Westphal int orig_len, orig_offset, mss_now = 0, size_goal = 0; 13413b1d6210SPaolo Abeni struct mptcp_data_frag *dfrag; 13423b1d6210SPaolo Abeni u64 orig_write_seq; 13433b1d6210SPaolo Abeni size_t copied = 0; 13443b1d6210SPaolo Abeni struct msghdr msg; 13453b1d6210SPaolo Abeni long timeo = 0; 134680992017SPaolo Abeni 134780992017SPaolo Abeni lock_sock(sk); 13483b1d6210SPaolo Abeni mptcp_clean_una(sk); 1349ec3edaa7SPeter Krystad __mptcp_flush_join_list(msk); 13506771bfd9SFlorian Westphal __mptcp_move_skbs(msk); 13513b1d6210SPaolo Abeni 1352b416268bSFlorian Westphal if (msk->pm.status) 1353b416268bSFlorian Westphal pm_work(msk); 1354b416268bSFlorian Westphal 135559832e24SFlorian Westphal if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) 135659832e24SFlorian Westphal mptcp_check_for_eof(msk); 135759832e24SFlorian Westphal 13583b1d6210SPaolo Abeni if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) 13593b1d6210SPaolo Abeni goto unlock; 13603b1d6210SPaolo Abeni 13613b1d6210SPaolo Abeni dfrag = mptcp_rtx_head(sk); 13623b1d6210SPaolo Abeni if (!dfrag) 13633b1d6210SPaolo Abeni goto unlock; 13643b1d6210SPaolo Abeni 1365149f7c71SFlorian Westphal if (!mptcp_ext_cache_refill(msk)) 1366149f7c71SFlorian Westphal goto reset_unlock; 1367149f7c71SFlorian Westphal 13683b1d6210SPaolo Abeni ssk = mptcp_subflow_get_retrans(msk); 13693b1d6210SPaolo Abeni if (!ssk) 13703b1d6210SPaolo Abeni goto reset_unlock; 13713b1d6210SPaolo Abeni 13723b1d6210SPaolo Abeni lock_sock(ssk); 13733b1d6210SPaolo Abeni 13743b1d6210SPaolo Abeni msg.msg_flags = MSG_DONTWAIT; 13753b1d6210SPaolo Abeni orig_len = dfrag->data_len; 13763b1d6210SPaolo Abeni orig_offset = dfrag->offset; 13773b1d6210SPaolo Abeni orig_write_seq = dfrag->data_seq; 13783b1d6210SPaolo Abeni while (dfrag->data_len > 0) { 1379149f7c71SFlorian Westphal int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, 1380149f7c71SFlorian Westphal &mss_now, &size_goal); 13813b1d6210SPaolo Abeni if (ret < 0) 13823b1d6210SPaolo Abeni break; 13833b1d6210SPaolo Abeni 1384fc518953SFlorian Westphal MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); 13853b1d6210SPaolo Abeni copied += ret; 13863b1d6210SPaolo Abeni dfrag->data_len -= ret; 13873b1d6210SPaolo Abeni dfrag->offset += ret; 1388149f7c71SFlorian Westphal 1389149f7c71SFlorian Westphal if (!mptcp_ext_cache_refill(msk)) 1390149f7c71SFlorian Westphal break; 13913b1d6210SPaolo Abeni } 13923b1d6210SPaolo Abeni if (copied) 13933b1d6210SPaolo Abeni tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, 13943b1d6210SPaolo Abeni size_goal); 13953b1d6210SPaolo Abeni 13963b1d6210SPaolo Abeni dfrag->data_seq = orig_write_seq; 13973b1d6210SPaolo Abeni dfrag->offset = orig_offset; 13983b1d6210SPaolo Abeni dfrag->data_len = orig_len; 13993b1d6210SPaolo Abeni 14003b1d6210SPaolo Abeni mptcp_set_timeout(sk, ssk); 14013b1d6210SPaolo Abeni release_sock(ssk); 14023b1d6210SPaolo Abeni 14033b1d6210SPaolo Abeni reset_unlock: 14043b1d6210SPaolo Abeni if (!mptcp_timer_pending(sk)) 14053b1d6210SPaolo Abeni mptcp_reset_timer(sk); 14063b1d6210SPaolo Abeni 14073b1d6210SPaolo Abeni unlock: 140880992017SPaolo Abeni release_sock(sk); 140980992017SPaolo Abeni sock_put(sk); 141080992017SPaolo Abeni } 141180992017SPaolo Abeni 1412784325e9SMatthieu Baerts static int __mptcp_init_sock(struct sock *sk) 1413f870fa0bSMat Martineau { 1414cec37a6eSPeter Krystad struct mptcp_sock *msk = mptcp_sk(sk); 1415cec37a6eSPeter Krystad 1416ec3edaa7SPeter Krystad spin_lock_init(&msk->join_list_lock); 1417ec3edaa7SPeter 
Krystad 1418cec37a6eSPeter Krystad INIT_LIST_HEAD(&msk->conn_list); 1419ec3edaa7SPeter Krystad INIT_LIST_HEAD(&msk->join_list); 142018b683bfSPaolo Abeni INIT_LIST_HEAD(&msk->rtx_queue); 14211891c4a0SFlorian Westphal __set_bit(MPTCP_SEND_SPACE, &msk->flags); 142280992017SPaolo Abeni INIT_WORK(&msk->work, mptcp_worker); 1423cec37a6eSPeter Krystad 14248ab183deSPaolo Abeni msk->first = NULL; 1425dc24f8b4SPaolo Abeni inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; 14268ab183deSPaolo Abeni 14271b1c7a0eSPeter Krystad mptcp_pm_data_init(msk); 14281b1c7a0eSPeter Krystad 1429b51f9b80SPaolo Abeni /* re-use the csk retrans timer for MPTCP-level retrans */ 1430b51f9b80SPaolo Abeni timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); 1431b51f9b80SPaolo Abeni 1432f870fa0bSMat Martineau return 0; 1433f870fa0bSMat Martineau } 1434f870fa0bSMat Martineau 1435784325e9SMatthieu Baerts static int mptcp_init_sock(struct sock *sk) 1436784325e9SMatthieu Baerts { 1437fc518953SFlorian Westphal struct net *net = sock_net(sk); 1438fc518953SFlorian Westphal int ret; 143918b683bfSPaolo Abeni 1440fc518953SFlorian Westphal if (!mptcp_is_enabled(net)) 1441fc518953SFlorian Westphal return -ENOPROTOOPT; 1442fc518953SFlorian Westphal 1443fc518953SFlorian Westphal if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) 1444fc518953SFlorian Westphal return -ENOMEM; 1445fc518953SFlorian Westphal 1446fc518953SFlorian Westphal ret = __mptcp_init_sock(sk); 144718b683bfSPaolo Abeni if (ret) 144818b683bfSPaolo Abeni return ret; 144918b683bfSPaolo Abeni 1450fa68018dSPaolo Abeni ret = __mptcp_socket_create(mptcp_sk(sk)); 1451fa68018dSPaolo Abeni if (ret) 1452fa68018dSPaolo Abeni return ret; 1453fa68018dSPaolo Abeni 1454d027236cSPaolo Abeni sk_sockets_allocated_inc(sk); 1455a6b118feSFlorian Westphal sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; 14563f8e0aaeSPaolo Abeni sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; 1457d027236cSPaolo Abeni 145818b683bfSPaolo Abeni return 0; 145918b683bfSPaolo Abeni } 146018b683bfSPaolo Abeni 146118b683bfSPaolo Abeni static void __mptcp_clear_xmit(struct sock *sk) 146218b683bfSPaolo Abeni { 146318b683bfSPaolo Abeni struct mptcp_sock *msk = mptcp_sk(sk); 146418b683bfSPaolo Abeni struct mptcp_data_frag *dtmp, *dfrag; 146518b683bfSPaolo Abeni 1466b51f9b80SPaolo Abeni sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); 1467b51f9b80SPaolo Abeni 146818b683bfSPaolo Abeni list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) 1469d027236cSPaolo Abeni dfrag_clear(sk, dfrag); 1470784325e9SMatthieu Baerts } 1471784325e9SMatthieu Baerts 147280992017SPaolo Abeni static void mptcp_cancel_work(struct sock *sk) 147380992017SPaolo Abeni { 147480992017SPaolo Abeni struct mptcp_sock *msk = mptcp_sk(sk); 147580992017SPaolo Abeni 147680992017SPaolo Abeni if (cancel_work_sync(&msk->work)) 147780992017SPaolo Abeni sock_put(sk); 147880992017SPaolo Abeni } 147980992017SPaolo Abeni 14807279da61SMat Martineau static void mptcp_subflow_shutdown(struct sock *ssk, int how) 148121498490SPeter Krystad { 148221498490SPeter Krystad lock_sock(ssk); 148321498490SPeter Krystad 148421498490SPeter Krystad switch (ssk->sk_state) { 148521498490SPeter Krystad case TCP_LISTEN: 148621498490SPeter Krystad if (!(how & RCV_SHUTDOWN)) 148721498490SPeter Krystad break; 148821498490SPeter Krystad /* fall through */ 148921498490SPeter Krystad case TCP_SYN_SENT: 149021498490SPeter Krystad tcp_disconnect(ssk, O_NONBLOCK); 149121498490SPeter Krystad break; 149221498490SPeter Krystad default: 
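/* any other state: record the shutdown bits on the subflow socket and
 * hand the rest of the work to plain tcp_shutdown()
 */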
149321498490SPeter Krystad ssk->sk_shutdown |= how; 149421498490SPeter Krystad tcp_shutdown(ssk, how); 149521498490SPeter Krystad break; 149621498490SPeter Krystad } 149721498490SPeter Krystad 149821498490SPeter Krystad release_sock(ssk); 149921498490SPeter Krystad } 150021498490SPeter Krystad 15012c22c06cSFlorian Westphal static void mptcp_close(struct sock *sk, long timeout) 1502f870fa0bSMat Martineau { 1503cec37a6eSPeter Krystad struct mptcp_subflow_context *subflow, *tmp; 1504f870fa0bSMat Martineau struct mptcp_sock *msk = mptcp_sk(sk); 1505b2c5b614SFlorian Westphal LIST_HEAD(conn_list); 1506f870fa0bSMat Martineau 15072c22c06cSFlorian Westphal lock_sock(sk); 15082c22c06cSFlorian Westphal 1509f870fa0bSMat Martineau inet_sk_state_store(sk, TCP_CLOSE); 1510f870fa0bSMat Martineau 151110f6d46cSPaolo Abeni /* be sure to always acquire the join list lock, to sync vs 151210f6d46cSPaolo Abeni * mptcp_finish_join(). 151310f6d46cSPaolo Abeni */ 151410f6d46cSPaolo Abeni spin_lock_bh(&msk->join_list_lock); 151510f6d46cSPaolo Abeni list_splice_tail_init(&msk->join_list, &msk->conn_list); 151610f6d46cSPaolo Abeni spin_unlock_bh(&msk->join_list_lock); 1517b2c5b614SFlorian Westphal list_splice_init(&msk->conn_list, &conn_list); 1518b2c5b614SFlorian Westphal 15197279da61SMat Martineau msk->snd_data_fin_enable = 1; 152076c42a29SMat Martineau 152118b683bfSPaolo Abeni __mptcp_clear_xmit(sk); 152218b683bfSPaolo Abeni 1523b2c5b614SFlorian Westphal release_sock(sk); 1524b2c5b614SFlorian Westphal 1525b2c5b614SFlorian Westphal list_for_each_entry_safe(subflow, tmp, &conn_list, node) { 1526cec37a6eSPeter Krystad struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 1527cec37a6eSPeter Krystad __mptcp_close_ssk(sk, ssk, subflow, timeout); 1528f870fa0bSMat Martineau } 1529f870fa0bSMat Martineau 153080992017SPaolo Abeni mptcp_cancel_work(sk); 153180992017SPaolo Abeni 15326771bfd9SFlorian Westphal __skb_queue_purge(&sk->sk_receive_queue); 15336771bfd9SFlorian Westphal 1534cec37a6eSPeter Krystad sk_common_release(sk); 1535f870fa0bSMat Martineau } 1536f870fa0bSMat Martineau 1537cf7da0d6SPeter Krystad static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) 1538cf7da0d6SPeter Krystad { 1539cf7da0d6SPeter Krystad #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1540cf7da0d6SPeter Krystad const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); 1541cf7da0d6SPeter Krystad struct ipv6_pinfo *msk6 = inet6_sk(msk); 1542cf7da0d6SPeter Krystad 1543cf7da0d6SPeter Krystad msk->sk_v6_daddr = ssk->sk_v6_daddr; 1544cf7da0d6SPeter Krystad msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; 1545cf7da0d6SPeter Krystad 1546cf7da0d6SPeter Krystad if (msk6 && ssk6) { 1547cf7da0d6SPeter Krystad msk6->saddr = ssk6->saddr; 1548cf7da0d6SPeter Krystad msk6->flow_label = ssk6->flow_label; 1549cf7da0d6SPeter Krystad } 1550cf7da0d6SPeter Krystad #endif 1551cf7da0d6SPeter Krystad 1552cf7da0d6SPeter Krystad inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; 1553cf7da0d6SPeter Krystad inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; 1554cf7da0d6SPeter Krystad inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; 1555cf7da0d6SPeter Krystad inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; 1556cf7da0d6SPeter Krystad inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; 1557cf7da0d6SPeter Krystad inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; 1558cf7da0d6SPeter Krystad } 1559cf7da0d6SPeter Krystad 156018b683bfSPaolo Abeni static int mptcp_disconnect(struct sock *sk, int flags) 156118b683bfSPaolo Abeni { 156242c556feSFlorian Westphal /* Should never be 
called. 156342c556feSFlorian Westphal * inet_stream_connect() calls ->disconnect, but that 156442c556feSFlorian Westphal * refers to the subflow socket, not the mptcp one. 156542c556feSFlorian Westphal */ 156642c556feSFlorian Westphal WARN_ON_ONCE(1); 156742c556feSFlorian Westphal return 0; 156818b683bfSPaolo Abeni } 156918b683bfSPaolo Abeni 1570b0519de8SFlorian Westphal #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1571b0519de8SFlorian Westphal static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) 1572b0519de8SFlorian Westphal { 1573b0519de8SFlorian Westphal unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); 1574b0519de8SFlorian Westphal 1575b0519de8SFlorian Westphal return (struct ipv6_pinfo *)(((u8 *)sk) + offset); 1576b0519de8SFlorian Westphal } 1577b0519de8SFlorian Westphal #endif 1578b0519de8SFlorian Westphal 1579fca5c82cSPaolo Abeni struct sock *mptcp_sk_clone(const struct sock *sk, 1580cfde141eSPaolo Abeni const struct mptcp_options_received *mp_opt, 1581fca5c82cSPaolo Abeni struct request_sock *req) 1582b0519de8SFlorian Westphal { 158358b09919SPaolo Abeni struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 1584b0519de8SFlorian Westphal struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); 158558b09919SPaolo Abeni struct mptcp_sock *msk; 158658b09919SPaolo Abeni u64 ack_seq; 1587b0519de8SFlorian Westphal 1588b0519de8SFlorian Westphal if (!nsk) 1589b0519de8SFlorian Westphal return NULL; 1590b0519de8SFlorian Westphal 1591b0519de8SFlorian Westphal #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1592b0519de8SFlorian Westphal if (nsk->sk_family == AF_INET6) 1593b0519de8SFlorian Westphal inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); 1594b0519de8SFlorian Westphal #endif 1595b0519de8SFlorian Westphal 159658b09919SPaolo Abeni __mptcp_init_sock(nsk); 159758b09919SPaolo Abeni 159858b09919SPaolo Abeni msk = mptcp_sk(nsk); 159958b09919SPaolo Abeni msk->local_key = subflow_req->local_key; 160058b09919SPaolo Abeni msk->token = subflow_req->token; 160158b09919SPaolo Abeni msk->subflow = NULL; 1602b93df08cSPaolo Abeni WRITE_ONCE(msk->fully_established, false); 160358b09919SPaolo Abeni 160458b09919SPaolo Abeni msk->write_seq = subflow_req->idsn + 1; 1605cc9d2566SPaolo Abeni atomic64_set(&msk->snd_una, msk->write_seq); 1606cfde141eSPaolo Abeni if (mp_opt->mp_capable) { 160758b09919SPaolo Abeni msk->can_ack = true; 1608cfde141eSPaolo Abeni msk->remote_key = mp_opt->sndr_key; 160958b09919SPaolo Abeni mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); 161058b09919SPaolo Abeni ack_seq++; 161158b09919SPaolo Abeni msk->ack_seq = ack_seq; 161258b09919SPaolo Abeni } 16137f20d5fcSPaolo Abeni 16145e20087dSFlorian Westphal sock_reset_flag(nsk, SOCK_RCU_FREE); 16157f20d5fcSPaolo Abeni /* will be fully established after successful MPC subflow creation */ 16167f20d5fcSPaolo Abeni inet_sk_state_store(nsk, TCP_SYN_RECV); 161758b09919SPaolo Abeni bh_unlock_sock(nsk); 161858b09919SPaolo Abeni 161958b09919SPaolo Abeni /* keep a single reference */ 162058b09919SPaolo Abeni __sock_put(nsk); 1621b0519de8SFlorian Westphal return nsk; 1622b0519de8SFlorian Westphal } 1623b0519de8SFlorian Westphal 1624a6b118feSFlorian Westphal void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) 1625a6b118feSFlorian Westphal { 1626a6b118feSFlorian Westphal const struct tcp_sock *tp = tcp_sk(ssk); 1627a6b118feSFlorian Westphal 1628a6b118feSFlorian Westphal msk->rcvq_space.copied = 0; 1629a6b118feSFlorian Westphal msk->rcvq_space.rtt_us = 0; 1630a6b118feSFlorian Westphal 1631a6b118feSFlorian 
Westphal msk->rcvq_space.time = tp->tcp_mstamp; 1632a6b118feSFlorian Westphal 1633a6b118feSFlorian Westphal /* initial rcv_space offering made to peer */ 1634a6b118feSFlorian Westphal msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, 1635a6b118feSFlorian Westphal TCP_INIT_CWND * tp->advmss); 1636a6b118feSFlorian Westphal if (msk->rcvq_space.space == 0) 1637a6b118feSFlorian Westphal msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; 1638a6b118feSFlorian Westphal } 1639a6b118feSFlorian Westphal 1640cf7da0d6SPeter Krystad static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, 1641cf7da0d6SPeter Krystad bool kern) 1642cf7da0d6SPeter Krystad { 1643cf7da0d6SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sk); 1644cf7da0d6SPeter Krystad struct socket *listener; 1645cf7da0d6SPeter Krystad struct sock *newsk; 1646cf7da0d6SPeter Krystad 1647cf7da0d6SPeter Krystad listener = __mptcp_nmpc_socket(msk); 1648cf7da0d6SPeter Krystad if (WARN_ON_ONCE(!listener)) { 1649cf7da0d6SPeter Krystad *err = -EINVAL; 1650cf7da0d6SPeter Krystad return NULL; 1651cf7da0d6SPeter Krystad } 1652cf7da0d6SPeter Krystad 1653cf7da0d6SPeter Krystad pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); 1654cf7da0d6SPeter Krystad newsk = inet_csk_accept(listener->sk, flags, err, kern); 1655cf7da0d6SPeter Krystad if (!newsk) 1656cf7da0d6SPeter Krystad return NULL; 1657cf7da0d6SPeter Krystad 1658cf7da0d6SPeter Krystad pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); 1659cf7da0d6SPeter Krystad if (sk_is_mptcp(newsk)) { 1660cf7da0d6SPeter Krystad struct mptcp_subflow_context *subflow; 1661cf7da0d6SPeter Krystad struct sock *new_mptcp_sock; 1662cf7da0d6SPeter Krystad struct sock *ssk = newsk; 1663cf7da0d6SPeter Krystad 1664cf7da0d6SPeter Krystad subflow = mptcp_subflow_ctx(newsk); 166558b09919SPaolo Abeni new_mptcp_sock = subflow->conn; 166658b09919SPaolo Abeni 166758b09919SPaolo Abeni /* is_mptcp should be false if subflow->conn is missing, see 166858b09919SPaolo Abeni * subflow_syn_recv_sock() 166958b09919SPaolo Abeni */ 167058b09919SPaolo Abeni if (WARN_ON_ONCE(!new_mptcp_sock)) { 167158b09919SPaolo Abeni tcp_sk(newsk)->is_mptcp = 0; 167258b09919SPaolo Abeni return newsk; 167358b09919SPaolo Abeni } 167458b09919SPaolo Abeni 167558b09919SPaolo Abeni /* acquire the 2nd reference for the owning socket */ 167658b09919SPaolo Abeni sock_hold(new_mptcp_sock); 1677cf7da0d6SPeter Krystad 1678cf7da0d6SPeter Krystad local_bh_disable(); 167958b09919SPaolo Abeni bh_lock_sock(new_mptcp_sock); 1680cf7da0d6SPeter Krystad msk = mptcp_sk(new_mptcp_sock); 16818ab183deSPaolo Abeni msk->first = newsk; 1682cf7da0d6SPeter Krystad 1683cf7da0d6SPeter Krystad newsk = new_mptcp_sock; 1684cf7da0d6SPeter Krystad mptcp_copy_inaddrs(newsk, ssk); 1685cf7da0d6SPeter Krystad list_add(&subflow->node, &msk->conn_list); 1686cf7da0d6SPeter Krystad 1687a6b118feSFlorian Westphal mptcp_rcv_space_init(msk, ssk); 1688cf7da0d6SPeter Krystad bh_unlock_sock(new_mptcp_sock); 1689fc518953SFlorian Westphal 1690fc518953SFlorian Westphal __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); 1691cf7da0d6SPeter Krystad local_bh_enable(); 1692fc518953SFlorian Westphal } else { 1693fc518953SFlorian Westphal MPTCP_INC_STATS(sock_net(sk), 1694fc518953SFlorian Westphal MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); 1695cf7da0d6SPeter Krystad } 1696cf7da0d6SPeter Krystad 1697cf7da0d6SPeter Krystad return newsk; 1698cf7da0d6SPeter Krystad } 1699cf7da0d6SPeter Krystad 170079c0949eSPeter Krystad static void mptcp_destroy(struct sock *sk) 
170179c0949eSPeter Krystad { 1702c9fd9c5fSFlorian Westphal struct mptcp_sock *msk = mptcp_sk(sk); 1703c9fd9c5fSFlorian Westphal 17042c5ebd00SPaolo Abeni mptcp_token_destroy(msk); 1705c9fd9c5fSFlorian Westphal if (msk->cached_ext) 1706c9fd9c5fSFlorian Westphal __skb_ext_put(msk->cached_ext); 1707d027236cSPaolo Abeni 1708d027236cSPaolo Abeni sk_sockets_allocated_dec(sk); 170979c0949eSPeter Krystad } 171079c0949eSPeter Krystad 1711fd1452d8SFlorian Westphal static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, 1712a7b75c5aSChristoph Hellwig sockptr_t optval, unsigned int optlen) 1713fd1452d8SFlorian Westphal { 1714fd1452d8SFlorian Westphal struct sock *sk = (struct sock *)msk; 1715fd1452d8SFlorian Westphal struct socket *ssock; 1716fd1452d8SFlorian Westphal int ret; 1717fd1452d8SFlorian Westphal 1718fd1452d8SFlorian Westphal switch (optname) { 1719fd1452d8SFlorian Westphal case SO_REUSEPORT: 1720fd1452d8SFlorian Westphal case SO_REUSEADDR: 1721fd1452d8SFlorian Westphal lock_sock(sk); 1722fd1452d8SFlorian Westphal ssock = __mptcp_nmpc_socket(msk); 1723fd1452d8SFlorian Westphal if (!ssock) { 1724fd1452d8SFlorian Westphal release_sock(sk); 1725fd1452d8SFlorian Westphal return -EINVAL; 1726fd1452d8SFlorian Westphal } 1727fd1452d8SFlorian Westphal 1728a7b75c5aSChristoph Hellwig ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); 1729fd1452d8SFlorian Westphal if (ret == 0) { 1730fd1452d8SFlorian Westphal if (optname == SO_REUSEPORT) 1731fd1452d8SFlorian Westphal sk->sk_reuseport = ssock->sk->sk_reuseport; 1732fd1452d8SFlorian Westphal else if (optname == SO_REUSEADDR) 1733fd1452d8SFlorian Westphal sk->sk_reuse = ssock->sk->sk_reuse; 1734fd1452d8SFlorian Westphal } 1735fd1452d8SFlorian Westphal release_sock(sk); 1736fd1452d8SFlorian Westphal return ret; 1737fd1452d8SFlorian Westphal } 1738fd1452d8SFlorian Westphal 1739a7b75c5aSChristoph Hellwig return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); 1740fd1452d8SFlorian Westphal } 1741fd1452d8SFlorian Westphal 1742c9b95a13SFlorian Westphal static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, 1743a7b75c5aSChristoph Hellwig sockptr_t optval, unsigned int optlen) 1744c9b95a13SFlorian Westphal { 1745c9b95a13SFlorian Westphal struct sock *sk = (struct sock *)msk; 1746c9b95a13SFlorian Westphal int ret = -EOPNOTSUPP; 1747c9b95a13SFlorian Westphal struct socket *ssock; 1748c9b95a13SFlorian Westphal 1749c9b95a13SFlorian Westphal switch (optname) { 1750c9b95a13SFlorian Westphal case IPV6_V6ONLY: 1751c9b95a13SFlorian Westphal lock_sock(sk); 1752c9b95a13SFlorian Westphal ssock = __mptcp_nmpc_socket(msk); 1753c9b95a13SFlorian Westphal if (!ssock) { 1754c9b95a13SFlorian Westphal release_sock(sk); 1755c9b95a13SFlorian Westphal return -EINVAL; 1756c9b95a13SFlorian Westphal } 1757c9b95a13SFlorian Westphal 1758c9b95a13SFlorian Westphal ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); 1759c9b95a13SFlorian Westphal if (ret == 0) 1760c9b95a13SFlorian Westphal sk->sk_ipv6only = ssock->sk->sk_ipv6only; 1761c9b95a13SFlorian Westphal 1762c9b95a13SFlorian Westphal release_sock(sk); 1763c9b95a13SFlorian Westphal break; 1764c9b95a13SFlorian Westphal } 1765c9b95a13SFlorian Westphal 1766c9b95a13SFlorian Westphal return ret; 1767c9b95a13SFlorian Westphal } 1768c9b95a13SFlorian Westphal 1769717e79c8SPeter Krystad static int mptcp_setsockopt(struct sock *sk, int level, int optname, 1770a7b75c5aSChristoph Hellwig sockptr_t optval, unsigned int optlen) 1771717e79c8SPeter Krystad { 
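/* Option dispatch (descriptive note): SOL_SOCKET options are handled at the
 * MPTCP level; after a fallback every option is relayed to the one remaining
 * TCP subflow; SOL_IPV6 has a dedicated helper; anything else is rejected
 * with -EOPNOTSUPP for now.
 */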
1772717e79c8SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sk); 177376660afbSPaolo Abeni struct sock *ssk; 1774717e79c8SPeter Krystad 1775717e79c8SPeter Krystad pr_debug("msk=%p", msk); 1776717e79c8SPeter Krystad 177783f0c10bSFlorian Westphal if (level == SOL_SOCKET) 1778fd1452d8SFlorian Westphal return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); 177983f0c10bSFlorian Westphal 1780717e79c8SPeter Krystad /* @@ the meaning of setsockopt() when the socket is connected and 1781b6e4a1aeSMat Martineau * there are multiple subflows is not yet defined. It is up to the 1782b6e4a1aeSMat Martineau * MPTCP-level socket to configure the subflows until the subflow 1783b6e4a1aeSMat Martineau * is in TCP fallback, when TCP socket options are passed through 1784b6e4a1aeSMat Martineau * to the one remaining subflow. 1785717e79c8SPeter Krystad */ 1786717e79c8SPeter Krystad lock_sock(sk); 178776660afbSPaolo Abeni ssk = __mptcp_tcp_fallback(msk); 1788e154659bSFlorian Westphal release_sock(sk); 178976660afbSPaolo Abeni if (ssk) 179076660afbSPaolo Abeni return tcp_setsockopt(ssk, level, optname, optval, optlen); 179150e741bbSFlorian Westphal 1792c9b95a13SFlorian Westphal if (level == SOL_IPV6) 1793c9b95a13SFlorian Westphal return mptcp_setsockopt_v6(msk, optname, optval, optlen); 1794c9b95a13SFlorian Westphal 1795b6e4a1aeSMat Martineau return -EOPNOTSUPP; 1796717e79c8SPeter Krystad } 1797717e79c8SPeter Krystad 1798717e79c8SPeter Krystad static int mptcp_getsockopt(struct sock *sk, int level, int optname, 179950e741bbSFlorian Westphal char __user *optval, int __user *option) 1800717e79c8SPeter Krystad { 1801717e79c8SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sk); 180276660afbSPaolo Abeni struct sock *ssk; 1803717e79c8SPeter Krystad 1804717e79c8SPeter Krystad pr_debug("msk=%p", msk); 1805717e79c8SPeter Krystad 1806b6e4a1aeSMat Martineau /* @@ the meaning of setsockopt() when the socket is connected and 1807b6e4a1aeSMat Martineau * there are multiple subflows is not yet defined. It is up to the 1808b6e4a1aeSMat Martineau * MPTCP-level socket to configure the subflows until the subflow 1809b6e4a1aeSMat Martineau * is in TCP fallback, when socket options are passed through 1810b6e4a1aeSMat Martineau * to the one remaining subflow. 
1811717e79c8SPeter Krystad */ 1812717e79c8SPeter Krystad lock_sock(sk); 181376660afbSPaolo Abeni ssk = __mptcp_tcp_fallback(msk); 1814e154659bSFlorian Westphal release_sock(sk); 181576660afbSPaolo Abeni if (ssk) 181676660afbSPaolo Abeni return tcp_getsockopt(ssk, level, optname, optval, option); 181750e741bbSFlorian Westphal 1818b6e4a1aeSMat Martineau return -EOPNOTSUPP; 1819717e79c8SPeter Krystad } 1820717e79c8SPeter Krystad 1821b51f9b80SPaolo Abeni #define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ 1822b51f9b80SPaolo Abeni TCPF_WRITE_TIMER_DEFERRED) 182314c441b5SPaolo Abeni 182414c441b5SPaolo Abeni /* this is very alike tcp_release_cb() but we must handle differently a 182514c441b5SPaolo Abeni * different set of events 182614c441b5SPaolo Abeni */ 182714c441b5SPaolo Abeni static void mptcp_release_cb(struct sock *sk) 182814c441b5SPaolo Abeni { 182914c441b5SPaolo Abeni unsigned long flags, nflags; 183014c441b5SPaolo Abeni 183114c441b5SPaolo Abeni do { 183214c441b5SPaolo Abeni flags = sk->sk_tsq_flags; 183314c441b5SPaolo Abeni if (!(flags & MPTCP_DEFERRED_ALL)) 183414c441b5SPaolo Abeni return; 183514c441b5SPaolo Abeni nflags = flags & ~MPTCP_DEFERRED_ALL; 183614c441b5SPaolo Abeni } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); 183714c441b5SPaolo Abeni 1838b51f9b80SPaolo Abeni sock_release_ownership(sk); 1839b51f9b80SPaolo Abeni 184014c441b5SPaolo Abeni if (flags & TCPF_DELACK_TIMER_DEFERRED) { 184114c441b5SPaolo Abeni struct mptcp_sock *msk = mptcp_sk(sk); 184214c441b5SPaolo Abeni struct sock *ssk; 184314c441b5SPaolo Abeni 184414c441b5SPaolo Abeni ssk = mptcp_subflow_recv_lookup(msk); 184514c441b5SPaolo Abeni if (!ssk || !schedule_work(&msk->work)) 184614c441b5SPaolo Abeni __sock_put(sk); 184714c441b5SPaolo Abeni } 1848b51f9b80SPaolo Abeni 1849b51f9b80SPaolo Abeni if (flags & TCPF_WRITE_TIMER_DEFERRED) { 1850b51f9b80SPaolo Abeni mptcp_retransmit_handler(sk); 1851b51f9b80SPaolo Abeni __sock_put(sk); 1852b51f9b80SPaolo Abeni } 185314c441b5SPaolo Abeni } 185414c441b5SPaolo Abeni 18552c5ebd00SPaolo Abeni static int mptcp_hash(struct sock *sk) 18562c5ebd00SPaolo Abeni { 18572c5ebd00SPaolo Abeni /* should never be called, 18582c5ebd00SPaolo Abeni * we hash the TCP subflows not the master socket 18592c5ebd00SPaolo Abeni */ 18602c5ebd00SPaolo Abeni WARN_ON_ONCE(1); 18612c5ebd00SPaolo Abeni return 0; 18622c5ebd00SPaolo Abeni } 18632c5ebd00SPaolo Abeni 18642c5ebd00SPaolo Abeni static void mptcp_unhash(struct sock *sk) 18652c5ebd00SPaolo Abeni { 18662c5ebd00SPaolo Abeni /* called from sk_common_release(), but nothing to do here */ 18672c5ebd00SPaolo Abeni } 18682c5ebd00SPaolo Abeni 1869cec37a6eSPeter Krystad static int mptcp_get_port(struct sock *sk, unsigned short snum) 1870f870fa0bSMat Martineau { 1871f870fa0bSMat Martineau struct mptcp_sock *msk = mptcp_sk(sk); 1872cec37a6eSPeter Krystad struct socket *ssock; 1873f870fa0bSMat Martineau 1874cec37a6eSPeter Krystad ssock = __mptcp_nmpc_socket(msk); 1875cec37a6eSPeter Krystad pr_debug("msk=%p, subflow=%p", msk, ssock); 1876cec37a6eSPeter Krystad if (WARN_ON_ONCE(!ssock)) 1877cec37a6eSPeter Krystad return -EINVAL; 1878f870fa0bSMat Martineau 1879cec37a6eSPeter Krystad return inet_csk_get_port(ssock->sk, snum); 1880cec37a6eSPeter Krystad } 1881f870fa0bSMat Martineau 1882cec37a6eSPeter Krystad void mptcp_finish_connect(struct sock *ssk) 1883cec37a6eSPeter Krystad { 1884cec37a6eSPeter Krystad struct mptcp_subflow_context *subflow; 1885cec37a6eSPeter Krystad struct mptcp_sock *msk; 1886cec37a6eSPeter Krystad struct sock *sk; 
18876d0060f6SMat Martineau u64 ack_seq; 1888f870fa0bSMat Martineau 1889cec37a6eSPeter Krystad subflow = mptcp_subflow_ctx(ssk); 1890cec37a6eSPeter Krystad sk = subflow->conn; 1891cec37a6eSPeter Krystad msk = mptcp_sk(sk); 1892cec37a6eSPeter Krystad 1893648ef4b8SMat Martineau pr_debug("msk=%p, token=%u", sk, subflow->token); 1894648ef4b8SMat Martineau 18956d0060f6SMat Martineau mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); 18966d0060f6SMat Martineau ack_seq++; 1897648ef4b8SMat Martineau subflow->map_seq = ack_seq; 1898648ef4b8SMat Martineau subflow->map_subflow_seq = 1; 18996d0060f6SMat Martineau 1900cec37a6eSPeter Krystad /* the socket is not connected yet, no msk/subflow ops can access/race 1901cec37a6eSPeter Krystad * accessing the field below 1902cec37a6eSPeter Krystad */ 1903cec37a6eSPeter Krystad WRITE_ONCE(msk->remote_key, subflow->remote_key); 1904cec37a6eSPeter Krystad WRITE_ONCE(msk->local_key, subflow->local_key); 19056d0060f6SMat Martineau WRITE_ONCE(msk->write_seq, subflow->idsn + 1); 19066d0060f6SMat Martineau WRITE_ONCE(msk->ack_seq, ack_seq); 1907d22f4988SChristoph Paasch WRITE_ONCE(msk->can_ack, 1); 1908cc9d2566SPaolo Abeni atomic64_set(&msk->snd_una, msk->write_seq); 19091b1c7a0eSPeter Krystad 19101b1c7a0eSPeter Krystad mptcp_pm_new_connection(msk, 0); 1911a6b118feSFlorian Westphal 1912a6b118feSFlorian Westphal mptcp_rcv_space_init(msk, ssk); 1913f870fa0bSMat Martineau } 1914f870fa0bSMat Martineau 1915cf7da0d6SPeter Krystad static void mptcp_sock_graft(struct sock *sk, struct socket *parent) 1916cf7da0d6SPeter Krystad { 1917cf7da0d6SPeter Krystad write_lock_bh(&sk->sk_callback_lock); 1918cf7da0d6SPeter Krystad rcu_assign_pointer(sk->sk_wq, &parent->wq); 1919cf7da0d6SPeter Krystad sk_set_socket(sk, parent); 1920cf7da0d6SPeter Krystad sk->sk_uid = SOCK_INODE(parent)->i_uid; 1921cf7da0d6SPeter Krystad write_unlock_bh(&sk->sk_callback_lock); 1922cf7da0d6SPeter Krystad } 1923cf7da0d6SPeter Krystad 1924f296234cSPeter Krystad bool mptcp_finish_join(struct sock *sk) 1925f296234cSPeter Krystad { 1926f296234cSPeter Krystad struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1927f296234cSPeter Krystad struct mptcp_sock *msk = mptcp_sk(subflow->conn); 1928f296234cSPeter Krystad struct sock *parent = (void *)msk; 1929f296234cSPeter Krystad struct socket *parent_sock; 1930ec3edaa7SPeter Krystad bool ret; 1931f296234cSPeter Krystad 1932f296234cSPeter Krystad pr_debug("msk=%p, subflow=%p", msk, subflow); 1933f296234cSPeter Krystad 1934f296234cSPeter Krystad /* mptcp socket already closing? */ 1935b93df08cSPaolo Abeni if (!mptcp_is_fully_established(parent)) 1936f296234cSPeter Krystad return false; 1937f296234cSPeter Krystad 1938f296234cSPeter Krystad if (!msk->pm.server_side) 1939f296234cSPeter Krystad return true; 1940f296234cSPeter Krystad 194110f6d46cSPaolo Abeni if (!mptcp_pm_allow_new_subflow(msk)) 194210f6d46cSPaolo Abeni return false; 194310f6d46cSPaolo Abeni 194410f6d46cSPaolo Abeni /* active connections are already on conn_list, and we can't acquire 194510f6d46cSPaolo Abeni * msk lock here. 
194610f6d46cSPaolo Abeni * use the join list lock as synchronization point and double-check 194710f6d46cSPaolo Abeni * msk status to avoid racing with mptcp_close() 194810f6d46cSPaolo Abeni */ 194910f6d46cSPaolo Abeni spin_lock_bh(&msk->join_list_lock); 195010f6d46cSPaolo Abeni ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; 195110f6d46cSPaolo Abeni if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) 195210f6d46cSPaolo Abeni list_add_tail(&subflow->node, &msk->join_list); 195310f6d46cSPaolo Abeni spin_unlock_bh(&msk->join_list_lock); 195410f6d46cSPaolo Abeni if (!ret) 195510f6d46cSPaolo Abeni return false; 195610f6d46cSPaolo Abeni 195710f6d46cSPaolo Abeni /* attach to msk socket only after we are sure he will deal with us 195810f6d46cSPaolo Abeni * at close time 195910f6d46cSPaolo Abeni */ 1960f296234cSPeter Krystad parent_sock = READ_ONCE(parent->sk_socket); 1961f296234cSPeter Krystad if (parent_sock && !sk->sk_socket) 1962f296234cSPeter Krystad mptcp_sock_graft(sk, parent_sock); 196364d950aeSChristoph Paasch subflow->map_seq = msk->ack_seq; 196410f6d46cSPaolo Abeni return true; 1965f296234cSPeter Krystad } 1966f296234cSPeter Krystad 19671891c4a0SFlorian Westphal static bool mptcp_memory_free(const struct sock *sk, int wake) 19681891c4a0SFlorian Westphal { 19691891c4a0SFlorian Westphal struct mptcp_sock *msk = mptcp_sk(sk); 19701891c4a0SFlorian Westphal 19711891c4a0SFlorian Westphal return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; 19721891c4a0SFlorian Westphal } 19731891c4a0SFlorian Westphal 1974f870fa0bSMat Martineau static struct proto mptcp_prot = { 1975f870fa0bSMat Martineau .name = "MPTCP", 1976f870fa0bSMat Martineau .owner = THIS_MODULE, 1977f870fa0bSMat Martineau .init = mptcp_init_sock, 197818b683bfSPaolo Abeni .disconnect = mptcp_disconnect, 1979f870fa0bSMat Martineau .close = mptcp_close, 1980cf7da0d6SPeter Krystad .accept = mptcp_accept, 1981717e79c8SPeter Krystad .setsockopt = mptcp_setsockopt, 1982717e79c8SPeter Krystad .getsockopt = mptcp_getsockopt, 1983f870fa0bSMat Martineau .shutdown = tcp_shutdown, 198479c0949eSPeter Krystad .destroy = mptcp_destroy, 1985f870fa0bSMat Martineau .sendmsg = mptcp_sendmsg, 1986f870fa0bSMat Martineau .recvmsg = mptcp_recvmsg, 198714c441b5SPaolo Abeni .release_cb = mptcp_release_cb, 19882c5ebd00SPaolo Abeni .hash = mptcp_hash, 19892c5ebd00SPaolo Abeni .unhash = mptcp_unhash, 1990cec37a6eSPeter Krystad .get_port = mptcp_get_port, 1991d027236cSPaolo Abeni .sockets_allocated = &mptcp_sockets_allocated, 1992d027236cSPaolo Abeni .memory_allocated = &tcp_memory_allocated, 1993d027236cSPaolo Abeni .memory_pressure = &tcp_memory_pressure, 19941891c4a0SFlorian Westphal .stream_memory_free = mptcp_memory_free, 1995d027236cSPaolo Abeni .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 1996d027236cSPaolo Abeni .sysctl_mem = sysctl_tcp_mem, 1997f870fa0bSMat Martineau .obj_size = sizeof(struct mptcp_sock), 19982c5ebd00SPaolo Abeni .slab_flags = SLAB_TYPESAFE_BY_RCU, 1999f870fa0bSMat Martineau .no_autobind = true, 2000f870fa0bSMat Martineau }; 2001f870fa0bSMat Martineau 20022303f994SPeter Krystad static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 20032303f994SPeter Krystad { 20042303f994SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sock->sk); 20052303f994SPeter Krystad struct socket *ssock; 2006cf7da0d6SPeter Krystad int err; 20072303f994SPeter Krystad 20082303f994SPeter Krystad lock_sock(sock->sk); 2009fa68018dSPaolo Abeni ssock = __mptcp_nmpc_socket(msk); 2010fa68018dSPaolo Abeni if 
(!ssock) { 2011fa68018dSPaolo Abeni err = -EINVAL; 20122303f994SPeter Krystad goto unlock; 20132303f994SPeter Krystad } 20142303f994SPeter Krystad 20152303f994SPeter Krystad err = ssock->ops->bind(ssock, uaddr, addr_len); 2016cf7da0d6SPeter Krystad if (!err) 2017cf7da0d6SPeter Krystad mptcp_copy_inaddrs(sock->sk, ssock->sk); 20182303f994SPeter Krystad 20192303f994SPeter Krystad unlock: 20202303f994SPeter Krystad release_sock(sock->sk); 20212303f994SPeter Krystad return err; 20222303f994SPeter Krystad } 20232303f994SPeter Krystad 20240235d075SPaolo Abeni static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, 20250235d075SPaolo Abeni struct mptcp_subflow_context *subflow) 20260235d075SPaolo Abeni { 20270235d075SPaolo Abeni subflow->request_mptcp = 0; 20280235d075SPaolo Abeni __mptcp_do_fallback(msk); 20290235d075SPaolo Abeni } 20300235d075SPaolo Abeni 20312303f994SPeter Krystad static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, 20322303f994SPeter Krystad int addr_len, int flags) 20332303f994SPeter Krystad { 20342303f994SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sock->sk); 20352c5ebd00SPaolo Abeni struct mptcp_subflow_context *subflow; 20362303f994SPeter Krystad struct socket *ssock; 20372303f994SPeter Krystad int err; 20382303f994SPeter Krystad 20392303f994SPeter Krystad lock_sock(sock->sk); 204041be81a8SPaolo Abeni if (sock->state != SS_UNCONNECTED && msk->subflow) { 204141be81a8SPaolo Abeni /* pending connection or invalid state, let existing subflow 204241be81a8SPaolo Abeni * cope with that 204341be81a8SPaolo Abeni */ 204441be81a8SPaolo Abeni ssock = msk->subflow; 204541be81a8SPaolo Abeni goto do_connect; 204641be81a8SPaolo Abeni } 204741be81a8SPaolo Abeni 2048fa68018dSPaolo Abeni ssock = __mptcp_nmpc_socket(msk); 2049fa68018dSPaolo Abeni if (!ssock) { 2050fa68018dSPaolo Abeni err = -EINVAL; 20512303f994SPeter Krystad goto unlock; 20522303f994SPeter Krystad } 20532303f994SPeter Krystad 2054fa68018dSPaolo Abeni mptcp_token_destroy(msk); 2055fa68018dSPaolo Abeni inet_sk_state_store(sock->sk, TCP_SYN_SENT); 20562c5ebd00SPaolo Abeni subflow = mptcp_subflow_ctx(ssock->sk); 2057cf7da0d6SPeter Krystad #ifdef CONFIG_TCP_MD5SIG 2058cf7da0d6SPeter Krystad /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 2059cf7da0d6SPeter Krystad * TCP option space. 
2060cf7da0d6SPeter Krystad */ 2061cf7da0d6SPeter Krystad if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) 20620235d075SPaolo Abeni mptcp_subflow_early_fallback(msk, subflow); 2063cf7da0d6SPeter Krystad #endif 20642c5ebd00SPaolo Abeni if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) 20650235d075SPaolo Abeni mptcp_subflow_early_fallback(msk, subflow); 2066cf7da0d6SPeter Krystad 206741be81a8SPaolo Abeni do_connect: 20682303f994SPeter Krystad err = ssock->ops->connect(ssock, uaddr, addr_len, flags); 206941be81a8SPaolo Abeni sock->state = ssock->state; 207041be81a8SPaolo Abeni 207141be81a8SPaolo Abeni /* on successful connect, the msk state will be moved to established by 207241be81a8SPaolo Abeni * subflow_finish_connect() 207341be81a8SPaolo Abeni */ 207441be81a8SPaolo Abeni if (!err || err == EINPROGRESS) 2075cf7da0d6SPeter Krystad mptcp_copy_inaddrs(sock->sk, ssock->sk); 207641be81a8SPaolo Abeni else 207741be81a8SPaolo Abeni inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 20782303f994SPeter Krystad 20792303f994SPeter Krystad unlock: 20802303f994SPeter Krystad release_sock(sock->sk); 20812303f994SPeter Krystad return err; 20822303f994SPeter Krystad } 20832303f994SPeter Krystad 2084cf7da0d6SPeter Krystad static int mptcp_listen(struct socket *sock, int backlog) 2085cf7da0d6SPeter Krystad { 2086cf7da0d6SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sock->sk); 2087cf7da0d6SPeter Krystad struct socket *ssock; 2088cf7da0d6SPeter Krystad int err; 2089cf7da0d6SPeter Krystad 2090cf7da0d6SPeter Krystad pr_debug("msk=%p", msk); 2091cf7da0d6SPeter Krystad 2092cf7da0d6SPeter Krystad lock_sock(sock->sk); 2093fa68018dSPaolo Abeni ssock = __mptcp_nmpc_socket(msk); 2094fa68018dSPaolo Abeni if (!ssock) { 2095fa68018dSPaolo Abeni err = -EINVAL; 2096cf7da0d6SPeter Krystad goto unlock; 2097cf7da0d6SPeter Krystad } 2098cf7da0d6SPeter Krystad 2099fa68018dSPaolo Abeni mptcp_token_destroy(msk); 2100fa68018dSPaolo Abeni inet_sk_state_store(sock->sk, TCP_LISTEN); 21015e20087dSFlorian Westphal sock_set_flag(sock->sk, SOCK_RCU_FREE); 21025e20087dSFlorian Westphal 2103cf7da0d6SPeter Krystad err = ssock->ops->listen(ssock, backlog); 2104cf7da0d6SPeter Krystad inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); 2105cf7da0d6SPeter Krystad if (!err) 2106cf7da0d6SPeter Krystad mptcp_copy_inaddrs(sock->sk, ssock->sk); 2107cf7da0d6SPeter Krystad 2108cf7da0d6SPeter Krystad unlock: 2109cf7da0d6SPeter Krystad release_sock(sock->sk); 2110cf7da0d6SPeter Krystad return err; 2111cf7da0d6SPeter Krystad } 2112cf7da0d6SPeter Krystad 2113cf7da0d6SPeter Krystad static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, 2114cf7da0d6SPeter Krystad int flags, bool kern) 2115cf7da0d6SPeter Krystad { 2116cf7da0d6SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sock->sk); 2117cf7da0d6SPeter Krystad struct socket *ssock; 2118cf7da0d6SPeter Krystad int err; 2119cf7da0d6SPeter Krystad 2120cf7da0d6SPeter Krystad pr_debug("msk=%p", msk); 2121cf7da0d6SPeter Krystad 2122cf7da0d6SPeter Krystad lock_sock(sock->sk); 2123cf7da0d6SPeter Krystad if (sock->sk->sk_state != TCP_LISTEN) 2124cf7da0d6SPeter Krystad goto unlock_fail; 2125cf7da0d6SPeter Krystad 2126cf7da0d6SPeter Krystad ssock = __mptcp_nmpc_socket(msk); 2127cf7da0d6SPeter Krystad if (!ssock) 2128cf7da0d6SPeter Krystad goto unlock_fail; 2129cf7da0d6SPeter Krystad 21308a05661bSPaolo Abeni clear_bit(MPTCP_DATA_READY, &msk->flags); 2131cf7da0d6SPeter Krystad sock_hold(ssock->sk); 2132cf7da0d6SPeter Krystad release_sock(sock->sk); 
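/* the msk socket lock is dropped before the potentially blocking accept()
 * call below; the reference taken on ssock->sk just above keeps the listener
 * subflow alive in the meantime
 */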
2133cf7da0d6SPeter Krystad 2134cf7da0d6SPeter Krystad err = ssock->ops->accept(sock, newsock, flags, kern); 2135d2f77c53SPaolo Abeni if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { 2136cf7da0d6SPeter Krystad struct mptcp_sock *msk = mptcp_sk(newsock->sk); 2137cf7da0d6SPeter Krystad struct mptcp_subflow_context *subflow; 2138cf7da0d6SPeter Krystad 2139cf7da0d6SPeter Krystad /* set ssk->sk_socket of accept()ed flows to mptcp socket. 2140cf7da0d6SPeter Krystad * This is needed so NOSPACE flag can be set from tcp stack. 2141cf7da0d6SPeter Krystad */ 2142ec3edaa7SPeter Krystad __mptcp_flush_join_list(msk); 2143cf7da0d6SPeter Krystad list_for_each_entry(subflow, &msk->conn_list, node) { 2144cf7da0d6SPeter Krystad struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 2145cf7da0d6SPeter Krystad 2146cf7da0d6SPeter Krystad if (!ssk->sk_socket) 2147cf7da0d6SPeter Krystad mptcp_sock_graft(ssk, newsock); 2148cf7da0d6SPeter Krystad } 2149cf7da0d6SPeter Krystad } 2150cf7da0d6SPeter Krystad 21518a05661bSPaolo Abeni if (inet_csk_listen_poll(ssock->sk)) 21528a05661bSPaolo Abeni set_bit(MPTCP_DATA_READY, &msk->flags); 2153cf7da0d6SPeter Krystad sock_put(ssock->sk); 2154cf7da0d6SPeter Krystad return err; 2155cf7da0d6SPeter Krystad 2156cf7da0d6SPeter Krystad unlock_fail: 2157cf7da0d6SPeter Krystad release_sock(sock->sk); 2158cf7da0d6SPeter Krystad return -EINVAL; 2159cf7da0d6SPeter Krystad } 2160cf7da0d6SPeter Krystad 21618a05661bSPaolo Abeni static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 21628a05661bSPaolo Abeni { 21638a05661bSPaolo Abeni return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 21648a05661bSPaolo Abeni 0; 21658a05661bSPaolo Abeni } 21668a05661bSPaolo Abeni 21672303f994SPeter Krystad static __poll_t mptcp_poll(struct file *file, struct socket *sock, 21682303f994SPeter Krystad struct poll_table_struct *wait) 21692303f994SPeter Krystad { 21701891c4a0SFlorian Westphal struct sock *sk = sock->sk; 21718ab183deSPaolo Abeni struct mptcp_sock *msk; 21722303f994SPeter Krystad __poll_t mask = 0; 21738a05661bSPaolo Abeni int state; 21742303f994SPeter Krystad 21751891c4a0SFlorian Westphal msk = mptcp_sk(sk); 21761891c4a0SFlorian Westphal sock_poll_wait(file, sock, wait); 21771891c4a0SFlorian Westphal 21788a05661bSPaolo Abeni state = inet_sk_state_load(sk); 21798a05661bSPaolo Abeni if (state == TCP_LISTEN) 21808a05661bSPaolo Abeni return mptcp_check_readable(msk); 21818a05661bSPaolo Abeni 21828a05661bSPaolo Abeni if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { 21838a05661bSPaolo Abeni mask |= mptcp_check_readable(msk); 21841891c4a0SFlorian Westphal if (sk_stream_is_writeable(sk) && 21851891c4a0SFlorian Westphal test_bit(MPTCP_SEND_SPACE, &msk->flags)) 21861891c4a0SFlorian Westphal mask |= EPOLLOUT | EPOLLWRNORM; 21878a05661bSPaolo Abeni } 21881891c4a0SFlorian Westphal if (sk->sk_shutdown & RCV_SHUTDOWN) 21891891c4a0SFlorian Westphal mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 21901891c4a0SFlorian Westphal 21912303f994SPeter Krystad return mask; 21922303f994SPeter Krystad } 21932303f994SPeter Krystad 219421498490SPeter Krystad static int mptcp_shutdown(struct socket *sock, int how) 219521498490SPeter Krystad { 219621498490SPeter Krystad struct mptcp_sock *msk = mptcp_sk(sock->sk); 219721498490SPeter Krystad struct mptcp_subflow_context *subflow; 219821498490SPeter Krystad int ret = 0; 219921498490SPeter Krystad 220021498490SPeter Krystad pr_debug("sk=%p, how=%d", msk, how); 220121498490SPeter Krystad 220221498490SPeter Krystad lock_sock(sock->sk); 220321498490SPeter Krystad 
if (how == SHUT_WR || how == SHUT_RDWR) 220421498490SPeter Krystad inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); 220521498490SPeter Krystad 220621498490SPeter Krystad how++; 220721498490SPeter Krystad 220821498490SPeter Krystad if ((how & ~SHUTDOWN_MASK) || !how) { 220921498490SPeter Krystad ret = -EINVAL; 221021498490SPeter Krystad goto out_unlock; 221121498490SPeter Krystad } 221221498490SPeter Krystad 221321498490SPeter Krystad if (sock->state == SS_CONNECTING) { 221421498490SPeter Krystad if ((1 << sock->sk->sk_state) & 221521498490SPeter Krystad (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) 221621498490SPeter Krystad sock->state = SS_DISCONNECTING; 221721498490SPeter Krystad else 221821498490SPeter Krystad sock->state = SS_CONNECTED; 221921498490SPeter Krystad } 222021498490SPeter Krystad 2221ec3edaa7SPeter Krystad __mptcp_flush_join_list(msk); 22227279da61SMat Martineau msk->snd_data_fin_enable = 1; 22237279da61SMat Martineau 222421498490SPeter Krystad mptcp_for_each_subflow(msk, subflow) { 222521498490SPeter Krystad struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); 222621498490SPeter Krystad 22277279da61SMat Martineau mptcp_subflow_shutdown(tcp_sk, how); 222821498490SPeter Krystad } 222921498490SPeter Krystad 2230e1ff9e82SDavide Caratti /* Wake up anyone sleeping in poll. */ 2231e1ff9e82SDavide Caratti sock->sk->sk_state_change(sock->sk); 2232e1ff9e82SDavide Caratti 223321498490SPeter Krystad out_unlock: 223421498490SPeter Krystad release_sock(sock->sk); 223521498490SPeter Krystad 223621498490SPeter Krystad return ret; 223721498490SPeter Krystad } 223821498490SPeter Krystad 2239e42f1ac6SFlorian Westphal static const struct proto_ops mptcp_stream_ops = { 2240e42f1ac6SFlorian Westphal .family = PF_INET, 2241e42f1ac6SFlorian Westphal .owner = THIS_MODULE, 2242e42f1ac6SFlorian Westphal .release = inet_release, 2243e42f1ac6SFlorian Westphal .bind = mptcp_bind, 2244e42f1ac6SFlorian Westphal .connect = mptcp_stream_connect, 2245e42f1ac6SFlorian Westphal .socketpair = sock_no_socketpair, 2246e42f1ac6SFlorian Westphal .accept = mptcp_stream_accept, 2247d2f77c53SPaolo Abeni .getname = inet_getname, 2248e42f1ac6SFlorian Westphal .poll = mptcp_poll, 2249e42f1ac6SFlorian Westphal .ioctl = inet_ioctl, 2250e42f1ac6SFlorian Westphal .gettstamp = sock_gettstamp, 2251e42f1ac6SFlorian Westphal .listen = mptcp_listen, 2252e42f1ac6SFlorian Westphal .shutdown = mptcp_shutdown, 2253e42f1ac6SFlorian Westphal .setsockopt = sock_common_setsockopt, 2254e42f1ac6SFlorian Westphal .getsockopt = sock_common_getsockopt, 2255e42f1ac6SFlorian Westphal .sendmsg = inet_sendmsg, 2256e42f1ac6SFlorian Westphal .recvmsg = inet_recvmsg, 2257e42f1ac6SFlorian Westphal .mmap = sock_no_mmap, 2258e42f1ac6SFlorian Westphal .sendpage = inet_sendpage, 2259e42f1ac6SFlorian Westphal }; 22602303f994SPeter Krystad 2261f870fa0bSMat Martineau static struct inet_protosw mptcp_protosw = { 2262f870fa0bSMat Martineau .type = SOCK_STREAM, 2263f870fa0bSMat Martineau .protocol = IPPROTO_MPTCP, 2264f870fa0bSMat Martineau .prot = &mptcp_prot, 22652303f994SPeter Krystad .ops = &mptcp_stream_ops, 22662303f994SPeter Krystad .flags = INET_PROTOSW_ICSK, 2267f870fa0bSMat Martineau }; 2268f870fa0bSMat Martineau 2269d39dcecaSPaolo Abeni void __init mptcp_proto_init(void) 2270f870fa0bSMat Martineau { 22712303f994SPeter Krystad mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; 22722303f994SPeter Krystad 2273d027236cSPaolo Abeni if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) 2274d027236cSPaolo Abeni panic("Failed to allocate MPTCP pcpu 
counter\n"); 2275d027236cSPaolo Abeni 22762303f994SPeter Krystad mptcp_subflow_init(); 22771b1c7a0eSPeter Krystad mptcp_pm_init(); 22782c5ebd00SPaolo Abeni mptcp_token_init(); 22792303f994SPeter Krystad 2280f870fa0bSMat Martineau if (proto_register(&mptcp_prot, 1) != 0) 2281f870fa0bSMat Martineau panic("Failed to register MPTCP proto.\n"); 2282f870fa0bSMat Martineau 2283f870fa0bSMat Martineau inet_register_protosw(&mptcp_protosw); 22846771bfd9SFlorian Westphal 22856771bfd9SFlorian Westphal BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); 2286f870fa0bSMat Martineau } 2287f870fa0bSMat Martineau 2288f870fa0bSMat Martineau #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2289e42f1ac6SFlorian Westphal static const struct proto_ops mptcp_v6_stream_ops = { 2290e42f1ac6SFlorian Westphal .family = PF_INET6, 2291e42f1ac6SFlorian Westphal .owner = THIS_MODULE, 2292e42f1ac6SFlorian Westphal .release = inet6_release, 2293e42f1ac6SFlorian Westphal .bind = mptcp_bind, 2294e42f1ac6SFlorian Westphal .connect = mptcp_stream_connect, 2295e42f1ac6SFlorian Westphal .socketpair = sock_no_socketpair, 2296e42f1ac6SFlorian Westphal .accept = mptcp_stream_accept, 2297d2f77c53SPaolo Abeni .getname = inet6_getname, 2298e42f1ac6SFlorian Westphal .poll = mptcp_poll, 2299e42f1ac6SFlorian Westphal .ioctl = inet6_ioctl, 2300e42f1ac6SFlorian Westphal .gettstamp = sock_gettstamp, 2301e42f1ac6SFlorian Westphal .listen = mptcp_listen, 2302e42f1ac6SFlorian Westphal .shutdown = mptcp_shutdown, 2303e42f1ac6SFlorian Westphal .setsockopt = sock_common_setsockopt, 2304e42f1ac6SFlorian Westphal .getsockopt = sock_common_getsockopt, 2305e42f1ac6SFlorian Westphal .sendmsg = inet6_sendmsg, 2306e42f1ac6SFlorian Westphal .recvmsg = inet6_recvmsg, 2307e42f1ac6SFlorian Westphal .mmap = sock_no_mmap, 2308e42f1ac6SFlorian Westphal .sendpage = inet_sendpage, 2309e42f1ac6SFlorian Westphal #ifdef CONFIG_COMPAT 23103986912fSChristoph Hellwig .compat_ioctl = inet6_compat_ioctl, 2311e42f1ac6SFlorian Westphal #endif 2312e42f1ac6SFlorian Westphal }; 2313e42f1ac6SFlorian Westphal 2314f870fa0bSMat Martineau static struct proto mptcp_v6_prot; 2315f870fa0bSMat Martineau 231679c0949eSPeter Krystad static void mptcp_v6_destroy(struct sock *sk) 231779c0949eSPeter Krystad { 231879c0949eSPeter Krystad mptcp_destroy(sk); 231979c0949eSPeter Krystad inet6_destroy_sock(sk); 232079c0949eSPeter Krystad } 232179c0949eSPeter Krystad 2322f870fa0bSMat Martineau static struct inet_protosw mptcp_v6_protosw = { 2323f870fa0bSMat Martineau .type = SOCK_STREAM, 2324f870fa0bSMat Martineau .protocol = IPPROTO_MPTCP, 2325f870fa0bSMat Martineau .prot = &mptcp_v6_prot, 23262303f994SPeter Krystad .ops = &mptcp_v6_stream_ops, 2327f870fa0bSMat Martineau .flags = INET_PROTOSW_ICSK, 2328f870fa0bSMat Martineau }; 2329f870fa0bSMat Martineau 2330d39dcecaSPaolo Abeni int __init mptcp_proto_v6_init(void) 2331f870fa0bSMat Martineau { 2332f870fa0bSMat Martineau int err; 2333f870fa0bSMat Martineau 2334f870fa0bSMat Martineau mptcp_v6_prot = mptcp_prot; 2335f870fa0bSMat Martineau strcpy(mptcp_v6_prot.name, "MPTCPv6"); 2336f870fa0bSMat Martineau mptcp_v6_prot.slab = NULL; 233779c0949eSPeter Krystad mptcp_v6_prot.destroy = mptcp_v6_destroy; 2338b0519de8SFlorian Westphal mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); 2339f870fa0bSMat Martineau 2340f870fa0bSMat Martineau err = proto_register(&mptcp_v6_prot, 1); 2341f870fa0bSMat Martineau if (err) 2342f870fa0bSMat Martineau return err; 2343f870fa0bSMat Martineau 2344f870fa0bSMat Martineau err = 
inet6_register_protosw(&mptcp_v6_protosw); 2345f870fa0bSMat Martineau if (err) 2346f870fa0bSMat Martineau proto_unregister(&mptcp_v6_prot); 2347f870fa0bSMat Martineau 2348f870fa0bSMat Martineau return err; 2349f870fa0bSMat Martineau } 2350f870fa0bSMat Martineau #endif
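/* Illustrative userspace sketch (not part of this file): how an application
 * reaches the IPPROTO_MPTCP protosw registered above. If CONFIG_MPTCP is not
 * built in, socket() fails with EPROTONOSUPPORT; if MPTCP is disabled via the
 * net.mptcp.enabled sysctl, mptcp_init_sock() above makes it fail with
 * ENOPROTOOPT and the caller can retry with plain IPPROTO_TCP. The manual
 * define below is only a fallback for libc headers that predate MPTCP.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* value from include/uapi/linux/in.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0) {
		/* kernel without MPTCP support or MPTCP disabled: fall back */
		perror("socket(IPPROTO_MPTCP)");
		fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
		if (fd < 0)
			return 1;
	}

	/* from here on the fd behaves like a regular SOCK_STREAM socket:
	 * bind()/connect()/listen()/accept() go through the mptcp_stream_ops
	 * defined in this file when MPTCP is in use.
	 */
	close(fd);
	return 0;
}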