xref: /openbmc/linux/net/mptcp/subflow.c (revision 1e328ed5)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6 
7 #define pr_fmt(fmt) "MPTCP: " fmt
8 
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/netdevice.h>
12 #include <crypto/algapi.h>
13 #include <crypto/sha2.h>
14 #include <net/sock.h>
15 #include <net/inet_common.h>
16 #include <net/inet_hashtables.h>
17 #include <net/protocol.h>
18 #include <net/tcp.h>
19 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
20 #include <net/ip6_route.h>
21 #include <net/transp_v6.h>
22 #endif
23 #include <net/mptcp.h>
24 #include <uapi/linux/mptcp.h>
25 #include "protocol.h"
26 #include "mib.h"
27 
28 static void mptcp_subflow_ops_undo_override(struct sock *ssk);
29 
30 static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
31 				  enum linux_mptcp_mib_field field)
32 {
33 	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
34 }
35 
36 static void subflow_req_destructor(struct request_sock *req)
37 {
38 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
39 
40 	pr_debug("subflow_req=%p", subflow_req);
41 
42 	if (subflow_req->msk)
43 		sock_put((struct sock *)subflow_req->msk);
44 
45 	mptcp_token_destroy_request(req);
46 	tcp_request_sock_ops.destructor(req);
47 }
48 
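/* Compute the HMAC used for MP_JOIN subflow authentication: the message is
 * the concatenation of the two 32-bit nonces (big endian), keyed with
 * key1/key2.
 */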
49 static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
50 				  void *hmac)
51 {
52 	u8 msg[8];
53 
54 	put_unaligned_be32(nonce1, &msg[0]);
55 	put_unaligned_be32(nonce2, &msg[4]);
56 
57 	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
58 }
59 
60 static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
61 {
62 	return mptcp_is_fully_established((void *)msk) &&
63 	       READ_ONCE(msk->pm.accept_subflow);
64 }
65 
66 /* validate received token and create truncated hmac and nonce for SYN-ACK */
67 static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
68 						     const struct sk_buff *skb)
69 {
70 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
71 	u8 hmac[SHA256_DIGEST_SIZE];
72 	struct mptcp_sock *msk;
73 	int local_id;
74 
75 	msk = mptcp_token_get_sock(subflow_req->token);
76 	if (!msk) {
77 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
78 		return NULL;
79 	}
80 
81 	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
82 	if (local_id < 0) {
83 		sock_put((struct sock *)msk);
84 		return NULL;
85 	}
86 	subflow_req->local_id = local_id;
87 
88 	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
89 
90 	subflow_generate_hmac(msk->local_key, msk->remote_key,
91 			      subflow_req->local_nonce,
92 			      subflow_req->remote_nonce, hmac);
93 
94 	subflow_req->thmac = get_unaligned_be64(hmac);
95 	return msk;
96 }
97 
98 static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
99 {
100 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
101 
102 	subflow_req->mp_capable = 0;
103 	subflow_req->mp_join = 0;
104 	subflow_req->msk = NULL;
105 	mptcp_token_init_request(req);
106 
107 #ifdef CONFIG_TCP_MD5SIG
108 	/* no MPTCP if MD5SIG is enabled on this socket, or we may run out of
109 	 * TCP option space.
110 	 */
111 	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
112 		return -EINVAL;
113 #endif
114 
115 	return 0;
116 }
117 
118 /* Init mptcp request socket.
119  *
120  * Returns an error code if a JOIN has failed and a TCP reset
121  * should be sent.
122  */
123 static int subflow_init_req(struct request_sock *req,
124 			    const struct sock *sk_listener,
125 			    struct sk_buff *skb)
126 {
127 	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
128 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
129 	struct mptcp_options_received mp_opt;
130 	int ret;
131 
132 	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
133 
134 	ret = __subflow_init_req(req, sk_listener);
135 	if (ret)
136 		return 0;
137 
138 	mptcp_get_options(skb, &mp_opt);
139 
140 	if (mp_opt.mp_capable) {
141 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
142 
143 		if (mp_opt.mp_join)
144 			return 0;
145 	} else if (mp_opt.mp_join) {
146 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
147 	}
148 
149 	if (mp_opt.mp_capable && listener->request_mptcp) {
150 		int err, retries = 4;
151 
152 		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
153 again:
154 		do {
155 			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
156 		} while (subflow_req->local_key == 0);
157 
158 		if (unlikely(req->syncookie)) {
159 			mptcp_crypto_key_sha(subflow_req->local_key,
160 					     &subflow_req->token,
161 					     &subflow_req->idsn);
162 			if (mptcp_token_exists(subflow_req->token)) {
163 				if (retries-- > 0)
164 					goto again;
165 			} else {
166 				subflow_req->mp_capable = 1;
167 			}
168 			return 0;
169 		}
170 
171 		err = mptcp_token_new_request(req);
172 		if (err == 0)
173 			subflow_req->mp_capable = 1;
174 		else if (retries-- > 0)
175 			goto again;
176 
177 	} else if (mp_opt.mp_join && listener->request_mptcp) {
178 		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
179 		subflow_req->mp_join = 1;
180 		subflow_req->backup = mp_opt.backup;
181 		subflow_req->remote_id = mp_opt.join_id;
182 		subflow_req->token = mp_opt.token;
183 		subflow_req->remote_nonce = mp_opt.nonce;
184 		subflow_req->msk = subflow_token_join_request(req, skb);
185 
186 		/* Can't fall back to TCP in this case. */
187 		if (!subflow_req->msk)
188 			return -EPERM;
189 
190 		if (unlikely(req->syncookie)) {
191 			if (mptcp_can_accept_new_subflow(subflow_req->msk))
192 				subflow_init_req_cookie_join_save(subflow_req, skb);
193 		}
194 
195 		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
196 			 subflow_req->remote_nonce, subflow_req->msk);
197 	}
198 
199 	return 0;
200 }
201 
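/* Initialize the MPTCP state of a request socket rebuilt from a TCP
 * syncookie: parse the echoed options and restore either the MP_CAPABLE or
 * the MP_JOIN request state.
 */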
202 int mptcp_subflow_init_cookie_req(struct request_sock *req,
203 				  const struct sock *sk_listener,
204 				  struct sk_buff *skb)
205 {
206 	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
207 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
208 	struct mptcp_options_received mp_opt;
209 	int err;
210 
211 	err = __subflow_init_req(req, sk_listener);
212 	if (err)
213 		return err;
214 
215 	mptcp_get_options(skb, &mp_opt);
216 
217 	if (mp_opt.mp_capable && mp_opt.mp_join)
218 		return -EINVAL;
219 
220 	if (mp_opt.mp_capable && listener->request_mptcp) {
221 		if (mp_opt.sndr_key == 0)
222 			return -EINVAL;
223 
224 		subflow_req->local_key = mp_opt.rcvr_key;
225 		err = mptcp_token_new_request(req);
226 		if (err)
227 			return err;
228 
229 		subflow_req->mp_capable = 1;
230 		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
231 	} else if (mp_opt.mp_join && listener->request_mptcp) {
232 		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
233 			return -EINVAL;
234 
235 		if (mptcp_can_accept_new_subflow(subflow_req->msk))
236 			subflow_req->mp_join = 1;
237 
238 		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
239 	}
240 
241 	return 0;
242 }
243 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
244 
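/* route_req hook for subflows over IPv4: mark the request as MPTCP, perform
 * the regular TCP routing, then set up the MPTCP request state; on failure
 * release the dst and send a reset unless syncookies are in use.
 */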
245 static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
246 					      struct sk_buff *skb,
247 					      struct flowi *fl,
248 					      struct request_sock *req)
249 {
250 	struct dst_entry *dst;
251 	int err;
252 
253 	tcp_rsk(req)->is_mptcp = 1;
254 
255 	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
256 	if (!dst)
257 		return NULL;
258 
259 	err = subflow_init_req(req, sk, skb);
260 	if (err == 0)
261 		return dst;
262 
263 	dst_release(dst);
264 	if (!req->syncookie)
265 		tcp_request_sock_ops.send_reset(sk, skb);
266 	return NULL;
267 }
268 
269 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
270 static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
271 					      struct sk_buff *skb,
272 					      struct flowi *fl,
273 					      struct request_sock *req)
274 {
275 	struct dst_entry *dst;
276 	int err;
277 
278 	tcp_rsk(req)->is_mptcp = 1;
279 
280 	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
281 	if (!dst)
282 		return NULL;
283 
284 	err = subflow_init_req(req, sk, skb);
285 	if (err == 0)
286 		return dst;
287 
288 	dst_release(dst);
289 	if (!req->syncookie)
290 		tcp6_request_sock_ops.send_reset(sk, skb);
291 	return NULL;
292 }
293 #endif
294 
295 /* validate the truncated hmac received in the MP_JOIN SYN-ACK */
296 static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
297 {
298 	u8 hmac[SHA256_DIGEST_SIZE];
299 	u64 thmac;
300 
301 	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
302 			      subflow->remote_nonce, subflow->local_nonce,
303 			      hmac);
304 
305 	thmac = get_unaligned_be64(hmac);
306 	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
307 		 subflow, subflow->token,
308 		 (unsigned long long)thmac,
309 		 (unsigned long long)subflow->thmac);
310 
311 	return thmac == subflow->thmac;
312 }
313 
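/* Abort the given subflow with a TCP reset and let the msk worker complete
 * the cleanup; the parent reference is released either by the worker or
 * directly here if the worker can't be scheduled.
 */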
314 void mptcp_subflow_reset(struct sock *ssk)
315 {
316 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
317 	struct sock *sk = subflow->conn;
318 
319 	/* must hold a reference: tcp_done() could drop the last reference on the parent */
320 	sock_hold(sk);
321 
322 	tcp_set_state(ssk, TCP_CLOSE);
323 	tcp_send_active_reset(ssk, GFP_ATOMIC);
324 	tcp_done(ssk);
325 	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
326 	    schedule_work(&mptcp_sk(sk)->work))
327 		return; /* worker will put sk for us */
328 
329 	sock_put(sk);
330 }
331 
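/* Invoked via the sk_rx_dst_set hook when the SYN-ACK is received on an
 * actively opened subflow: complete the MP_CAPABLE or MP_JOIN handshake, or
 * fall back to plain TCP.
 */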
332 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
333 {
334 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
335 	struct mptcp_options_received mp_opt;
336 	struct sock *parent = subflow->conn;
337 
338 	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
339 
340 	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
341 		inet_sk_state_store(parent, TCP_ESTABLISHED);
342 		parent->sk_state_change(parent);
343 	}
344 
345 	/* be sure we take no special action on any packet other than the syn-ack */
346 	if (subflow->conn_finished)
347 		return;
348 
349 	mptcp_propagate_sndbuf(parent, sk);
350 	subflow->rel_write_seq = 1;
351 	subflow->conn_finished = 1;
352 	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
353 	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
354 
355 	mptcp_get_options(skb, &mp_opt);
356 	if (subflow->request_mptcp) {
357 		if (!mp_opt.mp_capable) {
358 			MPTCP_INC_STATS(sock_net(sk),
359 					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
360 			mptcp_do_fallback(sk);
361 			pr_fallback(mptcp_sk(subflow->conn));
362 			goto fallback;
363 		}
364 
365 		subflow->mp_capable = 1;
366 		subflow->can_ack = 1;
367 		subflow->remote_key = mp_opt.sndr_key;
368 		pr_debug("subflow=%p, remote_key=%llu", subflow,
369 			 subflow->remote_key);
370 		mptcp_finish_connect(sk);
371 	} else if (subflow->request_join) {
372 		u8 hmac[SHA256_DIGEST_SIZE];
373 
374 		if (!mp_opt.mp_join)
375 			goto do_reset;
376 
377 		subflow->thmac = mp_opt.thmac;
378 		subflow->remote_nonce = mp_opt.nonce;
379 		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
380 			 subflow->thmac, subflow->remote_nonce);
381 
382 		if (!subflow_thmac_valid(subflow)) {
383 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
384 			goto do_reset;
385 		}
386 
387 		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
388 				      subflow->local_nonce,
389 				      subflow->remote_nonce,
390 				      hmac);
391 		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
392 
393 		if (!mptcp_finish_join(sk))
394 			goto do_reset;
395 
396 		subflow->mp_join = 1;
397 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
398 	} else if (mptcp_check_fallback(sk)) {
399 fallback:
400 		mptcp_rcv_space_init(mptcp_sk(parent), sk);
401 	}
402 	return;
403 
404 do_reset:
405 	mptcp_subflow_reset(sk);
406 }
407 
408 struct request_sock_ops mptcp_subflow_request_sock_ops;
409 EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
410 static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
411 
412 static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
413 {
414 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
415 
416 	pr_debug("subflow=%p", subflow);
417 
418 	/* Never answer to SYNs sent to broadcast or multicast */
419 	/* Never answer SYNs sent to broadcast or multicast addresses */
420 		goto drop;
421 
422 	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
423 				&subflow_request_sock_ipv4_ops,
424 				sk, skb);
425 drop:
426 	tcp_listendrop(sk);
427 	return 0;
428 }
429 
430 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
431 static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
432 static struct inet_connection_sock_af_ops subflow_v6_specific;
433 static struct inet_connection_sock_af_ops subflow_v6m_specific;
434 static struct proto tcpv6_prot_override;
435 
436 static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
437 {
438 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
439 
440 	pr_debug("subflow=%p", subflow);
441 
442 	if (skb->protocol == htons(ETH_P_IP))
443 		return subflow_v4_conn_request(sk, skb);
444 
445 	if (!ipv6_unicast_destination(skb))
446 		goto drop;
447 
448 	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
449 				&subflow_request_sock_ipv6_ops, sk, skb);
450 
451 drop:
452 	tcp_listendrop(sk);
453 	return 0; /* don't send reset */
454 }
455 #endif
456 
457 /* validate hmac received in third ACK */
458 static bool subflow_hmac_valid(const struct request_sock *req,
459 			       const struct mptcp_options_received *mp_opt)
460 {
461 	const struct mptcp_subflow_request_sock *subflow_req;
462 	u8 hmac[SHA256_DIGEST_SIZE];
463 	struct mptcp_sock *msk;
464 
465 	subflow_req = mptcp_subflow_rsk(req);
466 	msk = subflow_req->msk;
467 	if (!msk)
468 		return false;
469 
470 	subflow_generate_hmac(msk->remote_key, msk->local_key,
471 			      subflow_req->remote_nonce,
472 			      subflow_req->local_nonce, hmac);
473 
474 	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
475 }
476 
477 static void mptcp_sock_destruct(struct sock *sk)
478 {
479 	/* if the new mptcp socket isn't accepted, it is freed
480 	 * from the tcp listener socket's request queue, linked
481 	 * from req->sk.  The tcp socket is released.
482 	 * This calls the ULP release function, which will
483 	 * also remove the mptcp socket via
484 	 * sock_put(ctx->conn).
485 	 *
486 	 * The problem is that the mptcp socket will be in the
487 	 * ESTABLISHED state and will not have the SOCK_DEAD flag set.
488 	 * Both result in warnings from inet_sock_destruct.
489 	 */
490 
491 	if (sk->sk_state == TCP_ESTABLISHED) {
492 		sk->sk_state = TCP_CLOSE;
493 		WARN_ON_ONCE(sk->sk_socket);
494 		sock_orphan(sk);
495 	}
496 
497 	mptcp_destroy_common(mptcp_sk(sk));
498 	inet_sock_destruct(sk);
499 }
500 
501 static void mptcp_force_close(struct sock *sk)
502 {
503 	inet_sk_state_store(sk, TCP_CLOSE);
504 	sk_common_release(sk);
505 }
506 
507 static void subflow_ulp_fallback(struct sock *sk,
508 				 struct mptcp_subflow_context *old_ctx)
509 {
510 	struct inet_connection_sock *icsk = inet_csk(sk);
511 
512 	mptcp_subflow_tcp_fallback(sk, old_ctx);
513 	icsk->icsk_ulp_ops = NULL;
514 	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
515 	tcp_sk(sk)->is_mptcp = 0;
516 
517 	mptcp_subflow_ops_undo_override(sk);
518 }
519 
520 static void subflow_drop_ctx(struct sock *ssk)
521 {
522 	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
523 
524 	if (!ctx)
525 		return;
526 
527 	subflow_ulp_fallback(ssk, ctx);
528 	if (ctx->conn)
529 		sock_put(ctx->conn);
530 
531 	kfree_rcu(ctx, rcu);
532 }
533 
534 void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
535 				     struct mptcp_options_received *mp_opt)
536 {
537 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
538 
539 	subflow->remote_key = mp_opt->sndr_key;
540 	subflow->fully_established = 1;
541 	subflow->can_ack = 1;
542 	WRITE_ONCE(msk->fully_established, true);
543 }
544 
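/* syn_recv_sock hook: create the child subflow socket and, for MP_CAPABLE
 * requests, the msk it will own; for MP_JOIN requests attach the child to the
 * existing msk. On error fall back to plain TCP, or reset the child when
 * fallback is not allowed.
 */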
545 static struct sock *subflow_syn_recv_sock(const struct sock *sk,
546 					  struct sk_buff *skb,
547 					  struct request_sock *req,
548 					  struct dst_entry *dst,
549 					  struct request_sock *req_unhash,
550 					  bool *own_req)
551 {
552 	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
553 	struct mptcp_subflow_request_sock *subflow_req;
554 	struct mptcp_options_received mp_opt;
555 	bool fallback, fallback_is_fatal;
556 	struct sock *new_msk = NULL;
557 	struct sock *child;
558 
559 	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
560 
561 	/* After child creation we must look for 'mp_capable' even when options
562 	 * are not parsed
563 	 */
564 	mp_opt.mp_capable = 0;
565 
566 	/* hopefully temporary handling for MP_JOIN+syncookie */
567 	subflow_req = mptcp_subflow_rsk(req);
568 	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
569 	fallback = !tcp_rsk(req)->is_mptcp;
570 	if (fallback)
571 		goto create_child;
572 
573 	/* if the sk is MP_CAPABLE, we try to fetch the client key */
574 	if (subflow_req->mp_capable) {
575 		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
576 			/* here we can receive and accept an in-window,
577 			 * out-of-order pkt, which will not carry the MP_CAPABLE
578 			 * opt even on mptcp enabled paths
579 			 */
580 			goto create_msk;
581 		}
582 
583 		mptcp_get_options(skb, &mp_opt);
584 		if (!mp_opt.mp_capable) {
585 			fallback = true;
586 			goto create_child;
587 		}
588 
589 create_msk:
590 		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
591 		if (!new_msk)
592 			fallback = true;
593 	} else if (subflow_req->mp_join) {
594 		mptcp_get_options(skb, &mp_opt);
595 		if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
596 		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
597 			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
598 			fallback = true;
599 		}
600 	}
601 
602 create_child:
603 	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
604 						     req_unhash, own_req);
605 
606 	if (child && *own_req) {
607 		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
608 
609 		tcp_rsk(req)->drop_req = false;
610 
611 		/* we need to fall back on ctx allocation failure and on the
612 		 * pre-req checks above. In the latter scenario we additionally
613 		 * need to reset the context to non-MPTCP status.
614 		 */
615 		if (!ctx || fallback) {
616 			if (fallback_is_fatal)
617 				goto dispose_child;
618 
619 			subflow_drop_ctx(child);
620 			goto out;
621 		}
622 
623 		if (ctx->mp_capable) {
624 			/* this can't race with mptcp_close(), as the msk is
625 			 * not yet exposed to user-space
626 			 */
627 			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
628 
629 			/* record the newly created socket as the first msk
630 			 * subflow, but don't link it yet into conn_list
631 			 */
632 			WRITE_ONCE(mptcp_sk(new_msk)->first, child);
633 
634 			/* new mpc subflow takes ownership of the newly
635 			 * created mptcp socket
636 			 */
637 			new_msk->sk_destruct = mptcp_sock_destruct;
638 			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
639 			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
640 			ctx->conn = new_msk;
641 			new_msk = NULL;
642 
643 			/* with OoO packets we can reach here without ingress
644 			 * mpc option
645 			 */
646 			if (mp_opt.mp_capable)
647 				mptcp_subflow_fully_established(ctx, &mp_opt);
648 		} else if (ctx->mp_join) {
649 			struct mptcp_sock *owner;
650 
651 			owner = subflow_req->msk;
652 			if (!owner)
653 				goto dispose_child;
654 
655 			/* move the msk reference ownership to the subflow */
656 			subflow_req->msk = NULL;
657 			ctx->conn = (struct sock *)owner;
658 			if (!mptcp_finish_join(child))
659 				goto dispose_child;
660 
661 			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
662 			tcp_rsk(req)->drop_req = true;
663 		}
664 	}
665 
666 out:
667 	/* dispose of the leftover mptcp master, if any */
668 	if (unlikely(new_msk))
669 		mptcp_force_close(new_msk);
670 
671 	/* check for the expected invariant - should never trigger, just helps
672 	 * catch earlier subtle bugs
673 	 */
674 	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
675 		     (!mptcp_subflow_ctx(child) ||
676 		      !mptcp_subflow_ctx(child)->conn));
677 	return child;
678 
679 dispose_child:
680 	subflow_drop_ctx(child);
681 	tcp_rsk(req)->drop_req = true;
682 	inet_csk_prepare_for_destroy_sock(child);
683 	tcp_done(child);
684 	req->rsk_ops->send_reset(sk, skb);
685 
686 	/* The last child reference will be released by the caller */
687 	return child;
688 }
689 
690 static struct inet_connection_sock_af_ops subflow_specific;
691 static struct proto tcp_prot_override;
692 
693 enum mapping_status {
694 	MAPPING_OK,
695 	MAPPING_INVALID,
696 	MAPPING_EMPTY,
697 	MAPPING_DATA_FIN,
698 	MAPPING_DUMMY
699 };
700 
701 static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
702 {
703 	if ((u32)seq == (u32)old_seq)
704 		return old_seq;
705 
706 	/* Assume map covers data not mapped yet. */
707 	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
708 }
709 
710 static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
711 {
712 	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
713 		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
714 }
715 
716 static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
717 {
718 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
719 	unsigned int skb_consumed;
720 
721 	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
722 	if (WARN_ON_ONCE(skb_consumed >= skb->len))
723 		return true;
724 
725 	return skb->len - skb_consumed <= subflow->map_data_len -
726 					  mptcp_subflow_get_map_offset(subflow);
727 }
728 
729 static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
730 {
731 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
732 	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
733 
734 	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
735 		/* Mapping covers data later in the subflow stream,
736 		 * currently unsupported.
737 		 */
738 		warn_bad_map(subflow, ssn);
739 		return false;
740 	}
741 	if (unlikely(!before(ssn, subflow->map_subflow_seq +
742 				  subflow->map_data_len))) {
743 			/* Mapping covers past subflow data, invalid */
744 		warn_bad_map(subflow, ssn + skb->len);
745 		return false;
746 	}
747 	return true;
748 }
749 
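/* Parse the DSS mapping (if any) carried by the skb at the head of the
 * subflow receive queue, validate it against the current mapping and update
 * the subflow state accordingly.
 */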
750 static enum mapping_status get_mapping_status(struct sock *ssk,
751 					      struct mptcp_sock *msk)
752 {
753 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
754 	struct mptcp_ext *mpext;
755 	struct sk_buff *skb;
756 	u16 data_len;
757 	u64 map_seq;
758 
759 	skb = skb_peek(&ssk->sk_receive_queue);
760 	if (!skb)
761 		return MAPPING_EMPTY;
762 
763 	if (mptcp_check_fallback(ssk))
764 		return MAPPING_DUMMY;
765 
766 	mpext = mptcp_get_ext(skb);
767 	if (!mpext || !mpext->use_map) {
768 		if (!subflow->map_valid && !skb->len) {
769 			/* the TCP stack delivers 0-len FIN pkts to the receive
770 			 * queue; those are the only 0-len pkts ever expected
771 			 * here, and a missing mapping is admissible only for them
772 			 */
773 			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
774 				WARN_ONCE(1, "0len seq %d:%d flags %x",
775 					  TCP_SKB_CB(skb)->seq,
776 					  TCP_SKB_CB(skb)->end_seq,
777 					  TCP_SKB_CB(skb)->tcp_flags);
778 			sk_eat_skb(ssk, skb);
779 			return MAPPING_EMPTY;
780 		}
781 
782 		if (!subflow->map_valid)
783 			return MAPPING_INVALID;
784 
785 		goto validate_seq;
786 	}
787 
788 	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
789 		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
790 		 mpext->data_len, mpext->data_fin);
791 
792 	data_len = mpext->data_len;
793 	if (data_len == 0) {
794 		pr_err("Infinite mapping not handled");
795 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
796 		return MAPPING_INVALID;
797 	}
798 
799 	if (mpext->data_fin == 1) {
800 		if (data_len == 1) {
801 			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
802 								 mpext->dsn64);
803 			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
804 			if (subflow->map_valid) {
805 				/* A DATA_FIN might arrive in a DSS
806 				 * option before the previous mapping
807 				 * has been fully consumed. Continue
808 				 * handling the existing mapping.
809 				 */
810 				skb_ext_del(skb, SKB_EXT_MPTCP);
811 				return MAPPING_OK;
812 			} else {
813 				if (updated && schedule_work(&msk->work))
814 					sock_hold((struct sock *)msk);
815 
816 				return MAPPING_DATA_FIN;
817 			}
818 		} else {
819 			u64 data_fin_seq = mpext->data_seq + data_len - 1;
820 
821 			/* If mpext->data_seq is a 32-bit value, data_fin_seq
822 			 * must also be limited to 32 bits.
823 			 */
824 			if (!mpext->dsn64)
825 				data_fin_seq &= GENMASK_ULL(31, 0);
826 
827 			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
828 			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
829 				 data_fin_seq, mpext->dsn64);
830 		}
831 
832 		/* Adjust for DATA_FIN using 1 byte of sequence space */
833 		data_len--;
834 	}
835 
836 	if (!mpext->dsn64) {
837 		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
838 				     mpext->data_seq);
839 		pr_debug("expanded seq=%llu", subflow->map_seq);
840 	} else {
841 		map_seq = mpext->data_seq;
842 	}
843 	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
844 
845 	if (subflow->map_valid) {
846 		/* Allow replacing only with an identical map */
847 		if (subflow->map_seq == map_seq &&
848 		    subflow->map_subflow_seq == mpext->subflow_seq &&
849 		    subflow->map_data_len == data_len) {
850 			skb_ext_del(skb, SKB_EXT_MPTCP);
851 			return MAPPING_OK;
852 		}
853 
854 		/* If this skb's data is fully covered by the current mapping,
855 		 * the new map would need caching, which is not supported
856 		 */
857 		if (skb_is_fully_mapped(ssk, skb)) {
858 			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
859 			return MAPPING_INVALID;
860 		}
861 
862 		/* will validate the next map after consuming the current one */
863 		return MAPPING_OK;
864 	}
865 
866 	subflow->map_seq = map_seq;
867 	subflow->map_subflow_seq = mpext->subflow_seq;
868 	subflow->map_data_len = data_len;
869 	subflow->map_valid = 1;
870 	subflow->mpc_map = mpext->mpc_map;
871 	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
872 		 subflow->map_seq, subflow->map_subflow_seq,
873 		 subflow->map_data_len);
874 
875 validate_seq:
876 	/* we revalidate the existing mapping on each new skb, because we must
877 	 * ensure the current skb is completely covered by the available mapping
878 	 */
879 	if (!validate_mapping(ssk, skb))
880 		return MAPPING_INVALID;
881 
882 	skb_ext_del(skb, SKB_EXT_MPTCP);
883 	return MAPPING_OK;
884 }
885 
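/* Discard up to @limit bytes of MPTCP-level duplicate data from the skb at
 * the head of the receive queue: advance copied_seq, eat the skb once fully
 * consumed and invalidate the mapping when it has been fully covered.
 */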
886 static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
887 				       u64 limit)
888 {
889 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
890 	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
891 	u32 incr;
892 
893 	incr = limit >= skb->len ? skb->len + fin : limit;
894 
895 	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
896 		 subflow->map_subflow_seq);
897 	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
898 	tcp_sk(ssk)->copied_seq += incr;
899 	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
900 		sk_eat_skb(ssk, skb);
901 	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
902 		subflow->map_valid = 0;
903 }
904 
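/* Check whether the subflow receive queue holds data the MPTCP layer can
 * consume, updating subflow->data_avail; unrecoverable mapping errors cause
 * the subflow to be reset.
 */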
905 static bool subflow_check_data_avail(struct sock *ssk)
906 {
907 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
908 	enum mapping_status status;
909 	struct mptcp_sock *msk;
910 	struct sk_buff *skb;
911 
912 	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
913 		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
914 	if (!skb_peek(&ssk->sk_receive_queue))
915 		subflow->data_avail = 0;
916 	if (subflow->data_avail)
917 		return true;
918 
919 	msk = mptcp_sk(subflow->conn);
920 	for (;;) {
921 		u64 ack_seq;
922 		u64 old_ack;
923 
924 		status = get_mapping_status(ssk, msk);
925 		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
926 		if (status == MAPPING_INVALID) {
927 			ssk->sk_err = EBADMSG;
928 			goto fatal;
929 		}
930 		if (status == MAPPING_DUMMY) {
931 			__mptcp_do_fallback(msk);
932 			skb = skb_peek(&ssk->sk_receive_queue);
933 			subflow->map_valid = 1;
934 			subflow->map_seq = READ_ONCE(msk->ack_seq);
935 			subflow->map_data_len = skb->len;
936 			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
937 						   subflow->ssn_offset;
938 			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
939 			return true;
940 		}
941 
942 		if (status != MAPPING_OK)
943 			return false;
944 
945 		skb = skb_peek(&ssk->sk_receive_queue);
946 		if (WARN_ON_ONCE(!skb))
947 			return false;
948 
949 		/* if msk lacks the remote key, this subflow must provide an
950 		 * MP_CAPABLE-based mapping
951 		 */
952 		if (unlikely(!READ_ONCE(msk->can_ack))) {
953 			if (!subflow->mpc_map) {
954 				ssk->sk_err = EBADMSG;
955 				goto fatal;
956 			}
957 			WRITE_ONCE(msk->remote_key, subflow->remote_key);
958 			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
959 			WRITE_ONCE(msk->can_ack, true);
960 		}
961 
962 		old_ack = READ_ONCE(msk->ack_seq);
963 		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
964 		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
965 			 ack_seq);
966 		if (ack_seq == old_ack) {
967 			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
968 			break;
969 		} else if (after64(ack_seq, old_ack)) {
970 			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
971 			break;
972 		}
973 
974 		/* only accept in-sequence mappings. Old values are spurious
975 		 * retransmissions
976 		 */
977 		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
978 	}
979 	return true;
980 
981 fatal:
982 	/* fatal protocol error, close the socket */
983 	/* This barrier is coupled with smp_rmb() in tcp_poll() */
984 	smp_wmb();
985 	ssk->sk_error_report(ssk);
986 	tcp_set_state(ssk, TCP_CLOSE);
987 	tcp_send_active_reset(ssk, GFP_ATOMIC);
988 	subflow->data_avail = 0;
989 	return false;
990 }
991 
992 bool mptcp_subflow_data_available(struct sock *sk)
993 {
994 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
995 
996 	/* check if current mapping is still valid */
997 	if (subflow->map_valid &&
998 	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
999 		subflow->map_valid = 0;
1000 		subflow->data_avail = 0;
1001 
1002 		pr_debug("Done with mapping: seq=%u data_len=%u",
1003 			 subflow->map_subflow_seq,
1004 			 subflow->map_data_len);
1005 	}
1006 
1007 	return subflow_check_data_avail(sk);
1008 }
1009 
1010 /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
1011  * not the ssk one.
1012  *
1013  * In mptcp, rwin is about the mptcp-level connection data.
1014  *
1015  * Data that is still on the ssk rx queue can thus be ignored,
1016  * as far as the mptcp peer is concerned, that data is still in flight.
1017  * The DSS ACK is updated when the skb is moved to the mptcp rx queue.
1018  */
1019 void mptcp_space(const struct sock *ssk, int *space, int *full_space)
1020 {
1021 	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1022 	const struct sock *sk = subflow->conn;
1023 
1024 	*space = __mptcp_space(sk);
1025 	*full_space = tcp_full_space(sk);
1026 }
1027 
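/* sk_data_ready hook for subflows: on listener sockets wake the msk so the
 * new connection can be accepted, otherwise propagate the data-available
 * event to the MPTCP parent.
 */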
1028 static void subflow_data_ready(struct sock *sk)
1029 {
1030 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1031 	u16 state = 1 << inet_sk_state_load(sk);
1032 	struct sock *parent = subflow->conn;
1033 	struct mptcp_sock *msk;
1034 
1035 	msk = mptcp_sk(parent);
1036 	if (state & TCPF_LISTEN) {
1037 		set_bit(MPTCP_DATA_READY, &msk->flags);
1038 		parent->sk_data_ready(parent);
1039 		return;
1040 	}
1041 
1042 	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
1043 		     !subflow->mp_join && !(state & TCPF_CLOSE));
1044 
1045 	if (mptcp_subflow_data_available(sk))
1046 		mptcp_data_ready(parent, sk);
1047 }
1048 
1049 static void subflow_write_space(struct sock *ssk)
1050 {
1051 	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
1052 
1053 	mptcp_propagate_sndbuf(sk, ssk);
1054 	mptcp_write_space(sk);
1055 }
1056 
1057 static struct inet_connection_sock_af_ops *
1058 subflow_default_af_ops(struct sock *sk)
1059 {
1060 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1061 	if (sk->sk_family == AF_INET6)
1062 		return &subflow_v6_specific;
1063 #endif
1064 	return &subflow_specific;
1065 }
1066 
1067 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
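/* Switch the subflow between the native IPv6 af_ops and the mapped-v4
 * variant (subflow_v6m_specific), saving the previous ops in the subflow
 * context.
 */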
1068 void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
1069 {
1070 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1071 	struct inet_connection_sock *icsk = inet_csk(sk);
1072 	struct inet_connection_sock_af_ops *target;
1073 
1074 	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
1075 
1076 	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
1077 		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
1078 
1079 	if (likely(icsk->icsk_af_ops == target))
1080 		return;
1081 
1082 	subflow->icsk_af_ops = icsk->icsk_af_ops;
1083 	icsk->icsk_af_ops = target;
1084 }
1085 #endif
1086 
1087 static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
1088 				struct sockaddr_storage *addr,
1089 				unsigned short family)
1090 {
1091 	memset(addr, 0, sizeof(*addr));
1092 	addr->ss_family = family;
1093 	if (addr->ss_family == AF_INET) {
1094 		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
1095 
1096 		if (info->family == AF_INET)
1097 			in_addr->sin_addr = info->addr;
1098 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1099 		else if (ipv6_addr_v4mapped(&info->addr6))
1100 			in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
1101 #endif
1102 		in_addr->sin_port = info->port;
1103 	}
1104 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1105 	else if (addr->ss_family == AF_INET6) {
1106 		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
1107 
1108 		if (info->family == AF_INET)
1109 			ipv6_addr_set_v4mapped(info->addr.s_addr,
1110 					       &in6_addr->sin6_addr);
1111 		else
1112 			in6_addr->sin6_addr = info->addr6;
1113 		in6_addr->sin6_port = info->port;
1114 	}
1115 #endif
1116 }
1117 
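/* Create an additional subflow for the given msk: bind the new socket to the
 * local address @loc and start a non-blocking MP_JOIN connect() towards
 * @remote.
 */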
1118 int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
1119 			    const struct mptcp_addr_info *remote)
1120 {
1121 	struct mptcp_sock *msk = mptcp_sk(sk);
1122 	struct mptcp_subflow_context *subflow;
1123 	struct sockaddr_storage addr;
1124 	int remote_id = remote->id;
1125 	int local_id = loc->id;
1126 	struct socket *sf;
1127 	struct sock *ssk;
1128 	u32 remote_token;
1129 	int addrlen;
1130 	int err;
1131 
1132 	if (!mptcp_is_fully_established(sk))
1133 		return -ENOTCONN;
1134 
1135 	err = mptcp_subflow_create_socket(sk, &sf);
1136 	if (err)
1137 		return err;
1138 
1139 	ssk = sf->sk;
1140 	subflow = mptcp_subflow_ctx(ssk);
1141 	do {
1142 		get_random_bytes(&subflow->local_nonce, sizeof(u32));
1143 	} while (!subflow->local_nonce);
1144 
1145 	if (!local_id) {
1146 		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
1147 		if (err < 0)
1148 			goto failed;
1149 
1150 		local_id = err;
1151 	}
1152 
1153 	subflow->remote_key = msk->remote_key;
1154 	subflow->local_key = msk->local_key;
1155 	subflow->token = msk->token;
1156 	mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
1157 
1158 	addrlen = sizeof(struct sockaddr_in);
1159 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1160 	if (addr.ss_family == AF_INET6)
1161 		addrlen = sizeof(struct sockaddr_in6);
1162 #endif
1163 	ssk->sk_bound_dev_if = loc->ifindex;
1164 	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
1165 	if (err)
1166 		goto failed;
1167 
1168 	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
1169 	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
1170 		 remote_token, local_id, remote_id);
1171 	subflow->remote_token = remote_token;
1172 	subflow->local_id = local_id;
1173 	subflow->remote_id = remote_id;
1174 	subflow->request_join = 1;
1175 	subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
1176 	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
1177 
1178 	mptcp_add_pending_subflow(msk, subflow);
1179 	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
1180 	if (err && err != -EINPROGRESS)
1181 		goto failed_unlink;
1182 
1183 	/* discard the subflow socket */
1184 	mptcp_sock_graft(ssk, sk->sk_socket);
1185 	iput(SOCK_INODE(sf));
1186 	return err;
1187 
1188 failed_unlink:
1189 	spin_lock_bh(&msk->join_list_lock);
1190 	list_del(&subflow->node);
1191 	spin_unlock_bh(&msk->join_list_lock);
1192 
1193 failed:
1194 	subflow->disposable = 1;
1195 	sock_release(sf);
1196 	return err;
1197 }
1198 
1199 static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
1200 {
1201 #ifdef CONFIG_SOCK_CGROUP_DATA
1202 	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
1203 				*child_skcd = &child->sk_cgrp_data;
1204 
1205 	/* only the additional subflows created by kworkers have to be modified */
1206 	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
1207 	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
1208 #ifdef CONFIG_MEMCG
1209 		struct mem_cgroup *memcg = parent->sk_memcg;
1210 
1211 		mem_cgroup_sk_free(child);
1212 		if (memcg && css_tryget(&memcg->css))
1213 			child->sk_memcg = memcg;
1214 #endif /* CONFIG_MEMCG */
1215 
1216 		cgroup_sk_free(child_skcd);
1217 		*child_skcd = *parent_skcd;
1218 		cgroup_sk_clone(child_skcd);
1219 	}
1220 #endif /* CONFIG_SOCK_CGROUP_DATA */
1221 }
1222 
1223 static void mptcp_subflow_ops_override(struct sock *ssk)
1224 {
1225 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1226 	if (ssk->sk_prot == &tcpv6_prot)
1227 		ssk->sk_prot = &tcpv6_prot_override;
1228 	else
1229 #endif
1230 		ssk->sk_prot = &tcp_prot_override;
1231 }
1232 
1233 static void mptcp_subflow_ops_undo_override(struct sock *ssk)
1234 {
1235 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1236 	if (ssk->sk_prot == &tcpv6_prot_override)
1237 		ssk->sk_prot = &tcpv6_prot;
1238 	else
1239 #endif
1240 		ssk->sk_prot = &tcp_prot;
1241 }
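
/* Create the in-kernel TCP socket backing a new subflow of @sk, attach the
 * "mptcp" ULP to it and adjust cgroup, net ref and inode ownership so that it
 * shows up as belonging to the msk owner.
 */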
1242 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
1243 {
1244 	struct mptcp_subflow_context *subflow;
1245 	struct net *net = sock_net(sk);
1246 	struct socket *sf;
1247 	int err;
1248 
1249 	/* un-accepted server sockets can reach here - on bad configuration
1250 	 * bail early to avoid greater trouble later
1251 	 */
1252 	if (unlikely(!sk->sk_socket))
1253 		return -EINVAL;
1254 
1255 	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
1256 			       &sf);
1257 	if (err)
1258 		return err;
1259 
1260 	lock_sock(sf->sk);
1261 
1262 	/* the newly created socket has to be in the same cgroup as its parent */
1263 	mptcp_attach_cgroup(sk, sf->sk);
1264 
1265 	/* kernel sockets do not acquire a net ref by default, but the TCP
1266 	 * timer needs it.
1267 	 */
1268 	sf->sk->sk_net_refcnt = 1;
1269 	get_net(net);
1270 #ifdef CONFIG_PROC_FS
1271 	this_cpu_add(*net->core.sock_inuse, 1);
1272 #endif
1273 	err = tcp_set_ulp(sf->sk, "mptcp");
1274 	release_sock(sf->sk);
1275 
1276 	if (err) {
1277 		sock_release(sf);
1278 		return err;
1279 	}
1280 
1281 	/* the newly created socket really belongs to the owning MPTCP master
1282 	 * socket, even if for additional subflows the allocation is performed
1283 	 * by a kernel workqueue. Adjust inode references, so that the
1284 	 * procfs/diag interfaces really show this one belonging to the correct
1285 	 * user.
1286 	 */
1287 	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
1288 	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
1289 	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
1290 
1291 	subflow = mptcp_subflow_ctx(sf->sk);
1292 	pr_debug("subflow=%p", subflow);
1293 
1294 	*new_sock = sf;
1295 	sock_hold(sk);
1296 	subflow->conn = sk;
1297 	mptcp_subflow_ops_override(sf->sk);
1298 
1299 	return 0;
1300 }
1301 
1302 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
1303 							gfp_t priority)
1304 {
1305 	struct inet_connection_sock *icsk = inet_csk(sk);
1306 	struct mptcp_subflow_context *ctx;
1307 
1308 	ctx = kzalloc(sizeof(*ctx), priority);
1309 	if (!ctx)
1310 		return NULL;
1311 
1312 	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
1313 	INIT_LIST_HEAD(&ctx->node);
1314 	INIT_LIST_HEAD(&ctx->delegated_node);
1315 
1316 	pr_debug("subflow=%p", ctx);
1317 
1318 	ctx->tcp_sock = sk;
1319 
1320 	return ctx;
1321 }
1322 
1323 static void __subflow_state_change(struct sock *sk)
1324 {
1325 	struct socket_wq *wq;
1326 
1327 	rcu_read_lock();
1328 	wq = rcu_dereference(sk->sk_wq);
1329 	if (skwq_has_sleeper(wq))
1330 		wake_up_interruptible_all(&wq->wait);
1331 	rcu_read_unlock();
1332 }
1333 
1334 static bool subflow_is_done(const struct sock *sk)
1335 {
1336 	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
1337 }
1338 
1339 static void subflow_state_change(struct sock *sk)
1340 {
1341 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1342 	struct sock *parent = subflow->conn;
1343 
1344 	__subflow_state_change(sk);
1345 
1346 	if (subflow_simultaneous_connect(sk)) {
1347 		mptcp_propagate_sndbuf(parent, sk);
1348 		mptcp_do_fallback(sk);
1349 		mptcp_rcv_space_init(mptcp_sk(parent), sk);
1350 		pr_fallback(mptcp_sk(parent));
1351 		subflow->conn_finished = 1;
1352 		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
1353 			inet_sk_state_store(parent, TCP_ESTABLISHED);
1354 			parent->sk_state_change(parent);
1355 		}
1356 	}
1357 
1358 	/* as recvmsg() does not acquire the subflow socket for ssk selection,
1359 	 * a fin packet carrying a DSS can go unnoticed if we don't trigger
1360 	 * the data-available machinery here.
1361 	 */
1362 	if (mptcp_subflow_data_available(sk))
1363 		mptcp_data_ready(parent, sk);
1364 
1365 	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
1366 	    !subflow->rx_eof && subflow_is_done(sk)) {
1367 		subflow->rx_eof = 1;
1368 		mptcp_subflow_eof(parent);
1369 	}
1370 }
1371 
1372 static int subflow_ulp_init(struct sock *sk)
1373 {
1374 	struct inet_connection_sock *icsk = inet_csk(sk);
1375 	struct mptcp_subflow_context *ctx;
1376 	struct tcp_sock *tp = tcp_sk(sk);
1377 	int err = 0;
1378 
1379 	/* disallow attaching ULP to a socket unless it has been
1380 	 * created with sock_create_kern()
1381 	 */
1382 	if (!sk->sk_kern_sock) {
1383 		err = -EOPNOTSUPP;
1384 		goto out;
1385 	}
1386 
1387 	ctx = subflow_create_ctx(sk, GFP_KERNEL);
1388 	if (!ctx) {
1389 		err = -ENOMEM;
1390 		goto out;
1391 	}
1392 
1393 	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
1394 
1395 	tp->is_mptcp = 1;
1396 	ctx->icsk_af_ops = icsk->icsk_af_ops;
1397 	icsk->icsk_af_ops = subflow_default_af_ops(sk);
1398 	ctx->tcp_data_ready = sk->sk_data_ready;
1399 	ctx->tcp_state_change = sk->sk_state_change;
1400 	ctx->tcp_write_space = sk->sk_write_space;
1401 	sk->sk_data_ready = subflow_data_ready;
1402 	sk->sk_write_space = subflow_write_space;
1403 	sk->sk_state_change = subflow_state_change;
1404 out:
1405 	return err;
1406 }
1407 
1408 static void subflow_ulp_release(struct sock *ssk)
1409 {
1410 	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
1411 	bool release = true;
1412 	struct sock *sk;
1413 
1414 	if (!ctx)
1415 		return;
1416 
1417 	sk = ctx->conn;
1418 	if (sk) {
1419 		/* if the msk has been orphaned while the subflow is still
1420 		 * unaccepted, keep the ctx alive; it will be freed by
1421 		 * __mptcp_close_ssk()
1422 		 */
1423 		release = ctx->disposable || list_empty(&ctx->node);
1424 		sock_put(sk);
1425 	}
1426 
1427 	mptcp_subflow_ops_undo_override(ssk);
1428 	if (release)
1429 		kfree_rcu(ctx, rcu);
1430 }
1431 
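/* ULP clone hook, invoked when a child socket is created from a request:
 * allocate a fresh subflow context for @newsk and seed it from the request,
 * or fall back to plain TCP if the request did not negotiate MPTCP or the
 * allocation fails.
 */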
1432 static void subflow_ulp_clone(const struct request_sock *req,
1433 			      struct sock *newsk,
1434 			      const gfp_t priority)
1435 {
1436 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
1437 	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
1438 	struct mptcp_subflow_context *new_ctx;
1439 
1440 	if (!tcp_rsk(req)->is_mptcp ||
1441 	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
1442 		subflow_ulp_fallback(newsk, old_ctx);
1443 		return;
1444 	}
1445 
1446 	new_ctx = subflow_create_ctx(newsk, priority);
1447 	if (!new_ctx) {
1448 		subflow_ulp_fallback(newsk, old_ctx);
1449 		return;
1450 	}
1451 
1452 	new_ctx->conn_finished = 1;
1453 	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
1454 	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
1455 	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
1456 	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
1457 	new_ctx->rel_write_seq = 1;
1458 	new_ctx->tcp_sock = newsk;
1459 
1460 	if (subflow_req->mp_capable) {
1461 		/* see comments in subflow_syn_recv_sock(); the MPTCP connection
1462 		 * is fully established only after we receive the remote key
1463 		 */
1464 		new_ctx->mp_capable = 1;
1465 		new_ctx->local_key = subflow_req->local_key;
1466 		new_ctx->token = subflow_req->token;
1467 		new_ctx->ssn_offset = subflow_req->ssn_offset;
1468 		new_ctx->idsn = subflow_req->idsn;
1469 	} else if (subflow_req->mp_join) {
1470 		new_ctx->ssn_offset = subflow_req->ssn_offset;
1471 		new_ctx->mp_join = 1;
1472 		new_ctx->fully_established = 1;
1473 		new_ctx->backup = subflow_req->backup;
1474 		new_ctx->local_id = subflow_req->local_id;
1475 		new_ctx->remote_id = subflow_req->remote_id;
1476 		new_ctx->token = subflow_req->token;
1477 		new_ctx->thmac = subflow_req->thmac;
1478 	}
1479 }
1480 
1481 static void tcp_release_cb_override(struct sock *ssk)
1482 {
1483 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1484 
1485 	if (mptcp_subflow_has_delegated_action(subflow))
1486 		mptcp_subflow_process_delegated(ssk);
1487 
1488 	tcp_release_cb(ssk);
1489 }
1490 
1491 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
1492 	.name		= "mptcp",
1493 	.owner		= THIS_MODULE,
1494 	.init		= subflow_ulp_init,
1495 	.release	= subflow_ulp_release,
1496 	.clone		= subflow_ulp_clone,
1497 };
1498 
1499 static int subflow_ops_init(struct request_sock_ops *subflow_ops)
1500 {
1501 	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
1502 	subflow_ops->slab_name = "request_sock_subflow";
1503 
1504 	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
1505 					      subflow_ops->obj_size, 0,
1506 					      SLAB_ACCOUNT |
1507 					      SLAB_TYPESAFE_BY_RCU,
1508 					      NULL);
1509 	if (!subflow_ops->slab)
1510 		return -ENOMEM;
1511 
1512 	subflow_ops->destructor = subflow_req_destructor;
1513 
1514 	return 0;
1515 }
1516 
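/* Set up the subflow request_sock ops, the MPTCP-specific af_ops and proto
 * overrides, and register the "mptcp" ULP; called once at boot.
 */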
1517 void __init mptcp_subflow_init(void)
1518 {
1519 	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
1520 	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
1521 		panic("MPTCP: failed to init subflow request sock ops\n");
1522 
1523 	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
1524 	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
1525 
1526 	subflow_specific = ipv4_specific;
1527 	subflow_specific.conn_request = subflow_v4_conn_request;
1528 	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
1529 	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
1530 
1531 	tcp_prot_override = tcp_prot;
1532 	tcp_prot_override.release_cb = tcp_release_cb_override;
1533 
1534 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1535 	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
1536 	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
1537 
1538 	subflow_v6_specific = ipv6_specific;
1539 	subflow_v6_specific.conn_request = subflow_v6_conn_request;
1540 	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
1541 	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
1542 
1543 	subflow_v6m_specific = subflow_v6_specific;
1544 	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
1545 	subflow_v6m_specific.send_check = ipv4_specific.send_check;
1546 	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
1547 	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
1548 	subflow_v6m_specific.net_frag_header_len = 0;
1549 
1550 	tcpv6_prot_override = tcpv6_prot;
1551 	tcpv6_prot_override.release_cb = tcp_release_cb_override;
1552 #endif
1553 
1554 	mptcp_diag_subflow_init(&subflow_ulp_ops);
1555 
1556 	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
1557 		panic("MPTCP: failed to register subflows to ULP\n");
1558 }
1559