xref: /openbmc/linux/net/mptcp/options.c (revision 6a143a7c)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6 
7 #define pr_fmt(fmt) "MPTCP: " fmt
8 
9 #include <linux/kernel.h>
10 #include <crypto/sha2.h>
11 #include <net/tcp.h>
12 #include <net/mptcp.h>
13 #include "protocol.h"
14 #include "mib.h"
15 
16 static bool mptcp_cap_flag_sha256(u8 flags)
17 {
18 	return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
19 }
20 
21 static void mptcp_parse_option(const struct sk_buff *skb,
22 			       const unsigned char *ptr, int opsize,
23 			       struct mptcp_options_received *mp_opt)
24 {
25 	u8 subtype = *ptr >> 4;
26 	int expected_opsize;
27 	u8 version;
28 	u8 flags;
29 	u8 i;
30 
31 	switch (subtype) {
32 	case MPTCPOPT_MP_CAPABLE:
33 		/* strict size checking */
34 		if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
35 			if (skb->len > tcp_hdr(skb)->doff << 2)
36 				expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
37 			else
38 				expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
39 		} else {
40 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
41 				expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
42 			else
43 				expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
44 		}
45 		if (opsize != expected_opsize)
46 			break;
47 
48 		/* try to be gentle vs future versions on the initial syn */
49 		version = *ptr++ & MPTCP_VERSION_MASK;
50 		if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
51 			if (version != MPTCP_SUPPORTED_VERSION)
52 				break;
53 		} else if (version < MPTCP_SUPPORTED_VERSION) {
54 			break;
55 		}
56 
57 		flags = *ptr++;
58 		if (!mptcp_cap_flag_sha256(flags) ||
59 		    (flags & MPTCP_CAP_EXTENSIBILITY))
60 			break;
61 
62 		/* RFC 6824, Section 3.1:
63 		 * "For the Checksum Required bit (labeled "A"), if either
64 		 * host requires the use of checksums, checksums MUST be used.
65 		 * In other words, the only way for checksums not to be used
66 		 * is if both hosts in their SYNs set A=0."
67 		 *
68 		 * Section 3.3.0:
69 		 * "If a checksum is not present when its use has been
70 		 * negotiated, the receiver MUST close the subflow with a RST as
71 		 * it is considered broken."
72 		 *
73 		 * We don't implement DSS checksum - fall back to TCP.
74 		 */
75 		if (flags & MPTCP_CAP_CHECKSUM_REQD)
76 			break;
77 
78 		mp_opt->mp_capable = 1;
79 		if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
80 			mp_opt->sndr_key = get_unaligned_be64(ptr);
81 			ptr += 8;
82 		}
83 		if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
84 			mp_opt->rcvr_key = get_unaligned_be64(ptr);
85 			ptr += 8;
86 		}
87 		if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
88 			/* Section 3.1.:
89 			 * "the data parameters in a MP_CAPABLE are semantically
90 			 * equivalent to those in a DSS option and can be used
91 			 * interchangeably."
92 			 */
93 			mp_opt->dss = 1;
94 			mp_opt->use_map = 1;
95 			mp_opt->mpc_map = 1;
96 			mp_opt->data_len = get_unaligned_be16(ptr);
97 			ptr += 2;
98 		}
99 		pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
100 			 version, flags, opsize, mp_opt->sndr_key,
101 			 mp_opt->rcvr_key, mp_opt->data_len);
102 		break;
103 
104 	case MPTCPOPT_MP_JOIN:
105 		mp_opt->mp_join = 1;
106 		if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
107 			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
108 			mp_opt->join_id = *ptr++;
109 			mp_opt->token = get_unaligned_be32(ptr);
110 			ptr += 4;
111 			mp_opt->nonce = get_unaligned_be32(ptr);
112 			ptr += 4;
113 			pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
114 				 mp_opt->backup, mp_opt->join_id,
115 				 mp_opt->token, mp_opt->nonce);
116 		} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
117 			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
118 			mp_opt->join_id = *ptr++;
119 			mp_opt->thmac = get_unaligned_be64(ptr);
120 			ptr += 8;
121 			mp_opt->nonce = get_unaligned_be32(ptr);
122 			ptr += 4;
123 			pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
124 				 mp_opt->backup, mp_opt->join_id,
125 				 mp_opt->thmac, mp_opt->nonce);
126 		} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
127 			ptr += 2;
128 			memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
129 			pr_debug("MP_JOIN hmac");
130 		} else {
131 			pr_warn("MP_JOIN bad option size");
132 			mp_opt->mp_join = 0;
133 		}
134 		break;
135 
136 	case MPTCPOPT_DSS:
137 		pr_debug("DSS");
138 		ptr++;
139 
140 		/* we must clear 'mpc_map' be able to detect MP_CAPABLE
141 		 * map vs DSS map in mptcp_incoming_options(), and reconstruct
142 		 * map info accordingly
143 		 */
144 		mp_opt->mpc_map = 0;
145 		flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
146 		mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
147 		mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
148 		mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
149 		mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
150 		mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
151 
152 		pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
153 			 mp_opt->data_fin, mp_opt->dsn64,
154 			 mp_opt->use_map, mp_opt->ack64,
155 			 mp_opt->use_ack);
156 
157 		expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
158 
159 		if (mp_opt->use_ack) {
160 			if (mp_opt->ack64)
161 				expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
162 			else
163 				expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
164 		}
165 
166 		if (mp_opt->use_map) {
167 			if (mp_opt->dsn64)
168 				expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
169 			else
170 				expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
171 		}
172 
173 		/* RFC 6824, Section 3.3:
174 		 * If a checksum is present, but its use had
175 		 * not been negotiated in the MP_CAPABLE handshake,
176 		 * the checksum field MUST be ignored.
177 		 */
178 		if (opsize != expected_opsize &&
179 		    opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
180 			break;
181 
182 		mp_opt->dss = 1;
183 
184 		if (mp_opt->use_ack) {
185 			if (mp_opt->ack64) {
186 				mp_opt->data_ack = get_unaligned_be64(ptr);
187 				ptr += 8;
188 			} else {
189 				mp_opt->data_ack = get_unaligned_be32(ptr);
190 				ptr += 4;
191 			}
192 
193 			pr_debug("data_ack=%llu", mp_opt->data_ack);
194 		}
195 
196 		if (mp_opt->use_map) {
197 			if (mp_opt->dsn64) {
198 				mp_opt->data_seq = get_unaligned_be64(ptr);
199 				ptr += 8;
200 			} else {
201 				mp_opt->data_seq = get_unaligned_be32(ptr);
202 				ptr += 4;
203 			}
204 
205 			mp_opt->subflow_seq = get_unaligned_be32(ptr);
206 			ptr += 4;
207 
208 			mp_opt->data_len = get_unaligned_be16(ptr);
209 			ptr += 2;
210 
211 			pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
212 				 mp_opt->data_seq, mp_opt->subflow_seq,
213 				 mp_opt->data_len);
214 		}
215 
216 		break;
217 
218 	case MPTCPOPT_ADD_ADDR:
219 		mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
220 		if (!mp_opt->echo) {
221 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
222 			    opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
223 				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
224 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
225 			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
226 				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
227 				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
228 #endif
229 			else
230 				break;
231 		} else {
232 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
233 			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
234 				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
235 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
236 			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
237 				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
238 				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
239 #endif
240 			else
241 				break;
242 		}
243 
244 		mp_opt->add_addr = 1;
245 		mp_opt->addr_id = *ptr++;
246 		if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
247 			memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
248 			ptr += 4;
249 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
250 			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
251 				mp_opt->port = get_unaligned_be16(ptr);
252 				ptr += 2;
253 			}
254 		}
255 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
256 		else {
257 			memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
258 			ptr += 16;
259 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
260 			    opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
261 				mp_opt->port = get_unaligned_be16(ptr);
262 				ptr += 2;
263 			}
264 		}
265 #endif
266 		if (!mp_opt->echo) {
267 			mp_opt->ahmac = get_unaligned_be64(ptr);
268 			ptr += 8;
269 		}
270 		pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
271 			 (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "",
272 			 mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port);
273 		break;
274 
275 	case MPTCPOPT_RM_ADDR:
276 		if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
277 		    opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
278 			break;
279 
280 		ptr++;
281 
282 		mp_opt->rm_addr = 1;
283 		mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
284 		for (i = 0; i < mp_opt->rm_list.nr; i++)
285 			mp_opt->rm_list.ids[i] = *ptr++;
286 		pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
287 		break;
288 
289 	case MPTCPOPT_MP_PRIO:
290 		if (opsize != TCPOLEN_MPTCP_PRIO)
291 			break;
292 
293 		mp_opt->mp_prio = 1;
294 		mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
295 		pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
296 		break;
297 
298 	case MPTCPOPT_MP_FASTCLOSE:
299 		if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
300 			break;
301 
302 		ptr += 2;
303 		mp_opt->rcvr_key = get_unaligned_be64(ptr);
304 		ptr += 8;
305 		mp_opt->fastclose = 1;
306 		break;
307 
308 	default:
309 		break;
310 	}
311 }
312 
313 void mptcp_get_options(const struct sk_buff *skb,
314 		       struct mptcp_options_received *mp_opt)
315 {
316 	const struct tcphdr *th = tcp_hdr(skb);
317 	const unsigned char *ptr;
318 	int length;
319 
320 	/* initialize option status */
321 	mp_opt->mp_capable = 0;
322 	mp_opt->mp_join = 0;
323 	mp_opt->add_addr = 0;
324 	mp_opt->ahmac = 0;
325 	mp_opt->fastclose = 0;
326 	mp_opt->port = 0;
327 	mp_opt->rm_addr = 0;
328 	mp_opt->dss = 0;
329 	mp_opt->mp_prio = 0;
330 
331 	length = (th->doff * 4) - sizeof(struct tcphdr);
332 	ptr = (const unsigned char *)(th + 1);
333 
334 	while (length > 0) {
335 		int opcode = *ptr++;
336 		int opsize;
337 
338 		switch (opcode) {
339 		case TCPOPT_EOL:
340 			return;
341 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
342 			length--;
343 			continue;
344 		default:
345 			opsize = *ptr++;
346 			if (opsize < 2) /* "silly options" */
347 				return;
348 			if (opsize > length)
349 				return;	/* don't parse partial options */
350 			if (opcode == TCPOPT_MPTCP)
351 				mptcp_parse_option(skb, ptr, opsize, mp_opt);
352 			ptr += opsize - 2;
353 			length -= opsize;
354 		}
355 	}
356 }
357 
358 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
359 		       unsigned int *size, struct mptcp_out_options *opts)
360 {
361 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
362 
363 	/* we will use snd_isn to detect first pkt [re]transmission
364 	 * in mptcp_established_options_mp()
365 	 */
366 	subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
367 	if (subflow->request_mptcp) {
368 		opts->suboptions = OPTION_MPTCP_MPC_SYN;
369 		*size = TCPOLEN_MPTCP_MPC_SYN;
370 		return true;
371 	} else if (subflow->request_join) {
372 		pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
373 			 subflow->local_nonce);
374 		opts->suboptions = OPTION_MPTCP_MPJ_SYN;
375 		opts->join_id = subflow->local_id;
376 		opts->token = subflow->remote_token;
377 		opts->nonce = subflow->local_nonce;
378 		opts->backup = subflow->request_bkup;
379 		*size = TCPOLEN_MPTCP_MPJ_SYN;
380 		return true;
381 	}
382 	return false;
383 }
384 
385 /* MP_JOIN client subflow must wait for 4th ack before sending any data:
386  * TCP can't schedule delack timer before the subflow is fully established.
387  * MPTCP uses the delack timer to do 3rd ack retransmissions
388  */
389 static void schedule_3rdack_retransmission(struct sock *sk)
390 {
391 	struct inet_connection_sock *icsk = inet_csk(sk);
392 	struct tcp_sock *tp = tcp_sk(sk);
393 	unsigned long timeout;
394 
395 	/* reschedule with a timeout above RTT, as we must look only for drop */
396 	if (tp->srtt_us)
397 		timeout = tp->srtt_us << 1;
398 	else
399 		timeout = TCP_TIMEOUT_INIT;
400 
401 	WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
402 	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
403 	icsk->icsk_ack.timeout = timeout;
404 	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
405 }
406 
407 static void clear_3rdack_retransmission(struct sock *sk)
408 {
409 	struct inet_connection_sock *icsk = inet_csk(sk);
410 
411 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
412 	icsk->icsk_ack.timeout = 0;
413 	icsk->icsk_ack.ato = 0;
414 	icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
415 }
416 
417 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
418 					 bool snd_data_fin_enable,
419 					 unsigned int *size,
420 					 unsigned int remaining,
421 					 struct mptcp_out_options *opts)
422 {
423 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
424 	struct mptcp_ext *mpext;
425 	unsigned int data_len;
426 
427 	/* When skb is not available, we better over-estimate the emitted
428 	 * options len. A full DSS option (28 bytes) is longer than
429 	 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
430 	 * tell the caller to defer the estimate to
431 	 * mptcp_established_options_dss(), which will reserve enough space.
432 	 */
433 	if (!skb)
434 		return false;
435 
436 	/* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */
437 	if (subflow->fully_established || snd_data_fin_enable ||
438 	    subflow->snd_isn != TCP_SKB_CB(skb)->seq ||
439 	    sk->sk_state != TCP_ESTABLISHED)
440 		return false;
441 
442 	if (subflow->mp_capable) {
443 		mpext = mptcp_get_ext(skb);
444 		data_len = mpext ? mpext->data_len : 0;
445 
446 		/* we will check ext_copy.data_len in mptcp_write_options() to
447 		 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
448 		 * TCPOLEN_MPTCP_MPC_ACK
449 		 */
450 		opts->ext_copy.data_len = data_len;
451 		opts->suboptions = OPTION_MPTCP_MPC_ACK;
452 		opts->sndr_key = subflow->local_key;
453 		opts->rcvr_key = subflow->remote_key;
454 
455 		/* Section 3.1.
456 		 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
457 		 * packets that start the first subflow of an MPTCP connection,
458 		 * as well as the first packet that carries data
459 		 */
460 		if (data_len > 0)
461 			*size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
462 		else
463 			*size = TCPOLEN_MPTCP_MPC_ACK;
464 
465 		pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
466 			 subflow, subflow->local_key, subflow->remote_key,
467 			 data_len);
468 
469 		return true;
470 	} else if (subflow->mp_join) {
471 		opts->suboptions = OPTION_MPTCP_MPJ_ACK;
472 		memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
473 		*size = TCPOLEN_MPTCP_MPJ_ACK;
474 		pr_debug("subflow=%p", subflow);
475 
476 		schedule_3rdack_retransmission(sk);
477 		return true;
478 	}
479 	return false;
480 }
481 
482 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
483 				 struct sk_buff *skb, struct mptcp_ext *ext)
484 {
485 	/* The write_seq value has already been incremented, so the actual
486 	 * sequence number for the DATA_FIN is one less.
487 	 */
488 	u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1;
489 
490 	if (!ext->use_map || !skb->len) {
491 		/* RFC6824 requires a DSS mapping with specific values
492 		 * if DATA_FIN is set but no data payload is mapped
493 		 */
494 		ext->data_fin = 1;
495 		ext->use_map = 1;
496 		ext->dsn64 = 1;
497 		ext->data_seq = data_fin_tx_seq;
498 		ext->subflow_seq = 0;
499 		ext->data_len = 1;
500 	} else if (ext->data_seq + ext->data_len == data_fin_tx_seq) {
501 		/* If there's an existing DSS mapping and it is the
502 		 * final mapping, DATA_FIN consumes 1 additional byte of
503 		 * mapping space.
504 		 */
505 		ext->data_fin = 1;
506 		ext->data_len++;
507 	}
508 }
509 
510 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
511 					  bool snd_data_fin_enable,
512 					  unsigned int *size,
513 					  unsigned int remaining,
514 					  struct mptcp_out_options *opts)
515 {
516 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
517 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
518 	unsigned int dss_size = 0;
519 	struct mptcp_ext *mpext;
520 	unsigned int ack_size;
521 	bool ret = false;
522 	u64 ack_seq;
523 
524 	mpext = skb ? mptcp_get_ext(skb) : NULL;
525 
526 	if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
527 		unsigned int map_size;
528 
529 		map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
530 
531 		remaining -= map_size;
532 		dss_size = map_size;
533 		if (mpext)
534 			opts->ext_copy = *mpext;
535 
536 		if (skb && snd_data_fin_enable)
537 			mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
538 		ret = true;
539 	}
540 
541 	/* passive sockets msk will set the 'can_ack' after accept(), even
542 	 * if the first subflow may have the already the remote key handy
543 	 */
544 	opts->ext_copy.use_ack = 0;
545 	if (!READ_ONCE(msk->can_ack)) {
546 		*size = ALIGN(dss_size, 4);
547 		return ret;
548 	}
549 
550 	ack_seq = READ_ONCE(msk->ack_seq);
551 	if (READ_ONCE(msk->use_64bit_ack)) {
552 		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
553 		opts->ext_copy.data_ack = ack_seq;
554 		opts->ext_copy.ack64 = 1;
555 	} else {
556 		ack_size = TCPOLEN_MPTCP_DSS_ACK32;
557 		opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
558 		opts->ext_copy.ack64 = 0;
559 	}
560 	opts->ext_copy.use_ack = 1;
561 	WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
562 
563 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
564 	if (dss_size == 0)
565 		ack_size += TCPOLEN_MPTCP_DSS_BASE;
566 
567 	dss_size += ack_size;
568 
569 	*size = ALIGN(dss_size, 4);
570 	return true;
571 }
572 
573 static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
574 				  struct in_addr *addr)
575 {
576 	u8 hmac[SHA256_DIGEST_SIZE];
577 	u8 msg[7];
578 
579 	msg[0] = addr_id;
580 	memcpy(&msg[1], &addr->s_addr, 4);
581 	msg[5] = 0;
582 	msg[6] = 0;
583 
584 	mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
585 
586 	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
587 }
588 
589 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
590 static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
591 				   struct in6_addr *addr)
592 {
593 	u8 hmac[SHA256_DIGEST_SIZE];
594 	u8 msg[19];
595 
596 	msg[0] = addr_id;
597 	memcpy(&msg[1], &addr->s6_addr, 16);
598 	msg[17] = 0;
599 	msg[18] = 0;
600 
601 	mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
602 
603 	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
604 }
605 #endif
606 
607 static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb,
608 					       unsigned int *size,
609 					       unsigned int remaining,
610 					       struct mptcp_out_options *opts)
611 {
612 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
613 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
614 	bool drop_other_suboptions = false;
615 	unsigned int opt_size = *size;
616 	struct mptcp_addr_info saddr;
617 	bool echo;
618 	bool port;
619 	int len;
620 
621 	if ((mptcp_pm_should_add_signal_ipv6(msk) ||
622 	     mptcp_pm_should_add_signal_port(msk)) &&
623 	    skb && skb_is_tcp_pure_ack(skb)) {
624 		pr_debug("drop other suboptions");
625 		opts->suboptions = 0;
626 		opts->ext_copy.use_ack = 0;
627 		opts->ext_copy.use_map = 0;
628 		remaining += opt_size;
629 		drop_other_suboptions = true;
630 	}
631 
632 	if (!mptcp_pm_should_add_signal(msk) ||
633 	    !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port)))
634 		return false;
635 
636 	len = mptcp_add_addr_len(saddr.family, echo, port);
637 	if (remaining < len)
638 		return false;
639 
640 	*size = len;
641 	if (drop_other_suboptions)
642 		*size -= opt_size;
643 	opts->addr_id = saddr.id;
644 	if (port)
645 		opts->port = ntohs(saddr.port);
646 	if (saddr.family == AF_INET) {
647 		opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
648 		opts->addr = saddr.addr;
649 		if (!echo) {
650 			opts->ahmac = add_addr_generate_hmac(msk->local_key,
651 							     msk->remote_key,
652 							     opts->addr_id,
653 							     &opts->addr);
654 		}
655 	}
656 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
657 	else if (saddr.family == AF_INET6) {
658 		opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
659 		opts->addr6 = saddr.addr6;
660 		if (!echo) {
661 			opts->ahmac = add_addr6_generate_hmac(msk->local_key,
662 							      msk->remote_key,
663 							      opts->addr_id,
664 							      &opts->addr6);
665 		}
666 	}
667 #endif
668 	pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
669 		 opts->addr_id, opts->ahmac, echo, opts->port);
670 
671 	return true;
672 }
673 
674 static bool mptcp_established_options_rm_addr(struct sock *sk,
675 					      unsigned int *size,
676 					      unsigned int remaining,
677 					      struct mptcp_out_options *opts)
678 {
679 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
680 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
681 	struct mptcp_rm_list rm_list;
682 	int i, len;
683 
684 	if (!mptcp_pm_should_rm_signal(msk) ||
685 	    !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
686 		return false;
687 
688 	len = mptcp_rm_addr_len(&rm_list);
689 	if (len < 0)
690 		return false;
691 	if (remaining < len)
692 		return false;
693 
694 	*size = len;
695 	opts->suboptions |= OPTION_MPTCP_RM_ADDR;
696 	opts->rm_list = rm_list;
697 
698 	for (i = 0; i < opts->rm_list.nr; i++)
699 		pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
700 
701 	return true;
702 }
703 
704 static bool mptcp_established_options_mp_prio(struct sock *sk,
705 					      unsigned int *size,
706 					      unsigned int remaining,
707 					      struct mptcp_out_options *opts)
708 {
709 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
710 
711 	if (!subflow->send_mp_prio)
712 		return false;
713 
714 	/* account for the trailing 'nop' option */
715 	if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN)
716 		return false;
717 
718 	*size = TCPOLEN_MPTCP_PRIO_ALIGN;
719 	opts->suboptions |= OPTION_MPTCP_PRIO;
720 	opts->backup = subflow->request_bkup;
721 
722 	pr_debug("prio=%d", opts->backup);
723 
724 	return true;
725 }
726 
727 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
728 			       unsigned int *size, unsigned int remaining,
729 			       struct mptcp_out_options *opts)
730 {
731 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
732 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
733 	unsigned int opt_size = 0;
734 	bool snd_data_fin;
735 	bool ret = false;
736 
737 	opts->suboptions = 0;
738 
739 	if (unlikely(__mptcp_check_fallback(msk)))
740 		return false;
741 
742 	/* prevent adding of any MPTCP related options on reset packet
743 	 * until we support MP_TCPRST/MP_FASTCLOSE
744 	 */
745 	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
746 		return false;
747 
748 	snd_data_fin = mptcp_data_fin_enabled(msk);
749 	if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
750 		ret = true;
751 	else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts))
752 		ret = true;
753 
754 	/* we reserved enough space for the above options, and exceeding the
755 	 * TCP option space would be fatal
756 	 */
757 	if (WARN_ON_ONCE(opt_size > remaining))
758 		return false;
759 
760 	*size += opt_size;
761 	remaining -= opt_size;
762 	if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) {
763 		*size += opt_size;
764 		remaining -= opt_size;
765 		ret = true;
766 	} else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
767 		*size += opt_size;
768 		remaining -= opt_size;
769 		ret = true;
770 	}
771 
772 	if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) {
773 		*size += opt_size;
774 		remaining -= opt_size;
775 		ret = true;
776 	}
777 
778 	return ret;
779 }
780 
781 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
782 			  struct mptcp_out_options *opts)
783 {
784 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
785 
786 	if (subflow_req->mp_capable) {
787 		opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
788 		opts->sndr_key = subflow_req->local_key;
789 		*size = TCPOLEN_MPTCP_MPC_SYNACK;
790 		pr_debug("subflow_req=%p, local_key=%llu",
791 			 subflow_req, subflow_req->local_key);
792 		return true;
793 	} else if (subflow_req->mp_join) {
794 		opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
795 		opts->backup = subflow_req->backup;
796 		opts->join_id = subflow_req->local_id;
797 		opts->thmac = subflow_req->thmac;
798 		opts->nonce = subflow_req->local_nonce;
799 		pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
800 			 subflow_req, opts->backup, opts->join_id,
801 			 opts->thmac, opts->nonce);
802 		*size = TCPOLEN_MPTCP_MPJ_SYNACK;
803 		return true;
804 	}
805 	return false;
806 }
807 
/* Track the transition of @subflow (and of @msk) to the fully established
 * state while processing the options @mp_opt received with @skb on @ssk.
 *
 * Returns true when the caller may go on processing the received options,
 * false when it must stop: the connection fell back to plain TCP, the
 * subflow has been reset, or an out-of-sequence packet reached a subflow
 * that is not mp_capable.
 */
static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
				    struct mptcp_subflow_context *subflow,
				    struct sk_buff *skb,
				    struct mptcp_options_received *mp_opt)
{
	/* here we can process OoO, in-window pkts, only in-sequence 4th ack
	 * will make the subflow fully established
	 */
	if (likely(subflow->fully_established)) {
		/* on passive sockets, check for 3rd ack retransmission
		 * note that msk is always set by subflow_syn_recv_sock()
		 * for mp_join subflows
		 */
		if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
		    subflow->mp_join && mp_opt->mp_join &&
		    READ_ONCE(msk->pm.server_side))
			tcp_send_ack(ssk);
		goto fully_established;
	}

	/* we must process OoO packets before the first subflow is fully
	 * established. OoO packets are instead a protocol violation
	 * for MP_JOIN subflows as the peer must not send any data
	 * before receiving the fourth ack - cfr. RFC 8684 section 3.2.
	 */
	if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
		if (subflow->mp_join)
			goto reset;
		return subflow->mp_capable;
	}

	if (mp_opt->dss && mp_opt->use_ack) {
		/* subflows are fully established as soon as we get any
		 * additional ack.
		 */
		subflow->fully_established = 1;
		WRITE_ONCE(msk->fully_established, true);
		goto fully_established;
	}

	/* an ADD_ADDR marks the msk as fully established; the subflow flag
	 * is left untouched and the PM notification below is skipped
	 */
	if (mp_opt->add_addr) {
		WRITE_ONCE(msk->fully_established, true);
		return true;
	}

	/* If the first established packet does not contain MP_CAPABLE + data
	 * then fallback to TCP. Fallback scenarios requires a reset for
	 * MP_JOIN subflows.
	 */
	if (!mp_opt->mp_capable) {
		if (subflow->mp_join)
			goto reset;
		subflow->mp_capable = 0;
		pr_fallback(msk);
		__mptcp_do_fallback(msk);
		return false;
	}

	if (unlikely(!READ_ONCE(msk->pm.server_side)))
		pr_warn_once("bogus mpc option on established client sk");
	mptcp_subflow_fully_established(subflow, mp_opt);

fully_established:
	/* if the subflow is not already linked into the conn_list, we can't
	 * notify the PM: this subflow is still on the listener queue
	 * and the PM possibly acquiring the subflow lock could race with
	 * the listener close
	 */
	if (likely(subflow->pm_notified) || list_empty(&subflow->node))
		return true;

	/* notify the path manager only once per subflow */
	subflow->pm_notified = 1;
	if (subflow->mp_join) {
		clear_3rdack_retransmission(ssk);
		mptcp_pm_subflow_established(msk, subflow);
	} else {
		mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC);
	}
	return true;

reset:
	mptcp_subflow_reset(ssk);
	return false;
}
893 
894 static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
895 {
896 	u32 old_ack32, cur_ack32;
897 
898 	if (use_64bit)
899 		return cur_ack;
900 
901 	old_ack32 = (u32)old_ack;
902 	cur_ack32 = (u32)cur_ack;
903 	cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32;
904 	if (unlikely(before(cur_ack32, old_ack32)))
905 		return cur_ack + (1LL << 32);
906 	return cur_ack;
907 }
908 
909 static void ack_update_msk(struct mptcp_sock *msk,
910 			   struct sock *ssk,
911 			   struct mptcp_options_received *mp_opt)
912 {
913 	u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
914 	struct sock *sk = (struct sock *)msk;
915 	u64 old_snd_una;
916 
917 	mptcp_data_lock(sk);
918 
919 	/* avoid ack expansion on update conflict, to reduce the risk of
920 	 * wrongly expanding to a future ack sequence number, which is way
921 	 * more dangerous than missing an ack
922 	 */
923 	old_snd_una = msk->snd_una;
924 	new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
925 
926 	/* ACK for data not even sent yet? Ignore. */
927 	if (after64(new_snd_una, snd_nxt))
928 		new_snd_una = old_snd_una;
929 
930 	new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
931 
932 	if (after64(new_wnd_end, msk->wnd_end))
933 		msk->wnd_end = new_wnd_end;
934 
935 	/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
936 	if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)))
937 		__mptcp_check_push(sk, ssk);
938 
939 	if (after64(new_snd_una, old_snd_una)) {
940 		msk->snd_una = new_snd_una;
941 		__mptcp_data_acked(sk);
942 	}
943 	mptcp_data_unlock(sk);
944 }
945 
946 bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
947 {
948 	/* Skip if DATA_FIN was already received.
949 	 * If updating simultaneously with the recvmsg loop, values
950 	 * should match. If they mismatch, the peer is misbehaving and
951 	 * we will prefer the most recent information.
952 	 */
953 	if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first))
954 		return false;
955 
956 	WRITE_ONCE(msk->rcv_data_fin_seq,
957 		   expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit));
958 	WRITE_ONCE(msk->rcv_data_fin, 1);
959 
960 	return true;
961 }
962 
963 static bool add_addr_hmac_valid(struct mptcp_sock *msk,
964 				struct mptcp_options_received *mp_opt)
965 {
966 	u64 hmac = 0;
967 
968 	if (mp_opt->echo)
969 		return true;
970 
971 	if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
972 		hmac = add_addr_generate_hmac(msk->remote_key,
973 					      msk->local_key,
974 					      mp_opt->addr_id, &mp_opt->addr);
975 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
976 	else
977 		hmac = add_addr6_generate_hmac(msk->remote_key,
978 					       msk->local_key,
979 					       mp_opt->addr_id, &mp_opt->addr6);
980 #endif
981 
982 	pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
983 		 msk, (unsigned long long)hmac,
984 		 (unsigned long long)mp_opt->ahmac);
985 
986 	return hmac == mp_opt->ahmac;
987 }
988 
989 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
990 {
991 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
992 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
993 	struct mptcp_options_received mp_opt;
994 	struct mptcp_ext *mpext;
995 
996 	if (__mptcp_check_fallback(msk)) {
997 		/* Keep it simple and unconditionally trigger send data cleanup and
998 		 * pending queue spooling. We will need to acquire the data lock
999 		 * for more accurate checks, and once the lock is acquired, such
1000 		 * helpers are cheap.
1001 		 */
1002 		mptcp_data_lock(subflow->conn);
1003 		if (sk_stream_memory_free(sk))
1004 			__mptcp_check_push(subflow->conn, sk);
1005 		__mptcp_data_acked(subflow->conn);
1006 		mptcp_data_unlock(subflow->conn);
1007 		return;
1008 	}
1009 
1010 	mptcp_get_options(skb, &mp_opt);
1011 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
1012 		return;
1013 
1014 	if (mp_opt.fastclose &&
1015 	    msk->local_key == mp_opt.rcvr_key) {
1016 		WRITE_ONCE(msk->rcv_fastclose, true);
1017 		mptcp_schedule_work((struct sock *)msk);
1018 	}
1019 
1020 	if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
1021 		struct mptcp_addr_info addr;
1022 
1023 		addr.port = htons(mp_opt.port);
1024 		addr.id = mp_opt.addr_id;
1025 		if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) {
1026 			addr.family = AF_INET;
1027 			addr.addr = mp_opt.addr;
1028 		}
1029 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1030 		else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) {
1031 			addr.family = AF_INET6;
1032 			addr.addr6 = mp_opt.addr6;
1033 		}
1034 #endif
1035 		if (!mp_opt.echo) {
1036 			mptcp_pm_add_addr_received(msk, &addr);
1037 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
1038 		} else {
1039 			mptcp_pm_del_add_timer(msk, &addr);
1040 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
1041 		}
1042 
1043 		if (mp_opt.port)
1044 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD);
1045 
1046 		mp_opt.add_addr = 0;
1047 	}
1048 
1049 	if (mp_opt.rm_addr) {
1050 		mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);
1051 		mp_opt.rm_addr = 0;
1052 	}
1053 
1054 	if (mp_opt.mp_prio) {
1055 		mptcp_pm_mp_prio_received(sk, mp_opt.backup);
1056 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX);
1057 		mp_opt.mp_prio = 0;
1058 	}
1059 
1060 	if (!mp_opt.dss)
1061 		return;
1062 
1063 	/* we can't wait for recvmsg() to update the ack_seq, otherwise
1064 	 * monodirectional flows will stuck
1065 	 */
1066 	if (mp_opt.use_ack)
1067 		ack_update_msk(msk, sk, &mp_opt);
1068 
1069 	/* Zero-data-length packets are dropped by the caller and not
1070 	 * propagated to the MPTCP layer, so the skb extension does not
1071 	 * need to be allocated or populated. DATA_FIN information, if
1072 	 * present, needs to be updated here before the skb is freed.
1073 	 */
1074 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
1075 		if (mp_opt.data_fin && mp_opt.data_len == 1 &&
1076 		    mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) &&
1077 		    schedule_work(&msk->work))
1078 			sock_hold(subflow->conn);
1079 
1080 		return;
1081 	}
1082 
1083 	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
1084 	if (!mpext)
1085 		return;
1086 
1087 	memset(mpext, 0, sizeof(*mpext));
1088 
1089 	if (mp_opt.use_map) {
1090 		if (mp_opt.mpc_map) {
1091 			/* this is an MP_CAPABLE carrying MPTCP data
1092 			 * we know this map the first chunk of data
1093 			 */
1094 			mptcp_crypto_key_sha(subflow->remote_key, NULL,
1095 					     &mpext->data_seq);
1096 			mpext->data_seq++;
1097 			mpext->subflow_seq = 1;
1098 			mpext->dsn64 = 1;
1099 			mpext->mpc_map = 1;
1100 			mpext->data_fin = 0;
1101 		} else {
1102 			mpext->data_seq = mp_opt.data_seq;
1103 			mpext->subflow_seq = mp_opt.subflow_seq;
1104 			mpext->dsn64 = mp_opt.dsn64;
1105 			mpext->data_fin = mp_opt.data_fin;
1106 		}
1107 		mpext->data_len = mp_opt.data_len;
1108 		mpext->use_map = 1;
1109 	}
1110 }
1111 
1112 static void mptcp_set_rwin(const struct tcp_sock *tp)
1113 {
1114 	const struct sock *ssk = (const struct sock *)tp;
1115 	const struct mptcp_subflow_context *subflow;
1116 	struct mptcp_sock *msk;
1117 	u64 ack_seq;
1118 
1119 	subflow = mptcp_subflow_ctx(ssk);
1120 	msk = mptcp_sk(subflow->conn);
1121 
1122 	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
1123 
1124 	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
1125 		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
1126 }
1127 
1128 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
1129 			 struct mptcp_out_options *opts)
1130 {
1131 	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
1132 	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
1133 		u8 len;
1134 
1135 		if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
1136 			len = TCPOLEN_MPTCP_MPC_SYN;
1137 		else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
1138 			len = TCPOLEN_MPTCP_MPC_SYNACK;
1139 		else if (opts->ext_copy.data_len)
1140 			len = TCPOLEN_MPTCP_MPC_ACK_DATA;
1141 		else
1142 			len = TCPOLEN_MPTCP_MPC_ACK;
1143 
1144 		*ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
1145 				      MPTCP_SUPPORTED_VERSION,
1146 				      MPTCP_CAP_HMAC_SHA256);
1147 
1148 		if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
1149 		    opts->suboptions))
1150 			goto mp_capable_done;
1151 
1152 		put_unaligned_be64(opts->sndr_key, ptr);
1153 		ptr += 2;
1154 		if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
1155 			goto mp_capable_done;
1156 
1157 		put_unaligned_be64(opts->rcvr_key, ptr);
1158 		ptr += 2;
1159 		if (!opts->ext_copy.data_len)
1160 			goto mp_capable_done;
1161 
1162 		put_unaligned_be32(opts->ext_copy.data_len << 16 |
1163 				   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
1164 		ptr += 1;
1165 	}
1166 
1167 mp_capable_done:
1168 	if ((OPTION_MPTCP_ADD_ADDR
1169 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1170 	     | OPTION_MPTCP_ADD_ADDR6
1171 #endif
1172 	    ) & opts->suboptions) {
1173 		u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
1174 		u8 echo = MPTCP_ADDR_ECHO;
1175 
1176 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1177 		if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions)
1178 			len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
1179 #endif
1180 
1181 		if (opts->port)
1182 			len += TCPOLEN_MPTCP_PORT_LEN;
1183 
1184 		if (opts->ahmac) {
1185 			len += sizeof(opts->ahmac);
1186 			echo = 0;
1187 		}
1188 
1189 		*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
1190 				      len, echo, opts->addr_id);
1191 		if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
1192 			memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
1193 			ptr += 1;
1194 		}
1195 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1196 		else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
1197 			memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
1198 			ptr += 4;
1199 		}
1200 #endif
1201 
1202 		if (!opts->port) {
1203 			if (opts->ahmac) {
1204 				put_unaligned_be64(opts->ahmac, ptr);
1205 				ptr += 2;
1206 			}
1207 		} else {
1208 			if (opts->ahmac) {
1209 				u8 *bptr = (u8 *)ptr;
1210 
1211 				put_unaligned_be16(opts->port, bptr);
1212 				bptr += 2;
1213 				put_unaligned_be64(opts->ahmac, bptr);
1214 				bptr += 8;
1215 				put_unaligned_be16(TCPOPT_NOP << 8 |
1216 						   TCPOPT_NOP, bptr);
1217 
1218 				ptr += 3;
1219 			} else {
1220 				put_unaligned_be32(opts->port << 16 |
1221 						   TCPOPT_NOP << 8 |
1222 						   TCPOPT_NOP, ptr);
1223 				ptr += 1;
1224 			}
1225 		}
1226 	}
1227 
1228 	if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
1229 		u8 i = 1;
1230 
1231 		*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
1232 				      TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
1233 				      0, opts->rm_list.ids[0]);
1234 
1235 		while (i < opts->rm_list.nr) {
1236 			u8 id1, id2, id3, id4;
1237 
1238 			id1 = opts->rm_list.ids[i];
1239 			id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
1240 			id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
1241 			id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
1242 			put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
1243 			ptr += 1;
1244 			i += 4;
1245 		}
1246 	}
1247 
1248 	if (OPTION_MPTCP_PRIO & opts->suboptions) {
1249 		const struct sock *ssk = (const struct sock *)tp;
1250 		struct mptcp_subflow_context *subflow;
1251 
1252 		subflow = mptcp_subflow_ctx(ssk);
1253 		subflow->send_mp_prio = 0;
1254 
1255 		*ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
1256 				      TCPOLEN_MPTCP_PRIO,
1257 				      opts->backup, TCPOPT_NOP);
1258 	}
1259 
1260 	if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
1261 		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
1262 				      TCPOLEN_MPTCP_MPJ_SYN,
1263 				      opts->backup, opts->join_id);
1264 		put_unaligned_be32(opts->token, ptr);
1265 		ptr += 1;
1266 		put_unaligned_be32(opts->nonce, ptr);
1267 		ptr += 1;
1268 	}
1269 
1270 	if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
1271 		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
1272 				      TCPOLEN_MPTCP_MPJ_SYNACK,
1273 				      opts->backup, opts->join_id);
1274 		put_unaligned_be64(opts->thmac, ptr);
1275 		ptr += 2;
1276 		put_unaligned_be32(opts->nonce, ptr);
1277 		ptr += 1;
1278 	}
1279 
1280 	if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
1281 		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
1282 				      TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
1283 		memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
1284 		ptr += 5;
1285 	}
1286 
1287 	if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
1288 		struct mptcp_ext *mpext = &opts->ext_copy;
1289 		u8 len = TCPOLEN_MPTCP_DSS_BASE;
1290 		u8 flags = 0;
1291 
1292 		if (mpext->use_ack) {
1293 			flags = MPTCP_DSS_HAS_ACK;
1294 			if (mpext->ack64) {
1295 				len += TCPOLEN_MPTCP_DSS_ACK64;
1296 				flags |= MPTCP_DSS_ACK64;
1297 			} else {
1298 				len += TCPOLEN_MPTCP_DSS_ACK32;
1299 			}
1300 		}
1301 
1302 		if (mpext->use_map) {
1303 			len += TCPOLEN_MPTCP_DSS_MAP64;
1304 
1305 			/* Use only 64-bit mapping flags for now, add
1306 			 * support for optional 32-bit mappings later.
1307 			 */
1308 			flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
1309 			if (mpext->data_fin)
1310 				flags |= MPTCP_DSS_DATA_FIN;
1311 		}
1312 
1313 		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
1314 
1315 		if (mpext->use_ack) {
1316 			if (mpext->ack64) {
1317 				put_unaligned_be64(mpext->data_ack, ptr);
1318 				ptr += 2;
1319 			} else {
1320 				put_unaligned_be32(mpext->data_ack32, ptr);
1321 				ptr += 1;
1322 			}
1323 		}
1324 
1325 		if (mpext->use_map) {
1326 			put_unaligned_be64(mpext->data_seq, ptr);
1327 			ptr += 2;
1328 			put_unaligned_be32(mpext->subflow_seq, ptr);
1329 			ptr += 1;
1330 			put_unaligned_be32(mpext->data_len << 16 |
1331 					   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
1332 		}
1333 	}
1334 
1335 	if (tp)
1336 		mptcp_set_rwin(tp);
1337 }
1338