1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #define pr_fmt(fmt) "MPTCP: " fmt 8 9 #include <linux/kernel.h> 10 #include <crypto/sha.h> 11 #include <net/tcp.h> 12 #include <net/mptcp.h> 13 #include "protocol.h" 14 #include "mib.h" 15 16 static bool mptcp_cap_flag_sha256(u8 flags) 17 { 18 return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; 19 } 20 21 static void mptcp_parse_option(const struct sk_buff *skb, 22 const unsigned char *ptr, int opsize, 23 struct mptcp_options_received *mp_opt) 24 { 25 u8 subtype = *ptr >> 4; 26 int expected_opsize; 27 u8 version; 28 u8 flags; 29 30 switch (subtype) { 31 case MPTCPOPT_MP_CAPABLE: 32 /* strict size checking */ 33 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 34 if (skb->len > tcp_hdr(skb)->doff << 2) 35 expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; 36 else 37 expected_opsize = TCPOLEN_MPTCP_MPC_ACK; 38 } else { 39 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) 40 expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; 41 else 42 expected_opsize = TCPOLEN_MPTCP_MPC_SYN; 43 } 44 if (opsize != expected_opsize) 45 break; 46 47 /* try to be gentle vs future versions on the initial syn */ 48 version = *ptr++ & MPTCP_VERSION_MASK; 49 if (opsize != TCPOLEN_MPTCP_MPC_SYN) { 50 if (version != MPTCP_SUPPORTED_VERSION) 51 break; 52 } else if (version < MPTCP_SUPPORTED_VERSION) { 53 break; 54 } 55 56 flags = *ptr++; 57 if (!mptcp_cap_flag_sha256(flags) || 58 (flags & MPTCP_CAP_EXTENSIBILITY)) 59 break; 60 61 /* RFC 6824, Section 3.1: 62 * "For the Checksum Required bit (labeled "A"), if either 63 * host requires the use of checksums, checksums MUST be used. 64 * In other words, the only way for checksums not to be used 65 * is if both hosts in their SYNs set A=0." 66 * 67 * Section 3.3.0: 68 * "If a checksum is not present when its use has been 69 * negotiated, the receiver MUST close the subflow with a RST as 70 * it is considered broken." 71 * 72 * We don't implement DSS checksum - fall back to TCP. 73 */ 74 if (flags & MPTCP_CAP_CHECKSUM_REQD) 75 break; 76 77 mp_opt->mp_capable = 1; 78 if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { 79 mp_opt->sndr_key = get_unaligned_be64(ptr); 80 ptr += 8; 81 } 82 if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { 83 mp_opt->rcvr_key = get_unaligned_be64(ptr); 84 ptr += 8; 85 } 86 if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { 87 /* Section 3.1.: 88 * "the data parameters in a MP_CAPABLE are semantically 89 * equivalent to those in a DSS option and can be used 90 * interchangeably." 91 */ 92 mp_opt->dss = 1; 93 mp_opt->use_map = 1; 94 mp_opt->mpc_map = 1; 95 mp_opt->data_len = get_unaligned_be16(ptr); 96 ptr += 2; 97 } 98 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", 99 version, flags, opsize, mp_opt->sndr_key, 100 mp_opt->rcvr_key, mp_opt->data_len); 101 break; 102 103 case MPTCPOPT_MP_JOIN: 104 mp_opt->mp_join = 1; 105 if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { 106 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; 107 mp_opt->join_id = *ptr++; 108 mp_opt->token = get_unaligned_be32(ptr); 109 ptr += 4; 110 mp_opt->nonce = get_unaligned_be32(ptr); 111 ptr += 4; 112 pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", 113 mp_opt->backup, mp_opt->join_id, 114 mp_opt->token, mp_opt->nonce); 115 } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { 116 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; 117 mp_opt->join_id = *ptr++; 118 mp_opt->thmac = get_unaligned_be64(ptr); 119 ptr += 8; 120 mp_opt->nonce = get_unaligned_be32(ptr); 121 ptr += 4; 122 pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", 123 mp_opt->backup, mp_opt->join_id, 124 mp_opt->thmac, mp_opt->nonce); 125 } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { 126 ptr += 2; 127 memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); 128 pr_debug("MP_JOIN hmac"); 129 } else { 130 pr_warn("MP_JOIN bad option size"); 131 mp_opt->mp_join = 0; 132 } 133 break; 134 135 case MPTCPOPT_DSS: 136 pr_debug("DSS"); 137 ptr++; 138 139 /* we must clear 'mpc_map' be able to detect MP_CAPABLE 140 * map vs DSS map in mptcp_incoming_options(), and reconstruct 141 * map info accordingly 142 */ 143 mp_opt->mpc_map = 0; 144 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; 145 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; 146 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; 147 mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; 148 mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; 149 mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); 150 151 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", 152 mp_opt->data_fin, mp_opt->dsn64, 153 mp_opt->use_map, mp_opt->ack64, 154 mp_opt->use_ack); 155 156 expected_opsize = TCPOLEN_MPTCP_DSS_BASE; 157 158 if (mp_opt->use_ack) { 159 if (mp_opt->ack64) 160 expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; 161 else 162 expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; 163 } 164 165 if (mp_opt->use_map) { 166 if (mp_opt->dsn64) 167 expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; 168 else 169 expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; 170 } 171 172 /* RFC 6824, Section 3.3: 173 * If a checksum is present, but its use had 174 * not been negotiated in the MP_CAPABLE handshake, 175 * the checksum field MUST be ignored. 176 */ 177 if (opsize != expected_opsize && 178 opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) 179 break; 180 181 mp_opt->dss = 1; 182 183 if (mp_opt->use_ack) { 184 if (mp_opt->ack64) { 185 mp_opt->data_ack = get_unaligned_be64(ptr); 186 ptr += 8; 187 } else { 188 mp_opt->data_ack = get_unaligned_be32(ptr); 189 ptr += 4; 190 } 191 192 pr_debug("data_ack=%llu", mp_opt->data_ack); 193 } 194 195 if (mp_opt->use_map) { 196 if (mp_opt->dsn64) { 197 mp_opt->data_seq = get_unaligned_be64(ptr); 198 ptr += 8; 199 } else { 200 mp_opt->data_seq = get_unaligned_be32(ptr); 201 ptr += 4; 202 } 203 204 mp_opt->subflow_seq = get_unaligned_be32(ptr); 205 ptr += 4; 206 207 mp_opt->data_len = get_unaligned_be16(ptr); 208 ptr += 2; 209 210 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", 211 mp_opt->data_seq, mp_opt->subflow_seq, 212 mp_opt->data_len); 213 } 214 215 break; 216 217 case MPTCPOPT_ADD_ADDR: 218 mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; 219 if (!mp_opt->echo) { 220 if (opsize == TCPOLEN_MPTCP_ADD_ADDR || 221 opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) 222 mp_opt->family = MPTCP_ADDR_IPVERSION_4; 223 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 224 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || 225 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) 226 mp_opt->family = MPTCP_ADDR_IPVERSION_6; 227 #endif 228 else 229 break; 230 } else { 231 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || 232 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) 233 mp_opt->family = MPTCP_ADDR_IPVERSION_4; 234 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 235 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || 236 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) 237 mp_opt->family = MPTCP_ADDR_IPVERSION_6; 238 #endif 239 else 240 break; 241 } 242 243 mp_opt->add_addr = 1; 244 mp_opt->port = 0; 245 mp_opt->addr_id = *ptr++; 246 pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo); 247 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { 248 memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); 249 ptr += 4; 250 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || 251 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { 252 mp_opt->port = get_unaligned_be16(ptr); 253 ptr += 2; 254 } 255 } 256 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 257 else { 258 memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); 259 ptr += 16; 260 if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || 261 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { 262 mp_opt->port = get_unaligned_be16(ptr); 263 ptr += 2; 264 } 265 } 266 #endif 267 if (!mp_opt->echo) { 268 mp_opt->ahmac = get_unaligned_be64(ptr); 269 ptr += 8; 270 } 271 break; 272 273 case MPTCPOPT_RM_ADDR: 274 if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) 275 break; 276 277 ptr++; 278 279 mp_opt->rm_addr = 1; 280 mp_opt->rm_id = *ptr++; 281 pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); 282 break; 283 284 default: 285 break; 286 } 287 } 288 289 void mptcp_get_options(const struct sk_buff *skb, 290 struct mptcp_options_received *mp_opt) 291 { 292 const struct tcphdr *th = tcp_hdr(skb); 293 const unsigned char *ptr; 294 int length; 295 296 /* initialize option status */ 297 mp_opt->mp_capable = 0; 298 mp_opt->mp_join = 0; 299 mp_opt->add_addr = 0; 300 mp_opt->rm_addr = 0; 301 mp_opt->dss = 0; 302 303 length = (th->doff * 4) - sizeof(struct tcphdr); 304 ptr = (const unsigned char *)(th + 1); 305 306 while (length > 0) { 307 int opcode = *ptr++; 308 int opsize; 309 310 switch (opcode) { 311 case TCPOPT_EOL: 312 return; 313 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 314 length--; 315 continue; 316 default: 317 opsize = *ptr++; 318 if (opsize < 2) /* "silly options" */ 319 return; 320 if (opsize > length) 321 return; /* don't parse partial options */ 322 if (opcode == TCPOPT_MPTCP) 323 mptcp_parse_option(skb, ptr, opsize, mp_opt); 324 ptr += opsize - 2; 325 length -= opsize; 326 } 327 } 328 } 329 330 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, 331 unsigned int *size, struct mptcp_out_options *opts) 332 { 333 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 334 335 /* we will use snd_isn to detect first pkt [re]transmission 336 * in mptcp_established_options_mp() 337 */ 338 subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; 339 if (subflow->request_mptcp) { 340 opts->suboptions = OPTION_MPTCP_MPC_SYN; 341 *size = TCPOLEN_MPTCP_MPC_SYN; 342 return true; 343 } else if (subflow->request_join) { 344 pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, 345 subflow->local_nonce); 346 opts->suboptions = OPTION_MPTCP_MPJ_SYN; 347 opts->join_id = subflow->local_id; 348 opts->token = subflow->remote_token; 349 opts->nonce = subflow->local_nonce; 350 opts->backup = subflow->request_bkup; 351 *size = TCPOLEN_MPTCP_MPJ_SYN; 352 return true; 353 } 354 return false; 355 } 356 357 /* MP_JOIN client subflow must wait for 4th ack before sending any data: 358 * TCP can't schedule delack timer before the subflow is fully established. 359 * MPTCP uses the delack timer to do 3rd ack retransmissions 360 */ 361 static void schedule_3rdack_retransmission(struct sock *sk) 362 { 363 struct inet_connection_sock *icsk = inet_csk(sk); 364 struct tcp_sock *tp = tcp_sk(sk); 365 unsigned long timeout; 366 367 /* reschedule with a timeout above RTT, as we must look only for drop */ 368 if (tp->srtt_us) 369 timeout = tp->srtt_us << 1; 370 else 371 timeout = TCP_TIMEOUT_INIT; 372 373 WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); 374 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 375 icsk->icsk_ack.timeout = timeout; 376 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); 377 } 378 379 static void clear_3rdack_retransmission(struct sock *sk) 380 { 381 struct inet_connection_sock *icsk = inet_csk(sk); 382 383 sk_stop_timer(sk, &icsk->icsk_delack_timer); 384 icsk->icsk_ack.timeout = 0; 385 icsk->icsk_ack.ato = 0; 386 icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); 387 } 388 389 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, 390 unsigned int *size, 391 unsigned int remaining, 392 struct mptcp_out_options *opts) 393 { 394 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 395 struct mptcp_ext *mpext; 396 unsigned int data_len; 397 398 /* When skb is not available, we better over-estimate the emitted 399 * options len. A full DSS option (28 bytes) is longer than 400 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so 401 * tell the caller to defer the estimate to 402 * mptcp_established_options_dss(), which will reserve enough space. 403 */ 404 if (!skb) 405 return false; 406 407 /* MPC/MPJ needed only on 3rd ack packet */ 408 if (subflow->fully_established || 409 subflow->snd_isn != TCP_SKB_CB(skb)->seq) 410 return false; 411 412 if (subflow->mp_capable) { 413 mpext = mptcp_get_ext(skb); 414 data_len = mpext ? mpext->data_len : 0; 415 416 /* we will check ext_copy.data_len in mptcp_write_options() to 417 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and 418 * TCPOLEN_MPTCP_MPC_ACK 419 */ 420 opts->ext_copy.data_len = data_len; 421 opts->suboptions = OPTION_MPTCP_MPC_ACK; 422 opts->sndr_key = subflow->local_key; 423 opts->rcvr_key = subflow->remote_key; 424 425 /* Section 3.1. 426 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK 427 * packets that start the first subflow of an MPTCP connection, 428 * as well as the first packet that carries data 429 */ 430 if (data_len > 0) 431 *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); 432 else 433 *size = TCPOLEN_MPTCP_MPC_ACK; 434 435 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", 436 subflow, subflow->local_key, subflow->remote_key, 437 data_len); 438 439 return true; 440 } else if (subflow->mp_join) { 441 opts->suboptions = OPTION_MPTCP_MPJ_ACK; 442 memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); 443 *size = TCPOLEN_MPTCP_MPJ_ACK; 444 pr_debug("subflow=%p", subflow); 445 446 schedule_3rdack_retransmission(sk); 447 return true; 448 } 449 return false; 450 } 451 452 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, 453 struct sk_buff *skb, struct mptcp_ext *ext) 454 { 455 u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq); 456 457 if (!ext->use_map || !skb->len) { 458 /* RFC6824 requires a DSS mapping with specific values 459 * if DATA_FIN is set but no data payload is mapped 460 */ 461 ext->data_fin = 1; 462 ext->use_map = 1; 463 ext->dsn64 = 1; 464 /* The write_seq value has already been incremented, so 465 * the actual sequence number for the DATA_FIN is one less. 466 */ 467 ext->data_seq = data_fin_tx_seq - 1; 468 ext->subflow_seq = 0; 469 ext->data_len = 1; 470 } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { 471 /* If there's an existing DSS mapping and it is the 472 * final mapping, DATA_FIN consumes 1 additional byte of 473 * mapping space. 474 */ 475 ext->data_fin = 1; 476 ext->data_len++; 477 } 478 } 479 480 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, 481 unsigned int *size, 482 unsigned int remaining, 483 struct mptcp_out_options *opts) 484 { 485 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 486 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 487 unsigned int dss_size = 0; 488 u64 snd_data_fin_enable; 489 struct mptcp_ext *mpext; 490 unsigned int ack_size; 491 bool ret = false; 492 493 mpext = skb ? mptcp_get_ext(skb) : NULL; 494 snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); 495 496 if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { 497 unsigned int map_size; 498 499 map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; 500 501 remaining -= map_size; 502 dss_size = map_size; 503 if (mpext) 504 opts->ext_copy = *mpext; 505 506 if (skb && snd_data_fin_enable) 507 mptcp_write_data_fin(subflow, skb, &opts->ext_copy); 508 ret = true; 509 } 510 511 /* passive sockets msk will set the 'can_ack' after accept(), even 512 * if the first subflow may have the already the remote key handy 513 */ 514 opts->ext_copy.use_ack = 0; 515 if (!READ_ONCE(msk->can_ack)) { 516 *size = ALIGN(dss_size, 4); 517 return ret; 518 } 519 520 if (subflow->use_64bit_ack) { 521 ack_size = TCPOLEN_MPTCP_DSS_ACK64; 522 opts->ext_copy.data_ack = msk->ack_seq; 523 opts->ext_copy.ack64 = 1; 524 } else { 525 ack_size = TCPOLEN_MPTCP_DSS_ACK32; 526 opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq); 527 opts->ext_copy.ack64 = 0; 528 } 529 opts->ext_copy.use_ack = 1; 530 531 /* Add kind/length/subtype/flag overhead if mapping is not populated */ 532 if (dss_size == 0) 533 ack_size += TCPOLEN_MPTCP_DSS_BASE; 534 535 dss_size += ack_size; 536 537 *size = ALIGN(dss_size, 4); 538 return true; 539 } 540 541 static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, 542 struct in_addr *addr) 543 { 544 u8 hmac[SHA256_DIGEST_SIZE]; 545 u8 msg[7]; 546 547 msg[0] = addr_id; 548 memcpy(&msg[1], &addr->s_addr, 4); 549 msg[5] = 0; 550 msg[6] = 0; 551 552 mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); 553 554 return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); 555 } 556 557 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 558 static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, 559 struct in6_addr *addr) 560 { 561 u8 hmac[SHA256_DIGEST_SIZE]; 562 u8 msg[19]; 563 564 msg[0] = addr_id; 565 memcpy(&msg[1], &addr->s6_addr, 16); 566 msg[17] = 0; 567 msg[18] = 0; 568 569 mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); 570 571 return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); 572 } 573 #endif 574 575 static bool mptcp_established_options_add_addr(struct sock *sk, 576 unsigned int *size, 577 unsigned int remaining, 578 struct mptcp_out_options *opts) 579 { 580 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 581 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 582 struct mptcp_addr_info saddr; 583 bool echo; 584 int len; 585 586 if (!mptcp_pm_should_add_signal(msk) || 587 !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo))) 588 return false; 589 590 len = mptcp_add_addr_len(saddr.family); 591 if (remaining < len) 592 return false; 593 594 *size = len; 595 opts->addr_id = saddr.id; 596 if (saddr.family == AF_INET) { 597 opts->suboptions |= OPTION_MPTCP_ADD_ADDR; 598 opts->addr = saddr.addr; 599 if (!echo) { 600 opts->ahmac = add_addr_generate_hmac(msk->local_key, 601 msk->remote_key, 602 opts->addr_id, 603 &opts->addr); 604 } 605 } 606 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 607 else if (saddr.family == AF_INET6) { 608 opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; 609 opts->addr6 = saddr.addr6; 610 if (!echo) { 611 opts->ahmac = add_addr6_generate_hmac(msk->local_key, 612 msk->remote_key, 613 opts->addr_id, 614 &opts->addr6); 615 } 616 } 617 #endif 618 pr_debug("addr_id=%d, ahmac=%llu, echo=%d", opts->addr_id, opts->ahmac, echo); 619 620 return true; 621 } 622 623 static bool mptcp_established_options_rm_addr(struct sock *sk, 624 unsigned int *size, 625 unsigned int remaining, 626 struct mptcp_out_options *opts) 627 { 628 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 629 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 630 u8 rm_id; 631 632 if (!mptcp_pm_should_rm_signal(msk) || 633 !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id))) 634 return false; 635 636 if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE) 637 return false; 638 639 *size = TCPOLEN_MPTCP_RM_ADDR_BASE; 640 opts->suboptions |= OPTION_MPTCP_RM_ADDR; 641 opts->rm_id = rm_id; 642 643 pr_debug("rm_id=%d", opts->rm_id); 644 645 return true; 646 } 647 648 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, 649 unsigned int *size, unsigned int remaining, 650 struct mptcp_out_options *opts) 651 { 652 unsigned int opt_size = 0; 653 bool ret = false; 654 655 opts->suboptions = 0; 656 657 if (unlikely(mptcp_check_fallback(sk))) 658 return false; 659 660 if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) 661 ret = true; 662 else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, 663 opts)) 664 ret = true; 665 666 /* we reserved enough space for the above options, and exceeding the 667 * TCP option space would be fatal 668 */ 669 if (WARN_ON_ONCE(opt_size > remaining)) 670 return false; 671 672 *size += opt_size; 673 remaining -= opt_size; 674 if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) { 675 *size += opt_size; 676 remaining -= opt_size; 677 ret = true; 678 } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) { 679 *size += opt_size; 680 remaining -= opt_size; 681 ret = true; 682 } 683 684 return ret; 685 } 686 687 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, 688 struct mptcp_out_options *opts) 689 { 690 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 691 692 if (subflow_req->mp_capable) { 693 opts->suboptions = OPTION_MPTCP_MPC_SYNACK; 694 opts->sndr_key = subflow_req->local_key; 695 *size = TCPOLEN_MPTCP_MPC_SYNACK; 696 pr_debug("subflow_req=%p, local_key=%llu", 697 subflow_req, subflow_req->local_key); 698 return true; 699 } else if (subflow_req->mp_join) { 700 opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; 701 opts->backup = subflow_req->backup; 702 opts->join_id = subflow_req->local_id; 703 opts->thmac = subflow_req->thmac; 704 opts->nonce = subflow_req->local_nonce; 705 pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", 706 subflow_req, opts->backup, opts->join_id, 707 opts->thmac, opts->nonce); 708 *size = TCPOLEN_MPTCP_MPJ_SYNACK; 709 return true; 710 } 711 return false; 712 } 713 714 static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, 715 struct mptcp_subflow_context *subflow, 716 struct sk_buff *skb, 717 struct mptcp_options_received *mp_opt) 718 { 719 /* here we can process OoO, in-window pkts, only in-sequence 4th ack 720 * will make the subflow fully established 721 */ 722 if (likely(subflow->fully_established)) { 723 /* on passive sockets, check for 3rd ack retransmission 724 * note that msk is always set by subflow_syn_recv_sock() 725 * for mp_join subflows 726 */ 727 if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && 728 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && 729 subflow->mp_join && mp_opt->mp_join && 730 READ_ONCE(msk->pm.server_side)) 731 tcp_send_ack(sk); 732 goto fully_established; 733 } 734 735 /* we should process OoO packets before the first subflow is fully 736 * established, but not expected for MP_JOIN subflows 737 */ 738 if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) 739 return subflow->mp_capable; 740 741 if (mp_opt->dss && mp_opt->use_ack) { 742 /* subflows are fully established as soon as we get any 743 * additional ack. 744 */ 745 subflow->fully_established = 1; 746 WRITE_ONCE(msk->fully_established, true); 747 goto fully_established; 748 } 749 750 /* If the first established packet does not contain MP_CAPABLE + data 751 * then fallback to TCP 752 */ 753 if (!mp_opt->mp_capable) { 754 subflow->mp_capable = 0; 755 pr_fallback(msk); 756 __mptcp_do_fallback(msk); 757 return false; 758 } 759 760 if (unlikely(!READ_ONCE(msk->pm.server_side))) 761 pr_warn_once("bogus mpc option on established client sk"); 762 mptcp_subflow_fully_established(subflow, mp_opt); 763 764 fully_established: 765 if (likely(subflow->pm_notified)) 766 return true; 767 768 subflow->pm_notified = 1; 769 if (subflow->mp_join) { 770 clear_3rdack_retransmission(sk); 771 mptcp_pm_subflow_established(msk, subflow); 772 } else { 773 mptcp_pm_fully_established(msk); 774 } 775 return true; 776 } 777 778 static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) 779 { 780 u32 old_ack32, cur_ack32; 781 782 if (use_64bit) 783 return cur_ack; 784 785 old_ack32 = (u32)old_ack; 786 cur_ack32 = (u32)cur_ack; 787 cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; 788 if (unlikely(before(cur_ack32, old_ack32))) 789 return cur_ack + (1LL << 32); 790 return cur_ack; 791 } 792 793 static void update_una(struct mptcp_sock *msk, 794 struct mptcp_options_received *mp_opt) 795 { 796 u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); 797 u64 write_seq = READ_ONCE(msk->write_seq); 798 799 /* avoid ack expansion on update conflict, to reduce the risk of 800 * wrongly expanding to a future ack sequence number, which is way 801 * more dangerous than missing an ack 802 */ 803 new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); 804 805 /* ACK for data not even sent yet? Ignore. */ 806 if (after64(new_snd_una, write_seq)) 807 new_snd_una = old_snd_una; 808 809 while (after64(new_snd_una, old_snd_una)) { 810 snd_una = old_snd_una; 811 old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, 812 new_snd_una); 813 if (old_snd_una == snd_una) { 814 mptcp_data_acked((struct sock *)msk); 815 break; 816 } 817 } 818 } 819 820 bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq) 821 { 822 /* Skip if DATA_FIN was already received. 823 * If updating simultaneously with the recvmsg loop, values 824 * should match. If they mismatch, the peer is misbehaving and 825 * we will prefer the most recent information. 826 */ 827 if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first)) 828 return false; 829 830 WRITE_ONCE(msk->rcv_data_fin_seq, data_fin_seq); 831 WRITE_ONCE(msk->rcv_data_fin, 1); 832 833 return true; 834 } 835 836 static bool add_addr_hmac_valid(struct mptcp_sock *msk, 837 struct mptcp_options_received *mp_opt) 838 { 839 u64 hmac = 0; 840 841 if (mp_opt->echo) 842 return true; 843 844 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) 845 hmac = add_addr_generate_hmac(msk->remote_key, 846 msk->local_key, 847 mp_opt->addr_id, &mp_opt->addr); 848 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 849 else 850 hmac = add_addr6_generate_hmac(msk->remote_key, 851 msk->local_key, 852 mp_opt->addr_id, &mp_opt->addr6); 853 #endif 854 855 pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", 856 msk, (unsigned long long)hmac, 857 (unsigned long long)mp_opt->ahmac); 858 859 return hmac == mp_opt->ahmac; 860 } 861 862 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) 863 { 864 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 865 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 866 struct mptcp_options_received mp_opt; 867 struct mptcp_ext *mpext; 868 869 if (__mptcp_check_fallback(msk)) 870 return; 871 872 mptcp_get_options(skb, &mp_opt); 873 if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) 874 return; 875 876 if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { 877 struct mptcp_addr_info addr; 878 879 addr.port = htons(mp_opt.port); 880 addr.id = mp_opt.addr_id; 881 if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) { 882 addr.family = AF_INET; 883 addr.addr = mp_opt.addr; 884 } 885 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 886 else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) { 887 addr.family = AF_INET6; 888 addr.addr6 = mp_opt.addr6; 889 } 890 #endif 891 if (!mp_opt.echo) { 892 mptcp_pm_add_addr_received(msk, &addr); 893 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); 894 } else { 895 mptcp_pm_del_add_timer(msk, &addr); 896 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); 897 } 898 mp_opt.add_addr = 0; 899 } 900 901 if (mp_opt.rm_addr) { 902 mptcp_pm_rm_addr_received(msk, mp_opt.rm_id); 903 mp_opt.rm_addr = 0; 904 } 905 906 if (!mp_opt.dss) 907 return; 908 909 /* we can't wait for recvmsg() to update the ack_seq, otherwise 910 * monodirectional flows will stuck 911 */ 912 if (mp_opt.use_ack) 913 update_una(msk, &mp_opt); 914 915 /* Zero-data-length packets are dropped by the caller and not 916 * propagated to the MPTCP layer, so the skb extension does not 917 * need to be allocated or populated. DATA_FIN information, if 918 * present, needs to be updated here before the skb is freed. 919 */ 920 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { 921 if (mp_opt.data_fin && mp_opt.data_len == 1 && 922 mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) && 923 schedule_work(&msk->work)) 924 sock_hold(subflow->conn); 925 926 return; 927 } 928 929 mpext = skb_ext_add(skb, SKB_EXT_MPTCP); 930 if (!mpext) 931 return; 932 933 memset(mpext, 0, sizeof(*mpext)); 934 935 if (mp_opt.use_map) { 936 if (mp_opt.mpc_map) { 937 /* this is an MP_CAPABLE carrying MPTCP data 938 * we know this map the first chunk of data 939 */ 940 mptcp_crypto_key_sha(subflow->remote_key, NULL, 941 &mpext->data_seq); 942 mpext->data_seq++; 943 mpext->subflow_seq = 1; 944 mpext->dsn64 = 1; 945 mpext->mpc_map = 1; 946 mpext->data_fin = 0; 947 } else { 948 mpext->data_seq = mp_opt.data_seq; 949 mpext->subflow_seq = mp_opt.subflow_seq; 950 mpext->dsn64 = mp_opt.dsn64; 951 mpext->data_fin = mp_opt.data_fin; 952 } 953 mpext->data_len = mp_opt.data_len; 954 mpext->use_map = 1; 955 } 956 } 957 958 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) 959 { 960 if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | 961 OPTION_MPTCP_MPC_ACK) & opts->suboptions) { 962 u8 len; 963 964 if (OPTION_MPTCP_MPC_SYN & opts->suboptions) 965 len = TCPOLEN_MPTCP_MPC_SYN; 966 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) 967 len = TCPOLEN_MPTCP_MPC_SYNACK; 968 else if (opts->ext_copy.data_len) 969 len = TCPOLEN_MPTCP_MPC_ACK_DATA; 970 else 971 len = TCPOLEN_MPTCP_MPC_ACK; 972 973 *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, 974 MPTCP_SUPPORTED_VERSION, 975 MPTCP_CAP_HMAC_SHA256); 976 977 if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & 978 opts->suboptions)) 979 goto mp_capable_done; 980 981 put_unaligned_be64(opts->sndr_key, ptr); 982 ptr += 2; 983 if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) 984 goto mp_capable_done; 985 986 put_unaligned_be64(opts->rcvr_key, ptr); 987 ptr += 2; 988 if (!opts->ext_copy.data_len) 989 goto mp_capable_done; 990 991 put_unaligned_be32(opts->ext_copy.data_len << 16 | 992 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 993 ptr += 1; 994 } 995 996 mp_capable_done: 997 if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { 998 if (opts->ahmac) 999 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 1000 TCPOLEN_MPTCP_ADD_ADDR, 0, 1001 opts->addr_id); 1002 else 1003 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 1004 TCPOLEN_MPTCP_ADD_ADDR_BASE, 1005 MPTCP_ADDR_ECHO, 1006 opts->addr_id); 1007 memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); 1008 ptr += 1; 1009 if (opts->ahmac) { 1010 put_unaligned_be64(opts->ahmac, ptr); 1011 ptr += 2; 1012 } 1013 } 1014 1015 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1016 if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { 1017 if (opts->ahmac) 1018 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 1019 TCPOLEN_MPTCP_ADD_ADDR6, 0, 1020 opts->addr_id); 1021 else 1022 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 1023 TCPOLEN_MPTCP_ADD_ADDR6_BASE, 1024 MPTCP_ADDR_ECHO, 1025 opts->addr_id); 1026 memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); 1027 ptr += 4; 1028 if (opts->ahmac) { 1029 put_unaligned_be64(opts->ahmac, ptr); 1030 ptr += 2; 1031 } 1032 } 1033 #endif 1034 1035 if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { 1036 *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, 1037 TCPOLEN_MPTCP_RM_ADDR_BASE, 1038 0, opts->rm_id); 1039 } 1040 1041 if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { 1042 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 1043 TCPOLEN_MPTCP_MPJ_SYN, 1044 opts->backup, opts->join_id); 1045 put_unaligned_be32(opts->token, ptr); 1046 ptr += 1; 1047 put_unaligned_be32(opts->nonce, ptr); 1048 ptr += 1; 1049 } 1050 1051 if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { 1052 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 1053 TCPOLEN_MPTCP_MPJ_SYNACK, 1054 opts->backup, opts->join_id); 1055 put_unaligned_be64(opts->thmac, ptr); 1056 ptr += 2; 1057 put_unaligned_be32(opts->nonce, ptr); 1058 ptr += 1; 1059 } 1060 1061 if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { 1062 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 1063 TCPOLEN_MPTCP_MPJ_ACK, 0, 0); 1064 memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); 1065 ptr += 5; 1066 } 1067 1068 if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { 1069 struct mptcp_ext *mpext = &opts->ext_copy; 1070 u8 len = TCPOLEN_MPTCP_DSS_BASE; 1071 u8 flags = 0; 1072 1073 if (mpext->use_ack) { 1074 flags = MPTCP_DSS_HAS_ACK; 1075 if (mpext->ack64) { 1076 len += TCPOLEN_MPTCP_DSS_ACK64; 1077 flags |= MPTCP_DSS_ACK64; 1078 } else { 1079 len += TCPOLEN_MPTCP_DSS_ACK32; 1080 } 1081 } 1082 1083 if (mpext->use_map) { 1084 len += TCPOLEN_MPTCP_DSS_MAP64; 1085 1086 /* Use only 64-bit mapping flags for now, add 1087 * support for optional 32-bit mappings later. 1088 */ 1089 flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; 1090 if (mpext->data_fin) 1091 flags |= MPTCP_DSS_DATA_FIN; 1092 } 1093 1094 *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); 1095 1096 if (mpext->use_ack) { 1097 if (mpext->ack64) { 1098 put_unaligned_be64(mpext->data_ack, ptr); 1099 ptr += 2; 1100 } else { 1101 put_unaligned_be32(mpext->data_ack32, ptr); 1102 ptr += 1; 1103 } 1104 } 1105 1106 if (mpext->use_map) { 1107 put_unaligned_be64(mpext->data_seq, ptr); 1108 ptr += 2; 1109 put_unaligned_be32(mpext->subflow_seq, ptr); 1110 ptr += 1; 1111 put_unaligned_be32(mpext->data_len << 16 | 1112 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 1113 } 1114 } 1115 } 1116