1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #include <linux/kernel.h> 8 #include <net/tcp.h> 9 #include <net/mptcp.h> 10 #include "protocol.h" 11 12 static bool mptcp_cap_flag_sha256(u8 flags) 13 { 14 return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; 15 } 16 17 void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, 18 int opsize, struct tcp_options_received *opt_rx) 19 { 20 struct mptcp_options_received *mp_opt = &opt_rx->mptcp; 21 u8 subtype = *ptr >> 4; 22 int expected_opsize; 23 u8 version; 24 u8 flags; 25 26 switch (subtype) { 27 case MPTCPOPT_MP_CAPABLE: 28 /* strict size checking */ 29 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 30 if (skb->len > tcp_hdr(skb)->doff << 2) 31 expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; 32 else 33 expected_opsize = TCPOLEN_MPTCP_MPC_ACK; 34 } else { 35 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) 36 expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; 37 else 38 expected_opsize = TCPOLEN_MPTCP_MPC_SYN; 39 } 40 if (opsize != expected_opsize) 41 break; 42 43 /* try to be gentle vs future versions on the initial syn */ 44 version = *ptr++ & MPTCP_VERSION_MASK; 45 if (opsize != TCPOLEN_MPTCP_MPC_SYN) { 46 if (version != MPTCP_SUPPORTED_VERSION) 47 break; 48 } else if (version < MPTCP_SUPPORTED_VERSION) { 49 break; 50 } 51 52 flags = *ptr++; 53 if (!mptcp_cap_flag_sha256(flags) || 54 (flags & MPTCP_CAP_EXTENSIBILITY)) 55 break; 56 57 /* RFC 6824, Section 3.1: 58 * "For the Checksum Required bit (labeled "A"), if either 59 * host requires the use of checksums, checksums MUST be used. 60 * In other words, the only way for checksums not to be used 61 * is if both hosts in their SYNs set A=0." 62 * 63 * Section 3.3.0: 64 * "If a checksum is not present when its use has been 65 * negotiated, the receiver MUST close the subflow with a RST as 66 * it is considered broken." 67 * 68 * We don't implement DSS checksum - fall back to TCP. 69 */ 70 if (flags & MPTCP_CAP_CHECKSUM_REQD) 71 break; 72 73 mp_opt->mp_capable = 1; 74 if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { 75 mp_opt->sndr_key = get_unaligned_be64(ptr); 76 ptr += 8; 77 } 78 if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { 79 mp_opt->rcvr_key = get_unaligned_be64(ptr); 80 ptr += 8; 81 } 82 if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { 83 /* Section 3.1.: 84 * "the data parameters in a MP_CAPABLE are semantically 85 * equivalent to those in a DSS option and can be used 86 * interchangeably." 87 */ 88 mp_opt->dss = 1; 89 mp_opt->use_map = 1; 90 mp_opt->mpc_map = 1; 91 mp_opt->data_len = get_unaligned_be16(ptr); 92 ptr += 2; 93 } 94 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", 95 version, flags, opsize, mp_opt->sndr_key, 96 mp_opt->rcvr_key, mp_opt->data_len); 97 break; 98 99 case MPTCPOPT_DSS: 100 pr_debug("DSS"); 101 ptr++; 102 103 /* we must clear 'mpc_map' be able to detect MP_CAPABLE 104 * map vs DSS map in mptcp_incoming_options(), and reconstruct 105 * map info accordingly 106 */ 107 mp_opt->mpc_map = 0; 108 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; 109 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; 110 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; 111 mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; 112 mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; 113 mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); 114 115 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", 116 mp_opt->data_fin, mp_opt->dsn64, 117 mp_opt->use_map, mp_opt->ack64, 118 mp_opt->use_ack); 119 120 expected_opsize = TCPOLEN_MPTCP_DSS_BASE; 121 122 if (mp_opt->use_ack) { 123 if (mp_opt->ack64) 124 expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; 125 else 126 expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; 127 } 128 129 if (mp_opt->use_map) { 130 if (mp_opt->dsn64) 131 expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; 132 else 133 expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; 134 } 135 136 /* RFC 6824, Section 3.3: 137 * If a checksum is present, but its use had 138 * not been negotiated in the MP_CAPABLE handshake, 139 * the checksum field MUST be ignored. 140 */ 141 if (opsize != expected_opsize && 142 opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) 143 break; 144 145 mp_opt->dss = 1; 146 147 if (mp_opt->use_ack) { 148 if (mp_opt->ack64) { 149 mp_opt->data_ack = get_unaligned_be64(ptr); 150 ptr += 8; 151 } else { 152 mp_opt->data_ack = get_unaligned_be32(ptr); 153 ptr += 4; 154 } 155 156 pr_debug("data_ack=%llu", mp_opt->data_ack); 157 } 158 159 if (mp_opt->use_map) { 160 if (mp_opt->dsn64) { 161 mp_opt->data_seq = get_unaligned_be64(ptr); 162 ptr += 8; 163 } else { 164 mp_opt->data_seq = get_unaligned_be32(ptr); 165 ptr += 4; 166 } 167 168 mp_opt->subflow_seq = get_unaligned_be32(ptr); 169 ptr += 4; 170 171 mp_opt->data_len = get_unaligned_be16(ptr); 172 ptr += 2; 173 174 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", 175 mp_opt->data_seq, mp_opt->subflow_seq, 176 mp_opt->data_len); 177 } 178 179 break; 180 181 default: 182 break; 183 } 184 } 185 186 void mptcp_get_options(const struct sk_buff *skb, 187 struct tcp_options_received *opt_rx) 188 { 189 const unsigned char *ptr; 190 const struct tcphdr *th = tcp_hdr(skb); 191 int length = (th->doff * 4) - sizeof(struct tcphdr); 192 193 ptr = (const unsigned char *)(th + 1); 194 195 while (length > 0) { 196 int opcode = *ptr++; 197 int opsize; 198 199 switch (opcode) { 200 case TCPOPT_EOL: 201 return; 202 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 203 length--; 204 continue; 205 default: 206 opsize = *ptr++; 207 if (opsize < 2) /* "silly options" */ 208 return; 209 if (opsize > length) 210 return; /* don't parse partial options */ 211 if (opcode == TCPOPT_MPTCP) 212 mptcp_parse_option(skb, ptr, opsize, opt_rx); 213 ptr += opsize - 2; 214 length -= opsize; 215 } 216 } 217 } 218 219 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, 220 unsigned int *size, struct mptcp_out_options *opts) 221 { 222 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 223 224 /* we will use snd_isn to detect first pkt [re]transmission 225 * in mptcp_established_options_mp() 226 */ 227 subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; 228 if (subflow->request_mptcp) { 229 pr_debug("local_key=%llu", subflow->local_key); 230 opts->suboptions = OPTION_MPTCP_MPC_SYN; 231 opts->sndr_key = subflow->local_key; 232 *size = TCPOLEN_MPTCP_MPC_SYN; 233 return true; 234 } 235 return false; 236 } 237 238 void mptcp_rcv_synsent(struct sock *sk) 239 { 240 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 241 struct tcp_sock *tp = tcp_sk(sk); 242 243 pr_debug("subflow=%p", subflow); 244 if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { 245 subflow->mp_capable = 1; 246 subflow->can_ack = 1; 247 subflow->remote_key = tp->rx_opt.mptcp.sndr_key; 248 } else { 249 tcp_sk(sk)->is_mptcp = 0; 250 } 251 } 252 253 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, 254 unsigned int *size, 255 unsigned int remaining, 256 struct mptcp_out_options *opts) 257 { 258 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 259 struct mptcp_ext *mpext; 260 unsigned int data_len; 261 262 pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow, 263 subflow->fourth_ack, subflow->snd_isn, 264 skb ? TCP_SKB_CB(skb)->seq : 0, remaining); 265 266 if (subflow->mp_capable && !subflow->fourth_ack && skb && 267 subflow->snd_isn == TCP_SKB_CB(skb)->seq) { 268 /* When skb is not available, we better over-estimate the 269 * emitted options len. A full DSS option is longer than 270 * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit 271 * that. 272 */ 273 mpext = mptcp_get_ext(skb); 274 data_len = mpext ? mpext->data_len : 0; 275 276 /* we will check ext_copy.data_len in mptcp_write_options() to 277 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and 278 * TCPOLEN_MPTCP_MPC_ACK 279 */ 280 opts->ext_copy.data_len = data_len; 281 opts->suboptions = OPTION_MPTCP_MPC_ACK; 282 opts->sndr_key = subflow->local_key; 283 opts->rcvr_key = subflow->remote_key; 284 285 /* Section 3.1. 286 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK 287 * packets that start the first subflow of an MPTCP connection, 288 * as well as the first packet that carries data 289 */ 290 if (data_len > 0) 291 *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); 292 else 293 *size = TCPOLEN_MPTCP_MPC_ACK; 294 295 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", 296 subflow, subflow->local_key, subflow->remote_key, 297 data_len); 298 299 return true; 300 } 301 return false; 302 } 303 304 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, 305 struct mptcp_ext *ext) 306 { 307 ext->data_fin = 1; 308 309 if (!ext->use_map) { 310 /* RFC6824 requires a DSS mapping with specific values 311 * if DATA_FIN is set but no data payload is mapped 312 */ 313 ext->use_map = 1; 314 ext->dsn64 = 1; 315 ext->data_seq = mptcp_sk(subflow->conn)->write_seq; 316 ext->subflow_seq = 0; 317 ext->data_len = 1; 318 } else { 319 /* If there's an existing DSS mapping, DATA_FIN consumes 320 * 1 additional byte of mapping space. 321 */ 322 ext->data_len++; 323 } 324 } 325 326 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, 327 unsigned int *size, 328 unsigned int remaining, 329 struct mptcp_out_options *opts) 330 { 331 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 332 unsigned int dss_size = 0; 333 struct mptcp_ext *mpext; 334 struct mptcp_sock *msk; 335 unsigned int ack_size; 336 bool ret = false; 337 bool can_ack; 338 u64 ack_seq; 339 u8 tcp_fin; 340 341 if (skb) { 342 mpext = mptcp_get_ext(skb); 343 tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 344 } else { 345 mpext = NULL; 346 tcp_fin = 0; 347 } 348 349 if (!skb || (mpext && mpext->use_map) || tcp_fin) { 350 unsigned int map_size; 351 352 map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; 353 354 remaining -= map_size; 355 dss_size = map_size; 356 if (mpext) 357 opts->ext_copy = *mpext; 358 359 if (skb && tcp_fin && 360 subflow->conn->sk_state != TCP_ESTABLISHED) 361 mptcp_write_data_fin(subflow, &opts->ext_copy); 362 ret = true; 363 } 364 365 /* passive sockets msk will set the 'can_ack' after accept(), even 366 * if the first subflow may have the already the remote key handy 367 */ 368 can_ack = true; 369 opts->ext_copy.use_ack = 0; 370 msk = mptcp_sk(subflow->conn); 371 if (likely(msk && READ_ONCE(msk->can_ack))) { 372 ack_seq = msk->ack_seq; 373 } else if (subflow->can_ack) { 374 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); 375 ack_seq++; 376 } else { 377 can_ack = false; 378 } 379 380 if (unlikely(!can_ack)) { 381 *size = ALIGN(dss_size, 4); 382 return ret; 383 } 384 385 ack_size = TCPOLEN_MPTCP_DSS_ACK64; 386 387 /* Add kind/length/subtype/flag overhead if mapping is not populated */ 388 if (dss_size == 0) 389 ack_size += TCPOLEN_MPTCP_DSS_BASE; 390 391 dss_size += ack_size; 392 393 opts->ext_copy.data_ack = ack_seq; 394 opts->ext_copy.ack64 = 1; 395 opts->ext_copy.use_ack = 1; 396 397 *size = ALIGN(dss_size, 4); 398 return true; 399 } 400 401 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, 402 unsigned int *size, unsigned int remaining, 403 struct mptcp_out_options *opts) 404 { 405 unsigned int opt_size = 0; 406 bool ret = false; 407 408 if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) 409 ret = true; 410 else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, 411 opts)) 412 ret = true; 413 414 /* we reserved enough space for the above options, and exceeding the 415 * TCP option space would be fatal 416 */ 417 if (WARN_ON_ONCE(opt_size > remaining)) 418 return false; 419 420 *size += opt_size; 421 remaining -= opt_size; 422 423 return ret; 424 } 425 426 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, 427 struct mptcp_out_options *opts) 428 { 429 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 430 431 if (subflow_req->mp_capable) { 432 opts->suboptions = OPTION_MPTCP_MPC_SYNACK; 433 opts->sndr_key = subflow_req->local_key; 434 *size = TCPOLEN_MPTCP_MPC_SYNACK; 435 pr_debug("subflow_req=%p, local_key=%llu", 436 subflow_req, subflow_req->local_key); 437 return true; 438 } 439 return false; 440 } 441 442 static bool check_fourth_ack(struct mptcp_subflow_context *subflow, 443 struct sk_buff *skb, 444 struct mptcp_options_received *mp_opt) 445 { 446 /* here we can process OoO, in-window pkts, only in-sequence 4th ack 447 * are relevant 448 */ 449 if (likely(subflow->fourth_ack || 450 TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) 451 return true; 452 453 if (mp_opt->use_ack) 454 subflow->fourth_ack = 1; 455 456 if (subflow->can_ack) 457 return true; 458 459 /* If the first established packet does not contain MP_CAPABLE + data 460 * then fallback to TCP 461 */ 462 if (!mp_opt->mp_capable) { 463 subflow->mp_capable = 0; 464 tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; 465 return false; 466 } 467 subflow->remote_key = mp_opt->sndr_key; 468 subflow->can_ack = 1; 469 return true; 470 } 471 472 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, 473 struct tcp_options_received *opt_rx) 474 { 475 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 476 struct mptcp_options_received *mp_opt; 477 struct mptcp_ext *mpext; 478 479 mp_opt = &opt_rx->mptcp; 480 if (!check_fourth_ack(subflow, skb, mp_opt)) 481 return; 482 483 if (!mp_opt->dss) 484 return; 485 486 mpext = skb_ext_add(skb, SKB_EXT_MPTCP); 487 if (!mpext) 488 return; 489 490 memset(mpext, 0, sizeof(*mpext)); 491 492 if (mp_opt->use_map) { 493 if (mp_opt->mpc_map) { 494 /* this is an MP_CAPABLE carrying MPTCP data 495 * we know this map the first chunk of data 496 */ 497 mptcp_crypto_key_sha(subflow->remote_key, NULL, 498 &mpext->data_seq); 499 mpext->data_seq++; 500 mpext->subflow_seq = 1; 501 mpext->dsn64 = 1; 502 mpext->mpc_map = 1; 503 } else { 504 mpext->data_seq = mp_opt->data_seq; 505 mpext->subflow_seq = mp_opt->subflow_seq; 506 mpext->dsn64 = mp_opt->dsn64; 507 } 508 mpext->data_len = mp_opt->data_len; 509 mpext->use_map = 1; 510 } 511 512 if (mp_opt->use_ack) { 513 mpext->data_ack = mp_opt->data_ack; 514 mpext->use_ack = 1; 515 mpext->ack64 = mp_opt->ack64; 516 } 517 518 mpext->data_fin = mp_opt->data_fin; 519 } 520 521 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) 522 { 523 if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | 524 OPTION_MPTCP_MPC_ACK) & opts->suboptions) { 525 u8 len; 526 527 if (OPTION_MPTCP_MPC_SYN & opts->suboptions) 528 len = TCPOLEN_MPTCP_MPC_SYN; 529 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) 530 len = TCPOLEN_MPTCP_MPC_SYNACK; 531 else if (opts->ext_copy.data_len) 532 len = TCPOLEN_MPTCP_MPC_ACK_DATA; 533 else 534 len = TCPOLEN_MPTCP_MPC_ACK; 535 536 *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | 537 (MPTCPOPT_MP_CAPABLE << 12) | 538 (MPTCP_SUPPORTED_VERSION << 8) | 539 MPTCP_CAP_HMAC_SHA256); 540 541 if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & 542 opts->suboptions)) 543 goto mp_capable_done; 544 545 put_unaligned_be64(opts->sndr_key, ptr); 546 ptr += 2; 547 if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) 548 goto mp_capable_done; 549 550 put_unaligned_be64(opts->rcvr_key, ptr); 551 ptr += 2; 552 if (!opts->ext_copy.data_len) 553 goto mp_capable_done; 554 555 put_unaligned_be32(opts->ext_copy.data_len << 16 | 556 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 557 ptr += 1; 558 } 559 560 mp_capable_done: 561 if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { 562 struct mptcp_ext *mpext = &opts->ext_copy; 563 u8 len = TCPOLEN_MPTCP_DSS_BASE; 564 u8 flags = 0; 565 566 if (mpext->use_ack) { 567 len += TCPOLEN_MPTCP_DSS_ACK64; 568 flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; 569 } 570 571 if (mpext->use_map) { 572 len += TCPOLEN_MPTCP_DSS_MAP64; 573 574 /* Use only 64-bit mapping flags for now, add 575 * support for optional 32-bit mappings later. 576 */ 577 flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; 578 if (mpext->data_fin) 579 flags |= MPTCP_DSS_DATA_FIN; 580 } 581 582 *ptr++ = htonl((TCPOPT_MPTCP << 24) | 583 (len << 16) | 584 (MPTCPOPT_DSS << 12) | 585 (flags)); 586 587 if (mpext->use_ack) { 588 put_unaligned_be64(mpext->data_ack, ptr); 589 ptr += 2; 590 } 591 592 if (mpext->use_map) { 593 put_unaligned_be64(mpext->data_seq, ptr); 594 ptr += 2; 595 put_unaligned_be32(mpext->subflow_seq, ptr); 596 ptr += 1; 597 put_unaligned_be32(mpext->data_len << 16 | 598 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 599 } 600 } 601 } 602