// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static int subflow_rebuild_header(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	int local_id, err = 0;

	if (subflow->request_mptcp && !subflow->token) {
		pr_debug("subflow=%p", sk);
		err = mptcp_token_new_connect(sk);
	} else if (subflow->request_join && !subflow->local_nonce) {
		struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;

		pr_debug("subflow=%p", sk);

		do {
			get_random_bytes(&subflow->local_nonce, sizeof(u32));
		} while (!subflow->local_nonce);

		if (subflow->local_id)
			goto out;

		local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
		if (local_id < 0)
			return -EINVAL;

		subflow->local_id = local_id;
	}

out:
	if (err)
		return err;

	return subflow->icsk_af_ops->rebuild_header(sk);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->mp_capable)
		mptcp_token_destroy_request(subflow_req->token);
	tcp_request_sock_ops.destructor(req);
}

static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static bool subflow_token_join_request(struct request_sock *req,
				       const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return false;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return false;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);

	sock_put((struct sock *)msk);
	return true;
}

static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct tcp_options_received rx_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

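	/* parse the MPTCP options carried by the incoming SYN; the flags set
	 * below decide whether this request is handled as plain TCP,
	 * MP_CAPABLE or MP_JOIN
	 */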
	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
	mptcp_get_options(skb, &rx_opt);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (rx_opt.mptcp.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (rx_opt.mptcp.mp_join)
			return;
	} else if (rx_opt.mptcp.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	} else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = rx_opt.mptcp.backup;
		subflow_req->remote_id = rx_opt.mptcp.join_id;
		subflow_req->token = rx_opt.mptcp.token;
		subflow_req->remote_nonce = rx_opt.mptcp.nonce;
		pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
			 subflow_req->remote_nonce);
		if (!subflow_token_join_request(req, skb)) {
			subflow_req->mp_join = 0;
			// @@ need to trigger RST
		}
	}
}

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[MPTCPOPT_HMAC_LEN];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) != TCP_ESTABLISHED) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
		return;

	if (subflow->mp_capable) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);
		subflow->conn_finished = 1;

		if (skb) {
			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
		}
	} else if (subflow->mp_join) {
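		/* active MP_JOIN: the SYN/ACK carries the peer's truncated
		 * HMAC; verify it and compute the full HMAC to be echoed in
		 * the third ACK
		 */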
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
			 subflow, subflow->thmac,
			 subflow->remote_nonce);
		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->mp_join = 0;
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      subflow->hmac);

		if (skb)
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->conn_finished = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else {
do_reset:
		tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct tcp_options_received *rx_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	bool ret;

	subflow_req = mptcp_subflow_rsk(req);
	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	ret = true;
	if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
		ret = false;

	sock_put((struct sock *)msk);
	return ret;
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will not be in
	 * SYN_RECV state and doesn't have SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

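	/* mimic a regular close: leave SYN_RECV and orphan the socket, so
	 * that inet_sock_destruct() does not complain
	 */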
	if (sk->sk_state == TCP_SYN_RECV) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct tcp_options_received opt_rx;
	bool fallback_is_fatal = false;
	struct sock *new_msk = NULL;
	bool fallback = false;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	opt_rx.mptcp.mp_capable = 0;
	if (tcp_rsk(req)->is_mptcp == 0)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	subflow_req = mptcp_subflow_rsk(req);
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &opt_rx);
		if (!opt_rx.mptcp.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &opt_rx, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		fallback_is_fatal = true;
		opt_rx.mptcp.mp_join = 0;
		mptcp_get_options(skb, &opt_rx);
		if (!opt_rx.mptcp.mp_join ||
		    !subflow_hmac_valid(req, &opt_rx)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			return NULL;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto close_child;

			if (ctx) {
				subflow_ulp_fallback(child, ctx);
				kfree_rcu(ctx, rcu);
			}
			goto out;
		}

		if (ctx->mp_capable) {
			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			ctx->remote_key = opt_rx.mptcp.sndr_key;
			ctx->fully_established = opt_rx.mptcp.mp_capable;
			ctx->can_ack = opt_rx.mptcp.mp_capable;
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = mptcp_token_get_sock(ctx->token);
			if (!owner)
				goto close_child;

			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto close_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(*own_req && child && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

close_child:
	tcp_send_active_reset(child, GFP_ATOMIC);
	inet_csk_prepare_forced_close(child);
	tcp_done(child);
	return NULL;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN
};

static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping covers only past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN pkts to the receive
			 * queue, those are the only 0-len pkts ever expected
			 * here, and we can admit no mapping only for 0-len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}

static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmissions; we can hit "future" values on active backup
		 * subflow switch, we rely on retransmissions to get
		 * in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * reordering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	if (!subflow->mp_capable && !subflow->mp_join) {
		subflow->tcp_data_ready(sk);

		parent->sk_data_ready(parent);
		return;
	}

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

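	/* a v4-mapped-in-v6 subflow must use the IPv4 xmit/checksum helpers;
	 * pick the af_ops variant matching the 'mapped' state
	 */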
	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	struct socket *sf;
	u32 remote_token;
	int addrlen;
	int err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	sf->sk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u", msk, remote_token);
	subflow->remote_token = remote_token;
	subflow->local_id = loc->id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							 gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can go unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

void mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}