// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Support for INET connection oriented protocols.
 *
 * Authors:	See the TCP sources
 */

#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>

#if IS_ENABLED(CONFIG_IPV6)
/* Compare two receive addresses where the first one is an IPv6 address.
 *
 * @sk2_rcv_saddr6 may be NULL; the second address is then classified as
 * IPV6_ADDR_MAPPED and only the IPv4 values (@sk1_rcv_saddr/@sk2_rcv_saddr)
 * can produce a match.
 *
 * match_sk*_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses
 *				if IPv6 only, and any IPv4 addresses
 *				if not IPv6 only
 * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
 *				IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
 *				and 0.0.0.0 equals to 0.0.0.0 only
 *
 * Returns true when the two addresses are considered equal.
 */
static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
				 const struct in6_addr *sk2_rcv_saddr6,
				 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
				 bool sk1_ipv6only, bool sk2_ipv6only,
				 bool match_sk1_wildcard,
				 bool match_sk2_wildcard)
{
	int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;

	/* if both are mapped, treat as IPv4 */
	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
		if (!sk2_ipv6only) {
			if (sk1_rcv_saddr == sk2_rcv_saddr)
				return true;
			/* a zero IPv4 address only matches when its side
			 * allows wildcard matching
			 */
			return (match_sk1_wildcard && !sk1_rcv_saddr) ||
				(match_sk2_wildcard && !sk2_rcv_saddr);
		}
		return false;
	}

	/* two unspecified addresses are equal regardless of the flags */
	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
		return true;

	/* second address is the wildcard: matches unless it is v6-only and
	 * the first address is IPv4-mapped
	 */
	if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard &&
	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
		return true;

	/* symmetric case: first address is the wildcard */
	if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard &&
	    !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
		return true;

	/* exact IPv6 comparison, only possible when sk2 has an IPv6 address */
	if (sk2_rcv_saddr6 &&
	    ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
		return true;

	return false;
}
#endif

/* Compare two IPv4 receive addresses.
 *
 * Always returns false when @sk2_ipv6only is set.
 *
 * match_sk*_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
 * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
 *				0.0.0.0 only equals to 0.0.0.0
 */
static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
				 bool sk2_ipv6only, bool match_sk1_wildcard,
				 bool match_sk2_wildcard)
{
	if (!sk2_ipv6only) {
		if (sk1_rcv_saddr == sk2_rcv_saddr)
			return true;
		return (match_sk1_wildcard && !sk1_rcv_saddr) ||
			(match_sk2_wildcard && !sk2_rcv_saddr);
	}
	return false;
}

/* Compare the receive addresses of @sk and @sk2.
 *
 * Dispatches on sk->sk_family: AF_INET6 sockets go through
 * ipv6_rcv_saddr_equal() (when IPv6 is enabled), everything else through
 * ipv4_rcv_saddr_equal().  The same @match_wildcard value is applied to
 * both sockets.
 */
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
			  bool match_wildcard)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
					    inet6_rcv_saddr(sk2),
					    sk->sk_rcv_saddr,
					    sk2->sk_rcv_saddr,
					    ipv6_only_sock(sk),
					    ipv6_only_sock(sk2),
					    match_wildcard,
					    match_wildcard);
#endif
	return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
				    ipv6_only_sock(sk2), match_wildcard,
				    match_wildcard);
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);

/* Return true when the receive address of @sk is the unspecified address:
 * ipv6_addr_any() for AF_INET6 sockets (when IPv6 is enabled), a zero
 * sk_rcv_saddr otherwise.
 */
bool inet_rcv_saddr_any(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
#endif
	return !sk->sk_rcv_saddr;
}

/* Copy the local port range of @net into *@low and *@high.
 *
 * Both values are read inside a seqlock read section on
 * ip_local_ports.lock and the read is repeated until read_seqretry()
 * reports no retry is needed, so the pair is read consistently.
 */
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);

		*low = net->ipv4.ip_local_ports.range[0];
		*high = net->ipv4.ip_local_ports.range[1];
	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);

/* Check whether binding @sk into bucket @tb conflicts with a current owner.
 *
 * @relax:        when false, two SO_REUSEADDR non-listening sockets with
 *                equal addresses always conflict; when true they only
 *                conflict in the SO_REUSEPORT sub-case tested below.
 * @reuseport_ok: when false, SO_REUSEPORT never excuses an address match
 *                in the non-reuse branch.
 *
 * Returns non-zero if the walk over tb->owners stopped on a conflicting
 * socket, zero if it ran to completion.
 */
static int inet_csk_bind_conflict(const struct sock *sk,
				  const struct inet_bind_bucket *tb,
				  bool relax, bool reuseport_ok)
{
	struct sock *sk2;
	bool reuseport_cb_ok;
	bool reuse = sk->sk_reuse;
	bool reuseport = !!sk->sk_reuseport;
	struct sock_reuseport *reuseport_cb;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	rcu_read_lock();
	reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
	/* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
	reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
	rcu_read_unlock();

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, &tb->owners) {
		int bound_dev_if2;

		if (sk == sk2)
			continue;
		bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
		/* only sockets that can share a device are candidates:
		 * either side unbound, or both bound to the same ifindex
		 */
		if ((!sk->sk_bound_dev_if ||
		     !bound_dev_if2 ||
		     sk->sk_bound_dev_if == bound_dev_if2)) {
			if (reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {
				if ((!relax ||
				     (!reuseport_ok &&
				      reuseport && sk2->sk_reuseport &&
				      reuseport_cb_ok &&
				      (sk2->sk_state == TCP_TIME_WAIT ||
				       uid_eq(uid, sock_i_uid(sk2))))) &&
				    inet_rcv_saddr_equal(sk, sk2, true))
					break;
			} else if (!reuseport_ok ||
				   !reuseport || !sk2->sk_reuseport ||
				   !reuseport_cb_ok ||
				   (sk2->sk_state != TCP_TIME_WAIT &&
				    !uid_eq(uid, sock_i_uid(sk2)))) {
				if (inet_rcv_saddr_equal(sk, sk2, true))
					break;
			}
		}
	}
	/* the iterator leaves sk2 non-NULL only when we broke out early */
	return sk2 != NULL;
}

/*
 * Find an open port number for the socket.  Returns with the
 * inet_bind_hashbucket lock held.
 *
 * On success *@port_ret holds the chosen port and *@tb_ret the matching
 * bind bucket, or NULL if no bucket exists yet for that port.
 * Returns NULL (with no bucket lock held) when every candidate port was
 * rejected.
 */
static struct inet_bind_hashbucket *
inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
{
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int port = 0;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	bool relax = false;
	int i, low, high, attempt_half;
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int l3mdev;

	l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
	/* SK_CAN_REUSE sockets first scan the lower half of the range only */
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	if (high - low < 4)
		attempt_half = 0;
	if (attempt_half) {
		/* split point: low plus an even number of ports */
		int half = low + (((high - low) >> 2) << 1);

		if (attempt_half == 1)
			high = half;
		else
			low = half;
	}
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = prandom_u32() % remaining;
	/* __inet_hash_connect() favors ports having @low parity
	 * We do the opposite to not pollute connect() users.
	 */
	offset |= 1U;

other_parity_scan:
	port = low + offset;
	/* step by 2 so that one pass only visits ports of a single parity */
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
			    tb->port == port) {
				if (!inet_csk_bind_conflict(sk, tb, relax, false))
					goto success;
				goto next_port;
			}
		/* no bucket for this port: report success with a NULL bucket */
		tb = NULL;
		goto success;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	/* offset was odd; after the decrement it is even, so rescan with
	 * the other parity exactly once
	 */
	offset--;
	if (!(offset & 1))
		goto other_parity_scan;

	if (attempt_half == 1) {
		/* OK we now try the upper half of the range */
		attempt_half = 2;
		goto other_half_scan;
	}

	if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) {
		/* We still have a chance to connect to different destinations */
		relax = true;
		goto ports_exhausted;
	}
	return NULL;
success:
	*port_ret = port;
	*tb_ret = tb;
	return head;
}

/* Decide whether @sk can take the SO_REUSEPORT fast path on bucket @tb.
 *
 * Returns 0 when the bucket has no fastreuseport state, @sk does not have
 * sk_reuseport set, @sk already has a reuseport control block, or the
 * bucket's cached uid differs from the socket's.  Otherwise returns 1 for
 * FASTREUSEPORT_ANY, or the result of comparing the bucket's cached
 * address against the socket's address.
 */
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
				     struct sock *sk)
{
	kuid_t uid = sock_i_uid(sk);

	if (tb->fastreuseport <= 0)
		return 0;
	if (!sk->sk_reuseport)
		return 0;
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		return 0;
	if (!uid_eq(tb->fastuid, uid))
		return 0;
	/* We only need to check the rcv_saddr if this tb was once marked
	 * without fastreuseport and then was reset, as we can only know that
	 * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
	 * owners list.
	 */
	if (tb->fastreuseport == FASTREUSEPORT_ANY)
		return 1;
#if IS_ENABLED(CONFIG_IPV6)
	if (tb->fast_sk_family == AF_INET6)
		return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
					    inet6_rcv_saddr(sk),
					    tb->fast_rcv_saddr,
					    sk->sk_rcv_saddr,
					    tb->fast_ipv6_only,
					    ipv6_only_sock(sk), true, false);
#endif
	return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
				    ipv6_only_sock(sk), true, false);
}

/* Update the fastreuse/fastreuseport state cached in @tb for socket @sk.
 *
 * For a bucket without owners the state is initialised from @sk.  For a
 * bucket that already has owners the state is only ever narrowed:
 * fastreuse is cleared if @sk is not a reusable non-listening socket, and
 * fastreuseport is either cleared or re-seeded from @sk in STRICT mode.
 */
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
			       struct sock *sk)
{
	kuid_t uid = sock_i_uid(sk);
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;

	if (hlist_empty(&tb->owners)) {
		tb->fastreuse = reuse;
		if (sk->sk_reuseport) {
			tb->fastreuseport = FASTREUSEPORT_ANY;
			tb->fastuid = uid;
			tb->fast_rcv_saddr = sk->sk_rcv_saddr;
			tb->fast_ipv6_only = ipv6_only_sock(sk);
			tb->fast_sk_family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
			tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
		} else {
			tb->fastreuseport = 0;
		}
	} else {
		if (!reuse)
			tb->fastreuse = 0;
		if (sk->sk_reuseport) {
			/* We didn't match or we don't have fastreuseport set on
			 * the tb, but we have sk_reuseport set on this socket
			 * and we know that there are no bind conflicts with
			 * this socket in this tb, so reset our tb's reuseport
			 * settings so that any subsequent sockets that match
			 * our current socket will be put on the fast path.
			 *
			 * If we reset we need to set FASTREUSEPORT_STRICT so we
			 * do extra checking for all subsequent sk_reuseport
			 * socks.
			 */
			if (!sk_reuseport_match(tb, sk)) {
				tb->fastreuseport = FASTREUSEPORT_STRICT;
				tb->fastuid = uid;
				tb->fast_rcv_saddr = sk->sk_rcv_saddr;
				tb->fast_ipv6_only = ipv6_only_sock(sk);
				tb->fast_sk_family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
				tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
			}
		} else {
			tb->fastreuseport = 0;
		}
	}
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * We try to allocate an odd port (and leave even ports for connect())
 *
 * Returns 0 on success and 1 on failure (no free port, bucket allocation
 * failure, or a bind conflict).
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int ret = 1, port = snum;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb = NULL;
	int l3mdev;

	l3mdev = inet_sk_bound_l3mdev(sk);

	if (!port) {
		/* on success this returns with head->lock already held */
		head = inet_csk_find_open_port(sk, &tb, &port);
		if (!head)
			return ret;
		if (!tb)
			goto tb_not_found;
		goto success;
	}
	head = &hinfo->bhash[inet_bhashfn(net, port,
					  hinfo->bhash_size)];
	spin_lock_bh(&head->lock);
	inet_bind_bucket_for_each(tb, &head->chain)
		if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
		    tb->port == port)
			goto tb_found;
tb_not_found:
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     net, head, port, l3mdev);
	if (!tb)
		goto fail_unlock;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		/* fast paths: cached bucket state already proves there is
		 * no conflict, so the owners walk can be skipped
		 */
		if ((tb->fastreuse > 0 && reuse) ||
		    sk_reuseport_match(tb, sk))
			goto success;
		if (inet_csk_bind_conflict(sk, tb, true, true))
			goto fail_unlock;
	}
success:
	inet_csk_update_fastreuse(tb, sk);

	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock_bh(&head->lock);
	return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 *
 * The socket lock is released while sleeping and re-taken before the
 * conditions are re-checked.  Returns 0 once the accept queue is
 * non-empty, -EINVAL if the socket left TCP_LISTEN, the value of
 * sock_intr_errno(timeo) if a signal is pending, or -EAGAIN when the
 * timeout has run out.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		sched_annotate_sleep();
		lock_sock(sk);
		/* each check below pre-loads the error it would return */
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}

/*
 * This will accept the next outstanding connection.
*/
/* @sk:    listening socket; it is locked for the duration of the dequeue.
 * @flags: only O_NONBLOCK is examined here (to pick the receive timeout).
 * @err:   written only on the failure paths.
 * @kern:  not referenced in this function.
 *
 * Returns the child socket taken from the accept queue, or NULL with
 * *@err set to -EINVAL (not listening), -EAGAIN (non-blocking and queue
 * empty) or the error from inet_csk_wait_for_connect().
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *req;
	struct sock *newsk;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	req = reqsk_queue_remove(queue, sk);
	newsk = req->sk;

	/* tfo_listener is tested once without the lock and again under
	 * fastopenq.lock before the request is detached
	 */
	if (sk->sk_protocol == IPPROTO_TCP &&
	    tcp_rsk(req)->tfo_listener) {
		spin_lock_bh(&queue->fastopenq.lock);
		if (tcp_rsk(req)->tfo_listener) {
			/* We are still waiting for the final ACK from 3WHS
			 * so can't free req now. Instead, we set req->sk to
			 * NULL to signify that the child socket is taken
			 * so reqsk_fastopen_remove() will free the req
			 * when 3WHS finishes (or is aborted).
			 */
			req->sk = NULL;
			req = NULL;
		}
		spin_unlock_bh(&queue->fastopenq.lock);
	}

out:
	release_sock(sk);
	if (newsk && mem_cgroup_sockets_enabled) {
		int amt;

		/* atomically get the memory usage, set and charge the
		 * newsk->sk_memcg.
		 */
		lock_sock(newsk);

		/* The socket has not been accepted yet, no need to look at
		 * newsk->sk_wmem_queued.
		 */
		amt = sk_mem_pages(newsk->sk_forward_alloc +
				   atomic_read(&newsk->sk_rmem_alloc));
		mem_cgroup_sk_alloc(newsk);
		if (newsk->sk_memcg && amt)
			mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
						GFP_KERNEL | __GFP_NOFAIL);

		release_sock(newsk);
	}
	/* req is NULL here on error and when the TFO path kept the request */
	if (req)
		reqsk_put(req);
	return newsk;
out_err:
	newsk = NULL;
	req = NULL;
	*err = error;
	goto out;
}
EXPORT_SYMBOL(inet_csk_accept);

/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
/* Set up the retransmit, delayed-ack and keepalive (sk->sk_timer) timers
 * of @sk with the given handlers and clear both pending masks.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(struct timer_list *t),
			       void (*delack_handler)(struct timer_list *t),
			       void (*keepalive_handler)(struct timer_list *t))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
	timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
	timer_setup(&sk->sk_timer, keepalive_handler, 0);
	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);

/* Clear both pending masks, then stop the three timers of @sk. */
void inet_csk_clear_xmit_timers(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_pending = icsk->icsk_ack.pending = 0;

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);

/* Stop sk->sk_timer, the timer given the keepalive handler above. */
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);

/* Re-arm sk->sk_timer to fire @len jiffies from now. */
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

/* Look up the output route for request @req of listener @sk.
 *
 * @fl4 is initialised from the request; when the request carries IP
 * options with source routing (opt.srr), opt.faddr is used as the
 * destination instead of ir_rmt_addr.  ireq_opt is dereferenced inside an
 * RCU read-side section taken here.
 *
 * Returns the route's dst entry, or NULL (after bumping
 * IPSTATS_MIB_OUTNOROUTES) if the lookup fails or a strict-route option
 * would need a gateway.
 */
struct dst_entry *inet_csk_route_req(const struct sock *sk,
				     struct flowi4 *fl4,
				     const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	rcu_read_lock();
	opt = rcu_dereference(ireq->ireq_opt);

	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num), sk->sk_uid);
	security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	rcu_read_unlock();
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	rcu_read_unlock();
	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);

/* Same lookup as inet_csk_route_req(), but the flow is built in the child
 * socket's cork area (newinet->cork.fl.u.ip4).
 *
 * NOTE(review): unlike inet_csk_route_req() this function does not take
 * rcu_read_lock() around rcu_dereference(ireq->ireq_opt); presumably the
 * callers already provide the needed protection - confirm against callers.
 */
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
					    struct sock *newsk,
					    const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct inet_sock *newinet = inet_sk(newsk);
	struct ip_options_rcu *opt;
	struct flowi4 *fl4;
	struct rtable *rt;

	opt = rcu_dereference(ireq->ireq_opt);
	fl4 = &newinet->cork.fl.u.ip4;

	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num), sk->sk_uid);
	security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);

/* Decide when to expire the request and when to resend SYN-ACK */
/* Outputs: *@expire and *@resend are set to 0 or 1.
 * Without defer-accept the request expires once num_timeout reaches
 * @max_syn_ack_retries and the SYN-ACK is always resent.
 */
static void syn_ack_recalc(struct request_sock *req,
			   const int max_syn_ack_retries,
			   const u8 rskq_defer_accept,
			   int *expire, int *resend)
{
	if (!rskq_defer_accept) {
		*expire = req->num_timeout >= max_syn_ack_retries;
		*resend = 1;
		return;
	}
	/* with defer-accept, an already ACKed request is kept until
	 * num_timeout also reaches rskq_defer_accept
	 */
	*expire = req->num_timeout >= max_syn_ack_retries &&
		  (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept);
	/* Do not resend while waiting for data after ACK,
	 * start to resend on end of deferring period to give
	 * last chance for data or ACK to create established socket.
	 */
	*resend = !inet_rsk(req)->acked ||
		  req->num_timeout >= rskq_defer_accept - 1;
}

/* Retransmit the SYN-ACK for @req through its rsk_ops->rtx_syn_ack hook.
 * num_retrans is incremented only when the hook returns 0.
 * Returns the hook's return value.
 */
int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
	int err = req->rsk_ops->rtx_syn_ack(parent, req);

	if (!err)
		req->num_retrans++;
	return err;
}
EXPORT_SYMBOL(inet_rtx_syn_ack);

/* Allocate a copy of @req that belongs to listener @sk.
 *
 * The copy skips the region between sk_dontcopy_begin and
 * sk_dontcopy_end of the embedded struct sock; a few fields are then
 * re-initialised or copied individually.  The caller is responsible for
 * setting the new request's refcount.
 *
 * On allocation failure the reference on @sk is dropped with sock_put(),
 * LINUX_MIB_TCPMIGRATEREQFAILURE is incremented and NULL is returned.
 */
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
					     struct sock *sk)
{
	struct sock *req_sk, *nreq_sk;
	struct request_sock *nreq;

	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
	if (!nreq) {
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
		sock_put(sk);
		return NULL;
	}

	req_sk = req_to_sk(req);
	nreq_sk = req_to_sk(nreq);

	/* copy everything before and after the "dontcopy" window */
	memcpy(nreq_sk, req_sk,
	       offsetof(struct sock, sk_dontcopy_begin));
	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));

	sk_node_init(&nreq_sk->sk_node);
	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
#endif
	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;

	nreq->rsk_listener = sk;

	/* We need not acquire fastopenq->lock
	 * because the child socket is locked in inet_csk_listen_stop().
	 */
	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);

	return nreq;
}

/* Account a migrated request in @queue: bump qlen, and also young when
 * the request has not timed out yet.
 */
static void reqsk_queue_migrated(struct request_sock_queue *queue,
				 const struct request_sock *req)
{
	if (req->num_timeout == 0)
		atomic_inc(&queue->young);
	atomic_inc(&queue->qlen);
}

/* Clear the pointer fields of @req listed below (saved_syn and the
 * option/packet-option pointers).
 * NOTE(review): presumably so that the old and the cloned request do not
 * both release the same objects - confirm against the request destructor.
 */
static void reqsk_migrate_reset(struct request_sock *req)
{
	req->saved_syn = NULL;
#if IS_ENABLED(CONFIG_IPV6)
	inet_rsk(req)->ipv6_opt = NULL;
	inet_rsk(req)->pktopts = NULL;
#else
	inet_rsk(req)->ireq_opt = NULL;
#endif
}

/* return true if req was found in the ehash table */
/* Also cancels a pending rsk_timer; when del_timer_sync() reports that it
 * deactivated the timer, one reference on @req is dropped.
 */
static bool reqsk_queue_unlink(struct request_sock *req)
{
	struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
	bool found = false;

	if (sk_hashed(req_to_sk(req))) {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);

		spin_lock(lock);
		found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
		spin_unlock(lock);
	}
	if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
		reqsk_put(req);
	return found;
}

/* Unlink @req and, if it was found hashed, account its removal from the
 * accept queue of @sk and drop one reference on it.
 * Returns true when the request was unlinked by this call.
 */
bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
	bool unlinked = reqsk_queue_unlink(req);

	if (unlinked) {
		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
		reqsk_put(req);
	}
	return unlinked;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);

/* inet_csk_reqsk_queue_drop() followed by an unconditional reqsk_put(),
 * i.e. it additionally drops one reference on @req.
 */
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
	inet_csk_reqsk_queue_drop(sk, req);
	reqsk_put(req);
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);

/* Timer callback for a pending request sock (rsk_timer).
 *
 * If the listener is no longer in TCP_LISTEN, try to migrate the request
 * to another socket picked by reuseport_migrate_sock(), working on a
 * clone (nreq) while remembering the original (oreq).  Then either re-arm
 * the timer (possibly after resending the SYN-ACK) or drop the request.
 */
static void reqsk_timer_handler(struct timer_list *t)
{
	struct request_sock *req = from_timer(req, t, rsk_timer);
	struct request_sock *nreq = NULL, *oreq = req;
	struct sock *sk_listener = req->rsk_listener;
	struct inet_connection_sock *icsk;
	struct request_sock_queue *queue;
	struct net *net;
	int max_syn_ack_retries, qlen, expire = 0, resend = 0;

	if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
		struct sock *nsk;

		nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
		if (!nsk)
			goto drop;

		nreq = inet_reqsk_clone(req, nsk);
		if (!nreq)
			goto drop;

		/* The new timer for the cloned req can decrease the 2
		 * by calling inet_csk_reqsk_queue_drop_and_put(), so
		 * hold another count to prevent use-after-free and
		 * call reqsk_put() just before return.
		 */
		refcount_set(&nreq->rsk_refcnt, 2 + 1);
		timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
		reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);

		/* from here on, work on the clone and the new listener */
		req = nreq;
		sk_listener = nsk;
	}

	icsk = inet_csk(sk_listener);
	net = sock_net(sk_listener);
	/* per-socket setting wins; fall back to the sysctl when it is 0 */
	max_syn_ack_retries = icsk->icsk_syn_retries ? :
		READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established socket) for first timeout.
	 * If synack was not acknowledged for 1 second, it means
	 * one of the following things: synack was lost, ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When server is a bit loaded, queue is populated with old
	 * open requests, reducing effective size of queue.
	 * When server is well loaded, queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries overriding normal timeout, when
	 * situation becomes dangerous.
	 *
	 * Essentially, we reserve half of room for young
	 * embryos; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	queue = &icsk->icsk_accept_queue;
	qlen = reqsk_queue_len(queue);
	if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
		int young = reqsk_queue_len_young(queue) << 1;

		/* lower the retry limit (never below 2) while the queue
		 * length is not below the doubling "young" threshold
		 */
		while (max_syn_ack_retries > 2) {
			if (qlen < young)
				break;
			max_syn_ack_retries--;
			young <<= 1;
		}
	}
	syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
		       &expire, &resend);
	req->rsk_ops->syn_ack_timeout(req);
	if (!expire &&
	    (!resend ||
	     !inet_rtx_syn_ack(sk_listener, req) ||
	     inet_rsk(req)->acked)) {
		/* first timeout: the request stops counting as young */
		if (req->num_timeout++ == 0)
			atomic_dec(&queue->young);
		mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));

		if (!nreq)
			return;

		/* migration: replace oreq by nreq in the ehash table */
		if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
			/* delete timer */
			inet_csk_reqsk_queue_drop(sk_listener, nreq);
			goto no_ownership;
		}

		__NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS);
		reqsk_migrate_reset(oreq);
		reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq);
		reqsk_put(oreq);

		/* drop the extra count taken right after the clone */
		reqsk_put(nreq);
		return;
	}

	/* Even if we can clone the req, we may need not retransmit any more
	 * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another
	 * CPU may win the "own_req" race so that inet_ehash_insert() fails.
	 */
	if (nreq) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE);
no_ownership:
		reqsk_migrate_reset(nreq);
		reqsk_queue_removed(queue, nreq);
		__reqsk_free(nreq);
	}

drop:
	inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq);
}

/* Arm the request timer for @timeout jiffies, insert @req into the ehash
 * table and initialise its refcount to 3 (written as 2 + 1).
 */
static void reqsk_queue_hash_req(struct request_sock *req,
				 unsigned long timeout)
{
	timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
	mod_timer(&req->rsk_timer, jiffies + timeout);

	inet_ehash_insert(req_to_sk(req), NULL, NULL);
	/* before letting lookups find us, make sure all req fields
	 * are committed to memory and refcnt initialized.
	 */
	smp_wmb();
	refcount_set(&req->rsk_refcnt, 2 + 1);
}

/* Hash @req with its timer armed and account it on listener @sk. */
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	reqsk_queue_hash_req(req, timeout);
	inet_csk_reqsk_queue_added(sk);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);

/* Call the ULP's ->clone() hook for @newsk, if a ULP is attached and it
 * provides one.
 */
static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
			   const gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(newsk);

	if (!icsk->icsk_ulp_ops)
		return;

	if (icsk->icsk_ulp_ops->clone)
		icsk->icsk_ulp_ops->clone(req, newsk, priority);
}

/**
 *	inet_csk_clone_lock - clone an inet socket, and lock its clone
 *	@sk: the socket to clone
 *	@req: request_sock
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *	Return: the new socket in state TCP_SYN_RECV with ports, mark and
 *	cookie taken from @req, or NULL if sk_clone_lock() failed.
 */
struct sock *inet_csk_clone_lock(const struct sock *sk,
				 const struct request_sock *req,
				 const gfp_t priority)
{
	struct sock *newsk = sk_clone_lock(sk, priority);

	if (newsk) {
		struct inet_connection_sock *newicsk = inet_csk(newsk);

		inet_sk_set_state(newsk, TCP_SYN_RECV);
		newicsk->icsk_bind_hash = NULL;

		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);

		/* listeners have SOCK_RCU_FREE, not the children */
		sock_reset_flag(newsk, SOCK_RCU_FREE);

		inet_sk(newsk)->mc_list = NULL;

		newsk->sk_mark = inet_rsk(req)->ir_mark;
		atomic64_set(&newsk->sk_cookie,
			     atomic64_read(&inet_rsk(req)->ir_cookie));

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff	  = 0;
		newicsk->icsk_probes_out  = 0;
		newicsk->icsk_probes_tstamp = 0;

		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

		inet_clone_ulp(req, newsk, priority);

		security_inet_csk_clone(newsk, req);
	}
	return newsk;
}
EXPORT_SYMBOL_GPL(inet_csk_clone_lock);

/*
 * At this point, there should be no process reference to this
 * socket, and thus no user references at all.  Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
 */
/* Warns (without bailing out) if @sk is not closed, not dead, still
 * hashed, or has a port number but no bind bucket; then runs the
 * protocol destructor, releases queues and xfrm policy, decrements the
 * per-cpu orphan count and drops one reference on @sk.
 */
void inet_csk_destroy_sock(struct sock *sk)
{
	WARN_ON(sk->sk_state != TCP_CLOSE);
	WARN_ON(!sock_flag(sk, SOCK_DEAD));

	/* It cannot be in hash table! */
	WARN_ON(!sk_unhashed(sk));

	/* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);

	sk->sk_prot->destroy(sk);

	sk_stream_kill_queues(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	this_cpu_dec(*sk->sk_prot->orphan_count);

	sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);

/* This function allows to force a closure of a socket after the call to
 * tcp/dccp_create_openreq_child().
*/
/* Unlocks @sk, drops one of its references, hands it to
 * inet_csk_prepare_for_destroy_sock() and clears its local port number.
 */
void inet_csk_prepare_forced_close(struct sock *sk)
	__releases(&sk->sk_lock.slock)
{
	/* sk_clone_lock locked the socket and set refcnt to 2 */
	bh_unlock_sock(sk);
	sock_put(sk);
	inet_csk_prepare_for_destroy_sock(sk);
	inet_sk(sk)->inet_num = 0;
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);

/* Move @sk into TCP_LISTEN: allocate the accept queue, reset the backlog
 * counter and delayed-ack state, resolve the default txrehash setting,
 * then validate the port through sk_prot->get_port() and hash the socket.
 *
 * Returns 0 on success.  On failure the state is set back to TCP_CLOSE
 * and the error is returned: -EADDRINUSE when get_port() fails, otherwise
 * the error from sk_prot->hash().
 */
int inet_csk_listen_start(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int err = -EADDRINUSE;

	reqsk_queue_alloc(&icsk->icsk_accept_queue);

	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
		sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);

	/* There is race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters to hash table only
	 * after validation is complete.
	 */
	inet_sk_state_store(sk, TCP_LISTEN);
	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
		inet->inet_sport = htons(inet->inet_num);

		sk_dst_reset(sk);
		err = sk->sk_prot->hash(sk);

		if (likely(!err))
			return 0;
	}

	inet_sk_set_state(sk, TCP_CLOSE);
	return err;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);

/* Dispose of a child socket that will never be accepted: disconnect it,
 * orphan it, count it as an orphan and destroy it.  For a TCP Fast Open
 * request the child's fastopen_rsk back-pointer is cleared first.
 */
static void inet_child_forget(struct sock *sk, struct request_sock *req,
			      struct sock *child)
{
	sk->sk_prot->disconnect(child, O_NONBLOCK);

	sock_orphan(child);

	this_cpu_inc(*sk->sk_prot->orphan_count);

	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
		BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
		BUG_ON(sk != req->rsk_listener);

		/* Paranoid, to prevent race condition if
		 * an inbound pkt destined for child is
		 * blocked by sock lock in tcp_v4_rcv().
		 * Also to satisfy an assertion in
		 * tcp_v4_destroy_sock().
		 */
		RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
	}
	inet_csk_destroy_sock(child);
}

/* Append @req (with @child attached) to the tail of the accept queue of
 * @sk, under rskq_lock.
 *
 * Returns @child on success.  If @sk is no longer in TCP_LISTEN the child
 * is disposed of via inet_child_forget() and NULL is returned.
 */
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
				      struct request_sock *req,
				      struct sock *child)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

	spin_lock(&queue->rskq_lock);
	if (unlikely(sk->sk_state != TCP_LISTEN)) {
		inet_child_forget(sk, req, child);
		child = NULL;
	} else {
		req->sk = child;
		req->dl_next = NULL;
		if (queue->rskq_accept_head == NULL)
			WRITE_ONCE(queue->rskq_accept_head, req);
		else
			queue->rskq_accept_tail->dl_next = req;
		queue->rskq_accept_tail = req;
		sk_acceptq_added(sk);
	}
	spin_unlock(&queue->rskq_lock);
	return child;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_add);

/* Finish moving @child from the pending-request state to the accept
 * queue of @sk.
 *
 * When @own_req is true the request is dropped from its original
 * listener; if @sk differs from that listener the request is cloned for
 * @sk first.  Returns @child once it is queued.  In every other case the
 * child is unlocked, one reference on it is dropped and NULL is returned.
 */
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
					 struct request_sock *req, bool own_req)
{
	if (own_req) {
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);

		if (sk != req->rsk_listener) {
			/* another listening sk has been selected,
			 * migrate the req to it.
			 */
			struct request_sock *nreq;

			/* hold a refcnt for the nreq->rsk_listener
			 * which is assigned in inet_reqsk_clone()
			 */
			sock_hold(sk);
			nreq = inet_reqsk_clone(req, sk);
			if (!nreq) {
				inet_child_forget(sk, req, child);
				goto child_put;
			}

			refcount_set(&nreq->rsk_refcnt, 1);
			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
				__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS);
				reqsk_migrate_reset(req);
				reqsk_put(req);
				return child;
			}

			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
			reqsk_migrate_reset(nreq);
			__reqsk_free(nreq);
		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
			return child;
		}
	}
	/* Too bad, another child took ownership of the request, undo. */
child_put:
	bh_unlock_sock(child);
	sock_put(child);
	return NULL;
}
EXPORT_SYMBOL(inet_csk_complete_hashdance);

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted.
 */
void inet_csk_listen_stop(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *next, *req;

	/* Following specs, it would be better either to send FIN
	 * (and enter FIN-WAIT-1, it is normal close)
	 * or to send active reset (abort).
	 * Certainly, it is pretty dangerous while synflood, but it is
	 * bad justification for our negligence 8)
	 * To be honest, we are not able to make either
	 * of the variants now.
--ANK 1187 */ 1188 while ((req = reqsk_queue_remove(queue, sk)) != NULL) { 1189 struct sock *child = req->sk, *nsk; 1190 struct request_sock *nreq; 1191 1192 local_bh_disable(); 1193 bh_lock_sock(child); 1194 WARN_ON(sock_owned_by_user(child)); 1195 sock_hold(child); 1196 1197 nsk = reuseport_migrate_sock(sk, child, NULL); 1198 if (nsk) { 1199 nreq = inet_reqsk_clone(req, nsk); 1200 if (nreq) { 1201 refcount_set(&nreq->rsk_refcnt, 1); 1202 1203 if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { 1204 __NET_INC_STATS(sock_net(nsk), 1205 LINUX_MIB_TCPMIGRATEREQSUCCESS); 1206 reqsk_migrate_reset(req); 1207 } else { 1208 __NET_INC_STATS(sock_net(nsk), 1209 LINUX_MIB_TCPMIGRATEREQFAILURE); 1210 reqsk_migrate_reset(nreq); 1211 __reqsk_free(nreq); 1212 } 1213 1214 /* inet_csk_reqsk_queue_add() has already 1215 * called inet_child_forget() on failure case. 1216 */ 1217 goto skip_child_forget; 1218 } 1219 } 1220 1221 inet_child_forget(sk, req, child); 1222 skip_child_forget: 1223 reqsk_put(req); 1224 bh_unlock_sock(child); 1225 local_bh_enable(); 1226 sock_put(child); 1227 1228 cond_resched(); 1229 } 1230 if (queue->fastopenq.rskq_rst_head) { 1231 /* Free all the reqs queued in rskq_rst_head. 
*/ 1232 spin_lock_bh(&queue->fastopenq.lock); 1233 req = queue->fastopenq.rskq_rst_head; 1234 queue->fastopenq.rskq_rst_head = NULL; 1235 spin_unlock_bh(&queue->fastopenq.lock); 1236 while (req != NULL) { 1237 next = req->dl_next; 1238 reqsk_put(req); 1239 req = next; 1240 } 1241 } 1242 WARN_ON_ONCE(sk->sk_ack_backlog); 1243 } 1244 EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 1245 1246 void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) 1247 { 1248 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 1249 const struct inet_sock *inet = inet_sk(sk); 1250 1251 sin->sin_family = AF_INET; 1252 sin->sin_addr.s_addr = inet->inet_daddr; 1253 sin->sin_port = inet->inet_dport; 1254 } 1255 EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); 1256 1257 static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) 1258 { 1259 const struct inet_sock *inet = inet_sk(sk); 1260 const struct ip_options_rcu *inet_opt; 1261 __be32 daddr = inet->inet_daddr; 1262 struct flowi4 *fl4; 1263 struct rtable *rt; 1264 1265 rcu_read_lock(); 1266 inet_opt = rcu_dereference(inet->inet_opt); 1267 if (inet_opt && inet_opt->opt.srr) 1268 daddr = inet_opt->opt.faddr; 1269 fl4 = &fl->u.ip4; 1270 rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, 1271 inet->inet_saddr, inet->inet_dport, 1272 inet->inet_sport, sk->sk_protocol, 1273 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); 1274 if (IS_ERR(rt)) 1275 rt = NULL; 1276 if (rt) 1277 sk_setup_caps(sk, &rt->dst); 1278 rcu_read_unlock(); 1279 1280 return &rt->dst; 1281 } 1282 1283 struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) 1284 { 1285 struct dst_entry *dst = __sk_dst_check(sk, 0); 1286 struct inet_sock *inet = inet_sk(sk); 1287 1288 if (!dst) { 1289 dst = inet_csk_rebuild_route(sk, &inet->cork.fl); 1290 if (!dst) 1291 goto out; 1292 } 1293 dst->ops->update_pmtu(dst, sk, NULL, mtu, true); 1294 1295 dst = __sk_dst_check(sk, 0); 1296 if (!dst) 1297 dst = inet_csk_rebuild_route(sk, &inet->cork.fl); 1298 out: 1299 
return dst; 1300 } 1301 EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); 1302