// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
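 *
 * The bucket is allocated with GFP_ATOMIC because the caller already holds
 * the bucket lock (usually with bottom halves disabled), so sleeping is not
 * allowed; on allocation failure the caller simply sees a NULL return.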
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev = l3mdev;
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
			    unsigned short port, int l3mdev)
{
	return net_eq(ib_net(tb), net) && tb->port == port &&
	       tb->l3mdev == l3mdev;
}

static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
				   struct net *net,
				   struct inet_bind_hashbucket *head,
				   unsigned short port, int l3mdev,
				   const struct sock *sk)
{
	write_pnet(&tb->ib_net, net);
	tb->l3mdev = l3mdev;
	tb->port = port;
#if IS_ENABLED(CONFIG_IPV6)
	tb->family = sk->sk_family;
	if (sk->sk_family == AF_INET6)
		tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
	else
#endif
		tb->rcv_saddr = sk->sk_rcv_saddr;
	INIT_HLIST_HEAD(&tb->owners);
	INIT_HLIST_HEAD(&tb->deathrow);
	hlist_add_head(&tb->node, &head->chain);
}

struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
						   struct net *net,
						   struct inet_bind_hashbucket *head,
						   unsigned short port,
						   int l3mdev,
						   const struct sock *sk)
{
	struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb)
		inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk);

	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
	if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
					 const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family != tb2->family)
		return false;

	if (sk->sk_family == AF_INET6)
		return ipv6_addr_equal(&tb2->v6_rcv_saddr,
				       &sk->sk_v6_rcv_saddr);
#endif
	return tb2->rcv_saddr == sk->sk_rcv_saddr;
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port)
{
	inet_sk(sk)->inet_num = port;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
	sk_add_bind2_node(sk, &tb2->owners);
	inet_csk(sk)->icsk_bind2_hash = tb2;
}

/*
 * Get rid of any references to a local port held by the given sock.
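 *
 * The socket is unlinked from both its bhash (port-only) bucket and its
 * bhash2 (port + address) bucket; each bucket is freed once its owner
 * list becomes empty.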
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	int bhash;

	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
	head = &hashinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);

	spin_lock(&head2->lock);
	if (inet_csk(sk)->icsk_bind2_hash) {
		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

		__sk_del_bind2_node(sk);
		inet_csk(sk)->icsk_bind2_hash = NULL;
		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
	}
	spin_unlock(&head2->lock);

	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
	unsigned short port = inet_sk(child)->inet_num;
	struct inet_bind_hashbucket *head, *head2;
	bool created_inet_bind_bucket = false;
	struct net *net = sock_net(sk);
	bool update_fastreuse = false;
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	int bhash, l3mdev;

	bhash = inet_bhashfn(net, port, table->bhash_size);
	head = &table->bhash[bhash];
	head2 = inet_bhashfn_portaddr(table, child, net, port);

	spin_lock(&head->lock);
	spin_lock(&head2->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	tb2 = inet_csk(sk)->icsk_bind2_hash;
	if (unlikely(!tb || !tb2)) {
		spin_unlock(&head2->lock);
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here.
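		 * If no matching bucket exists yet, one is created on the
		 * spot; a bucket created here is rolled back on the error
		 * path below so the bind tables stay consistent.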
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev))
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     net, head, port, l3mdev);
			if (!tb) {
				spin_unlock(&head2->lock);
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
			created_inet_bind_bucket = true;
		}
		update_fastreuse = true;

		goto bhash2_find;
	} else if (!inet_bind2_bucket_addr_match(tb2, child)) {
		l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
		if (!tb2) {
			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
						       net, head2, port,
						       l3mdev, child);
			if (!tb2)
				goto error;
		}
	}
	if (update_fastreuse)
		inet_csk_update_fastreuse(tb, child);
	inet_bind_hash(child, tb, tb2, port);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	return 0;

error:
	if (created_inet_bind_bucket)
		inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
	    !ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score = sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}

static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
					    struct sk_buff *skb, int doff,
					    __be32 saddr, __be16 sport,
					    __be32 daddr, unsigned short hnum)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}

/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
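 *
 * The listener lookup below therefore hashes only on the local address
 * and port (lhash2); the bucket for the specific destination address is
 * searched first, with the INADDR_ANY bucket used as a fallback.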
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = lookup_reuseport(net, sk, skb, doff,
						  saddr, sport, daddr, hnum);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

static inline struct sock *inet_lookup_run_bpf(struct net *net,
					       struct inet_hashinfo *hashinfo,
					       struct sk_buff *skb, int doff,
					       __be32 saddr, __be16 sport,
					       __be32 daddr, u16 hnum, const int dif)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	if (hashinfo != net->ipv4.tcp_death_row.hashinfo)
		return NULL; /* only TCP is supported */

	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
					     saddr, sport, daddr, hnum, dif);
		if (result)
			goto done;
	}

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!inet_match(net, sk, acookie,
						 ports, dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static u64 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
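 *
 * Runs under the ehash bucket spinlock taken by its caller
 * (inet_ehash_insert), so the chain cannot change under the walk.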
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
				    struct hlist_nulls_head *list)
{
	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
	const int sdif = sk->sk_bound_dev_if;
	const int dif = sk->sk_bound_dev_if;
	const struct hlist_nulls_node *node;
	struct net *net = sock_net(sk);
	struct sock *esk;

	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

	sk_nulls_for_each_rcu(esk, node, list) {
		if (esk->sk_hash != sk->sk_hash)
			continue;
		if (sk->sk_family == AF_INET) {
			if (unlikely(inet_match(net, esk, acookie,
						ports, dif, sdif))) {
				return true;
			}
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			if (unlikely(inet6_match(net, esk,
						 &sk->sk_v6_daddr,
						 &sk->sk_v6_rcv_saddr,
						 ports, dif, sdif))) {
				return true;
			}
		}
#endif
	}
	return false;
}

/* Insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT).
 * If a matching socket already exists, sk is not inserted and the
 * found_dup_sk parameter is set to true.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_ehash_bucket *head;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	} else if (found_dup_sk) {
		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
		if (*found_dup_sk)
			ret = false;
	}

	if (ret)
		__sk_nulls_add_node_rcu(sk, list);

	spin_unlock(lock);

	return ret;
}

bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		this_cpu_inc(*sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_listen_hashbucket *ilb2;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		local_bh_disable();
		inet_ehash_nolisten(sk, osk, NULL);
		local_bh_enable();
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb2);
		if (err)
			goto unlock;
	}
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
	    sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb2->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);

int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE)
		err = __inet_hash(sk, NULL);

	return err;
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN) {
		struct inet_listen_hashbucket *ilb2;

		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
		/* Don't disable bottom halves while acquiring the lock to
		 * avoid circular locking dependency on PREEMPT_RT.
		 */
		spin_lock(&ilb2->lock);
		if (sk_unhashed(sk)) {
			spin_unlock(&ilb2->lock);
			return;
		}

		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_stop_listen_sock(sk);

		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock(&ilb2->lock);
	} else {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		spin_lock_bh(lock);
		if (sk_unhashed(sk)) {
			spin_unlock_bh(lock);
			return;
		}
		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(lock);
	}
}
EXPORT_SYMBOL_GPL(inet_unhash);

static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
				    const struct net *net, unsigned short port,
				    int l3mdev, const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family != tb->family)
		return false;

	if (sk->sk_family == AF_INET6)
		return net_eq(ib2_net(tb), net) && tb->port == port &&
			tb->l3mdev == l3mdev &&
			ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
	else
#endif
	return net_eq(ib2_net(tb), net) && tb->port == port &&
		tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr;
}

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
				      unsigned short port, int l3mdev, const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family != tb->family) {
		if (sk->sk_family == AF_INET)
			return net_eq(ib2_net(tb), net) && tb->port == port &&
				tb->l3mdev == l3mdev &&
				ipv6_addr_any(&tb->v6_rcv_saddr);

		return false;
	}

	if (sk->sk_family == AF_INET6)
		return net_eq(ib2_net(tb), net) && tb->port == port &&
			tb->l3mdev == l3mdev &&
			ipv6_addr_any(&tb->v6_rcv_saddr);
	else
#endif
	return net_eq(ib2_net(tb), net) && tb->port == port &&
		tb->l3mdev == l3mdev && tb->rcv_saddr == 0;
}

/* The socket's bhash2 hashbucket spinlock must be held when this is called */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
		       unsigned short port, int l3mdev, const struct sock *sk)
{
	struct inet_bind2_bucket *bhash2 = NULL;

	inet_bind_bucket_for_each(bhash2, &head->chain)
		if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
			break;

	return bhash2;
}

struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
	else
#endif
		hash = ipv4_portaddr_hash(net, 0, port);

	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
	if (family == AF_INET) {
		inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
		sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else {
		sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
	}
#endif
}

static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
	struct inet_bind_hashbucket *head, *head2;
	struct inet_bind2_bucket *tb2, *new_tb2;
	int l3mdev = inet_sk_bound_l3mdev(sk);
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	int bhash;

	if (!inet_csk(sk)->icsk_bind2_hash) {
		/* Not bind()ed before. */
		if (reset)
			inet_reset_saddr(sk);
		else
			inet_update_saddr(sk, saddr, family);

		return 0;
	}

	/* Allocate a bind2 bucket ahead of time to avoid permanently putting
	 * the bhash2 table in an inconsistent state if a new tb2 bucket
	 * allocation fails.
	 */
	new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
	if (!new_tb2) {
		if (reset) {
			/* The (INADDR_ANY, port) bucket might have already
			 * been freed, then we cannot fixup icsk_bind2_hash,
			 * so we give up and unlink sk from bhash/bhash2 not
			 * to leave inconsistency in bhash2.
			 */
			inet_put_port(sk);
			inet_reset_saddr(sk);
		}

		return -ENOMEM;
	}

	bhash = inet_bhashfn(net, port, hinfo->bhash_size);
	head = &hinfo->bhash[bhash];
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	/* If we change saddr locklessly, another thread
	 * iterating over bhash might see corrupted address.
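	 * So take the bhash bucket lock (with BHs disabled) first and the
	 * affected bhash2 bucket locks under it, the same ordering used by
	 * __inet_put_port() and __inet_inherit_port().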
	 */
	spin_lock_bh(&head->lock);

	spin_lock(&head2->lock);
	__sk_del_bind2_node(sk);
	inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
	spin_unlock(&head2->lock);

	if (reset)
		inet_reset_saddr(sk);
	else
		inet_update_saddr(sk, saddr, family);

	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

	spin_lock(&head2->lock);
	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = new_tb2;
		inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk);
	}
	sk_add_bind2_node(sk, &tb2->owners);
	inet_csk(sk)->icsk_bind2_hash = tb2;
	spin_unlock(&head2->lock);

	spin_unlock_bh(&head->lock);

	if (tb2 != new_tb2)
		kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

	return 0;
}

int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
	return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);

void inet_bhash2_reset_saddr(struct sock *sk)
{
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);

/* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by a clever attacker.
 *
 * The RFC claims that using TABLE_LENGTH=10 buckets gives an improvement,
 * but attacks have since been demonstrated, so we use 65536 by default
 * instead to give more isolation and privacy, at the expense of 256 kB
 * of kernel memory.
 */
#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
static u32 *table_perturb;

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_bind_hashbucket *head, *head2;
	struct inet_timewait_sock *tw = NULL;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind2_bucket *tb2;
	struct inet_bind_bucket *tb;
	bool tb_created = false;
	u32 remaining, offset;
	int ret, i, low, high;
	int l3mdev;
	u32 index;

	if (port) {
		local_bh_disable();
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	inet_sk_get_local_port_range(sk, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	get_random_sleepable_once(table_perturb,
				  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;

	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
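	 * For example, with the default range [32768, 60999], @low is even,
	 * so the first pass walks even ports (32768, 32770, ...) in steps of
	 * two, wrapping at @high; only if that pass fails are the odd ports
	 * tried.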
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb_created = true;
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	/* Find the corresponding tb2 bucket since we need to
	 * add the socket to the bhash2 table as well.
	 */
	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
	spin_lock(&head2->lock);

	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
	if (!tb2) {
		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
					       head2, port, l3mdev, sk);
		if (!tb2)
			goto error;
	}

	/* Here we want to add a little bit of randomness to the next source
	 * port that will be chosen. We use a max() with a random here so that
	 * on low contention the randomness is maximal and on high contention
	 * it may be nonexistent.
	 */
	i = max_t(int, i, get_random_u32_below(8) * 2);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, tb2, port);

	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);

	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);

	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;

error:
	spin_unlock(&head2->lock);
	if (tb_created)
		inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
	spin_unlock_bh(&head->lock);
	return -ENOMEM;
}

/*
 * Bind a port for a connect operation and hash it.
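 *
 * If the socket already has a local port (from an explicit bind()), only
 * the uniqueness check against established/timewait sockets is run;
 * otherwise __inet_hash_connect() picks an ephemeral port, using an offset
 * derived from secure_ipv4_port_ephemeral() for this destination.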
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u64 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
	}
}

void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);
	init_hashinfo_lhash2(h);

	/* this one is used for source ports of outgoing connections */
	table_perturb = alloc_large_system_hash("Table-perturb",
						sizeof(*table_perturb),
						INET_TABLE_PERTURB_SIZE,
						0, 0, NULL, NULL,
						INET_TABLE_PERTURB_SIZE,
						INET_TABLE_PERTURB_SIZE);
}

int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
{
	h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
	if (!h->lhash2)
		return -ENOMEM;

	h->lhash2_mask = INET_LHTABLE_SIZE - 1;
	/* INET_LHTABLE_SIZE must be a power of 2 */
	BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);

	init_hashinfo_lhash2(h);
	return 0;
}
EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);

struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
						 unsigned int ehash_entries)
{
	struct inet_hashinfo *new_hashinfo;
	int i;

	new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
	if (!new_hashinfo)
		goto err;

	new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
					   GFP_KERNEL_ACCOUNT);
	if (!new_hashinfo->ehash)
		goto free_hashinfo;

	new_hashinfo->ehash_mask = ehash_entries - 1;

	if (inet_ehash_locks_alloc(new_hashinfo))
		goto free_ehash;

	for (i = 0; i < ehash_entries; i++)
		INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);

	new_hashinfo->pernet = true;

	return new_hashinfo;

free_ehash:
	vfree(new_hashinfo->ehash);
free_hashinfo:
	kfree(new_hashinfo);
err:
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);

void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
	if (!hashinfo->pernet)
		return;

	inet_ehash_locks_free(hashinfo);
	vfree(hashinfo->ehash);
	kfree(hashinfo);
}
EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);