1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Generic INET transport hashtables 7 * 8 * Authors: Lotsa people, from code originally in tcp 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <linux/random.h> 18 #include <linux/sched.h> 19 #include <linux/slab.h> 20 #include <linux/wait.h> 21 #include <linux/vmalloc.h> 22 #include <linux/bootmem.h> 23 24 #include <net/addrconf.h> 25 #include <net/inet_connection_sock.h> 26 #include <net/inet_hashtables.h> 27 #include <net/secure_seq.h> 28 #include <net/ip.h> 29 #include <net/tcp.h> 30 #include <net/sock_reuseport.h> 31 32 static u32 inet_ehashfn(const struct net *net, const __be32 laddr, 33 const __u16 lport, const __be32 faddr, 34 const __be16 fport) 35 { 36 static u32 inet_ehash_secret __read_mostly; 37 38 net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); 39 40 return __inet_ehashfn(laddr, lport, faddr, fport, 41 inet_ehash_secret + net_hash_mix(net)); 42 } 43 44 /* This function handles inet_sock, but also timewait and request sockets 45 * for IPv4/IPv6. 46 */ 47 static u32 sk_ehashfn(const struct sock *sk) 48 { 49 #if IS_ENABLED(CONFIG_IPV6) 50 if (sk->sk_family == AF_INET6 && 51 !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 52 return inet6_ehashfn(sock_net(sk), 53 &sk->sk_v6_rcv_saddr, sk->sk_num, 54 &sk->sk_v6_daddr, sk->sk_dport); 55 #endif 56 return inet_ehashfn(sock_net(sk), 57 sk->sk_rcv_saddr, sk->sk_num, 58 sk->sk_daddr, sk->sk_dport); 59 } 60 61 /* 62 * Allocate and initialize a new local port bind bucket. 63 * The bindhash mutex for snum's hash chain must be held here. 64 */ 65 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 66 struct net *net, 67 struct inet_bind_hashbucket *head, 68 const unsigned short snum) 69 { 70 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 71 72 if (tb) { 73 write_pnet(&tb->ib_net, net); 74 tb->port = snum; 75 tb->fastreuse = 0; 76 tb->fastreuseport = 0; 77 INIT_HLIST_HEAD(&tb->owners); 78 hlist_add_head(&tb->node, &head->chain); 79 } 80 return tb; 81 } 82 83 /* 84 * Caller must hold hashbucket lock for this tb with local BH disabled 85 */ 86 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) 87 { 88 if (hlist_empty(&tb->owners)) { 89 __hlist_del(&tb->node); 90 kmem_cache_free(cachep, tb); 91 } 92 } 93 94 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 95 const unsigned short snum) 96 { 97 inet_sk(sk)->inet_num = snum; 98 sk_add_bind_node(sk, &tb->owners); 99 inet_csk(sk)->icsk_bind_hash = tb; 100 } 101 102 /* 103 * Get rid of any references to a local port held by the given sock. 104 */ 105 static void __inet_put_port(struct sock *sk) 106 { 107 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 108 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, 109 hashinfo->bhash_size); 110 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 111 struct inet_bind_bucket *tb; 112 113 spin_lock(&head->lock); 114 tb = inet_csk(sk)->icsk_bind_hash; 115 __sk_del_bind_node(sk); 116 inet_csk(sk)->icsk_bind_hash = NULL; 117 inet_sk(sk)->inet_num = 0; 118 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 119 spin_unlock(&head->lock); 120 } 121 122 void inet_put_port(struct sock *sk) 123 { 124 local_bh_disable(); 125 __inet_put_port(sk); 126 local_bh_enable(); 127 } 128 EXPORT_SYMBOL(inet_put_port); 129 130 int __inet_inherit_port(const struct sock *sk, struct sock *child) 131 { 132 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 133 unsigned short port = inet_sk(child)->inet_num; 134 const int bhash = inet_bhashfn(sock_net(sk), port, 135 table->bhash_size); 136 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 137 struct inet_bind_bucket *tb; 138 139 spin_lock(&head->lock); 140 tb = inet_csk(sk)->icsk_bind_hash; 141 if (unlikely(!tb)) { 142 spin_unlock(&head->lock); 143 return -ENOENT; 144 } 145 if (tb->port != port) { 146 /* NOTE: using tproxy and redirecting skbs to a proxy 147 * on a different listener port breaks the assumption 148 * that the listener socket's icsk_bind_hash is the same 149 * as that of the child socket. We have to look up or 150 * create a new bind bucket for the child here. */ 151 inet_bind_bucket_for_each(tb, &head->chain) { 152 if (net_eq(ib_net(tb), sock_net(sk)) && 153 tb->port == port) 154 break; 155 } 156 if (!tb) { 157 tb = inet_bind_bucket_create(table->bind_bucket_cachep, 158 sock_net(sk), head, port); 159 if (!tb) { 160 spin_unlock(&head->lock); 161 return -ENOMEM; 162 } 163 } 164 } 165 inet_bind_hash(child, tb, port); 166 spin_unlock(&head->lock); 167 168 return 0; 169 } 170 EXPORT_SYMBOL_GPL(__inet_inherit_port); 171 172 static struct inet_listen_hashbucket * 173 inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 174 { 175 u32 hash; 176 177 #if IS_ENABLED(CONFIG_IPV6) 178 if (sk->sk_family == AF_INET6) 179 hash = ipv6_portaddr_hash(sock_net(sk), 180 &sk->sk_v6_rcv_saddr, 181 inet_sk(sk)->inet_num); 182 else 183 #endif 184 hash = ipv4_portaddr_hash(sock_net(sk), 185 inet_sk(sk)->inet_rcv_saddr, 186 inet_sk(sk)->inet_num); 187 return inet_lhash2_bucket(h, hash); 188 } 189 190 static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) 191 { 192 struct inet_listen_hashbucket *ilb2; 193 194 if (!h->lhash2) 195 return; 196 197 ilb2 = inet_lhash2_bucket_sk(h, sk); 198 199 spin_lock(&ilb2->lock); 200 if (sk->sk_reuseport && sk->sk_family == AF_INET6) 201 hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 202 &ilb2->head); 203 else 204 hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, 205 &ilb2->head); 206 ilb2->count++; 207 spin_unlock(&ilb2->lock); 208 } 209 210 static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) 211 { 212 struct inet_listen_hashbucket *ilb2; 213 214 if (!h->lhash2 || 215 WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) 216 return; 217 218 ilb2 = inet_lhash2_bucket_sk(h, sk); 219 220 spin_lock(&ilb2->lock); 221 hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); 222 ilb2->count--; 223 spin_unlock(&ilb2->lock); 224 } 225 226 static inline int compute_score(struct sock *sk, struct net *net, 227 const unsigned short hnum, const __be32 daddr, 228 const int dif, const int sdif, bool exact_dif) 229 { 230 int score = -1; 231 struct inet_sock *inet = inet_sk(sk); 232 233 if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && 234 !ipv6_only_sock(sk)) { 235 __be32 rcv_saddr = inet->inet_rcv_saddr; 236 score = sk->sk_family == PF_INET ? 2 : 1; 237 if (rcv_saddr) { 238 if (rcv_saddr != daddr) 239 return -1; 240 score += 4; 241 } 242 if (sk->sk_bound_dev_if || exact_dif) { 243 bool dev_match = (sk->sk_bound_dev_if == dif || 244 sk->sk_bound_dev_if == sdif); 245 246 if (!dev_match) 247 return -1; 248 if (sk->sk_bound_dev_if) 249 score += 4; 250 } 251 if (sk->sk_incoming_cpu == raw_smp_processor_id()) 252 score++; 253 } 254 return score; 255 } 256 257 /* 258 * Here are some nice properties to exploit here. The BSD API 259 * does not allow a listening sock to specify the remote port nor the 260 * remote address for the connection. So always assume those are both 261 * wildcarded during the search since they can never be otherwise. 262 */ 263 264 /* called with rcu_read_lock() : No refcount taken on the socket */ 265 static struct sock *inet_lhash2_lookup(struct net *net, 266 struct inet_listen_hashbucket *ilb2, 267 struct sk_buff *skb, int doff, 268 const __be32 saddr, __be16 sport, 269 const __be32 daddr, const unsigned short hnum, 270 const int dif, const int sdif) 271 { 272 bool exact_dif = inet_exact_dif_match(net, skb); 273 struct inet_connection_sock *icsk; 274 struct sock *sk, *result = NULL; 275 int score, hiscore = 0; 276 u32 phash = 0; 277 278 inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { 279 sk = (struct sock *)icsk; 280 score = compute_score(sk, net, hnum, daddr, 281 dif, sdif, exact_dif); 282 if (score > hiscore) { 283 if (sk->sk_reuseport) { 284 phash = inet_ehashfn(net, daddr, hnum, 285 saddr, sport); 286 result = reuseport_select_sock(sk, phash, 287 skb, doff); 288 if (result) 289 return result; 290 } 291 result = sk; 292 hiscore = score; 293 } 294 } 295 296 return result; 297 } 298 299 struct sock *__inet_lookup_listener(struct net *net, 300 struct inet_hashinfo *hashinfo, 301 struct sk_buff *skb, int doff, 302 const __be32 saddr, __be16 sport, 303 const __be32 daddr, const unsigned short hnum, 304 const int dif, const int sdif) 305 { 306 unsigned int hash = inet_lhashfn(net, hnum); 307 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; 308 bool exact_dif = inet_exact_dif_match(net, skb); 309 struct inet_listen_hashbucket *ilb2; 310 struct sock *sk, *result = NULL; 311 int score, hiscore = 0; 312 unsigned int hash2; 313 u32 phash = 0; 314 315 if (ilb->count <= 10 || !hashinfo->lhash2) 316 goto port_lookup; 317 318 /* Too many sk in the ilb bucket (which is hashed by port alone). 319 * Try lhash2 (which is hashed by port and addr) instead. 320 */ 321 322 hash2 = ipv4_portaddr_hash(net, daddr, hnum); 323 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 324 if (ilb2->count > ilb->count) 325 goto port_lookup; 326 327 result = inet_lhash2_lookup(net, ilb2, skb, doff, 328 saddr, sport, daddr, hnum, 329 dif, sdif); 330 if (result) 331 return result; 332 333 /* Lookup lhash2 with INADDR_ANY */ 334 335 hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 336 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 337 if (ilb2->count > ilb->count) 338 goto port_lookup; 339 340 return inet_lhash2_lookup(net, ilb2, skb, doff, 341 saddr, sport, daddr, hnum, 342 dif, sdif); 343 344 port_lookup: 345 sk_for_each_rcu(sk, &ilb->head) { 346 score = compute_score(sk, net, hnum, daddr, 347 dif, sdif, exact_dif); 348 if (score > hiscore) { 349 if (sk->sk_reuseport) { 350 phash = inet_ehashfn(net, daddr, hnum, 351 saddr, sport); 352 result = reuseport_select_sock(sk, phash, 353 skb, doff); 354 if (result) 355 return result; 356 } 357 result = sk; 358 hiscore = score; 359 } 360 } 361 return result; 362 } 363 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 364 365 /* All sockets share common refcount, but have different destructors */ 366 void sock_gen_put(struct sock *sk) 367 { 368 if (!refcount_dec_and_test(&sk->sk_refcnt)) 369 return; 370 371 if (sk->sk_state == TCP_TIME_WAIT) 372 inet_twsk_free(inet_twsk(sk)); 373 else if (sk->sk_state == TCP_NEW_SYN_RECV) 374 reqsk_free(inet_reqsk(sk)); 375 else 376 sk_free(sk); 377 } 378 EXPORT_SYMBOL_GPL(sock_gen_put); 379 380 void sock_edemux(struct sk_buff *skb) 381 { 382 sock_gen_put(skb->sk); 383 } 384 EXPORT_SYMBOL(sock_edemux); 385 386 struct sock *__inet_lookup_established(struct net *net, 387 struct inet_hashinfo *hashinfo, 388 const __be32 saddr, const __be16 sport, 389 const __be32 daddr, const u16 hnum, 390 const int dif, const int sdif) 391 { 392 INET_ADDR_COOKIE(acookie, saddr, daddr); 393 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 394 struct sock *sk; 395 const struct hlist_nulls_node *node; 396 /* Optimize here for direct hit, only listening connections can 397 * have wildcards anyways. 398 */ 399 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 400 unsigned int slot = hash & hashinfo->ehash_mask; 401 struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 402 403 begin: 404 sk_nulls_for_each_rcu(sk, node, &head->chain) { 405 if (sk->sk_hash != hash) 406 continue; 407 if (likely(INET_MATCH(sk, net, acookie, 408 saddr, daddr, ports, dif, sdif))) { 409 if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 410 goto out; 411 if (unlikely(!INET_MATCH(sk, net, acookie, 412 saddr, daddr, ports, 413 dif, sdif))) { 414 sock_gen_put(sk); 415 goto begin; 416 } 417 goto found; 418 } 419 } 420 /* 421 * if the nulls value we got at the end of this lookup is 422 * not the expected one, we must restart lookup. 423 * We probably met an item that was moved to another chain. 424 */ 425 if (get_nulls_value(node) != slot) 426 goto begin; 427 out: 428 sk = NULL; 429 found: 430 return sk; 431 } 432 EXPORT_SYMBOL_GPL(__inet_lookup_established); 433 434 /* called with local bh disabled */ 435 static int __inet_check_established(struct inet_timewait_death_row *death_row, 436 struct sock *sk, __u16 lport, 437 struct inet_timewait_sock **twp) 438 { 439 struct inet_hashinfo *hinfo = death_row->hashinfo; 440 struct inet_sock *inet = inet_sk(sk); 441 __be32 daddr = inet->inet_rcv_saddr; 442 __be32 saddr = inet->inet_daddr; 443 int dif = sk->sk_bound_dev_if; 444 struct net *net = sock_net(sk); 445 int sdif = l3mdev_master_ifindex_by_index(net, dif); 446 INET_ADDR_COOKIE(acookie, saddr, daddr); 447 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 448 unsigned int hash = inet_ehashfn(net, daddr, lport, 449 saddr, inet->inet_dport); 450 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 451 spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 452 struct sock *sk2; 453 const struct hlist_nulls_node *node; 454 struct inet_timewait_sock *tw = NULL; 455 456 spin_lock(lock); 457 458 sk_nulls_for_each(sk2, node, &head->chain) { 459 if (sk2->sk_hash != hash) 460 continue; 461 462 if (likely(INET_MATCH(sk2, net, acookie, 463 saddr, daddr, ports, dif, sdif))) { 464 if (sk2->sk_state == TCP_TIME_WAIT) { 465 tw = inet_twsk(sk2); 466 if (twsk_unique(sk, sk2, twp)) 467 break; 468 } 469 goto not_unique; 470 } 471 } 472 473 /* Must record num and sport now. Otherwise we will see 474 * in hash table socket with a funny identity. 475 */ 476 inet->inet_num = lport; 477 inet->inet_sport = htons(lport); 478 sk->sk_hash = hash; 479 WARN_ON(!sk_unhashed(sk)); 480 __sk_nulls_add_node_rcu(sk, &head->chain); 481 if (tw) { 482 sk_nulls_del_node_init_rcu((struct sock *)tw); 483 __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 484 } 485 spin_unlock(lock); 486 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 487 488 if (twp) { 489 *twp = tw; 490 } else if (tw) { 491 /* Silly. Should hash-dance instead... */ 492 inet_twsk_deschedule_put(tw); 493 } 494 return 0; 495 496 not_unique: 497 spin_unlock(lock); 498 return -EADDRNOTAVAIL; 499 } 500 501 static u32 inet_sk_port_offset(const struct sock *sk) 502 { 503 const struct inet_sock *inet = inet_sk(sk); 504 505 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 506 inet->inet_daddr, 507 inet->inet_dport); 508 } 509 510 /* insert a socket into ehash, and eventually remove another one 511 * (The another one can be a SYN_RECV or TIMEWAIT 512 */ 513 bool inet_ehash_insert(struct sock *sk, struct sock *osk) 514 { 515 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 516 struct hlist_nulls_head *list; 517 struct inet_ehash_bucket *head; 518 spinlock_t *lock; 519 bool ret = true; 520 521 WARN_ON_ONCE(!sk_unhashed(sk)); 522 523 sk->sk_hash = sk_ehashfn(sk); 524 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 525 list = &head->chain; 526 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 527 528 spin_lock(lock); 529 if (osk) { 530 WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 531 ret = sk_nulls_del_node_init_rcu(osk); 532 } 533 if (ret) 534 __sk_nulls_add_node_rcu(sk, list); 535 spin_unlock(lock); 536 return ret; 537 } 538 539 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) 540 { 541 bool ok = inet_ehash_insert(sk, osk); 542 543 if (ok) { 544 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 545 } else { 546 percpu_counter_inc(sk->sk_prot->orphan_count); 547 inet_sk_set_state(sk, TCP_CLOSE); 548 sock_set_flag(sk, SOCK_DEAD); 549 inet_csk_destroy_sock(sk); 550 } 551 return ok; 552 } 553 EXPORT_SYMBOL_GPL(inet_ehash_nolisten); 554 555 static int inet_reuseport_add_sock(struct sock *sk, 556 struct inet_listen_hashbucket *ilb) 557 { 558 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 559 struct sock *sk2; 560 kuid_t uid = sock_i_uid(sk); 561 562 sk_for_each_rcu(sk2, &ilb->head) { 563 if (sk2 != sk && 564 sk2->sk_family == sk->sk_family && 565 ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 566 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 567 inet_csk(sk2)->icsk_bind_hash == tb && 568 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 569 inet_rcv_saddr_equal(sk, sk2, false)) 570 return reuseport_add_sock(sk, sk2); 571 } 572 573 return reuseport_alloc(sk); 574 } 575 576 int __inet_hash(struct sock *sk, struct sock *osk) 577 { 578 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 579 struct inet_listen_hashbucket *ilb; 580 int err = 0; 581 582 if (sk->sk_state != TCP_LISTEN) { 583 inet_ehash_nolisten(sk, osk); 584 return 0; 585 } 586 WARN_ON(!sk_unhashed(sk)); 587 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 588 589 spin_lock(&ilb->lock); 590 if (sk->sk_reuseport) { 591 err = inet_reuseport_add_sock(sk, ilb); 592 if (err) 593 goto unlock; 594 } 595 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 596 sk->sk_family == AF_INET6) 597 hlist_add_tail_rcu(&sk->sk_node, &ilb->head); 598 else 599 hlist_add_head_rcu(&sk->sk_node, &ilb->head); 600 inet_hash2(hashinfo, sk); 601 ilb->count++; 602 sock_set_flag(sk, SOCK_RCU_FREE); 603 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 604 unlock: 605 spin_unlock(&ilb->lock); 606 607 return err; 608 } 609 EXPORT_SYMBOL(__inet_hash); 610 611 int inet_hash(struct sock *sk) 612 { 613 int err = 0; 614 615 if (sk->sk_state != TCP_CLOSE) { 616 local_bh_disable(); 617 err = __inet_hash(sk, NULL); 618 local_bh_enable(); 619 } 620 621 return err; 622 } 623 EXPORT_SYMBOL_GPL(inet_hash); 624 625 void inet_unhash(struct sock *sk) 626 { 627 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 628 struct inet_listen_hashbucket *ilb = NULL; 629 spinlock_t *lock; 630 631 if (sk_unhashed(sk)) 632 return; 633 634 if (sk->sk_state == TCP_LISTEN) { 635 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 636 lock = &ilb->lock; 637 } else { 638 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 639 } 640 spin_lock_bh(lock); 641 if (sk_unhashed(sk)) 642 goto unlock; 643 644 if (rcu_access_pointer(sk->sk_reuseport_cb)) 645 reuseport_detach_sock(sk); 646 if (ilb) { 647 inet_unhash2(hashinfo, sk); 648 __sk_del_node_init(sk); 649 ilb->count--; 650 } else { 651 __sk_nulls_del_node_init_rcu(sk); 652 } 653 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 654 unlock: 655 spin_unlock_bh(lock); 656 } 657 EXPORT_SYMBOL_GPL(inet_unhash); 658 659 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 660 struct sock *sk, u32 port_offset, 661 int (*check_established)(struct inet_timewait_death_row *, 662 struct sock *, __u16, struct inet_timewait_sock **)) 663 { 664 struct inet_hashinfo *hinfo = death_row->hashinfo; 665 struct inet_timewait_sock *tw = NULL; 666 struct inet_bind_hashbucket *head; 667 int port = inet_sk(sk)->inet_num; 668 struct net *net = sock_net(sk); 669 struct inet_bind_bucket *tb; 670 u32 remaining, offset; 671 int ret, i, low, high; 672 static u32 hint; 673 674 if (port) { 675 head = &hinfo->bhash[inet_bhashfn(net, port, 676 hinfo->bhash_size)]; 677 tb = inet_csk(sk)->icsk_bind_hash; 678 spin_lock_bh(&head->lock); 679 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 680 inet_ehash_nolisten(sk, NULL); 681 spin_unlock_bh(&head->lock); 682 return 0; 683 } 684 spin_unlock(&head->lock); 685 /* No definite answer... Walk to established hash table */ 686 ret = check_established(death_row, sk, port, NULL); 687 local_bh_enable(); 688 return ret; 689 } 690 691 inet_get_local_port_range(net, &low, &high); 692 high++; /* [32768, 60999] -> [32768, 61000[ */ 693 remaining = high - low; 694 if (likely(remaining > 1)) 695 remaining &= ~1U; 696 697 offset = (hint + port_offset) % remaining; 698 /* In first pass we try ports of @low parity. 699 * inet_csk_get_port() does the opposite choice. 700 */ 701 offset &= ~1U; 702 other_parity_scan: 703 port = low + offset; 704 for (i = 0; i < remaining; i += 2, port += 2) { 705 if (unlikely(port >= high)) 706 port -= remaining; 707 if (inet_is_local_reserved_port(net, port)) 708 continue; 709 head = &hinfo->bhash[inet_bhashfn(net, port, 710 hinfo->bhash_size)]; 711 spin_lock_bh(&head->lock); 712 713 /* Does not bother with rcv_saddr checks, because 714 * the established check is already unique enough. 715 */ 716 inet_bind_bucket_for_each(tb, &head->chain) { 717 if (net_eq(ib_net(tb), net) && tb->port == port) { 718 if (tb->fastreuse >= 0 || 719 tb->fastreuseport >= 0) 720 goto next_port; 721 WARN_ON(hlist_empty(&tb->owners)); 722 if (!check_established(death_row, sk, 723 port, &tw)) 724 goto ok; 725 goto next_port; 726 } 727 } 728 729 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 730 net, head, port); 731 if (!tb) { 732 spin_unlock_bh(&head->lock); 733 return -ENOMEM; 734 } 735 tb->fastreuse = -1; 736 tb->fastreuseport = -1; 737 goto ok; 738 next_port: 739 spin_unlock_bh(&head->lock); 740 cond_resched(); 741 } 742 743 offset++; 744 if ((offset & 1) && remaining > 1) 745 goto other_parity_scan; 746 747 return -EADDRNOTAVAIL; 748 749 ok: 750 hint += i + 2; 751 752 /* Head lock still held and bh's disabled */ 753 inet_bind_hash(sk, tb, port); 754 if (sk_unhashed(sk)) { 755 inet_sk(sk)->inet_sport = htons(port); 756 inet_ehash_nolisten(sk, (struct sock *)tw); 757 } 758 if (tw) 759 inet_twsk_bind_unhash(tw, hinfo); 760 spin_unlock(&head->lock); 761 if (tw) 762 inet_twsk_deschedule_put(tw); 763 local_bh_enable(); 764 return 0; 765 } 766 767 /* 768 * Bind a port for a connect operation and hash it. 769 */ 770 int inet_hash_connect(struct inet_timewait_death_row *death_row, 771 struct sock *sk) 772 { 773 u32 port_offset = 0; 774 775 if (!inet_sk(sk)->inet_num) 776 port_offset = inet_sk_port_offset(sk); 777 return __inet_hash_connect(death_row, sk, port_offset, 778 __inet_check_established); 779 } 780 EXPORT_SYMBOL_GPL(inet_hash_connect); 781 782 void inet_hashinfo_init(struct inet_hashinfo *h) 783 { 784 int i; 785 786 for (i = 0; i < INET_LHTABLE_SIZE; i++) { 787 spin_lock_init(&h->listening_hash[i].lock); 788 INIT_HLIST_HEAD(&h->listening_hash[i].head); 789 h->listening_hash[i].count = 0; 790 } 791 792 h->lhash2 = NULL; 793 } 794 EXPORT_SYMBOL_GPL(inet_hashinfo_init); 795 796 void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 797 unsigned long numentries, int scale, 798 unsigned long low_limit, 799 unsigned long high_limit) 800 { 801 unsigned int i; 802 803 h->lhash2 = alloc_large_system_hash(name, 804 sizeof(*h->lhash2), 805 numentries, 806 scale, 807 0, 808 NULL, 809 &h->lhash2_mask, 810 low_limit, 811 high_limit); 812 813 for (i = 0; i <= h->lhash2_mask; i++) { 814 spin_lock_init(&h->lhash2[i].lock); 815 INIT_HLIST_HEAD(&h->lhash2[i].head); 816 h->lhash2[i].count = 0; 817 } 818 } 819 820 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 821 { 822 unsigned int locksz = sizeof(spinlock_t); 823 unsigned int i, nblocks = 1; 824 825 if (locksz != 0) { 826 /* allocate 2 cache lines or at least one spinlock per cpu */ 827 nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); 828 nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); 829 830 /* no more locks than number of hash buckets */ 831 nblocks = min(nblocks, hashinfo->ehash_mask + 1); 832 833 hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 834 if (!hashinfo->ehash_locks) 835 return -ENOMEM; 836 837 for (i = 0; i < nblocks; i++) 838 spin_lock_init(&hashinfo->ehash_locks[i]); 839 } 840 hashinfo->ehash_locks_mask = nblocks - 1; 841 return 0; 842 } 843 EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); 844