/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash lock for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		release_net(ib_net(tb));
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}
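
/*
 * Usage sketch (illustrative, not from this file): a typical bind path
 * looks up or creates the bucket for the chosen port under the bhash
 * chain lock and then attaches the socket to it:
 *
 *	spin_lock(&head->lock);
 *	inet_bind_bucket_for_each(tb, &head->chain)
 *		if (net_eq(ib_net(tb), net) && tb->port == snum)
 *			goto found;
 *	tb = inet_bind_bucket_create(cachep, net, head, snum);
 * found:
 *	if (tb)
 *		inet_bind_hash(sk, tb, snum);
 *	spin_unlock(&head->lock);
 */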

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	atomic_dec(&hashinfo->bsockets);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
			!ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;
		score = sk->sk_family == PF_INET ? 2 : 1;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 4;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 4;
		}
	}
	return score;
}
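
/*
 * Worked example (illustrative): for a packet to 192.0.2.1:80, an
 * AF_INET listener on 0.0.0.0:80 scores 2; one bound to 192.0.2.1:80
 * scores 6; one additionally bound to the receiving device scores 10.
 * An AF_INET6 socket accepting v4-mapped traffic starts from 1 instead
 * of 2, so between otherwise identical binds the AF_INET socket wins.
 */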

/*
 * Don't inline this cruft. There are some nice properties to exploit here.
 * The BSD API does not allow a listening sock to specify the remote port
 * nor the remote address for the connection. So always assume those are
 * both wildcarded during the search since they can never be otherwise.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore, matches = 0, reuseport = 0;
	u32 phash = 0;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = 0;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				matches = 1;
			}
		} else if (score == hiscore && reuseport) {
			matches++;
			if (((u64)phash * matches) >> 32 == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart the lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto begintw;
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begin;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart the lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
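
	/*
	 * Sockets come from a SLAB_DESTROY_BY_RCU cache, so an entry
	 * found under RCU can be freed and recycled for a different
	 * connection while we are looking at it.  That is why the keys
	 * are re-checked with INET_MATCH after sk_refcnt has been
	 * taken, and why a failed re-check restarts the whole walk.
	 */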

begintw:
	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_TW_MATCH(sk, net, acookie,
					 saddr, daddr, ports,
					 dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
				sk = NULL;
				goto out;
			}
			if (unlikely(!INET_TW_MATCH(sk, net, acookie,
						    saddr, daddr, ports,
						    dif))) {
				inet_twsk_put(inet_twsk(sk));
				goto begintw;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart the lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begintw;
	sk = NULL;
out:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;
	int twrefcnt = 0;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_TW_MATCH(sk2, net, acookie,
					 saddr, daddr, ports, dif))) {
			tw = inet_twsk(sk2);
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk2, net, acookie,
				      saddr, daddr, ports, dif)))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity. */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
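
	/*
	 * If the caller passed a twp pointer it takes ownership of the
	 * displaced TIME_WAIT socket and must release it itself;
	 * otherwise the twsk is descheduled and dropped right here.
	 */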
	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
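		/*
		 * Walk the whole ephemeral range at most once, starting
		 * at an offset derived from a secure per-destination
		 * hash (port_offset) plus a global hint, so consecutive
		 * connects spread across ports instead of fighting over
		 * the lowest free one.
		 */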
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_reserved_local_port(port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0 ||
					    tb->fastreuseport >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			tb->fastreuseport = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
			__inet_check_established, __inet_hash_nolisten);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	atomic_set(&h->bsockets, 0);
	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
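
/*
 * Usage sketch (illustrative; the details live in the individual
 * protocols, not in this file): a transport protocol plugs these
 * helpers into its struct proto and initializes its table at boot,
 * roughly as TCP does:
 *
 *	struct proto tcp_prot = {
 *		...
 *		.hash		= inet_hash,
 *		.unhash		= inet_unhash,
 *		.h.hashinfo	= &tcp_hashinfo,
 *	};
 *
 *	inet_hashinfo_init(&tcp_hashinfo);
 */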