/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bind-hash chain lock for snum must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		release_net(ib_net(tb));
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
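/*
 * Note: inet_bind_hash() and __inet_put_port() are the two ends of a bind
 * bucket's lifetime: the former links a socket into tb->owners and bumps
 * num_owners, the latter undoes both and lets inet_bind_bucket_destroy()
 * free the bucket once its owner list empties.
 */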
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	atomic_dec(&hashinfo->bsockets);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		struct hlist_node *node;
		inet_bind_bucket_for_each(tb, node, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!node) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
			!ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;
		score = sk->sk_family == PF_INET ? 2 : 1;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 4;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 4;
		}
	}
	return score;
}

/*
 * Don't inline this cruft. There are some nice properties to exploit here.
 * The BSD API does not allow a listening sock to specify the remote port or
 * the remote address for the connection, so both are always treated as
 * wildcards during the search, since they can never be anything else.
 */
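/*
 * compute_score() above ranks the candidate listeners: a bound local
 * address and a bound device each add 4 to the score, and an AF_INET
 * socket outranks an AF_INET6 one (2 vs 1). So, for example, an IPv4
 * listener bound to the exact destination address on the matching
 * interface scores 2 + 4 + 4 = 10, while an unbound wildcard IPv4
 * listener scores only 2.
 */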
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore, matches = 0, reuseport = 0;
	u32 phash = 0;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = 0;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				matches = 1;
			}
		} else if (score == hiscore && reuseport) {
			matches++;
			if (((u64)phash * matches) >> 32 == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto begintw;
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begin;
			}
			goto out;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;

begintw:
	/* Must check for a TIME_WAIT'er before going to listener hash. */
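	/*
	 * Note: if atomic_inc_not_zero() failed on the established chain
	 * above, the socket we found was already being destroyed; the
	 * connection may meanwhile be represented by a TIME_WAIT socket on
	 * this bucket's twchain, which is why we fall through and walk it
	 * here rather than giving up.
	 */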
	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_TW_MATCH(sk, net, acookie,
					 saddr, daddr, ports,
					 dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
				sk = NULL;
				goto out;
			}
			if (unlikely(!INET_TW_MATCH(sk, net, acookie,
						    saddr, daddr, ports,
						    dif))) {
				sock_put(sk);
				goto begintw;
			}
			goto out;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begintw;
	sk = NULL;
out:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;
	int twrefcnt = 0;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_TW_MATCH(sk2, net, acookie,
					 saddr, daddr, ports, dif))) {
			tw = inet_twsk(sk2);
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk2, net, acookie,
				      saddr, daddr, ports, dif)))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * a socket with a funny identity in the hash table. */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
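		/*
		 * The caller passed no twp to receive the TIME_WAIT entry,
		 * so it cannot take ownership of it; cancel the timewait
		 * timer and drop our reference right here instead.
		 */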
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_reserved_local_port(port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
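			/*
			 * Candidate port: take its bind-hash chain lock and
			 * check whether the port is entirely free or already
			 * has a bind bucket. A bucket with fastreuse or
			 * fastreuseport >= 0 was created by an explicit
			 * bind() and is skipped below; otherwise
			 * check_established() decides whether our 4-tuple
			 * would still be unique on this port.
			 */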
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0 ||
					    tb->fastreuseport >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
						     net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			tb->fastreuseport = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
			__inet_check_established, __inet_hash_nolisten);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	atomic_set(&h->bsockets, 0);
	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
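/*
 * Usage sketch (illustrative only, not part of this file): a transport
 * protocol plugs into these hashtables through its struct proto, roughly
 * as TCP does with tcp_hashinfo. "foo" below is a hypothetical protocol:
 *
 *	static struct inet_hashinfo foo_hashinfo;
 *
 *	static struct proto foo_prot = {
 *		.name		= "FOO",
 *		.hash		= inet_hash,
 *		.unhash		= inet_unhash,
 *		.get_port	= inet_csk_get_port,
 *		.h.hashinfo	= &foo_hashinfo,
 *		...
 *	};
 *
 * with inet_hashinfo_init(&foo_hashinfo) called once at protocol init,
 * and inet_hash_connect(&foo_death_row, sk) used on the connect() path
 * to pick an ephemeral port and hash the socket in one step.
 */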