/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port = snum;
		tb->fastreuse = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		release_net(ib_net(tb));
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}
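/*
 * Typical bind-time usage, as a rough sketch only (the real logic lives in
 * inet_csk_get_port() in inet_connection_sock.c and additionally handles
 * reuse rules): the caller takes the bucket lock with BHs disabled, looks
 * for an existing bucket for this port and namespace, creates one if none
 * exists, and then attaches the socket:
 *
 *	struct inet_bind_hashbucket *head;
 *	struct inet_bind_bucket *tb;
 *	struct hlist_node *node;
 *
 *	head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)];
 *	spin_lock_bh(&head->lock);
 *	inet_bind_bucket_for_each(tb, node, &head->chain)
 *		if (net_eq(ib_net(tb), net) && tb->port == snum)
 *			goto tb_found;
 *	tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
 *				     net, head, snum);
 *	if (!tb)
 *		goto fail_unlock;	(out of memory)
 * tb_found:
 *	inet_bind_hash(sk, tb, snum);
 *	spin_unlock_bh(&head->lock);
 */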
/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	atomic_dec(&hashinfo->bsockets);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		struct hlist_node *node;
		inet_bind_bucket_for_each(tb, node, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!node) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
	    !ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;
		score = sk->sk_family == PF_INET ? 1 : 0;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 2;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 2;
		}
	}
	return score;
}
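/*
 * Worked example of the scoring above: for a packet to daddr 192.0.2.1
 * port 80 arriving on ifindex 2,
 *
 *	AF_INET listener on 0.0.0.0:80, any device:	score 1
 *	AF_INET listener on 192.0.2.1:80, any device:	score 1 + 2 = 3
 *	AF_INET listener on 192.0.2.1:80, bound dev 2:	score 1 + 2 + 2 = 5
 *	listener on 198.51.100.1:80, or bound to dev 3:	score -1 (never picked)
 *
 * An AF_INET6 socket that also accepts v4 traffic starts from 0 instead of
 * 1, so a plain AF_INET listener wins when everything else ties.
 */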
/*
 * Don't inline this cruft. Here are some nice properties to exploit here. The
 * BSD API does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = -1;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
			       saddr, daddr, ports, dif)) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto begintw;
			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
						 saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begin;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;

begintw:
	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
				  saddr, daddr, ports, dif)) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
				sk = NULL;
				goto out;
			}
			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
						    saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begintw;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begintw;
	sk = NULL;
out:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
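/*
 * Both lookups above run locklessly under RCU: the nulls value at the end
 * of each chain encodes which chain the walk finished in, so a reader that
 * was migrated to another chain by a concurrent writer sees an unexpected
 * end marker and restarts.  A caller sketch (hedged; the real TCP input
 * path goes through wrapper helpers in include/net/inet_hashtables.h):
 *
 *	struct sock *sk;
 *
 *	sk = __inet_lookup_established(net, &tcp_hashinfo,
 *				       iph->saddr, th->source,
 *				       iph->daddr, ntohs(th->dest),
 *				       inet_iif(skb));
 *	if (!sk)
 *		sk = __inet_lookup_listener(net, &tcp_hashinfo,
 *					    iph->daddr, ntohs(th->dest),
 *					    inet_iif(skb));
 *
 * Either function returns a socket with its refcount already taken, so the
 * caller must eventually sock_put() whatever it got back.
 */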
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;
	int twrefcnt = 0;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
				  saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
			       saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}
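/*
 * Note on the TIME-WAIT branch above: connect() may reuse a four-tuple
 * that is still in TIME-WAIT when twsk_unique() approves it; for TCP that
 * is tcp_twsk_unique(), which relies on recent timestamps (PAWS) to keep
 * old duplicate segments distinguishable.  The twp contract mirrors the
 * code above: with a non-NULL twp the caller receives the timewait sock
 * and owns the deschedule and final put; with a NULL twp this function
 * deschedules and releases it itself ("Silly. Should hash-dance
 * instead...").
 */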
static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
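/*
 * Worked example of the ephemeral port walk in __inet_hash_connect() below,
 * assuming the default ip_local_port_range of 32768..61000 (so remaining =
 * 28233) and hint + port_offset == 100000:
 *
 *	i = 1:	port = 32768 + (1 + 100000) % 28233 = 32768 + 15302 = 48070
 *	i = 2:	port = 48071, and so on, wrapping back to 32768 after 61000
 *
 * Each connection thus probes a contiguous run of ports starting at a
 * position that mixes a global, monotonically advanced hint with the
 * per-destination secure offset from inet_sk_port_offset(); "hint += i" on
 * success makes the next search continue roughly where this one stopped.
 */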
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_reserved_local_port(port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
						     net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
				   __inet_check_established, __inet_hash_nolisten);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	atomic_set(&h->bsockets, 0);
	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
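
/*
 * Usage sketch (hedged): each transport owns one inet_hashinfo and points
 * sk_prot->h.hashinfo at it.  TCP, for example, runs inet_hashinfo_init()
 * on tcp_hashinfo from tcp_init() and, once tcp_v4_connect() has a route
 * and destination, calls:
 *
 *	err = inet_hash_connect(&tcp_death_row, sk);
 *
 * to pick a source port and insert the socket into the established hash in
 * one step.
 */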