/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		tb->ib_net    = net;
		tb->port      = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}

EXPORT_SYMBOL(inet_put_port);
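/*
 * Illustrative sketch (not part of this file): a bind/get_port path such as
 * inet_csk_get_port() is expected to use the helpers above roughly as
 * follows -- take the bucket chain lock, create (or find) the bucket for the
 * chosen port, and attach the socket to it (conflict checks omitted):
 *
 *	head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
 *	spin_lock(&head->lock);
 *	tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
 *				     net, head, snum);
 *	if (tb != NULL)
 *		inet_bind_hash(sk, tb, snum);
 *	spin_unlock(&head->lock);
 *
 * inet_put_port() later undoes this when the socket releases the port.
 */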
/*
 * Without WQ_FLAG_EXCLUSIVE this lock is fine on UP but can be very bad on
 * SMP: when several writers sleep and a reader wakes them up, all but one
 * immediately hit the write lock and grab all the CPUs. An exclusive sleep
 * solves this, _but_ remember that it adds useless work on UP machines
 * (a wake up on each exclusive lock release). It should really be ifdefed.
 */
void inet_listen_wlock(struct inet_hashinfo *hashinfo)
	__acquires(hashinfo->lhash_lock)
{
	write_lock(&hashinfo->lhash_lock);

	if (atomic_read(&hashinfo->lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&hashinfo->lhash_users))
				break;
			write_unlock_bh(&hashinfo->lhash_lock);
			schedule();
			write_lock_bh(&hashinfo->lhash_lock);
		}

		finish_wait(&hashinfo->lhash_wait, &wait);
	}
}

/*
 * Don't inline this cruft. There are some nice properties to exploit here.
 * The BSD API does not allow a listening sock to specify the remote port nor
 * the remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */
static struct sock *inet_lookup_listener_slow(struct net *net,
					      const struct hlist_head *head,
					      const __be32 daddr,
					      const unsigned short hnum,
					      const int dif)
{
	struct sock *result = NULL, *sk;
	const struct hlist_node *node;
	int hiscore = -1;

	sk_for_each(sk, node, head) {
		const struct inet_sock *inet = inet_sk(sk);

		if (sk->sk_net == net && inet->num == hnum &&
		    !ipv6_only_sock(sk)) {
			const __be32 rcv_saddr = inet->rcv_saddr;
			int score = sk->sk_family == PF_INET ? 1 : 0;

			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result  = sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk = NULL;
	const struct hlist_head *head;

	read_lock(&hashinfo->lhash_lock);
	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if && sk->sk_net == net)
			goto sherry_cache;
		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&hashinfo->lhash_lock);
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
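/*
 * Illustrative caller sketch (not part of this file): a receive-path lookup
 * for a listener returns a sock with a reference held, so the caller must
 * drop it with sock_put() when done. Assuming a TCP header "th" and the
 * usual tcp_hashinfo table:
 *
 *	sk = __inet_lookup_listener(net, &tcp_hashinfo, ip_hdr(skb)->daddr,
 *				    ntohs(th->dest), inet_iif(skb));
 *	if (sk != NULL) {
 *		... hand the skb to sk ...
 *		sock_put(sk);
 *	}
 */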
struct sock * __inet_lookup_established(struct net *net,
					struct inet_hashinfo *hashinfo,
					const __be32 saddr, const __be16 sport,
					const __be32 daddr, const u16 hnum,
					const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_node *node;
	/* Optimize here for a direct hit; only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

	prefetch(head->chain.first);
	read_lock(lock);
	sk_for_each(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
			       saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
				  saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
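/*
 * Illustrative caller sketch (not part of this file): demux on the receive
 * path tries the established table first and only then falls back to the
 * listener lookup; either way the caller owns a reference and must
 * sock_put() it when finished. Assuming an IP header "iph" and a TCP
 * header "th":
 *
 *	sk = __inet_lookup_established(net, &tcp_hashinfo,
 *				       iph->saddr, th->source,
 *				       iph->daddr, ntohs(th->dest),
 *				       inet_iif(skb));
 *	if (sk == NULL)
 *		sk = __inet_lookup_listener(net, &tcp_hashinfo, iph->daddr,
 *					    ntohs(th->dest), inet_iif(skb));
 */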
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->rcv_saddr;
	__be32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;
	struct net *net = sk->sk_net;

	prefetch(head->chain.first);
	write_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
				  saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
			       saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hash = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(lock);
	return -EADDRNOTAVAIL;
}

static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
					  inet->dport);
}

void __inet_hash_nolisten(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	struct hlist_head *list;
	rwlock_t *lock;
	struct inet_ehash_bucket *head;

	BUG_TRAP(sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	write_lock(lock);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
	struct hlist_head *list;
	rwlock_t *lock;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk);
		return;
	}

	BUG_TRAP(sk_unhashed(sk));
	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
	lock = &hashinfo->lhash_lock;

	inet_listen_wlock(hashinfo);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);
	wake_up(&hashinfo->lhash_wait);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	rwlock_t *lock;
	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;

	if (sk_unhashed(sk))
		goto out;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		inet_listen_wlock(hashinfo);
		lock = &hashinfo->lhash_lock;
	} else {
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
		write_lock_bh(lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_inuse_add(sk->sk_prot, -1);
	write_unlock_bh(lock);
out:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&hashinfo->lhash_wait);
}
EXPORT_SYMBOL_GPL(inet_unhash);
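/*
 * Illustrative sketch (not part of this file): transports plug these
 * hash/unhash routines into their struct proto so generic socket code can
 * (un)hash them; the protocol name "foo" below is made up, the fields are
 * the ones this file relies on via sk->sk_prot:
 *
 *	struct proto foo_prot = {
 *		...
 *		.hash		= inet_hash,
 *		.unhash		= inet_unhash,
 *		.get_port	= inet_csk_get_port,
 *		.hashinfo	= &foo_hashinfo,
 *		...
 *	};
 */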
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		void (*hash)(struct sock *sk))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sk->sk_net;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->ib_net == net && tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
						     net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			hash(sk);
		}
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
				   __inet_check_established,
				   __inet_hash_nolisten);
}

EXPORT_SYMBOL_GPL(inet_hash_connect);
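/*
 * Illustrative caller sketch (not part of this file): an active open such
 * as tcp_v4_connect() is expected to call inet_hash_connect() once the
 * route and destination are set up, so that an ephemeral source port is
 * chosen (if the socket is not already bound) and the socket is hashed
 * into the established table:
 *
 *	err = inet_hash_connect(&tcp_death_row, sk);
 *	if (err)
 *		goto failure;
 */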