/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 *
 * GFP_ATOMIC is used because callers hold the bucket spinlock (and may
 * have BH disabled), so sleeping allocations are not permitted.
 * Returns NULL on allocation failure.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		tb->ib_net = net;
		tb->port = snum;
		tb->fastreuse = 0;
		/* No owners yet; the bucket is live once linked on the chain. */
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 *
 * Frees the bucket only when no socket owns the port any more; a bucket
 * with remaining owners is left untouched.
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

/*
 * Bind the socket to local port snum: record the port in the inet socket,
 * add the socket to the bucket's owner list, and remember the bucket so
 * __inet_put_port() can find it later.
 *
 * NOTE(review): callers presumably hold the bind-bucket lock here, as for
 * the other tb operations above — confirm at call sites.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 *
 * Unlinks sk from its bind bucket, clears the cached bucket pointer and
 * local port, and releases the bucket if sk was its last owner.  Caller
 * must have BH disabled (see inet_put_port below).
 */
static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
{
	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

/* BH-safe wrapper around __inet_put_port(). */
void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(hashinfo, sk);
	local_bh_enable();
}

EXPORT_SYMBOL(inet_put_port);

/*
 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */
void inet_listen_wlock(struct inet_hashinfo *hashinfo)
	__acquires(hashinfo->lhash_lock)
{
	write_lock(&hashinfo->lhash_lock);

	if (atomic_read(&hashinfo->lhash_users)) {
		DEFINE_WAIT(wait);

		/* Readers are counted in lhash_users; sleep (exclusively,
		 * see comment above) until they have all drained, dropping
		 * the write lock across each schedule().
		 */
		for (;;) {
			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&hashinfo->lhash_users))
				break;
			write_unlock_bh(&hashinfo->lhash_lock);
			schedule();
			write_lock_bh(&hashinfo->lhash_lock);
		}

		finish_wait(&hashinfo->lhash_wait, &wait);
	}
}

EXPORT_SYMBOL(inet_listen_wlock);

/*
 * Don't inline this cruft. Here are some nice properties to exploit here. The
 * BSD API does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 *
 * Scoring: +1 for an AF_INET (non-mapped) socket, +2 for an exact local
 * address match, +2 for an exact bound-device match; 5 is a perfect score
 * and ends the walk early.  Returns the best-matching socket or NULL.
 * Caller must hold lhash_lock for reading (see __inet_lookup_listener).
 */
static struct sock *inet_lookup_listener_slow(struct net *net,
					      const struct hlist_head *head,
					      const __be32 daddr,
					      const unsigned short hnum,
					      const int dif)
{
	struct sock *result = NULL, *sk;
	const struct hlist_node *node;
	int hiscore = -1;

	sk_for_each(sk, node, head) {
		const struct inet_sock *inet = inet_sk(sk);

		if (sk->sk_net == net && inet->num == hnum &&
		    !ipv6_only_sock(sk)) {
			const __be32 rcv_saddr = inet->rcv_saddr;
			int score = sk->sk_family == PF_INET ? 1 : 0;

			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case.
 *
 * If the chain has exactly one entry (!sk_node.next after __sk_head) and
 * it is an unambiguous match, skip the scoring walk entirely.  On success
 * the returned socket holds an extra reference; caller must sock_put() it.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk = NULL;
	const struct hlist_head *head;

	read_lock(&hashinfo->lhash_lock);
	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if && sk->sk_net == net)
			goto sherry_cache;
		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&hashinfo->lhash_lock);
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/*
 * Look up a fully-specified established connection (saddr/sport ->
 * daddr/hnum), checking the established chain first and then the
 * TIME_WAIT chain of the same bucket.  Returns the socket with a
 * reference held, or NULL if no match; caller falls back to the
 * listener lookup.
 */
struct sock * __inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

	prefetch(head->chain.first);
	read_lock(lock);
	sk_for_each(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
/*
 * Check whether binding sk to local port lport would collide with an
 * existing established or TIME_WAIT connection having the same 4-tuple.
 *
 * Note the reversed address roles: for the lookup, "daddr" is our local
 * address (inet->rcv_saddr) and "saddr" is the peer's, mirroring how the
 * peer's packets would hash.
 *
 * On success (unique), the socket is inserted into the established hash
 * under the bucket write lock.  If a recyclable TIME_WAIT socket was
 * displaced: when twp != NULL it is handed back to the caller (who holds
 * bind locks and finishes the job), otherwise it is descheduled and
 * released here.  Returns 0 if unique, -EADDRNOTAVAIL on collision.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->rcv_saddr;
	__be32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_node *node;
	struct inet_timewait_sock *tw;
	struct net *net = sk->sk_net;

	prefetch(head->chain.first);
	write_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			/* A TIME_WAIT match may still be reusable if the
			 * protocol says so (e.g. via timestamps).
			 */
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity.
	 */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hash = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);

	if (twp) {
		/* Caller takes ownership of the displaced twsk. */
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	write_unlock(lock);
	return -EADDRNOTAVAIL;
}

/*
 * Per-connection randomized offset into the ephemeral port space,
 * derived from the connection's addresses so repeated connects to the
 * same peer probe ports in a hard-to-predict order.
 */
static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
					  inet->dport);
}

/*
 * Insert a non-listening socket into the established hash.  Computes
 * sk->sk_hash and links the socket under the per-bucket write lock.
 * The socket must be unhashed on entry.
 */
void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk)
{
	struct hlist_head *list;
	rwlock_t *lock;
	struct inet_ehash_bucket *head;

	BUG_TRAP(sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	write_lock(lock);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	write_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

/*
 * Hash a socket into the appropriate table: established hash for
 * non-listeners, listening hash (under lhash_lock, waiting out any
 * readers via inet_listen_wlock) for TCP_LISTEN sockets.  Wakes
 * lhash_wait so blocked lock waiters can make progress.
 */
void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
{
	struct hlist_head *list;
	rwlock_t *lock;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(hashinfo, sk);
		return;
	}

	BUG_TRAP(sk_unhashed(sk));
	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
	lock = &hashinfo->lhash_lock;

	inet_listen_wlock(hashinfo);
	__sk_add_node(sk, list);
	sock_prot_inuse_add(sk->sk_prot, 1);
	/* inet_listen_wlock() acquired lhash_lock; release it here. */
	write_unlock(lock);
	wake_up(&hashinfo->lhash_wait);
}
EXPORT_SYMBOL_GPL(__inet_hash);

/*
 * Bind sk to a local port for an outgoing connection and hash it.
 *
 * If the socket has no local port yet (snum == 0), walk the ephemeral
 * port range starting at a per-connection random offset, skipping ports
 * whose bind bucket was created by bind() (fastreuse >= 0) and probing
 * the rest with check_established(); buckets we create ourselves get
 * fastreuse = -1 so bind() will never share them.  If snum is already
 * set, a fast path hashes directly when sk is the sole owner of its
 * bucket, otherwise the established check decides.
 *
 * check_established and hash are pluggable so address families can reuse
 * this logic (see inet_hash_connect below for the IPv4 wiring).
 * Returns 0 on success or a negative errno (-EADDRNOTAVAIL when the
 * port space is exhausted or the 4-tuple collides).
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		void (*hash)(struct inet_hashinfo *, struct sock *))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sk->sk_net;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + inet_sk_port_offset(sk);
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->ib_net == net && tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		/* Advance the shared hint so the next connect starts its
		 * search past the port we just claimed.
		 */
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			hash(hinfo, sk);
		}
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
	tb  = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		/* sk is the only owner of its bind bucket: no possible
		 * 4-tuple conflict, hash it straight away.
		 */
		hash(hinfo, sk);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		/* BH was disabled by local_bh_disable() or spin_lock_bh()
		 * on the path that jumped here.
		 */
		local_bh_enable();
		return ret;
	}
}
EXPORT_SYMBOL_GPL(__inet_hash_connect);

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk,
			__inet_check_established, __inet_hash_nolisten);
}

EXPORT_SYMBOL_GPL(inet_hash_connect);