1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Generic INET transport hashtables 7 * 8 * Authors: Lotsa people, from code originally in tcp 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <linux/random.h> 18 #include <linux/sched.h> 19 #include <linux/slab.h> 20 #include <linux/wait.h> 21 22 #include <net/inet_connection_sock.h> 23 #include <net/inet_hashtables.h> 24 #include <net/ip.h> 25 26 /* 27 * Allocate and initialize a new local port bind bucket. 28 * The bindhash mutex for snum's hash chain must be held here. 29 */ 30 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 31 struct inet_bind_hashbucket *head, 32 const unsigned short snum) 33 { 34 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 35 36 if (tb != NULL) { 37 tb->port = snum; 38 tb->fastreuse = 0; 39 INIT_HLIST_HEAD(&tb->owners); 40 hlist_add_head(&tb->node, &head->chain); 41 } 42 return tb; 43 } 44 45 /* 46 * Caller must hold hashbucket lock for this tb with local BH disabled 47 */ 48 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) 49 { 50 if (hlist_empty(&tb->owners)) { 51 __hlist_del(&tb->node); 52 kmem_cache_free(cachep, tb); 53 } 54 } 55 56 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 57 const unsigned short snum) 58 { 59 inet_sk(sk)->num = snum; 60 sk_add_bind_node(sk, &tb->owners); 61 inet_csk(sk)->icsk_bind_hash = tb; 62 } 63 64 /* 65 * Get rid of any references to a local port held by the given sock. 66 */ 67 static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) 68 { 69 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); 70 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 71 struct inet_bind_bucket *tb; 72 73 spin_lock(&head->lock); 74 tb = inet_csk(sk)->icsk_bind_hash; 75 __sk_del_bind_node(sk); 76 inet_csk(sk)->icsk_bind_hash = NULL; 77 inet_sk(sk)->num = 0; 78 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 79 spin_unlock(&head->lock); 80 } 81 82 void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) 83 { 84 local_bh_disable(); 85 __inet_put_port(hashinfo, sk); 86 local_bh_enable(); 87 } 88 89 EXPORT_SYMBOL(inet_put_port); 90 91 /* 92 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. 93 * Look, when several writers sleep and reader wakes them up, all but one 94 * immediately hit write lock and grab all the cpus. Exclusive sleep solves 95 * this, _but_ remember, it adds useless work on UP machines (wake up each 96 * exclusive lock release). It should be ifdefed really. 97 */ 98 void inet_listen_wlock(struct inet_hashinfo *hashinfo) 99 { 100 write_lock(&hashinfo->lhash_lock); 101 102 if (atomic_read(&hashinfo->lhash_users)) { 103 DEFINE_WAIT(wait); 104 105 for (;;) { 106 prepare_to_wait_exclusive(&hashinfo->lhash_wait, 107 &wait, TASK_UNINTERRUPTIBLE); 108 if (!atomic_read(&hashinfo->lhash_users)) 109 break; 110 write_unlock_bh(&hashinfo->lhash_lock); 111 schedule(); 112 write_lock_bh(&hashinfo->lhash_lock); 113 } 114 115 finish_wait(&hashinfo->lhash_wait, &wait); 116 } 117 } 118 119 EXPORT_SYMBOL(inet_listen_wlock); 120 121 /* 122 * Don't inline this cruft. Here are some nice properties to exploit here. The 123 * BSD API does not allow a listening sock to specify the remote port nor the 124 * remote address for the connection. So always assume those are both 125 * wildcarded during the search since they can never be otherwise. 126 */ 127 static struct sock *inet_lookup_listener_slow(const struct hlist_head *head, 128 const __be32 daddr, 129 const unsigned short hnum, 130 const int dif) 131 { 132 struct sock *result = NULL, *sk; 133 const struct hlist_node *node; 134 int hiscore = -1; 135 136 sk_for_each(sk, node, head) { 137 const struct inet_sock *inet = inet_sk(sk); 138 139 if (inet->num == hnum && !ipv6_only_sock(sk)) { 140 const __be32 rcv_saddr = inet->rcv_saddr; 141 int score = sk->sk_family == PF_INET ? 1 : 0; 142 143 if (rcv_saddr) { 144 if (rcv_saddr != daddr) 145 continue; 146 score += 2; 147 } 148 if (sk->sk_bound_dev_if) { 149 if (sk->sk_bound_dev_if != dif) 150 continue; 151 score += 2; 152 } 153 if (score == 5) 154 return sk; 155 if (score > hiscore) { 156 hiscore = score; 157 result = sk; 158 } 159 } 160 } 161 return result; 162 } 163 164 /* Optimize the common listener case. */ 165 struct sock *__inet_lookup_listener(struct inet_hashinfo *hashinfo, 166 const __be32 daddr, const unsigned short hnum, 167 const int dif) 168 { 169 struct sock *sk = NULL; 170 const struct hlist_head *head; 171 172 read_lock(&hashinfo->lhash_lock); 173 head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; 174 if (!hlist_empty(head)) { 175 const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); 176 177 if (inet->num == hnum && !sk->sk_node.next && 178 (!inet->rcv_saddr || inet->rcv_saddr == daddr) && 179 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && 180 !sk->sk_bound_dev_if) 181 goto sherry_cache; 182 sk = inet_lookup_listener_slow(head, daddr, hnum, dif); 183 } 184 if (sk) { 185 sherry_cache: 186 sock_hold(sk); 187 } 188 read_unlock(&hashinfo->lhash_lock); 189 return sk; 190 } 191 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 192 193 /* called with local bh disabled */ 194 static int __inet_check_established(struct inet_timewait_death_row *death_row, 195 struct sock *sk, __u16 lport, 196 struct inet_timewait_sock **twp) 197 { 198 struct inet_hashinfo *hinfo = death_row->hashinfo; 199 struct inet_sock *inet = inet_sk(sk); 200 __be32 daddr = inet->rcv_saddr; 201 __be32 saddr = inet->daddr; 202 int dif = sk->sk_bound_dev_if; 203 INET_ADDR_COOKIE(acookie, saddr, daddr) 204 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); 205 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); 206 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 207 struct sock *sk2; 208 const struct hlist_node *node; 209 struct inet_timewait_sock *tw; 210 211 prefetch(head->chain.first); 212 write_lock(&head->lock); 213 214 /* Check TIME-WAIT sockets first. */ 215 sk_for_each(sk2, node, &head->twchain) { 216 tw = inet_twsk(sk2); 217 218 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { 219 if (twsk_unique(sk, sk2, twp)) 220 goto unique; 221 else 222 goto not_unique; 223 } 224 } 225 tw = NULL; 226 227 /* And established part... */ 228 sk_for_each(sk2, node, &head->chain) { 229 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) 230 goto not_unique; 231 } 232 233 unique: 234 /* Must record num and sport now. Otherwise we will see 235 * in hash table socket with a funny identity. */ 236 inet->num = lport; 237 inet->sport = htons(lport); 238 sk->sk_hash = hash; 239 BUG_TRAP(sk_unhashed(sk)); 240 __sk_add_node(sk, &head->chain); 241 sock_prot_inc_use(sk->sk_prot); 242 write_unlock(&head->lock); 243 244 if (twp) { 245 *twp = tw; 246 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 247 } else if (tw) { 248 /* Silly. Should hash-dance instead... */ 249 inet_twsk_deschedule(tw, death_row); 250 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 251 252 inet_twsk_put(tw); 253 } 254 255 return 0; 256 257 not_unique: 258 write_unlock(&head->lock); 259 return -EADDRNOTAVAIL; 260 } 261 262 static inline u32 inet_sk_port_offset(const struct sock *sk) 263 { 264 const struct inet_sock *inet = inet_sk(sk); 265 return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, 266 inet->dport); 267 } 268 269 /* 270 * Bind a port for a connect operation and hash it. 271 */ 272 int inet_hash_connect(struct inet_timewait_death_row *death_row, 273 struct sock *sk) 274 { 275 struct inet_hashinfo *hinfo = death_row->hashinfo; 276 const unsigned short snum = inet_sk(sk)->num; 277 struct inet_bind_hashbucket *head; 278 struct inet_bind_bucket *tb; 279 int ret; 280 281 if (!snum) { 282 int low = sysctl_local_port_range[0]; 283 int high = sysctl_local_port_range[1]; 284 int range = high - low; 285 int i; 286 int port; 287 static u32 hint; 288 u32 offset = hint + inet_sk_port_offset(sk); 289 struct hlist_node *node; 290 struct inet_timewait_sock *tw = NULL; 291 292 local_bh_disable(); 293 for (i = 1; i <= range; i++) { 294 port = low + (i + offset) % range; 295 head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; 296 spin_lock(&head->lock); 297 298 /* Does not bother with rcv_saddr checks, 299 * because the established check is already 300 * unique enough. 301 */ 302 inet_bind_bucket_for_each(tb, node, &head->chain) { 303 if (tb->port == port) { 304 BUG_TRAP(!hlist_empty(&tb->owners)); 305 if (tb->fastreuse >= 0) 306 goto next_port; 307 if (!__inet_check_established(death_row, 308 sk, port, 309 &tw)) 310 goto ok; 311 goto next_port; 312 } 313 } 314 315 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); 316 if (!tb) { 317 spin_unlock(&head->lock); 318 break; 319 } 320 tb->fastreuse = -1; 321 goto ok; 322 323 next_port: 324 spin_unlock(&head->lock); 325 } 326 local_bh_enable(); 327 328 return -EADDRNOTAVAIL; 329 330 ok: 331 hint += i; 332 333 /* Head lock still held and bh's disabled */ 334 inet_bind_hash(sk, tb, port); 335 if (sk_unhashed(sk)) { 336 inet_sk(sk)->sport = htons(port); 337 __inet_hash(hinfo, sk, 0); 338 } 339 spin_unlock(&head->lock); 340 341 if (tw) { 342 inet_twsk_deschedule(tw, death_row); 343 inet_twsk_put(tw); 344 } 345 346 ret = 0; 347 goto out; 348 } 349 350 head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; 351 tb = inet_csk(sk)->icsk_bind_hash; 352 spin_lock_bh(&head->lock); 353 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 354 __inet_hash(hinfo, sk, 0); 355 spin_unlock_bh(&head->lock); 356 return 0; 357 } else { 358 spin_unlock(&head->lock); 359 /* No definite answer... Walk to established hash table */ 360 ret = __inet_check_established(death_row, sk, snum, NULL); 361 out: 362 local_bh_enable(); 363 return ret; 364 } 365 } 366 367 EXPORT_SYMBOL_GPL(inet_hash_connect); 368