xref: /openbmc/linux/net/ipv4/inet_hashtables.c (revision b85d4594)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic INET transport hashtables
7  *
8  * Authors:	Lotsa people, from code originally in tcp
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 #include <linux/module.h>
17 #include <linux/random.h>
18 #include <linux/sched.h>
19 #include <linux/slab.h>
20 #include <linux/wait.h>
21 #include <linux/vmalloc.h>
22 
23 #include <net/inet_connection_sock.h>
24 #include <net/inet_hashtables.h>
25 #include <net/secure_seq.h>
26 #include <net/ip.h>
27 
28 static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
29 			const __u16 lport, const __be32 faddr,
30 			const __be16 fport)
31 {
32 	static u32 inet_ehash_secret __read_mostly;
33 
34 	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
35 
36 	return __inet_ehashfn(laddr, lport, faddr, fport,
37 			      inet_ehash_secret + net_hash_mix(net));
38 }
39 
40 /* This function handles inet_sock, but also timewait and request sockets
41  * for IPv4/IPv6.
42  */
43 u32 sk_ehashfn(const struct sock *sk)
44 {
45 #if IS_ENABLED(CONFIG_IPV6)
46 	if (sk->sk_family == AF_INET6 &&
47 	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
48 		return inet6_ehashfn(sock_net(sk),
49 				     &sk->sk_v6_rcv_saddr, sk->sk_num,
50 				     &sk->sk_v6_daddr, sk->sk_dport);
51 #endif
52 	return inet_ehashfn(sock_net(sk),
53 			    sk->sk_rcv_saddr, sk->sk_num,
54 			    sk->sk_daddr, sk->sk_dport);
55 }
56 
57 /*
58  * Allocate and initialize a new local port bind bucket.
59  * The bindhash mutex for snum's hash chain must be held here.
60  */
61 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
62 						 struct net *net,
63 						 struct inet_bind_hashbucket *head,
64 						 const unsigned short snum)
65 {
66 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
67 
68 	if (tb) {
69 		write_pnet(&tb->ib_net, net);
70 		tb->port      = snum;
71 		tb->fastreuse = 0;
72 		tb->fastreuseport = 0;
73 		tb->num_owners = 0;
74 		INIT_HLIST_HEAD(&tb->owners);
75 		hlist_add_head(&tb->node, &head->chain);
76 	}
77 	return tb;
78 }
79 
80 /*
81  * Caller must hold hashbucket lock for this tb with local BH disabled
82  */
83 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
84 {
85 	if (hlist_empty(&tb->owners)) {
86 		__hlist_del(&tb->node);
87 		kmem_cache_free(cachep, tb);
88 	}
89 }
90 
91 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
92 		    const unsigned short snum)
93 {
94 	inet_sk(sk)->inet_num = snum;
95 	sk_add_bind_node(sk, &tb->owners);
96 	tb->num_owners++;
97 	inet_csk(sk)->icsk_bind_hash = tb;
98 }
99 
100 /*
101  * Get rid of any references to a local port held by the given sock.
102  */
103 static void __inet_put_port(struct sock *sk)
104 {
105 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
106 	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
107 			hashinfo->bhash_size);
108 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
109 	struct inet_bind_bucket *tb;
110 
111 	spin_lock(&head->lock);
112 	tb = inet_csk(sk)->icsk_bind_hash;
113 	__sk_del_bind_node(sk);
114 	tb->num_owners--;
115 	inet_csk(sk)->icsk_bind_hash = NULL;
116 	inet_sk(sk)->inet_num = 0;
117 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
118 	spin_unlock(&head->lock);
119 }
120 
/*
 * Release the local port held by @sk.
 *
 * Wrapper that runs __inet_put_port() with bottom halves disabled.
 */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();

	__inet_put_port(sk);

	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
128 
129 int __inet_inherit_port(struct sock *sk, struct sock *child)
130 {
131 	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
132 	unsigned short port = inet_sk(child)->inet_num;
133 	const int bhash = inet_bhashfn(sock_net(sk), port,
134 			table->bhash_size);
135 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
136 	struct inet_bind_bucket *tb;
137 
138 	spin_lock(&head->lock);
139 	tb = inet_csk(sk)->icsk_bind_hash;
140 	if (tb->port != port) {
141 		/* NOTE: using tproxy and redirecting skbs to a proxy
142 		 * on a different listener port breaks the assumption
143 		 * that the listener socket's icsk_bind_hash is the same
144 		 * as that of the child socket. We have to look up or
145 		 * create a new bind bucket for the child here. */
146 		inet_bind_bucket_for_each(tb, &head->chain) {
147 			if (net_eq(ib_net(tb), sock_net(sk)) &&
148 			    tb->port == port)
149 				break;
150 		}
151 		if (!tb) {
152 			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
153 						     sock_net(sk), head, port);
154 			if (!tb) {
155 				spin_unlock(&head->lock);
156 				return -ENOMEM;
157 			}
158 		}
159 	}
160 	inet_bind_hash(child, tb, port);
161 	spin_unlock(&head->lock);
162 
163 	return 0;
164 }
165 EXPORT_SYMBOL_GPL(__inet_inherit_port);
166 
167 static inline int compute_score(struct sock *sk, struct net *net,
168 				const unsigned short hnum, const __be32 daddr,
169 				const int dif)
170 {
171 	int score = -1;
172 	struct inet_sock *inet = inet_sk(sk);
173 
174 	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
175 			!ipv6_only_sock(sk)) {
176 		__be32 rcv_saddr = inet->inet_rcv_saddr;
177 		score = sk->sk_family == PF_INET ? 2 : 1;
178 		if (rcv_saddr) {
179 			if (rcv_saddr != daddr)
180 				return -1;
181 			score += 4;
182 		}
183 		if (sk->sk_bound_dev_if) {
184 			if (sk->sk_bound_dev_if != dif)
185 				return -1;
186 			score += 4;
187 		}
188 	}
189 	return score;
190 }
191 
192 /*
193  * Don't inline this cruft. Here are some nice properties to exploit here. The
194  * BSD API does not allow a listening sock to specify the remote port nor the
195  * remote address for the connection. So always assume those are both
196  * wildcarded during the search since they can never be otherwise.
197  */
198 
199 
200 struct sock *__inet_lookup_listener(struct net *net,
201 				    struct inet_hashinfo *hashinfo,
202 				    const __be32 saddr, __be16 sport,
203 				    const __be32 daddr, const unsigned short hnum,
204 				    const int dif)
205 {
206 	struct sock *sk, *result;
207 	struct hlist_nulls_node *node;
208 	unsigned int hash = inet_lhashfn(net, hnum);
209 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
210 	int score, hiscore, matches = 0, reuseport = 0;
211 	u32 phash = 0;
212 
213 	rcu_read_lock();
214 begin:
215 	result = NULL;
216 	hiscore = 0;
217 	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
218 		score = compute_score(sk, net, hnum, daddr, dif);
219 		if (score > hiscore) {
220 			result = sk;
221 			hiscore = score;
222 			reuseport = sk->sk_reuseport;
223 			if (reuseport) {
224 				phash = inet_ehashfn(net, daddr, hnum,
225 						     saddr, sport);
226 				matches = 1;
227 			}
228 		} else if (score == hiscore && reuseport) {
229 			matches++;
230 			if (reciprocal_scale(phash, matches) == 0)
231 				result = sk;
232 			phash = next_pseudo_random32(phash);
233 		}
234 	}
235 	/*
236 	 * if the nulls value we got at the end of this lookup is
237 	 * not the expected one, we must restart lookup.
238 	 * We probably met an item that was moved to another chain.
239 	 */
240 	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
241 		goto begin;
242 	if (result) {
243 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
244 			result = NULL;
245 		else if (unlikely(compute_score(result, net, hnum, daddr,
246 				  dif) < hiscore)) {
247 			sock_put(result);
248 			goto begin;
249 		}
250 	}
251 	rcu_read_unlock();
252 	return result;
253 }
254 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
255 
256 /* All sockets share common refcount, but have different destructors */
257 void sock_gen_put(struct sock *sk)
258 {
259 	if (!atomic_dec_and_test(&sk->sk_refcnt))
260 		return;
261 
262 	if (sk->sk_state == TCP_TIME_WAIT)
263 		inet_twsk_free(inet_twsk(sk));
264 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
265 		reqsk_free(inet_reqsk(sk));
266 	else
267 		sk_free(sk);
268 }
269 EXPORT_SYMBOL_GPL(sock_gen_put);
270 
271 void sock_edemux(struct sk_buff *skb)
272 {
273 	sock_gen_put(skb->sk);
274 }
275 EXPORT_SYMBOL(sock_edemux);
276 
277 struct sock *__inet_lookup_established(struct net *net,
278 				  struct inet_hashinfo *hashinfo,
279 				  const __be32 saddr, const __be16 sport,
280 				  const __be32 daddr, const u16 hnum,
281 				  const int dif)
282 {
283 	INET_ADDR_COOKIE(acookie, saddr, daddr);
284 	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
285 	struct sock *sk;
286 	const struct hlist_nulls_node *node;
287 	/* Optimize here for direct hit, only listening connections can
288 	 * have wildcards anyways.
289 	 */
290 	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
291 	unsigned int slot = hash & hashinfo->ehash_mask;
292 	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
293 
294 	rcu_read_lock();
295 begin:
296 	sk_nulls_for_each_rcu(sk, node, &head->chain) {
297 		if (sk->sk_hash != hash)
298 			continue;
299 		if (likely(INET_MATCH(sk, net, acookie,
300 				      saddr, daddr, ports, dif))) {
301 			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
302 				goto out;
303 			if (unlikely(!INET_MATCH(sk, net, acookie,
304 						 saddr, daddr, ports, dif))) {
305 				sock_gen_put(sk);
306 				goto begin;
307 			}
308 			goto found;
309 		}
310 	}
311 	/*
312 	 * if the nulls value we got at the end of this lookup is
313 	 * not the expected one, we must restart lookup.
314 	 * We probably met an item that was moved to another chain.
315 	 */
316 	if (get_nulls_value(node) != slot)
317 		goto begin;
318 out:
319 	sk = NULL;
320 found:
321 	rcu_read_unlock();
322 	return sk;
323 }
324 EXPORT_SYMBOL_GPL(__inet_lookup_established);
325 
326 /* called with local bh disabled */
327 static int __inet_check_established(struct inet_timewait_death_row *death_row,
328 				    struct sock *sk, __u16 lport,
329 				    struct inet_timewait_sock **twp)
330 {
331 	struct inet_hashinfo *hinfo = death_row->hashinfo;
332 	struct inet_sock *inet = inet_sk(sk);
333 	__be32 daddr = inet->inet_rcv_saddr;
334 	__be32 saddr = inet->inet_daddr;
335 	int dif = sk->sk_bound_dev_if;
336 	INET_ADDR_COOKIE(acookie, saddr, daddr);
337 	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
338 	struct net *net = sock_net(sk);
339 	unsigned int hash = inet_ehashfn(net, daddr, lport,
340 					 saddr, inet->inet_dport);
341 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
342 	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
343 	struct sock *sk2;
344 	const struct hlist_nulls_node *node;
345 	struct inet_timewait_sock *tw = NULL;
346 
347 	spin_lock(lock);
348 
349 	sk_nulls_for_each(sk2, node, &head->chain) {
350 		if (sk2->sk_hash != hash)
351 			continue;
352 
353 		if (likely(INET_MATCH(sk2, net, acookie,
354 					 saddr, daddr, ports, dif))) {
355 			if (sk2->sk_state == TCP_TIME_WAIT) {
356 				tw = inet_twsk(sk2);
357 				if (twsk_unique(sk, sk2, twp))
358 					break;
359 			}
360 			goto not_unique;
361 		}
362 	}
363 
364 	/* Must record num and sport now. Otherwise we will see
365 	 * in hash table socket with a funny identity.
366 	 */
367 	inet->inet_num = lport;
368 	inet->inet_sport = htons(lport);
369 	sk->sk_hash = hash;
370 	WARN_ON(!sk_unhashed(sk));
371 	__sk_nulls_add_node_rcu(sk, &head->chain);
372 	if (tw) {
373 		sk_nulls_del_node_init_rcu((struct sock *)tw);
374 		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
375 	}
376 	spin_unlock(lock);
377 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
378 
379 	if (twp) {
380 		*twp = tw;
381 	} else if (tw) {
382 		/* Silly. Should hash-dance instead... */
383 		inet_twsk_deschedule_put(tw);
384 	}
385 	return 0;
386 
387 not_unique:
388 	spin_unlock(lock);
389 	return -EADDRNOTAVAIL;
390 }
391 
392 static u32 inet_sk_port_offset(const struct sock *sk)
393 {
394 	const struct inet_sock *inet = inet_sk(sk);
395 
396 	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
397 					  inet->inet_daddr,
398 					  inet->inet_dport);
399 }
400 
401 void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
402 {
403 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
404 	struct hlist_nulls_head *list;
405 	struct inet_ehash_bucket *head;
406 	spinlock_t *lock;
407 
408 	WARN_ON(!sk_unhashed(sk));
409 
410 	sk->sk_hash = sk_ehashfn(sk);
411 	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
412 	list = &head->chain;
413 	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
414 
415 	spin_lock(lock);
416 	__sk_nulls_add_node_rcu(sk, list);
417 	if (osk) {
418 		WARN_ON(sk->sk_hash != osk->sk_hash);
419 		sk_nulls_del_node_init_rcu(osk);
420 	}
421 	spin_unlock(lock);
422 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
423 }
424 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
425 
426 void __inet_hash(struct sock *sk, struct sock *osk)
427 {
428 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
429 	struct inet_listen_hashbucket *ilb;
430 
431 	if (sk->sk_state != TCP_LISTEN)
432 		return __inet_hash_nolisten(sk, osk);
433 
434 	WARN_ON(!sk_unhashed(sk));
435 	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
436 
437 	spin_lock(&ilb->lock);
438 	__sk_nulls_add_node_rcu(sk, &ilb->head);
439 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
440 	spin_unlock(&ilb->lock);
441 }
442 EXPORT_SYMBOL(__inet_hash);
443 
444 void inet_hash(struct sock *sk)
445 {
446 	if (sk->sk_state != TCP_CLOSE) {
447 		local_bh_disable();
448 		__inet_hash(sk, NULL);
449 		local_bh_enable();
450 	}
451 }
452 EXPORT_SYMBOL_GPL(inet_hash);
453 
454 void inet_unhash(struct sock *sk)
455 {
456 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
457 	spinlock_t *lock;
458 	int done;
459 
460 	if (sk_unhashed(sk))
461 		return;
462 
463 	if (sk->sk_state == TCP_LISTEN)
464 		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
465 	else
466 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
467 
468 	spin_lock_bh(lock);
469 	done = __sk_nulls_del_node_init_rcu(sk);
470 	if (done)
471 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
472 	spin_unlock_bh(lock);
473 }
474 EXPORT_SYMBOL_GPL(inet_unhash);
475 
476 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
477 		struct sock *sk, u32 port_offset,
478 		int (*check_established)(struct inet_timewait_death_row *,
479 			struct sock *, __u16, struct inet_timewait_sock **))
480 {
481 	struct inet_hashinfo *hinfo = death_row->hashinfo;
482 	const unsigned short snum = inet_sk(sk)->inet_num;
483 	struct inet_bind_hashbucket *head;
484 	struct inet_bind_bucket *tb;
485 	int ret;
486 	struct net *net = sock_net(sk);
487 
488 	if (!snum) {
489 		int i, remaining, low, high, port;
490 		static u32 hint;
491 		u32 offset = hint + port_offset;
492 		struct inet_timewait_sock *tw = NULL;
493 
494 		inet_get_local_port_range(net, &low, &high);
495 		remaining = (high - low) + 1;
496 
497 		/* By starting with offset being an even number,
498 		 * we tend to leave about 50% of ports for other uses,
499 		 * like bind(0).
500 		 */
501 		offset &= ~1;
502 
503 		local_bh_disable();
504 		for (i = 0; i < remaining; i++) {
505 			port = low + (i + offset) % remaining;
506 			if (inet_is_local_reserved_port(net, port))
507 				continue;
508 			head = &hinfo->bhash[inet_bhashfn(net, port,
509 					hinfo->bhash_size)];
510 			spin_lock(&head->lock);
511 
512 			/* Does not bother with rcv_saddr checks,
513 			 * because the established check is already
514 			 * unique enough.
515 			 */
516 			inet_bind_bucket_for_each(tb, &head->chain) {
517 				if (net_eq(ib_net(tb), net) &&
518 				    tb->port == port) {
519 					if (tb->fastreuse >= 0 ||
520 					    tb->fastreuseport >= 0)
521 						goto next_port;
522 					WARN_ON(hlist_empty(&tb->owners));
523 					if (!check_established(death_row, sk,
524 								port, &tw))
525 						goto ok;
526 					goto next_port;
527 				}
528 			}
529 
530 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
531 					net, head, port);
532 			if (!tb) {
533 				spin_unlock(&head->lock);
534 				break;
535 			}
536 			tb->fastreuse = -1;
537 			tb->fastreuseport = -1;
538 			goto ok;
539 
540 		next_port:
541 			spin_unlock(&head->lock);
542 		}
543 		local_bh_enable();
544 
545 		return -EADDRNOTAVAIL;
546 
547 ok:
548 		hint += (i + 2) & ~1;
549 
550 		/* Head lock still held and bh's disabled */
551 		inet_bind_hash(sk, tb, port);
552 		if (sk_unhashed(sk)) {
553 			inet_sk(sk)->inet_sport = htons(port);
554 			__inet_hash_nolisten(sk, (struct sock *)tw);
555 		}
556 		if (tw)
557 			inet_twsk_bind_unhash(tw, hinfo);
558 		spin_unlock(&head->lock);
559 
560 		if (tw)
561 			inet_twsk_deschedule_put(tw);
562 
563 		ret = 0;
564 		goto out;
565 	}
566 
567 	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
568 	tb  = inet_csk(sk)->icsk_bind_hash;
569 	spin_lock_bh(&head->lock);
570 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
571 		__inet_hash_nolisten(sk, NULL);
572 		spin_unlock_bh(&head->lock);
573 		return 0;
574 	} else {
575 		spin_unlock(&head->lock);
576 		/* No definite answer... Walk to established hash table */
577 		ret = check_established(death_row, sk, snum, NULL);
578 out:
579 		local_bh_enable();
580 		return ret;
581 	}
582 }
583 
584 /*
585  * Bind a port for a connect operation and hash it.
586  */
587 int inet_hash_connect(struct inet_timewait_death_row *death_row,
588 		      struct sock *sk)
589 {
590 	u32 port_offset = 0;
591 
592 	if (!inet_sk(sk)->inet_num)
593 		port_offset = inet_sk_port_offset(sk);
594 	return __inet_hash_connect(death_row, sk, port_offset,
595 				   __inet_check_established);
596 }
597 EXPORT_SYMBOL_GPL(inet_hash_connect);
598 
599 void inet_hashinfo_init(struct inet_hashinfo *h)
600 {
601 	int i;
602 
603 	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
604 		spin_lock_init(&h->listening_hash[i].lock);
605 		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
606 				      i + LISTENING_NULLS_BASE);
607 		}
608 }
609 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
610 
611 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
612 {
613 	unsigned int locksz = sizeof(spinlock_t);
614 	unsigned int i, nblocks = 1;
615 
616 	if (locksz != 0) {
617 		/* allocate 2 cache lines or at least one spinlock per cpu */
618 		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
619 		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
620 
621 		/* no more locks than number of hash buckets */
622 		nblocks = min(nblocks, hashinfo->ehash_mask + 1);
623 
624 		hashinfo->ehash_locks =	kmalloc_array(nblocks, locksz,
625 						      GFP_KERNEL | __GFP_NOWARN);
626 		if (!hashinfo->ehash_locks)
627 			hashinfo->ehash_locks = vmalloc(nblocks * locksz);
628 
629 		if (!hashinfo->ehash_locks)
630 			return -ENOMEM;
631 
632 		for (i = 0; i < nblocks; i++)
633 			spin_lock_init(&hashinfo->ehash_locks[i]);
634 	}
635 	hashinfo->ehash_locks_mask = nblocks - 1;
636 	return 0;
637 }
638 EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
639