// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */
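
/* Rough userspace sketch of how a reuseport group is typically built
 * (illustrative only, not part of this file): each process or thread
 * opens its own socket, sets SO_REUSEPORT before bind(), and may attach
 * a selection program with SO_ATTACH_REUSEPORT_EBPF:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(fd, backlog);
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
 *		   &prog_fd, sizeof(prog_fd));	 (optional)
 *
 * Every socket bound this way to the same address/port ends up in one
 * struct sock_reuseport, managed by the helpers below.
 */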

#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany);

void reuseport_has_conns_set(struct sock *sk)
{
	struct sock_reuseport *reuse;

	if (!rcu_access_pointer(sk->sk_reuseport_cb))
		return;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (likely(reuse))
		reuse->has_conns = 1;
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_has_conns_set);

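/* reuse->incoming_cpu counts how many sockets in the group currently have
 * sk_incoming_cpu set (>= 0), e.g. via SO_INCOMING_CPU.
 * reuseport_select_sock_by_hash() only pays for the per-socket CPU check
 * when this counter is non-zero.  The helpers below keep the counter in
 * sync as sockets are added, removed, or change their sk_incoming_cpu.
 */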
static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse)
{
	/* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
	WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1);
}

static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse)
{
	/* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
	WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1);
}

static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
{
	if (sk->sk_incoming_cpu >= 0)
		__reuseport_get_incoming_cpu(reuse);
}

static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
{
	if (sk->sk_incoming_cpu >= 0)
		__reuseport_put_incoming_cpu(reuse);
}

void reuseport_update_incoming_cpu(struct sock *sk, int val)
{
	struct sock_reuseport *reuse;
	int old_sk_incoming_cpu;

	if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) {
		/* Paired with READ_ONCE() in sk_incoming_cpu_update()
		 * and compute_score().
		 */
		WRITE_ONCE(sk->sk_incoming_cpu, val);
		return;
	}

	spin_lock_bh(&reuseport_lock);

	/* This must be done under reuseport_lock to avoid a race with
	 * reuseport_grow(), which accesses sk->sk_incoming_cpu without
	 * lock_sock() when detaching a shutdown()ed sk.
	 *
	 * Paired with READ_ONCE() in reuseport_select_sock_by_hash().
	 */
	old_sk_incoming_cpu = sk->sk_incoming_cpu;
	WRITE_ONCE(sk->sk_incoming_cpu, val);

	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuseport_grow() has detached a closed sk. */
	if (!reuse)
		goto out;

	if (old_sk_incoming_cpu < 0 && val >= 0)
		__reuseport_get_incoming_cpu(reuse);
	else if (old_sk_incoming_cpu >= 0 && val < 0)
		__reuseport_put_incoming_cpu(reuse);

out:
	spin_unlock_bh(&reuseport_lock);
}

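/* The socks[] array is kept in two sections: indices [0, num_socks) hold
 * the live (listening) sockets, and indices
 * [max_socks - num_closed_socks, max_socks) hold shutdown()ed/close()d
 * sockets kept around so their pending requests can still be migrated.
 * reuseport_sock_index() searches one section or the other depending on
 * @closed.
 */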
static int reuseport_sock_index(struct sock *sk,
				const struct sock_reuseport *reuse,
				bool closed)
{
	int left, right;

	if (!closed) {
		left = 0;
		right = reuse->num_socks;
	} else {
		left = reuse->max_socks - reuse->num_closed_socks;
		right = reuse->max_socks;
	}

	for (; left < right; left++)
		if (reuse->socks[left] == sk)
			return left;
	return -1;
}

static void __reuseport_add_sock(struct sock *sk,
				 struct sock_reuseport *reuse)
{
	reuse->socks[reuse->num_socks] = sk;
	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
	smp_wmb();
	reuse->num_socks++;
	reuseport_get_incoming_cpu(sk, reuse);
}

static bool __reuseport_detach_sock(struct sock *sk,
				    struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, false);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
	reuse->num_socks--;
	reuseport_put_incoming_cpu(sk, reuse);

	return true;
}

static void __reuseport_add_closed_sock(struct sock *sk,
					struct sock_reuseport *reuse)
{
	reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
	reuseport_get_incoming_cpu(sk, reuse);
}

static bool __reuseport_detach_closed_sock(struct sock *sk,
					   struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, true);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
	reuseport_put_incoming_cpu(sk, reuse);

	return true;
}

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
	unsigned int size = sizeof(struct sock_reuseport) +
		      sizeof(struct sock *) * max_socks;
	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

	if (!reuse)
		return NULL;

	reuse->max_socks = max_socks;

	RCU_INIT_POINTER(reuse->prog, NULL);
	return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
	struct sock_reuseport *reuse;
	int id, ret = 0;

	/* bh lock used since this function call may precede hlist lock in
	 * soft irq of receive path or setsockopt from process context
	 */
	spin_lock_bh(&reuseport_lock);

	/* Allocation attempts can occur concurrently via the setsockopt path
	 * and the bind/hash path.  Nothing to do when we lose the race.
	 */
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (reuse) {
		if (reuse->num_closed_socks) {
			/* sk was shutdown()ed before */
			ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
			goto out;
		}

		/* Only set reuse->bind_inany if the bind_inany is true.
		 * Otherwise, it will overwrite the reuse->bind_inany
		 * which was set by the bind/hash path.
		 */
		if (bind_inany)
			reuse->bind_inany = bind_inany;
		goto out;
	}

	reuse = __reuseport_alloc(INIT_SOCKS);
	if (!reuse) {
		ret = -ENOMEM;
		goto out;
	}

	id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
	if (id < 0) {
		kfree(reuse);
		ret = id;
		goto out;
	}

	reuse->reuseport_id = id;
	reuse->bind_inany = bind_inany;
	reuse->socks[0] = sk;
	reuse->num_socks = 1;
	reuseport_get_incoming_cpu(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
	spin_unlock_bh(&reuseport_lock);

	return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

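/* Double the socks[] array (capped at U16_MAX entries, since the socket
 * counters are 16-bit), copy both the listening and the closed section,
 * and re-point every member's sk_reuseport_cb at the new group.  When the
 * array cannot grow any further, a closed socket is evicted to make room
 * instead.
 */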
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
	struct sock_reuseport *more_reuse;
	u32 more_socks_size, i;

	more_socks_size = reuse->max_socks * 2U;
	if (more_socks_size > U16_MAX) {
		if (reuse->num_closed_socks) {
			/* Make room by removing a closed sk.
			 * The child has already been migrated.
			 * Only reqsk left at this point.
			 */
			struct sock *sk;

			sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
			RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
			__reuseport_detach_closed_sock(sk, reuse);

			return reuse;
		}

		return NULL;
	}

	more_reuse = __reuseport_alloc(more_socks_size);
	if (!more_reuse)
		return NULL;

	more_reuse->num_socks = reuse->num_socks;
	more_reuse->num_closed_socks = reuse->num_closed_socks;
	more_reuse->prog = reuse->prog;
	more_reuse->reuseport_id = reuse->reuseport_id;
	more_reuse->bind_inany = reuse->bind_inany;
	more_reuse->has_conns = reuse->has_conns;
	more_reuse->incoming_cpu = reuse->incoming_cpu;

	memcpy(more_reuse->socks, reuse->socks,
	       reuse->num_socks * sizeof(struct sock *));
	memcpy(more_reuse->socks +
	       (more_reuse->max_socks - more_reuse->num_closed_socks),
	       reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
	       reuse->num_closed_socks * sizeof(struct sock *));
	more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

	for (i = 0; i < reuse->max_socks; ++i)
		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
				   more_reuse);

	/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
	 * that reuse and more_reuse can temporarily share a reference
	 * to prog.
	 */
	kfree_rcu(reuse, rcu);
	return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
	struct sock_reuseport *reuse;

	reuse = container_of(head, struct sock_reuseport, rcu);
	sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
	ida_free(&reuseport_ida, reuse->reuseport_id);
	kfree(reuse);
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 *  May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
	struct sock_reuseport *old_reuse, *reuse;

	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
		int err = reuseport_alloc(sk2, bind_inany);

		if (err)
			return err;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					      lockdep_is_held(&reuseport_lock));
	if (old_reuse && old_reuse->num_closed_socks) {
		/* sk was shutdown()ed before */
		int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);

		spin_unlock_bh(&reuseport_lock);
		return err;
	}

	if (old_reuse && old_reuse->num_socks != 1) {
		spin_unlock_bh(&reuseport_lock);
		return -EBUSY;
	}

	if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
		reuse = reuseport_grow(reuse);
		if (!reuse) {
			spin_unlock_bh(&reuseport_lock);
			return -ENOMEM;
		}
	}

	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	spin_unlock_bh(&reuseport_lock);

	if (old_reuse)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
	return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);

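/* Called when a socket that was shutdown()ed (and therefore parked in the
 * closed section) re-enters a reuseport group, via reuseport_alloc() or
 * reuseport_add_sock().  The socket is moved back into a listening
 * section, reusing the old group when possible and allocating a fresh
 * group otherwise.
 */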
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany)
{
	if (old_reuse == reuse) {
		/* If sk was in the same reuseport group, just pop sk out of
		 * the closed section and push sk into the listening section.
		 */
		__reuseport_detach_closed_sock(sk, old_reuse);
		__reuseport_add_sock(sk, old_reuse);
		return 0;
	}

	if (!reuse) {
		/* In bind()/listen() path, we cannot carry over the eBPF prog
		 * for the shutdown()ed socket. In setsockopt() path, we should
		 * not change the eBPF prog of listening sockets by attaching a
		 * prog to the shutdown()ed socket. Thus, we will allocate a new
		 * reuseport group and detach sk from the old group.
		 */
		int id;

		reuse = __reuseport_alloc(INIT_SOCKS);
		if (!reuse)
			return -ENOMEM;

		id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
		if (id < 0) {
			kfree(reuse);
			return id;
		}

		reuse->reuseport_id = id;
		reuse->bind_inany = bind_inany;
	} else {
		/* Move sk from the old group to the new one if
		 * - all the other listeners in the old group were close()d or
		 *   shutdown()ed, and then sk2 has listen()ed on the same port
		 * OR
		 * - sk listen()ed without bind() (or with autobind), was
		 *   shutdown()ed, and then listen()s on another port which
		 *   sk2 listen()s on.
		 */
		if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
			reuse = reuseport_grow(reuse);
			if (!reuse)
				return -ENOMEM;
		}
	}

	__reuseport_detach_closed_sock(sk, old_reuse);
	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);

	return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
	struct sock_reuseport *reuse;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuseport_grow() has detached a closed sk */
	if (!reuse)
		goto out;

	/* Notify the bpf side. The sk may be added to a sockarray
	 * map. If so, sockarray logic will remove it from the map.
	 *
	 * Other bpf map types that work with reuseport, like sockmap,
	 * don't need an explicit callback from here. They override sk
	 * unhash/close ops to remove the sk from the map before we
	 * get to this point.
	 */
	bpf_sk_reuseport_detach(sk);

	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

	if (!__reuseport_detach_closed_sock(sk, reuse))
		__reuseport_detach_sock(sk, reuse);

	if (reuse->num_socks + reuse->num_closed_socks == 0)
		call_rcu(&reuse->rcu, reuseport_free_rcu);

out:
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

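/* Called when a listening TCP socket is close()d or shutdown()ed.  If
 * request migration is possible (the net.ipv4.tcp_migrate_req sysctl is
 * enabled or a BPF_SK_REUSEPORT_SELECT_OR_MIGRATE program is attached),
 * the socket is only moved to the closed section so its pending children
 * can later be handed to another listener; otherwise it is detached
 * immediately.
 */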
void reuseport_stop_listen_sock(struct sock *sk)
{
	if (sk->sk_protocol == IPPROTO_TCP) {
		struct sock_reuseport *reuse;
		struct bpf_prog *prog;

		spin_lock_bh(&reuseport_lock);

		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
						  lockdep_is_held(&reuseport_lock));
		prog = rcu_dereference_protected(reuse->prog,
						 lockdep_is_held(&reuseport_lock));

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
		    (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
			/* Migration capable, move sk from the listening section
			 * to the closed section.
			 */
			bpf_sk_reuseport_detach(sk);

			__reuseport_detach_sock(sk, reuse);
			__reuseport_add_closed_sock(sk, reuse);

			spin_unlock_bh(&reuseport_lock);
			return;
		}

		spin_unlock_bh(&reuseport_lock);
	}

	/* Not capable to do migration, detach immediately */
	reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);

static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
				   struct bpf_prog *prog, struct sk_buff *skb,
				   int hdr_len)
{
	struct sk_buff *nskb = NULL;
	u32 index;

	if (skb_shared(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return NULL;
		skb = nskb;
	}

	/* temporarily advance data past protocol header */
	if (!pskb_pull(skb, hdr_len)) {
		kfree_skb(nskb);
		return NULL;
	}
	index = bpf_prog_run_save_cb(prog, skb);
	__skb_push(skb, hdr_len);

	consume_skb(nskb);

	if (index >= socks)
		return NULL;

	return reuse->socks[index];
}

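/* Hash-based fallback selection.  Start from a slot derived from @hash and
 * walk the listening section circularly.  If any socket in the group has
 * expressed a CPU preference (reuse->incoming_cpu != 0), prefer a socket
 * whose sk_incoming_cpu matches the CPU we are running on; otherwise
 * return the first non-ESTABLISHED socket found.
 */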
static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
						  u32 hash, u16 num_socks)
{
	struct sock *first_valid_sk = NULL;
	int i, j;

	i = j = reciprocal_scale(hash, num_socks);
	do {
		struct sock *sk = reuse->socks[i];

		if (sk->sk_state != TCP_ESTABLISHED) {
			/* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */
			if (!READ_ONCE(reuse->incoming_cpu))
				return sk;

			/* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */
			if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
				return sk;

			if (!first_valid_sk)
				first_valid_sk = sk;
		}

		i++;
		if (i >= num_socks)
			i = 0;
	} while (i != j);

	return first_valid_sk;
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
				   u32 hash,
				   struct sk_buff *skb,
				   int hdr_len)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *prog;
	struct sock *sk2 = NULL;
	u16 socks;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);

	/* if memory allocation failed or add call is not yet complete */
	if (!reuse)
		goto out;

	prog = rcu_dereference(reuse->prog);
	socks = READ_ONCE(reuse->num_socks);
	if (likely(socks)) {
		/* paired with smp_wmb() in __reuseport_add_sock() */
		smp_rmb();

		if (!prog || !skb)
			goto select_by_hash;

		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
		else
			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2)
			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
	}

out:
	rcu_read_unlock();
	return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

/**
 *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: close()ed or shutdown()ed socket in the group.
 *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 *    NEW_SYN_RECV request socket during 3WHS.
 *  @skb: skb to run through BPF filter.
 *  Returns a socket (with sk_refcnt +1) that should accept the child socket
 *  (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb)
{
	struct sock_reuseport *reuse;
	struct sock *nsk = NULL;
	bool allocated = false;
	struct bpf_prog *prog;
	u16 socks;
	u32 hash;

	rcu_read_lock();

	reuse = rcu_dereference(sk->sk_reuseport_cb);
	if (!reuse)
		goto out;

	socks = READ_ONCE(reuse->num_socks);
	if (unlikely(!socks))
		goto failure;

	/* paired with smp_wmb() in __reuseport_add_sock() */
	smp_rmb();

	hash = migrating_sk->sk_hash;
	prog = rcu_dereference(reuse->prog);
	if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
			goto select_by_hash;
		goto failure;
	}

	if (!skb) {
		skb = alloc_skb(0, GFP_ATOMIC);
		if (!skb)
			goto failure;
		allocated = true;
	}

	nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);

	if (allocated)
		kfree_skb(skb);

select_by_hash:
	if (!nsk)
		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

	if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
		nsk = NULL;
		goto failure;
	}

out:
	rcu_read_unlock();
	return nsk;

failure:
	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
	goto out;
}
EXPORT_SYMBOL(reuseport_migrate_sock);

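/* Attach a selection program to the group, typically reached from
 * SO_ATTACH_REUSEPORT_CBPF/SO_ATTACH_REUSEPORT_EBPF handling.  Any
 * previously attached program is replaced and freed.
 */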
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	if (sk_unhashed(sk)) {
		int err;

		if (!sk->sk_reuseport)
			return -EINVAL;

		err = reuseport_alloc(sk, false);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_prog = rcu_dereference_protected(reuse->prog,
					     lockdep_is_held(&reuseport_lock));
	rcu_assign_pointer(reuse->prog, prog);
	spin_unlock_bh(&reuseport_lock);

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);

int reuseport_detach_prog(struct sock *sk)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	old_prog = NULL;
	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuse must be checked after acquiring the reuseport_lock
	 * because reuseport_grow() can detach a closed sk.
	 */
	if (!reuse) {
		spin_unlock_bh(&reuseport_lock);
		return sk->sk_reuseport ? -ENOENT : -EINVAL;
	}

	if (sk_unhashed(sk) && reuse->num_closed_socks) {
		spin_unlock_bh(&reuseport_lock);
		return -ENOENT;
	}

	old_prog = rcu_replace_pointer(reuse->prog, old_prog,
				       lockdep_is_held(&reuseport_lock));
	spin_unlock_bh(&reuseport_lock);

	if (!old_prog)
		return -ENOENT;

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);