xref: /openbmc/linux/net/core/sock_reuseport.c (revision f8a11425075ff11b4b5784f077cb84f3d2dfb3f0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * To speed up listener socket lookup, create an array to store all sockets
4  * listening on the same port.  This lets the final socket be chosen as soon
5  * as the first match is found.  An optional BPF program can also be
6  * configured to select the socket index from the array of available sockets.
7  */
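
/*
 * Illustrative userspace sketch (not part of this file; only standard socket
 * API calls are shown): a reuseport group is formed when several sockets set
 * SO_REUSEPORT before binding to the same local address/port (addr below):
 *
 *	int on = 1;
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
 *	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(fd, SOMAXCONN);
 *
 * Repeating this in N processes or threads for the same address yields N
 * listeners tracked by a single sock_reuseport array below.
 */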
8 
9 #include <net/sock_reuseport.h>
10 #include <linux/bpf.h>
11 #include <linux/idr.h>
12 #include <linux/filter.h>
13 #include <linux/rcupdate.h>
14 
15 #define INIT_SOCKS 128
16 
17 DEFINE_SPINLOCK(reuseport_lock);
18 
19 static DEFINE_IDA(reuseport_ida);
20 static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
21 			       struct sock_reuseport *reuse, bool bind_inany);
22 
23 static int reuseport_sock_index(struct sock *sk,
24 				const struct sock_reuseport *reuse,
25 				bool closed)
26 {
27 	int left, right;
28 
29 	if (!closed) {
30 		left = 0;
31 		right = reuse->num_socks;
32 	} else {
33 		left = reuse->max_socks - reuse->num_closed_socks;
34 		right = reuse->max_socks;
35 	}
36 
37 	for (; left < right; left++)
38 		if (reuse->socks[left] == sk)
39 			return left;
40 	return -1;
41 }
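
/*
 * Sketch of the reuse->socks[] layout that the helpers in this file rely on:
 *
 *	[0, num_socks)                           : listening/bound sockets
 *	[num_socks, max_socks - num_closed_socks): unused slots
 *	[max_socks - num_closed_socks, max_socks): shutdown()ed TCP listeners
 *	                                           kept around for request
 *	                                           migration
 *
 * Listening sockets are appended from the front and closed sockets from the
 * back, which is why reuseport_sock_index() scans only one of the two
 * sections.
 */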
42 
43 static void __reuseport_add_sock(struct sock *sk,
44 				 struct sock_reuseport *reuse)
45 {
46 	reuse->socks[reuse->num_socks] = sk;
47 	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
48 	smp_wmb();
49 	reuse->num_socks++;
50 }
51 
52 static bool __reuseport_detach_sock(struct sock *sk,
53 				    struct sock_reuseport *reuse)
54 {
55 	int i = reuseport_sock_index(sk, reuse, false);
56 
57 	if (i == -1)
58 		return false;
59 
60 	reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
61 	reuse->num_socks--;
62 
63 	return true;
64 }
65 
66 static void __reuseport_add_closed_sock(struct sock *sk,
67 					struct sock_reuseport *reuse)
68 {
69 	reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
70 	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
71 	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
72 }
73 
74 static bool __reuseport_detach_closed_sock(struct sock *sk,
75 					   struct sock_reuseport *reuse)
76 {
77 	int i = reuseport_sock_index(sk, reuse, true);
78 
79 	if (i == -1)
80 		return false;
81 
82 	reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
83 	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
84 	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
85 
86 	return true;
87 }
88 
89 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
90 {
91 	unsigned int size = sizeof(struct sock_reuseport) +
92 		      sizeof(struct sock *) * max_socks;
93 	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
94 
95 	if (!reuse)
96 		return NULL;
97 
98 	reuse->max_socks = max_socks;
99 
100 	RCU_INIT_POINTER(reuse->prog, NULL);
101 	return reuse;
102 }
103 
104 int reuseport_alloc(struct sock *sk, bool bind_inany)
105 {
106 	struct sock_reuseport *reuse;
107 	int id, ret = 0;
108 
109 	/* The _bh lock is used since this call may precede the hlist lock in
110 	 * the softirq receive path, or come from setsockopt() in process context
111 	 */
112 	spin_lock_bh(&reuseport_lock);
113 
114 	/* Allocation attempts can occur concurrently via the setsockopt path
115 	 * and the bind/hash path.  Nothing to do when we lose the race.
116 	 */
117 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
118 					  lockdep_is_held(&reuseport_lock));
119 	if (reuse) {
120 		if (reuse->num_closed_socks) {
121 			/* sk was shutdown()ed before */
122 			ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
123 			goto out;
124 		}
125 
126 		/* Only set reuse->bind_inany if bind_inany is true.
127 		 * Otherwise we would overwrite the reuse->bind_inany
128 		 * value already set by the bind/hash path.
129 		 */
130 		if (bind_inany)
131 			reuse->bind_inany = bind_inany;
132 		goto out;
133 	}
134 
135 	reuse = __reuseport_alloc(INIT_SOCKS);
136 	if (!reuse) {
137 		ret = -ENOMEM;
138 		goto out;
139 	}
140 
141 	id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
142 	if (id < 0) {
143 		kfree(reuse);
144 		ret = id;
145 		goto out;
146 	}
147 
148 	reuse->reuseport_id = id;
149 	reuse->bind_inany = bind_inany;
150 	reuse->socks[0] = sk;
151 	reuse->num_socks = 1;
152 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
153 
154 out:
155 	spin_unlock_bh(&reuseport_lock);
156 
157 	return ret;
158 }
159 EXPORT_SYMBOL(reuseport_alloc);
160 
161 static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
162 {
163 	struct sock_reuseport *more_reuse;
164 	u32 more_socks_size, i;
165 
166 	more_socks_size = reuse->max_socks * 2U;
167 	if (more_socks_size > U16_MAX) {
168 		if (reuse->num_closed_socks) {
169 			/* Make room by removing a closed sk.
170 			 * The child has already been migrated.
171 			 * Only reqsk left at this point.
172 			 */
173 			struct sock *sk;
174 
175 			sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
176 			RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
177 			__reuseport_detach_closed_sock(sk, reuse);
178 
179 			return reuse;
180 		}
181 
182 		return NULL;
183 	}
184 
185 	more_reuse = __reuseport_alloc(more_socks_size);
186 	if (!more_reuse)
187 		return NULL;
188 
189 	more_reuse->num_socks = reuse->num_socks;
190 	more_reuse->num_closed_socks = reuse->num_closed_socks;
191 	more_reuse->prog = reuse->prog;
192 	more_reuse->reuseport_id = reuse->reuseport_id;
193 	more_reuse->bind_inany = reuse->bind_inany;
194 	more_reuse->has_conns = reuse->has_conns;
195 
196 	memcpy(more_reuse->socks, reuse->socks,
197 	       reuse->num_socks * sizeof(struct sock *));
198 	memcpy(more_reuse->socks +
199 	       (more_reuse->max_socks - more_reuse->num_closed_socks),
200 	       reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
201 	       reuse->num_closed_socks * sizeof(struct sock *));
202 	more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
203 
204 	for (i = 0; i < reuse->max_socks; ++i)
205 		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
206 				   more_reuse);
207 
208 	/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
209 	 * that reuse and more_reuse can temporarily share a reference
210 	 * to prog.
211 	 */
212 	kfree_rcu(reuse, rcu);
213 	return more_reuse;
214 }
215 
216 static void reuseport_free_rcu(struct rcu_head *head)
217 {
218 	struct sock_reuseport *reuse;
219 
220 	reuse = container_of(head, struct sock_reuseport, rcu);
221 	sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
222 	ida_free(&reuseport_ida, reuse->reuseport_id);
223 	kfree(reuse);
224 }
225 
226 /**
227  *  reuseport_add_sock - Add a socket to the reuseport group of another.
228  *  @sk:  New socket to add to the group.
229  *  @sk2: Socket belonging to the existing reuseport group.
230  *  @bind_inany: Whether or not the group is bound to a local INANY address.
231  *
232  *  May return -ENOMEM and not add the socket to the group under memory pressure.
233  */
234 int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
235 {
236 	struct sock_reuseport *old_reuse, *reuse;
237 
238 	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
239 		int err = reuseport_alloc(sk2, bind_inany);
240 
241 		if (err)
242 			return err;
243 	}
244 
245 	spin_lock_bh(&reuseport_lock);
246 	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
247 					  lockdep_is_held(&reuseport_lock));
248 	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
249 					      lockdep_is_held(&reuseport_lock));
250 	if (old_reuse && old_reuse->num_closed_socks) {
251 		/* sk was shutdown()ed before */
252 		int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
253 
254 		spin_unlock_bh(&reuseport_lock);
255 		return err;
256 	}
257 
258 	if (old_reuse && old_reuse->num_socks != 1) {
259 		spin_unlock_bh(&reuseport_lock);
260 		return -EBUSY;
261 	}
262 
263 	if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
264 		reuse = reuseport_grow(reuse);
265 		if (!reuse) {
266 			spin_unlock_bh(&reuseport_lock);
267 			return -ENOMEM;
268 		}
269 	}
270 
271 	__reuseport_add_sock(sk, reuse);
272 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
273 
274 	spin_unlock_bh(&reuseport_lock);
275 
276 	if (old_reuse)
277 		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
278 	return 0;
279 }
280 EXPORT_SYMBOL(reuseport_add_sock);
281 
282 static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
283 			       struct sock_reuseport *reuse, bool bind_inany)
284 {
285 	if (old_reuse == reuse) {
286 		/* If sk was in the same reuseport group, just pop sk out of
287 		 * the closed section and push sk into the listening section.
288 		 */
289 		__reuseport_detach_closed_sock(sk, old_reuse);
290 		__reuseport_add_sock(sk, old_reuse);
291 		return 0;
292 	}
293 
294 	if (!reuse) {
295 		/* In the bind()/listen() path, we cannot carry over the eBPF prog
296 		 * for the shutdown()ed socket. In the setsockopt() path, we should
297 		 * not change the eBPF prog of listening sockets by attaching a
298 		 * prog to the shutdown()ed socket. Thus, we will allocate a new
299 		 * reuseport group and detach sk from the old group.
300 		 */
301 		int id;
302 
303 		reuse = __reuseport_alloc(INIT_SOCKS);
304 		if (!reuse)
305 			return -ENOMEM;
306 
307 		id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
308 		if (id < 0) {
309 			kfree(reuse);
310 			return id;
311 		}
312 
313 		reuse->reuseport_id = id;
314 		reuse->bind_inany = bind_inany;
315 	} else {
316 		/* Move sk from the old group to the new one if
317 		 * - all the other listeners in the old group were close()d or
318 		 *   shutdown()ed, and then sk2 has listen()ed on the same port
319 		 * OR
320 		 * - sk listen()ed without bind() (or with autobind), was
321 		 *   shutdown()ed, and then listen()s on another port which
322 		 *   sk2 listen()s on.
323 		 */
324 		if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
325 			reuse = reuseport_grow(reuse);
326 			if (!reuse)
327 				return -ENOMEM;
328 		}
329 	}
330 
331 	__reuseport_detach_closed_sock(sk, old_reuse);
332 	__reuseport_add_sock(sk, reuse);
333 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
334 
335 	if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
336 		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
337 
338 	return 0;
339 }
340 
341 void reuseport_detach_sock(struct sock *sk)
342 {
343 	struct sock_reuseport *reuse;
344 
345 	spin_lock_bh(&reuseport_lock);
346 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
347 					  lockdep_is_held(&reuseport_lock));
348 
349 	/* reuseport_grow() has detached a closed sk */
350 	if (!reuse)
351 		goto out;
352 
353 	/* Notify the bpf side. The sk may be added to a sockarray
354 	 * map. If so, sockarray logic will remove it from the map.
355 	 *
356 	 * Other bpf map types that work with reuseport, like sockmap,
357 	 * don't need an explicit callback from here. They override sk
358 	 * unhash/close ops to remove the sk from the map before we
359 	 * get to this point.
360 	 */
361 	bpf_sk_reuseport_detach(sk);
362 
363 	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
364 
365 	if (!__reuseport_detach_closed_sock(sk, reuse))
366 		__reuseport_detach_sock(sk, reuse);
367 
368 	if (reuse->num_socks + reuse->num_closed_socks == 0)
369 		call_rcu(&reuse->rcu, reuseport_free_rcu);
370 
371 out:
372 	spin_unlock_bh(&reuseport_lock);
373 }
374 EXPORT_SYMBOL(reuseport_detach_sock);
375 
376 void reuseport_stop_listen_sock(struct sock *sk)
377 {
378 	if (sk->sk_protocol == IPPROTO_TCP) {
379 		struct sock_reuseport *reuse;
380 		struct bpf_prog *prog;
381 
382 		spin_lock_bh(&reuseport_lock);
383 
384 		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
385 						  lockdep_is_held(&reuseport_lock));
386 		prog = rcu_dereference_protected(reuse->prog,
387 						 lockdep_is_held(&reuseport_lock));
388 
389 		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
390 		    (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
391 			/* Migration capable, move sk from the listening section
392 			 * to the closed section.
393 			 */
394 			bpf_sk_reuseport_detach(sk);
395 
396 			__reuseport_detach_sock(sk, reuse);
397 			__reuseport_add_closed_sock(sk, reuse);
398 
399 			spin_unlock_bh(&reuseport_lock);
400 			return;
401 		}
402 
403 		spin_unlock_bh(&reuseport_lock);
404 	}
405 
406 	/* Not capable of migration; detach immediately */
407 	reuseport_detach_sock(sk);
408 }
409 EXPORT_SYMBOL(reuseport_stop_listen_sock);
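
/*
 * Note on enabling the migration path above (an assumption about the usual
 * configuration, not something this file enforces): it is taken either when
 * the per-netns sysctl is set, e.g.
 *
 *	sysctl -w net.ipv4.tcp_migrate_req=1
 *
 * or when the group has a BPF program attached whose expected_attach_type is
 * BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
 */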
410 
411 static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
412 				   struct bpf_prog *prog, struct sk_buff *skb,
413 				   int hdr_len)
414 {
415 	struct sk_buff *nskb = NULL;
416 	u32 index;
417 
418 	if (skb_shared(skb)) {
419 		nskb = skb_clone(skb, GFP_ATOMIC);
420 		if (!nskb)
421 			return NULL;
422 		skb = nskb;
423 	}
424 
425 	/* temporarily advance data past protocol header */
426 	if (!pskb_pull(skb, hdr_len)) {
427 		kfree_skb(nskb);
428 		return NULL;
429 	}
430 	index = bpf_prog_run_save_cb(prog, skb);
431 	__skb_push(skb, hdr_len);
432 
433 	consume_skb(nskb);
434 
435 	if (index >= socks)
436 		return NULL;
437 
438 	return reuse->socks[index];
439 }
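
/*
 * run_bpf_filter() covers programs that are not of type
 * BPF_PROG_TYPE_SK_REUSEPORT, i.e. classic filters attached with
 * SO_ATTACH_REUSEPORT_CBPF and plain socket-filter eBPF attached with
 * SO_ATTACH_REUSEPORT_EBPF; the program's return value is used directly as an
 * index into reuse->socks[].  An illustrative userspace filter (a sketch of
 * the common "pick the socket of the current CPU" idiom):
 *
 *	struct sock_filter code[] = {
 *		{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
 *		{ BPF_RET | BPF_A, 0, 0, 0 },
 *	};
 *	struct sock_fprog fprog = { .len = 2, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &fprog,
 *		   sizeof(fprog));
 */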
440 
441 static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
442 						  u32 hash, u16 num_socks)
443 {
444 	int i, j;
445 
446 	i = j = reciprocal_scale(hash, num_socks);
447 	while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
448 		i++;
449 		if (i >= num_socks)
450 			i = 0;
451 		if (i == j)
452 			return NULL;
453 	}
454 
455 	return reuse->socks[i];
456 }
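
/*
 * Worked example (a sketch): with num_socks == 4 and hash == 0x80000001,
 * reciprocal_scale() yields ((u64)hash * 4) >> 32 == 2, so socks[2] is tried
 * first.  If socks[2] is a connected (TCP_ESTABLISHED) UDP socket, the loop
 * probes indices 3, 0 and 1, and returns NULL only after wrapping back to 2.
 */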
457 
458 /**
459  *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
460  *  @sk: First socket in the group.
461  *  @hash: When no BPF filter is available, use this hash to select.
462  *  @skb: skb to run through BPF filter.
463  *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
464  *    the skb does not yet point at the payload, this parameter represents
465  *    how far the pointer needs to advance to reach the payload.
466  *  Returns a socket that should receive the packet (or NULL on error).
467  */
468 struct sock *reuseport_select_sock(struct sock *sk,
469 				   u32 hash,
470 				   struct sk_buff *skb,
471 				   int hdr_len)
472 {
473 	struct sock_reuseport *reuse;
474 	struct bpf_prog *prog;
475 	struct sock *sk2 = NULL;
476 	u16 socks;
477 
478 	rcu_read_lock();
479 	reuse = rcu_dereference(sk->sk_reuseport_cb);
480 
481 	/* if memory allocation failed or add call is not yet complete */
482 	if (!reuse)
483 		goto out;
484 
485 	prog = rcu_dereference(reuse->prog);
486 	socks = READ_ONCE(reuse->num_socks);
487 	if (likely(socks)) {
488 		/* paired with smp_wmb() in __reuseport_add_sock() */
489 		smp_rmb();
490 
491 		if (!prog || !skb)
492 			goto select_by_hash;
493 
494 		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
495 			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
496 		else
497 			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
498 
499 select_by_hash:
500 		/* no bpf or invalid bpf result: fall back to hash usage */
501 		if (!sk2)
502 			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
503 	}
504 
505 out:
506 	rcu_read_unlock();
507 	return sk2;
508 }
509 EXPORT_SYMBOL(reuseport_select_sock);
510 
511 /**
512  *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group to migrate the child socket to.
513  *  @sk: close()ed or shutdown()ed socket in the group.
514  *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
515  *    NEW_SYN_RECV request socket during 3WHS.
516  *  @skb: skb to run through BPF filter.
517  *  Returns a socket (with sk_refcnt +1) that should accept the child socket
518  *  (or NULL on error).
519  */
520 struct sock *reuseport_migrate_sock(struct sock *sk,
521 				    struct sock *migrating_sk,
522 				    struct sk_buff *skb)
523 {
524 	struct sock_reuseport *reuse;
525 	struct sock *nsk = NULL;
526 	bool allocated = false;
527 	struct bpf_prog *prog;
528 	u16 socks;
529 	u32 hash;
530 
531 	rcu_read_lock();
532 
533 	reuse = rcu_dereference(sk->sk_reuseport_cb);
534 	if (!reuse)
535 		goto out;
536 
537 	socks = READ_ONCE(reuse->num_socks);
538 	if (unlikely(!socks))
539 		goto out;
540 
541 	/* paired with smp_wmb() in __reuseport_add_sock() */
542 	smp_rmb();
543 
544 	hash = migrating_sk->sk_hash;
545 	prog = rcu_dereference(reuse->prog);
546 	if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
547 		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
548 			goto select_by_hash;
549 		goto out;
550 	}
551 
552 	if (!skb) {
553 		skb = alloc_skb(0, GFP_ATOMIC);
554 		if (!skb)
555 			goto out;
556 		allocated = true;
557 	}
558 
559 	nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
560 
561 	if (allocated)
562 		kfree_skb(skb);
563 
564 select_by_hash:
565 	if (!nsk)
566 		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
567 
568 	if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
569 		nsk = NULL;
570 
571 out:
572 	rcu_read_unlock();
573 	return nsk;
574 }
575 EXPORT_SYMBOL(reuseport_migrate_sock);
576 
577 int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
578 {
579 	struct sock_reuseport *reuse;
580 	struct bpf_prog *old_prog;
581 
582 	if (sk_unhashed(sk)) {
583 		int err;
584 
585 		if (!sk->sk_reuseport)
586 			return -EINVAL;
587 
588 		err = reuseport_alloc(sk, false);
589 		if (err)
590 			return err;
591 	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
592 		/* The socket wasn't bound with SO_REUSEPORT */
593 		return -EINVAL;
594 	}
595 
596 	spin_lock_bh(&reuseport_lock);
597 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
598 					  lockdep_is_held(&reuseport_lock));
599 	old_prog = rcu_dereference_protected(reuse->prog,
600 					     lockdep_is_held(&reuseport_lock));
601 	rcu_assign_pointer(reuse->prog, prog);
602 	spin_unlock_bh(&reuseport_lock);
603 
604 	sk_reuseport_prog_free(old_prog);
605 	return 0;
606 }
607 EXPORT_SYMBOL(reuseport_attach_prog);
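
/*
 * Illustrative userspace sketch (an assumption about typical usage, not part
 * of this file): reuseport_attach_prog() is reached when an application
 * passes the fd of a loaded BPF program via SO_ATTACH_REUSEPORT_EBPF, e.g.
 *
 *	int prog_fd = ...;	// fd of a BPF_PROG_TYPE_SK_REUSEPORT prog
 *				// or of a plain socket filter
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
 *		   &prog_fd, sizeof(prog_fd));
 *
 * SO_DETACH_REUSEPORT_BPF on the same socket leads to
 * reuseport_detach_prog() below.
 */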
608 
609 int reuseport_detach_prog(struct sock *sk)
610 {
611 	struct sock_reuseport *reuse;
612 	struct bpf_prog *old_prog;
613 
614 	old_prog = NULL;
615 	spin_lock_bh(&reuseport_lock);
616 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
617 					  lockdep_is_held(&reuseport_lock));
618 
619 	/* reuse must be checked after acquiring the reuseport_lock
620 	 * because reuseport_grow() can detach a closed sk.
621 	 */
622 	if (!reuse) {
623 		spin_unlock_bh(&reuseport_lock);
624 		return sk->sk_reuseport ? -ENOENT : -EINVAL;
625 	}
626 
627 	if (sk_unhashed(sk) && reuse->num_closed_socks) {
628 		spin_unlock_bh(&reuseport_lock);
629 		return -ENOENT;
630 	}
631 
632 	old_prog = rcu_replace_pointer(reuse->prog, old_prog,
633 				       lockdep_is_held(&reuseport_lock));
634 	spin_unlock_bh(&reuseport_lock);
635 
636 	if (!old_prog)
637 		return -ENOENT;
638 
639 	sk_reuseport_prog_free(old_prog);
640 	return 0;
641 }
642 EXPORT_SYMBOL(reuseport_detach_prog);
643