xref: /openbmc/linux/net/core/sock_reuseport.c (revision 1cd62c21)
// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */
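
/* Illustrative userspace sketch (not part of this file): two TCP listeners
 * join the same reuseport group by enabling SO_REUSEPORT before bind().
 * The port number and the lack of error handling are assumptions made for
 * brevity.
 *
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int reuseport_listener(unsigned short port)
 *	{
 *		struct sockaddr_in addr;
 *		int one = 1, fd;
 *
 *		fd = socket(AF_INET, SOCK_STREAM, 0);
 *		setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *
 *		memset(&addr, 0, sizeof(addr));
 *		addr.sin_family = AF_INET;
 *		addr.sin_addr.s_addr = htonl(INADDR_ANY);
 *		addr.sin_port = htons(port);
 *		bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *		listen(fd, 128);
 *		return fd;
 *	}
 *
 *	// Calling reuseport_listener(8080) twice places both sockets in one
 *	// sock_reuseport group for port 8080.
 */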

#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany);

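/* socks[] keeps the listening sockets packed at the front
 * (indices 0 .. num_socks - 1) and the shutdown()ed-but-not-yet-detached
 * sockets packed at the back (indices max_socks - num_closed_socks ..
 * max_socks - 1).  Return sk's index in the requested section, or -1 if
 * it is not there.
 */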
static int reuseport_sock_index(struct sock *sk,
				const struct sock_reuseport *reuse,
				bool closed)
{
	int left, right;

	if (!closed) {
		left = 0;
		right = reuse->num_socks;
	} else {
		left = reuse->max_socks - reuse->num_closed_socks;
		right = reuse->max_socks;
	}

	for (; left < right; left++)
		if (reuse->socks[left] == sk)
			return left;
	return -1;
}

static void __reuseport_add_sock(struct sock *sk,
				 struct sock_reuseport *reuse)
{
	reuse->socks[reuse->num_socks] = sk;
	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
	smp_wmb();
	reuse->num_socks++;
}

static bool __reuseport_detach_sock(struct sock *sk,
				    struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, false);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
	reuse->num_socks--;

	return true;
}

static void __reuseport_add_closed_sock(struct sock *sk,
					struct sock_reuseport *reuse)
{
	reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
}

static bool __reuseport_detach_closed_sock(struct sock *sk,
					   struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, true);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);

	return true;
}

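/* Allocate a zeroed group with room for max_socks socket pointers.
 * Callers hold reuseport_lock with BHs disabled, hence GFP_ATOMIC.
 */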
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
	unsigned int size = sizeof(struct sock_reuseport) +
		      sizeof(struct sock *) * max_socks;
	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

	if (!reuse)
		return NULL;

	reuse->max_socks = max_socks;

	RCU_INIT_POINTER(reuse->prog, NULL);
	return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
	struct sock_reuseport *reuse;
	int id, ret = 0;

	/* Take the lock with BHs disabled: this call may come from the
	 * softirq receive path (before the hlist lock is taken) or from
	 * setsockopt() in process context.
	 */
	spin_lock_bh(&reuseport_lock);

	/* Allocation attempts can occur concurrently via the setsockopt path
	 * and the bind/hash path.  Nothing to do when we lose the race.
	 */
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (reuse) {
		if (reuse->num_closed_socks) {
			/* sk was shutdown()ed before */
			ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
			goto out;
		}

		/* Only set reuse->bind_inany if bind_inany is true.
		 * Otherwise, we would overwrite the reuse->bind_inany
		 * already set by the bind/hash path.
		 */
		if (bind_inany)
			reuse->bind_inany = bind_inany;
		goto out;
	}

	reuse = __reuseport_alloc(INIT_SOCKS);
	if (!reuse) {
		ret = -ENOMEM;
		goto out;
	}

	id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
	if (id < 0) {
		kfree(reuse);
		ret = id;
		goto out;
	}

	reuse->reuseport_id = id;
	reuse->bind_inany = bind_inany;
	reuse->socks[0] = sk;
	reuse->num_socks = 1;
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
	spin_unlock_bh(&reuseport_lock);

	return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

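/* Double the size of socks[] (the total entry count is capped at U16_MAX).
 * Called with reuseport_lock held; returns the new group, the old group if
 * room could be made by dropping a closed sk, or NULL on failure.
 */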
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
	struct sock_reuseport *more_reuse;
	u32 more_socks_size, i;

	more_socks_size = reuse->max_socks * 2U;
	if (more_socks_size > U16_MAX) {
		if (reuse->num_closed_socks) {
			/* Make room by removing a closed sk.
			 * The child has already been migrated.
			 * Only reqsk left at this point.
			 */
			struct sock *sk;

			sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
			RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
			__reuseport_detach_closed_sock(sk, reuse);

			return reuse;
		}

		return NULL;
	}

	more_reuse = __reuseport_alloc(more_socks_size);
	if (!more_reuse)
		return NULL;

	more_reuse->num_socks = reuse->num_socks;
	more_reuse->num_closed_socks = reuse->num_closed_socks;
	more_reuse->prog = reuse->prog;
	more_reuse->reuseport_id = reuse->reuseport_id;
	more_reuse->bind_inany = reuse->bind_inany;
	more_reuse->has_conns = reuse->has_conns;

	memcpy(more_reuse->socks, reuse->socks,
	       reuse->num_socks * sizeof(struct sock *));
	memcpy(more_reuse->socks +
	       (more_reuse->max_socks - more_reuse->num_closed_socks),
	       reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
	       reuse->num_closed_socks * sizeof(struct sock *));
	more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

	for (i = 0; i < reuse->max_socks; ++i)
		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
				   more_reuse);

	/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
	 * that reuse and more_reuse can temporarily share a reference
	 * to prog.
	 */
	kfree_rcu(reuse, rcu);
	return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
	struct sock_reuseport *reuse;

	reuse = container_of(head, struct sock_reuseport, rcu);
	sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
	ida_free(&reuseport_ida, reuse->reuseport_id);
	kfree(reuse);
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 *  May return -ENOMEM and fail to add the socket to the group under
 *  memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
	struct sock_reuseport *old_reuse, *reuse;

	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
		int err = reuseport_alloc(sk2, bind_inany);

		if (err)
			return err;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					      lockdep_is_held(&reuseport_lock));
	if (old_reuse && old_reuse->num_closed_socks) {
		/* sk was shutdown()ed before */
		int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);

		spin_unlock_bh(&reuseport_lock);
		return err;
	}

	if (old_reuse && old_reuse->num_socks != 1) {
		spin_unlock_bh(&reuseport_lock);
		return -EBUSY;
	}

	if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
		reuse = reuseport_grow(reuse);
		if (!reuse) {
			spin_unlock_bh(&reuseport_lock);
			return -ENOMEM;
		}
	}

	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	spin_unlock_bh(&reuseport_lock);

	if (old_reuse)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
	return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);
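
/* Illustrative sketch (not part of this file): a protocol's hash/bind path
 * typically either starts a new group or joins an existing one.  The helper
 * below is hypothetical; the real TCP/UDP call sites live in the protocol
 * hashtable code and differ in detail.
 *
 *	static int proto_reuseport_add(struct sock *sk, struct sock *sk2,
 *				       bool bind_inany)
 *	{
 *		if (sk2)	// another SO_REUSEPORT socket already bound
 *			return reuseport_add_sock(sk, sk2, bind_inany);
 *
 *		// first socket on this port: create the group
 *		return reuseport_alloc(sk, bind_inany);
 *	}
 */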

static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany)
{
	if (old_reuse == reuse) {
		/* If sk was in the same reuseport group, just pop sk out of
		 * the closed section and push sk into the listening section.
		 */
		__reuseport_detach_closed_sock(sk, old_reuse);
		__reuseport_add_sock(sk, old_reuse);
		return 0;
	}

	if (!reuse) {
		/* In bind()/listen() path, we cannot carry over the eBPF prog
		 * for the shutdown()ed socket. In setsockopt() path, we should
		 * not change the eBPF prog of listening sockets by attaching a
		 * prog to the shutdown()ed socket. Thus, we will allocate a new
		 * reuseport group and detach sk from the old group.
		 */
		int id;

		reuse = __reuseport_alloc(INIT_SOCKS);
		if (!reuse)
			return -ENOMEM;

		id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
		if (id < 0) {
			kfree(reuse);
			return id;
		}

		reuse->reuseport_id = id;
		reuse->bind_inany = bind_inany;
	} else {
		/* Move sk from the old group to the new one if
		 * - all the other listeners in the old group were close()d or
		 *   shutdown()ed, and then sk2 has listen()ed on the same port
		 * OR
		 * - sk listen()ed without bind() (or with autobind), was
		 *   shutdown()ed, and then listen()s on another port which
		 *   sk2 listen()s on.
		 */
		if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
			reuse = reuseport_grow(reuse);
			if (!reuse)
				return -ENOMEM;
		}
	}

	__reuseport_detach_closed_sock(sk, old_reuse);
	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);

	return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
	struct sock_reuseport *reuse;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuseport_grow() has detached a closed sk */
	if (!reuse)
		goto out;

	/* Notify the bpf side. The sk may be added to a sockarray
	 * map. If so, sockarray logic will remove it from the map.
	 *
	 * Other bpf map types that work with reuseport, like sockmap,
	 * don't need an explicit callback from here. They override sk
	 * unhash/close ops to remove the sk from the map before we
	 * get to this point.
	 */
	bpf_sk_reuseport_detach(sk);

	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

	if (!__reuseport_detach_closed_sock(sk, reuse))
		__reuseport_detach_sock(sk, reuse);

	if (reuse->num_socks + reuse->num_closed_socks == 0)
		call_rcu(&reuse->rcu, reuseport_free_rcu);

out:
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

void reuseport_stop_listen_sock(struct sock *sk)
{
	if (sk->sk_protocol == IPPROTO_TCP) {
		struct sock_reuseport *reuse;

		spin_lock_bh(&reuseport_lock);

		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
						  lockdep_is_held(&reuseport_lock));

		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) {
			/* Migration capable, move sk from the listening section
			 * to the closed section.
			 */
			bpf_sk_reuseport_detach(sk);

			__reuseport_detach_sock(sk, reuse);
			__reuseport_add_closed_sock(sk, reuse);

			spin_unlock_bh(&reuseport_lock);
			return;
		}

		spin_unlock_bh(&reuseport_lock);
	}

	/* Not capable of migration; detach immediately */
	reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);

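/* Run a classic BPF filter (e.g. one attached via SO_ATTACH_REUSEPORT_CBPF)
 * over the skb with the protocol header temporarily pulled; the program's
 * return value is used as an index into socks[].
 */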
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
				   struct bpf_prog *prog, struct sk_buff *skb,
				   int hdr_len)
{
	struct sk_buff *nskb = NULL;
	u32 index;

	if (skb_shared(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return NULL;
		skb = nskb;
	}

	/* temporarily advance data past protocol header */
	if (!pskb_pull(skb, hdr_len)) {
		kfree_skb(nskb);
		return NULL;
	}
	index = bpf_prog_run_save_cb(prog, skb);
	__skb_push(skb, hdr_len);

	consume_skb(nskb);

	if (index >= socks)
		return NULL;

	return reuse->socks[index];
}

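/* Hash-based fallback: scale the hash into [0, num_socks) and linearly probe
 * past sockets in TCP_ESTABLISHED state (e.g. connected UDP sockets),
 * wrapping around.  Returns NULL if every listening-section socket is
 * established.
 */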
static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
						  u32 hash, u16 num_socks)
{
	int i, j;

	i = j = reciprocal_scale(hash, num_socks);
	while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
		i++;
		if (i >= num_socks)
			i = 0;
		if (i == j)
			return NULL;
	}

	return reuse->socks[i];
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
				   u32 hash,
				   struct sk_buff *skb,
				   int hdr_len)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *prog;
	struct sock *sk2 = NULL;
	u16 socks;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);

	/* if memory allocation failed or add call is not yet complete */
	if (!reuse)
		goto out;

	prog = rcu_dereference(reuse->prog);
	socks = READ_ONCE(reuse->num_socks);
	if (likely(socks)) {
		/* paired with smp_wmb() in __reuseport_add_sock() */
		smp_rmb();

		if (!prog || !skb)
			goto select_by_hash;

		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
		else
			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2)
			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
	}

out:
	rcu_read_unlock();
	return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

/**
 *  reuseport_migrate_sock - Select a migration target from an SO_REUSEPORT group.
 *  @sk: close()ed or shutdown()ed socket in the group.
 *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 *    NEW_SYN_RECV request socket during 3WHS.
 *  @skb: skb to run through BPF filter.
 *  Returns a socket (with sk_refcnt +1) that should accept the child socket
 *  (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb)
{
	struct sock_reuseport *reuse;
	struct sock *nsk = NULL;
	u16 socks;
	u32 hash;

	rcu_read_lock();

	reuse = rcu_dereference(sk->sk_reuseport_cb);
	if (!reuse)
		goto out;

	socks = READ_ONCE(reuse->num_socks);
	if (unlikely(!socks))
		goto out;

	/* paired with smp_wmb() in __reuseport_add_sock() */
	smp_rmb();

	hash = migrating_sk->sk_hash;
	if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

	if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
		nsk = NULL;

out:
	rcu_read_unlock();
	return nsk;
}
EXPORT_SYMBOL(reuseport_migrate_sock);
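
/* Illustrative sketch (not part of this file): request/child migration is
 * gated by the net.ipv4.tcp_migrate_req sysctl checked above.  A userspace
 * helper like the hypothetical one below could enable it before closing a
 * listener, so queued connections move to the remaining listeners.
 *
 *	#include <stdio.h>
 *
 *	static int enable_tcp_migrate_req(void)
 *	{
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_migrate_req", "w");
 *
 *		if (!f)
 *			return -1;
 *		fputs("1\n", f);
 *		return fclose(f);
 *	}
 */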

int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	if (sk_unhashed(sk)) {
		int err;

		if (!sk->sk_reuseport)
			return -EINVAL;

		err = reuseport_alloc(sk, false);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_prog = rcu_dereference_protected(reuse->prog,
					     lockdep_is_held(&reuseport_lock));
	rcu_assign_pointer(reuse->prog, prog);
	spin_unlock_bh(&reuseport_lock);

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
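
/* Illustrative userspace sketch (not part of this file): a classic BPF
 * program attached with SO_ATTACH_REUSEPORT_CBPF reaches this code via
 * setsockopt(), and its return value indexes socks[] in run_bpf_filter().
 * This well-known pattern distributes packets by the CPU that received
 * them; error handling is omitted for brevity.
 *
 *	#include <linux/filter.h>
 *	#include <sys/socket.h>
 *
 *	static void attach_cpu_balancer(int fd)
 *	{
 *		struct sock_filter code[] = {
 *			// A = current CPU id
 *			{ BPF_LD | BPF_W | BPF_ABS, 0, 0,
 *			  SKF_AD_OFF + SKF_AD_CPU },
 *			// return A (used as the socket index)
 *			{ BPF_RET | BPF_A, 0, 0, 0 },
 *		};
 *		struct sock_fprog prog = {
 *			.len = sizeof(code) / sizeof(code[0]),
 *			.filter = code,
 *		};
 *
 *		setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
 *			   &prog, sizeof(prog));
 *	}
 */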

int reuseport_detach_prog(struct sock *sk)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	old_prog = NULL;
	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuse must be checked after acquiring the reuseport_lock
	 * because reuseport_grow() can detach a closed sk.
	 */
	if (!reuse) {
		spin_unlock_bh(&reuseport_lock);
		return sk->sk_reuseport ? -ENOENT : -EINVAL;
	}

	if (sk_unhashed(sk) && reuse->num_closed_socks) {
		spin_unlock_bh(&reuseport_lock);
		return -ENOENT;
	}

	old_prog = rcu_replace_pointer(reuse->prog, old_prog,
				       lockdep_is_held(&reuseport_lock));
	spin_unlock_bh(&reuseport_lock);

	if (!old_prog)
		return -ENOENT;

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);
617