// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

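/* A rough sketch of the user-space setup sequence these handlers expect,
 * as implied by the setsockopt, mmap and bind code below (fd, mr and
 * entries are illustrative variable names, not part of the ABI):
 *
 *	fd = socket(AF_XDP, SOCK_RAW, 0);
 *
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries,
 *		   sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &entries, sizeof(entries));
 *
 * All ring sizes must be non-zero powers of two (see xsk_init_queue()).
 * Each ring is then mapped with mmap() using the matching page offset
 * (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING,
 * XDP_UMEM_PGOFF_COMPLETION_RING), and the socket is bound to a
 * device/queue pair with bind() and a struct sockaddr_xdp, optionally
 * sharing another socket's umem via XDP_SHARED_UMEM.
 */
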
#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
	return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return !!xs->rx;
}

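/* Common Rx helper for both the driver and generic XDP paths: take a
 * frame from the umem fill queue, copy the packet into that umem area
 * (honouring the configured headroom) and post a descriptor on the Rx
 * ring.  The fill queue entry is consumed only if the Rx descriptor was
 * produced successfully.
 */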
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 *id, len = xdp->data_end - xdp->data;
	void *buffer;
	int err = 0;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	id = xskq_peek_id(xs->umem->fq);
	if (!id)
		return -ENOSPC;

	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
	memcpy(buffer, xdp->data, len);
	err = xskq_produce_batch_desc(xs->rx, *id, len,
				      xs->umem->frame_headroom);
	if (!err)
		xskq_discard_id(xs->umem->fq);

	return err;
}

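/* xsk_rcv() is used by drivers running XDP natively: descriptors are
 * queued on the Rx ring and only published to user space later, when the
 * driver calls xsk_flush().  xsk_generic_rcv() is the generic (skb) XDP
 * path and flushes and wakes the socket immediately.
 */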
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (likely(!err))
		xdp_return_buff(xdp);
	else
		xs->rx_dropped++;

	return err;
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xsk_flush(xs);
	else
		xs->rx_dropped++;

	return err;
}

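/* skb destructor for frames sent by xsk_generic_xmit(): report the frame
 * as completed on the umem completion ring and release the socket's
 * send-buffer accounting via sock_wfree().
 */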
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);

	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));

	sock_wfree(skb);
}

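/* Copy-mode Tx: dequeue up to TX_BATCH_SIZE descriptors from the Tx ring,
 * copy each frame into a freshly allocated skb and push it out with
 * dev_direct_xmit() on the bound queue.  Completions are reported from the
 * skb destructor once the skb is freed.  Blocking operation (without
 * MSG_DONTWAIT) is not supported here.
 */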
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u32 id, len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_id(xs->umem->cq)) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		if (unlikely(len > xs->dev->mtu)) {
			err = -EMSGSIZE;
			goto out;
		}

		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		id = desc.idx;
		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			err = -EAGAIN;
			/* SKB consumed by dev_direct_xmit() */
			goto out;
		}

		sent_frame = true;
		xskq_discard_desc(xs->tx);
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;

	return xsk_generic_xmit(sk, m, total_len);
}

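/* On top of the usual datagram_poll() events, report readable when the Rx
 * ring has descriptors to consume and writable while the Tx ring still has
 * room for more.
 */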
static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

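/* Allocate one of the descriptor/umem rings.  The number of entries must
 * be a non-zero power of two, and a ring cannot be resized once it has
 * been created.
 */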
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	*queue = q;
	return 0;
}

static void __xsk_release(struct xdp_sock *xs)
{
	/* Wait for driver to stop using the xdp socket. */
	synchronize_net();

	dev_put(xs->dev);
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		__xsk_release(xs);
		xs->dev = NULL;
	}

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

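/* Bind the socket to a <netdev, queue_id> pair.  At least one of the Rx
 * and Tx rings must already be set up.  With XDP_SHARED_UMEM the umem is
 * inherited from an already bound socket on the same device and queue;
 * otherwise the socket must have registered its own umem and created its
 * fill and completion rings (xdp_umem_validate_queues()).  Rebinding to a
 * different device or queue releases the previous binding first.
 */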
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct net_device *dev, *dev_curr;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xdp_umem *old_umem = NULL;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	mutex_lock(&xs->mutex);
	dev_curr = xs->dev;
	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (sxdp->sxdp_queue_id >= dev->num_rx_queues) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev ||
			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		old_umem = xs->umem;
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, &xs->umem->props);
		xskq_set_umem(xs->umem->cq, &xs->umem->props);
	}

	/* Rebind? */
	if (dev_curr && (dev_curr != dev ||
			 xs->queue_id != sxdp->sxdp_queue_id)) {
		__xsk_release(xs);
		if (old_umem)
			xdp_put_umem(old_umem);
	}

	xs->dev = dev;
	xs->queue_id = sxdp->sxdp_queue_id;

	xskq_set_umem(xs->rx, &xs->umem->props);
	xskq_set_umem(xs->tx, &xs->umem->props);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}

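/* Socket options used to configure the socket before bind():
 * XDP_RX_RING/XDP_TX_RING size the descriptor rings, XDP_UMEM_REG
 * registers the user memory area, and XDP_UMEM_FILL_RING/
 * XDP_UMEM_COMPLETION_RING size the umem rings.
 */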
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (xs->umem)
			return -EBUSY;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		err = xdp_umem_create(&umem);
		if (err) {
			mutex_unlock(&xs->mutex);
			return err;
		}

		err = xdp_umem_reg(umem, &mr);
		if (err) {
			kfree(umem);
			mutex_unlock(&xs->mutex);
			return err;
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();

		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (!xs->umem)
			return -EINVAL;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

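/* Map one of the four rings into user space.  The ring is selected by the
 * mmap() page offset, and the requested size may not exceed the ring's
 * allocation.
 */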
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = xs->rx;
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = xs->tx;
	} else {
		if (!xs->umem)
			return -EINVAL;

		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = xs->umem->fq;
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = xs->umem->cq;
	}

	if (!q)
		return -EINVAL;

	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family =	PF_XDP,
	.owner =	THIS_MODULE,
	.release =	xsk_release,
	.bind =		xsk_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	sock_no_getname,
	.poll =		xsk_poll,
	.ioctl =	sock_no_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	xsk_setsockopt,
	.getsockopt =	xsk_getsockopt,
	.sendmsg =	xsk_sendmsg,
	.recvmsg =	sock_no_recvmsg,
	.mmap =		xsk_mmap,
	.sendpage =	sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

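/* Create an AF_XDP socket: requires CAP_NET_RAW, type SOCK_RAW and
 * protocol 0.
 */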
static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);