xref: /openbmc/linux/net/xdp/xsk.c (revision 77ab8d5d)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets provide a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
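/* Rough outline of how userspace is expected to drive an AF_XDP socket
 * (simplified; the arrows name the handlers implemented in this file):
 *
 *   fd = socket(AF_XDP, SOCK_RAW, 0);                      -> xsk_create()
 *   setsockopt(fd, SOL_XDP, XDP_UMEM_REG, ...);            -> xsk_setsockopt()
 *   setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, ...);      -> xsk_setsockopt()
 *   setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, ...);
 *   setsockopt(fd, SOL_XDP, XDP_RX_RING / XDP_TX_RING, ...);
 *   getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, ...);        -> xsk_getsockopt()
 *   mmap(..., fd, XDP_PGOFF_RX_RING etc.);                 -> xsk_mmap()
 *   bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));      -> xsk_bind()
 *   sendmsg(fd, &msg, MSG_DONTWAIT);                       -> xsk_sendmsg()
 */
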
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <net/xdp_sock.h>
25 #include <net/xdp.h>
26 
27 #include "xsk_queue.h"
28 #include "xdp_umem.h"
29 
30 #define TX_BATCH_SIZE 16
31 
32 static struct xdp_sock *xdp_sk(struct sock *sk)
33 {
34 	return (struct xdp_sock *)sk;
35 }
36 
37 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
38 {
39 	return !!xs->rx;
40 }
41 
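/* Common receive helper: copy the packet into a umem frame supplied by
 * userspace on the fill queue and produce a descriptor for it on the Rx
 * ring. Returns -EINVAL if the frame arrived on a device/queue other than
 * the one the socket is bound to, and -ENOSPC if the fill queue is empty.
 */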
42 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
43 {
44 	u32 *id, len = xdp->data_end - xdp->data;
45 	void *buffer;
46 	int err = 0;
47 
48 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
49 		return -EINVAL;
50 
51 	id = xskq_peek_id(xs->umem->fq);
52 	if (!id)
53 		return -ENOSPC;
54 
55 	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
56 	memcpy(buffer, xdp->data, len);
57 	err = xskq_produce_batch_desc(xs->rx, *id, len,
58 				      xs->umem->frame_headroom);
59 	if (!err)
60 		xskq_discard_id(xs->umem->fq);
61 
62 	return err;
63 }
64 
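/* Receive entry point for redirects from the driver's XDP path: on success
 * the data has been copied out, so the xdp_buff is recycled with
 * xdp_return_buff(); failures are accounted in rx_dropped. The descriptor
 * only becomes visible to userspace once xsk_flush() runs.
 */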
65 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
66 {
67 	int err;
68 
69 	err = __xsk_rcv(xs, xdp);
70 	if (likely(!err))
71 		xdp_return_buff(xdp);
72 	else
73 		xs->rx_dropped++;
74 
75 	return err;
76 }
77 
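/* Publish all Rx descriptors produced so far and wake up any process
 * waiting on the socket.
 */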
78 void xsk_flush(struct xdp_sock *xs)
79 {
80 	xskq_produce_flush_desc(xs->rx);
81 	xs->sk.sk_data_ready(&xs->sk);
82 }
83 
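/* Receive entry point for the generic (skb) XDP path: copy the packet and
 * publish it to userspace immediately, as there is no later flush point.
 */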
84 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
85 {
86 	int err;
87 
88 	err = __xsk_rcv(xs, xdp);
89 	if (!err)
90 		xsk_flush(xs);
91 	else
92 		xs->rx_dropped++;
93 
94 	return err;
95 }
96 
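/* skb destructor for copy-mode Tx: when the skb is released, post the
 * originating umem frame id on the completion ring so userspace can reuse
 * the frame, then do the normal socket write-space accounting.
 */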
97 static void xsk_destruct_skb(struct sk_buff *skb)
98 {
99 	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
100 	struct xdp_sock *xs = xdp_sk(skb->sk);
101 
102 	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));
103 
104 	sock_wfree(skb);
105 }
106 
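/* Copy-mode transmit: consume up to TX_BATCH_SIZE descriptors from the Tx
 * ring, reserve a completion ring entry for each, copy the frame data into
 * a freshly allocated skb and send it with dev_direct_xmit() on the queue
 * the socket is bound to. Only the non-blocking case (MSG_DONTWAIT) is
 * supported.
 */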
107 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
108 			    size_t total_len)
109 {
110 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
111 	u32 max_batch = TX_BATCH_SIZE;
112 	struct xdp_sock *xs = xdp_sk(sk);
113 	bool sent_frame = false;
114 	struct xdp_desc desc;
115 	struct sk_buff *skb;
116 	int err = 0;
117 
118 	if (unlikely(!xs->tx))
119 		return -ENOBUFS;
120 	if (need_wait)
121 		return -EOPNOTSUPP;
122 
123 	mutex_lock(&xs->mutex);
124 
125 	while (xskq_peek_desc(xs->tx, &desc)) {
126 		char *buffer;
127 		u32 id, len;
128 
129 		if (max_batch-- == 0) {
130 			err = -EAGAIN;
131 			goto out;
132 		}
133 
134 		if (xskq_reserve_id(xs->umem->cq)) {
135 			err = -EAGAIN;
136 			goto out;
137 		}
138 
139 		len = desc.len;
140 		if (unlikely(len > xs->dev->mtu)) {
141 			err = -EMSGSIZE;
142 			goto out;
143 		}
144 
145 		if (xs->queue_id >= xs->dev->real_num_tx_queues) {
146 			err = -ENXIO;
147 			goto out;
148 		}
149 
150 		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
151 		if (unlikely(!skb)) {
152 			err = -EAGAIN;
153 			goto out;
154 		}
155 
156 		skb_put(skb, len);
157 		id = desc.idx;
158 		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
159 		err = skb_store_bits(skb, 0, buffer, len);
160 		if (unlikely(err)) {
161 			kfree_skb(skb);
162 			goto out;
163 		}
164 
165 		skb->dev = xs->dev;
166 		skb->priority = sk->sk_priority;
167 		skb->mark = sk->sk_mark;
168 		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
169 		skb->destructor = xsk_destruct_skb;
170 
171 		err = dev_direct_xmit(skb, xs->queue_id);
172 		/* Ignore NET_XMIT_CN as packet might have been sent */
173 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
174 			err = -EAGAIN;
175 			/* SKB consumed by dev_direct_xmit() */
176 			goto out;
177 		}
178 
179 		sent_frame = true;
180 		xskq_discard_desc(xs->tx);
181 	}
182 
183 out:
184 	if (sent_frame)
185 		sk->sk_write_space(sk);
186 
187 	mutex_unlock(&xs->mutex);
188 	return err;
189 }
190 
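/* sendmsg() entry point: the socket must be bound to a device that is up;
 * all transmission goes through the copy path above.
 */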
191 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
192 {
193 	struct sock *sk = sock->sk;
194 	struct xdp_sock *xs = xdp_sk(sk);
195 
196 	if (unlikely(!xs->dev))
197 		return -ENXIO;
198 	if (unlikely(!(xs->dev->flags & IFF_UP)))
199 		return -ENETDOWN;
200 
201 	return xsk_generic_xmit(sk, m, total_len);
202 }
203 
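/* poll() support: readable while the Rx ring holds descriptors, writable
 * while the Tx ring has room, on top of the generic datagram_poll() state.
 */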
204 static unsigned int xsk_poll(struct file *file, struct socket *sock,
205 			     struct poll_table_struct *wait)
206 {
207 	unsigned int mask = datagram_poll(file, sock, wait);
208 	struct sock *sk = sock->sk;
209 	struct xdp_sock *xs = xdp_sk(sk);
210 
211 	if (xs->rx && !xskq_empty_desc(xs->rx))
212 		mask |= POLLIN | POLLRDNORM;
213 	if (xs->tx && !xskq_full_desc(xs->tx))
214 		mask |= POLLOUT | POLLWRNORM;
215 
216 	return mask;
217 }
218 
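/* Allocate one of the rings. The requested size must be a non-zero power
 * of two and the ring must not already exist; umem_queue selects the
 * id-only layout used by the fill and completion rings.
 */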
219 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
220 			  bool umem_queue)
221 {
222 	struct xsk_queue *q;
223 
224 	if (entries == 0 || *queue || !is_power_of_2(entries))
225 		return -EINVAL;
226 
227 	q = xskq_create(entries, umem_queue);
228 	if (!q)
229 		return -ENOMEM;
230 
231 	/* Make sure queue is ready before it can be seen by others */
232 	smp_wmb();
233 	*queue = q;
234 	return 0;
235 }
236 
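/* Release the socket: wait for the datapath to stop using it
 * (synchronize_net()) before dropping the device reference, then detach
 * the sock. The rings and the umem are freed later, in xsk_destruct().
 */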
237 static int xsk_release(struct socket *sock)
238 {
239 	struct sock *sk = sock->sk;
240 	struct xdp_sock *xs = xdp_sk(sk);
241 	struct net *net;
242 
243 	if (!sk)
244 		return 0;
245 
246 	net = sock_net(sk);
247 
248 	local_bh_disable();
249 	sock_prot_inuse_add(net, sk->sk_prot, -1);
250 	local_bh_enable();
251 
252 	if (xs->dev) {
253 		/* Wait for driver to stop using the xdp socket. */
254 		synchronize_net();
255 		dev_put(xs->dev);
256 		xs->dev = NULL;
257 	}
258 
259 	sock_orphan(sk);
260 	sock->sk = NULL;
261 
262 	sk_refcnt_debug_release(sk);
263 	sock_put(sk);
264 
265 	return 0;
266 }
267 
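/* Resolve a user-supplied file descriptor to an AF_XDP socket; used by
 * xsk_bind() when XDP_SHARED_UMEM is requested.
 */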
268 static struct socket *xsk_lookup_xsk_from_fd(int fd)
269 {
270 	struct socket *sock;
271 	int err;
272 
273 	sock = sockfd_lookup(fd, &err);
274 	if (!sock)
275 		return ERR_PTR(-ENOTSOCK);
276 
277 	if (sock->sk->sk_family != PF_XDP) {
278 		sockfd_put(sock);
279 		return ERR_PTR(-ENOPROTOOPT);
280 	}
281 
282 	return sock;
283 }
284 
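/* Bind the socket to a <device, queue id> pair. At least one of the Rx and
 * Tx rings must have been created first. With XDP_SHARED_UMEM, the umem is
 * inherited from another AF_XDP socket already bound to the same device
 * and queue; otherwise the socket's own umem must be registered and pass
 * xdp_umem_validate_queues().
 */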
285 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
286 {
287 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
288 	struct sock *sk = sock->sk;
289 	struct xdp_sock *xs = xdp_sk(sk);
290 	struct net_device *dev;
291 	int err = 0;
292 
293 	if (addr_len < sizeof(struct sockaddr_xdp))
294 		return -EINVAL;
295 	if (sxdp->sxdp_family != AF_XDP)
296 		return -EINVAL;
297 
298 	mutex_lock(&xs->mutex);
299 	if (xs->dev) {
300 		err = -EBUSY;
301 		goto out_release;
302 	}
303 
304 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
305 	if (!dev) {
306 		err = -ENODEV;
307 		goto out_release;
308 	}
309 
310 	if (!xs->rx && !xs->tx) {
311 		err = -EINVAL;
312 		goto out_unlock;
313 	}
314 
315 	if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
316 	    (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
317 		err = -EINVAL;
318 		goto out_unlock;
319 	}
320 
321 	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
322 		struct xdp_sock *umem_xs;
323 		struct socket *sock;
324 
325 		if (xs->umem) {
326 			/* We already have our own. */
327 			err = -EINVAL;
328 			goto out_unlock;
329 		}
330 
331 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
332 		if (IS_ERR(sock)) {
333 			err = PTR_ERR(sock);
334 			goto out_unlock;
335 		}
336 
337 		umem_xs = xdp_sk(sock->sk);
338 		if (!umem_xs->umem) {
339 			/* No umem to inherit. */
340 			err = -EBADF;
341 			sockfd_put(sock);
342 			goto out_unlock;
343 		} else if (umem_xs->dev != dev ||
344 			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
345 			err = -EINVAL;
346 			sockfd_put(sock);
347 			goto out_unlock;
348 		}
349 
350 		xdp_get_umem(umem_xs->umem);
351 		xs->umem = umem_xs->umem;
352 		sockfd_put(sock);
353 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
354 		err = -EINVAL;
355 		goto out_unlock;
356 	} else {
357 		/* This xsk has its own umem. */
358 		xskq_set_umem(xs->umem->fq, &xs->umem->props);
359 		xskq_set_umem(xs->umem->cq, &xs->umem->props);
360 	}
361 
362 	xs->dev = dev;
363 	xs->queue_id = sxdp->sxdp_queue_id;
364 
365 	xskq_set_umem(xs->rx, &xs->umem->props);
366 	xskq_set_umem(xs->tx, &xs->umem->props);
367 
368 out_unlock:
369 	if (err)
370 		dev_put(dev);
371 out_release:
372 	mutex_unlock(&xs->mutex);
373 	return err;
374 }
375 
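/* Socket options writable by userspace: create the Rx/Tx descriptor rings,
 * register the umem, and create the umem fill and completion rings. Each of
 * these can only be set up once per socket.
 */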
376 static int xsk_setsockopt(struct socket *sock, int level, int optname,
377 			  char __user *optval, unsigned int optlen)
378 {
379 	struct sock *sk = sock->sk;
380 	struct xdp_sock *xs = xdp_sk(sk);
381 	int err;
382 
383 	if (level != SOL_XDP)
384 		return -ENOPROTOOPT;
385 
386 	switch (optname) {
387 	case XDP_RX_RING:
388 	case XDP_TX_RING:
389 	{
390 		struct xsk_queue **q;
391 		int entries;
392 
393 		if (optlen < sizeof(entries))
394 			return -EINVAL;
395 		if (copy_from_user(&entries, optval, sizeof(entries)))
396 			return -EFAULT;
397 
398 		mutex_lock(&xs->mutex);
399 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
400 		err = xsk_init_queue(entries, q, false);
401 		mutex_unlock(&xs->mutex);
402 		return err;
403 	}
404 	case XDP_UMEM_REG:
405 	{
406 		struct xdp_umem_reg mr;
407 		struct xdp_umem *umem;
408 
409 		if (copy_from_user(&mr, optval, sizeof(mr)))
410 			return -EFAULT;
411 
412 		mutex_lock(&xs->mutex);
413 		if (xs->umem) {
414 			mutex_unlock(&xs->mutex);
415 			return -EBUSY;
416 		}
417 
418 		umem = xdp_umem_create(&mr);
419 		if (IS_ERR(umem)) {
420 			mutex_unlock(&xs->mutex);
421 			return PTR_ERR(umem);
422 		}
423 
424 		/* Make sure umem is ready before it can be seen by others */
425 		smp_wmb();
426 		xs->umem = umem;
427 		mutex_unlock(&xs->mutex);
428 		return 0;
429 	}
430 	case XDP_UMEM_FILL_RING:
431 	case XDP_UMEM_COMPLETION_RING:
432 	{
433 		struct xsk_queue **q;
434 		int entries;
435 
436 		if (copy_from_user(&entries, optval, sizeof(entries)))
437 			return -EFAULT;
438 
439 		mutex_lock(&xs->mutex);
440 		if (!xs->umem) {
441 			mutex_unlock(&xs->mutex);
442 			return -EINVAL;
443 		}
444 
445 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
446 			&xs->umem->cq;
447 		err = xsk_init_queue(entries, q, true);
448 		mutex_unlock(&xs->mutex);
449 		return err;
450 	}
451 	default:
452 		break;
453 	}
454 
455 	return -ENOPROTOOPT;
456 }
457 
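/* Socket options readable by userspace: drop/invalid-descriptor statistics
 * and the producer/consumer/descriptor offsets needed to use the mmap()ed
 * rings.
 */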
458 static int xsk_getsockopt(struct socket *sock, int level, int optname,
459 			  char __user *optval, int __user *optlen)
460 {
461 	struct sock *sk = sock->sk;
462 	struct xdp_sock *xs = xdp_sk(sk);
463 	int len;
464 
465 	if (level != SOL_XDP)
466 		return -ENOPROTOOPT;
467 
468 	if (get_user(len, optlen))
469 		return -EFAULT;
470 	if (len < 0)
471 		return -EINVAL;
472 
473 	switch (optname) {
474 	case XDP_STATISTICS:
475 	{
476 		struct xdp_statistics stats;
477 
478 		if (len < sizeof(stats))
479 			return -EINVAL;
480 
481 		mutex_lock(&xs->mutex);
482 		stats.rx_dropped = xs->rx_dropped;
483 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
484 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
485 		mutex_unlock(&xs->mutex);
486 
487 		if (copy_to_user(optval, &stats, sizeof(stats)))
488 			return -EFAULT;
489 		if (put_user(sizeof(stats), optlen))
490 			return -EFAULT;
491 
492 		return 0;
493 	}
494 	case XDP_MMAP_OFFSETS:
495 	{
496 		struct xdp_mmap_offsets off;
497 
498 		if (len < sizeof(off))
499 			return -EINVAL;
500 
501 		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
502 		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
503 		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
504 		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
505 		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
506 		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);
507 
508 		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
509 		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
510 		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
511 		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
512 		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
513 		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);
514 
515 		len = sizeof(off);
516 		if (copy_to_user(optval, &off, len))
517 			return -EFAULT;
518 		if (put_user(len, optlen))
519 			return -EFAULT;
520 
521 		return 0;
522 	}
523 	default:
524 		break;
525 	}
526 
527 	return -EOPNOTSUPP;
528 }
529 
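/* mmap() one of the four rings into userspace. The page offset selects the
 * ring; the fill and completion rings additionally require that a umem has
 * been registered. The requested size must fit within the ring allocation.
 */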
530 static int xsk_mmap(struct file *file, struct socket *sock,
531 		    struct vm_area_struct *vma)
532 {
533 	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
534 	unsigned long size = vma->vm_end - vma->vm_start;
535 	struct xdp_sock *xs = xdp_sk(sock->sk);
536 	struct xsk_queue *q = NULL;
537 	struct xdp_umem *umem;
538 	unsigned long pfn;
539 	struct page *qpg;
540 
541 	if (offset == XDP_PGOFF_RX_RING) {
542 		q = READ_ONCE(xs->rx);
543 	} else if (offset == XDP_PGOFF_TX_RING) {
544 		q = READ_ONCE(xs->tx);
545 	} else {
546 		umem = READ_ONCE(xs->umem);
547 		if (!umem)
548 			return -EINVAL;
549 
550 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
551 			q = READ_ONCE(umem->fq);
552 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
553 			q = READ_ONCE(umem->cq);
554 	}
555 
556 	if (!q)
557 		return -EINVAL;
558 
559 	qpg = virt_to_head_page(q->ring);
560 	if (size > (PAGE_SIZE << compound_order(qpg)))
561 		return -EINVAL;
562 
563 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
564 	return remap_pfn_range(vma, vma->vm_start, pfn,
565 			       size, vma->vm_page_prot);
566 }
567 
568 static struct proto xsk_proto = {
569 	.name =		"XDP",
570 	.owner =	THIS_MODULE,
571 	.obj_size =	sizeof(struct xdp_sock),
572 };
573 
574 static const struct proto_ops xsk_proto_ops = {
575 	.family		= PF_XDP,
576 	.owner		= THIS_MODULE,
577 	.release	= xsk_release,
578 	.bind		= xsk_bind,
579 	.connect	= sock_no_connect,
580 	.socketpair	= sock_no_socketpair,
581 	.accept		= sock_no_accept,
582 	.getname	= sock_no_getname,
583 	.poll		= xsk_poll,
584 	.ioctl		= sock_no_ioctl,
585 	.listen		= sock_no_listen,
586 	.shutdown	= sock_no_shutdown,
587 	.setsockopt	= xsk_setsockopt,
588 	.getsockopt	= xsk_getsockopt,
589 	.sendmsg	= xsk_sendmsg,
590 	.recvmsg	= sock_no_recvmsg,
591 	.mmap		= xsk_mmap,
592 	.sendpage	= sock_no_sendpage,
593 };
594 
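/* sk_destruct callback, run once the socket is dead and the last reference
 * is dropped: free the Rx and Tx rings and drop the umem reference.
 */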
595 static void xsk_destruct(struct sock *sk)
596 {
597 	struct xdp_sock *xs = xdp_sk(sk);
598 
599 	if (!sock_flag(sk, SOCK_DEAD))
600 		return;
601 
602 	xskq_destroy(xs->rx);
603 	xskq_destroy(xs->tx);
604 	xdp_put_umem(xs->umem);
605 
606 	sk_refcnt_debug_dec(sk);
607 }
608 
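/* socket(AF_XDP, SOCK_RAW, 0) handler: requires CAP_NET_RAW in the owning
 * user namespace; no other socket type or protocol is accepted.
 */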
609 static int xsk_create(struct net *net, struct socket *sock, int protocol,
610 		      int kern)
611 {
612 	struct sock *sk;
613 	struct xdp_sock *xs;
614 
615 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
616 		return -EPERM;
617 	if (sock->type != SOCK_RAW)
618 		return -ESOCKTNOSUPPORT;
619 
620 	if (protocol)
621 		return -EPROTONOSUPPORT;
622 
623 	sock->state = SS_UNCONNECTED;
624 
625 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
626 	if (!sk)
627 		return -ENOBUFS;
628 
629 	sock->ops = &xsk_proto_ops;
630 
631 	sock_init_data(sock, sk);
632 
633 	sk->sk_family = PF_XDP;
634 
635 	sk->sk_destruct = xsk_destruct;
636 	sk_refcnt_debug_inc(sk);
637 
638 	xs = xdp_sk(sk);
639 	mutex_init(&xs->mutex);
640 
641 	local_bh_disable();
642 	sock_prot_inuse_add(net, &xsk_proto, 1);
643 	local_bh_enable();
644 
645 	return 0;
646 }
647 
648 static const struct net_proto_family xsk_family_ops = {
649 	.family = PF_XDP,
650 	.create = xsk_create,
651 	.owner	= THIS_MODULE,
652 };
653 
654 static int __init xsk_init(void)
655 {
656 	int err;
657 
658 	err = proto_register(&xsk_proto, 0 /* no slab */);
659 	if (err)
660 		goto out;
661 
662 	err = sock_register(&xsk_family_ops);
663 	if (err)
664 		goto out_proto;
665 
666 	return 0;
667 
668 out_proto:
669 	proto_unregister(&xsk_proto);
670 out:
671 	return err;
672 }
673 
674 fs_initcall(xsk_init);
675