xref: /openbmc/linux/net/xdp/xsk.c (revision 42bc47b3)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets allows a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock.h>
26 #include <net/xdp.h>
27 
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 
31 #define TX_BATCH_SIZE 16
32 
/* Reinterpret a generic socket pointer as an AF_XDP socket pointer.
 *
 * This is a bare cast with no run-time checking.
 * NOTE(review): it presumably relies on the embedded struct sock being
 * the first member of struct xdp_sock -- confirm against the struct
 * definition.
 */
static struct xdp_sock *xdp_sk(struct sock *sk)
{
	struct xdp_sock *xs = (struct xdp_sock *)sk;

	return xs;
}
37 
38 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
39 {
40 	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
41 		READ_ONCE(xs->umem->fq);
42 }
43 
44 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
45 {
46 	return xskq_peek_addr(umem->fq, addr);
47 }
48 EXPORT_SYMBOL(xsk_umem_peek_addr);
49 
50 void xsk_umem_discard_addr(struct xdp_umem *umem)
51 {
52 	xskq_discard_addr(umem->fq);
53 }
54 EXPORT_SYMBOL(xsk_umem_discard_addr);
55 
56 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
57 {
58 	void *buffer;
59 	u64 addr;
60 	int err;
61 
62 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
63 	    len > xs->umem->chunk_size_nohr) {
64 		xs->rx_dropped++;
65 		return -ENOSPC;
66 	}
67 
68 	addr += xs->umem->headroom;
69 
70 	buffer = xdp_umem_get_data(xs->umem, addr);
71 	memcpy(buffer, xdp->data, len);
72 	err = xskq_produce_batch_desc(xs->rx, addr, len);
73 	if (!err) {
74 		xskq_discard_addr(xs->umem->fq);
75 		xdp_return_buff(xdp);
76 		return 0;
77 	}
78 
79 	xs->rx_dropped++;
80 	return err;
81 }
82 
83 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
84 {
85 	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
86 
87 	if (err) {
88 		xdp_return_buff(xdp);
89 		xs->rx_dropped++;
90 	}
91 
92 	return err;
93 }
94 
95 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
96 {
97 	u32 len;
98 
99 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
100 		return -EINVAL;
101 
102 	len = xdp->data_end - xdp->data;
103 
104 	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
105 		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
106 }
107 
108 void xsk_flush(struct xdp_sock *xs)
109 {
110 	xskq_produce_flush_desc(xs->rx);
111 	xs->sk.sk_data_ready(&xs->sk);
112 }
113 
114 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
115 {
116 	u32 len = xdp->data_end - xdp->data;
117 	void *buffer;
118 	u64 addr;
119 	int err;
120 
121 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
122 	    len > xs->umem->chunk_size_nohr) {
123 		xs->rx_dropped++;
124 		return -ENOSPC;
125 	}
126 
127 	addr += xs->umem->headroom;
128 
129 	buffer = xdp_umem_get_data(xs->umem, addr);
130 	memcpy(buffer, xdp->data, len);
131 	err = xskq_produce_batch_desc(xs->rx, addr, len);
132 	if (!err) {
133 		xskq_discard_addr(xs->umem->fq);
134 		xsk_flush(xs);
135 		return 0;
136 	}
137 
138 	xs->rx_dropped++;
139 	return err;
140 }
141 
142 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
143 {
144 	xskq_produce_flush_addr_n(umem->cq, nb_entries);
145 }
146 EXPORT_SYMBOL(xsk_umem_complete_tx);
147 
148 void xsk_umem_consume_tx_done(struct xdp_umem *umem)
149 {
150 	struct xdp_sock *xs;
151 
152 	rcu_read_lock();
153 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
154 		xs->sk.sk_write_space(&xs->sk);
155 	}
156 	rcu_read_unlock();
157 }
158 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
159 
160 bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
161 {
162 	struct xdp_desc desc;
163 	struct xdp_sock *xs;
164 
165 	rcu_read_lock();
166 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
167 		if (!xskq_peek_desc(xs->tx, &desc))
168 			continue;
169 
170 		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
171 			goto out;
172 
173 		*dma = xdp_umem_get_dma(umem, desc.addr);
174 		*len = desc.len;
175 
176 		xskq_discard_desc(xs->tx);
177 		rcu_read_unlock();
178 		return true;
179 	}
180 
181 out:
182 	rcu_read_unlock();
183 	return false;
184 }
185 EXPORT_SYMBOL(xsk_umem_consume_tx);
186 
187 static int xsk_zc_xmit(struct sock *sk)
188 {
189 	struct xdp_sock *xs = xdp_sk(sk);
190 	struct net_device *dev = xs->dev;
191 
192 	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
193 }
194 
195 static void xsk_destruct_skb(struct sk_buff *skb)
196 {
197 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
198 	struct xdp_sock *xs = xdp_sk(skb->sk);
199 
200 	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
201 
202 	sock_wfree(skb);
203 }
204 
205 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
206 			    size_t total_len)
207 {
208 	u32 max_batch = TX_BATCH_SIZE;
209 	struct xdp_sock *xs = xdp_sk(sk);
210 	bool sent_frame = false;
211 	struct xdp_desc desc;
212 	struct sk_buff *skb;
213 	int err = 0;
214 
215 	if (unlikely(!xs->tx))
216 		return -ENOBUFS;
217 
218 	mutex_lock(&xs->mutex);
219 
220 	while (xskq_peek_desc(xs->tx, &desc)) {
221 		char *buffer;
222 		u64 addr;
223 		u32 len;
224 
225 		if (max_batch-- == 0) {
226 			err = -EAGAIN;
227 			goto out;
228 		}
229 
230 		if (xskq_reserve_addr(xs->umem->cq)) {
231 			err = -EAGAIN;
232 			goto out;
233 		}
234 
235 		len = desc.len;
236 		if (unlikely(len > xs->dev->mtu)) {
237 			err = -EMSGSIZE;
238 			goto out;
239 		}
240 
241 		if (xs->queue_id >= xs->dev->real_num_tx_queues) {
242 			err = -ENXIO;
243 			goto out;
244 		}
245 
246 		skb = sock_alloc_send_skb(sk, len, 1, &err);
247 		if (unlikely(!skb)) {
248 			err = -EAGAIN;
249 			goto out;
250 		}
251 
252 		skb_put(skb, len);
253 		addr = desc.addr;
254 		buffer = xdp_umem_get_data(xs->umem, addr);
255 		err = skb_store_bits(skb, 0, buffer, len);
256 		if (unlikely(err)) {
257 			kfree_skb(skb);
258 			goto out;
259 		}
260 
261 		skb->dev = xs->dev;
262 		skb->priority = sk->sk_priority;
263 		skb->mark = sk->sk_mark;
264 		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
265 		skb->destructor = xsk_destruct_skb;
266 
267 		err = dev_direct_xmit(skb, xs->queue_id);
268 		/* Ignore NET_XMIT_CN as packet might have been sent */
269 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
270 			err = -EAGAIN;
271 			/* SKB consumed by dev_direct_xmit() */
272 			goto out;
273 		}
274 
275 		sent_frame = true;
276 		xskq_discard_desc(xs->tx);
277 	}
278 
279 out:
280 	if (sent_frame)
281 		sk->sk_write_space(sk);
282 
283 	mutex_unlock(&xs->mutex);
284 	return err;
285 }
286 
287 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
288 {
289 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
290 	struct sock *sk = sock->sk;
291 	struct xdp_sock *xs = xdp_sk(sk);
292 
293 	if (unlikely(!xs->dev))
294 		return -ENXIO;
295 	if (unlikely(!(xs->dev->flags & IFF_UP)))
296 		return -ENETDOWN;
297 	if (need_wait)
298 		return -EOPNOTSUPP;
299 
300 	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
301 }
302 
303 static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events)
304 {
305 	__poll_t mask = datagram_poll_mask(sock, events);
306 	struct sock *sk = sock->sk;
307 	struct xdp_sock *xs = xdp_sk(sk);
308 
309 	if (xs->rx && !xskq_empty_desc(xs->rx))
310 		mask |= POLLIN | POLLRDNORM;
311 	if (xs->tx && !xskq_full_desc(xs->tx))
312 		mask |= POLLOUT | POLLWRNORM;
313 
314 	return mask;
315 }
316 
317 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
318 			  bool umem_queue)
319 {
320 	struct xsk_queue *q;
321 
322 	if (entries == 0 || *queue || !is_power_of_2(entries))
323 		return -EINVAL;
324 
325 	q = xskq_create(entries, umem_queue);
326 	if (!q)
327 		return -ENOMEM;
328 
329 	/* Make sure queue is ready before it can be seen by others */
330 	smp_wmb();
331 	*queue = q;
332 	return 0;
333 }
334 
335 static int xsk_release(struct socket *sock)
336 {
337 	struct sock *sk = sock->sk;
338 	struct xdp_sock *xs = xdp_sk(sk);
339 	struct net *net;
340 
341 	if (!sk)
342 		return 0;
343 
344 	net = sock_net(sk);
345 
346 	local_bh_disable();
347 	sock_prot_inuse_add(net, sk->sk_prot, -1);
348 	local_bh_enable();
349 
350 	if (xs->dev) {
351 		/* Wait for driver to stop using the xdp socket. */
352 		synchronize_net();
353 		dev_put(xs->dev);
354 		xs->dev = NULL;
355 	}
356 
357 	sock_orphan(sk);
358 	sock->sk = NULL;
359 
360 	sk_refcnt_debug_release(sk);
361 	sock_put(sk);
362 
363 	return 0;
364 }
365 
366 static struct socket *xsk_lookup_xsk_from_fd(int fd)
367 {
368 	struct socket *sock;
369 	int err;
370 
371 	sock = sockfd_lookup(fd, &err);
372 	if (!sock)
373 		return ERR_PTR(-ENOTSOCK);
374 
375 	if (sock->sk->sk_family != PF_XDP) {
376 		sockfd_put(sock);
377 		return ERR_PTR(-ENOPROTOOPT);
378 	}
379 
380 	return sock;
381 }
382 
383 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
384 {
385 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
386 	struct sock *sk = sock->sk;
387 	struct xdp_sock *xs = xdp_sk(sk);
388 	struct net_device *dev;
389 	u32 flags, qid;
390 	int err = 0;
391 
392 	if (addr_len < sizeof(struct sockaddr_xdp))
393 		return -EINVAL;
394 	if (sxdp->sxdp_family != AF_XDP)
395 		return -EINVAL;
396 
397 	mutex_lock(&xs->mutex);
398 	if (xs->dev) {
399 		err = -EBUSY;
400 		goto out_release;
401 	}
402 
403 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
404 	if (!dev) {
405 		err = -ENODEV;
406 		goto out_release;
407 	}
408 
409 	if (!xs->rx && !xs->tx) {
410 		err = -EINVAL;
411 		goto out_unlock;
412 	}
413 
414 	qid = sxdp->sxdp_queue_id;
415 
416 	if ((xs->rx && qid >= dev->real_num_rx_queues) ||
417 	    (xs->tx && qid >= dev->real_num_tx_queues)) {
418 		err = -EINVAL;
419 		goto out_unlock;
420 	}
421 
422 	flags = sxdp->sxdp_flags;
423 
424 	if (flags & XDP_SHARED_UMEM) {
425 		struct xdp_sock *umem_xs;
426 		struct socket *sock;
427 
428 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
429 			/* Cannot specify flags for shared sockets. */
430 			err = -EINVAL;
431 			goto out_unlock;
432 		}
433 
434 		if (xs->umem) {
435 			/* We have already our own. */
436 			err = -EINVAL;
437 			goto out_unlock;
438 		}
439 
440 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
441 		if (IS_ERR(sock)) {
442 			err = PTR_ERR(sock);
443 			goto out_unlock;
444 		}
445 
446 		umem_xs = xdp_sk(sock->sk);
447 		if (!umem_xs->umem) {
448 			/* No umem to inherit. */
449 			err = -EBADF;
450 			sockfd_put(sock);
451 			goto out_unlock;
452 		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
453 			err = -EINVAL;
454 			sockfd_put(sock);
455 			goto out_unlock;
456 		}
457 
458 		xdp_get_umem(umem_xs->umem);
459 		xs->umem = umem_xs->umem;
460 		sockfd_put(sock);
461 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
462 		err = -EINVAL;
463 		goto out_unlock;
464 	} else {
465 		/* This xsk has its own umem. */
466 		xskq_set_umem(xs->umem->fq, &xs->umem->props);
467 		xskq_set_umem(xs->umem->cq, &xs->umem->props);
468 
469 		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
470 		if (err)
471 			goto out_unlock;
472 	}
473 
474 	xs->dev = dev;
475 	xs->zc = xs->umem->zc;
476 	xs->queue_id = qid;
477 	xskq_set_umem(xs->rx, &xs->umem->props);
478 	xskq_set_umem(xs->tx, &xs->umem->props);
479 	xdp_add_sk_umem(xs->umem, xs);
480 
481 out_unlock:
482 	if (err)
483 		dev_put(dev);
484 out_release:
485 	mutex_unlock(&xs->mutex);
486 	return err;
487 }
488 
489 static int xsk_setsockopt(struct socket *sock, int level, int optname,
490 			  char __user *optval, unsigned int optlen)
491 {
492 	struct sock *sk = sock->sk;
493 	struct xdp_sock *xs = xdp_sk(sk);
494 	int err;
495 
496 	if (level != SOL_XDP)
497 		return -ENOPROTOOPT;
498 
499 	switch (optname) {
500 	case XDP_RX_RING:
501 	case XDP_TX_RING:
502 	{
503 		struct xsk_queue **q;
504 		int entries;
505 
506 		if (optlen < sizeof(entries))
507 			return -EINVAL;
508 		if (copy_from_user(&entries, optval, sizeof(entries)))
509 			return -EFAULT;
510 
511 		mutex_lock(&xs->mutex);
512 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
513 		err = xsk_init_queue(entries, q, false);
514 		mutex_unlock(&xs->mutex);
515 		return err;
516 	}
517 	case XDP_UMEM_REG:
518 	{
519 		struct xdp_umem_reg mr;
520 		struct xdp_umem *umem;
521 
522 		if (copy_from_user(&mr, optval, sizeof(mr)))
523 			return -EFAULT;
524 
525 		mutex_lock(&xs->mutex);
526 		if (xs->umem) {
527 			mutex_unlock(&xs->mutex);
528 			return -EBUSY;
529 		}
530 
531 		umem = xdp_umem_create(&mr);
532 		if (IS_ERR(umem)) {
533 			mutex_unlock(&xs->mutex);
534 			return PTR_ERR(umem);
535 		}
536 
537 		/* Make sure umem is ready before it can be seen by others */
538 		smp_wmb();
539 		xs->umem = umem;
540 		mutex_unlock(&xs->mutex);
541 		return 0;
542 	}
543 	case XDP_UMEM_FILL_RING:
544 	case XDP_UMEM_COMPLETION_RING:
545 	{
546 		struct xsk_queue **q;
547 		int entries;
548 
549 		if (copy_from_user(&entries, optval, sizeof(entries)))
550 			return -EFAULT;
551 
552 		mutex_lock(&xs->mutex);
553 		if (!xs->umem) {
554 			mutex_unlock(&xs->mutex);
555 			return -EINVAL;
556 		}
557 
558 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
559 			&xs->umem->cq;
560 		err = xsk_init_queue(entries, q, true);
561 		mutex_unlock(&xs->mutex);
562 		return err;
563 	}
564 	default:
565 		break;
566 	}
567 
568 	return -ENOPROTOOPT;
569 }
570 
571 static int xsk_getsockopt(struct socket *sock, int level, int optname,
572 			  char __user *optval, int __user *optlen)
573 {
574 	struct sock *sk = sock->sk;
575 	struct xdp_sock *xs = xdp_sk(sk);
576 	int len;
577 
578 	if (level != SOL_XDP)
579 		return -ENOPROTOOPT;
580 
581 	if (get_user(len, optlen))
582 		return -EFAULT;
583 	if (len < 0)
584 		return -EINVAL;
585 
586 	switch (optname) {
587 	case XDP_STATISTICS:
588 	{
589 		struct xdp_statistics stats;
590 
591 		if (len < sizeof(stats))
592 			return -EINVAL;
593 
594 		mutex_lock(&xs->mutex);
595 		stats.rx_dropped = xs->rx_dropped;
596 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
597 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
598 		mutex_unlock(&xs->mutex);
599 
600 		if (copy_to_user(optval, &stats, sizeof(stats)))
601 			return -EFAULT;
602 		if (put_user(sizeof(stats), optlen))
603 			return -EFAULT;
604 
605 		return 0;
606 	}
607 	case XDP_MMAP_OFFSETS:
608 	{
609 		struct xdp_mmap_offsets off;
610 
611 		if (len < sizeof(off))
612 			return -EINVAL;
613 
614 		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
615 		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
616 		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
617 		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
618 		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
619 		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);
620 
621 		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
622 		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
623 		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
624 		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
625 		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
626 		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);
627 
628 		len = sizeof(off);
629 		if (copy_to_user(optval, &off, len))
630 			return -EFAULT;
631 		if (put_user(len, optlen))
632 			return -EFAULT;
633 
634 		return 0;
635 	}
636 	default:
637 		break;
638 	}
639 
640 	return -EOPNOTSUPP;
641 }
642 
643 static int xsk_mmap(struct file *file, struct socket *sock,
644 		    struct vm_area_struct *vma)
645 {
646 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
647 	unsigned long size = vma->vm_end - vma->vm_start;
648 	struct xdp_sock *xs = xdp_sk(sock->sk);
649 	struct xsk_queue *q = NULL;
650 	struct xdp_umem *umem;
651 	unsigned long pfn;
652 	struct page *qpg;
653 
654 	if (offset == XDP_PGOFF_RX_RING) {
655 		q = READ_ONCE(xs->rx);
656 	} else if (offset == XDP_PGOFF_TX_RING) {
657 		q = READ_ONCE(xs->tx);
658 	} else {
659 		umem = READ_ONCE(xs->umem);
660 		if (!umem)
661 			return -EINVAL;
662 
663 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
664 			q = READ_ONCE(umem->fq);
665 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
666 			q = READ_ONCE(umem->cq);
667 	}
668 
669 	if (!q)
670 		return -EINVAL;
671 
672 	qpg = virt_to_head_page(q->ring);
673 	if (size > (PAGE_SIZE << compound_order(qpg)))
674 		return -EINVAL;
675 
676 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
677 	return remap_pfn_range(vma, vma->vm_start, pfn,
678 			       size, vma->vm_page_prot);
679 }
680 
681 static struct proto xsk_proto = {
682 	.name =		"XDP",
683 	.owner =	THIS_MODULE,
684 	.obj_size =	sizeof(struct xdp_sock),
685 };
686 
687 static const struct proto_ops xsk_proto_ops = {
688 	.family		= PF_XDP,
689 	.owner		= THIS_MODULE,
690 	.release	= xsk_release,
691 	.bind		= xsk_bind,
692 	.connect	= sock_no_connect,
693 	.socketpair	= sock_no_socketpair,
694 	.accept		= sock_no_accept,
695 	.getname	= sock_no_getname,
696 	.poll_mask	= xsk_poll_mask,
697 	.ioctl		= sock_no_ioctl,
698 	.listen		= sock_no_listen,
699 	.shutdown	= sock_no_shutdown,
700 	.setsockopt	= xsk_setsockopt,
701 	.getsockopt	= xsk_getsockopt,
702 	.sendmsg	= xsk_sendmsg,
703 	.recvmsg	= sock_no_recvmsg,
704 	.mmap		= xsk_mmap,
705 	.sendpage	= sock_no_sendpage,
706 };
707 
708 static void xsk_destruct(struct sock *sk)
709 {
710 	struct xdp_sock *xs = xdp_sk(sk);
711 
712 	if (!sock_flag(sk, SOCK_DEAD))
713 		return;
714 
715 	xskq_destroy(xs->rx);
716 	xskq_destroy(xs->tx);
717 	xdp_del_sk_umem(xs->umem, xs);
718 	xdp_put_umem(xs->umem);
719 
720 	sk_refcnt_debug_dec(sk);
721 }
722 
723 static int xsk_create(struct net *net, struct socket *sock, int protocol,
724 		      int kern)
725 {
726 	struct sock *sk;
727 	struct xdp_sock *xs;
728 
729 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
730 		return -EPERM;
731 	if (sock->type != SOCK_RAW)
732 		return -ESOCKTNOSUPPORT;
733 
734 	if (protocol)
735 		return -EPROTONOSUPPORT;
736 
737 	sock->state = SS_UNCONNECTED;
738 
739 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
740 	if (!sk)
741 		return -ENOBUFS;
742 
743 	sock->ops = &xsk_proto_ops;
744 
745 	sock_init_data(sock, sk);
746 
747 	sk->sk_family = PF_XDP;
748 
749 	sk->sk_destruct = xsk_destruct;
750 	sk_refcnt_debug_inc(sk);
751 
752 	xs = xdp_sk(sk);
753 	mutex_init(&xs->mutex);
754 
755 	local_bh_disable();
756 	sock_prot_inuse_add(net, &xsk_proto, 1);
757 	local_bh_enable();
758 
759 	return 0;
760 }
761 
762 static const struct net_proto_family xsk_family_ops = {
763 	.family = PF_XDP,
764 	.create = xsk_create,
765 	.owner	= THIS_MODULE,
766 };
767 
768 static int __init xsk_init(void)
769 {
770 	int err;
771 
772 	err = proto_register(&xsk_proto, 0 /* no slab */);
773 	if (err)
774 		goto out;
775 
776 	err = sock_register(&xsk_family_ops);
777 	if (err)
778 		goto out_proto;
779 
780 	return 0;
781 
782 out_proto:
783 	proto_unregister(&xsk_proto);
784 out:
785 	return err;
786 }
787 
788 fs_initcall(xsk_init);
789