xref: /openbmc/linux/net/xdp/xsk.c (revision 3557b3fd)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets allow a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock.h>
26 #include <net/xdp.h>
27 
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 #include "xsk.h"
31 
/* Upper bound on Tx descriptors handled by one xsk_generic_xmit() call. */
#define TX_BATCH_SIZE 16
33 
34 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
35 {
36 	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
37 		READ_ONCE(xs->umem->fq);
38 }
39 
40 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
41 {
42 	return xskq_peek_addr(umem->fq, addr);
43 }
44 EXPORT_SYMBOL(xsk_umem_peek_addr);
45 
46 void xsk_umem_discard_addr(struct xdp_umem *umem)
47 {
48 	xskq_discard_addr(umem->fq);
49 }
50 EXPORT_SYMBOL(xsk_umem_discard_addr);
51 
52 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
53 {
54 	void *to_buf, *from_buf;
55 	u32 metalen;
56 	u64 addr;
57 	int err;
58 
59 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
60 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
61 		xs->rx_dropped++;
62 		return -ENOSPC;
63 	}
64 
65 	addr += xs->umem->headroom;
66 
67 	if (unlikely(xdp_data_meta_unsupported(xdp))) {
68 		from_buf = xdp->data;
69 		metalen = 0;
70 	} else {
71 		from_buf = xdp->data_meta;
72 		metalen = xdp->data - xdp->data_meta;
73 	}
74 
75 	to_buf = xdp_umem_get_data(xs->umem, addr);
76 	memcpy(to_buf, from_buf, len + metalen);
77 	addr += metalen;
78 	err = xskq_produce_batch_desc(xs->rx, addr, len);
79 	if (!err) {
80 		xskq_discard_addr(xs->umem->fq);
81 		xdp_return_buff(xdp);
82 		return 0;
83 	}
84 
85 	xs->rx_dropped++;
86 	return err;
87 }
88 
89 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
90 {
91 	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
92 
93 	if (err)
94 		xs->rx_dropped++;
95 
96 	return err;
97 }
98 
99 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
100 {
101 	u32 len;
102 
103 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
104 		return -EINVAL;
105 
106 	len = xdp->data_end - xdp->data;
107 
108 	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
109 		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
110 }
111 
112 void xsk_flush(struct xdp_sock *xs)
113 {
114 	xskq_produce_flush_desc(xs->rx);
115 	xs->sk.sk_data_ready(&xs->sk);
116 }
117 
118 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
119 {
120 	u32 metalen = xdp->data - xdp->data_meta;
121 	u32 len = xdp->data_end - xdp->data;
122 	void *buffer;
123 	u64 addr;
124 	int err;
125 
126 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
127 		return -EINVAL;
128 
129 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
130 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
131 		xs->rx_dropped++;
132 		return -ENOSPC;
133 	}
134 
135 	addr += xs->umem->headroom;
136 
137 	buffer = xdp_umem_get_data(xs->umem, addr);
138 	memcpy(buffer, xdp->data_meta, len + metalen);
139 	addr += metalen;
140 	err = xskq_produce_batch_desc(xs->rx, addr, len);
141 	if (!err) {
142 		xskq_discard_addr(xs->umem->fq);
143 		xsk_flush(xs);
144 		return 0;
145 	}
146 
147 	xs->rx_dropped++;
148 	return err;
149 }
150 
151 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
152 {
153 	xskq_produce_flush_addr_n(umem->cq, nb_entries);
154 }
155 EXPORT_SYMBOL(xsk_umem_complete_tx);
156 
157 void xsk_umem_consume_tx_done(struct xdp_umem *umem)
158 {
159 	struct xdp_sock *xs;
160 
161 	rcu_read_lock();
162 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
163 		xs->sk.sk_write_space(&xs->sk);
164 	}
165 	rcu_read_unlock();
166 }
167 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
168 
169 bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
170 {
171 	struct xdp_desc desc;
172 	struct xdp_sock *xs;
173 
174 	rcu_read_lock();
175 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
176 		if (!xskq_peek_desc(xs->tx, &desc))
177 			continue;
178 
179 		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
180 			goto out;
181 
182 		*dma = xdp_umem_get_dma(umem, desc.addr);
183 		*len = desc.len;
184 
185 		xskq_discard_desc(xs->tx);
186 		rcu_read_unlock();
187 		return true;
188 	}
189 
190 out:
191 	rcu_read_unlock();
192 	return false;
193 }
194 EXPORT_SYMBOL(xsk_umem_consume_tx);
195 
196 static int xsk_zc_xmit(struct sock *sk)
197 {
198 	struct xdp_sock *xs = xdp_sk(sk);
199 	struct net_device *dev = xs->dev;
200 
201 	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
202 }
203 
204 static void xsk_destruct_skb(struct sk_buff *skb)
205 {
206 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
207 	struct xdp_sock *xs = xdp_sk(skb->sk);
208 	unsigned long flags;
209 
210 	spin_lock_irqsave(&xs->tx_completion_lock, flags);
211 	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
212 	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
213 
214 	sock_wfree(skb);
215 }
216 
217 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
218 			    size_t total_len)
219 {
220 	u32 max_batch = TX_BATCH_SIZE;
221 	struct xdp_sock *xs = xdp_sk(sk);
222 	bool sent_frame = false;
223 	struct xdp_desc desc;
224 	struct sk_buff *skb;
225 	int err = 0;
226 
227 	mutex_lock(&xs->mutex);
228 
229 	while (xskq_peek_desc(xs->tx, &desc)) {
230 		char *buffer;
231 		u64 addr;
232 		u32 len;
233 
234 		if (max_batch-- == 0) {
235 			err = -EAGAIN;
236 			goto out;
237 		}
238 
239 		if (xskq_reserve_addr(xs->umem->cq))
240 			goto out;
241 
242 		if (xs->queue_id >= xs->dev->real_num_tx_queues)
243 			goto out;
244 
245 		len = desc.len;
246 		skb = sock_alloc_send_skb(sk, len, 1, &err);
247 		if (unlikely(!skb)) {
248 			err = -EAGAIN;
249 			goto out;
250 		}
251 
252 		skb_put(skb, len);
253 		addr = desc.addr;
254 		buffer = xdp_umem_get_data(xs->umem, addr);
255 		err = skb_store_bits(skb, 0, buffer, len);
256 		if (unlikely(err)) {
257 			kfree_skb(skb);
258 			goto out;
259 		}
260 
261 		skb->dev = xs->dev;
262 		skb->priority = sk->sk_priority;
263 		skb->mark = sk->sk_mark;
264 		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
265 		skb->destructor = xsk_destruct_skb;
266 
267 		err = dev_direct_xmit(skb, xs->queue_id);
268 		xskq_discard_desc(xs->tx);
269 		/* Ignore NET_XMIT_CN as packet might have been sent */
270 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
271 			/* SKB completed but not sent */
272 			err = -EBUSY;
273 			goto out;
274 		}
275 
276 		sent_frame = true;
277 	}
278 
279 out:
280 	if (sent_frame)
281 		sk->sk_write_space(sk);
282 
283 	mutex_unlock(&xs->mutex);
284 	return err;
285 }
286 
287 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
288 {
289 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
290 	struct sock *sk = sock->sk;
291 	struct xdp_sock *xs = xdp_sk(sk);
292 
293 	if (unlikely(!xs->dev))
294 		return -ENXIO;
295 	if (unlikely(!(xs->dev->flags & IFF_UP)))
296 		return -ENETDOWN;
297 	if (unlikely(!xs->tx))
298 		return -ENOBUFS;
299 	if (need_wait)
300 		return -EOPNOTSUPP;
301 
302 	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
303 }
304 
305 static unsigned int xsk_poll(struct file *file, struct socket *sock,
306 			     struct poll_table_struct *wait)
307 {
308 	unsigned int mask = datagram_poll(file, sock, wait);
309 	struct sock *sk = sock->sk;
310 	struct xdp_sock *xs = xdp_sk(sk);
311 
312 	if (xs->rx && !xskq_empty_desc(xs->rx))
313 		mask |= POLLIN | POLLRDNORM;
314 	if (xs->tx && !xskq_full_desc(xs->tx))
315 		mask |= POLLOUT | POLLWRNORM;
316 
317 	return mask;
318 }
319 
320 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
321 			  bool umem_queue)
322 {
323 	struct xsk_queue *q;
324 
325 	if (entries == 0 || *queue || !is_power_of_2(entries))
326 		return -EINVAL;
327 
328 	q = xskq_create(entries, umem_queue);
329 	if (!q)
330 		return -ENOMEM;
331 
332 	/* Make sure queue is ready before it can be seen by others */
333 	smp_wmb();
334 	*queue = q;
335 	return 0;
336 }
337 
338 static int xsk_release(struct socket *sock)
339 {
340 	struct sock *sk = sock->sk;
341 	struct xdp_sock *xs = xdp_sk(sk);
342 	struct net *net;
343 
344 	if (!sk)
345 		return 0;
346 
347 	net = sock_net(sk);
348 
349 	mutex_lock(&net->xdp.lock);
350 	sk_del_node_init_rcu(sk);
351 	mutex_unlock(&net->xdp.lock);
352 
353 	local_bh_disable();
354 	sock_prot_inuse_add(net, sk->sk_prot, -1);
355 	local_bh_enable();
356 
357 	if (xs->dev) {
358 		struct net_device *dev = xs->dev;
359 
360 		/* Wait for driver to stop using the xdp socket. */
361 		xdp_del_sk_umem(xs->umem, xs);
362 		xs->dev = NULL;
363 		synchronize_net();
364 		dev_put(dev);
365 	}
366 
367 	xskq_destroy(xs->rx);
368 	xskq_destroy(xs->tx);
369 
370 	sock_orphan(sk);
371 	sock->sk = NULL;
372 
373 	sk_refcnt_debug_release(sk);
374 	sock_put(sk);
375 
376 	return 0;
377 }
378 
379 static struct socket *xsk_lookup_xsk_from_fd(int fd)
380 {
381 	struct socket *sock;
382 	int err;
383 
384 	sock = sockfd_lookup(fd, &err);
385 	if (!sock)
386 		return ERR_PTR(-ENOTSOCK);
387 
388 	if (sock->sk->sk_family != PF_XDP) {
389 		sockfd_put(sock);
390 		return ERR_PTR(-ENOPROTOOPT);
391 	}
392 
393 	return sock;
394 }
395 
396 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
397 {
398 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
399 	struct sock *sk = sock->sk;
400 	struct xdp_sock *xs = xdp_sk(sk);
401 	struct net_device *dev;
402 	u32 flags, qid;
403 	int err = 0;
404 
405 	if (addr_len < sizeof(struct sockaddr_xdp))
406 		return -EINVAL;
407 	if (sxdp->sxdp_family != AF_XDP)
408 		return -EINVAL;
409 
410 	flags = sxdp->sxdp_flags;
411 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
412 		return -EINVAL;
413 
414 	mutex_lock(&xs->mutex);
415 	if (xs->dev) {
416 		err = -EBUSY;
417 		goto out_release;
418 	}
419 
420 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
421 	if (!dev) {
422 		err = -ENODEV;
423 		goto out_release;
424 	}
425 
426 	if (!xs->rx && !xs->tx) {
427 		err = -EINVAL;
428 		goto out_unlock;
429 	}
430 
431 	qid = sxdp->sxdp_queue_id;
432 
433 	if (flags & XDP_SHARED_UMEM) {
434 		struct xdp_sock *umem_xs;
435 		struct socket *sock;
436 
437 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
438 			/* Cannot specify flags for shared sockets. */
439 			err = -EINVAL;
440 			goto out_unlock;
441 		}
442 
443 		if (xs->umem) {
444 			/* We have already our own. */
445 			err = -EINVAL;
446 			goto out_unlock;
447 		}
448 
449 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
450 		if (IS_ERR(sock)) {
451 			err = PTR_ERR(sock);
452 			goto out_unlock;
453 		}
454 
455 		umem_xs = xdp_sk(sock->sk);
456 		if (!umem_xs->umem) {
457 			/* No umem to inherit. */
458 			err = -EBADF;
459 			sockfd_put(sock);
460 			goto out_unlock;
461 		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
462 			err = -EINVAL;
463 			sockfd_put(sock);
464 			goto out_unlock;
465 		}
466 
467 		xdp_get_umem(umem_xs->umem);
468 		xs->umem = umem_xs->umem;
469 		sockfd_put(sock);
470 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
471 		err = -EINVAL;
472 		goto out_unlock;
473 	} else {
474 		/* This xsk has its own umem. */
475 		xskq_set_umem(xs->umem->fq, xs->umem->size,
476 			      xs->umem->chunk_mask);
477 		xskq_set_umem(xs->umem->cq, xs->umem->size,
478 			      xs->umem->chunk_mask);
479 
480 		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
481 		if (err)
482 			goto out_unlock;
483 	}
484 
485 	xs->dev = dev;
486 	xs->zc = xs->umem->zc;
487 	xs->queue_id = qid;
488 	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
489 	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
490 	xdp_add_sk_umem(xs->umem, xs);
491 
492 out_unlock:
493 	if (err)
494 		dev_put(dev);
495 out_release:
496 	mutex_unlock(&xs->mutex);
497 	return err;
498 }
499 
500 static int xsk_setsockopt(struct socket *sock, int level, int optname,
501 			  char __user *optval, unsigned int optlen)
502 {
503 	struct sock *sk = sock->sk;
504 	struct xdp_sock *xs = xdp_sk(sk);
505 	int err;
506 
507 	if (level != SOL_XDP)
508 		return -ENOPROTOOPT;
509 
510 	switch (optname) {
511 	case XDP_RX_RING:
512 	case XDP_TX_RING:
513 	{
514 		struct xsk_queue **q;
515 		int entries;
516 
517 		if (optlen < sizeof(entries))
518 			return -EINVAL;
519 		if (copy_from_user(&entries, optval, sizeof(entries)))
520 			return -EFAULT;
521 
522 		mutex_lock(&xs->mutex);
523 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
524 		err = xsk_init_queue(entries, q, false);
525 		mutex_unlock(&xs->mutex);
526 		return err;
527 	}
528 	case XDP_UMEM_REG:
529 	{
530 		struct xdp_umem_reg mr;
531 		struct xdp_umem *umem;
532 
533 		if (copy_from_user(&mr, optval, sizeof(mr)))
534 			return -EFAULT;
535 
536 		mutex_lock(&xs->mutex);
537 		if (xs->umem) {
538 			mutex_unlock(&xs->mutex);
539 			return -EBUSY;
540 		}
541 
542 		umem = xdp_umem_create(&mr);
543 		if (IS_ERR(umem)) {
544 			mutex_unlock(&xs->mutex);
545 			return PTR_ERR(umem);
546 		}
547 
548 		/* Make sure umem is ready before it can be seen by others */
549 		smp_wmb();
550 		xs->umem = umem;
551 		mutex_unlock(&xs->mutex);
552 		return 0;
553 	}
554 	case XDP_UMEM_FILL_RING:
555 	case XDP_UMEM_COMPLETION_RING:
556 	{
557 		struct xsk_queue **q;
558 		int entries;
559 
560 		if (copy_from_user(&entries, optval, sizeof(entries)))
561 			return -EFAULT;
562 
563 		mutex_lock(&xs->mutex);
564 		if (!xs->umem) {
565 			mutex_unlock(&xs->mutex);
566 			return -EINVAL;
567 		}
568 
569 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
570 			&xs->umem->cq;
571 		err = xsk_init_queue(entries, q, true);
572 		mutex_unlock(&xs->mutex);
573 		return err;
574 	}
575 	default:
576 		break;
577 	}
578 
579 	return -ENOPROTOOPT;
580 }
581 
582 static int xsk_getsockopt(struct socket *sock, int level, int optname,
583 			  char __user *optval, int __user *optlen)
584 {
585 	struct sock *sk = sock->sk;
586 	struct xdp_sock *xs = xdp_sk(sk);
587 	int len;
588 
589 	if (level != SOL_XDP)
590 		return -ENOPROTOOPT;
591 
592 	if (get_user(len, optlen))
593 		return -EFAULT;
594 	if (len < 0)
595 		return -EINVAL;
596 
597 	switch (optname) {
598 	case XDP_STATISTICS:
599 	{
600 		struct xdp_statistics stats;
601 
602 		if (len < sizeof(stats))
603 			return -EINVAL;
604 
605 		mutex_lock(&xs->mutex);
606 		stats.rx_dropped = xs->rx_dropped;
607 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
608 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
609 		mutex_unlock(&xs->mutex);
610 
611 		if (copy_to_user(optval, &stats, sizeof(stats)))
612 			return -EFAULT;
613 		if (put_user(sizeof(stats), optlen))
614 			return -EFAULT;
615 
616 		return 0;
617 	}
618 	case XDP_MMAP_OFFSETS:
619 	{
620 		struct xdp_mmap_offsets off;
621 
622 		if (len < sizeof(off))
623 			return -EINVAL;
624 
625 		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
626 		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
627 		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
628 		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
629 		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
630 		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);
631 
632 		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
633 		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
634 		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
635 		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
636 		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
637 		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);
638 
639 		len = sizeof(off);
640 		if (copy_to_user(optval, &off, len))
641 			return -EFAULT;
642 		if (put_user(len, optlen))
643 			return -EFAULT;
644 
645 		return 0;
646 	}
647 	default:
648 		break;
649 	}
650 
651 	return -EOPNOTSUPP;
652 }
653 
654 static int xsk_mmap(struct file *file, struct socket *sock,
655 		    struct vm_area_struct *vma)
656 {
657 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
658 	unsigned long size = vma->vm_end - vma->vm_start;
659 	struct xdp_sock *xs = xdp_sk(sock->sk);
660 	struct xsk_queue *q = NULL;
661 	struct xdp_umem *umem;
662 	unsigned long pfn;
663 	struct page *qpg;
664 
665 	if (offset == XDP_PGOFF_RX_RING) {
666 		q = READ_ONCE(xs->rx);
667 	} else if (offset == XDP_PGOFF_TX_RING) {
668 		q = READ_ONCE(xs->tx);
669 	} else {
670 		umem = READ_ONCE(xs->umem);
671 		if (!umem)
672 			return -EINVAL;
673 
674 		/* Matches the smp_wmb() in XDP_UMEM_REG */
675 		smp_rmb();
676 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
677 			q = READ_ONCE(umem->fq);
678 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
679 			q = READ_ONCE(umem->cq);
680 	}
681 
682 	if (!q)
683 		return -EINVAL;
684 
685 	/* Matches the smp_wmb() in xsk_init_queue */
686 	smp_rmb();
687 	qpg = virt_to_head_page(q->ring);
688 	if (size > (PAGE_SIZE << compound_order(qpg)))
689 		return -EINVAL;
690 
691 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
692 	return remap_pfn_range(vma, vma->vm_start, pfn,
693 			       size, vma->vm_page_prot);
694 }
695 
/* Protocol descriptor for XDP sockets.  It is passed to sk_alloc() in
 * xsk_create() and registered in xsk_init(); obj_size makes each
 * allocated sock large enough to hold a struct xdp_sock.
 */
static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};
701 
/* Socket-layer operations for PF_XDP sockets, installed on each new
 * socket by xsk_create().  Operations that this file does not implement
 * point at the generic sock_no_*() handlers.
 */
static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};
722 
723 static void xsk_destruct(struct sock *sk)
724 {
725 	struct xdp_sock *xs = xdp_sk(sk);
726 
727 	if (!sock_flag(sk, SOCK_DEAD))
728 		return;
729 
730 	xdp_put_umem(xs->umem);
731 
732 	sk_refcnt_debug_dec(sk);
733 }
734 
735 static int xsk_create(struct net *net, struct socket *sock, int protocol,
736 		      int kern)
737 {
738 	struct sock *sk;
739 	struct xdp_sock *xs;
740 
741 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
742 		return -EPERM;
743 	if (sock->type != SOCK_RAW)
744 		return -ESOCKTNOSUPPORT;
745 
746 	if (protocol)
747 		return -EPROTONOSUPPORT;
748 
749 	sock->state = SS_UNCONNECTED;
750 
751 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
752 	if (!sk)
753 		return -ENOBUFS;
754 
755 	sock->ops = &xsk_proto_ops;
756 
757 	sock_init_data(sock, sk);
758 
759 	sk->sk_family = PF_XDP;
760 
761 	sk->sk_destruct = xsk_destruct;
762 	sk_refcnt_debug_inc(sk);
763 
764 	sock_set_flag(sk, SOCK_RCU_FREE);
765 
766 	xs = xdp_sk(sk);
767 	mutex_init(&xs->mutex);
768 	spin_lock_init(&xs->tx_completion_lock);
769 
770 	mutex_lock(&net->xdp.lock);
771 	sk_add_node_rcu(sk, &net->xdp.list);
772 	mutex_unlock(&net->xdp.lock);
773 
774 	local_bh_disable();
775 	sock_prot_inuse_add(net, &xsk_proto, 1);
776 	local_bh_enable();
777 
778 	return 0;
779 }
780 
/* Address-family descriptor handed to sock_register() in xsk_init(); it
 * names xsk_create() as the create callback for PF_XDP.
 */
static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};
786 
787 static int __net_init xsk_net_init(struct net *net)
788 {
789 	mutex_init(&net->xdp.lock);
790 	INIT_HLIST_HEAD(&net->xdp.list);
791 	return 0;
792 }
793 
794 static void __net_exit xsk_net_exit(struct net *net)
795 {
796 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
797 }
798 
/* Per-network-namespace init/exit hooks, registered in xsk_init(). */
static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};
803 
804 static int __init xsk_init(void)
805 {
806 	int err;
807 
808 	err = proto_register(&xsk_proto, 0 /* no slab */);
809 	if (err)
810 		goto out;
811 
812 	err = sock_register(&xsk_family_ops);
813 	if (err)
814 		goto out_proto;
815 
816 	err = register_pernet_subsys(&xsk_net_ops);
817 	if (err)
818 		goto out_sk;
819 	return 0;
820 
821 out_sk:
822 	sock_unregister(PF_XDP);
823 out_proto:
824 	proto_unregister(&xsk_proto);
825 out:
826 	return err;
827 }
828 
829 fs_initcall(xsk_init);
830