xref: /openbmc/linux/net/xdp/xsk.c (revision 0c874100)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets allow a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock.h>
26 #include <net/xdp.h>
27 
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 
31 #define TX_BATCH_SIZE 16
32 
/* Reinterpret a struct sock pointer as a struct xdp_sock pointer (plain cast). */
static struct xdp_sock *xdp_sk(struct sock *sk)
{
	struct xdp_sock *xs = (struct xdp_sock *)sk;

	return xs;
}
37 
38 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
39 {
40 	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
41 		READ_ONCE(xs->umem->fq);
42 }
43 
44 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
45 {
46 	return xskq_peek_addr(umem->fq, addr);
47 }
48 EXPORT_SYMBOL(xsk_umem_peek_addr);
49 
50 void xsk_umem_discard_addr(struct xdp_umem *umem)
51 {
52 	xskq_discard_addr(umem->fq);
53 }
54 EXPORT_SYMBOL(xsk_umem_discard_addr);
55 
56 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
57 {
58 	void *to_buf, *from_buf;
59 	u32 metalen;
60 	u64 addr;
61 	int err;
62 
63 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
64 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
65 		xs->rx_dropped++;
66 		return -ENOSPC;
67 	}
68 
69 	addr += xs->umem->headroom;
70 
71 	if (unlikely(xdp_data_meta_unsupported(xdp))) {
72 		from_buf = xdp->data;
73 		metalen = 0;
74 	} else {
75 		from_buf = xdp->data_meta;
76 		metalen = xdp->data - xdp->data_meta;
77 	}
78 
79 	to_buf = xdp_umem_get_data(xs->umem, addr);
80 	memcpy(to_buf, from_buf, len + metalen);
81 	addr += metalen;
82 	err = xskq_produce_batch_desc(xs->rx, addr, len);
83 	if (!err) {
84 		xskq_discard_addr(xs->umem->fq);
85 		xdp_return_buff(xdp);
86 		return 0;
87 	}
88 
89 	xs->rx_dropped++;
90 	return err;
91 }
92 
93 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
94 {
95 	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
96 
97 	if (err)
98 		xs->rx_dropped++;
99 
100 	return err;
101 }
102 
103 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
104 {
105 	u32 len;
106 
107 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
108 		return -EINVAL;
109 
110 	len = xdp->data_end - xdp->data;
111 
112 	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
113 		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
114 }
115 
116 void xsk_flush(struct xdp_sock *xs)
117 {
118 	xskq_produce_flush_desc(xs->rx);
119 	xs->sk.sk_data_ready(&xs->sk);
120 }
121 
122 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
123 {
124 	u32 metalen = xdp->data - xdp->data_meta;
125 	u32 len = xdp->data_end - xdp->data;
126 	void *buffer;
127 	u64 addr;
128 	int err;
129 
130 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
131 		return -EINVAL;
132 
133 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
134 	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
135 		xs->rx_dropped++;
136 		return -ENOSPC;
137 	}
138 
139 	addr += xs->umem->headroom;
140 
141 	buffer = xdp_umem_get_data(xs->umem, addr);
142 	memcpy(buffer, xdp->data_meta, len + metalen);
143 	addr += metalen;
144 	err = xskq_produce_batch_desc(xs->rx, addr, len);
145 	if (!err) {
146 		xskq_discard_addr(xs->umem->fq);
147 		xsk_flush(xs);
148 		return 0;
149 	}
150 
151 	xs->rx_dropped++;
152 	return err;
153 }
154 
155 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
156 {
157 	xskq_produce_flush_addr_n(umem->cq, nb_entries);
158 }
159 EXPORT_SYMBOL(xsk_umem_complete_tx);
160 
161 void xsk_umem_consume_tx_done(struct xdp_umem *umem)
162 {
163 	struct xdp_sock *xs;
164 
165 	rcu_read_lock();
166 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
167 		xs->sk.sk_write_space(&xs->sk);
168 	}
169 	rcu_read_unlock();
170 }
171 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
172 
173 bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
174 {
175 	struct xdp_desc desc;
176 	struct xdp_sock *xs;
177 
178 	rcu_read_lock();
179 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
180 		if (!xskq_peek_desc(xs->tx, &desc))
181 			continue;
182 
183 		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
184 			goto out;
185 
186 		*dma = xdp_umem_get_dma(umem, desc.addr);
187 		*len = desc.len;
188 
189 		xskq_discard_desc(xs->tx);
190 		rcu_read_unlock();
191 		return true;
192 	}
193 
194 out:
195 	rcu_read_unlock();
196 	return false;
197 }
198 EXPORT_SYMBOL(xsk_umem_consume_tx);
199 
200 static int xsk_zc_xmit(struct sock *sk)
201 {
202 	struct xdp_sock *xs = xdp_sk(sk);
203 	struct net_device *dev = xs->dev;
204 
205 	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
206 }
207 
208 static void xsk_destruct_skb(struct sk_buff *skb)
209 {
210 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
211 	struct xdp_sock *xs = xdp_sk(skb->sk);
212 	unsigned long flags;
213 
214 	spin_lock_irqsave(&xs->tx_completion_lock, flags);
215 	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
216 	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
217 
218 	sock_wfree(skb);
219 }
220 
221 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
222 			    size_t total_len)
223 {
224 	u32 max_batch = TX_BATCH_SIZE;
225 	struct xdp_sock *xs = xdp_sk(sk);
226 	bool sent_frame = false;
227 	struct xdp_desc desc;
228 	struct sk_buff *skb;
229 	int err = 0;
230 
231 	mutex_lock(&xs->mutex);
232 
233 	while (xskq_peek_desc(xs->tx, &desc)) {
234 		char *buffer;
235 		u64 addr;
236 		u32 len;
237 
238 		if (max_batch-- == 0) {
239 			err = -EAGAIN;
240 			goto out;
241 		}
242 
243 		if (xskq_reserve_addr(xs->umem->cq))
244 			goto out;
245 
246 		if (xs->queue_id >= xs->dev->real_num_tx_queues)
247 			goto out;
248 
249 		len = desc.len;
250 		skb = sock_alloc_send_skb(sk, len, 1, &err);
251 		if (unlikely(!skb)) {
252 			err = -EAGAIN;
253 			goto out;
254 		}
255 
256 		skb_put(skb, len);
257 		addr = desc.addr;
258 		buffer = xdp_umem_get_data(xs->umem, addr);
259 		err = skb_store_bits(skb, 0, buffer, len);
260 		if (unlikely(err)) {
261 			kfree_skb(skb);
262 			goto out;
263 		}
264 
265 		skb->dev = xs->dev;
266 		skb->priority = sk->sk_priority;
267 		skb->mark = sk->sk_mark;
268 		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
269 		skb->destructor = xsk_destruct_skb;
270 
271 		err = dev_direct_xmit(skb, xs->queue_id);
272 		xskq_discard_desc(xs->tx);
273 		/* Ignore NET_XMIT_CN as packet might have been sent */
274 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
275 			/* SKB completed but not sent */
276 			err = -EBUSY;
277 			goto out;
278 		}
279 
280 		sent_frame = true;
281 	}
282 
283 out:
284 	if (sent_frame)
285 		sk->sk_write_space(sk);
286 
287 	mutex_unlock(&xs->mutex);
288 	return err;
289 }
290 
291 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
292 {
293 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
294 	struct sock *sk = sock->sk;
295 	struct xdp_sock *xs = xdp_sk(sk);
296 
297 	if (unlikely(!xs->dev))
298 		return -ENXIO;
299 	if (unlikely(!(xs->dev->flags & IFF_UP)))
300 		return -ENETDOWN;
301 	if (unlikely(!xs->tx))
302 		return -ENOBUFS;
303 	if (need_wait)
304 		return -EOPNOTSUPP;
305 
306 	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
307 }
308 
309 static unsigned int xsk_poll(struct file *file, struct socket *sock,
310 			     struct poll_table_struct *wait)
311 {
312 	unsigned int mask = datagram_poll(file, sock, wait);
313 	struct sock *sk = sock->sk;
314 	struct xdp_sock *xs = xdp_sk(sk);
315 
316 	if (xs->rx && !xskq_empty_desc(xs->rx))
317 		mask |= POLLIN | POLLRDNORM;
318 	if (xs->tx && !xskq_full_desc(xs->tx))
319 		mask |= POLLOUT | POLLWRNORM;
320 
321 	return mask;
322 }
323 
324 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
325 			  bool umem_queue)
326 {
327 	struct xsk_queue *q;
328 
329 	if (entries == 0 || *queue || !is_power_of_2(entries))
330 		return -EINVAL;
331 
332 	q = xskq_create(entries, umem_queue);
333 	if (!q)
334 		return -ENOMEM;
335 
336 	/* Make sure queue is ready before it can be seen by others */
337 	smp_wmb();
338 	*queue = q;
339 	return 0;
340 }
341 
342 static int xsk_release(struct socket *sock)
343 {
344 	struct sock *sk = sock->sk;
345 	struct xdp_sock *xs = xdp_sk(sk);
346 	struct net *net;
347 
348 	if (!sk)
349 		return 0;
350 
351 	net = sock_net(sk);
352 
353 	local_bh_disable();
354 	sock_prot_inuse_add(net, sk->sk_prot, -1);
355 	local_bh_enable();
356 
357 	if (xs->dev) {
358 		struct net_device *dev = xs->dev;
359 
360 		/* Wait for driver to stop using the xdp socket. */
361 		xdp_del_sk_umem(xs->umem, xs);
362 		xs->dev = NULL;
363 		synchronize_net();
364 		dev_put(dev);
365 	}
366 
367 	xskq_destroy(xs->rx);
368 	xskq_destroy(xs->tx);
369 
370 	sock_orphan(sk);
371 	sock->sk = NULL;
372 
373 	sk_refcnt_debug_release(sk);
374 	sock_put(sk);
375 
376 	return 0;
377 }
378 
379 static struct socket *xsk_lookup_xsk_from_fd(int fd)
380 {
381 	struct socket *sock;
382 	int err;
383 
384 	sock = sockfd_lookup(fd, &err);
385 	if (!sock)
386 		return ERR_PTR(-ENOTSOCK);
387 
388 	if (sock->sk->sk_family != PF_XDP) {
389 		sockfd_put(sock);
390 		return ERR_PTR(-ENOPROTOOPT);
391 	}
392 
393 	return sock;
394 }
395 
396 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
397 {
398 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
399 	struct sock *sk = sock->sk;
400 	struct xdp_sock *xs = xdp_sk(sk);
401 	struct net_device *dev;
402 	u32 flags, qid;
403 	int err = 0;
404 
405 	if (addr_len < sizeof(struct sockaddr_xdp))
406 		return -EINVAL;
407 	if (sxdp->sxdp_family != AF_XDP)
408 		return -EINVAL;
409 
410 	mutex_lock(&xs->mutex);
411 	if (xs->dev) {
412 		err = -EBUSY;
413 		goto out_release;
414 	}
415 
416 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
417 	if (!dev) {
418 		err = -ENODEV;
419 		goto out_release;
420 	}
421 
422 	if (!xs->rx && !xs->tx) {
423 		err = -EINVAL;
424 		goto out_unlock;
425 	}
426 
427 	qid = sxdp->sxdp_queue_id;
428 	flags = sxdp->sxdp_flags;
429 
430 	if (flags & XDP_SHARED_UMEM) {
431 		struct xdp_sock *umem_xs;
432 		struct socket *sock;
433 
434 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
435 			/* Cannot specify flags for shared sockets. */
436 			err = -EINVAL;
437 			goto out_unlock;
438 		}
439 
440 		if (xs->umem) {
441 			/* We have already our own. */
442 			err = -EINVAL;
443 			goto out_unlock;
444 		}
445 
446 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
447 		if (IS_ERR(sock)) {
448 			err = PTR_ERR(sock);
449 			goto out_unlock;
450 		}
451 
452 		umem_xs = xdp_sk(sock->sk);
453 		if (!umem_xs->umem) {
454 			/* No umem to inherit. */
455 			err = -EBADF;
456 			sockfd_put(sock);
457 			goto out_unlock;
458 		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
459 			err = -EINVAL;
460 			sockfd_put(sock);
461 			goto out_unlock;
462 		}
463 
464 		xdp_get_umem(umem_xs->umem);
465 		xs->umem = umem_xs->umem;
466 		sockfd_put(sock);
467 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
468 		err = -EINVAL;
469 		goto out_unlock;
470 	} else {
471 		/* This xsk has its own umem. */
472 		xskq_set_umem(xs->umem->fq, xs->umem->size,
473 			      xs->umem->chunk_mask);
474 		xskq_set_umem(xs->umem->cq, xs->umem->size,
475 			      xs->umem->chunk_mask);
476 
477 		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
478 		if (err)
479 			goto out_unlock;
480 	}
481 
482 	xs->dev = dev;
483 	xs->zc = xs->umem->zc;
484 	xs->queue_id = qid;
485 	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
486 	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
487 	xdp_add_sk_umem(xs->umem, xs);
488 
489 out_unlock:
490 	if (err)
491 		dev_put(dev);
492 out_release:
493 	mutex_unlock(&xs->mutex);
494 	return err;
495 }
496 
497 static int xsk_setsockopt(struct socket *sock, int level, int optname,
498 			  char __user *optval, unsigned int optlen)
499 {
500 	struct sock *sk = sock->sk;
501 	struct xdp_sock *xs = xdp_sk(sk);
502 	int err;
503 
504 	if (level != SOL_XDP)
505 		return -ENOPROTOOPT;
506 
507 	switch (optname) {
508 	case XDP_RX_RING:
509 	case XDP_TX_RING:
510 	{
511 		struct xsk_queue **q;
512 		int entries;
513 
514 		if (optlen < sizeof(entries))
515 			return -EINVAL;
516 		if (copy_from_user(&entries, optval, sizeof(entries)))
517 			return -EFAULT;
518 
519 		mutex_lock(&xs->mutex);
520 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
521 		err = xsk_init_queue(entries, q, false);
522 		mutex_unlock(&xs->mutex);
523 		return err;
524 	}
525 	case XDP_UMEM_REG:
526 	{
527 		struct xdp_umem_reg mr;
528 		struct xdp_umem *umem;
529 
530 		if (copy_from_user(&mr, optval, sizeof(mr)))
531 			return -EFAULT;
532 
533 		mutex_lock(&xs->mutex);
534 		if (xs->umem) {
535 			mutex_unlock(&xs->mutex);
536 			return -EBUSY;
537 		}
538 
539 		umem = xdp_umem_create(&mr);
540 		if (IS_ERR(umem)) {
541 			mutex_unlock(&xs->mutex);
542 			return PTR_ERR(umem);
543 		}
544 
545 		/* Make sure umem is ready before it can be seen by others */
546 		smp_wmb();
547 		xs->umem = umem;
548 		mutex_unlock(&xs->mutex);
549 		return 0;
550 	}
551 	case XDP_UMEM_FILL_RING:
552 	case XDP_UMEM_COMPLETION_RING:
553 	{
554 		struct xsk_queue **q;
555 		int entries;
556 
557 		if (copy_from_user(&entries, optval, sizeof(entries)))
558 			return -EFAULT;
559 
560 		mutex_lock(&xs->mutex);
561 		if (!xs->umem) {
562 			mutex_unlock(&xs->mutex);
563 			return -EINVAL;
564 		}
565 
566 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
567 			&xs->umem->cq;
568 		err = xsk_init_queue(entries, q, true);
569 		mutex_unlock(&xs->mutex);
570 		return err;
571 	}
572 	default:
573 		break;
574 	}
575 
576 	return -ENOPROTOOPT;
577 }
578 
579 static int xsk_getsockopt(struct socket *sock, int level, int optname,
580 			  char __user *optval, int __user *optlen)
581 {
582 	struct sock *sk = sock->sk;
583 	struct xdp_sock *xs = xdp_sk(sk);
584 	int len;
585 
586 	if (level != SOL_XDP)
587 		return -ENOPROTOOPT;
588 
589 	if (get_user(len, optlen))
590 		return -EFAULT;
591 	if (len < 0)
592 		return -EINVAL;
593 
594 	switch (optname) {
595 	case XDP_STATISTICS:
596 	{
597 		struct xdp_statistics stats;
598 
599 		if (len < sizeof(stats))
600 			return -EINVAL;
601 
602 		mutex_lock(&xs->mutex);
603 		stats.rx_dropped = xs->rx_dropped;
604 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
605 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
606 		mutex_unlock(&xs->mutex);
607 
608 		if (copy_to_user(optval, &stats, sizeof(stats)))
609 			return -EFAULT;
610 		if (put_user(sizeof(stats), optlen))
611 			return -EFAULT;
612 
613 		return 0;
614 	}
615 	case XDP_MMAP_OFFSETS:
616 	{
617 		struct xdp_mmap_offsets off;
618 
619 		if (len < sizeof(off))
620 			return -EINVAL;
621 
622 		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
623 		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
624 		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
625 		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
626 		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
627 		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);
628 
629 		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
630 		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
631 		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
632 		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
633 		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
634 		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);
635 
636 		len = sizeof(off);
637 		if (copy_to_user(optval, &off, len))
638 			return -EFAULT;
639 		if (put_user(len, optlen))
640 			return -EFAULT;
641 
642 		return 0;
643 	}
644 	default:
645 		break;
646 	}
647 
648 	return -EOPNOTSUPP;
649 }
650 
651 static int xsk_mmap(struct file *file, struct socket *sock,
652 		    struct vm_area_struct *vma)
653 {
654 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
655 	unsigned long size = vma->vm_end - vma->vm_start;
656 	struct xdp_sock *xs = xdp_sk(sock->sk);
657 	struct xsk_queue *q = NULL;
658 	struct xdp_umem *umem;
659 	unsigned long pfn;
660 	struct page *qpg;
661 
662 	if (offset == XDP_PGOFF_RX_RING) {
663 		q = READ_ONCE(xs->rx);
664 	} else if (offset == XDP_PGOFF_TX_RING) {
665 		q = READ_ONCE(xs->tx);
666 	} else {
667 		umem = READ_ONCE(xs->umem);
668 		if (!umem)
669 			return -EINVAL;
670 
671 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
672 			q = READ_ONCE(umem->fq);
673 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
674 			q = READ_ONCE(umem->cq);
675 	}
676 
677 	if (!q)
678 		return -EINVAL;
679 
680 	qpg = virt_to_head_page(q->ring);
681 	if (size > (PAGE_SIZE << compound_order(qpg)))
682 		return -EINVAL;
683 
684 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
685 	return remap_pfn_range(vma, vma->vm_start, pfn,
686 			       size, vma->vm_page_prot);
687 }
688 
689 static struct proto xsk_proto = {
690 	.name =		"XDP",
691 	.owner =	THIS_MODULE,
692 	.obj_size =	sizeof(struct xdp_sock),
693 };
694 
695 static const struct proto_ops xsk_proto_ops = {
696 	.family		= PF_XDP,
697 	.owner		= THIS_MODULE,
698 	.release	= xsk_release,
699 	.bind		= xsk_bind,
700 	.connect	= sock_no_connect,
701 	.socketpair	= sock_no_socketpair,
702 	.accept		= sock_no_accept,
703 	.getname	= sock_no_getname,
704 	.poll		= xsk_poll,
705 	.ioctl		= sock_no_ioctl,
706 	.listen		= sock_no_listen,
707 	.shutdown	= sock_no_shutdown,
708 	.setsockopt	= xsk_setsockopt,
709 	.getsockopt	= xsk_getsockopt,
710 	.sendmsg	= xsk_sendmsg,
711 	.recvmsg	= sock_no_recvmsg,
712 	.mmap		= xsk_mmap,
713 	.sendpage	= sock_no_sendpage,
714 };
715 
716 static void xsk_destruct(struct sock *sk)
717 {
718 	struct xdp_sock *xs = xdp_sk(sk);
719 
720 	if (!sock_flag(sk, SOCK_DEAD))
721 		return;
722 
723 	xdp_put_umem(xs->umem);
724 
725 	sk_refcnt_debug_dec(sk);
726 }
727 
728 static int xsk_create(struct net *net, struct socket *sock, int protocol,
729 		      int kern)
730 {
731 	struct sock *sk;
732 	struct xdp_sock *xs;
733 
734 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
735 		return -EPERM;
736 	if (sock->type != SOCK_RAW)
737 		return -ESOCKTNOSUPPORT;
738 
739 	if (protocol)
740 		return -EPROTONOSUPPORT;
741 
742 	sock->state = SS_UNCONNECTED;
743 
744 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
745 	if (!sk)
746 		return -ENOBUFS;
747 
748 	sock->ops = &xsk_proto_ops;
749 
750 	sock_init_data(sock, sk);
751 
752 	sk->sk_family = PF_XDP;
753 
754 	sk->sk_destruct = xsk_destruct;
755 	sk_refcnt_debug_inc(sk);
756 
757 	sock_set_flag(sk, SOCK_RCU_FREE);
758 
759 	xs = xdp_sk(sk);
760 	mutex_init(&xs->mutex);
761 	spin_lock_init(&xs->tx_completion_lock);
762 
763 	local_bh_disable();
764 	sock_prot_inuse_add(net, &xsk_proto, 1);
765 	local_bh_enable();
766 
767 	return 0;
768 }
769 
770 static const struct net_proto_family xsk_family_ops = {
771 	.family = PF_XDP,
772 	.create = xsk_create,
773 	.owner	= THIS_MODULE,
774 };
775 
776 static int __init xsk_init(void)
777 {
778 	int err;
779 
780 	err = proto_register(&xsk_proto, 0 /* no slab */);
781 	if (err)
782 		goto out;
783 
784 	err = sock_register(&xsk_family_ops);
785 	if (err)
786 		goto out_proto;
787 
788 	return 0;
789 
790 out_proto:
791 	proto_unregister(&xsk_proto);
792 out:
793 	return err;
794 }
795 
796 fs_initcall(xsk_init);
797