xref: /openbmc/linux/net/xdp/xsk.c (revision d9fd5a71)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets allow a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock_drv.h>
26 #include <net/busy_poll.h>
27 #include <net/xdp.h>
28 
29 #include "xsk_queue.h"
30 #include "xdp_umem.h"
31 #include "xsk.h"
32 
33 #define TX_BATCH_SIZE 16
34 
35 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
36 
37 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
38 {
39 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
40 		return;
41 
42 	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
43 	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
44 }
45 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
46 
47 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
48 {
49 	struct xdp_sock *xs;
50 
51 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
52 		return;
53 
54 	rcu_read_lock();
55 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
56 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
57 	}
58 	rcu_read_unlock();
59 
60 	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
61 }
62 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
63 
64 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
65 {
66 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
67 		return;
68 
69 	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
70 	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
71 }
72 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
73 
74 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
75 {
76 	struct xdp_sock *xs;
77 
78 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
79 		return;
80 
81 	rcu_read_lock();
82 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
83 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
84 	}
85 	rcu_read_unlock();
86 
87 	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
88 }
89 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
90 
91 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
92 {
93 	return pool->uses_need_wakeup;
94 }
95 EXPORT_SYMBOL(xsk_uses_need_wakeup);
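
/* Illustrative user-space sketch (not part of this file): when a socket is
 * bound with XDP_USE_NEED_WAKEUP, the application is expected to test
 * XDP_RING_NEED_WAKEUP in the fill/Tx ring flags words (located via
 * XDP_MMAP_OFFSETS) before entering the kernel. "xsk_fd" is an assumed,
 * already bound AF_XDP socket and "fill_flags"/"tx_flags" are assumed
 * pointers into the mmapped rings.
 */
#include <poll.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static void kick_kernel_if_needed(int xsk_fd, const __u32 *fill_flags,
				  const __u32 *tx_flags)
{
	/* Rx side: the driver ran out of fill ring entries; any syscall
	 * that enters the kernel (poll/recvmsg) lets it pick up the
	 * buffers the application has since produced.
	 */
	if (*fill_flags & XDP_RING_NEED_WAKEUP) {
		struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };

		poll(&pfd, 1, 0);
	}

	/* Tx side: tell the kernel there are descriptors to send. */
	if (*tx_flags & XDP_RING_NEED_WAKEUP)
		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
}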
96 
97 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
98 					    u16 queue_id)
99 {
100 	if (queue_id < dev->real_num_rx_queues)
101 		return dev->_rx[queue_id].pool;
102 	if (queue_id < dev->real_num_tx_queues)
103 		return dev->_tx[queue_id].pool;
104 
105 	return NULL;
106 }
107 EXPORT_SYMBOL(xsk_get_pool_from_qid);
108 
109 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
110 {
111 	if (queue_id < dev->num_rx_queues)
112 		dev->_rx[queue_id].pool = NULL;
113 	if (queue_id < dev->num_tx_queues)
114 		dev->_tx[queue_id].pool = NULL;
115 }
116 
117 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
118  * not know if the device has more tx queues than rx, or the opposite.
119  * This might also change during run time.
120  */
121 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
122 			u16 queue_id)
123 {
124 	if (queue_id >= max_t(unsigned int,
125 			      dev->real_num_rx_queues,
126 			      dev->real_num_tx_queues))
127 		return -EINVAL;
128 
129 	if (queue_id < dev->real_num_rx_queues)
130 		dev->_rx[queue_id].pool = pool;
131 	if (queue_id < dev->real_num_tx_queues)
132 		dev->_tx[queue_id].pool = pool;
133 
134 	return 0;
135 }
136 
137 void xp_release(struct xdp_buff_xsk *xskb)
138 {
139 	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
140 }
141 
142 static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
143 {
144 	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
145 
146 	offset += xskb->pool->headroom;
147 	if (!xskb->pool->unaligned)
148 		return xskb->orig_addr + offset;
149 	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
150 }
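
/* Illustrative user-space sketch (not part of this file): in unaligned
 * chunk mode the handle produced above packs the offset into the upper
 * bits, so an Rx consumer splits a descriptor address with the uapi
 * mask/shift before touching the frame; in aligned mode the address is
 * used as-is. "umem_area" is an assumed pointer to the start of the
 * registered umem and the helper name is hypothetical.
 */
#include <linux/if_xdp.h>

static void *frame_from_desc(void *umem_area, const struct xdp_desc *desc)
{
	__u64 base = desc->addr & XSK_UNALIGNED_BUF_ADDR_MASK;
	__u64 offset = desc->addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;

	return (char *)umem_area + base + offset;
}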
151 
152 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
153 {
154 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
155 	u64 addr;
156 	int err;
157 
158 	addr = xp_get_handle(xskb);
159 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
160 	if (err) {
161 		xs->rx_queue_full++;
162 		return err;
163 	}
164 
165 	xp_release(xskb);
166 	return 0;
167 }
168 
169 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
170 {
171 	void *from_buf, *to_buf;
172 	u32 metalen;
173 
174 	if (unlikely(xdp_data_meta_unsupported(from))) {
175 		from_buf = from->data;
176 		to_buf = to->data;
177 		metalen = 0;
178 	} else {
179 		from_buf = from->data_meta;
180 		metalen = from->data - from->data_meta;
181 		to_buf = to->data - metalen;
182 	}
183 
184 	memcpy(to_buf, from_buf, len + metalen);
185 }
186 
187 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
188 {
189 	struct xdp_buff *xsk_xdp;
190 	int err;
191 	u32 len;
192 
193 	len = xdp->data_end - xdp->data;
194 	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
195 		xs->rx_dropped++;
196 		return -ENOSPC;
197 	}
198 
199 	xsk_xdp = xsk_buff_alloc(xs->pool);
200 	if (!xsk_xdp) {
201 		xs->rx_dropped++;
202 		return -ENOSPC;
203 	}
204 
205 	xsk_copy_xdp(xsk_xdp, xdp, len);
206 	err = __xsk_rcv_zc(xs, xsk_xdp, len);
207 	if (err) {
208 		xsk_buff_free(xsk_xdp);
209 		return err;
210 	}
211 	return 0;
212 }
213 
214 static bool xsk_tx_writeable(struct xdp_sock *xs)
215 {
216 	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
217 		return false;
218 
219 	return true;
220 }
221 
222 static bool xsk_is_bound(struct xdp_sock *xs)
223 {
224 	if (READ_ONCE(xs->state) == XSK_BOUND) {
225 		/* Matches smp_wmb() in bind(). */
226 		smp_rmb();
227 		return true;
228 	}
229 	return false;
230 }
231 
232 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
233 {
234 	if (!xsk_is_bound(xs))
235 		return -EINVAL;
236 
237 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
238 		return -EINVAL;
239 
240 	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
241 	return 0;
242 }
243 
244 static void xsk_flush(struct xdp_sock *xs)
245 {
246 	xskq_prod_submit(xs->rx);
247 	__xskq_cons_release(xs->pool->fq);
248 	sock_def_readable(&xs->sk);
249 }
250 
251 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
252 {
253 	int err;
254 
255 	spin_lock_bh(&xs->rx_lock);
256 	err = xsk_rcv_check(xs, xdp);
257 	if (!err) {
258 		err = __xsk_rcv(xs, xdp);
259 		xsk_flush(xs);
260 	}
261 	spin_unlock_bh(&xs->rx_lock);
262 	return err;
263 }
264 
265 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
266 {
267 	int err;
268 	u32 len;
269 
270 	err = xsk_rcv_check(xs, xdp);
271 	if (err)
272 		return err;
273 
274 	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
275 		len = xdp->data_end - xdp->data;
276 		return __xsk_rcv_zc(xs, xdp, len);
277 	}
278 
279 	err = __xsk_rcv(xs, xdp);
280 	if (!err)
281 		xdp_return_buff(xdp);
282 	return err;
283 }
284 
285 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
286 {
287 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
288 	int err;
289 
290 	err = xsk_rcv(xs, xdp);
291 	if (err)
292 		return err;
293 
294 	if (!xs->flush_node.prev)
295 		list_add(&xs->flush_node, flush_list);
296 
297 	return 0;
298 }
299 
300 void __xsk_map_flush(void)
301 {
302 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
303 	struct xdp_sock *xs, *tmp;
304 
305 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
306 		xsk_flush(xs);
307 		__list_del_clearprev(&xs->flush_node);
308 	}
309 }
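
/* Illustrative BPF-side sketch (not part of this file): packets reach
 * __xsk_map_redirect()/__xsk_map_flush() when an XDP program redirects
 * into a BPF_MAP_TYPE_XSKMAP, typically keyed by the Rx queue index.
 * A minimal program, assuming libbpf's section conventions:
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} xsks_map SEC(".maps");

SEC("xdp")
int xsk_redirect_prog(struct xdp_md *ctx)
{
	/* Fall back to XDP_PASS if no socket is bound to this queue. */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
}

char _license[] SEC("license") = "GPL";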
310 
311 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
312 {
313 	xskq_prod_submit_n(pool->cq, nb_entries);
314 }
315 EXPORT_SYMBOL(xsk_tx_completed);
316 
317 void xsk_tx_release(struct xsk_buff_pool *pool)
318 {
319 	struct xdp_sock *xs;
320 
321 	rcu_read_lock();
322 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
323 		__xskq_cons_release(xs->tx);
324 		if (xsk_tx_writeable(xs))
325 			xs->sk.sk_write_space(&xs->sk);
326 	}
327 	rcu_read_unlock();
328 }
329 EXPORT_SYMBOL(xsk_tx_release);
330 
331 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
332 {
333 	struct xdp_sock *xs;
334 
335 	rcu_read_lock();
336 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
337 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
338 			xs->tx->queue_empty_descs++;
339 			continue;
340 		}
341 
342 		/* This is the backpressure mechanism for the Tx path.
343 		 * Reserve space in the completion queue and only proceed
344 		 * if there is space in it. This avoids having to implement
345 		 * any buffering in the Tx path.
346 		 */
347 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
348 			goto out;
349 
350 		xskq_cons_release(xs->tx);
351 		rcu_read_unlock();
352 		return true;
353 	}
354 
355 out:
356 	rcu_read_unlock();
357 	return false;
358 }
359 EXPORT_SYMBOL(xsk_tx_peek_desc);
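
/* Illustrative driver-side sketch (not part of this file): a zero-copy
 * driver typically pairs xsk_tx_peek_desc()/xsk_tx_release() in its
 * transmit path and reports completions with xsk_tx_completed().
 * "my_hw_ring_put", "my_hw_ring_doorbell" and "budget" are hypothetical
 * placeholders for the hardware-specific parts.
 */
extern void my_hw_ring_put(dma_addr_t dma, u32 len);	/* hypothetical */
extern void my_hw_ring_doorbell(void);			/* hypothetical */

static void my_drv_xsk_xmit(struct xsk_buff_pool *pool, unsigned int budget)
{
	struct xdp_desc desc;
	unsigned int sent = 0;

	while (sent < budget && xsk_tx_peek_desc(pool, &desc)) {
		dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc.addr);

		xsk_buff_raw_dma_sync_for_device(pool, dma, desc.len);
		my_hw_ring_put(dma, desc.len);
		sent++;
	}

	if (sent) {
		my_hw_ring_doorbell();
		xsk_tx_release(pool);
	}
	/* Later, from the Tx completion path: xsk_tx_completed(pool, done); */
}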
360 
361 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
362 					u32 max_entries)
363 {
364 	u32 nb_pkts = 0;
365 
366 	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
367 		nb_pkts++;
368 
369 	xsk_tx_release(pool);
370 	return nb_pkts;
371 }
372 
373 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
374 				   u32 max_entries)
375 {
376 	struct xdp_sock *xs;
377 	u32 nb_pkts;
378 
379 	rcu_read_lock();
380 	if (!list_is_singular(&pool->xsk_tx_list)) {
381 		/* Fallback to the non-batched version */
382 		rcu_read_unlock();
383 		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
384 	}
385 
386 	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
387 	if (!xs) {
388 		nb_pkts = 0;
389 		goto out;
390 	}
391 
392 	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
393 	if (!nb_pkts) {
394 		xs->tx->queue_empty_descs++;
395 		goto out;
396 	}
397 
398 	/* This is the backpressure mechanism for the Tx path. Try to
399 	 * reserve space in the completion queue for all packets, but
400 	 * if there are fewer slots available, just process that many
401 	 * packets. This avoids having to implement any buffering in
402 	 * the Tx path.
403 	 */
404 	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
405 	if (!nb_pkts)
406 		goto out;
407 
408 	xskq_cons_release_n(xs->tx, nb_pkts);
409 	__xskq_cons_release(xs->tx);
410 	xs->sk.sk_write_space(&xs->sk);
411 
412 out:
413 	rcu_read_unlock();
414 	return nb_pkts;
415 }
416 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
417 
418 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
419 {
420 	struct net_device *dev = xs->dev;
421 	int err;
422 
423 	rcu_read_lock();
424 	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
425 	rcu_read_unlock();
426 
427 	return err;
428 }
429 
430 static int xsk_zc_xmit(struct xdp_sock *xs)
431 {
432 	return xsk_wakeup(xs, XDP_WAKEUP_TX);
433 }
434 
435 static void xsk_destruct_skb(struct sk_buff *skb)
436 {
437 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
438 	struct xdp_sock *xs = xdp_sk(skb->sk);
439 	unsigned long flags;
440 
441 	spin_lock_irqsave(&xs->pool->cq_lock, flags);
442 	xskq_prod_submit_addr(xs->pool->cq, addr);
443 	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
444 
445 	sock_wfree(skb);
446 }
447 
448 static int xsk_generic_xmit(struct sock *sk)
449 {
450 	struct xdp_sock *xs = xdp_sk(sk);
451 	u32 max_batch = TX_BATCH_SIZE;
452 	bool sent_frame = false;
453 	struct xdp_desc desc;
454 	struct sk_buff *skb;
455 	unsigned long flags;
456 	int err = 0;
457 
458 	mutex_lock(&xs->mutex);
459 
460 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
461 		goto out;
462 
463 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
464 		char *buffer;
465 		u64 addr;
466 		u32 len;
467 
468 		if (max_batch-- == 0) {
469 			err = -EAGAIN;
470 			goto out;
471 		}
472 
473 		len = desc.len;
474 		skb = sock_alloc_send_skb(sk, len, 1, &err);
475 		if (unlikely(!skb))
476 			goto out;
477 
478 		skb_put(skb, len);
479 		addr = desc.addr;
480 		buffer = xsk_buff_raw_get_data(xs->pool, addr);
481 		err = skb_store_bits(skb, 0, buffer, len);
482 		/* This is the backpressure mechanism for the Tx path.
483 		 * Reserve space in the completion queue and only proceed
484 		 * if there is space in it. This avoids having to implement
485 		 * any buffering in the Tx path.
486 		 */
487 		spin_lock_irqsave(&xs->pool->cq_lock, flags);
488 		if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
489 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
490 			kfree_skb(skb);
491 			goto out;
492 		}
493 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
494 
495 		skb->dev = xs->dev;
496 		skb->priority = sk->sk_priority;
497 		skb->mark = sk->sk_mark;
498 		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
499 		skb->destructor = xsk_destruct_skb;
500 
501 		err = __dev_direct_xmit(skb, xs->queue_id);
502 		if (err == NETDEV_TX_BUSY) {
503 			/* Tell user-space to retry the send */
504 			skb->destructor = sock_wfree;
505 			spin_lock_irqsave(&xs->pool->cq_lock, flags);
506 			xskq_prod_cancel(xs->pool->cq);
507 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
508 			/* Free skb without triggering the perf drop trace */
509 			consume_skb(skb);
510 			err = -EAGAIN;
511 			goto out;
512 		}
513 
514 		xskq_cons_release(xs->tx);
515 		/* Ignore NET_XMIT_CN as packet might have been sent */
516 		if (err == NET_XMIT_DROP) {
517 			/* SKB completed but not sent */
518 			err = -EBUSY;
519 			goto out;
520 		}
521 
522 		sent_frame = true;
523 	}
524 
525 	xs->tx->queue_empty_descs++;
526 
527 out:
528 	if (sent_frame)
529 		if (xsk_tx_writeable(xs))
530 			sk->sk_write_space(sk);
531 
532 	mutex_unlock(&xs->mutex);
533 	return err;
534 }
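
/* Illustrative user-space sketch (not part of this file): the copy-mode
 * path above is driven from sendmsg(), so a producer writes a descriptor
 * into the mmapped Tx ring and then kicks the kernel. "tx_ring_view" is
 * an assumed wrapper around the pointers derived from XDP_MMAP_OFFSETS.
 */
#include <sys/socket.h>
#include <linux/if_xdp.h>

struct tx_ring_view {			/* assumed, filled in after mmap() */
	__u32 *producer;
	__u32 *consumer;
	struct xdp_desc *descs;
	__u32 size;			/* number of entries, power of two */
};

static int tx_one(int xsk_fd, struct tx_ring_view *tx, __u64 addr, __u32 len)
{
	__u32 prod = *tx->producer;
	__u32 cons = __atomic_load_n(tx->consumer, __ATOMIC_ACQUIRE);

	if (prod - cons == tx->size)
		return -1;		/* ring full */

	tx->descs[prod & (tx->size - 1)] = (struct xdp_desc){
		.addr = addr,
		.len = len,
	};
	/* Publish the descriptor before bumping the producer index. */
	__atomic_store_n(tx->producer, prod + 1, __ATOMIC_RELEASE);

	/* Enters xsk_sendmsg() -> xsk_generic_xmit() in copy mode. */
	return sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) < 0 ? -1 : 0;
}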
535 
536 static int __xsk_sendmsg(struct sock *sk)
537 {
538 	struct xdp_sock *xs = xdp_sk(sk);
539 
540 	if (unlikely(!(xs->dev->flags & IFF_UP)))
541 		return -ENETDOWN;
542 	if (unlikely(!xs->tx))
543 		return -ENOBUFS;
544 
545 	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
546 }
547 
548 static bool xsk_no_wakeup(struct sock *sk)
549 {
550 #ifdef CONFIG_NET_RX_BUSY_POLL
551 	/* Prefer busy-polling, skip the wakeup. */
552 	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
553 		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
554 #else
555 	return false;
556 #endif
557 }
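
/* Illustrative user-space sketch (not part of this file): the check above
 * only skips the wakeup once the application has opted in to preferred
 * busy polling on the socket, roughly as below. SO_PREFER_BUSY_POLL and
 * SO_BUSY_POLL_BUDGET need reasonably recent kernel/libc headers.
 */
#include <sys/socket.h>

static int enable_preferred_busy_poll(int xsk_fd)
{
	int one = 1, usecs = 20, budget = 64;

	if (setsockopt(xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
		       &one, sizeof(one)))
		return -1;
	if (setsockopt(xsk_fd, SOL_SOCKET, SO_BUSY_POLL,
		       &usecs, sizeof(usecs)))
		return -1;
	return setsockopt(xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
			  &budget, sizeof(budget));
}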
558 
559 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
560 {
561 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
562 	struct sock *sk = sock->sk;
563 	struct xdp_sock *xs = xdp_sk(sk);
564 	struct xsk_buff_pool *pool;
565 
566 	if (unlikely(!xsk_is_bound(xs)))
567 		return -ENXIO;
568 	if (unlikely(need_wait))
569 		return -EOPNOTSUPP;
570 
571 	if (sk_can_busy_loop(sk))
572 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
573 
574 	if (xsk_no_wakeup(sk))
575 		return 0;
576 
577 	pool = xs->pool;
578 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
579 		return __xsk_sendmsg(sk);
580 	return 0;
581 }
582 
583 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
584 {
585 	bool need_wait = !(flags & MSG_DONTWAIT);
586 	struct sock *sk = sock->sk;
587 	struct xdp_sock *xs = xdp_sk(sk);
588 
589 	if (unlikely(!xsk_is_bound(xs)))
590 		return -ENXIO;
591 	if (unlikely(!(xs->dev->flags & IFF_UP)))
592 		return -ENETDOWN;
593 	if (unlikely(!xs->rx))
594 		return -ENOBUFS;
595 	if (unlikely(need_wait))
596 		return -EOPNOTSUPP;
597 
598 	if (sk_can_busy_loop(sk))
599 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
600 
601 	if (xsk_no_wakeup(sk))
602 		return 0;
603 
604 	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
605 		return xsk_wakeup(xs, XDP_WAKEUP_RX);
606 	return 0;
607 }
608 
609 static __poll_t xsk_poll(struct file *file, struct socket *sock,
610 			     struct poll_table_struct *wait)
611 {
612 	__poll_t mask = 0;
613 	struct sock *sk = sock->sk;
614 	struct xdp_sock *xs = xdp_sk(sk);
615 	struct xsk_buff_pool *pool;
616 
617 	sock_poll_wait(file, sock, wait);
618 
619 	if (unlikely(!xsk_is_bound(xs)))
620 		return mask;
621 
622 	pool = xs->pool;
623 
624 	if (pool->cached_need_wakeup) {
625 		if (xs->zc)
626 			xsk_wakeup(xs, pool->cached_need_wakeup);
627 		else
628 			/* Poll needs to drive Tx also in copy mode */
629 			__xsk_sendmsg(sk);
630 	}
631 
632 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
633 		mask |= EPOLLIN | EPOLLRDNORM;
634 	if (xs->tx && xsk_tx_writeable(xs))
635 		mask |= EPOLLOUT | EPOLLWRNORM;
636 
637 	return mask;
638 }
639 
640 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
641 			  bool umem_queue)
642 {
643 	struct xsk_queue *q;
644 
645 	if (entries == 0 || *queue || !is_power_of_2(entries))
646 		return -EINVAL;
647 
648 	q = xskq_create(entries, umem_queue);
649 	if (!q)
650 		return -ENOMEM;
651 
652 	/* Make sure queue is ready before it can be seen by others */
653 	smp_wmb();
654 	WRITE_ONCE(*queue, q);
655 	return 0;
656 }
657 
658 static void xsk_unbind_dev(struct xdp_sock *xs)
659 {
660 	struct net_device *dev = xs->dev;
661 
662 	if (xs->state != XSK_BOUND)
663 		return;
664 	WRITE_ONCE(xs->state, XSK_UNBOUND);
665 
666 	/* Wait for driver to stop using the xdp socket. */
667 	xp_del_xsk(xs->pool, xs);
668 	xs->dev = NULL;
669 	synchronize_net();
670 	dev_put(dev);
671 }
672 
673 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
674 					      struct xdp_sock ***map_entry)
675 {
676 	struct xsk_map *map = NULL;
677 	struct xsk_map_node *node;
678 
679 	*map_entry = NULL;
680 
681 	spin_lock_bh(&xs->map_list_lock);
682 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
683 					node);
684 	if (node) {
685 		bpf_map_inc(&node->map->map);
686 		map = node->map;
687 		*map_entry = node->map_entry;
688 	}
689 	spin_unlock_bh(&xs->map_list_lock);
690 	return map;
691 }
692 
693 static void xsk_delete_from_maps(struct xdp_sock *xs)
694 {
695 	/* This function removes the current XDP socket from all the
696 	 * maps it resides in. We need to take extra care here, due to
697 	 * the two locks involved. Each map has a lock synchronizing
698 	 * updates to the entries, and each socket has a lock that
699 	 * synchronizes access to the list of maps (map_list). For
700 	 * deadlock avoidance the locks need to be taken in the order
701 	 * "map lock"->"socket map list lock". We start off by
702 	 * accessing the socket map list, and take a reference to the
703 	 * map to guarantee existence between the
704 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
705 	 * calls. Then we ask the map to remove the socket, which
706 	 * tries to remove the socket from the map. Note that there
707 	 * might be updates to the map between
708 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
709 	 */
710 	struct xdp_sock **map_entry = NULL;
711 	struct xsk_map *map;
712 
713 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
714 		xsk_map_try_sock_delete(map, xs, map_entry);
715 		bpf_map_put(&map->map);
716 	}
717 }
718 
719 static int xsk_release(struct socket *sock)
720 {
721 	struct sock *sk = sock->sk;
722 	struct xdp_sock *xs = xdp_sk(sk);
723 	struct net *net;
724 
725 	if (!sk)
726 		return 0;
727 
728 	net = sock_net(sk);
729 
730 	mutex_lock(&net->xdp.lock);
731 	sk_del_node_init_rcu(sk);
732 	mutex_unlock(&net->xdp.lock);
733 
734 	local_bh_disable();
735 	sock_prot_inuse_add(net, sk->sk_prot, -1);
736 	local_bh_enable();
737 
738 	xsk_delete_from_maps(xs);
739 	mutex_lock(&xs->mutex);
740 	xsk_unbind_dev(xs);
741 	mutex_unlock(&xs->mutex);
742 
743 	xskq_destroy(xs->rx);
744 	xskq_destroy(xs->tx);
745 	xskq_destroy(xs->fq_tmp);
746 	xskq_destroy(xs->cq_tmp);
747 
748 	sock_orphan(sk);
749 	sock->sk = NULL;
750 
751 	sk_refcnt_debug_release(sk);
752 	sock_put(sk);
753 
754 	return 0;
755 }
756 
757 static struct socket *xsk_lookup_xsk_from_fd(int fd)
758 {
759 	struct socket *sock;
760 	int err;
761 
762 	sock = sockfd_lookup(fd, &err);
763 	if (!sock)
764 		return ERR_PTR(-ENOTSOCK);
765 
766 	if (sock->sk->sk_family != PF_XDP) {
767 		sockfd_put(sock);
768 		return ERR_PTR(-ENOPROTOOPT);
769 	}
770 
771 	return sock;
772 }
773 
774 static bool xsk_validate_queues(struct xdp_sock *xs)
775 {
776 	return xs->fq_tmp && xs->cq_tmp;
777 }
778 
779 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
780 {
781 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
782 	struct sock *sk = sock->sk;
783 	struct xdp_sock *xs = xdp_sk(sk);
784 	struct net_device *dev;
785 	u32 flags, qid;
786 	int err = 0;
787 
788 	if (addr_len < sizeof(struct sockaddr_xdp))
789 		return -EINVAL;
790 	if (sxdp->sxdp_family != AF_XDP)
791 		return -EINVAL;
792 
793 	flags = sxdp->sxdp_flags;
794 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
795 		      XDP_USE_NEED_WAKEUP))
796 		return -EINVAL;
797 
798 	rtnl_lock();
799 	mutex_lock(&xs->mutex);
800 	if (xs->state != XSK_READY) {
801 		err = -EBUSY;
802 		goto out_release;
803 	}
804 
805 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
806 	if (!dev) {
807 		err = -ENODEV;
808 		goto out_release;
809 	}
810 
811 	if (!xs->rx && !xs->tx) {
812 		err = -EINVAL;
813 		goto out_unlock;
814 	}
815 
816 	qid = sxdp->sxdp_queue_id;
817 
818 	if (flags & XDP_SHARED_UMEM) {
819 		struct xdp_sock *umem_xs;
820 		struct socket *sock;
821 
822 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
823 		    (flags & XDP_USE_NEED_WAKEUP)) {
824 			/* Cannot specify flags for shared sockets. */
825 			err = -EINVAL;
826 			goto out_unlock;
827 		}
828 
829 		if (xs->umem) {
830 			/* We already have our own. */
831 			err = -EINVAL;
832 			goto out_unlock;
833 		}
834 
835 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
836 		if (IS_ERR(sock)) {
837 			err = PTR_ERR(sock);
838 			goto out_unlock;
839 		}
840 
841 		umem_xs = xdp_sk(sock->sk);
842 		if (!xsk_is_bound(umem_xs)) {
843 			err = -EBADF;
844 			sockfd_put(sock);
845 			goto out_unlock;
846 		}
847 
848 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
849 			/* Share the umem with another socket on another qid
850 			 * and/or device.
851 			 */
852 			xs->pool = xp_create_and_assign_umem(xs,
853 							     umem_xs->umem);
854 			if (!xs->pool) {
855 				err = -ENOMEM;
856 				sockfd_put(sock);
857 				goto out_unlock;
858 			}
859 
860 			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
861 						   dev, qid);
862 			if (err) {
863 				xp_destroy(xs->pool);
864 				xs->pool = NULL;
865 				sockfd_put(sock);
866 				goto out_unlock;
867 			}
868 		} else {
869 			/* Share the buffer pool with the other socket. */
870 			if (xs->fq_tmp || xs->cq_tmp) {
871 				/* Do not allow setting your own fq or cq. */
872 				err = -EINVAL;
873 				sockfd_put(sock);
874 				goto out_unlock;
875 			}
876 
877 			xp_get_pool(umem_xs->pool);
878 			xs->pool = umem_xs->pool;
879 		}
880 
881 		xdp_get_umem(umem_xs->umem);
882 		WRITE_ONCE(xs->umem, umem_xs->umem);
883 		sockfd_put(sock);
884 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
885 		err = -EINVAL;
886 		goto out_unlock;
887 	} else {
888 		/* This xsk has its own umem. */
889 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
890 		if (!xs->pool) {
891 			err = -ENOMEM;
892 			goto out_unlock;
893 		}
894 
895 		err = xp_assign_dev(xs->pool, dev, qid, flags);
896 		if (err) {
897 			xp_destroy(xs->pool);
898 			xs->pool = NULL;
899 			goto out_unlock;
900 		}
901 	}
902 
903 	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
904 	xs->fq_tmp = NULL;
905 	xs->cq_tmp = NULL;
906 
907 	xs->dev = dev;
908 	xs->zc = xs->umem->zc;
909 	xs->queue_id = qid;
910 	xp_add_xsk(xs->pool, xs);
911 
912 out_unlock:
913 	if (err) {
914 		dev_put(dev);
915 	} else {
916 		/* Matches smp_rmb() in bind() for shared umem
917 		 * sockets, and xsk_is_bound().
918 		 */
919 		smp_wmb();
920 		WRITE_ONCE(xs->state, XSK_BOUND);
921 	}
922 out_release:
923 	mutex_unlock(&xs->mutex);
924 	rtnl_unlock();
925 	return err;
926 }
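
/* Illustrative user-space sketch (not part of this file): xsk_bind() is
 * reached through bind(2) with a struct sockaddr_xdp, assuming libc
 * headers that define AF_XDP. A second socket sharing the same umem,
 * device and queue would instead set XDP_SHARED_UMEM and pass the first
 * socket's fd in sxdp_shared_umem_fd, with no mode flags, as enforced
 * above.
 */
#include <sys/socket.h>
#include <linux/if_xdp.h>

static int bind_xsk(int xsk_fd, unsigned int ifindex, unsigned int queue_id)
{
	struct sockaddr_xdp sxdp = {};

	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = ifindex;
	sxdp.sxdp_queue_id = queue_id;
	/* e.g. copy mode with the need_wakeup protocol enabled */
	sxdp.sxdp_flags = XDP_COPY | XDP_USE_NEED_WAKEUP;

	return bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}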
927 
928 struct xdp_umem_reg_v1 {
929 	__u64 addr; /* Start of packet data area */
930 	__u64 len; /* Length of packet data area */
931 	__u32 chunk_size;
932 	__u32 headroom;
933 };
934 
935 static int xsk_setsockopt(struct socket *sock, int level, int optname,
936 			  sockptr_t optval, unsigned int optlen)
937 {
938 	struct sock *sk = sock->sk;
939 	struct xdp_sock *xs = xdp_sk(sk);
940 	int err;
941 
942 	if (level != SOL_XDP)
943 		return -ENOPROTOOPT;
944 
945 	switch (optname) {
946 	case XDP_RX_RING:
947 	case XDP_TX_RING:
948 	{
949 		struct xsk_queue **q;
950 		int entries;
951 
952 		if (optlen < sizeof(entries))
953 			return -EINVAL;
954 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
955 			return -EFAULT;
956 
957 		mutex_lock(&xs->mutex);
958 		if (xs->state != XSK_READY) {
959 			mutex_unlock(&xs->mutex);
960 			return -EBUSY;
961 		}
962 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
963 		err = xsk_init_queue(entries, q, false);
964 		if (!err && optname == XDP_TX_RING)
965 			/* Tx needs to be explicitly woken up the first time */
966 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
967 		mutex_unlock(&xs->mutex);
968 		return err;
969 	}
970 	case XDP_UMEM_REG:
971 	{
972 		size_t mr_size = sizeof(struct xdp_umem_reg);
973 		struct xdp_umem_reg mr = {};
974 		struct xdp_umem *umem;
975 
976 		if (optlen < sizeof(struct xdp_umem_reg_v1))
977 			return -EINVAL;
978 		else if (optlen < sizeof(mr))
979 			mr_size = sizeof(struct xdp_umem_reg_v1);
980 
981 		if (copy_from_sockptr(&mr, optval, mr_size))
982 			return -EFAULT;
983 
984 		mutex_lock(&xs->mutex);
985 		if (xs->state != XSK_READY || xs->umem) {
986 			mutex_unlock(&xs->mutex);
987 			return -EBUSY;
988 		}
989 
990 		umem = xdp_umem_create(&mr);
991 		if (IS_ERR(umem)) {
992 			mutex_unlock(&xs->mutex);
993 			return PTR_ERR(umem);
994 		}
995 
996 		/* Make sure umem is ready before it can be seen by others */
997 		smp_wmb();
998 		WRITE_ONCE(xs->umem, umem);
999 		mutex_unlock(&xs->mutex);
1000 		return 0;
1001 	}
1002 	case XDP_UMEM_FILL_RING:
1003 	case XDP_UMEM_COMPLETION_RING:
1004 	{
1005 		struct xsk_queue **q;
1006 		int entries;
1007 
1008 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1009 			return -EFAULT;
1010 
1011 		mutex_lock(&xs->mutex);
1012 		if (xs->state != XSK_READY) {
1013 			mutex_unlock(&xs->mutex);
1014 			return -EBUSY;
1015 		}
1016 
1017 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1018 			&xs->cq_tmp;
1019 		err = xsk_init_queue(entries, q, true);
1020 		mutex_unlock(&xs->mutex);
1021 		return err;
1022 	}
1023 	default:
1024 		break;
1025 	}
1026 
1027 	return -ENOPROTOOPT;
1028 }
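
/* Illustrative user-space sketch (not part of this file): the options
 * handled above are normally exercised in this order before bind():
 * register the umem, then size the four rings. Assumes libc headers
 * that define SOL_XDP; error handling is collapsed for brevity and the
 * frame-count/frame-size constants are arbitrary.
 */
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#define NUM_FRAMES	4096
#define FRAME_SIZE	2048

static int setup_umem_and_rings(int xsk_fd, void **umem_area)
{
	struct xdp_umem_reg mr = {};
	int entries = 2048;		/* must be a power of two */

	*umem_area = mmap(NULL, (size_t)NUM_FRAMES * FRAME_SIZE,
			  PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (*umem_area == MAP_FAILED)
		return -1;

	mr.addr = (unsigned long)*umem_area;
	mr.len = (__u64)NUM_FRAMES * FRAME_SIZE;
	mr.chunk_size = FRAME_SIZE;
	mr.headroom = 0;

	if (setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) ||
	    setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_FILL_RING,
		       &entries, sizeof(entries)) ||
	    setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
		       &entries, sizeof(entries)) ||
	    setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING,
		       &entries, sizeof(entries)) ||
	    setsockopt(xsk_fd, SOL_XDP, XDP_TX_RING,
		       &entries, sizeof(entries)))
		return -1;

	return 0;
}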
1029 
1030 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1031 {
1032 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1033 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1034 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1035 }
1036 
1037 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1038 {
1039 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1040 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1041 	ring->desc = offsetof(struct xdp_umem_ring, desc);
1042 }
1043 
1044 struct xdp_statistics_v1 {
1045 	__u64 rx_dropped;
1046 	__u64 rx_invalid_descs;
1047 	__u64 tx_invalid_descs;
1048 };
1049 
1050 static int xsk_getsockopt(struct socket *sock, int level, int optname,
1051 			  char __user *optval, int __user *optlen)
1052 {
1053 	struct sock *sk = sock->sk;
1054 	struct xdp_sock *xs = xdp_sk(sk);
1055 	int len;
1056 
1057 	if (level != SOL_XDP)
1058 		return -ENOPROTOOPT;
1059 
1060 	if (get_user(len, optlen))
1061 		return -EFAULT;
1062 	if (len < 0)
1063 		return -EINVAL;
1064 
1065 	switch (optname) {
1066 	case XDP_STATISTICS:
1067 	{
1068 		struct xdp_statistics stats = {};
1069 		bool extra_stats = true;
1070 		size_t stats_size;
1071 
1072 		if (len < sizeof(struct xdp_statistics_v1)) {
1073 			return -EINVAL;
1074 		} else if (len < sizeof(stats)) {
1075 			extra_stats = false;
1076 			stats_size = sizeof(struct xdp_statistics_v1);
1077 		} else {
1078 			stats_size = sizeof(stats);
1079 		}
1080 
1081 		mutex_lock(&xs->mutex);
1082 		stats.rx_dropped = xs->rx_dropped;
1083 		if (extra_stats) {
1084 			stats.rx_ring_full = xs->rx_queue_full;
1085 			stats.rx_fill_ring_empty_descs =
1086 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1087 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1088 		} else {
1089 			stats.rx_dropped += xs->rx_queue_full;
1090 		}
1091 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1092 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1093 		mutex_unlock(&xs->mutex);
1094 
1095 		if (copy_to_user(optval, &stats, stats_size))
1096 			return -EFAULT;
1097 		if (put_user(stats_size, optlen))
1098 			return -EFAULT;
1099 
1100 		return 0;
1101 	}
1102 	case XDP_MMAP_OFFSETS:
1103 	{
1104 		struct xdp_mmap_offsets off;
1105 		struct xdp_mmap_offsets_v1 off_v1;
1106 		bool flags_supported = true;
1107 		void *to_copy;
1108 
1109 		if (len < sizeof(off_v1))
1110 			return -EINVAL;
1111 		else if (len < sizeof(off))
1112 			flags_supported = false;
1113 
1114 		if (flags_supported) {
1115 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1116 			 * except for the flags field added to the end.
1117 			 */
1118 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1119 					       &off.rx);
1120 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1121 					       &off.tx);
1122 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1123 					       &off.fr);
1124 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1125 					       &off.cr);
1126 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1127 						ptrs.flags);
1128 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1129 						ptrs.flags);
1130 			off.fr.flags = offsetof(struct xdp_umem_ring,
1131 						ptrs.flags);
1132 			off.cr.flags = offsetof(struct xdp_umem_ring,
1133 						ptrs.flags);
1134 
1135 			len = sizeof(off);
1136 			to_copy = &off;
1137 		} else {
1138 			xsk_enter_rxtx_offsets(&off_v1.rx);
1139 			xsk_enter_rxtx_offsets(&off_v1.tx);
1140 			xsk_enter_umem_offsets(&off_v1.fr);
1141 			xsk_enter_umem_offsets(&off_v1.cr);
1142 
1143 			len = sizeof(off_v1);
1144 			to_copy = &off_v1;
1145 		}
1146 
1147 		if (copy_to_user(optval, to_copy, len))
1148 			return -EFAULT;
1149 		if (put_user(len, optlen))
1150 			return -EFAULT;
1151 
1152 		return 0;
1153 	}
1154 	case XDP_OPTIONS:
1155 	{
1156 		struct xdp_options opts = {};
1157 
1158 		if (len < sizeof(opts))
1159 			return -EINVAL;
1160 
1161 		mutex_lock(&xs->mutex);
1162 		if (xs->zc)
1163 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1164 		mutex_unlock(&xs->mutex);
1165 
1166 		len = sizeof(opts);
1167 		if (copy_to_user(optval, &opts, len))
1168 			return -EFAULT;
1169 		if (put_user(len, optlen))
1170 			return -EFAULT;
1171 
1172 		return 0;
1173 	}
1174 	default:
1175 		break;
1176 	}
1177 
1178 	return -EOPNOTSUPP;
1179 }
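
/* Illustrative user-space sketch (not part of this file): XDP_STATISTICS
 * is read with getsockopt(); a binary passing only the v1 size simply
 * gets the shorter struct back, as handled above.
 */
#include <sys/socket.h>
#include <linux/if_xdp.h>

static int read_xsk_stats(int xsk_fd, struct xdp_statistics *stats)
{
	socklen_t optlen = sizeof(*stats);

	return getsockopt(xsk_fd, SOL_XDP, XDP_STATISTICS, stats, &optlen);
}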
1180 
1181 static int xsk_mmap(struct file *file, struct socket *sock,
1182 		    struct vm_area_struct *vma)
1183 {
1184 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1185 	unsigned long size = vma->vm_end - vma->vm_start;
1186 	struct xdp_sock *xs = xdp_sk(sock->sk);
1187 	struct xsk_queue *q = NULL;
1188 	unsigned long pfn;
1189 	struct page *qpg;
1190 
1191 	if (READ_ONCE(xs->state) != XSK_READY)
1192 		return -EBUSY;
1193 
1194 	if (offset == XDP_PGOFF_RX_RING) {
1195 		q = READ_ONCE(xs->rx);
1196 	} else if (offset == XDP_PGOFF_TX_RING) {
1197 		q = READ_ONCE(xs->tx);
1198 	} else {
1199 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1200 		smp_rmb();
1201 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1202 			q = READ_ONCE(xs->fq_tmp);
1203 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1204 			q = READ_ONCE(xs->cq_tmp);
1205 	}
1206 
1207 	if (!q)
1208 		return -EINVAL;
1209 
1210 	/* Matches the smp_wmb() in xsk_init_queue */
1211 	smp_rmb();
1212 	qpg = virt_to_head_page(q->ring);
1213 	if (size > page_size(qpg))
1214 		return -EINVAL;
1215 
1216 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1217 	return remap_pfn_range(vma, vma->vm_start, pfn,
1218 			       size, vma->vm_page_prot);
1219 }
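
/* Illustrative user-space sketch (not part of this file): each ring is
 * mapped with the fixed page offsets checked above, and the producer,
 * consumer, flags and descriptor pointers are then located through
 * XDP_MMAP_OFFSETS. Shown for the Rx ring only; the Tx, fill and
 * completion rings follow the same pattern with their own offsets.
 */
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static void *map_rx_ring(int xsk_fd, unsigned int entries,
			 struct xdp_mmap_offsets *off)
{
	socklen_t optlen = sizeof(*off);

	if (getsockopt(xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen))
		return MAP_FAILED;

	/* Producer/consumer/flags words live at off->rx.* inside the map. */
	return mmap(NULL, off->rx.desc + entries * sizeof(struct xdp_desc),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		    xsk_fd, XDP_PGOFF_RX_RING);
}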
1220 
1221 static int xsk_notifier(struct notifier_block *this,
1222 			unsigned long msg, void *ptr)
1223 {
1224 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1225 	struct net *net = dev_net(dev);
1226 	struct sock *sk;
1227 
1228 	switch (msg) {
1229 	case NETDEV_UNREGISTER:
1230 		mutex_lock(&net->xdp.lock);
1231 		sk_for_each(sk, &net->xdp.list) {
1232 			struct xdp_sock *xs = xdp_sk(sk);
1233 
1234 			mutex_lock(&xs->mutex);
1235 			if (xs->dev == dev) {
1236 				sk->sk_err = ENETDOWN;
1237 				if (!sock_flag(sk, SOCK_DEAD))
1238 					sk->sk_error_report(sk);
1239 
1240 				xsk_unbind_dev(xs);
1241 
1242 				/* Clear device references. */
1243 				xp_clear_dev(xs->pool);
1244 			}
1245 			mutex_unlock(&xs->mutex);
1246 		}
1247 		mutex_unlock(&net->xdp.lock);
1248 		break;
1249 	}
1250 	return NOTIFY_DONE;
1251 }
1252 
1253 static struct proto xsk_proto = {
1254 	.name =		"XDP",
1255 	.owner =	THIS_MODULE,
1256 	.obj_size =	sizeof(struct xdp_sock),
1257 };
1258 
1259 static const struct proto_ops xsk_proto_ops = {
1260 	.family		= PF_XDP,
1261 	.owner		= THIS_MODULE,
1262 	.release	= xsk_release,
1263 	.bind		= xsk_bind,
1264 	.connect	= sock_no_connect,
1265 	.socketpair	= sock_no_socketpair,
1266 	.accept		= sock_no_accept,
1267 	.getname	= sock_no_getname,
1268 	.poll		= xsk_poll,
1269 	.ioctl		= sock_no_ioctl,
1270 	.listen		= sock_no_listen,
1271 	.shutdown	= sock_no_shutdown,
1272 	.setsockopt	= xsk_setsockopt,
1273 	.getsockopt	= xsk_getsockopt,
1274 	.sendmsg	= xsk_sendmsg,
1275 	.recvmsg	= xsk_recvmsg,
1276 	.mmap		= xsk_mmap,
1277 	.sendpage	= sock_no_sendpage,
1278 };
1279 
1280 static void xsk_destruct(struct sock *sk)
1281 {
1282 	struct xdp_sock *xs = xdp_sk(sk);
1283 
1284 	if (!sock_flag(sk, SOCK_DEAD))
1285 		return;
1286 
1287 	if (!xp_put_pool(xs->pool))
1288 		xdp_put_umem(xs->umem, !xs->pool);
1289 
1290 	sk_refcnt_debug_dec(sk);
1291 }
1292 
1293 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1294 		      int kern)
1295 {
1296 	struct xdp_sock *xs;
1297 	struct sock *sk;
1298 
1299 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1300 		return -EPERM;
1301 	if (sock->type != SOCK_RAW)
1302 		return -ESOCKTNOSUPPORT;
1303 
1304 	if (protocol)
1305 		return -EPROTONOSUPPORT;
1306 
1307 	sock->state = SS_UNCONNECTED;
1308 
1309 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1310 	if (!sk)
1311 		return -ENOBUFS;
1312 
1313 	sock->ops = &xsk_proto_ops;
1314 
1315 	sock_init_data(sock, sk);
1316 
1317 	sk->sk_family = PF_XDP;
1318 
1319 	sk->sk_destruct = xsk_destruct;
1320 	sk_refcnt_debug_inc(sk);
1321 
1322 	sock_set_flag(sk, SOCK_RCU_FREE);
1323 
1324 	xs = xdp_sk(sk);
1325 	xs->state = XSK_READY;
1326 	mutex_init(&xs->mutex);
1327 	spin_lock_init(&xs->rx_lock);
1328 
1329 	INIT_LIST_HEAD(&xs->map_list);
1330 	spin_lock_init(&xs->map_list_lock);
1331 
1332 	mutex_lock(&net->xdp.lock);
1333 	sk_add_node_rcu(sk, &net->xdp.list);
1334 	mutex_unlock(&net->xdp.lock);
1335 
1336 	local_bh_disable();
1337 	sock_prot_inuse_add(net, &xsk_proto, 1);
1338 	local_bh_enable();
1339 
1340 	return 0;
1341 }
1342 
1343 static const struct net_proto_family xsk_family_ops = {
1344 	.family = PF_XDP,
1345 	.create = xsk_create,
1346 	.owner	= THIS_MODULE,
1347 };
1348 
1349 static struct notifier_block xsk_netdev_notifier = {
1350 	.notifier_call	= xsk_notifier,
1351 };
1352 
1353 static int __net_init xsk_net_init(struct net *net)
1354 {
1355 	mutex_init(&net->xdp.lock);
1356 	INIT_HLIST_HEAD(&net->xdp.list);
1357 	return 0;
1358 }
1359 
1360 static void __net_exit xsk_net_exit(struct net *net)
1361 {
1362 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1363 }
1364 
1365 static struct pernet_operations xsk_net_ops = {
1366 	.init = xsk_net_init,
1367 	.exit = xsk_net_exit,
1368 };
1369 
1370 static int __init xsk_init(void)
1371 {
1372 	int err, cpu;
1373 
1374 	err = proto_register(&xsk_proto, 0 /* no slab */);
1375 	if (err)
1376 		goto out;
1377 
1378 	err = sock_register(&xsk_family_ops);
1379 	if (err)
1380 		goto out_proto;
1381 
1382 	err = register_pernet_subsys(&xsk_net_ops);
1383 	if (err)
1384 		goto out_sk;
1385 
1386 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1387 	if (err)
1388 		goto out_pernet;
1389 
1390 	for_each_possible_cpu(cpu)
1391 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1392 	return 0;
1393 
1394 out_pernet:
1395 	unregister_pernet_subsys(&xsk_net_ops);
1396 out_sk:
1397 	sock_unregister(PF_XDP);
1398 out_proto:
1399 	proto_unregister(&xsk_proto);
1400 out:
1401 	return err;
1402 }
1403 
1404 fs_initcall(xsk_init);
1405