xref: /openbmc/linux/net/xdp/xsk.c (revision 0760aad038b5a032c31ea124feed63d88627d2f1)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets allow a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock_drv.h>
26 #include <net/xdp.h>
27 
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 #include "xsk.h"
31 
32 #define TX_BATCH_SIZE 16
33 
34 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
35 
36 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
37 {
38 	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
39 		(xs->pool->fq || READ_ONCE(xs->fq_tmp));
40 }
41 
42 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
43 {
44 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
45 		return;
46 
47 	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
48 	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
49 }
50 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
51 
52 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
53 {
54 	struct xdp_sock *xs;
55 
56 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
57 		return;
58 
59 	rcu_read_lock();
60 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
61 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
62 	}
63 	rcu_read_unlock();
64 
65 	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
66 }
67 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
68 
69 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
70 {
71 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
72 		return;
73 
74 	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
75 	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
76 }
77 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
78 
79 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
80 {
81 	struct xdp_sock *xs;
82 
83 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
84 		return;
85 
86 	rcu_read_lock();
87 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
88 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
89 	}
90 	rcu_read_unlock();
91 
92 	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
93 }
94 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
95 
96 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
97 {
98 	return pool->uses_need_wakeup;
99 }
100 EXPORT_SYMBOL(xsk_uses_need_wakeup);
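
/* A hedged driver-side sketch (not part of this file) of how the wakeup
 * helpers above are typically used around fill-queue processing;
 * mydrv_refill_rx() is a hypothetical driver helper that returns true when
 * it managed to post buffers from the fill queue to hardware.
 */
static void mydrv_xsk_refill(struct xsk_buff_pool *pool)
{
	if (!xsk_uses_need_wakeup(pool)) {
		mydrv_refill_rx(pool);
		return;
	}

	if (!mydrv_refill_rx(pool)) {
		/* Fill queue ran dry: ask userspace to kick us via poll()/recvmsg(). */
		xsk_set_rx_need_wakeup(pool);
		return;
	}

	xsk_clear_rx_need_wakeup(pool);
}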
101 
102 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
103 					    u16 queue_id)
104 {
105 	if (queue_id < dev->real_num_rx_queues)
106 		return dev->_rx[queue_id].pool;
107 	if (queue_id < dev->real_num_tx_queues)
108 		return dev->_tx[queue_id].pool;
109 
110 	return NULL;
111 }
112 EXPORT_SYMBOL(xsk_get_pool_from_qid);
113 
114 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
115 {
116 	if (queue_id < dev->real_num_rx_queues)
117 		dev->_rx[queue_id].pool = NULL;
118 	if (queue_id < dev->real_num_tx_queues)
119 		dev->_tx[queue_id].pool = NULL;
120 }
121 
122 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
123  * not know if the device has more tx queues than rx, or the opposite.
124  * This might also change during run time.
125  */
126 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
127 			u16 queue_id)
128 {
129 	if (queue_id >= max_t(unsigned int,
130 			      dev->real_num_rx_queues,
131 			      dev->real_num_tx_queues))
132 		return -EINVAL;
133 
134 	if (queue_id < dev->real_num_rx_queues)
135 		dev->_rx[queue_id].pool = pool;
136 	if (queue_id < dev->real_num_tx_queues)
137 		dev->_tx[queue_id].pool = pool;
138 
139 	return 0;
140 }
141 
142 void xp_release(struct xdp_buff_xsk *xskb)
143 {
144 	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
145 }
146 
147 static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
148 {
149 	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
150 
151 	offset += xskb->pool->headroom;
152 	if (!xskb->pool->unaligned)
153 		return xskb->orig_addr + offset;
154 	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
155 }
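
/* A hedged userspace-side sketch (not part of this file): in unaligned chunk
 * mode the handle built above carries the offset in its upper 16 bits. The
 * masks come from the uapi header <linux/if_xdp.h>; umem_area is assumed to
 * be the start of the registered UMEM.
 */
#include <stdint.h>
#include <linux/if_xdp.h>

static inline void *xsk_addr_to_ptr(void *umem_area, uint64_t addr)
{
	uint64_t base = addr & XSK_UNALIGNED_BUF_ADDR_MASK;
	uint64_t offset = addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;

	return (char *)umem_area + base + offset;
}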
156 
157 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
158 {
159 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
160 	u64 addr;
161 	int err;
162 
163 	addr = xp_get_handle(xskb);
164 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
165 	if (err) {
166 		xs->rx_queue_full++;
167 		return err;
168 	}
169 
170 	xp_release(xskb);
171 	return 0;
172 }
173 
174 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
175 {
176 	void *from_buf, *to_buf;
177 	u32 metalen;
178 
179 	if (unlikely(xdp_data_meta_unsupported(from))) {
180 		from_buf = from->data;
181 		to_buf = to->data;
182 		metalen = 0;
183 	} else {
184 		from_buf = from->data_meta;
185 		metalen = from->data - from->data_meta;
186 		to_buf = to->data - metalen;
187 	}
188 
189 	memcpy(to_buf, from_buf, len + metalen);
190 }
191 
192 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
193 		     bool explicit_free)
194 {
195 	struct xdp_buff *xsk_xdp;
196 	int err;
197 
198 	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
199 		xs->rx_dropped++;
200 		return -ENOSPC;
201 	}
202 
203 	xsk_xdp = xsk_buff_alloc(xs->pool);
204 	if (!xsk_xdp) {
205 		xs->rx_dropped++;
206 		return -ENOSPC;
207 	}
208 
209 	xsk_copy_xdp(xsk_xdp, xdp, len);
210 	err = __xsk_rcv_zc(xs, xsk_xdp, len);
211 	if (err) {
212 		xsk_buff_free(xsk_xdp);
213 		return err;
214 	}
215 	if (explicit_free)
216 		xdp_return_buff(xdp);
217 	return 0;
218 }
219 
220 static bool xsk_is_bound(struct xdp_sock *xs)
221 {
222 	if (READ_ONCE(xs->state) == XSK_BOUND) {
223 		/* Matches smp_wmb() in bind(). */
224 		smp_rmb();
225 		return true;
226 	}
227 	return false;
228 }
229 
230 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
231 		   bool explicit_free)
232 {
233 	u32 len;
234 
235 	if (!xsk_is_bound(xs))
236 		return -EINVAL;
237 
238 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
239 		return -EINVAL;
240 
241 	len = xdp->data_end - xdp->data;
242 
243 	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
244 		__xsk_rcv_zc(xs, xdp, len) :
245 		__xsk_rcv(xs, xdp, len, explicit_free);
246 }
247 
248 static void xsk_flush(struct xdp_sock *xs)
249 {
250 	xskq_prod_submit(xs->rx);
251 	__xskq_cons_release(xs->pool->fq);
252 	sock_def_readable(&xs->sk);
253 }
254 
255 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
256 {
257 	int err;
258 
259 	spin_lock_bh(&xs->rx_lock);
260 	err = xsk_rcv(xs, xdp, false);
261 	xsk_flush(xs);
262 	spin_unlock_bh(&xs->rx_lock);
263 	return err;
264 }
265 
266 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
267 {
268 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
269 	int err;
270 
271 	err = xsk_rcv(xs, xdp, true);
272 	if (err)
273 		return err;
274 
275 	if (!xs->flush_node.prev)
276 		list_add(&xs->flush_node, flush_list);
277 
278 	return 0;
279 }
280 
281 void __xsk_map_flush(void)
282 {
283 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
284 	struct xdp_sock *xs, *tmp;
285 
286 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
287 		xsk_flush(xs);
288 		__list_del_clearprev(&xs->flush_node);
289 	}
290 }
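
/* A hedged BPF-side sketch (not part of this file): __xsk_map_redirect() runs
 * when an XDP program redirects a frame into a BPF_MAP_TYPE_XSKMAP entry,
 * here keyed by Rx queue index. Assumes libbpf's <bpf/bpf_helpers.h>; the map
 * size and section names are illustrative.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 64);
} xsks_map SEC(".maps");

SEC("xdp")
int redirect_to_xsk(struct xdp_md *ctx)
{
	/* Fall back to XDP_PASS if no socket is bound to this queue. */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
}

char _license[] SEC("license") = "GPL";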
291 
292 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
293 {
294 	xskq_prod_submit_n(pool->cq, nb_entries);
295 }
296 EXPORT_SYMBOL(xsk_tx_completed);
297 
298 void xsk_tx_release(struct xsk_buff_pool *pool)
299 {
300 	struct xdp_sock *xs;
301 
302 	rcu_read_lock();
303 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
304 		__xskq_cons_release(xs->tx);
305 		xs->sk.sk_write_space(&xs->sk);
306 	}
307 	rcu_read_unlock();
308 }
309 EXPORT_SYMBOL(xsk_tx_release);
310 
311 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
312 {
313 	struct xdp_sock *xs;
314 
315 	rcu_read_lock();
316 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
317 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
318 			xs->tx->queue_empty_descs++;
319 			continue;
320 		}
321 
322 		/* This is the backpressure mechanism for the Tx path.
323 		 * Reserve space in the completion queue and only proceed
324 		 * if there is space in it. This avoids having to implement
325 		 * any buffering in the Tx path.
326 		 */
327 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
328 			goto out;
329 
330 		xskq_cons_release(xs->tx);
331 		rcu_read_unlock();
332 		return true;
333 	}
334 
335 out:
336 	rcu_read_unlock();
337 	return false;
338 }
339 EXPORT_SYMBOL(xsk_tx_peek_desc);
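
/* A hedged driver-side sketch (not part of this file) of a zero-copy Tx loop
 * built on xsk_tx_peek_desc()/xsk_tx_release(); mydrv_hw_xmit() and the budget
 * are hypothetical. Once the hardware reports the frames done, the driver
 * would call xsk_tx_completed(pool, n) so the completion entries reserved
 * above become visible to userspace.
 */
static u32 mydrv_xsk_xmit(struct xsk_buff_pool *pool, u32 budget)
{
	struct xdp_desc desc;
	u32 sent = 0;

	while (sent < budget && xsk_tx_peek_desc(pool, &desc)) {
		dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc.addr);

		mydrv_hw_xmit(dma, desc.len);	/* hypothetical hardware hand-off */
		sent++;
	}

	if (sent)
		xsk_tx_release(pool);	/* lets sockets make Tx progress again */

	return sent;
}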
340 
341 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
342 {
343 	struct net_device *dev = xs->dev;
344 	int err;
345 
346 	rcu_read_lock();
347 	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
348 	rcu_read_unlock();
349 
350 	return err;
351 }
352 
353 static int xsk_zc_xmit(struct xdp_sock *xs)
354 {
355 	return xsk_wakeup(xs, XDP_WAKEUP_TX);
356 }
357 
358 static void xsk_destruct_skb(struct sk_buff *skb)
359 {
360 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
361 	struct xdp_sock *xs = xdp_sk(skb->sk);
362 	unsigned long flags;
363 
364 	spin_lock_irqsave(&xs->tx_completion_lock, flags);
365 	xskq_prod_submit_addr(xs->pool->cq, addr);
366 	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
367 
368 	sock_wfree(skb);
369 }
370 
371 static int xsk_generic_xmit(struct sock *sk)
372 {
373 	struct xdp_sock *xs = xdp_sk(sk);
374 	u32 max_batch = TX_BATCH_SIZE;
375 	bool sent_frame = false;
376 	struct xdp_desc desc;
377 	struct sk_buff *skb;
378 	int err = 0;
379 
380 	mutex_lock(&xs->mutex);
381 
382 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
383 		goto out;
384 
385 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
386 		char *buffer;
387 		u64 addr;
388 		u32 len;
389 
390 		if (max_batch-- == 0) {
391 			err = -EAGAIN;
392 			goto out;
393 		}
394 
395 		len = desc.len;
396 		skb = sock_alloc_send_skb(sk, len, 1, &err);
397 		if (unlikely(!skb))
398 			goto out;
399 
400 		skb_put(skb, len);
401 		addr = desc.addr;
402 		buffer = xsk_buff_raw_get_data(xs->pool, addr);
403 		err = skb_store_bits(skb, 0, buffer, len);
404 		/* This is the backpressure mechanism for the Tx path.
405 		 * Reserve space in the completion queue and only proceed
406 		 * if there is space in it. This avoids having to implement
407 		 * any buffering in the Tx path.
408 		 */
409 		if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
410 			kfree_skb(skb);
411 			goto out;
412 		}
413 
414 		skb->dev = xs->dev;
415 		skb->priority = sk->sk_priority;
416 		skb->mark = sk->sk_mark;
417 		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
418 		skb->destructor = xsk_destruct_skb;
419 
420 		err = dev_direct_xmit(skb, xs->queue_id);
421 		xskq_cons_release(xs->tx);
422 		/* Ignore NET_XMIT_CN as the packet might have been sent */
423 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
424 			/* SKB completed but not sent */
425 			err = -EBUSY;
426 			goto out;
427 		}
428 
429 		sent_frame = true;
430 	}
431 
432 	xs->tx->queue_empty_descs++;
433 
434 out:
435 	if (sent_frame)
436 		sk->sk_write_space(sk);
437 
438 	mutex_unlock(&xs->mutex);
439 	return err;
440 }
441 
442 static int __xsk_sendmsg(struct sock *sk)
443 {
444 	struct xdp_sock *xs = xdp_sk(sk);
445 
446 	if (unlikely(!(xs->dev->flags & IFF_UP)))
447 		return -ENETDOWN;
448 	if (unlikely(!xs->tx))
449 		return -ENOBUFS;
450 
451 	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
452 }
453 
454 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
455 {
456 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
457 	struct sock *sk = sock->sk;
458 	struct xdp_sock *xs = xdp_sk(sk);
459 
460 	if (unlikely(!xsk_is_bound(xs)))
461 		return -ENXIO;
462 	if (unlikely(need_wait))
463 		return -EOPNOTSUPP;
464 
465 	return __xsk_sendmsg(sk);
466 }
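
/* A hedged userspace sketch (not part of this file): because need_wait is
 * rejected above, a Tx "kick" is a zero-length, non-blocking sendto() on the
 * AF_XDP socket; EAGAIN/EBUSY simply mean "retry later".
 */
#include <errno.h>
#include <sys/socket.h>

static int xsk_kick_tx(int xsk_fd)
{
	if (sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) >= 0)
		return 0;
	if (errno == EAGAIN || errno == EBUSY || errno == ENETDOWN)
		return 0;	/* transient; try again on the next cycle */
	return -errno;
}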
467 
468 static __poll_t xsk_poll(struct file *file, struct socket *sock,
469 			     struct poll_table_struct *wait)
470 {
471 	__poll_t mask = datagram_poll(file, sock, wait);
472 	struct sock *sk = sock->sk;
473 	struct xdp_sock *xs = xdp_sk(sk);
474 	struct xsk_buff_pool *pool;
475 
476 	if (unlikely(!xsk_is_bound(xs)))
477 		return mask;
478 
479 	pool = xs->pool;
480 
481 	if (pool->cached_need_wakeup) {
482 		if (xs->zc)
483 			xsk_wakeup(xs, pool->cached_need_wakeup);
484 		else
485 			/* Poll needs to drive Tx also in copy mode */
486 			/* Poll also needs to drive the Tx path in copy mode */
487 	}
488 
489 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
490 		mask |= EPOLLIN | EPOLLRDNORM;
491 	if (xs->tx && !xskq_cons_is_full(xs->tx))
492 		mask |= EPOLLOUT | EPOLLWRNORM;
493 
494 	return mask;
495 }
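
/* A hedged userspace sketch (not part of this file): with XDP_USE_NEED_WAKEUP
 * bound, userspace only needs to enter the kernel when it sees
 * XDP_RING_NEED_WAKEUP in a ring's mmap'ed flags word, which is exactly the
 * state the wakeup bookkeeping above maintains.
 */
#include <poll.h>
#include <linux/if_xdp.h>

static void xsk_wait_for_rx(int xsk_fd, const volatile __u32 *fq_flags)
{
	struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };

	if (*fq_flags & XDP_RING_NEED_WAKEUP)
		poll(&pfd, 1, 1000);	/* also drives copy-mode Tx, see above */
}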
496 
497 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
498 			  bool umem_queue)
499 {
500 	struct xsk_queue *q;
501 
502 	if (entries == 0 || *queue || !is_power_of_2(entries))
503 		return -EINVAL;
504 
505 	q = xskq_create(entries, umem_queue);
506 	if (!q)
507 		return -ENOMEM;
508 
509 	/* Make sure queue is ready before it can be seen by others */
510 	smp_wmb();
511 	WRITE_ONCE(*queue, q);
512 	return 0;
513 }
514 
515 static void xsk_unbind_dev(struct xdp_sock *xs)
516 {
517 	struct net_device *dev = xs->dev;
518 
519 	if (xs->state != XSK_BOUND)
520 		return;
521 	WRITE_ONCE(xs->state, XSK_UNBOUND);
522 
523 	/* Wait for driver to stop using the xdp socket. */
524 	xp_del_xsk(xs->pool, xs);
525 	xs->dev = NULL;
526 	synchronize_net();
527 	dev_put(dev);
528 }
529 
530 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
531 					      struct xdp_sock ***map_entry)
532 {
533 	struct xsk_map *map = NULL;
534 	struct xsk_map_node *node;
535 
536 	*map_entry = NULL;
537 
538 	spin_lock_bh(&xs->map_list_lock);
539 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
540 					node);
541 	if (node) {
542 		WARN_ON(xsk_map_inc(node->map));
543 		map = node->map;
544 		*map_entry = node->map_entry;
545 	}
546 	spin_unlock_bh(&xs->map_list_lock);
547 	return map;
548 }
549 
550 static void xsk_delete_from_maps(struct xdp_sock *xs)
551 {
552 	/* This function removes the current XDP socket from all the
553 	 * maps it resides in. We need to take extra care here, due to
554 	 * the two locks involved. Each map has a lock synchronizing
555 	 * updates to the entries, and each socket has a lock that
556 	 * synchronizes access to the list of maps (map_list). For
557 	 * deadlock avoidance the locks need to be taken in the order
558 	 * "map lock"->"socket map list lock". We start off by
559 	 * accessing the socket map list, and take a reference to the
560 	 * map to guarantee existence between the
561 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
562 	 * calls. Then we ask the map to remove the socket, which
563 	 * tries to remove the socket from the map. Note that there
564 	 * might be updates to the map between
565 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
566 	 */
567 	struct xdp_sock **map_entry = NULL;
568 	struct xsk_map *map;
569 
570 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
571 		xsk_map_try_sock_delete(map, xs, map_entry);
572 		xsk_map_put(map);
573 	}
574 }
575 
576 static int xsk_release(struct socket *sock)
577 {
578 	struct sock *sk = sock->sk;
579 	struct xdp_sock *xs = xdp_sk(sk);
580 	struct net *net;
581 
582 	if (!sk)
583 		return 0;
584 
585 	net = sock_net(sk);
586 
587 	mutex_lock(&net->xdp.lock);
588 	sk_del_node_init_rcu(sk);
589 	mutex_unlock(&net->xdp.lock);
590 
591 	local_bh_disable();
592 	sock_prot_inuse_add(net, sk->sk_prot, -1);
593 	local_bh_enable();
594 
595 	xsk_delete_from_maps(xs);
596 	mutex_lock(&xs->mutex);
597 	xsk_unbind_dev(xs);
598 	mutex_unlock(&xs->mutex);
599 
600 	xskq_destroy(xs->rx);
601 	xskq_destroy(xs->tx);
602 	xskq_destroy(xs->fq_tmp);
603 	xskq_destroy(xs->cq_tmp);
604 
605 	sock_orphan(sk);
606 	sock->sk = NULL;
607 
608 	sk_refcnt_debug_release(sk);
609 	sock_put(sk);
610 
611 	return 0;
612 }
613 
614 static struct socket *xsk_lookup_xsk_from_fd(int fd)
615 {
616 	struct socket *sock;
617 	int err;
618 
619 	sock = sockfd_lookup(fd, &err);
620 	if (!sock)
621 		return ERR_PTR(-ENOTSOCK);
622 
623 	if (sock->sk->sk_family != PF_XDP) {
624 		sockfd_put(sock);
625 		return ERR_PTR(-ENOPROTOOPT);
626 	}
627 
628 	return sock;
629 }
630 
631 static bool xsk_validate_queues(struct xdp_sock *xs)
632 {
633 	return xs->fq_tmp && xs->cq_tmp;
634 }
635 
636 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
637 {
638 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
639 	struct sock *sk = sock->sk;
640 	struct xdp_sock *xs = xdp_sk(sk);
641 	struct net_device *dev;
642 	u32 flags, qid;
643 	int err = 0;
644 
645 	if (addr_len < sizeof(struct sockaddr_xdp))
646 		return -EINVAL;
647 	if (sxdp->sxdp_family != AF_XDP)
648 		return -EINVAL;
649 
650 	flags = sxdp->sxdp_flags;
651 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
652 		      XDP_USE_NEED_WAKEUP))
653 		return -EINVAL;
654 
655 	rtnl_lock();
656 	mutex_lock(&xs->mutex);
657 	if (xs->state != XSK_READY) {
658 		err = -EBUSY;
659 		goto out_release;
660 	}
661 
662 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
663 	if (!dev) {
664 		err = -ENODEV;
665 		goto out_release;
666 	}
667 
668 	if (!xs->rx && !xs->tx) {
669 		err = -EINVAL;
670 		goto out_unlock;
671 	}
672 
673 	qid = sxdp->sxdp_queue_id;
674 
675 	if (flags & XDP_SHARED_UMEM) {
676 		struct xdp_sock *umem_xs;
677 		struct socket *sock;
678 
679 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
680 		    (flags & XDP_USE_NEED_WAKEUP)) {
681 			/* Cannot specify flags for shared sockets. */
682 			err = -EINVAL;
683 			goto out_unlock;
684 		}
685 
686 		if (xs->umem) {
687 			/* We already have our own umem. */
688 			err = -EINVAL;
689 			goto out_unlock;
690 		}
691 
692 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
693 		if (IS_ERR(sock)) {
694 			err = PTR_ERR(sock);
695 			goto out_unlock;
696 		}
697 
698 		umem_xs = xdp_sk(sock->sk);
699 		if (!xsk_is_bound(umem_xs)) {
700 			err = -EBADF;
701 			sockfd_put(sock);
702 			goto out_unlock;
703 		}
704 
705 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
706 			/* Share the umem with another socket on another qid
707 			 * and/or device.
708 			 */
709 			xs->pool = xp_create_and_assign_umem(xs,
710 							     umem_xs->umem);
711 			if (!xs->pool) {
712 				sockfd_put(sock);
713 				goto out_unlock;
714 			}
715 
716 			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
717 						   dev, qid);
718 			if (err) {
719 				xp_destroy(xs->pool);
720 				sockfd_put(sock);
721 				goto out_unlock;
722 			}
723 		} else {
724 			/* Share the buffer pool with the other socket. */
725 			if (xs->fq_tmp || xs->cq_tmp) {
726 				/* Do not allow setting your own fq or cq. */
727 				err = -EINVAL;
728 				sockfd_put(sock);
729 				goto out_unlock;
730 			}
731 
732 			xp_get_pool(umem_xs->pool);
733 			xs->pool = umem_xs->pool;
734 		}
735 
736 		xdp_get_umem(umem_xs->umem);
737 		WRITE_ONCE(xs->umem, umem_xs->umem);
738 		sockfd_put(sock);
739 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
740 		err = -EINVAL;
741 		goto out_unlock;
742 	} else {
743 		/* This xsk has its own umem. */
744 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
745 		if (!xs->pool) {
746 			err = -ENOMEM;
747 			goto out_unlock;
748 		}
749 
750 		err = xp_assign_dev(xs->pool, dev, qid, flags);
751 		if (err) {
752 			xp_destroy(xs->pool);
753 			xs->pool = NULL;
754 			goto out_unlock;
755 		}
756 	}
757 
758 	xs->dev = dev;
759 	xs->zc = xs->umem->zc;
760 	xs->queue_id = qid;
761 	xp_add_xsk(xs->pool, xs);
762 
763 out_unlock:
764 	if (err) {
765 		dev_put(dev);
766 	} else {
767 		/* Matches smp_rmb() in bind() for shared umem
768 		 * sockets, and xsk_is_bound().
769 		 */
770 		smp_wmb();
771 		WRITE_ONCE(xs->state, XSK_BOUND);
772 	}
773 out_release:
774 	mutex_unlock(&xs->mutex);
775 	rtnl_unlock();
776 	return err;
777 }
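
/* A hedged userspace sketch (not part of this file) of the bind() request
 * handled above; ifindex and queue are placeholders. A socket sharing another
 * socket's umem would instead set XDP_SHARED_UMEM and sxdp_shared_umem_fd.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static int xsk_bind_to_queue(int xsk_fd, unsigned int ifindex, unsigned int queue)
{
	struct sockaddr_xdp sxdp;

	memset(&sxdp, 0, sizeof(sxdp));
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = ifindex;
	sxdp.sxdp_queue_id = queue;
	sxdp.sxdp_flags = XDP_USE_NEED_WAKEUP;	/* optionally XDP_COPY or XDP_ZEROCOPY */

	return bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}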
778 
779 struct xdp_umem_reg_v1 {
780 	__u64 addr; /* Start of packet data area */
781 	__u64 len; /* Length of packet data area */
782 	__u32 chunk_size;
783 	__u32 headroom;
784 };
785 
786 static int xsk_setsockopt(struct socket *sock, int level, int optname,
787 			  sockptr_t optval, unsigned int optlen)
788 {
789 	struct sock *sk = sock->sk;
790 	struct xdp_sock *xs = xdp_sk(sk);
791 	int err;
792 
793 	if (level != SOL_XDP)
794 		return -ENOPROTOOPT;
795 
796 	switch (optname) {
797 	case XDP_RX_RING:
798 	case XDP_TX_RING:
799 	{
800 		struct xsk_queue **q;
801 		int entries;
802 
803 		if (optlen < sizeof(entries))
804 			return -EINVAL;
805 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
806 			return -EFAULT;
807 
808 		mutex_lock(&xs->mutex);
809 		if (xs->state != XSK_READY) {
810 			mutex_unlock(&xs->mutex);
811 			return -EBUSY;
812 		}
813 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
814 		err = xsk_init_queue(entries, q, false);
815 		if (!err && optname == XDP_TX_RING)
816 			/* Tx needs to be explicitly woken up the first time */
817 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
818 		mutex_unlock(&xs->mutex);
819 		return err;
820 	}
821 	case XDP_UMEM_REG:
822 	{
823 		size_t mr_size = sizeof(struct xdp_umem_reg);
824 		struct xdp_umem_reg mr = {};
825 		struct xdp_umem *umem;
826 
827 		if (optlen < sizeof(struct xdp_umem_reg_v1))
828 			return -EINVAL;
829 		else if (optlen < sizeof(mr))
830 			mr_size = sizeof(struct xdp_umem_reg_v1);
831 
832 		if (copy_from_sockptr(&mr, optval, mr_size))
833 			return -EFAULT;
834 
835 		mutex_lock(&xs->mutex);
836 		if (xs->state != XSK_READY || xs->umem) {
837 			mutex_unlock(&xs->mutex);
838 			return -EBUSY;
839 		}
840 
841 		umem = xdp_umem_create(&mr);
842 		if (IS_ERR(umem)) {
843 			mutex_unlock(&xs->mutex);
844 			return PTR_ERR(umem);
845 		}
846 
847 		/* Make sure umem is ready before it can be seen by others */
848 		smp_wmb();
849 		WRITE_ONCE(xs->umem, umem);
850 		mutex_unlock(&xs->mutex);
851 		return 0;
852 	}
853 	case XDP_UMEM_FILL_RING:
854 	case XDP_UMEM_COMPLETION_RING:
855 	{
856 		struct xsk_queue **q;
857 		int entries;
858 
859 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
860 			return -EFAULT;
861 
862 		mutex_lock(&xs->mutex);
863 		if (xs->state != XSK_READY) {
864 			mutex_unlock(&xs->mutex);
865 			return -EBUSY;
866 		}
867 
868 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
869 			&xs->cq_tmp;
870 		err = xsk_init_queue(entries, q, true);
871 		mutex_unlock(&xs->mutex);
872 		return err;
873 	}
874 	default:
875 		break;
876 	}
877 
878 	return -ENOPROTOOPT;
879 }
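
/* A hedged userspace sketch (not part of this file) of the setsockopt sequence
 * served above: register a UMEM, then size the rings. Ring sizes must be
 * powers of two (enforced by xsk_init_queue()); the chunk size and ring size
 * here are illustrative.
 */
#include <sys/socket.h>
#include <linux/if_xdp.h>

static int xsk_setup_rings(int fd, void *umem_area, __u64 umem_len)
{
	struct xdp_umem_reg mr = {
		.addr = (__u64)(unsigned long)umem_area,
		.len = umem_len,
		.chunk_size = 2048,
		.headroom = 0,
	};
	int entries = 2048;

	if (setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) ||
	    setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries)) ||
	    setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries)) ||
	    setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries)) ||
	    setsockopt(fd, SOL_XDP, XDP_TX_RING, &entries, sizeof(entries)))
		return -1;
	return 0;
}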
880 
881 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
882 {
883 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
884 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
885 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
886 }
887 
888 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
889 {
890 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
891 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
892 	ring->desc = offsetof(struct xdp_umem_ring, desc);
893 }
894 
895 struct xdp_statistics_v1 {
896 	__u64 rx_dropped;
897 	__u64 rx_invalid_descs;
898 	__u64 tx_invalid_descs;
899 };
900 
901 static int xsk_getsockopt(struct socket *sock, int level, int optname,
902 			  char __user *optval, int __user *optlen)
903 {
904 	struct sock *sk = sock->sk;
905 	struct xdp_sock *xs = xdp_sk(sk);
906 	int len;
907 
908 	if (level != SOL_XDP)
909 		return -ENOPROTOOPT;
910 
911 	if (get_user(len, optlen))
912 		return -EFAULT;
913 	if (len < 0)
914 		return -EINVAL;
915 
916 	switch (optname) {
917 	case XDP_STATISTICS:
918 	{
919 		struct xdp_statistics stats = {};
920 		bool extra_stats = true;
921 		size_t stats_size;
922 
923 		if (len < sizeof(struct xdp_statistics_v1)) {
924 			return -EINVAL;
925 		} else if (len < sizeof(stats)) {
926 			extra_stats = false;
927 			stats_size = sizeof(struct xdp_statistics_v1);
928 		} else {
929 			stats_size = sizeof(stats);
930 		}
931 
932 		mutex_lock(&xs->mutex);
933 		stats.rx_dropped = xs->rx_dropped;
934 		if (extra_stats) {
935 			stats.rx_ring_full = xs->rx_queue_full;
936 			stats.rx_fill_ring_empty_descs =
937 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
938 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
939 		} else {
940 			stats.rx_dropped += xs->rx_queue_full;
941 		}
942 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
943 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
944 		mutex_unlock(&xs->mutex);
945 
946 		if (copy_to_user(optval, &stats, stats_size))
947 			return -EFAULT;
948 		if (put_user(stats_size, optlen))
949 			return -EFAULT;
950 
951 		return 0;
952 	}
953 	case XDP_MMAP_OFFSETS:
954 	{
955 		struct xdp_mmap_offsets off;
956 		struct xdp_mmap_offsets_v1 off_v1;
957 		bool flags_supported = true;
958 		void *to_copy;
959 
960 		if (len < sizeof(off_v1))
961 			return -EINVAL;
962 		else if (len < sizeof(off))
963 			flags_supported = false;
964 
965 		if (flags_supported) {
966 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
967 			 * except for the flags field added to the end.
968 			 */
969 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
970 					       &off.rx);
971 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
972 					       &off.tx);
973 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
974 					       &off.fr);
975 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
976 					       &off.cr);
977 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
978 						ptrs.flags);
979 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
980 						ptrs.flags);
981 			off.fr.flags = offsetof(struct xdp_umem_ring,
982 						ptrs.flags);
983 			off.cr.flags = offsetof(struct xdp_umem_ring,
984 						ptrs.flags);
985 
986 			len = sizeof(off);
987 			to_copy = &off;
988 		} else {
989 			xsk_enter_rxtx_offsets(&off_v1.rx);
990 			xsk_enter_rxtx_offsets(&off_v1.tx);
991 			xsk_enter_umem_offsets(&off_v1.fr);
992 			xsk_enter_umem_offsets(&off_v1.cr);
993 
994 			len = sizeof(off_v1);
995 			to_copy = &off_v1;
996 		}
997 
998 		if (copy_to_user(optval, to_copy, len))
999 			return -EFAULT;
1000 		if (put_user(len, optlen))
1001 			return -EFAULT;
1002 
1003 		return 0;
1004 	}
1005 	case XDP_OPTIONS:
1006 	{
1007 		struct xdp_options opts = {};
1008 
1009 		if (len < sizeof(opts))
1010 			return -EINVAL;
1011 
1012 		mutex_lock(&xs->mutex);
1013 		if (xs->zc)
1014 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1015 		mutex_unlock(&xs->mutex);
1016 
1017 		len = sizeof(opts);
1018 		if (copy_to_user(optval, &opts, len))
1019 			return -EFAULT;
1020 		if (put_user(len, optlen))
1021 			return -EFAULT;
1022 
1023 		return 0;
1024 	}
1025 	default:
1026 		break;
1027 	}
1028 
1029 	return -EOPNOTSUPP;
1030 }
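
/* A hedged userspace sketch (not part of this file) of reading XDP_STATISTICS;
 * older kernels return the shorter v1 layout, so only trust the extra fields
 * when the returned length covers them.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static void xsk_dump_stats(int fd)
{
	struct xdp_statistics stats = {};
	socklen_t optlen = sizeof(stats);

	if (getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen))
		return;

	printf("rx_dropped=%llu rx_invalid=%llu tx_invalid=%llu\n",
	       (unsigned long long)stats.rx_dropped,
	       (unsigned long long)stats.rx_invalid_descs,
	       (unsigned long long)stats.tx_invalid_descs);
	if (optlen >= sizeof(stats))
		printf("rx_ring_full=%llu fq_empty=%llu tx_empty=%llu\n",
		       (unsigned long long)stats.rx_ring_full,
		       (unsigned long long)stats.rx_fill_ring_empty_descs,
		       (unsigned long long)stats.tx_ring_empty_descs);
}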
1031 
1032 static int xsk_mmap(struct file *file, struct socket *sock,
1033 		    struct vm_area_struct *vma)
1034 {
1035 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1036 	unsigned long size = vma->vm_end - vma->vm_start;
1037 	struct xdp_sock *xs = xdp_sk(sock->sk);
1038 	struct xsk_queue *q = NULL;
1039 	unsigned long pfn;
1040 	struct page *qpg;
1041 
1042 	if (READ_ONCE(xs->state) != XSK_READY)
1043 		return -EBUSY;
1044 
1045 	if (offset == XDP_PGOFF_RX_RING) {
1046 		q = READ_ONCE(xs->rx);
1047 	} else if (offset == XDP_PGOFF_TX_RING) {
1048 		q = READ_ONCE(xs->tx);
1049 	} else {
1050 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1051 		smp_rmb();
1052 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1053 			q = READ_ONCE(xs->fq_tmp);
1054 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1055 			q = READ_ONCE(xs->cq_tmp);
1056 	}
1057 
1058 	if (!q)
1059 		return -EINVAL;
1060 
1061 	/* Matches the smp_wmb() in xsk_init_queue */
1062 	smp_rmb();
1063 	qpg = virt_to_head_page(q->ring);
1064 	if (size > page_size(qpg))
1065 		return -EINVAL;
1066 
1067 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1068 	return remap_pfn_range(vma, vma->vm_start, pfn,
1069 			       size, vma->vm_page_prot);
1070 }
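
/* A hedged userspace sketch (not part of this file) of mapping the Rx ring:
 * query XDP_MMAP_OFFSETS, then mmap() at one of the fixed page offsets that
 * xsk_mmap() above dispatches on. The fill/completion rings use the
 * XDP_UMEM_PGOFF_* offsets and the xdp_umem_ring layout instead.
 */
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static void *xsk_map_rx_ring(int fd, unsigned int ndescs)
{
	struct xdp_mmap_offsets off;
	socklen_t optlen = sizeof(off);
	size_t len;

	if (getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen))
		return MAP_FAILED;

	/* Descriptor array follows the producer/consumer/flags words. */
	len = off.rx.desc + ndescs * sizeof(struct xdp_desc);
	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, XDP_PGOFF_RX_RING);
}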
1071 
1072 static int xsk_notifier(struct notifier_block *this,
1073 			unsigned long msg, void *ptr)
1074 {
1075 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1076 	struct net *net = dev_net(dev);
1077 	struct sock *sk;
1078 
1079 	switch (msg) {
1080 	case NETDEV_UNREGISTER:
1081 		mutex_lock(&net->xdp.lock);
1082 		sk_for_each(sk, &net->xdp.list) {
1083 			struct xdp_sock *xs = xdp_sk(sk);
1084 
1085 			mutex_lock(&xs->mutex);
1086 			if (xs->dev == dev) {
1087 				sk->sk_err = ENETDOWN;
1088 				if (!sock_flag(sk, SOCK_DEAD))
1089 					sk->sk_error_report(sk);
1090 
1091 				xsk_unbind_dev(xs);
1092 
1093 				/* Clear device references. */
1094 				xp_clear_dev(xs->pool);
1095 			}
1096 			mutex_unlock(&xs->mutex);
1097 		}
1098 		mutex_unlock(&net->xdp.lock);
1099 		break;
1100 	}
1101 	return NOTIFY_DONE;
1102 }
1103 
1104 static struct proto xsk_proto = {
1105 	.name =		"XDP",
1106 	.owner =	THIS_MODULE,
1107 	.obj_size =	sizeof(struct xdp_sock),
1108 };
1109 
1110 static const struct proto_ops xsk_proto_ops = {
1111 	.family		= PF_XDP,
1112 	.owner		= THIS_MODULE,
1113 	.release	= xsk_release,
1114 	.bind		= xsk_bind,
1115 	.connect	= sock_no_connect,
1116 	.socketpair	= sock_no_socketpair,
1117 	.accept		= sock_no_accept,
1118 	.getname	= sock_no_getname,
1119 	.poll		= xsk_poll,
1120 	.ioctl		= sock_no_ioctl,
1121 	.listen		= sock_no_listen,
1122 	.shutdown	= sock_no_shutdown,
1123 	.setsockopt	= xsk_setsockopt,
1124 	.getsockopt	= xsk_getsockopt,
1125 	.sendmsg	= xsk_sendmsg,
1126 	.recvmsg	= sock_no_recvmsg,
1127 	.mmap		= xsk_mmap,
1128 	.sendpage	= sock_no_sendpage,
1129 };
1130 
1131 static void xsk_destruct(struct sock *sk)
1132 {
1133 	struct xdp_sock *xs = xdp_sk(sk);
1134 
1135 	if (!sock_flag(sk, SOCK_DEAD))
1136 		return;
1137 
1138 	xp_put_pool(xs->pool);
1139 
1140 	sk_refcnt_debug_dec(sk);
1141 }
1142 
1143 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1144 		      int kern)
1145 {
1146 	struct xdp_sock *xs;
1147 	struct sock *sk;
1148 
1149 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1150 		return -EPERM;
1151 	if (sock->type != SOCK_RAW)
1152 		return -ESOCKTNOSUPPORT;
1153 
1154 	if (protocol)
1155 		return -EPROTONOSUPPORT;
1156 
1157 	sock->state = SS_UNCONNECTED;
1158 
1159 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1160 	if (!sk)
1161 		return -ENOBUFS;
1162 
1163 	sock->ops = &xsk_proto_ops;
1164 
1165 	sock_init_data(sock, sk);
1166 
1167 	sk->sk_family = PF_XDP;
1168 
1169 	sk->sk_destruct = xsk_destruct;
1170 	sk_refcnt_debug_inc(sk);
1171 
1172 	sock_set_flag(sk, SOCK_RCU_FREE);
1173 
1174 	xs = xdp_sk(sk);
1175 	xs->state = XSK_READY;
1176 	mutex_init(&xs->mutex);
1177 	spin_lock_init(&xs->rx_lock);
1178 	spin_lock_init(&xs->tx_completion_lock);
1179 
1180 	INIT_LIST_HEAD(&xs->map_list);
1181 	spin_lock_init(&xs->map_list_lock);
1182 
1183 	mutex_lock(&net->xdp.lock);
1184 	sk_add_node_rcu(sk, &net->xdp.list);
1185 	mutex_unlock(&net->xdp.lock);
1186 
1187 	local_bh_disable();
1188 	sock_prot_inuse_add(net, &xsk_proto, 1);
1189 	local_bh_enable();
1190 
1191 	return 0;
1192 }
1193 
1194 static const struct net_proto_family xsk_family_ops = {
1195 	.family = PF_XDP,
1196 	.create = xsk_create,
1197 	.owner	= THIS_MODULE,
1198 };
1199 
1200 static struct notifier_block xsk_netdev_notifier = {
1201 	.notifier_call	= xsk_notifier,
1202 };
1203 
1204 static int __net_init xsk_net_init(struct net *net)
1205 {
1206 	mutex_init(&net->xdp.lock);
1207 	INIT_HLIST_HEAD(&net->xdp.list);
1208 	return 0;
1209 }
1210 
1211 static void __net_exit xsk_net_exit(struct net *net)
1212 {
1213 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1214 }
1215 
1216 static struct pernet_operations xsk_net_ops = {
1217 	.init = xsk_net_init,
1218 	.exit = xsk_net_exit,
1219 };
1220 
1221 static int __init xsk_init(void)
1222 {
1223 	int err, cpu;
1224 
1225 	err = proto_register(&xsk_proto, 0 /* no slab */);
1226 	if (err)
1227 		goto out;
1228 
1229 	err = sock_register(&xsk_family_ops);
1230 	if (err)
1231 		goto out_proto;
1232 
1233 	err = register_pernet_subsys(&xsk_net_ops);
1234 	if (err)
1235 		goto out_sk;
1236 
1237 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1238 	if (err)
1239 		goto out_pernet;
1240 
1241 	for_each_possible_cpu(cpu)
1242 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1243 	return 0;
1244 
1245 out_pernet:
1246 	unregister_pernet_subsys(&xsk_net_ops);
1247 out_sk:
1248 	sock_unregister(PF_XDP);
1249 out_proto:
1250 	proto_unregister(&xsk_proto);
1251 out:
1252 	return err;
1253 }
1254 
1255 fs_initcall(xsk_init);
1256