xref: /openbmc/linux/net/xdp/xsk.c (revision 911b8eac)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets provide a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock_drv.h>
26 #include <net/xdp.h>
27 
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 #include "xsk.h"
31 
32 #define TX_BATCH_SIZE 16
33 
34 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
35 
36 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
37 {
38 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
39 		return;
40 
41 	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
42 	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
43 }
44 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
45 
46 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
47 {
48 	struct xdp_sock *xs;
49 
50 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
51 		return;
52 
53 	rcu_read_lock();
54 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
55 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
56 	}
57 	rcu_read_unlock();
58 
59 	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
60 }
61 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
62 
63 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
64 {
65 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
66 		return;
67 
68 	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
69 	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
70 }
71 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
72 
73 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
74 {
75 	struct xdp_sock *xs;
76 
77 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
78 		return;
79 
80 	rcu_read_lock();
81 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
82 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
83 	}
84 	rcu_read_unlock();
85 
86 	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
87 }
88 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
89 
90 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
91 {
92 	return pool->uses_need_wakeup;
93 }
94 EXPORT_SYMBOL(xsk_uses_need_wakeup);
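
/*
 * Editorial note: a hedged user-space sketch (not part of this source file)
 * of how the need_wakeup flags toggled by the helpers above are typically
 * consumed.  The flag words live in the mmap'ed rings per the UAPI in
 * <linux/if_xdp.h>; "fill_ring_flags", "tx_ring_flags", "xsk_fd" and
 * "timeout" below are illustrative names only.
 *
 *	// Only kick the kernel when the driver asked for it:
 *	if (*fill_ring_flags & XDP_RING_NEED_WAKEUP) {
 *		struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };
 *
 *		poll(&pfd, 1, timeout);		// drives Rx via xsk_poll()
 *	}
 *	if (*tx_ring_flags & XDP_RING_NEED_WAKEUP)
 *		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0); // kicks Tx
 */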
95 
96 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
97 					    u16 queue_id)
98 {
99 	if (queue_id < dev->real_num_rx_queues)
100 		return dev->_rx[queue_id].pool;
101 	if (queue_id < dev->real_num_tx_queues)
102 		return dev->_tx[queue_id].pool;
103 
104 	return NULL;
105 }
106 EXPORT_SYMBOL(xsk_get_pool_from_qid);
107 
108 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
109 {
110 	if (queue_id < dev->real_num_rx_queues)
111 		dev->_rx[queue_id].pool = NULL;
112 	if (queue_id < dev->real_num_tx_queues)
113 		dev->_tx[queue_id].pool = NULL;
114 }
115 
116 /* The buffer pool is stored both in the _rx struct and the _tx struct because we do
117  * not know whether the device has more Tx queues than Rx queues, or vice versa.
118  * This might also change at run time.
119  */
120 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
121 			u16 queue_id)
122 {
123 	if (queue_id >= max_t(unsigned int,
124 			      dev->real_num_rx_queues,
125 			      dev->real_num_tx_queues))
126 		return -EINVAL;
127 
128 	if (queue_id < dev->real_num_rx_queues)
129 		dev->_rx[queue_id].pool = pool;
130 	if (queue_id < dev->real_num_tx_queues)
131 		dev->_tx[queue_id].pool = pool;
132 
133 	return 0;
134 }
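
/*
 * Editorial note: a hedged driver-side sketch (not from this file) of how a
 * zero-copy capable driver typically looks up the pool registered above when
 * (re)configuring a queue.  The "my_drv_*" names are purely illustrative.
 *
 *	static int my_drv_enable_xsk(struct net_device *dev, u16 qid)
 *	{
 *		struct xsk_buff_pool *pool;
 *
 *		pool = xsk_get_pool_from_qid(dev, qid);
 *		if (!pool)
 *			return -EINVAL;	// no AF_XDP socket bound to this qid
 *		// ... point the queue's buffer allocation at this pool ...
 *		return 0;
 *	}
 */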
135 
136 void xp_release(struct xdp_buff_xsk *xskb)
137 {
138 	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
139 }
140 
141 static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
142 {
143 	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
144 
145 	offset += xskb->pool->headroom;
146 	if (!xskb->pool->unaligned)
147 		return xskb->orig_addr + offset;
148 	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
149 }
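
/*
 * Editorial note: in aligned mode the handle returned above is simply the
 * original chunk address plus the data offset (headroom included).  In
 * unaligned mode the offset is instead packed into the bits above
 * XSK_UNALIGNED_BUF_OFFSET_SHIFT, leaving the chunk address in the low bits
 * so user space can recover both pieces from one 64-bit descriptor address.
 */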
150 
151 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
152 {
153 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
154 	u64 addr;
155 	int err;
156 
157 	addr = xp_get_handle(xskb);
158 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
159 	if (err) {
160 		xs->rx_queue_full++;
161 		return err;
162 	}
163 
164 	xp_release(xskb);
165 	return 0;
166 }
167 
168 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
169 {
170 	void *from_buf, *to_buf;
171 	u32 metalen;
172 
173 	if (unlikely(xdp_data_meta_unsupported(from))) {
174 		from_buf = from->data;
175 		to_buf = to->data;
176 		metalen = 0;
177 	} else {
178 		from_buf = from->data_meta;
179 		metalen = from->data - from->data_meta;
180 		to_buf = to->data - metalen;
181 	}
182 
183 	memcpy(to_buf, from_buf, len + metalen);
184 }
185 
186 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
187 		     bool explicit_free)
188 {
189 	struct xdp_buff *xsk_xdp;
190 	int err;
191 
192 	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
193 		xs->rx_dropped++;
194 		return -ENOSPC;
195 	}
196 
197 	xsk_xdp = xsk_buff_alloc(xs->pool);
198 	if (!xsk_xdp) {
199 		xs->rx_dropped++;
200 		return -ENOSPC;
201 	}
202 
203 	xsk_copy_xdp(xsk_xdp, xdp, len);
204 	err = __xsk_rcv_zc(xs, xsk_xdp, len);
205 	if (err) {
206 		xsk_buff_free(xsk_xdp);
207 		return err;
208 	}
209 	if (explicit_free)
210 		xdp_return_buff(xdp);
211 	return 0;
212 }
213 
214 static bool xsk_is_bound(struct xdp_sock *xs)
215 {
216 	if (READ_ONCE(xs->state) == XSK_BOUND) {
217 		/* Matches smp_wmb() in bind(). */
218 		smp_rmb();
219 		return true;
220 	}
221 	return false;
222 }
223 
224 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
225 		   bool explicit_free)
226 {
227 	u32 len;
228 
229 	if (!xsk_is_bound(xs))
230 		return -EINVAL;
231 
232 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
233 		return -EINVAL;
234 
235 	len = xdp->data_end - xdp->data;
236 
237 	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
238 		__xsk_rcv_zc(xs, xdp, len) :
239 		__xsk_rcv(xs, xdp, len, explicit_free);
240 }
241 
242 static void xsk_flush(struct xdp_sock *xs)
243 {
244 	xskq_prod_submit(xs->rx);
245 	__xskq_cons_release(xs->pool->fq);
246 	sock_def_readable(&xs->sk);
247 }
248 
249 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
250 {
251 	int err;
252 
253 	spin_lock_bh(&xs->rx_lock);
254 	err = xsk_rcv(xs, xdp, false);
255 	xsk_flush(xs);
256 	spin_unlock_bh(&xs->rx_lock);
257 	return err;
258 }
259 
260 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
261 {
262 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
263 	int err;
264 
265 	err = xsk_rcv(xs, xdp, true);
266 	if (err)
267 		return err;
268 
269 	if (!xs->flush_node.prev)
270 		list_add(&xs->flush_node, flush_list);
271 
272 	return 0;
273 }
274 
275 void __xsk_map_flush(void)
276 {
277 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
278 	struct xdp_sock *xs, *tmp;
279 
280 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
281 		xsk_flush(xs);
282 		__list_del_clearprev(&xs->flush_node);
283 	}
284 }
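
/*
 * Editorial note: __xsk_map_redirect() queues sockets on the per-cpu
 * xskmap_flush_list while XDP_REDIRECT verdicts are processed, and
 * __xsk_map_flush() then publishes the batched Rx descriptors and wakes the
 * readers.  The flush is assumed to be reached from the driver's NAPI poll
 * via xdp_do_flush(); that caller lives outside this file.
 */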
285 
286 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
287 {
288 	xskq_prod_submit_n(pool->cq, nb_entries);
289 }
290 EXPORT_SYMBOL(xsk_tx_completed);
291 
292 void xsk_tx_release(struct xsk_buff_pool *pool)
293 {
294 	struct xdp_sock *xs;
295 
296 	rcu_read_lock();
297 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
298 		__xskq_cons_release(xs->tx);
299 		xs->sk.sk_write_space(&xs->sk);
300 	}
301 	rcu_read_unlock();
302 }
303 EXPORT_SYMBOL(xsk_tx_release);
304 
305 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
306 {
307 	struct xdp_sock *xs;
308 
309 	rcu_read_lock();
310 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
311 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
312 			xs->tx->queue_empty_descs++;
313 			continue;
314 		}
315 
316 		/* This is the backpressure mechanism for the Tx path.
317 		 * Reserve space in the completion queue and only proceed
318 		 * if there is space in it. This avoids having to implement
319 		 * any buffering in the Tx path.
320 		 */
321 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
322 			goto out;
323 
324 		xskq_cons_release(xs->tx);
325 		rcu_read_unlock();
326 		return true;
327 	}
328 
329 out:
330 	rcu_read_unlock();
331 	return false;
332 }
333 EXPORT_SYMBOL(xsk_tx_peek_desc);
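
/*
 * Editorial note: a hedged driver-side sketch (not from this file) of the
 * usual pairing of xsk_tx_peek_desc()/xsk_tx_release() on the transmit path
 * and xsk_tx_completed() on clean-up.  "budget", "ring" and "my_drv_*" are
 * illustrative; xsk_buff_raw_get_dma() comes from <net/xdp_sock_drv.h>.
 *
 *	while (budget--) {
 *		struct xdp_desc desc;
 *		dma_addr_t dma;
 *
 *		if (!xsk_tx_peek_desc(pool, &desc))
 *			break;
 *		dma = xsk_buff_raw_get_dma(pool, desc.addr);
 *		my_drv_post_tx_descriptor(ring, dma, desc.len);
 *	}
 *	xsk_tx_release(pool);
 *
 *	// ... and once hardware reports the frames as sent:
 *	xsk_tx_completed(pool, nb_cleaned);
 */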
334 
335 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
336 {
337 	struct net_device *dev = xs->dev;
338 	int err;
339 
340 	rcu_read_lock();
341 	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
342 	rcu_read_unlock();
343 
344 	return err;
345 }
346 
347 static int xsk_zc_xmit(struct xdp_sock *xs)
348 {
349 	return xsk_wakeup(xs, XDP_WAKEUP_TX);
350 }
351 
352 static void xsk_destruct_skb(struct sk_buff *skb)
353 {
354 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
355 	struct xdp_sock *xs = xdp_sk(skb->sk);
356 	unsigned long flags;
357 
358 	spin_lock_irqsave(&xs->tx_completion_lock, flags);
359 	xskq_prod_submit_addr(xs->pool->cq, addr);
360 	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
361 
362 	sock_wfree(skb);
363 }
364 
365 static int xsk_generic_xmit(struct sock *sk)
366 {
367 	struct xdp_sock *xs = xdp_sk(sk);
368 	u32 max_batch = TX_BATCH_SIZE;
369 	bool sent_frame = false;
370 	struct xdp_desc desc;
371 	struct sk_buff *skb;
372 	int err = 0;
373 
374 	mutex_lock(&xs->mutex);
375 
376 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
377 		goto out;
378 
379 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
380 		char *buffer;
381 		u64 addr;
382 		u32 len;
383 
384 		if (max_batch-- == 0) {
385 			err = -EAGAIN;
386 			goto out;
387 		}
388 
389 		len = desc.len;
390 		skb = sock_alloc_send_skb(sk, len, 1, &err);
391 		if (unlikely(!skb))
392 			goto out;
393 
394 		skb_put(skb, len);
395 		addr = desc.addr;
396 		buffer = xsk_buff_raw_get_data(xs->pool, addr);
397 		err = skb_store_bits(skb, 0, buffer, len);
398 		/* This is the backpressure mechanism for the Tx path.
399 		 * Reserve space in the completion queue and only proceed
400 		 * if there is space in it. This avoids having to implement
401 		 * any buffering in the Tx path.
402 		 */
403 		if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
404 			kfree_skb(skb);
405 			goto out;
406 		}
407 
408 		skb->dev = xs->dev;
409 		skb->priority = sk->sk_priority;
410 		skb->mark = sk->sk_mark;
411 		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
412 		skb->destructor = xsk_destruct_skb;
413 
414 		err = dev_direct_xmit(skb, xs->queue_id);
415 		xskq_cons_release(xs->tx);
416 		/* Ignore NET_XMIT_CN as the packet might have been sent anyway */
417 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
418 			/* SKB completed but not sent */
419 			err = -EBUSY;
420 			goto out;
421 		}
422 
423 		sent_frame = true;
424 	}
425 
426 	xs->tx->queue_empty_descs++;
427 
428 out:
429 	if (sent_frame)
430 		sk->sk_write_space(sk);
431 
432 	mutex_unlock(&xs->mutex);
433 	return err;
434 }
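
/*
 * Editorial note: the completion queue slot reserved for each frame above is
 * filled in later by xsk_destruct_skb(), which publishes the descriptor
 * address to user space once the skb is freed after transmission.
 */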
435 
436 static int __xsk_sendmsg(struct sock *sk)
437 {
438 	struct xdp_sock *xs = xdp_sk(sk);
439 
440 	if (unlikely(!(xs->dev->flags & IFF_UP)))
441 		return -ENETDOWN;
442 	if (unlikely(!xs->tx))
443 		return -ENOBUFS;
444 
445 	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
446 }
447 
448 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
449 {
450 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
451 	struct sock *sk = sock->sk;
452 	struct xdp_sock *xs = xdp_sk(sk);
453 
454 	if (unlikely(!xsk_is_bound(xs)))
455 		return -ENXIO;
456 	if (unlikely(need_wait))
457 		return -EOPNOTSUPP;
458 
459 	return __xsk_sendmsg(sk);
460 }
461 
462 static __poll_t xsk_poll(struct file *file, struct socket *sock,
463 			     struct poll_table_struct *wait)
464 {
465 	__poll_t mask = datagram_poll(file, sock, wait);
466 	struct sock *sk = sock->sk;
467 	struct xdp_sock *xs = xdp_sk(sk);
468 	struct xsk_buff_pool *pool;
469 
470 	if (unlikely(!xsk_is_bound(xs)))
471 		return mask;
472 
473 	pool = xs->pool;
474 
475 	if (pool->cached_need_wakeup) {
476 		if (xs->zc)
477 			xsk_wakeup(xs, pool->cached_need_wakeup);
478 		else
479 			/* Poll needs to drive Tx also in copy mode */
480 			/* Poll also needs to drive Tx processing in copy mode */
481 	}
482 
483 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
484 		mask |= EPOLLIN | EPOLLRDNORM;
485 	if (xs->tx && !xskq_cons_is_full(xs->tx))
486 		mask |= EPOLLOUT | EPOLLWRNORM;
487 
488 	return mask;
489 }
490 
491 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
492 			  bool umem_queue)
493 {
494 	struct xsk_queue *q;
495 
496 	if (entries == 0 || *queue || !is_power_of_2(entries))
497 		return -EINVAL;
498 
499 	q = xskq_create(entries, umem_queue);
500 	if (!q)
501 		return -ENOMEM;
502 
503 	/* Make sure queue is ready before it can be seen by others */
504 	smp_wmb();
505 	WRITE_ONCE(*queue, q);
506 	return 0;
507 }
508 
509 static void xsk_unbind_dev(struct xdp_sock *xs)
510 {
511 	struct net_device *dev = xs->dev;
512 
513 	if (xs->state != XSK_BOUND)
514 		return;
515 	WRITE_ONCE(xs->state, XSK_UNBOUND);
516 
517 	/* Wait for driver to stop using the xdp socket. */
518 	xp_del_xsk(xs->pool, xs);
519 	xs->dev = NULL;
520 	synchronize_net();
521 	dev_put(dev);
522 }
523 
524 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
525 					      struct xdp_sock ***map_entry)
526 {
527 	struct xsk_map *map = NULL;
528 	struct xsk_map_node *node;
529 
530 	*map_entry = NULL;
531 
532 	spin_lock_bh(&xs->map_list_lock);
533 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
534 					node);
535 	if (node) {
536 		WARN_ON(xsk_map_inc(node->map));
537 		map = node->map;
538 		*map_entry = node->map_entry;
539 	}
540 	spin_unlock_bh(&xs->map_list_lock);
541 	return map;
542 }
543 
544 static void xsk_delete_from_maps(struct xdp_sock *xs)
545 {
546 	/* This function removes the current XDP socket from all the
547 	 * maps it resides in. We need to take extra care here, due to
548 	 * the two locks involved. Each map has a lock synchronizing
549 	 * updates to the entries, and each socket has a lock that
550 	 * synchronizes access to the list of maps (map_list). For
551 	 * deadlock avoidance the locks need to be taken in the order
552 	 * "map lock"->"socket map list lock". We start off by
553 	 * accessing the socket map list, and take a reference to the
554 	 * map to guarantee existence between the
555 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
556 	 * calls. Then we ask the map to remove the socket, which
557 	 * tries to remove the socket from the map. Note that there
558 	 * might be updates to the map between
559 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
560 	 */
561 	struct xdp_sock **map_entry = NULL;
562 	struct xsk_map *map;
563 
564 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
565 		xsk_map_try_sock_delete(map, xs, map_entry);
566 		xsk_map_put(map);
567 	}
568 }
569 
570 static int xsk_release(struct socket *sock)
571 {
572 	struct sock *sk = sock->sk;
573 	struct xdp_sock *xs = xdp_sk(sk);
574 	struct net *net;
575 
576 	if (!sk)
577 		return 0;
578 
579 	net = sock_net(sk);
580 
581 	mutex_lock(&net->xdp.lock);
582 	sk_del_node_init_rcu(sk);
583 	mutex_unlock(&net->xdp.lock);
584 
585 	local_bh_disable();
586 	sock_prot_inuse_add(net, sk->sk_prot, -1);
587 	local_bh_enable();
588 
589 	xsk_delete_from_maps(xs);
590 	mutex_lock(&xs->mutex);
591 	xsk_unbind_dev(xs);
592 	mutex_unlock(&xs->mutex);
593 
594 	xskq_destroy(xs->rx);
595 	xskq_destroy(xs->tx);
596 	xskq_destroy(xs->fq_tmp);
597 	xskq_destroy(xs->cq_tmp);
598 
599 	sock_orphan(sk);
600 	sock->sk = NULL;
601 
602 	sk_refcnt_debug_release(sk);
603 	sock_put(sk);
604 
605 	return 0;
606 }
607 
608 static struct socket *xsk_lookup_xsk_from_fd(int fd)
609 {
610 	struct socket *sock;
611 	int err;
612 
613 	sock = sockfd_lookup(fd, &err);
614 	if (!sock)
615 		return ERR_PTR(-ENOTSOCK);
616 
617 	if (sock->sk->sk_family != PF_XDP) {
618 		sockfd_put(sock);
619 		return ERR_PTR(-ENOPROTOOPT);
620 	}
621 
622 	return sock;
623 }
624 
625 static bool xsk_validate_queues(struct xdp_sock *xs)
626 {
627 	return xs->fq_tmp && xs->cq_tmp;
628 }
629 
630 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
631 {
632 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
633 	struct sock *sk = sock->sk;
634 	struct xdp_sock *xs = xdp_sk(sk);
635 	struct net_device *dev;
636 	u32 flags, qid;
637 	int err = 0;
638 
639 	if (addr_len < sizeof(struct sockaddr_xdp))
640 		return -EINVAL;
641 	if (sxdp->sxdp_family != AF_XDP)
642 		return -EINVAL;
643 
644 	flags = sxdp->sxdp_flags;
645 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
646 		      XDP_USE_NEED_WAKEUP))
647 		return -EINVAL;
648 
649 	rtnl_lock();
650 	mutex_lock(&xs->mutex);
651 	if (xs->state != XSK_READY) {
652 		err = -EBUSY;
653 		goto out_release;
654 	}
655 
656 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
657 	if (!dev) {
658 		err = -ENODEV;
659 		goto out_release;
660 	}
661 
662 	if (!xs->rx && !xs->tx) {
663 		err = -EINVAL;
664 		goto out_unlock;
665 	}
666 
667 	qid = sxdp->sxdp_queue_id;
668 
669 	if (flags & XDP_SHARED_UMEM) {
670 		struct xdp_sock *umem_xs;
671 		struct socket *sock;
672 
673 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
674 		    (flags & XDP_USE_NEED_WAKEUP)) {
675 			/* Cannot specify flags for shared sockets. */
676 			err = -EINVAL;
677 			goto out_unlock;
678 		}
679 
680 		if (xs->umem) {
681 			/* We already have our own umem. */
682 			err = -EINVAL;
683 			goto out_unlock;
684 		}
685 
686 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
687 		if (IS_ERR(sock)) {
688 			err = PTR_ERR(sock);
689 			goto out_unlock;
690 		}
691 
692 		umem_xs = xdp_sk(sock->sk);
693 		if (!xsk_is_bound(umem_xs)) {
694 			err = -EBADF;
695 			sockfd_put(sock);
696 			goto out_unlock;
697 		}
698 
699 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
700 			/* Share the umem with another socket on another qid
701 			 * and/or device.
702 			 */
703 			xs->pool = xp_create_and_assign_umem(xs,
704 							     umem_xs->umem);
705 			if (!xs->pool) {
706 				sockfd_put(sock);
707 				goto out_unlock;
708 			}
709 
710 			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
711 						   dev, qid);
712 			if (err) {
713 				xp_destroy(xs->pool);
714 				xs->pool = NULL;
715 				sockfd_put(sock);
716 				goto out_unlock;
717 			}
718 		} else {
719 			/* Share the buffer pool with the other socket. */
720 			if (xs->fq_tmp || xs->cq_tmp) {
721 				/* Do not allow setting your own fq or cq. */
722 				err = -EINVAL;
723 				sockfd_put(sock);
724 				goto out_unlock;
725 			}
726 
727 			xp_get_pool(umem_xs->pool);
728 			xs->pool = umem_xs->pool;
729 		}
730 
731 		xdp_get_umem(umem_xs->umem);
732 		WRITE_ONCE(xs->umem, umem_xs->umem);
733 		sockfd_put(sock);
734 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
735 		err = -EINVAL;
736 		goto out_unlock;
737 	} else {
738 		/* This xsk has its own umem. */
739 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
740 		if (!xs->pool) {
741 			err = -ENOMEM;
742 			goto out_unlock;
743 		}
744 
745 		err = xp_assign_dev(xs->pool, dev, qid, flags);
746 		if (err) {
747 			xp_destroy(xs->pool);
748 			xs->pool = NULL;
749 			goto out_unlock;
750 		}
751 	}
752 
753 	xs->dev = dev;
754 	xs->zc = xs->umem->zc;
755 	xs->queue_id = qid;
756 	xp_add_xsk(xs->pool, xs);
757 
758 out_unlock:
759 	if (err) {
760 		dev_put(dev);
761 	} else {
762 		/* Matches smp_rmb() in bind() for shared umem
763 		 * sockets, and xsk_is_bound().
764 		 */
765 		smp_wmb();
766 		WRITE_ONCE(xs->state, XSK_BOUND);
767 	}
768 out_release:
769 	mutex_unlock(&xs->mutex);
770 	rtnl_unlock();
771 	return err;
772 }
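
/*
 * Editorial note: a hedged user-space sketch (not from this file) of the
 * bind() call handled above.  "xsk_fd", "ifindex" and "queue" are
 * illustrative; struct sockaddr_xdp comes from <linux/if_xdp.h>.
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family   = AF_XDP,
 *		.sxdp_ifindex  = ifindex,
 *		.sxdp_queue_id = queue,
 *		.sxdp_flags    = XDP_USE_NEED_WAKEUP,	// or XDP_COPY/XDP_ZEROCOPY
 *	};
 *
 *	if (bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp)))
 *		err(1, "bind");		// -EBUSY, -ENODEV, -EINVAL, ...
 *
 * For XDP_SHARED_UMEM, sxdp_shared_umem_fd names an already bound socket
 * instead, and the per-socket flags above must be left clear.
 */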
773 
774 struct xdp_umem_reg_v1 {
775 	__u64 addr; /* Start of packet data area */
776 	__u64 len; /* Length of packet data area */
777 	__u32 chunk_size;
778 	__u32 headroom;
779 };
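
/*
 * Editorial note: this mirrors what is assumed to be the original UAPI
 * layout of struct xdp_umem_reg, before a flags field was appended.
 * xsk_setsockopt() below accepts either size so applications built against
 * the older header keep working.
 */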
780 
781 static int xsk_setsockopt(struct socket *sock, int level, int optname,
782 			  sockptr_t optval, unsigned int optlen)
783 {
784 	struct sock *sk = sock->sk;
785 	struct xdp_sock *xs = xdp_sk(sk);
786 	int err;
787 
788 	if (level != SOL_XDP)
789 		return -ENOPROTOOPT;
790 
791 	switch (optname) {
792 	case XDP_RX_RING:
793 	case XDP_TX_RING:
794 	{
795 		struct xsk_queue **q;
796 		int entries;
797 
798 		if (optlen < sizeof(entries))
799 			return -EINVAL;
800 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
801 			return -EFAULT;
802 
803 		mutex_lock(&xs->mutex);
804 		if (xs->state != XSK_READY) {
805 			mutex_unlock(&xs->mutex);
806 			return -EBUSY;
807 		}
808 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
809 		err = xsk_init_queue(entries, q, false);
810 		if (!err && optname == XDP_TX_RING)
811 			/* Tx needs to be explicitly woken up the first time */
812 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
813 		mutex_unlock(&xs->mutex);
814 		return err;
815 	}
816 	case XDP_UMEM_REG:
817 	{
818 		size_t mr_size = sizeof(struct xdp_umem_reg);
819 		struct xdp_umem_reg mr = {};
820 		struct xdp_umem *umem;
821 
822 		if (optlen < sizeof(struct xdp_umem_reg_v1))
823 			return -EINVAL;
824 		else if (optlen < sizeof(mr))
825 			mr_size = sizeof(struct xdp_umem_reg_v1);
826 
827 		if (copy_from_sockptr(&mr, optval, mr_size))
828 			return -EFAULT;
829 
830 		mutex_lock(&xs->mutex);
831 		if (xs->state != XSK_READY || xs->umem) {
832 			mutex_unlock(&xs->mutex);
833 			return -EBUSY;
834 		}
835 
836 		umem = xdp_umem_create(&mr);
837 		if (IS_ERR(umem)) {
838 			mutex_unlock(&xs->mutex);
839 			return PTR_ERR(umem);
840 		}
841 
842 		/* Make sure umem is ready before it can be seen by others */
843 		smp_wmb();
844 		WRITE_ONCE(xs->umem, umem);
845 		mutex_unlock(&xs->mutex);
846 		return 0;
847 	}
848 	case XDP_UMEM_FILL_RING:
849 	case XDP_UMEM_COMPLETION_RING:
850 	{
851 		struct xsk_queue **q;
852 		int entries;
853 
854 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
855 			return -EFAULT;
856 
857 		mutex_lock(&xs->mutex);
858 		if (xs->state != XSK_READY) {
859 			mutex_unlock(&xs->mutex);
860 			return -EBUSY;
861 		}
862 
863 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
864 			&xs->cq_tmp;
865 		err = xsk_init_queue(entries, q, true);
866 		mutex_unlock(&xs->mutex);
867 		return err;
868 	}
869 	default:
870 		break;
871 	}
872 
873 	return -ENOPROTOOPT;
874 }
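
/*
 * Editorial note: a hedged user-space sketch (not from this file) of the
 * setsockopt() sequence the switch above implements.  Sizes are
 * illustrative; ring sizes must be powers of two (see xsk_init_queue()).
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *	struct xdp_umem_reg mr = {
 *		.addr       = (__u64)(uintptr_t)umem_area,	// page-aligned
 *		.len        = umem_len,
 *		.chunk_size = 2048,
 *		.headroom   = 0,
 *	};
 *	int ring_sz = 1024;
 *
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_sz, sizeof(ring_sz));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_sz, sizeof(ring_sz));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_sz, sizeof(ring_sz));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &ring_sz, sizeof(ring_sz));
 */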
875 
876 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
877 {
878 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
879 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
880 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
881 }
882 
883 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
884 {
885 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
886 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
887 	ring->desc = offsetof(struct xdp_umem_ring, desc);
888 }
889 
890 struct xdp_statistics_v1 {
891 	__u64 rx_dropped;
892 	__u64 rx_invalid_descs;
893 	__u64 tx_invalid_descs;
894 };
895 
896 static int xsk_getsockopt(struct socket *sock, int level, int optname,
897 			  char __user *optval, int __user *optlen)
898 {
899 	struct sock *sk = sock->sk;
900 	struct xdp_sock *xs = xdp_sk(sk);
901 	int len;
902 
903 	if (level != SOL_XDP)
904 		return -ENOPROTOOPT;
905 
906 	if (get_user(len, optlen))
907 		return -EFAULT;
908 	if (len < 0)
909 		return -EINVAL;
910 
911 	switch (optname) {
912 	case XDP_STATISTICS:
913 	{
914 		struct xdp_statistics stats = {};
915 		bool extra_stats = true;
916 		size_t stats_size;
917 
918 		if (len < sizeof(struct xdp_statistics_v1)) {
919 			return -EINVAL;
920 		} else if (len < sizeof(stats)) {
921 			extra_stats = false;
922 			stats_size = sizeof(struct xdp_statistics_v1);
923 		} else {
924 			stats_size = sizeof(stats);
925 		}
926 
927 		mutex_lock(&xs->mutex);
928 		stats.rx_dropped = xs->rx_dropped;
929 		if (extra_stats) {
930 			stats.rx_ring_full = xs->rx_queue_full;
931 			stats.rx_fill_ring_empty_descs =
932 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
933 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
934 		} else {
935 			stats.rx_dropped += xs->rx_queue_full;
936 		}
937 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
938 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
939 		mutex_unlock(&xs->mutex);
940 
941 		if (copy_to_user(optval, &stats, stats_size))
942 			return -EFAULT;
943 		if (put_user(stats_size, optlen))
944 			return -EFAULT;
945 
946 		return 0;
947 	}
948 	case XDP_MMAP_OFFSETS:
949 	{
950 		struct xdp_mmap_offsets off;
951 		struct xdp_mmap_offsets_v1 off_v1;
952 		bool flags_supported = true;
953 		void *to_copy;
954 
955 		if (len < sizeof(off_v1))
956 			return -EINVAL;
957 		else if (len < sizeof(off))
958 			flags_supported = false;
959 
960 		if (flags_supported) {
961 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
962 			 * except for the flags field added to the end.
963 			 */
964 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
965 					       &off.rx);
966 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
967 					       &off.tx);
968 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
969 					       &off.fr);
970 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
971 					       &off.cr);
972 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
973 						ptrs.flags);
974 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
975 						ptrs.flags);
976 			off.fr.flags = offsetof(struct xdp_umem_ring,
977 						ptrs.flags);
978 			off.cr.flags = offsetof(struct xdp_umem_ring,
979 						ptrs.flags);
980 
981 			len = sizeof(off);
982 			to_copy = &off;
983 		} else {
984 			xsk_enter_rxtx_offsets(&off_v1.rx);
985 			xsk_enter_rxtx_offsets(&off_v1.tx);
986 			xsk_enter_umem_offsets(&off_v1.fr);
987 			xsk_enter_umem_offsets(&off_v1.cr);
988 
989 			len = sizeof(off_v1);
990 			to_copy = &off_v1;
991 		}
992 
993 		if (copy_to_user(optval, to_copy, len))
994 			return -EFAULT;
995 		if (put_user(len, optlen))
996 			return -EFAULT;
997 
998 		return 0;
999 	}
1000 	case XDP_OPTIONS:
1001 	{
1002 		struct xdp_options opts = {};
1003 
1004 		if (len < sizeof(opts))
1005 			return -EINVAL;
1006 
1007 		mutex_lock(&xs->mutex);
1008 		if (xs->zc)
1009 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1010 		mutex_unlock(&xs->mutex);
1011 
1012 		len = sizeof(opts);
1013 		if (copy_to_user(optval, &opts, len))
1014 			return -EFAULT;
1015 		if (put_user(len, optlen))
1016 			return -EFAULT;
1017 
1018 		return 0;
1019 	}
1020 	default:
1021 		break;
1022 	}
1023 
1024 	return -EOPNOTSUPP;
1025 }
1026 
1027 static int xsk_mmap(struct file *file, struct socket *sock,
1028 		    struct vm_area_struct *vma)
1029 {
1030 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1031 	unsigned long size = vma->vm_end - vma->vm_start;
1032 	struct xdp_sock *xs = xdp_sk(sock->sk);
1033 	struct xsk_queue *q = NULL;
1034 	unsigned long pfn;
1035 	struct page *qpg;
1036 
1037 	if (READ_ONCE(xs->state) != XSK_READY)
1038 		return -EBUSY;
1039 
1040 	if (offset == XDP_PGOFF_RX_RING) {
1041 		q = READ_ONCE(xs->rx);
1042 	} else if (offset == XDP_PGOFF_TX_RING) {
1043 		q = READ_ONCE(xs->tx);
1044 	} else {
1045 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1046 		smp_rmb();
1047 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1048 			q = READ_ONCE(xs->fq_tmp);
1049 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1050 			q = READ_ONCE(xs->cq_tmp);
1051 	}
1052 
1053 	if (!q)
1054 		return -EINVAL;
1055 
1056 	/* Matches the smp_wmb() in xsk_init_queue */
1057 	smp_rmb();
1058 	qpg = virt_to_head_page(q->ring);
1059 	if (size > page_size(qpg))
1060 		return -EINVAL;
1061 
1062 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1063 	return remap_pfn_range(vma, vma->vm_start, pfn,
1064 			       size, vma->vm_page_prot);
1065 }
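
/*
 * Editorial note: a hedged user-space sketch (not from this file) of mapping
 * the Rx ring handled above.  Ring geometry comes from the XDP_MMAP_OFFSETS
 * getsockopt; "ring_sz" matches the XDP_RX_RING size and the length
 * calculation is illustrative.
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	void *rx_map;
 *
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	rx_map = mmap(NULL, off.rx.desc + ring_sz * sizeof(struct xdp_desc),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      fd, XDP_PGOFF_RX_RING);
 *	__u32 *rx_producer = rx_map + off.rx.producer;
 *	__u32 *rx_consumer = rx_map + off.rx.consumer;
 *	struct xdp_desc *rx_descs = rx_map + off.rx.desc;
 */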
1066 
1067 static int xsk_notifier(struct notifier_block *this,
1068 			unsigned long msg, void *ptr)
1069 {
1070 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1071 	struct net *net = dev_net(dev);
1072 	struct sock *sk;
1073 
1074 	switch (msg) {
1075 	case NETDEV_UNREGISTER:
1076 		mutex_lock(&net->xdp.lock);
1077 		sk_for_each(sk, &net->xdp.list) {
1078 			struct xdp_sock *xs = xdp_sk(sk);
1079 
1080 			mutex_lock(&xs->mutex);
1081 			if (xs->dev == dev) {
1082 				sk->sk_err = ENETDOWN;
1083 				if (!sock_flag(sk, SOCK_DEAD))
1084 					sk->sk_error_report(sk);
1085 
1086 				xsk_unbind_dev(xs);
1087 
1088 				/* Clear device references. */
1089 				xp_clear_dev(xs->pool);
1090 			}
1091 			mutex_unlock(&xs->mutex);
1092 		}
1093 		mutex_unlock(&net->xdp.lock);
1094 		break;
1095 	}
1096 	return NOTIFY_DONE;
1097 }
1098 
1099 static struct proto xsk_proto = {
1100 	.name =		"XDP",
1101 	.owner =	THIS_MODULE,
1102 	.obj_size =	sizeof(struct xdp_sock),
1103 };
1104 
1105 static const struct proto_ops xsk_proto_ops = {
1106 	.family		= PF_XDP,
1107 	.owner		= THIS_MODULE,
1108 	.release	= xsk_release,
1109 	.bind		= xsk_bind,
1110 	.connect	= sock_no_connect,
1111 	.socketpair	= sock_no_socketpair,
1112 	.accept		= sock_no_accept,
1113 	.getname	= sock_no_getname,
1114 	.poll		= xsk_poll,
1115 	.ioctl		= sock_no_ioctl,
1116 	.listen		= sock_no_listen,
1117 	.shutdown	= sock_no_shutdown,
1118 	.setsockopt	= xsk_setsockopt,
1119 	.getsockopt	= xsk_getsockopt,
1120 	.sendmsg	= xsk_sendmsg,
1121 	.recvmsg	= sock_no_recvmsg,
1122 	.mmap		= xsk_mmap,
1123 	.sendpage	= sock_no_sendpage,
1124 };
1125 
1126 static void xsk_destruct(struct sock *sk)
1127 {
1128 	struct xdp_sock *xs = xdp_sk(sk);
1129 
1130 	if (!sock_flag(sk, SOCK_DEAD))
1131 		return;
1132 
1133 	xp_put_pool(xs->pool);
1134 
1135 	sk_refcnt_debug_dec(sk);
1136 }
1137 
1138 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1139 		      int kern)
1140 {
1141 	struct xdp_sock *xs;
1142 	struct sock *sk;
1143 
1144 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1145 		return -EPERM;
1146 	if (sock->type != SOCK_RAW)
1147 		return -ESOCKTNOSUPPORT;
1148 
1149 	if (protocol)
1150 		return -EPROTONOSUPPORT;
1151 
1152 	sock->state = SS_UNCONNECTED;
1153 
1154 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1155 	if (!sk)
1156 		return -ENOBUFS;
1157 
1158 	sock->ops = &xsk_proto_ops;
1159 
1160 	sock_init_data(sock, sk);
1161 
1162 	sk->sk_family = PF_XDP;
1163 
1164 	sk->sk_destruct = xsk_destruct;
1165 	sk_refcnt_debug_inc(sk);
1166 
1167 	sock_set_flag(sk, SOCK_RCU_FREE);
1168 
1169 	xs = xdp_sk(sk);
1170 	xs->state = XSK_READY;
1171 	mutex_init(&xs->mutex);
1172 	spin_lock_init(&xs->rx_lock);
1173 	spin_lock_init(&xs->tx_completion_lock);
1174 
1175 	INIT_LIST_HEAD(&xs->map_list);
1176 	spin_lock_init(&xs->map_list_lock);
1177 
1178 	mutex_lock(&net->xdp.lock);
1179 	sk_add_node_rcu(sk, &net->xdp.list);
1180 	mutex_unlock(&net->xdp.lock);
1181 
1182 	local_bh_disable();
1183 	sock_prot_inuse_add(net, &xsk_proto, 1);
1184 	local_bh_enable();
1185 
1186 	return 0;
1187 }
1188 
1189 static const struct net_proto_family xsk_family_ops = {
1190 	.family = PF_XDP,
1191 	.create = xsk_create,
1192 	.owner	= THIS_MODULE,
1193 };
1194 
1195 static struct notifier_block xsk_netdev_notifier = {
1196 	.notifier_call	= xsk_notifier,
1197 };
1198 
1199 static int __net_init xsk_net_init(struct net *net)
1200 {
1201 	mutex_init(&net->xdp.lock);
1202 	INIT_HLIST_HEAD(&net->xdp.list);
1203 	return 0;
1204 }
1205 
1206 static void __net_exit xsk_net_exit(struct net *net)
1207 {
1208 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1209 }
1210 
1211 static struct pernet_operations xsk_net_ops = {
1212 	.init = xsk_net_init,
1213 	.exit = xsk_net_exit,
1214 };
1215 
1216 static int __init xsk_init(void)
1217 {
1218 	int err, cpu;
1219 
1220 	err = proto_register(&xsk_proto, 0 /* no slab */);
1221 	if (err)
1222 		goto out;
1223 
1224 	err = sock_register(&xsk_family_ops);
1225 	if (err)
1226 		goto out_proto;
1227 
1228 	err = register_pernet_subsys(&xsk_net_ops);
1229 	if (err)
1230 		goto out_sk;
1231 
1232 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1233 	if (err)
1234 		goto out_pernet;
1235 
1236 	for_each_possible_cpu(cpu)
1237 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1238 	return 0;
1239 
1240 out_pernet:
1241 	unregister_pernet_subsys(&xsk_net_ops);
1242 out_sk:
1243 	sock_unregister(PF_XDP);
1244 out_proto:
1245 	proto_unregister(&xsk_proto);
1246 out:
1247 	return err;
1248 }
1249 
1250 fs_initcall(xsk_init);
1251