xref: /openbmc/linux/net/xdp/xsk.c (revision aeddf9a2731de8235b2b433533d06ee7dc73d233)
1  // SPDX-License-Identifier: GPL-2.0
2  /* XDP sockets
3   *
4   * AF_XDP sockets allow a channel between XDP programs and userspace
5   * applications.
6   * Copyright(c) 2018 Intel Corporation.
7   *
8   * Author(s): Björn Töpel <bjorn.topel@intel.com>
9   *	      Magnus Karlsson <magnus.karlsson@intel.com>
10   */
11  
12  #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13  
14  #include <linux/if_xdp.h>
15  #include <linux/init.h>
16  #include <linux/sched/mm.h>
17  #include <linux/sched/signal.h>
18  #include <linux/sched/task.h>
19  #include <linux/socket.h>
20  #include <linux/file.h>
21  #include <linux/uaccess.h>
22  #include <linux/net.h>
23  #include <linux/netdevice.h>
24  #include <linux/rculist.h>
25  #include <linux/vmalloc.h>
26  #include <net/xdp_sock_drv.h>
27  #include <net/busy_poll.h>
28  #include <net/netdev_rx_queue.h>
29  #include <net/xdp.h>
30  
31  #include "xsk_queue.h"
32  #include "xdp_umem.h"
33  #include "xsk.h"
34  
35  #define TX_BATCH_SIZE 32
36  
37  static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
38  
39  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
40  {
41  	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
42  		return;
43  
44  	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
45  	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
46  }
47  EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
48  
49  void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
50  {
51  	struct xdp_sock *xs;
52  
53  	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
54  		return;
55  
56  	rcu_read_lock();
57  	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
58  		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
59  	}
60  	rcu_read_unlock();
61  
62  	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
63  }
64  EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
65  
66  void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
67  {
68  	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
69  		return;
70  
71  	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
72  	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
73  }
74  EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
75  
76  void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
77  {
78  	struct xdp_sock *xs;
79  
80  	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
81  		return;
82  
83  	rcu_read_lock();
84  	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
85  		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
86  	}
87  	rcu_read_unlock();
88  
89  	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
90  }
91  EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
92  
93  bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
94  {
95  	return pool->uses_need_wakeup;
96  }
97  EXPORT_SYMBOL(xsk_uses_need_wakeup);
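/*
 * The need_wakeup flags above implement an optional handshake with user
 * space: when the kernel sets XDP_RING_NEED_WAKEUP on a ring, the
 * application must kick the kernel explicitly instead of the driver
 * busy-checking the rings. A minimal user-space sketch (names such as
 * xsk_fd and fill_ring_flags are illustrative, not part of the kernel API):
 *
 *	if (*fill_ring_flags & XDP_RING_NEED_WAKEUP)
 *		recvfrom(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, NULL);
 *
 * and, for Tx, checking the Tx ring's flags word and issuing
 * sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) instead.
 */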
98  
99  struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
100  					    u16 queue_id)
101  {
102  	if (queue_id < dev->real_num_rx_queues)
103  		return dev->_rx[queue_id].pool;
104  	if (queue_id < dev->real_num_tx_queues)
105  		return dev->_tx[queue_id].pool;
106  
107  	return NULL;
108  }
109  EXPORT_SYMBOL(xsk_get_pool_from_qid);
110  
111  void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
112  {
113  	if (queue_id < dev->num_rx_queues)
114  		dev->_rx[queue_id].pool = NULL;
115  	if (queue_id < dev->num_tx_queues)
116  		dev->_tx[queue_id].pool = NULL;
117  }
118  
119  /* The buffer pool is stored in both the _rx struct and the _tx struct as we do
120   * not know whether the device has more Tx queues than Rx queues or vice versa.
121   * This might also change at run time.
122   */
123  int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
124  			u16 queue_id)
125  {
126  	if (queue_id >= max_t(unsigned int,
127  			      dev->real_num_rx_queues,
128  			      dev->real_num_tx_queues))
129  		return -EINVAL;
130  
131  	if (queue_id < dev->real_num_rx_queues)
132  		dev->_rx[queue_id].pool = pool;
133  	if (queue_id < dev->real_num_tx_queues)
134  		dev->_tx[queue_id].pool = pool;
135  
136  	return 0;
137  }
138  
139  static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
140  			u32 flags)
141  {
142  	u64 addr;
143  	int err;
144  
145  	addr = xp_get_handle(xskb);
146  	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
147  	if (err) {
148  		xs->rx_queue_full++;
149  		return err;
150  	}
151  
152  	xp_release(xskb);
153  	return 0;
154  }
155  
156  static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
157  {
158  	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
159  	u32 frags = xdp_buff_has_frags(xdp);
160  	struct xdp_buff_xsk *pos, *tmp;
161  	struct list_head *xskb_list;
162  	u32 contd = 0;
163  	int err;
164  
165  	if (frags)
166  		contd = XDP_PKT_CONTD;
167  
168  	err = __xsk_rcv_zc(xs, xskb, len, contd);
169  	if (err)
170  		goto err;
171  	if (likely(!frags))
172  		return 0;
173  
174  	xskb_list = &xskb->pool->xskb_list;
175  	list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
176  		if (list_is_singular(xskb_list))
177  			contd = 0;
178  		len = pos->xdp.data_end - pos->xdp.data;
179  		err = __xsk_rcv_zc(xs, pos, len, contd);
180  		if (err)
181  			goto err;
182  		list_del(&pos->xskb_list_node);
183  	}
184  
185  	return 0;
186  err:
187  	xsk_buff_free(xdp);
188  	return err;
189  }
190  
191  static void *xsk_copy_xdp_start(struct xdp_buff *from)
192  {
193  	if (unlikely(xdp_data_meta_unsupported(from)))
194  		return from->data;
195  	else
196  		return from->data_meta;
197  }
198  
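/* Copy up to @to_len bytes into @to, pulling data first from *@from and then
 * from the skb frags that follow once the current source chunk is exhausted.
 * @rem is the total number of bytes still left in the packet, so the copy
 * also stops once the whole packet has been consumed. Returns the number of
 * bytes copied; *@from, *@from_len and *@frag are advanced so the caller can
 * continue with the next destination buffer.
 */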
199  static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
200  			u32 *from_len, skb_frag_t **frag, u32 rem)
201  {
202  	u32 copied = 0;
203  
204  	while (1) {
205  		u32 copy_len = min_t(u32, *from_len, to_len);
206  
207  		memcpy(to, *from, copy_len);
208  		copied += copy_len;
209  		if (rem == copied)
210  			return copied;
211  
212  		if (*from_len == copy_len) {
213  			*from = skb_frag_address(*frag);
214  			*from_len = skb_frag_size((*frag)++);
215  		} else {
216  			*from += copy_len;
217  			*from_len -= copy_len;
218  		}
219  		if (to_len == copy_len)
220  			return copied;
221  
222  		to_len -= copy_len;
223  		to += copy_len;
224  	}
225  }
226  
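/* Receive path for copy mode: the packet (including any metadata in front of
 * xdp->data) is copied from the driver's xdp_buff into one or more buffers
 * allocated from the socket's pool. Packets larger than one Rx frame are
 * split over several descriptors chained together with XDP_PKT_CONTD.
 */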
227  static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
228  {
229  	u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
230  	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
231  	u32 from_len, meta_len, rem, num_desc;
232  	struct xdp_buff_xsk *xskb;
233  	struct xdp_buff *xsk_xdp;
234  	skb_frag_t *frag;
235  
236  	from_len = xdp->data_end - copy_from;
237  	meta_len = xdp->data - copy_from;
238  	rem = len + meta_len;
239  
240  	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
241  		int err;
242  
243  		xsk_xdp = xsk_buff_alloc(xs->pool);
244  		if (!xsk_xdp) {
245  			xs->rx_dropped++;
246  			return -ENOMEM;
247  		}
248  		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
249  		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
250  		err = __xsk_rcv_zc(xs, xskb, len, 0);
251  		if (err) {
252  			xsk_buff_free(xsk_xdp);
253  			return err;
254  		}
255  
256  		return 0;
257  	}
258  
259  	num_desc = (len - 1) / frame_size + 1;
260  
261  	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
262  		xs->rx_dropped++;
263  		return -ENOMEM;
264  	}
265  	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
266  		xs->rx_queue_full++;
267  		return -ENOBUFS;
268  	}
269  
270  	if (xdp_buff_has_frags(xdp)) {
271  		struct skb_shared_info *sinfo;
272  
273  		sinfo = xdp_get_shared_info_from_buff(xdp);
274  		frag = &sinfo->frags[0];
275  	}
276  
277  	do {
278  		u32 to_len = frame_size + meta_len;
279  		u32 copied;
280  
281  		xsk_xdp = xsk_buff_alloc(xs->pool);
282  		copy_to = xsk_xdp->data - meta_len;
283  
284  		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
285  		rem -= copied;
286  
287  		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
288  		__xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
289  		meta_len = 0;
290  	} while (rem);
291  
292  	return 0;
293  }
294  
295  static bool xsk_tx_writeable(struct xdp_sock *xs)
296  {
297  	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
298  		return false;
299  
300  	return true;
301  }
302  
303  static bool xsk_is_bound(struct xdp_sock *xs)
304  {
305  	if (READ_ONCE(xs->state) == XSK_BOUND) {
306  		/* Matches smp_wmb() in bind(). */
307  		smp_rmb();
308  		return true;
309  	}
310  	return false;
311  }
312  
313  static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
314  {
315  	if (!xsk_is_bound(xs))
316  		return -ENXIO;
317  
318  	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
319  		return -EINVAL;
320  
321  	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
322  		xs->rx_dropped++;
323  		return -ENOSPC;
324  	}
325  
326  	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
327  	return 0;
328  }
329  
330  static void xsk_flush(struct xdp_sock *xs)
331  {
332  	xskq_prod_submit(xs->rx);
333  	__xskq_cons_release(xs->pool->fq);
334  	sock_def_readable(&xs->sk);
335  }
336  
337  int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
338  {
339  	u32 len = xdp_get_buff_len(xdp);
340  	int err;
341  
342  	spin_lock_bh(&xs->rx_lock);
343  	err = xsk_rcv_check(xs, xdp, len);
344  	if (!err) {
345  		err = __xsk_rcv(xs, xdp, len);
346  		xsk_flush(xs);
347  	}
348  	spin_unlock_bh(&xs->rx_lock);
349  	return err;
350  }
351  
352  static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
353  {
354  	u32 len = xdp_get_buff_len(xdp);
355  	int err;
356  
357  	err = xsk_rcv_check(xs, xdp, len);
358  	if (err)
359  		return err;
360  
361  	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
362  		len = xdp->data_end - xdp->data;
363  		return xsk_rcv_zc(xs, xdp, len);
364  	}
365  
366  	err = __xsk_rcv(xs, xdp, len);
367  	if (!err)
368  		xdp_return_buff(xdp);
369  	return err;
370  }
371  
372  int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
373  {
374  	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
375  	int err;
376  
377  	err = xsk_rcv(xs, xdp);
378  	if (err)
379  		return err;
380  
381  	if (!xs->flush_node.prev)
382  		list_add(&xs->flush_node, flush_list);
383  
384  	return 0;
385  }
386  
387  void __xsk_map_flush(void)
388  {
389  	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
390  	struct xdp_sock *xs, *tmp;
391  
392  	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
393  		xsk_flush(xs);
394  		__list_del_clearprev(&xs->flush_node);
395  	}
396  }
397  
398  void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
399  {
400  	xskq_prod_submit_n(pool->cq, nb_entries);
401  }
402  EXPORT_SYMBOL(xsk_tx_completed);
403  
404  void xsk_tx_release(struct xsk_buff_pool *pool)
405  {
406  	struct xdp_sock *xs;
407  
408  	rcu_read_lock();
409  	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
410  		__xskq_cons_release(xs->tx);
411  		if (xsk_tx_writeable(xs))
412  			xs->sk.sk_write_space(&xs->sk);
413  	}
414  	rcu_read_unlock();
415  }
416  EXPORT_SYMBOL(xsk_tx_release);
417  
418  bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
419  {
420  	struct xdp_sock *xs;
421  
422  	rcu_read_lock();
423  	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
424  		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
425  			if (xskq_has_descs(xs->tx))
426  				xskq_cons_release(xs->tx);
427  			continue;
428  		}
429  
430  		/* This is the backpressure mechanism for the Tx path.
431  		 * Reserve space in the completion queue and only proceed
432  		 * if there is space in it. This avoids having to implement
433  		 * any buffering in the Tx path.
434  		 */
435  		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
436  			goto out;
437  
438  		xskq_cons_release(xs->tx);
439  		rcu_read_unlock();
440  		return true;
441  	}
442  
443  out:
444  	rcu_read_unlock();
445  	return false;
446  }
447  EXPORT_SYMBOL(xsk_tx_peek_desc);
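/*
 * A sketch of how a zero-copy driver is expected to consume Tx descriptors
 * with the helper above (not tied to any particular driver;
 * hw_ring_has_room() and hw_ring_put() stand in for hardware-specific
 * details):
 *
 *	struct xdp_desc desc;
 *
 *	while (hw_ring_has_room() && xsk_tx_peek_desc(pool, &desc)) {
 *		dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc.addr);
 *
 *		hw_ring_put(dma, desc.len);
 *	}
 *	xsk_tx_release(pool);
 *
 * Once the hardware reports completions, the driver hands the buffers back
 * to user space with xsk_tx_completed(pool, nb_completed).
 */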
448  
449  static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
450  {
451  	struct xdp_desc *descs = pool->tx_descs;
452  	u32 nb_pkts = 0;
453  
454  	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
455  		nb_pkts++;
456  
457  	xsk_tx_release(pool);
458  	return nb_pkts;
459  }
460  
461  u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
462  {
463  	struct xdp_sock *xs;
464  
465  	rcu_read_lock();
466  	if (!list_is_singular(&pool->xsk_tx_list)) {
467  		/* Fallback to the non-batched version */
468  		rcu_read_unlock();
469  		return xsk_tx_peek_release_fallback(pool, nb_pkts);
470  	}
471  
472  	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
473  	if (!xs) {
474  		nb_pkts = 0;
475  		goto out;
476  	}
477  
478  	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
479  
480  	/* This is the backpressure mechanism for the Tx path. Try to
481  	 * reserve space in the completion queue for all packets, but
482  	 * if there are fewer slots available, just process that many
483  	 * packets. This avoids having to implement any buffering in
484  	 * the Tx path.
485  	 */
486  	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
487  	if (!nb_pkts)
488  		goto out;
489  
490  	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
491  	if (!nb_pkts) {
492  		xs->tx->queue_empty_descs++;
493  		goto out;
494  	}
495  
496  	__xskq_cons_release(xs->tx);
497  	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
498  	xs->sk.sk_write_space(&xs->sk);
499  
500  out:
501  	rcu_read_unlock();
502  	return nb_pkts;
503  }
504  EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
505  
506  static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
507  {
508  	struct net_device *dev = xs->dev;
509  
510  	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
511  }
512  
513  static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
514  {
515  	unsigned long flags;
516  	int ret;
517  
518  	spin_lock_irqsave(&xs->pool->cq_lock, flags);
519  	ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
520  	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
521  
522  	return ret;
523  }
524  
525  static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
526  {
527  	unsigned long flags;
528  
529  	spin_lock_irqsave(&xs->pool->cq_lock, flags);
530  	xskq_prod_submit_n(xs->pool->cq, n);
531  	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
532  }
533  
534  static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
535  {
536  	unsigned long flags;
537  
538  	spin_lock_irqsave(&xs->pool->cq_lock, flags);
539  	xskq_prod_cancel_n(xs->pool->cq, n);
540  	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
541  }
542  
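/* The number of Tx descriptors backing an skb is stashed in
 * skb_shinfo(skb)->destructor_arg, so that the destructor knows how many
 * completion queue entries to submit (or cancel on error).
 */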
543  static u32 xsk_get_num_desc(struct sk_buff *skb)
544  {
545  	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
546  }
547  
548  static void xsk_destruct_skb(struct sk_buff *skb)
549  {
550  	xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
551  	sock_wfree(skb);
552  }
553  
554  static void xsk_set_destructor_arg(struct sk_buff *skb)
555  {
556  	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
557  
558  	skb_shinfo(skb)->destructor_arg = (void *)num;
559  }
560  
561  static void xsk_consume_skb(struct sk_buff *skb)
562  {
563  	struct xdp_sock *xs = xdp_sk(skb->sk);
564  
565  	skb->destructor = sock_wfree;
566  	xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
567  	/* Free skb without triggering the perf drop trace */
568  	consume_skb(skb);
569  	xs->skb = NULL;
570  }
571  
572  static void xsk_drop_skb(struct sk_buff *skb)
573  {
574  	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
575  	xsk_consume_skb(skb);
576  }
577  
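/* For devices with IFF_TX_SKB_NO_LINEAR, build an skb whose page frags point
 * straight at the umem pages, so the payload does not have to be copied out
 * of the umem.
 */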
578  static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
579  					      struct xdp_desc *desc)
580  {
581  	struct xsk_buff_pool *pool = xs->pool;
582  	u32 hr, len, ts, offset, copy, copied;
583  	struct sk_buff *skb = xs->skb;
584  	struct page *page;
585  	void *buffer;
586  	int err, i;
587  	u64 addr;
588  
589  	if (!skb) {
590  		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
591  
592  		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
593  		if (unlikely(!skb))
594  			return ERR_PTR(err);
595  
596  		skb_reserve(skb, hr);
597  	}
598  
599  	addr = desc->addr;
600  	len = desc->len;
601  	ts = pool->unaligned ? len : pool->chunk_size;
602  
603  	buffer = xsk_buff_raw_get_data(pool, addr);
604  	offset = offset_in_page(buffer);
605  	addr = buffer - pool->addrs;
606  
607  	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
608  		if (unlikely(i >= MAX_SKB_FRAGS))
609  			return ERR_PTR(-EOVERFLOW);
610  
611  		page = pool->umem->pgs[addr >> PAGE_SHIFT];
612  		get_page(page);
613  
614  		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
615  		skb_fill_page_desc(skb, i, page, offset, copy);
616  
617  		copied += copy;
618  		addr += copy;
619  		offset = 0;
620  	}
621  
622  	skb->len += len;
623  	skb->data_len += len;
624  	skb->truesize += ts;
625  
626  	refcount_add(ts, &xs->sk.sk_wmem_alloc);
627  
628  	return skb;
629  }
630  
631  static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
632  				     struct xdp_desc *desc)
633  {
634  	struct net_device *dev = xs->dev;
635  	struct sk_buff *skb = xs->skb;
636  	int err;
637  
638  	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
639  		skb = xsk_build_skb_zerocopy(xs, desc);
640  		if (IS_ERR(skb)) {
641  			err = PTR_ERR(skb);
642  			goto free_err;
643  		}
644  	} else {
645  		u32 hr, tr, len;
646  		void *buffer;
647  
648  		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
649  		len = desc->len;
650  
651  		if (!skb) {
652  			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
653  			tr = dev->needed_tailroom;
654  			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
655  			if (unlikely(!skb))
656  				goto free_err;
657  
658  			skb_reserve(skb, hr);
659  			skb_put(skb, len);
660  
661  			err = skb_store_bits(skb, 0, buffer, len);
662  			if (unlikely(err)) {
663  				kfree_skb(skb);
664  				goto free_err;
665  			}
666  		} else {
667  			int nr_frags = skb_shinfo(skb)->nr_frags;
668  			struct page *page;
669  			u8 *vaddr;
670  
671  			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
672  				err = -EOVERFLOW;
673  				goto free_err;
674  			}
675  
676  			page = alloc_page(xs->sk.sk_allocation);
677  			if (unlikely(!page)) {
678  				err = -EAGAIN;
679  				goto free_err;
680  			}
681  
682  			vaddr = kmap_local_page(page);
683  			memcpy(vaddr, buffer, len);
684  			kunmap_local(vaddr);
685  
686  			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
687  			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
688  		}
689  	}
690  
691  	skb->dev = dev;
692  	skb->priority = xs->sk.sk_priority;
693  	skb->mark = READ_ONCE(xs->sk.sk_mark);
694  	skb->destructor = xsk_destruct_skb;
695  	xsk_set_destructor_arg(skb);
696  
697  	return skb;
698  
699  free_err:
700  	if (err == -EOVERFLOW) {
701  		/* Drop the packet */
702  		xsk_set_destructor_arg(xs->skb);
703  		xsk_drop_skb(xs->skb);
704  		xskq_cons_release(xs->tx);
705  	} else {
706  		/* Let application retry */
707  		xsk_cq_cancel_locked(xs, 1);
708  	}
709  
710  	return ERR_PTR(err);
711  }
712  
713  static int __xsk_generic_xmit(struct sock *sk)
714  {
715  	struct xdp_sock *xs = xdp_sk(sk);
716  	u32 max_batch = TX_BATCH_SIZE;
717  	bool sent_frame = false;
718  	struct xdp_desc desc;
719  	struct sk_buff *skb;
720  	int err = 0;
721  
722  	mutex_lock(&xs->mutex);
723  
724  	/* Since we dropped the RCU read lock, the socket state might have changed. */
725  	if (unlikely(!xsk_is_bound(xs))) {
726  		err = -ENXIO;
727  		goto out;
728  	}
729  
730  	if (xs->queue_id >= xs->dev->real_num_tx_queues)
731  		goto out;
732  
733  	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
734  		if (max_batch-- == 0) {
735  			err = -EAGAIN;
736  			goto out;
737  		}
738  
739  		/* This is the backpressure mechanism for the Tx path.
740  		 * Reserve space in the completion queue and only proceed
741  		 * if there is space in it. This avoids having to implement
742  		 * any buffering in the Tx path.
743  		 */
744  		if (xsk_cq_reserve_addr_locked(xs, desc.addr))
745  			goto out;
746  
747  		skb = xsk_build_skb(xs, &desc);
748  		if (IS_ERR(skb)) {
749  			err = PTR_ERR(skb);
750  			if (err != -EOVERFLOW)
751  				goto out;
752  			err = 0;
753  			continue;
754  		}
755  
756  		xskq_cons_release(xs->tx);
757  
758  		if (xp_mb_desc(&desc)) {
759  			xs->skb = skb;
760  			continue;
761  		}
762  
763  		err = __dev_direct_xmit(skb, xs->queue_id);
764  		if (err == NETDEV_TX_BUSY) {
765  			/* Tell user-space to retry the send */
766  			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
767  			xsk_consume_skb(skb);
768  			err = -EAGAIN;
769  			goto out;
770  		}
771  
772  		/* Ignore NET_XMIT_CN as packet might have been sent */
773  		if (err == NET_XMIT_DROP) {
774  			/* SKB completed but not sent */
775  			err = -EBUSY;
776  			xs->skb = NULL;
777  			goto out;
778  		}
779  
780  		sent_frame = true;
781  		xs->skb = NULL;
782  	}
783  
784  	if (xskq_has_descs(xs->tx)) {
785  		if (xs->skb)
786  			xsk_drop_skb(xs->skb);
787  		xskq_cons_release(xs->tx);
788  	}
789  
790  out:
791  	if (sent_frame)
792  		if (xsk_tx_writeable(xs))
793  			sk->sk_write_space(sk);
794  
795  	mutex_unlock(&xs->mutex);
796  	return err;
797  }
798  
799  static int xsk_generic_xmit(struct sock *sk)
800  {
801  	int ret;
802  
803  	/* Drop the RCU lock since the SKB path might sleep. */
804  	rcu_read_unlock();
805  	ret = __xsk_generic_xmit(sk);
806  	/* Reacquire RCU lock before going into common code. */
807  	rcu_read_lock();
808  
809  	return ret;
810  }
811  
812  static bool xsk_no_wakeup(struct sock *sk)
813  {
814  #ifdef CONFIG_NET_RX_BUSY_POLL
815  	/* Prefer busy-polling, skip the wakeup. */
816  	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
817  		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
818  #else
819  	return false;
820  #endif
821  }
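/*
 * User space opts in to the busy-poll short cut above with the regular
 * socket options, e.g. (xsk_fd and the values shown are illustrative):
 *
 *	int one = 1, usecs = 20;
 *
 *	setsockopt(xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &one, sizeof(one));
 *	setsockopt(xsk_fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
 */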
822  
823  static int xsk_check_common(struct xdp_sock *xs)
824  {
825  	if (unlikely(!xsk_is_bound(xs)))
826  		return -ENXIO;
827  	if (unlikely(!(xs->dev->flags & IFF_UP)))
828  		return -ENETDOWN;
829  
830  	return 0;
831  }
832  
833  static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
834  {
835  	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
836  	struct sock *sk = sock->sk;
837  	struct xdp_sock *xs = xdp_sk(sk);
838  	struct xsk_buff_pool *pool;
839  	int err;
840  
841  	err = xsk_check_common(xs);
842  	if (err)
843  		return err;
844  	if (unlikely(need_wait))
845  		return -EOPNOTSUPP;
846  	if (unlikely(!xs->tx))
847  		return -ENOBUFS;
848  
849  	if (sk_can_busy_loop(sk)) {
850  		if (xs->zc)
851  			__sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
852  		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
853  	}
854  
855  	if (xs->zc && xsk_no_wakeup(sk))
856  		return 0;
857  
858  	pool = xs->pool;
859  	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
860  		if (xs->zc)
861  			return xsk_wakeup(xs, XDP_WAKEUP_TX);
862  		return xsk_generic_xmit(sk);
863  	}
864  	return 0;
865  }
866  
867  static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
868  {
869  	int ret;
870  
871  	rcu_read_lock();
872  	ret = __xsk_sendmsg(sock, m, total_len);
873  	rcu_read_unlock();
874  
875  	return ret;
876  }
877  
878  static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
879  {
880  	bool need_wait = !(flags & MSG_DONTWAIT);
881  	struct sock *sk = sock->sk;
882  	struct xdp_sock *xs = xdp_sk(sk);
883  	int err;
884  
885  	err = xsk_check_common(xs);
886  	if (err)
887  		return err;
888  	if (unlikely(!xs->rx))
889  		return -ENOBUFS;
890  	if (unlikely(need_wait))
891  		return -EOPNOTSUPP;
892  
893  	if (sk_can_busy_loop(sk))
894  		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
895  
896  	if (xsk_no_wakeup(sk))
897  		return 0;
898  
899  	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
900  		return xsk_wakeup(xs, XDP_WAKEUP_RX);
901  	return 0;
902  }
903  
904  static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
905  {
906  	int ret;
907  
908  	rcu_read_lock();
909  	ret = __xsk_recvmsg(sock, m, len, flags);
910  	rcu_read_unlock();
911  
912  	return ret;
913  }
914  
915  static __poll_t xsk_poll(struct file *file, struct socket *sock,
916  			     struct poll_table_struct *wait)
917  {
918  	__poll_t mask = 0;
919  	struct sock *sk = sock->sk;
920  	struct xdp_sock *xs = xdp_sk(sk);
921  	struct xsk_buff_pool *pool;
922  
923  	sock_poll_wait(file, sock, wait);
924  
925  	rcu_read_lock();
926  	if (xsk_check_common(xs))
927  		goto out;
928  
929  	pool = xs->pool;
930  
931  	if (pool->cached_need_wakeup) {
932  		if (xs->zc)
933  			xsk_wakeup(xs, pool->cached_need_wakeup);
934  		else if (xs->tx)
935  			/* Poll also needs to drive Tx in copy mode */
936  			xsk_generic_xmit(sk);
937  	}
938  
939  	if (xs->rx && !xskq_prod_is_empty(xs->rx))
940  		mask |= EPOLLIN | EPOLLRDNORM;
941  	if (xs->tx && xsk_tx_writeable(xs))
942  		mask |= EPOLLOUT | EPOLLWRNORM;
943  out:
944  	rcu_read_unlock();
945  	return mask;
946  }
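/*
 * Waiting for traffic from user space is plain poll(2) on the socket, e.g.
 * (xsk_fd and timeout_ms are placeholders):
 *
 *	struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN | POLLOUT };
 *
 *	poll(&pfd, 1, timeout_ms);
 *
 * EPOLLIN means the Rx ring has entries to consume, EPOLLOUT that the Tx
 * ring has room to produce into.
 */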
947  
948  static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
949  			  bool umem_queue)
950  {
951  	struct xsk_queue *q;
952  
953  	if (entries == 0 || *queue || !is_power_of_2(entries))
954  		return -EINVAL;
955  
956  	q = xskq_create(entries, umem_queue);
957  	if (!q)
958  		return -ENOMEM;
959  
960  	/* Make sure queue is ready before it can be seen by others */
961  	smp_wmb();
962  	WRITE_ONCE(*queue, q);
963  	return 0;
964  }
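/*
 * Ring sizes are configured from user space before bind() and must be powers
 * of two, as checked above. A sketch (xsk_fd is a placeholder):
 *
 *	int entries = 2048;
 *
 *	setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 */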
965  
966  static void xsk_unbind_dev(struct xdp_sock *xs)
967  {
968  	struct net_device *dev = xs->dev;
969  
970  	if (xs->state != XSK_BOUND)
971  		return;
972  	WRITE_ONCE(xs->state, XSK_UNBOUND);
973  
974  	/* Wait for driver to stop using the xdp socket. */
975  	xp_del_xsk(xs->pool, xs);
976  	synchronize_net();
977  	dev_put(dev);
978  }
979  
980  static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
981  					      struct xdp_sock __rcu ***map_entry)
982  {
983  	struct xsk_map *map = NULL;
984  	struct xsk_map_node *node;
985  
986  	*map_entry = NULL;
987  
988  	spin_lock_bh(&xs->map_list_lock);
989  	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
990  					node);
991  	if (node) {
992  		bpf_map_inc(&node->map->map);
993  		map = node->map;
994  		*map_entry = node->map_entry;
995  	}
996  	spin_unlock_bh(&xs->map_list_lock);
997  	return map;
998  }
999  
1000  static void xsk_delete_from_maps(struct xdp_sock *xs)
1001  {
1002  	/* This function removes the current XDP socket from all the
1003  	 * maps it resides in. We need to take extra care here, due to
1004  	 * the two locks involved. Each map has a lock synchronizing
1005  	 * updates to the entries, and each socket has a lock that
1006  	 * synchronizes access to the list of maps (map_list). For
1007  	 * deadlock avoidance the locks need to be taken in the order
1008  	 * "map lock"->"socket map list lock". We start off by
1009  	 * accessing the socket map list, and take a reference to the
1010  	 * map to guarantee existence between the
1011  	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
1012  	 * calls. Then we ask the map to remove the socket, which
1013  	 * tries to remove the socket from the map. Note that there
1014  	 * might be updates to the map between
1015  	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
1016  	 */
1017  	struct xdp_sock __rcu **map_entry = NULL;
1018  	struct xsk_map *map;
1019  
1020  	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
1021  		xsk_map_try_sock_delete(map, xs, map_entry);
1022  		bpf_map_put(&map->map);
1023  	}
1024  }
1025  
1026  static int xsk_release(struct socket *sock)
1027  {
1028  	struct sock *sk = sock->sk;
1029  	struct xdp_sock *xs = xdp_sk(sk);
1030  	struct net *net;
1031  
1032  	if (!sk)
1033  		return 0;
1034  
1035  	net = sock_net(sk);
1036  
1037  	if (xs->skb)
1038  		xsk_drop_skb(xs->skb);
1039  
1040  	mutex_lock(&net->xdp.lock);
1041  	sk_del_node_init_rcu(sk);
1042  	mutex_unlock(&net->xdp.lock);
1043  
1044  	sock_prot_inuse_add(net, sk->sk_prot, -1);
1045  
1046  	xsk_delete_from_maps(xs);
1047  	mutex_lock(&xs->mutex);
1048  	xsk_unbind_dev(xs);
1049  	mutex_unlock(&xs->mutex);
1050  
1051  	xskq_destroy(xs->rx);
1052  	xskq_destroy(xs->tx);
1053  	xskq_destroy(xs->fq_tmp);
1054  	xskq_destroy(xs->cq_tmp);
1055  
1056  	sock_orphan(sk);
1057  	sock->sk = NULL;
1058  
1059  	sock_put(sk);
1060  
1061  	return 0;
1062  }
1063  
1064  static struct socket *xsk_lookup_xsk_from_fd(int fd)
1065  {
1066  	struct socket *sock;
1067  	int err;
1068  
1069  	sock = sockfd_lookup(fd, &err);
1070  	if (!sock)
1071  		return ERR_PTR(-ENOTSOCK);
1072  
1073  	if (sock->sk->sk_family != PF_XDP) {
1074  		sockfd_put(sock);
1075  		return ERR_PTR(-ENOPROTOOPT);
1076  	}
1077  
1078  	return sock;
1079  }
1080  
1081  static bool xsk_validate_queues(struct xdp_sock *xs)
1082  {
1083  	return xs->fq_tmp && xs->cq_tmp;
1084  }
1085  
1086  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
1087  {
1088  	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
1089  	struct sock *sk = sock->sk;
1090  	struct xdp_sock *xs = xdp_sk(sk);
1091  	struct net_device *dev;
1092  	int bound_dev_if;
1093  	u32 flags, qid;
1094  	int err = 0;
1095  
1096  	if (addr_len < sizeof(struct sockaddr_xdp))
1097  		return -EINVAL;
1098  	if (sxdp->sxdp_family != AF_XDP)
1099  		return -EINVAL;
1100  
1101  	flags = sxdp->sxdp_flags;
1102  	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
1103  		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
1104  		return -EINVAL;
1105  
1106  	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
1107  	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
1108  		return -EINVAL;
1109  
1110  	rtnl_lock();
1111  	mutex_lock(&xs->mutex);
1112  	if (xs->state != XSK_READY) {
1113  		err = -EBUSY;
1114  		goto out_release;
1115  	}
1116  
1117  	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
1118  	if (!dev) {
1119  		err = -ENODEV;
1120  		goto out_release;
1121  	}
1122  
1123  	if (!xs->rx && !xs->tx) {
1124  		err = -EINVAL;
1125  		goto out_unlock;
1126  	}
1127  
1128  	qid = sxdp->sxdp_queue_id;
1129  
1130  	if (flags & XDP_SHARED_UMEM) {
1131  		struct xdp_sock *umem_xs;
1132  		struct socket *sock;
1133  
1134  		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
1135  		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
1136  			/* Cannot specify flags for shared sockets. */
1137  			err = -EINVAL;
1138  			goto out_unlock;
1139  		}
1140  
1141  		if (xs->umem) {
1142  			/* We already have our own umem. */
1143  			err = -EINVAL;
1144  			goto out_unlock;
1145  		}
1146  
1147  		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
1148  		if (IS_ERR(sock)) {
1149  			err = PTR_ERR(sock);
1150  			goto out_unlock;
1151  		}
1152  
1153  		umem_xs = xdp_sk(sock->sk);
1154  		if (!xsk_is_bound(umem_xs)) {
1155  			err = -EBADF;
1156  			sockfd_put(sock);
1157  			goto out_unlock;
1158  		}
1159  
1160  		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
1161  			/* Share the umem with another socket on another qid
1162  			 * and/or device.
1163  			 */
1164  			xs->pool = xp_create_and_assign_umem(xs,
1165  							     umem_xs->umem);
1166  			if (!xs->pool) {
1167  				err = -ENOMEM;
1168  				sockfd_put(sock);
1169  				goto out_unlock;
1170  			}
1171  
1172  			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
1173  						   qid);
1174  			if (err) {
1175  				xp_destroy(xs->pool);
1176  				xs->pool = NULL;
1177  				sockfd_put(sock);
1178  				goto out_unlock;
1179  			}
1180  		} else {
1181  			/* Share the buffer pool with the other socket. */
1182  			if (xs->fq_tmp || xs->cq_tmp) {
1183  				/* Do not allow setting your own fq or cq. */
1184  				err = -EINVAL;
1185  				sockfd_put(sock);
1186  				goto out_unlock;
1187  			}
1188  
1189  			xp_get_pool(umem_xs->pool);
1190  			xs->pool = umem_xs->pool;
1191  
1192  			/* If the underlying shared umem was created without a
1193  			 * Tx ring, allocate the Tx descriptor array that the
1194  			 * Tx batching API uses.
1195  			 */
1196  			if (xs->tx && !xs->pool->tx_descs) {
1197  				err = xp_alloc_tx_descs(xs->pool, xs);
1198  				if (err) {
1199  					xp_put_pool(xs->pool);
1200  					xs->pool = NULL;
1201  					sockfd_put(sock);
1202  					goto out_unlock;
1203  				}
1204  			}
1205  		}
1206  
1207  		xdp_get_umem(umem_xs->umem);
1208  		WRITE_ONCE(xs->umem, umem_xs->umem);
1209  		sockfd_put(sock);
1210  	} else if (!xs->umem || !xsk_validate_queues(xs)) {
1211  		err = -EINVAL;
1212  		goto out_unlock;
1213  	} else {
1214  		/* This xsk has its own umem. */
1215  		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
1216  		if (!xs->pool) {
1217  			err = -ENOMEM;
1218  			goto out_unlock;
1219  		}
1220  
1221  		err = xp_assign_dev(xs->pool, dev, qid, flags);
1222  		if (err) {
1223  			xp_destroy(xs->pool);
1224  			xs->pool = NULL;
1225  			goto out_unlock;
1226  		}
1227  	}
1228  
1229  	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
1230  	xs->fq_tmp = NULL;
1231  	xs->cq_tmp = NULL;
1232  
1233  	xs->dev = dev;
1234  	xs->zc = xs->umem->zc;
1235  	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
1236  	xs->queue_id = qid;
1237  	xp_add_xsk(xs->pool, xs);
1238  
1239  out_unlock:
1240  	if (err) {
1241  		dev_put(dev);
1242  	} else {
1243  		/* Matches smp_rmb() in bind() for shared umem
1244  		 * sockets, and xsk_is_bound().
1245  		 */
1246  		smp_wmb();
1247  		WRITE_ONCE(xs->state, XSK_BOUND);
1248  	}
1249  out_release:
1250  	mutex_unlock(&xs->mutex);
1251  	rtnl_unlock();
1252  	return err;
1253  }
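/*
 * A minimal user-space bind matching the checks above ("eth0", queue 0 and
 * xsk_fd are illustrative):
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = if_nametoindex("eth0"),
 *		.sxdp_queue_id = 0,
 *		.sxdp_flags = XDP_USE_NEED_WAKEUP,
 *	};
 *
 *	bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */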
1254  
1255  struct xdp_umem_reg_v1 {
1256  	__u64 addr; /* Start of packet data area */
1257  	__u64 len; /* Length of packet data area */
1258  	__u32 chunk_size;
1259  	__u32 headroom;
1260  };
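/*
 * User space registers the umem through the XDP_UMEM_REG case below; a
 * sketch (umem_area is assumed to be a page-aligned buffer of umem_len
 * bytes, xsk_fd a placeholder):
 *
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(uintptr_t)umem_area,
 *		.len = umem_len,
 *		.chunk_size = 4096,
 *		.headroom = 0,
 *	};
 *
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 */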
1261  
1262  static int xsk_setsockopt(struct socket *sock, int level, int optname,
1263  			  sockptr_t optval, unsigned int optlen)
1264  {
1265  	struct sock *sk = sock->sk;
1266  	struct xdp_sock *xs = xdp_sk(sk);
1267  	int err;
1268  
1269  	if (level != SOL_XDP)
1270  		return -ENOPROTOOPT;
1271  
1272  	switch (optname) {
1273  	case XDP_RX_RING:
1274  	case XDP_TX_RING:
1275  	{
1276  		struct xsk_queue **q;
1277  		int entries;
1278  
1279  		if (optlen < sizeof(entries))
1280  			return -EINVAL;
1281  		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1282  			return -EFAULT;
1283  
1284  		mutex_lock(&xs->mutex);
1285  		if (xs->state != XSK_READY) {
1286  			mutex_unlock(&xs->mutex);
1287  			return -EBUSY;
1288  		}
1289  		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1290  		err = xsk_init_queue(entries, q, false);
1291  		if (!err && optname == XDP_TX_RING)
1292  			/* Tx needs to be explicitly woken up the first time */
1293  			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1294  		mutex_unlock(&xs->mutex);
1295  		return err;
1296  	}
1297  	case XDP_UMEM_REG:
1298  	{
1299  		size_t mr_size = sizeof(struct xdp_umem_reg);
1300  		struct xdp_umem_reg mr = {};
1301  		struct xdp_umem *umem;
1302  
1303  		if (optlen < sizeof(struct xdp_umem_reg_v1))
1304  			return -EINVAL;
1305  		else if (optlen < sizeof(mr))
1306  			mr_size = sizeof(struct xdp_umem_reg_v1);
1307  
1308  		if (copy_from_sockptr(&mr, optval, mr_size))
1309  			return -EFAULT;
1310  
1311  		mutex_lock(&xs->mutex);
1312  		if (xs->state != XSK_READY || xs->umem) {
1313  			mutex_unlock(&xs->mutex);
1314  			return -EBUSY;
1315  		}
1316  
1317  		umem = xdp_umem_create(&mr);
1318  		if (IS_ERR(umem)) {
1319  			mutex_unlock(&xs->mutex);
1320  			return PTR_ERR(umem);
1321  		}
1322  
1323  		/* Make sure umem is ready before it can be seen by others */
1324  		smp_wmb();
1325  		WRITE_ONCE(xs->umem, umem);
1326  		mutex_unlock(&xs->mutex);
1327  		return 0;
1328  	}
1329  	case XDP_UMEM_FILL_RING:
1330  	case XDP_UMEM_COMPLETION_RING:
1331  	{
1332  		struct xsk_queue **q;
1333  		int entries;
1334  
1335  		if (optlen < sizeof(entries))
1336  			return -EINVAL;
1337  		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1338  			return -EFAULT;
1339  
1340  		mutex_lock(&xs->mutex);
1341  		if (xs->state != XSK_READY) {
1342  			mutex_unlock(&xs->mutex);
1343  			return -EBUSY;
1344  		}
1345  
1346  		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1347  			&xs->cq_tmp;
1348  		err = xsk_init_queue(entries, q, true);
1349  		mutex_unlock(&xs->mutex);
1350  		return err;
1351  	}
1352  	default:
1353  		break;
1354  	}
1355  
1356  	return -ENOPROTOOPT;
1357  }
1358  
1359  static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1360  {
1361  	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1362  	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1363  	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1364  }
1365  
1366  static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1367  {
1368  	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1369  	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1370  	ring->desc = offsetof(struct xdp_umem_ring, desc);
1371  }
1372  
1373  struct xdp_statistics_v1 {
1374  	__u64 rx_dropped;
1375  	__u64 rx_invalid_descs;
1376  	__u64 tx_invalid_descs;
1377  };
1378  
1379  static int xsk_getsockopt(struct socket *sock, int level, int optname,
1380  			  char __user *optval, int __user *optlen)
1381  {
1382  	struct sock *sk = sock->sk;
1383  	struct xdp_sock *xs = xdp_sk(sk);
1384  	int len;
1385  
1386  	if (level != SOL_XDP)
1387  		return -ENOPROTOOPT;
1388  
1389  	if (get_user(len, optlen))
1390  		return -EFAULT;
1391  	if (len < 0)
1392  		return -EINVAL;
1393  
1394  	switch (optname) {
1395  	case XDP_STATISTICS:
1396  	{
1397  		struct xdp_statistics stats = {};
1398  		bool extra_stats = true;
1399  		size_t stats_size;
1400  
1401  		if (len < sizeof(struct xdp_statistics_v1)) {
1402  			return -EINVAL;
1403  		} else if (len < sizeof(stats)) {
1404  			extra_stats = false;
1405  			stats_size = sizeof(struct xdp_statistics_v1);
1406  		} else {
1407  			stats_size = sizeof(stats);
1408  		}
1409  
1410  		mutex_lock(&xs->mutex);
1411  		stats.rx_dropped = xs->rx_dropped;
1412  		if (extra_stats) {
1413  			stats.rx_ring_full = xs->rx_queue_full;
1414  			stats.rx_fill_ring_empty_descs =
1415  				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1416  			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1417  		} else {
1418  			stats.rx_dropped += xs->rx_queue_full;
1419  		}
1420  		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1421  		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1422  		mutex_unlock(&xs->mutex);
1423  
1424  		if (copy_to_user(optval, &stats, stats_size))
1425  			return -EFAULT;
1426  		if (put_user(stats_size, optlen))
1427  			return -EFAULT;
1428  
1429  		return 0;
1430  	}
1431  	case XDP_MMAP_OFFSETS:
1432  	{
1433  		struct xdp_mmap_offsets off;
1434  		struct xdp_mmap_offsets_v1 off_v1;
1435  		bool flags_supported = true;
1436  		void *to_copy;
1437  
1438  		if (len < sizeof(off_v1))
1439  			return -EINVAL;
1440  		else if (len < sizeof(off))
1441  			flags_supported = false;
1442  
1443  		if (flags_supported) {
1444  			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1445  			 * except for the flags field added to the end.
1446  			 */
1447  			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1448  					       &off.rx);
1449  			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1450  					       &off.tx);
1451  			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1452  					       &off.fr);
1453  			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1454  					       &off.cr);
1455  			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1456  						ptrs.flags);
1457  			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1458  						ptrs.flags);
1459  			off.fr.flags = offsetof(struct xdp_umem_ring,
1460  						ptrs.flags);
1461  			off.cr.flags = offsetof(struct xdp_umem_ring,
1462  						ptrs.flags);
1463  
1464  			len = sizeof(off);
1465  			to_copy = &off;
1466  		} else {
1467  			xsk_enter_rxtx_offsets(&off_v1.rx);
1468  			xsk_enter_rxtx_offsets(&off_v1.tx);
1469  			xsk_enter_umem_offsets(&off_v1.fr);
1470  			xsk_enter_umem_offsets(&off_v1.cr);
1471  
1472  			len = sizeof(off_v1);
1473  			to_copy = &off_v1;
1474  		}
1475  
1476  		if (copy_to_user(optval, to_copy, len))
1477  			return -EFAULT;
1478  		if (put_user(len, optlen))
1479  			return -EFAULT;
1480  
1481  		return 0;
1482  	}
1483  	case XDP_OPTIONS:
1484  	{
1485  		struct xdp_options opts = {};
1486  
1487  		if (len < sizeof(opts))
1488  			return -EINVAL;
1489  
1490  		mutex_lock(&xs->mutex);
1491  		if (xs->zc)
1492  			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1493  		mutex_unlock(&xs->mutex);
1494  
1495  		len = sizeof(opts);
1496  		if (copy_to_user(optval, &opts, len))
1497  			return -EFAULT;
1498  		if (put_user(len, optlen))
1499  			return -EFAULT;
1500  
1501  		return 0;
1502  	}
1503  	default:
1504  		break;
1505  	}
1506  
1507  	return -EOPNOTSUPP;
1508  }
1509  
1510  static int xsk_mmap(struct file *file, struct socket *sock,
1511  		    struct vm_area_struct *vma)
1512  {
1513  	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1514  	unsigned long size = vma->vm_end - vma->vm_start;
1515  	struct xdp_sock *xs = xdp_sk(sock->sk);
1516  	int state = READ_ONCE(xs->state);
1517  	struct xsk_queue *q = NULL;
1518  
1519  	if (state != XSK_READY && state != XSK_BOUND)
1520  		return -EBUSY;
1521  
1522  	if (offset == XDP_PGOFF_RX_RING) {
1523  		q = READ_ONCE(xs->rx);
1524  	} else if (offset == XDP_PGOFF_TX_RING) {
1525  		q = READ_ONCE(xs->tx);
1526  	} else {
1527  		/* Matches the smp_wmb() in XDP_UMEM_REG */
1528  		smp_rmb();
1529  		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1530  			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
1531  						 READ_ONCE(xs->pool->fq);
1532  		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1533  			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
1534  						 READ_ONCE(xs->pool->cq);
1535  	}
1536  
1537  	if (!q)
1538  		return -EINVAL;
1539  
1540  	/* Matches the smp_wmb() in xsk_init_queue */
1541  	smp_rmb();
1542  	if (size > q->ring_vmalloc_size)
1543  		return -EINVAL;
1544  
1545  	return remap_vmalloc_range(vma, q->ring, 0);
1546  }
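/*
 * User space maps the rings by combining XDP_MMAP_OFFSETS with the page
 * offsets handled above; a sketch for the Rx ring (entries as configured
 * with XDP_RX_RING, xsk_fd a placeholder):
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	void *rx_map;
 *
 *	getsockopt(xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	rx_map = mmap(NULL, off.rx.desc + entries * sizeof(struct xdp_desc),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      xsk_fd, XDP_PGOFF_RX_RING);
 */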
1547  
1548  static int xsk_notifier(struct notifier_block *this,
1549  			unsigned long msg, void *ptr)
1550  {
1551  	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1552  	struct net *net = dev_net(dev);
1553  	struct sock *sk;
1554  
1555  	switch (msg) {
1556  	case NETDEV_UNREGISTER:
1557  		mutex_lock(&net->xdp.lock);
1558  		sk_for_each(sk, &net->xdp.list) {
1559  			struct xdp_sock *xs = xdp_sk(sk);
1560  
1561  			mutex_lock(&xs->mutex);
1562  			if (xs->dev == dev) {
1563  				sk->sk_err = ENETDOWN;
1564  				if (!sock_flag(sk, SOCK_DEAD))
1565  					sk_error_report(sk);
1566  
1567  				xsk_unbind_dev(xs);
1568  
1569  				/* Clear device references. */
1570  				xp_clear_dev(xs->pool);
1571  			}
1572  			mutex_unlock(&xs->mutex);
1573  		}
1574  		mutex_unlock(&net->xdp.lock);
1575  		break;
1576  	}
1577  	return NOTIFY_DONE;
1578  }
1579  
1580  static struct proto xsk_proto = {
1581  	.name =		"XDP",
1582  	.owner =	THIS_MODULE,
1583  	.obj_size =	sizeof(struct xdp_sock),
1584  };
1585  
1586  static const struct proto_ops xsk_proto_ops = {
1587  	.family		= PF_XDP,
1588  	.owner		= THIS_MODULE,
1589  	.release	= xsk_release,
1590  	.bind		= xsk_bind,
1591  	.connect	= sock_no_connect,
1592  	.socketpair	= sock_no_socketpair,
1593  	.accept		= sock_no_accept,
1594  	.getname	= sock_no_getname,
1595  	.poll		= xsk_poll,
1596  	.ioctl		= sock_no_ioctl,
1597  	.listen		= sock_no_listen,
1598  	.shutdown	= sock_no_shutdown,
1599  	.setsockopt	= xsk_setsockopt,
1600  	.getsockopt	= xsk_getsockopt,
1601  	.sendmsg	= xsk_sendmsg,
1602  	.recvmsg	= xsk_recvmsg,
1603  	.mmap		= xsk_mmap,
1604  };
1605  
1606  static void xsk_destruct(struct sock *sk)
1607  {
1608  	struct xdp_sock *xs = xdp_sk(sk);
1609  
1610  	if (!sock_flag(sk, SOCK_DEAD))
1611  		return;
1612  
1613  	if (!xp_put_pool(xs->pool))
1614  		xdp_put_umem(xs->umem, !xs->pool);
1615  }
1616  
1617  static int xsk_create(struct net *net, struct socket *sock, int protocol,
1618  		      int kern)
1619  {
1620  	struct xdp_sock *xs;
1621  	struct sock *sk;
1622  
1623  	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1624  		return -EPERM;
1625  	if (sock->type != SOCK_RAW)
1626  		return -ESOCKTNOSUPPORT;
1627  
1628  	if (protocol)
1629  		return -EPROTONOSUPPORT;
1630  
1631  	sock->state = SS_UNCONNECTED;
1632  
1633  	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1634  	if (!sk)
1635  		return -ENOBUFS;
1636  
1637  	sock->ops = &xsk_proto_ops;
1638  
1639  	sock_init_data(sock, sk);
1640  
1641  	sk->sk_family = PF_XDP;
1642  
1643  	sk->sk_destruct = xsk_destruct;
1644  
1645  	sock_set_flag(sk, SOCK_RCU_FREE);
1646  
1647  	xs = xdp_sk(sk);
1648  	xs->state = XSK_READY;
1649  	mutex_init(&xs->mutex);
1650  	spin_lock_init(&xs->rx_lock);
1651  
1652  	INIT_LIST_HEAD(&xs->map_list);
1653  	spin_lock_init(&xs->map_list_lock);
1654  
1655  	mutex_lock(&net->xdp.lock);
1656  	sk_add_node_rcu(sk, &net->xdp.list);
1657  	mutex_unlock(&net->xdp.lock);
1658  
1659  	sock_prot_inuse_add(net, &xsk_proto, 1);
1660  
1661  	return 0;
1662  }
1663  
1664  static const struct net_proto_family xsk_family_ops = {
1665  	.family = PF_XDP,
1666  	.create = xsk_create,
1667  	.owner	= THIS_MODULE,
1668  };
1669  
1670  static struct notifier_block xsk_netdev_notifier = {
1671  	.notifier_call	= xsk_notifier,
1672  };
1673  
1674  static int __net_init xsk_net_init(struct net *net)
1675  {
1676  	mutex_init(&net->xdp.lock);
1677  	INIT_HLIST_HEAD(&net->xdp.list);
1678  	return 0;
1679  }
1680  
1681  static void __net_exit xsk_net_exit(struct net *net)
1682  {
1683  	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1684  }
1685  
1686  static struct pernet_operations xsk_net_ops = {
1687  	.init = xsk_net_init,
1688  	.exit = xsk_net_exit,
1689  };
1690  
1691  static int __init xsk_init(void)
1692  {
1693  	int err, cpu;
1694  
1695  	err = proto_register(&xsk_proto, 0 /* no slab */);
1696  	if (err)
1697  		goto out;
1698  
1699  	err = sock_register(&xsk_family_ops);
1700  	if (err)
1701  		goto out_proto;
1702  
1703  	err = register_pernet_subsys(&xsk_net_ops);
1704  	if (err)
1705  		goto out_sk;
1706  
1707  	err = register_netdevice_notifier(&xsk_netdev_notifier);
1708  	if (err)
1709  		goto out_pernet;
1710  
1711  	for_each_possible_cpu(cpu)
1712  		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1713  	return 0;
1714  
1715  out_pernet:
1716  	unregister_pernet_subsys(&xsk_net_ops);
1717  out_sk:
1718  	sock_unregister(PF_XDP);
1719  out_proto:
1720  	proto_unregister(&xsk_proto);
1721  out:
1722  	return err;
1723  }
1724  
1725  fs_initcall(xsk_init);
1726