xref: /openbmc/qemu/net/af-xdp.c (revision cb039ef3)
1*cb039ef3SIlya Maximets /*
2*cb039ef3SIlya Maximets  * AF_XDP network backend.
3*cb039ef3SIlya Maximets  *
4*cb039ef3SIlya Maximets  * Copyright (c) 2023 Red Hat, Inc.
5*cb039ef3SIlya Maximets  *
6*cb039ef3SIlya Maximets  * Authors:
7*cb039ef3SIlya Maximets  *  Ilya Maximets <i.maximets@ovn.org>
8*cb039ef3SIlya Maximets  *
9*cb039ef3SIlya Maximets  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10*cb039ef3SIlya Maximets  * See the COPYING file in the top-level directory.
11*cb039ef3SIlya Maximets  */
12*cb039ef3SIlya Maximets 
13*cb039ef3SIlya Maximets 
14*cb039ef3SIlya Maximets #include "qemu/osdep.h"
15*cb039ef3SIlya Maximets #include <bpf/bpf.h>
16*cb039ef3SIlya Maximets #include <inttypes.h>
17*cb039ef3SIlya Maximets #include <linux/if_link.h>
18*cb039ef3SIlya Maximets #include <linux/if_xdp.h>
19*cb039ef3SIlya Maximets #include <net/if.h>
20*cb039ef3SIlya Maximets #include <xdp/xsk.h>
21*cb039ef3SIlya Maximets 
22*cb039ef3SIlya Maximets #include "clients.h"
23*cb039ef3SIlya Maximets #include "monitor/monitor.h"
24*cb039ef3SIlya Maximets #include "net/net.h"
25*cb039ef3SIlya Maximets #include "qapi/error.h"
26*cb039ef3SIlya Maximets #include "qemu/cutils.h"
27*cb039ef3SIlya Maximets #include "qemu/error-report.h"
28*cb039ef3SIlya Maximets #include "qemu/iov.h"
29*cb039ef3SIlya Maximets #include "qemu/main-loop.h"
30*cb039ef3SIlya Maximets #include "qemu/memalign.h"
31*cb039ef3SIlya Maximets 
32*cb039ef3SIlya Maximets 
33*cb039ef3SIlya Maximets typedef struct AFXDPState {
34*cb039ef3SIlya Maximets     NetClientState       nc;
35*cb039ef3SIlya Maximets 
36*cb039ef3SIlya Maximets     struct xsk_socket    *xsk;
37*cb039ef3SIlya Maximets     struct xsk_ring_cons rx;
38*cb039ef3SIlya Maximets     struct xsk_ring_prod tx;
39*cb039ef3SIlya Maximets     struct xsk_ring_cons cq;
40*cb039ef3SIlya Maximets     struct xsk_ring_prod fq;
41*cb039ef3SIlya Maximets 
42*cb039ef3SIlya Maximets     char                 ifname[IFNAMSIZ];
43*cb039ef3SIlya Maximets     int                  ifindex;
44*cb039ef3SIlya Maximets     bool                 read_poll;
45*cb039ef3SIlya Maximets     bool                 write_poll;
46*cb039ef3SIlya Maximets     uint32_t             outstanding_tx;
47*cb039ef3SIlya Maximets 
48*cb039ef3SIlya Maximets     uint64_t             *pool;
49*cb039ef3SIlya Maximets     uint32_t             n_pool;
50*cb039ef3SIlya Maximets     char                 *buffer;
51*cb039ef3SIlya Maximets     struct xsk_umem      *umem;
52*cb039ef3SIlya Maximets 
53*cb039ef3SIlya Maximets     uint32_t             n_queues;
54*cb039ef3SIlya Maximets     uint32_t             xdp_flags;
55*cb039ef3SIlya Maximets     bool                 inhibit;
56*cb039ef3SIlya Maximets } AFXDPState;
57*cb039ef3SIlya Maximets 
/*
 * Number of Rx descriptors peeked, and of fill-queue entries requested,
 * by a single af_xdp_send() invocation.
 */
#define AF_XDP_BATCH_SIZE 64

/* fd handlers; declared early because af_xdp_update_fd_handler() uses them. */
static void af_xdp_send(void *opaque);
static void af_xdp_writable(void *opaque);
62*cb039ef3SIlya Maximets 
63*cb039ef3SIlya Maximets /* Set the event-loop handlers for the af-xdp backend. */
af_xdp_update_fd_handler(AFXDPState * s)64*cb039ef3SIlya Maximets static void af_xdp_update_fd_handler(AFXDPState *s)
65*cb039ef3SIlya Maximets {
66*cb039ef3SIlya Maximets     qemu_set_fd_handler(xsk_socket__fd(s->xsk),
67*cb039ef3SIlya Maximets                         s->read_poll ? af_xdp_send : NULL,
68*cb039ef3SIlya Maximets                         s->write_poll ? af_xdp_writable : NULL,
69*cb039ef3SIlya Maximets                         s);
70*cb039ef3SIlya Maximets }
71*cb039ef3SIlya Maximets 
72*cb039ef3SIlya Maximets /* Update the read handler. */
af_xdp_read_poll(AFXDPState * s,bool enable)73*cb039ef3SIlya Maximets static void af_xdp_read_poll(AFXDPState *s, bool enable)
74*cb039ef3SIlya Maximets {
75*cb039ef3SIlya Maximets     if (s->read_poll != enable) {
76*cb039ef3SIlya Maximets         s->read_poll = enable;
77*cb039ef3SIlya Maximets         af_xdp_update_fd_handler(s);
78*cb039ef3SIlya Maximets     }
79*cb039ef3SIlya Maximets }
80*cb039ef3SIlya Maximets 
81*cb039ef3SIlya Maximets /* Update the write handler. */
af_xdp_write_poll(AFXDPState * s,bool enable)82*cb039ef3SIlya Maximets static void af_xdp_write_poll(AFXDPState *s, bool enable)
83*cb039ef3SIlya Maximets {
84*cb039ef3SIlya Maximets     if (s->write_poll != enable) {
85*cb039ef3SIlya Maximets         s->write_poll = enable;
86*cb039ef3SIlya Maximets         af_xdp_update_fd_handler(s);
87*cb039ef3SIlya Maximets     }
88*cb039ef3SIlya Maximets }
89*cb039ef3SIlya Maximets 
/*
 * NetClientInfo .poll callback: switch both the read and the write handler
 * on or off at once.
 */
static void af_xdp_poll(NetClientState *nc, bool enable)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    /* Nothing to do if both directions are already in the wanted state. */
    if (s->read_poll == enable && s->write_poll == enable) {
        return;
    }

    s->read_poll = enable;
    s->write_poll = enable;
    af_xdp_update_fd_handler(s);
}
100*cb039ef3SIlya Maximets 
/*
 * Drain the completion ring: every completed Tx frame address goes back
 * onto the free pool and is no longer counted as outstanding.
 */
static void af_xdp_complete_tx(AFXDPState *s)
{
    uint32_t cq_idx = 0;
    uint32_t n_done, k;

    n_done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS,
                                 &cq_idx);
    if (!n_done) {
        return;
    }

    for (k = 0; k < n_done; k++) {
        s->pool[s->n_pool++] = *xsk_ring_cons__comp_addr(&s->cq, cq_idx + k);
    }
    s->outstanding_tx -= n_done;

    xsk_ring_cons__release(&s->cq, n_done);
}
119*cb039ef3SIlya Maximets 
120*cb039ef3SIlya Maximets /*
121*cb039ef3SIlya Maximets  * The fd_write() callback, invoked if the fd is marked as writable
122*cb039ef3SIlya Maximets  * after a poll.
123*cb039ef3SIlya Maximets  */
af_xdp_writable(void * opaque)124*cb039ef3SIlya Maximets static void af_xdp_writable(void *opaque)
125*cb039ef3SIlya Maximets {
126*cb039ef3SIlya Maximets     AFXDPState *s = opaque;
127*cb039ef3SIlya Maximets 
128*cb039ef3SIlya Maximets     /* Try to recover buffers that are already sent. */
129*cb039ef3SIlya Maximets     af_xdp_complete_tx(s);
130*cb039ef3SIlya Maximets 
131*cb039ef3SIlya Maximets     /*
132*cb039ef3SIlya Maximets      * Unregister the handler, unless we still have packets to transmit
133*cb039ef3SIlya Maximets      * and kernel needs a wake up.
134*cb039ef3SIlya Maximets      */
135*cb039ef3SIlya Maximets     if (!s->outstanding_tx || !xsk_ring_prod__needs_wakeup(&s->tx)) {
136*cb039ef3SIlya Maximets         af_xdp_write_poll(s, false);
137*cb039ef3SIlya Maximets     }
138*cb039ef3SIlya Maximets 
139*cb039ef3SIlya Maximets     /* Flush any buffered packets. */
140*cb039ef3SIlya Maximets     qemu_flush_queued_packets(&s->nc);
141*cb039ef3SIlya Maximets }
142*cb039ef3SIlya Maximets 
/*
 * NetClientInfo .receive callback: transmit one packet coming from the
 * peer through the AF_XDP socket.
 *
 * Returns 'size' when the packet was submitted, and also when it is larger
 * than a umem frame (in which case nothing is transmitted).  Returns 0 when
 * no free buffer or Tx ring slot is available; the write handler is then
 * enabled so the transfer can be retried.
 */
static ssize_t af_xdp_receive(NetClientState *nc,
                              const uint8_t *buf, size_t size)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
    struct xdp_desc *tx_desc;
    uint32_t tx_idx;

    /* Reclaim buffers of packets that have already gone out. */
    af_xdp_complete_tx(s);

    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
        /* Does not fit into a single frame; report it as consumed. */
        return size;
    }

    if (s->n_pool == 0 || xsk_ring_prod__reserve(&s->tx, 1, &tx_idx) == 0) {
        /*
         * No free buffer or no room in the Tx ring.  Poll for writability;
         * this also kicks the Tx if it was waiting on the CQ.
         */
        af_xdp_write_poll(s, true);
        return 0;
    }

    /* Take a frame from the pool, describe it and copy the payload in. */
    tx_desc = xsk_ring_prod__tx_desc(&s->tx, tx_idx);
    tx_desc->addr = s->pool[--s->n_pool];
    tx_desc->len = size;
    memcpy(xsk_umem__get_data(s->buffer, tx_desc->addr), buf, size);

    xsk_ring_prod__submit(&s->tx, 1);
    s->outstanding_tx++;

    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
        af_xdp_write_poll(s, true);
    }

    return size;
}
184*cb039ef3SIlya Maximets 
185*cb039ef3SIlya Maximets /*
186*cb039ef3SIlya Maximets  * Complete a previous send (backend --> guest) and enable the
187*cb039ef3SIlya Maximets  * fd_read callback.
188*cb039ef3SIlya Maximets  */
af_xdp_send_completed(NetClientState * nc,ssize_t len)189*cb039ef3SIlya Maximets static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
190*cb039ef3SIlya Maximets {
191*cb039ef3SIlya Maximets     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
192*cb039ef3SIlya Maximets 
193*cb039ef3SIlya Maximets     af_xdp_read_poll(s, true);
194*cb039ef3SIlya Maximets }
195*cb039ef3SIlya Maximets 
/*
 * Move up to 'n' free frames from the pool to the fill ring so that they
 * can be used for reception.
 */
static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
{
    uint32_t fq_idx = 0;
    uint32_t k;

    /*
     * Leave one packet for Tx, just in case.
     * NOTE(review): when the pool holds 'n' frames or fewer, every one of
     * them is handed to the fill ring, so nothing is actually held back in
     * that case -- confirm whether this is intended.
     */
    if (s->n_pool < n + 1) {
        n = s->n_pool;
    }

    if (n == 0) {
        return;
    }
    if (xsk_ring_prod__reserve(&s->fq, n, &fq_idx) == 0) {
        return;
    }

    for (k = 0; k < n; k++) {
        *xsk_ring_prod__fill_addr(&s->fq, fq_idx + k) = s->pool[--s->n_pool];
    }
    xsk_ring_prod__submit(&s->fq, n);

    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
        /* Receive was blocked by not having enough buffers.  Wake it up. */
        af_xdp_read_poll(s, true);
    }
}
219*cb039ef3SIlya Maximets 
af_xdp_send(void * opaque)220*cb039ef3SIlya Maximets static void af_xdp_send(void *opaque)
221*cb039ef3SIlya Maximets {
222*cb039ef3SIlya Maximets     uint32_t i, n_rx, idx = 0;
223*cb039ef3SIlya Maximets     AFXDPState *s = opaque;
224*cb039ef3SIlya Maximets 
225*cb039ef3SIlya Maximets     n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
226*cb039ef3SIlya Maximets     if (!n_rx) {
227*cb039ef3SIlya Maximets         return;
228*cb039ef3SIlya Maximets     }
229*cb039ef3SIlya Maximets 
230*cb039ef3SIlya Maximets     for (i = 0; i < n_rx; i++) {
231*cb039ef3SIlya Maximets         const struct xdp_desc *desc;
232*cb039ef3SIlya Maximets         struct iovec iov;
233*cb039ef3SIlya Maximets 
234*cb039ef3SIlya Maximets         desc = xsk_ring_cons__rx_desc(&s->rx, idx++);
235*cb039ef3SIlya Maximets 
236*cb039ef3SIlya Maximets         iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
237*cb039ef3SIlya Maximets         iov.iov_len = desc->len;
238*cb039ef3SIlya Maximets 
239*cb039ef3SIlya Maximets         s->pool[s->n_pool++] = desc->addr;
240*cb039ef3SIlya Maximets 
241*cb039ef3SIlya Maximets         if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
242*cb039ef3SIlya Maximets                                      af_xdp_send_completed)) {
243*cb039ef3SIlya Maximets             /*
244*cb039ef3SIlya Maximets              * The peer does not receive anymore.  Packet is queued, stop
245*cb039ef3SIlya Maximets              * reading from the backend until af_xdp_send_completed().
246*cb039ef3SIlya Maximets              */
247*cb039ef3SIlya Maximets             af_xdp_read_poll(s, false);
248*cb039ef3SIlya Maximets 
249*cb039ef3SIlya Maximets             /* Return unused descriptors to not break the ring cache. */
250*cb039ef3SIlya Maximets             xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
251*cb039ef3SIlya Maximets             n_rx = i + 1;
252*cb039ef3SIlya Maximets             break;
253*cb039ef3SIlya Maximets         }
254*cb039ef3SIlya Maximets     }
255*cb039ef3SIlya Maximets 
256*cb039ef3SIlya Maximets     /* Release actually sent descriptors and try to re-fill. */
257*cb039ef3SIlya Maximets     xsk_ring_cons__release(&s->rx, n_rx);
258*cb039ef3SIlya Maximets     af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
259*cb039ef3SIlya Maximets }
260*cb039ef3SIlya Maximets 
261*cb039ef3SIlya Maximets /* Flush and close. */
af_xdp_cleanup(NetClientState * nc)262*cb039ef3SIlya Maximets static void af_xdp_cleanup(NetClientState *nc)
263*cb039ef3SIlya Maximets {
264*cb039ef3SIlya Maximets     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
265*cb039ef3SIlya Maximets 
266*cb039ef3SIlya Maximets     qemu_purge_queued_packets(nc);
267*cb039ef3SIlya Maximets 
268*cb039ef3SIlya Maximets     af_xdp_poll(nc, false);
269*cb039ef3SIlya Maximets 
270*cb039ef3SIlya Maximets     xsk_socket__delete(s->xsk);
271*cb039ef3SIlya Maximets     s->xsk = NULL;
272*cb039ef3SIlya Maximets     g_free(s->pool);
273*cb039ef3SIlya Maximets     s->pool = NULL;
274*cb039ef3SIlya Maximets     xsk_umem__delete(s->umem);
275*cb039ef3SIlya Maximets     s->umem = NULL;
276*cb039ef3SIlya Maximets     qemu_vfree(s->buffer);
277*cb039ef3SIlya Maximets     s->buffer = NULL;
278*cb039ef3SIlya Maximets 
279*cb039ef3SIlya Maximets     /* Remove the program if it's the last open queue. */
280*cb039ef3SIlya Maximets     if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
281*cb039ef3SIlya Maximets         && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
282*cb039ef3SIlya Maximets         fprintf(stderr,
283*cb039ef3SIlya Maximets                 "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
284*cb039ef3SIlya Maximets                 s->ifname, s->ifindex);
285*cb039ef3SIlya Maximets     }
286*cb039ef3SIlya Maximets }
287*cb039ef3SIlya Maximets 
af_xdp_umem_create(AFXDPState * s,int sock_fd,Error ** errp)288*cb039ef3SIlya Maximets static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
289*cb039ef3SIlya Maximets {
290*cb039ef3SIlya Maximets     struct xsk_umem_config config = {
291*cb039ef3SIlya Maximets         .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
292*cb039ef3SIlya Maximets         .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
293*cb039ef3SIlya Maximets         .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
294*cb039ef3SIlya Maximets         .frame_headroom = 0,
295*cb039ef3SIlya Maximets     };
296*cb039ef3SIlya Maximets     uint64_t n_descs;
297*cb039ef3SIlya Maximets     uint64_t size;
298*cb039ef3SIlya Maximets     int64_t i;
299*cb039ef3SIlya Maximets     int ret;
300*cb039ef3SIlya Maximets 
301*cb039ef3SIlya Maximets     /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
302*cb039ef3SIlya Maximets     n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
303*cb039ef3SIlya Maximets                + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
304*cb039ef3SIlya Maximets     size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
305*cb039ef3SIlya Maximets 
306*cb039ef3SIlya Maximets     s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
307*cb039ef3SIlya Maximets     memset(s->buffer, 0, size);
308*cb039ef3SIlya Maximets 
309*cb039ef3SIlya Maximets     if (sock_fd < 0) {
310*cb039ef3SIlya Maximets         ret = xsk_umem__create(&s->umem, s->buffer, size,
311*cb039ef3SIlya Maximets                                &s->fq, &s->cq, &config);
312*cb039ef3SIlya Maximets     } else {
313*cb039ef3SIlya Maximets         ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
314*cb039ef3SIlya Maximets                                        &s->fq, &s->cq, &config);
315*cb039ef3SIlya Maximets     }
316*cb039ef3SIlya Maximets 
317*cb039ef3SIlya Maximets     if (ret) {
318*cb039ef3SIlya Maximets         qemu_vfree(s->buffer);
319*cb039ef3SIlya Maximets         error_setg_errno(errp, errno,
320*cb039ef3SIlya Maximets                          "failed to create umem for %s queue_index: %d",
321*cb039ef3SIlya Maximets                          s->ifname, s->nc.queue_index);
322*cb039ef3SIlya Maximets         return -1;
323*cb039ef3SIlya Maximets     }
324*cb039ef3SIlya Maximets 
325*cb039ef3SIlya Maximets     s->pool = g_new(uint64_t, n_descs);
326*cb039ef3SIlya Maximets     /* Fill the pool in the opposite order, because it's a LIFO queue. */
327*cb039ef3SIlya Maximets     for (i = n_descs; i >= 0; i--) {
328*cb039ef3SIlya Maximets         s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
329*cb039ef3SIlya Maximets     }
330*cb039ef3SIlya Maximets     s->n_pool = n_descs;
331*cb039ef3SIlya Maximets 
332*cb039ef3SIlya Maximets     af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
333*cb039ef3SIlya Maximets 
334*cb039ef3SIlya Maximets     return 0;
335*cb039ef3SIlya Maximets }
336*cb039ef3SIlya Maximets 
af_xdp_socket_create(AFXDPState * s,const NetdevAFXDPOptions * opts,Error ** errp)337*cb039ef3SIlya Maximets static int af_xdp_socket_create(AFXDPState *s,
338*cb039ef3SIlya Maximets                                 const NetdevAFXDPOptions *opts, Error **errp)
339*cb039ef3SIlya Maximets {
340*cb039ef3SIlya Maximets     struct xsk_socket_config cfg = {
341*cb039ef3SIlya Maximets         .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
342*cb039ef3SIlya Maximets         .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
343*cb039ef3SIlya Maximets         .libxdp_flags = 0,
344*cb039ef3SIlya Maximets         .bind_flags = XDP_USE_NEED_WAKEUP,
345*cb039ef3SIlya Maximets         .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
346*cb039ef3SIlya Maximets     };
347*cb039ef3SIlya Maximets     int queue_id, error = 0;
348*cb039ef3SIlya Maximets 
349*cb039ef3SIlya Maximets     s->inhibit = opts->has_inhibit && opts->inhibit;
350*cb039ef3SIlya Maximets     if (s->inhibit) {
351*cb039ef3SIlya Maximets         cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
352*cb039ef3SIlya Maximets     }
353*cb039ef3SIlya Maximets 
354*cb039ef3SIlya Maximets     if (opts->has_force_copy && opts->force_copy) {
355*cb039ef3SIlya Maximets         cfg.bind_flags |= XDP_COPY;
356*cb039ef3SIlya Maximets     }
357*cb039ef3SIlya Maximets 
358*cb039ef3SIlya Maximets     queue_id = s->nc.queue_index;
359*cb039ef3SIlya Maximets     if (opts->has_start_queue && opts->start_queue > 0) {
360*cb039ef3SIlya Maximets         queue_id += opts->start_queue;
361*cb039ef3SIlya Maximets     }
362*cb039ef3SIlya Maximets 
363*cb039ef3SIlya Maximets     if (opts->has_mode) {
364*cb039ef3SIlya Maximets         /* Specific mode requested. */
365*cb039ef3SIlya Maximets         cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
366*cb039ef3SIlya Maximets                          ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
367*cb039ef3SIlya Maximets         if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
368*cb039ef3SIlya Maximets                                s->umem, &s->rx, &s->tx, &cfg)) {
369*cb039ef3SIlya Maximets             error = errno;
370*cb039ef3SIlya Maximets         }
371*cb039ef3SIlya Maximets     } else {
372*cb039ef3SIlya Maximets         /* No mode requested, try native first. */
373*cb039ef3SIlya Maximets         cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
374*cb039ef3SIlya Maximets 
375*cb039ef3SIlya Maximets         if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
376*cb039ef3SIlya Maximets                                s->umem, &s->rx, &s->tx, &cfg)) {
377*cb039ef3SIlya Maximets             /* Can't use native mode, try skb. */
378*cb039ef3SIlya Maximets             cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
379*cb039ef3SIlya Maximets             cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
380*cb039ef3SIlya Maximets 
381*cb039ef3SIlya Maximets             if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
382*cb039ef3SIlya Maximets                                    s->umem, &s->rx, &s->tx, &cfg)) {
383*cb039ef3SIlya Maximets                 error = errno;
384*cb039ef3SIlya Maximets             }
385*cb039ef3SIlya Maximets         }
386*cb039ef3SIlya Maximets     }
387*cb039ef3SIlya Maximets 
388*cb039ef3SIlya Maximets     if (error) {
389*cb039ef3SIlya Maximets         error_setg_errno(errp, error,
390*cb039ef3SIlya Maximets                          "failed to create AF_XDP socket for %s queue_id: %d",
391*cb039ef3SIlya Maximets                          s->ifname, queue_id);
392*cb039ef3SIlya Maximets         return -1;
393*cb039ef3SIlya Maximets     }
394*cb039ef3SIlya Maximets 
395*cb039ef3SIlya Maximets     s->xdp_flags = cfg.xdp_flags;
396*cb039ef3SIlya Maximets 
397*cb039ef3SIlya Maximets     return 0;
398*cb039ef3SIlya Maximets }
399*cb039ef3SIlya Maximets 
400*cb039ef3SIlya Maximets /* NetClientInfo methods. */
401*cb039ef3SIlya Maximets static NetClientInfo net_af_xdp_info = {
402*cb039ef3SIlya Maximets     .type = NET_CLIENT_DRIVER_AF_XDP,
403*cb039ef3SIlya Maximets     .size = sizeof(AFXDPState),
404*cb039ef3SIlya Maximets     .receive = af_xdp_receive,
405*cb039ef3SIlya Maximets     .poll = af_xdp_poll,
406*cb039ef3SIlya Maximets     .cleanup = af_xdp_cleanup,
407*cb039ef3SIlya Maximets };
408*cb039ef3SIlya Maximets 
parse_socket_fds(const char * sock_fds_str,int64_t n_expected,Error ** errp)409*cb039ef3SIlya Maximets static int *parse_socket_fds(const char *sock_fds_str,
410*cb039ef3SIlya Maximets                              int64_t n_expected, Error **errp)
411*cb039ef3SIlya Maximets {
412*cb039ef3SIlya Maximets     gchar **substrings = g_strsplit(sock_fds_str, ":", -1);
413*cb039ef3SIlya Maximets     int64_t i, n_sock_fds = g_strv_length(substrings);
414*cb039ef3SIlya Maximets     int *sock_fds = NULL;
415*cb039ef3SIlya Maximets 
416*cb039ef3SIlya Maximets     if (n_sock_fds != n_expected) {
417*cb039ef3SIlya Maximets         error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
418*cb039ef3SIlya Maximets                    n_expected, n_sock_fds);
419*cb039ef3SIlya Maximets         goto exit;
420*cb039ef3SIlya Maximets     }
421*cb039ef3SIlya Maximets 
422*cb039ef3SIlya Maximets     sock_fds = g_new(int, n_sock_fds);
423*cb039ef3SIlya Maximets 
424*cb039ef3SIlya Maximets     for (i = 0; i < n_sock_fds; i++) {
425*cb039ef3SIlya Maximets         sock_fds[i] = monitor_fd_param(monitor_cur(), substrings[i], errp);
426*cb039ef3SIlya Maximets         if (sock_fds[i] < 0) {
427*cb039ef3SIlya Maximets             g_free(sock_fds);
428*cb039ef3SIlya Maximets             sock_fds = NULL;
429*cb039ef3SIlya Maximets             goto exit;
430*cb039ef3SIlya Maximets         }
431*cb039ef3SIlya Maximets     }
432*cb039ef3SIlya Maximets 
433*cb039ef3SIlya Maximets exit:
434*cb039ef3SIlya Maximets     g_strfreev(substrings);
435*cb039ef3SIlya Maximets     return sock_fds;
436*cb039ef3SIlya Maximets }
437*cb039ef3SIlya Maximets 
438*cb039ef3SIlya Maximets /*
439*cb039ef3SIlya Maximets  * The exported init function.
440*cb039ef3SIlya Maximets  *
441*cb039ef3SIlya Maximets  * ... -netdev af-xdp,ifname="..."
442*cb039ef3SIlya Maximets  */
net_init_af_xdp(const Netdev * netdev,const char * name,NetClientState * peer,Error ** errp)443*cb039ef3SIlya Maximets int net_init_af_xdp(const Netdev *netdev,
444*cb039ef3SIlya Maximets                     const char *name, NetClientState *peer, Error **errp)
445*cb039ef3SIlya Maximets {
446*cb039ef3SIlya Maximets     const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
447*cb039ef3SIlya Maximets     NetClientState *nc, *nc0 = NULL;
448*cb039ef3SIlya Maximets     unsigned int ifindex;
449*cb039ef3SIlya Maximets     uint32_t prog_id = 0;
450*cb039ef3SIlya Maximets     int *sock_fds = NULL;
451*cb039ef3SIlya Maximets     int64_t i, queues;
452*cb039ef3SIlya Maximets     Error *err = NULL;
453*cb039ef3SIlya Maximets     AFXDPState *s;
454*cb039ef3SIlya Maximets 
455*cb039ef3SIlya Maximets     ifindex = if_nametoindex(opts->ifname);
456*cb039ef3SIlya Maximets     if (!ifindex) {
457*cb039ef3SIlya Maximets         error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
458*cb039ef3SIlya Maximets                          opts->ifname);
459*cb039ef3SIlya Maximets         return -1;
460*cb039ef3SIlya Maximets     }
461*cb039ef3SIlya Maximets 
462*cb039ef3SIlya Maximets     queues = opts->has_queues ? opts->queues : 1;
463*cb039ef3SIlya Maximets     if (queues < 1) {
464*cb039ef3SIlya Maximets         error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
465*cb039ef3SIlya Maximets                    queues, opts->ifname);
466*cb039ef3SIlya Maximets         return -1;
467*cb039ef3SIlya Maximets     }
468*cb039ef3SIlya Maximets 
469*cb039ef3SIlya Maximets     if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
470*cb039ef3SIlya Maximets         error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
471*cb039ef3SIlya Maximets         return -1;
472*cb039ef3SIlya Maximets     }
473*cb039ef3SIlya Maximets 
474*cb039ef3SIlya Maximets     if (opts->sock_fds) {
475*cb039ef3SIlya Maximets         sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
476*cb039ef3SIlya Maximets         if (!sock_fds) {
477*cb039ef3SIlya Maximets             return -1;
478*cb039ef3SIlya Maximets         }
479*cb039ef3SIlya Maximets     }
480*cb039ef3SIlya Maximets 
481*cb039ef3SIlya Maximets     for (i = 0; i < queues; i++) {
482*cb039ef3SIlya Maximets         nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
483*cb039ef3SIlya Maximets         qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
484*cb039ef3SIlya Maximets         nc->queue_index = i;
485*cb039ef3SIlya Maximets 
486*cb039ef3SIlya Maximets         if (!nc0) {
487*cb039ef3SIlya Maximets             nc0 = nc;
488*cb039ef3SIlya Maximets         }
489*cb039ef3SIlya Maximets 
490*cb039ef3SIlya Maximets         s = DO_UPCAST(AFXDPState, nc, nc);
491*cb039ef3SIlya Maximets 
492*cb039ef3SIlya Maximets         pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
493*cb039ef3SIlya Maximets         s->ifindex = ifindex;
494*cb039ef3SIlya Maximets         s->n_queues = queues;
495*cb039ef3SIlya Maximets 
496*cb039ef3SIlya Maximets         if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
497*cb039ef3SIlya Maximets             || af_xdp_socket_create(s, opts, errp)) {
498*cb039ef3SIlya Maximets             /* Make sure the XDP program will be removed. */
499*cb039ef3SIlya Maximets             s->n_queues = i;
500*cb039ef3SIlya Maximets             error_propagate(errp, err);
501*cb039ef3SIlya Maximets             goto err;
502*cb039ef3SIlya Maximets         }
503*cb039ef3SIlya Maximets     }
504*cb039ef3SIlya Maximets 
505*cb039ef3SIlya Maximets     if (nc0) {
506*cb039ef3SIlya Maximets         s = DO_UPCAST(AFXDPState, nc, nc0);
507*cb039ef3SIlya Maximets         if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
508*cb039ef3SIlya Maximets             error_setg_errno(errp, errno,
509*cb039ef3SIlya Maximets                              "no XDP program loaded on '%s', ifindex: %d",
510*cb039ef3SIlya Maximets                              s->ifname, s->ifindex);
511*cb039ef3SIlya Maximets             goto err;
512*cb039ef3SIlya Maximets         }
513*cb039ef3SIlya Maximets     }
514*cb039ef3SIlya Maximets 
515*cb039ef3SIlya Maximets     af_xdp_read_poll(s, true); /* Initially only poll for reads. */
516*cb039ef3SIlya Maximets 
517*cb039ef3SIlya Maximets     return 0;
518*cb039ef3SIlya Maximets 
519*cb039ef3SIlya Maximets err:
520*cb039ef3SIlya Maximets     g_free(sock_fds);
521*cb039ef3SIlya Maximets     if (nc0) {
522*cb039ef3SIlya Maximets         qemu_del_net_client(nc0);
523*cb039ef3SIlya Maximets     }
524*cb039ef3SIlya Maximets 
525*cb039ef3SIlya Maximets     return -1;
526*cb039ef3SIlya Maximets }
527