1*cb039ef3SIlya Maximets /*
2*cb039ef3SIlya Maximets * AF_XDP network backend.
3*cb039ef3SIlya Maximets *
4*cb039ef3SIlya Maximets * Copyright (c) 2023 Red Hat, Inc.
5*cb039ef3SIlya Maximets *
6*cb039ef3SIlya Maximets * Authors:
7*cb039ef3SIlya Maximets * Ilya Maximets <i.maximets@ovn.org>
8*cb039ef3SIlya Maximets *
9*cb039ef3SIlya Maximets * This work is licensed under the terms of the GNU GPL, version 2 or later.
10*cb039ef3SIlya Maximets * See the COPYING file in the top-level directory.
11*cb039ef3SIlya Maximets */
12*cb039ef3SIlya Maximets
13*cb039ef3SIlya Maximets
14*cb039ef3SIlya Maximets #include "qemu/osdep.h"
15*cb039ef3SIlya Maximets #include <bpf/bpf.h>
16*cb039ef3SIlya Maximets #include <inttypes.h>
17*cb039ef3SIlya Maximets #include <linux/if_link.h>
18*cb039ef3SIlya Maximets #include <linux/if_xdp.h>
19*cb039ef3SIlya Maximets #include <net/if.h>
20*cb039ef3SIlya Maximets #include <xdp/xsk.h>
21*cb039ef3SIlya Maximets
22*cb039ef3SIlya Maximets #include "clients.h"
23*cb039ef3SIlya Maximets #include "monitor/monitor.h"
24*cb039ef3SIlya Maximets #include "net/net.h"
25*cb039ef3SIlya Maximets #include "qapi/error.h"
26*cb039ef3SIlya Maximets #include "qemu/cutils.h"
27*cb039ef3SIlya Maximets #include "qemu/error-report.h"
28*cb039ef3SIlya Maximets #include "qemu/iov.h"
29*cb039ef3SIlya Maximets #include "qemu/main-loop.h"
30*cb039ef3SIlya Maximets #include "qemu/memalign.h"
31*cb039ef3SIlya Maximets
32*cb039ef3SIlya Maximets
33*cb039ef3SIlya Maximets typedef struct AFXDPState {
34*cb039ef3SIlya Maximets NetClientState nc;
35*cb039ef3SIlya Maximets
36*cb039ef3SIlya Maximets struct xsk_socket *xsk;
37*cb039ef3SIlya Maximets struct xsk_ring_cons rx;
38*cb039ef3SIlya Maximets struct xsk_ring_prod tx;
39*cb039ef3SIlya Maximets struct xsk_ring_cons cq;
40*cb039ef3SIlya Maximets struct xsk_ring_prod fq;
41*cb039ef3SIlya Maximets
42*cb039ef3SIlya Maximets char ifname[IFNAMSIZ];
43*cb039ef3SIlya Maximets int ifindex;
44*cb039ef3SIlya Maximets bool read_poll;
45*cb039ef3SIlya Maximets bool write_poll;
46*cb039ef3SIlya Maximets uint32_t outstanding_tx;
47*cb039ef3SIlya Maximets
48*cb039ef3SIlya Maximets uint64_t *pool;
49*cb039ef3SIlya Maximets uint32_t n_pool;
50*cb039ef3SIlya Maximets char *buffer;
51*cb039ef3SIlya Maximets struct xsk_umem *umem;
52*cb039ef3SIlya Maximets
53*cb039ef3SIlya Maximets uint32_t n_queues;
54*cb039ef3SIlya Maximets uint32_t xdp_flags;
55*cb039ef3SIlya Maximets bool inhibit;
56*cb039ef3SIlya Maximets } AFXDPState;
57*cb039ef3SIlya Maximets
/*
 * Upper bound on the Rx descriptors handled by one af_xdp_send() call,
 * and the number of buffers it asks to put back on the fill ring.
 */
#define AF_XDP_BATCH_SIZE 64

/* fd handlers, referenced before their definitions. */
static void af_xdp_writable(void *opaque);
static void af_xdp_send(void *opaque);
62*cb039ef3SIlya Maximets
63*cb039ef3SIlya Maximets /* Set the event-loop handlers for the af-xdp backend. */
af_xdp_update_fd_handler(AFXDPState * s)64*cb039ef3SIlya Maximets static void af_xdp_update_fd_handler(AFXDPState *s)
65*cb039ef3SIlya Maximets {
66*cb039ef3SIlya Maximets qemu_set_fd_handler(xsk_socket__fd(s->xsk),
67*cb039ef3SIlya Maximets s->read_poll ? af_xdp_send : NULL,
68*cb039ef3SIlya Maximets s->write_poll ? af_xdp_writable : NULL,
69*cb039ef3SIlya Maximets s);
70*cb039ef3SIlya Maximets }
71*cb039ef3SIlya Maximets
72*cb039ef3SIlya Maximets /* Update the read handler. */
af_xdp_read_poll(AFXDPState * s,bool enable)73*cb039ef3SIlya Maximets static void af_xdp_read_poll(AFXDPState *s, bool enable)
74*cb039ef3SIlya Maximets {
75*cb039ef3SIlya Maximets if (s->read_poll != enable) {
76*cb039ef3SIlya Maximets s->read_poll = enable;
77*cb039ef3SIlya Maximets af_xdp_update_fd_handler(s);
78*cb039ef3SIlya Maximets }
79*cb039ef3SIlya Maximets }
80*cb039ef3SIlya Maximets
81*cb039ef3SIlya Maximets /* Update the write handler. */
af_xdp_write_poll(AFXDPState * s,bool enable)82*cb039ef3SIlya Maximets static void af_xdp_write_poll(AFXDPState *s, bool enable)
83*cb039ef3SIlya Maximets {
84*cb039ef3SIlya Maximets if (s->write_poll != enable) {
85*cb039ef3SIlya Maximets s->write_poll = enable;
86*cb039ef3SIlya Maximets af_xdp_update_fd_handler(s);
87*cb039ef3SIlya Maximets }
88*cb039ef3SIlya Maximets }
89*cb039ef3SIlya Maximets
/*
 * The poll() callback of the net client: switch both the read and the write
 * handler on or off at once.  Nothing is done when both are already in the
 * requested state.
 */
static void af_xdp_poll(NetClientState *nc, bool enable)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    if (s->read_poll == enable && s->write_poll == enable) {
        return;
    }

    s->read_poll = enable;
    s->write_poll = enable;
    af_xdp_update_fd_handler(s);
}
100*cb039ef3SIlya Maximets
/*
 * Reap the completion ring: every completed frame address goes back to the
 * free pool and the count of outstanding transmissions is decremented.
 */
static void af_xdp_complete_tx(AFXDPState *s)
{
    uint32_t first = 0;
    uint32_t n_done, k;

    n_done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS,
                                 &first);
    if (!n_done) {
        return;
    }

    for (k = 0; k < n_done; k++) {
        s->pool[s->n_pool++] = *xsk_ring_cons__comp_addr(&s->cq, first + k);
        s->outstanding_tx--;
    }

    xsk_ring_cons__release(&s->cq, n_done);
}
119*cb039ef3SIlya Maximets
120*cb039ef3SIlya Maximets /*
121*cb039ef3SIlya Maximets * The fd_write() callback, invoked if the fd is marked as writable
122*cb039ef3SIlya Maximets * after a poll.
123*cb039ef3SIlya Maximets */
af_xdp_writable(void * opaque)124*cb039ef3SIlya Maximets static void af_xdp_writable(void *opaque)
125*cb039ef3SIlya Maximets {
126*cb039ef3SIlya Maximets AFXDPState *s = opaque;
127*cb039ef3SIlya Maximets
128*cb039ef3SIlya Maximets /* Try to recover buffers that are already sent. */
129*cb039ef3SIlya Maximets af_xdp_complete_tx(s);
130*cb039ef3SIlya Maximets
131*cb039ef3SIlya Maximets /*
132*cb039ef3SIlya Maximets * Unregister the handler, unless we still have packets to transmit
133*cb039ef3SIlya Maximets * and kernel needs a wake up.
134*cb039ef3SIlya Maximets */
135*cb039ef3SIlya Maximets if (!s->outstanding_tx || !xsk_ring_prod__needs_wakeup(&s->tx)) {
136*cb039ef3SIlya Maximets af_xdp_write_poll(s, false);
137*cb039ef3SIlya Maximets }
138*cb039ef3SIlya Maximets
139*cb039ef3SIlya Maximets /* Flush any buffered packets. */
140*cb039ef3SIlya Maximets qemu_flush_queued_packets(&s->nc);
141*cb039ef3SIlya Maximets }
142*cb039ef3SIlya Maximets
/*
 * The receive() callback of the net client: copy one packet into a free
 * umem frame and submit it on the Tx ring.
 *
 * Returns 'size' when the packet was submitted, and also when it is larger
 * than a frame and therefore not sent at all.  Returns 0 when no free frame
 * or Tx ring slot is available; write polling is enabled in that case.
 */
static ssize_t af_xdp_receive(NetClientState *nc,
                              const uint8_t *buf, size_t size)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
    struct xdp_desc *tx_desc;
    uint32_t slot;

    /* Try to recover buffers that are already sent. */
    af_xdp_complete_tx(s);

    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
        /* We can't transmit packet this size... */
        return size;
    }

    if (s->n_pool == 0 || xsk_ring_prod__reserve(&s->tx, 1, &slot) == 0) {
        /*
         * Out of buffers or space in tx ring.  Poll until we can write.
         * This will also kick the Tx, if it was waiting on CQ.
         */
        af_xdp_write_poll(s, true);
        return 0;
    }

    /* Take a frame from the pool and describe the packet in the Tx slot. */
    tx_desc = xsk_ring_prod__tx_desc(&s->tx, slot);
    tx_desc->addr = s->pool[--s->n_pool];
    tx_desc->len = size;
    memcpy(xsk_umem__get_data(s->buffer, tx_desc->addr), buf, size);

    xsk_ring_prod__submit(&s->tx, 1);
    s->outstanding_tx++;

    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
        af_xdp_write_poll(s, true);
    }

    return size;
}
184*cb039ef3SIlya Maximets
185*cb039ef3SIlya Maximets /*
186*cb039ef3SIlya Maximets * Complete a previous send (backend --> guest) and enable the
187*cb039ef3SIlya Maximets * fd_read callback.
188*cb039ef3SIlya Maximets */
af_xdp_send_completed(NetClientState * nc,ssize_t len)189*cb039ef3SIlya Maximets static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
190*cb039ef3SIlya Maximets {
191*cb039ef3SIlya Maximets AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
192*cb039ef3SIlya Maximets
193*cb039ef3SIlya Maximets af_xdp_read_poll(s, true);
194*cb039ef3SIlya Maximets }
195*cb039ef3SIlya Maximets
/*
 * Move up to 'n' free frames from the pool to the fill ring.
 *
 * The full request is served only if more than 'n' frames are free, so that
 * at least one stays available for Tx; otherwise everything the pool holds
 * is used.  Nothing happens if the pool is empty or the fill ring cannot
 * take the whole batch.
 */
static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
{
    uint32_t first = 0;
    uint32_t k;

    if (n + 1 > s->n_pool) {
        n = s->n_pool;
    }

    if (n == 0) {
        return;
    }
    if (xsk_ring_prod__reserve(&s->fq, n, &first) == 0) {
        return;
    }

    for (k = 0; k < n; k++) {
        s->n_pool--;
        *xsk_ring_prod__fill_addr(&s->fq, first + k) = s->pool[s->n_pool];
    }
    xsk_ring_prod__submit(&s->fq, n);

    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
        /* Receive was blocked by not having enough buffers.  Wake it up. */
        af_xdp_read_poll(s, true);
    }
}
219*cb039ef3SIlya Maximets
af_xdp_send(void * opaque)220*cb039ef3SIlya Maximets static void af_xdp_send(void *opaque)
221*cb039ef3SIlya Maximets {
222*cb039ef3SIlya Maximets uint32_t i, n_rx, idx = 0;
223*cb039ef3SIlya Maximets AFXDPState *s = opaque;
224*cb039ef3SIlya Maximets
225*cb039ef3SIlya Maximets n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
226*cb039ef3SIlya Maximets if (!n_rx) {
227*cb039ef3SIlya Maximets return;
228*cb039ef3SIlya Maximets }
229*cb039ef3SIlya Maximets
230*cb039ef3SIlya Maximets for (i = 0; i < n_rx; i++) {
231*cb039ef3SIlya Maximets const struct xdp_desc *desc;
232*cb039ef3SIlya Maximets struct iovec iov;
233*cb039ef3SIlya Maximets
234*cb039ef3SIlya Maximets desc = xsk_ring_cons__rx_desc(&s->rx, idx++);
235*cb039ef3SIlya Maximets
236*cb039ef3SIlya Maximets iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
237*cb039ef3SIlya Maximets iov.iov_len = desc->len;
238*cb039ef3SIlya Maximets
239*cb039ef3SIlya Maximets s->pool[s->n_pool++] = desc->addr;
240*cb039ef3SIlya Maximets
241*cb039ef3SIlya Maximets if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
242*cb039ef3SIlya Maximets af_xdp_send_completed)) {
243*cb039ef3SIlya Maximets /*
244*cb039ef3SIlya Maximets * The peer does not receive anymore. Packet is queued, stop
245*cb039ef3SIlya Maximets * reading from the backend until af_xdp_send_completed().
246*cb039ef3SIlya Maximets */
247*cb039ef3SIlya Maximets af_xdp_read_poll(s, false);
248*cb039ef3SIlya Maximets
249*cb039ef3SIlya Maximets /* Return unused descriptors to not break the ring cache. */
250*cb039ef3SIlya Maximets xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
251*cb039ef3SIlya Maximets n_rx = i + 1;
252*cb039ef3SIlya Maximets break;
253*cb039ef3SIlya Maximets }
254*cb039ef3SIlya Maximets }
255*cb039ef3SIlya Maximets
256*cb039ef3SIlya Maximets /* Release actually sent descriptors and try to re-fill. */
257*cb039ef3SIlya Maximets xsk_ring_cons__release(&s->rx, n_rx);
258*cb039ef3SIlya Maximets af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
259*cb039ef3SIlya Maximets }
260*cb039ef3SIlya Maximets
261*cb039ef3SIlya Maximets /* Flush and close. */
af_xdp_cleanup(NetClientState * nc)262*cb039ef3SIlya Maximets static void af_xdp_cleanup(NetClientState *nc)
263*cb039ef3SIlya Maximets {
264*cb039ef3SIlya Maximets AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
265*cb039ef3SIlya Maximets
266*cb039ef3SIlya Maximets qemu_purge_queued_packets(nc);
267*cb039ef3SIlya Maximets
268*cb039ef3SIlya Maximets af_xdp_poll(nc, false);
269*cb039ef3SIlya Maximets
270*cb039ef3SIlya Maximets xsk_socket__delete(s->xsk);
271*cb039ef3SIlya Maximets s->xsk = NULL;
272*cb039ef3SIlya Maximets g_free(s->pool);
273*cb039ef3SIlya Maximets s->pool = NULL;
274*cb039ef3SIlya Maximets xsk_umem__delete(s->umem);
275*cb039ef3SIlya Maximets s->umem = NULL;
276*cb039ef3SIlya Maximets qemu_vfree(s->buffer);
277*cb039ef3SIlya Maximets s->buffer = NULL;
278*cb039ef3SIlya Maximets
279*cb039ef3SIlya Maximets /* Remove the program if it's the last open queue. */
280*cb039ef3SIlya Maximets if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
281*cb039ef3SIlya Maximets && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
282*cb039ef3SIlya Maximets fprintf(stderr,
283*cb039ef3SIlya Maximets "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
284*cb039ef3SIlya Maximets s->ifname, s->ifindex);
285*cb039ef3SIlya Maximets }
286*cb039ef3SIlya Maximets }
287*cb039ef3SIlya Maximets
af_xdp_umem_create(AFXDPState * s,int sock_fd,Error ** errp)288*cb039ef3SIlya Maximets static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
289*cb039ef3SIlya Maximets {
290*cb039ef3SIlya Maximets struct xsk_umem_config config = {
291*cb039ef3SIlya Maximets .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
292*cb039ef3SIlya Maximets .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
293*cb039ef3SIlya Maximets .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
294*cb039ef3SIlya Maximets .frame_headroom = 0,
295*cb039ef3SIlya Maximets };
296*cb039ef3SIlya Maximets uint64_t n_descs;
297*cb039ef3SIlya Maximets uint64_t size;
298*cb039ef3SIlya Maximets int64_t i;
299*cb039ef3SIlya Maximets int ret;
300*cb039ef3SIlya Maximets
301*cb039ef3SIlya Maximets /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
302*cb039ef3SIlya Maximets n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
303*cb039ef3SIlya Maximets + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
304*cb039ef3SIlya Maximets size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
305*cb039ef3SIlya Maximets
306*cb039ef3SIlya Maximets s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
307*cb039ef3SIlya Maximets memset(s->buffer, 0, size);
308*cb039ef3SIlya Maximets
309*cb039ef3SIlya Maximets if (sock_fd < 0) {
310*cb039ef3SIlya Maximets ret = xsk_umem__create(&s->umem, s->buffer, size,
311*cb039ef3SIlya Maximets &s->fq, &s->cq, &config);
312*cb039ef3SIlya Maximets } else {
313*cb039ef3SIlya Maximets ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
314*cb039ef3SIlya Maximets &s->fq, &s->cq, &config);
315*cb039ef3SIlya Maximets }
316*cb039ef3SIlya Maximets
317*cb039ef3SIlya Maximets if (ret) {
318*cb039ef3SIlya Maximets qemu_vfree(s->buffer);
319*cb039ef3SIlya Maximets error_setg_errno(errp, errno,
320*cb039ef3SIlya Maximets "failed to create umem for %s queue_index: %d",
321*cb039ef3SIlya Maximets s->ifname, s->nc.queue_index);
322*cb039ef3SIlya Maximets return -1;
323*cb039ef3SIlya Maximets }
324*cb039ef3SIlya Maximets
325*cb039ef3SIlya Maximets s->pool = g_new(uint64_t, n_descs);
326*cb039ef3SIlya Maximets /* Fill the pool in the opposite order, because it's a LIFO queue. */
327*cb039ef3SIlya Maximets for (i = n_descs; i >= 0; i--) {
328*cb039ef3SIlya Maximets s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
329*cb039ef3SIlya Maximets }
330*cb039ef3SIlya Maximets s->n_pool = n_descs;
331*cb039ef3SIlya Maximets
332*cb039ef3SIlya Maximets af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
333*cb039ef3SIlya Maximets
334*cb039ef3SIlya Maximets return 0;
335*cb039ef3SIlya Maximets }
336*cb039ef3SIlya Maximets
af_xdp_socket_create(AFXDPState * s,const NetdevAFXDPOptions * opts,Error ** errp)337*cb039ef3SIlya Maximets static int af_xdp_socket_create(AFXDPState *s,
338*cb039ef3SIlya Maximets const NetdevAFXDPOptions *opts, Error **errp)
339*cb039ef3SIlya Maximets {
340*cb039ef3SIlya Maximets struct xsk_socket_config cfg = {
341*cb039ef3SIlya Maximets .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
342*cb039ef3SIlya Maximets .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
343*cb039ef3SIlya Maximets .libxdp_flags = 0,
344*cb039ef3SIlya Maximets .bind_flags = XDP_USE_NEED_WAKEUP,
345*cb039ef3SIlya Maximets .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
346*cb039ef3SIlya Maximets };
347*cb039ef3SIlya Maximets int queue_id, error = 0;
348*cb039ef3SIlya Maximets
349*cb039ef3SIlya Maximets s->inhibit = opts->has_inhibit && opts->inhibit;
350*cb039ef3SIlya Maximets if (s->inhibit) {
351*cb039ef3SIlya Maximets cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
352*cb039ef3SIlya Maximets }
353*cb039ef3SIlya Maximets
354*cb039ef3SIlya Maximets if (opts->has_force_copy && opts->force_copy) {
355*cb039ef3SIlya Maximets cfg.bind_flags |= XDP_COPY;
356*cb039ef3SIlya Maximets }
357*cb039ef3SIlya Maximets
358*cb039ef3SIlya Maximets queue_id = s->nc.queue_index;
359*cb039ef3SIlya Maximets if (opts->has_start_queue && opts->start_queue > 0) {
360*cb039ef3SIlya Maximets queue_id += opts->start_queue;
361*cb039ef3SIlya Maximets }
362*cb039ef3SIlya Maximets
363*cb039ef3SIlya Maximets if (opts->has_mode) {
364*cb039ef3SIlya Maximets /* Specific mode requested. */
365*cb039ef3SIlya Maximets cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
366*cb039ef3SIlya Maximets ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
367*cb039ef3SIlya Maximets if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
368*cb039ef3SIlya Maximets s->umem, &s->rx, &s->tx, &cfg)) {
369*cb039ef3SIlya Maximets error = errno;
370*cb039ef3SIlya Maximets }
371*cb039ef3SIlya Maximets } else {
372*cb039ef3SIlya Maximets /* No mode requested, try native first. */
373*cb039ef3SIlya Maximets cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
374*cb039ef3SIlya Maximets
375*cb039ef3SIlya Maximets if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
376*cb039ef3SIlya Maximets s->umem, &s->rx, &s->tx, &cfg)) {
377*cb039ef3SIlya Maximets /* Can't use native mode, try skb. */
378*cb039ef3SIlya Maximets cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
379*cb039ef3SIlya Maximets cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
380*cb039ef3SIlya Maximets
381*cb039ef3SIlya Maximets if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
382*cb039ef3SIlya Maximets s->umem, &s->rx, &s->tx, &cfg)) {
383*cb039ef3SIlya Maximets error = errno;
384*cb039ef3SIlya Maximets }
385*cb039ef3SIlya Maximets }
386*cb039ef3SIlya Maximets }
387*cb039ef3SIlya Maximets
388*cb039ef3SIlya Maximets if (error) {
389*cb039ef3SIlya Maximets error_setg_errno(errp, error,
390*cb039ef3SIlya Maximets "failed to create AF_XDP socket for %s queue_id: %d",
391*cb039ef3SIlya Maximets s->ifname, queue_id);
392*cb039ef3SIlya Maximets return -1;
393*cb039ef3SIlya Maximets }
394*cb039ef3SIlya Maximets
395*cb039ef3SIlya Maximets s->xdp_flags = cfg.xdp_flags;
396*cb039ef3SIlya Maximets
397*cb039ef3SIlya Maximets return 0;
398*cb039ef3SIlya Maximets }
399*cb039ef3SIlya Maximets
400*cb039ef3SIlya Maximets /* NetClientInfo methods. */
401*cb039ef3SIlya Maximets static NetClientInfo net_af_xdp_info = {
402*cb039ef3SIlya Maximets .type = NET_CLIENT_DRIVER_AF_XDP,
403*cb039ef3SIlya Maximets .size = sizeof(AFXDPState),
404*cb039ef3SIlya Maximets .receive = af_xdp_receive,
405*cb039ef3SIlya Maximets .poll = af_xdp_poll,
406*cb039ef3SIlya Maximets .cleanup = af_xdp_cleanup,
407*cb039ef3SIlya Maximets };
408*cb039ef3SIlya Maximets
parse_socket_fds(const char * sock_fds_str,int64_t n_expected,Error ** errp)409*cb039ef3SIlya Maximets static int *parse_socket_fds(const char *sock_fds_str,
410*cb039ef3SIlya Maximets int64_t n_expected, Error **errp)
411*cb039ef3SIlya Maximets {
412*cb039ef3SIlya Maximets gchar **substrings = g_strsplit(sock_fds_str, ":", -1);
413*cb039ef3SIlya Maximets int64_t i, n_sock_fds = g_strv_length(substrings);
414*cb039ef3SIlya Maximets int *sock_fds = NULL;
415*cb039ef3SIlya Maximets
416*cb039ef3SIlya Maximets if (n_sock_fds != n_expected) {
417*cb039ef3SIlya Maximets error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
418*cb039ef3SIlya Maximets n_expected, n_sock_fds);
419*cb039ef3SIlya Maximets goto exit;
420*cb039ef3SIlya Maximets }
421*cb039ef3SIlya Maximets
422*cb039ef3SIlya Maximets sock_fds = g_new(int, n_sock_fds);
423*cb039ef3SIlya Maximets
424*cb039ef3SIlya Maximets for (i = 0; i < n_sock_fds; i++) {
425*cb039ef3SIlya Maximets sock_fds[i] = monitor_fd_param(monitor_cur(), substrings[i], errp);
426*cb039ef3SIlya Maximets if (sock_fds[i] < 0) {
427*cb039ef3SIlya Maximets g_free(sock_fds);
428*cb039ef3SIlya Maximets sock_fds = NULL;
429*cb039ef3SIlya Maximets goto exit;
430*cb039ef3SIlya Maximets }
431*cb039ef3SIlya Maximets }
432*cb039ef3SIlya Maximets
433*cb039ef3SIlya Maximets exit:
434*cb039ef3SIlya Maximets g_strfreev(substrings);
435*cb039ef3SIlya Maximets return sock_fds;
436*cb039ef3SIlya Maximets }
437*cb039ef3SIlya Maximets
438*cb039ef3SIlya Maximets /*
439*cb039ef3SIlya Maximets * The exported init function.
440*cb039ef3SIlya Maximets *
441*cb039ef3SIlya Maximets * ... -netdev af-xdp,ifname="..."
442*cb039ef3SIlya Maximets */
net_init_af_xdp(const Netdev * netdev,const char * name,NetClientState * peer,Error ** errp)443*cb039ef3SIlya Maximets int net_init_af_xdp(const Netdev *netdev,
444*cb039ef3SIlya Maximets const char *name, NetClientState *peer, Error **errp)
445*cb039ef3SIlya Maximets {
446*cb039ef3SIlya Maximets const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
447*cb039ef3SIlya Maximets NetClientState *nc, *nc0 = NULL;
448*cb039ef3SIlya Maximets unsigned int ifindex;
449*cb039ef3SIlya Maximets uint32_t prog_id = 0;
450*cb039ef3SIlya Maximets int *sock_fds = NULL;
451*cb039ef3SIlya Maximets int64_t i, queues;
452*cb039ef3SIlya Maximets Error *err = NULL;
453*cb039ef3SIlya Maximets AFXDPState *s;
454*cb039ef3SIlya Maximets
455*cb039ef3SIlya Maximets ifindex = if_nametoindex(opts->ifname);
456*cb039ef3SIlya Maximets if (!ifindex) {
457*cb039ef3SIlya Maximets error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
458*cb039ef3SIlya Maximets opts->ifname);
459*cb039ef3SIlya Maximets return -1;
460*cb039ef3SIlya Maximets }
461*cb039ef3SIlya Maximets
462*cb039ef3SIlya Maximets queues = opts->has_queues ? opts->queues : 1;
463*cb039ef3SIlya Maximets if (queues < 1) {
464*cb039ef3SIlya Maximets error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
465*cb039ef3SIlya Maximets queues, opts->ifname);
466*cb039ef3SIlya Maximets return -1;
467*cb039ef3SIlya Maximets }
468*cb039ef3SIlya Maximets
469*cb039ef3SIlya Maximets if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
470*cb039ef3SIlya Maximets error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
471*cb039ef3SIlya Maximets return -1;
472*cb039ef3SIlya Maximets }
473*cb039ef3SIlya Maximets
474*cb039ef3SIlya Maximets if (opts->sock_fds) {
475*cb039ef3SIlya Maximets sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
476*cb039ef3SIlya Maximets if (!sock_fds) {
477*cb039ef3SIlya Maximets return -1;
478*cb039ef3SIlya Maximets }
479*cb039ef3SIlya Maximets }
480*cb039ef3SIlya Maximets
481*cb039ef3SIlya Maximets for (i = 0; i < queues; i++) {
482*cb039ef3SIlya Maximets nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
483*cb039ef3SIlya Maximets qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
484*cb039ef3SIlya Maximets nc->queue_index = i;
485*cb039ef3SIlya Maximets
486*cb039ef3SIlya Maximets if (!nc0) {
487*cb039ef3SIlya Maximets nc0 = nc;
488*cb039ef3SIlya Maximets }
489*cb039ef3SIlya Maximets
490*cb039ef3SIlya Maximets s = DO_UPCAST(AFXDPState, nc, nc);
491*cb039ef3SIlya Maximets
492*cb039ef3SIlya Maximets pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
493*cb039ef3SIlya Maximets s->ifindex = ifindex;
494*cb039ef3SIlya Maximets s->n_queues = queues;
495*cb039ef3SIlya Maximets
496*cb039ef3SIlya Maximets if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
497*cb039ef3SIlya Maximets || af_xdp_socket_create(s, opts, errp)) {
498*cb039ef3SIlya Maximets /* Make sure the XDP program will be removed. */
499*cb039ef3SIlya Maximets s->n_queues = i;
500*cb039ef3SIlya Maximets error_propagate(errp, err);
501*cb039ef3SIlya Maximets goto err;
502*cb039ef3SIlya Maximets }
503*cb039ef3SIlya Maximets }
504*cb039ef3SIlya Maximets
505*cb039ef3SIlya Maximets if (nc0) {
506*cb039ef3SIlya Maximets s = DO_UPCAST(AFXDPState, nc, nc0);
507*cb039ef3SIlya Maximets if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
508*cb039ef3SIlya Maximets error_setg_errno(errp, errno,
509*cb039ef3SIlya Maximets "no XDP program loaded on '%s', ifindex: %d",
510*cb039ef3SIlya Maximets s->ifname, s->ifindex);
511*cb039ef3SIlya Maximets goto err;
512*cb039ef3SIlya Maximets }
513*cb039ef3SIlya Maximets }
514*cb039ef3SIlya Maximets
515*cb039ef3SIlya Maximets af_xdp_read_poll(s, true); /* Initially only poll for reads. */
516*cb039ef3SIlya Maximets
517*cb039ef3SIlya Maximets return 0;
518*cb039ef3SIlya Maximets
519*cb039ef3SIlya Maximets err:
520*cb039ef3SIlya Maximets g_free(sock_fds);
521*cb039ef3SIlya Maximets if (nc0) {
522*cb039ef3SIlya Maximets qemu_del_net_client(nc0);
523*cb039ef3SIlya Maximets }
524*cb039ef3SIlya Maximets
525*cb039ef3SIlya Maximets return -1;
526*cb039ef3SIlya Maximets }
527