xref: /openbmc/qemu/net/af-xdp.c (revision 90bb6d676489b5cc063858ece263e1586795803f)
1 /*
2  * AF_XDP network backend.
3  *
4  * Copyright (c) 2023 Red Hat, Inc.
5  *
6  * Authors:
7  *  Ilya Maximets <i.maximets@ovn.org>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  */
12 
13 
14 #include "qemu/osdep.h"
15 #include <bpf/bpf.h>
16 #include <inttypes.h>
17 #include <linux/if_link.h>
18 #include <linux/if_xdp.h>
19 #include <net/if.h>
20 #include <xdp/xsk.h>
21 
22 #include "clients.h"
23 #include "monitor/monitor.h"
24 #include "net/net.h"
25 #include "qapi/error.h"
26 #include "qemu/cutils.h"
27 #include "qemu/error-report.h"
28 #include "qemu/iov.h"
29 #include "qemu/main-loop.h"
30 #include "qemu/memalign.h"
31 
32 
33 typedef struct AFXDPState {
34     NetClientState       nc;
35 
36     struct xsk_socket    *xsk;
37     struct xsk_ring_cons rx;
38     struct xsk_ring_prod tx;
39     struct xsk_ring_cons cq;
40     struct xsk_ring_prod fq;
41 
42     char                 ifname[IFNAMSIZ];
43     int                  ifindex;
44     bool                 read_poll;
45     bool                 write_poll;
46     uint32_t             outstanding_tx;
47 
48     uint64_t             *pool;
49     uint32_t             n_pool;
50     char                 *buffer;
51     struct xsk_umem      *umem;
52 
53     uint32_t             n_queues;
54     uint32_t             xdp_flags;
55     bool                 inhibit;
56 } AFXDPState;
57 
/*
 * Maximum number of Rx descriptors taken per af_xdp_send() call, and the
 * number of buffers it asks af_xdp_fq_refill() to put back afterwards.
 */
#define AF_XDP_BATCH_SIZE 64

/* fd callbacks, needed by af_xdp_update_fd_handler() before definition. */
static void af_xdp_writable(void *opaque);
static void af_xdp_send(void *opaque);
62 
63 /* Set the event-loop handlers for the af-xdp backend. */
64 static void af_xdp_update_fd_handler(AFXDPState *s)
65 {
66     qemu_set_fd_handler(xsk_socket__fd(s->xsk),
67                         s->read_poll ? af_xdp_send : NULL,
68                         s->write_poll ? af_xdp_writable : NULL,
69                         s);
70 }
71 
72 /* Update the read handler. */
73 static void af_xdp_read_poll(AFXDPState *s, bool enable)
74 {
75     if (s->read_poll != enable) {
76         s->read_poll = enable;
77         af_xdp_update_fd_handler(s);
78     }
79 }
80 
81 /* Update the write handler. */
82 static void af_xdp_write_poll(AFXDPState *s, bool enable)
83 {
84     if (s->write_poll != enable) {
85         s->write_poll = enable;
86         af_xdp_update_fd_handler(s);
87     }
88 }
89 
90 static void af_xdp_poll(NetClientState *nc, bool enable)
91 {
92     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
93 
94     if (s->read_poll != enable || s->write_poll != enable) {
95         s->write_poll = enable;
96         s->read_poll  = enable;
97         af_xdp_update_fd_handler(s);
98     }
99 }
100 
101 static void af_xdp_complete_tx(AFXDPState *s)
102 {
103     uint32_t idx = 0;
104     uint32_t done, i;
105     uint64_t *addr;
106 
107     done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS, &idx);
108 
109     for (i = 0; i < done; i++) {
110         addr = (void *) xsk_ring_cons__comp_addr(&s->cq, idx++);
111         s->pool[s->n_pool++] = *addr;
112         s->outstanding_tx--;
113     }
114 
115     if (done) {
116         xsk_ring_cons__release(&s->cq, done);
117     }
118 }
119 
120 /*
121  * The fd_write() callback, invoked if the fd is marked as writable
122  * after a poll.
123  */
124 static void af_xdp_writable(void *opaque)
125 {
126     AFXDPState *s = opaque;
127 
128     /* Try to recover buffers that are already sent. */
129     af_xdp_complete_tx(s);
130 
131     /*
132      * Unregister the handler, unless we still have packets to transmit
133      * and kernel needs a wake up.
134      */
135     if (!s->outstanding_tx || !xsk_ring_prod__needs_wakeup(&s->tx)) {
136         af_xdp_write_poll(s, false);
137     }
138 
139     /* Flush any buffered packets. */
140     qemu_flush_queued_packets(&s->nc);
141 }
142 
143 static ssize_t af_xdp_receive(NetClientState *nc,
144                               const uint8_t *buf, size_t size)
145 {
146     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
147     struct xdp_desc *desc;
148     uint32_t idx;
149     void *data;
150 
151     /* Try to recover buffers that are already sent. */
152     af_xdp_complete_tx(s);
153 
154     if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
155         /* We can't transmit packet this size... */
156         return size;
157     }
158 
159     if (!s->n_pool || !xsk_ring_prod__reserve(&s->tx, 1, &idx)) {
160         /*
161          * Out of buffers or space in tx ring.  Poll until we can write.
162          * This will also kick the Tx, if it was waiting on CQ.
163          */
164         af_xdp_write_poll(s, true);
165         return 0;
166     }
167 
168     desc = xsk_ring_prod__tx_desc(&s->tx, idx);
169     desc->addr = s->pool[--s->n_pool];
170     desc->len = size;
171 
172     data = xsk_umem__get_data(s->buffer, desc->addr);
173     memcpy(data, buf, size);
174 
175     xsk_ring_prod__submit(&s->tx, 1);
176     s->outstanding_tx++;
177 
178     if (xsk_ring_prod__needs_wakeup(&s->tx)) {
179         af_xdp_write_poll(s, true);
180     }
181 
182     return size;
183 }
184 
185 /*
186  * Complete a previous send (backend --> guest) and enable the
187  * fd_read callback.
188  */
189 static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
190 {
191     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
192 
193     af_xdp_read_poll(s, true);
194 }
195 
196 static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
197 {
198     uint32_t i, idx = 0;
199 
200     /* Leave one packet for Tx, just in case. */
201     if (s->n_pool < n + 1) {
202         n = s->n_pool;
203     }
204 
205     if (!n || !xsk_ring_prod__reserve(&s->fq, n, &idx)) {
206         return;
207     }
208 
209     for (i = 0; i < n; i++) {
210         *xsk_ring_prod__fill_addr(&s->fq, idx++) = s->pool[--s->n_pool];
211     }
212     xsk_ring_prod__submit(&s->fq, n);
213 
214     if (xsk_ring_prod__needs_wakeup(&s->fq)) {
215         /* Receive was blocked by not having enough buffers.  Wake it up. */
216         af_xdp_read_poll(s, true);
217     }
218 }
219 
220 static void af_xdp_send(void *opaque)
221 {
222     uint32_t i, n_rx, idx = 0;
223     AFXDPState *s = opaque;
224 
225     n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
226     if (!n_rx) {
227         return;
228     }
229 
230     for (i = 0; i < n_rx; i++) {
231         const struct xdp_desc *desc;
232         struct iovec iov;
233 
234         desc = xsk_ring_cons__rx_desc(&s->rx, idx++);
235 
236         iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
237         iov.iov_len = desc->len;
238 
239         s->pool[s->n_pool++] = desc->addr;
240 
241         if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
242                                      af_xdp_send_completed)) {
243             /*
244              * The peer does not receive anymore.  Packet is queued, stop
245              * reading from the backend until af_xdp_send_completed().
246              */
247             af_xdp_read_poll(s, false);
248 
249             /* Return unused descriptors to not break the ring cache. */
250             xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
251             n_rx = i + 1;
252             break;
253         }
254     }
255 
256     /* Release actually sent descriptors and try to re-fill. */
257     xsk_ring_cons__release(&s->rx, n_rx);
258     af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
259 }
260 
261 /* Flush and close. */
262 static void af_xdp_cleanup(NetClientState *nc)
263 {
264     AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
265 
266     qemu_purge_queued_packets(nc);
267 
268     af_xdp_poll(nc, false);
269 
270     xsk_socket__delete(s->xsk);
271     s->xsk = NULL;
272     g_free(s->pool);
273     s->pool = NULL;
274     xsk_umem__delete(s->umem);
275     s->umem = NULL;
276     qemu_vfree(s->buffer);
277     s->buffer = NULL;
278 
279     /* Remove the program if it's the last open queue. */
280     if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
281         && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
282         fprintf(stderr,
283                 "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
284                 s->ifname, s->ifindex);
285     }
286 }
287 
288 static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
289 {
290     struct xsk_umem_config config = {
291         .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
292         .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
293         .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
294         .frame_headroom = 0,
295     };
296     uint64_t n_descs;
297     uint64_t size;
298     int64_t i;
299     int ret;
300 
301     /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
302     n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
303                + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
304     size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;
305 
306     s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
307     memset(s->buffer, 0, size);
308 
309     if (sock_fd < 0) {
310         ret = xsk_umem__create(&s->umem, s->buffer, size,
311                                &s->fq, &s->cq, &config);
312     } else {
313         ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
314                                        &s->fq, &s->cq, &config);
315     }
316 
317     if (ret) {
318         qemu_vfree(s->buffer);
319         error_setg_errno(errp, errno,
320                          "failed to create umem for %s queue_index: %d",
321                          s->ifname, s->nc.queue_index);
322         return -1;
323     }
324 
325     s->pool = g_new(uint64_t, n_descs);
326     /* Fill the pool in the opposite order, because it's a LIFO queue. */
327     for (i = n_descs; i >= 0; i--) {
328         s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
329     }
330     s->n_pool = n_descs;
331 
332     af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);
333 
334     return 0;
335 }
336 
337 static int af_xdp_socket_create(AFXDPState *s,
338                                 const NetdevAFXDPOptions *opts, Error **errp)
339 {
340     struct xsk_socket_config cfg = {
341         .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
342         .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
343         .libxdp_flags = 0,
344         .bind_flags = XDP_USE_NEED_WAKEUP,
345         .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
346     };
347     int queue_id, error = 0;
348 
349     s->inhibit = opts->has_inhibit && opts->inhibit;
350     if (s->inhibit) {
351         cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
352     }
353 
354     if (opts->has_force_copy && opts->force_copy) {
355         cfg.bind_flags |= XDP_COPY;
356     }
357 
358     queue_id = s->nc.queue_index;
359     if (opts->has_start_queue && opts->start_queue > 0) {
360         queue_id += opts->start_queue;
361     }
362 
363     if (opts->has_mode) {
364         /* Specific mode requested. */
365         cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
366                          ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
367         if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
368                                s->umem, &s->rx, &s->tx, &cfg)) {
369             error = errno;
370         }
371     } else {
372         /* No mode requested, try native first. */
373         cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;
374 
375         if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
376                                s->umem, &s->rx, &s->tx, &cfg)) {
377             /* Can't use native mode, try skb. */
378             cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
379             cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;
380 
381             if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
382                                    s->umem, &s->rx, &s->tx, &cfg)) {
383                 error = errno;
384             }
385         }
386     }
387 
388     if (error) {
389         error_setg_errno(errp, error,
390                          "failed to create AF_XDP socket for %s queue_id: %d",
391                          s->ifname, queue_id);
392         return -1;
393     }
394 
395     s->xdp_flags = cfg.xdp_flags;
396 
397     return 0;
398 }
399 
400 /* NetClientInfo methods. */
401 static NetClientInfo net_af_xdp_info = {
402     .type = NET_CLIENT_DRIVER_AF_XDP,
403     .size = sizeof(AFXDPState),
404     .receive = af_xdp_receive,
405     .poll = af_xdp_poll,
406     .cleanup = af_xdp_cleanup,
407 };
408 
409 static int *parse_socket_fds(const char *sock_fds_str,
410                              int64_t n_expected, Error **errp)
411 {
412     gchar **substrings = g_strsplit(sock_fds_str, ":", -1);
413     int64_t i, n_sock_fds = g_strv_length(substrings);
414     int *sock_fds = NULL;
415 
416     if (n_sock_fds != n_expected) {
417         error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
418                    n_expected, n_sock_fds);
419         goto exit;
420     }
421 
422     sock_fds = g_new(int, n_sock_fds);
423 
424     for (i = 0; i < n_sock_fds; i++) {
425         sock_fds[i] = monitor_fd_param(monitor_cur(), substrings[i], errp);
426         if (sock_fds[i] < 0) {
427             g_free(sock_fds);
428             sock_fds = NULL;
429             goto exit;
430         }
431     }
432 
433 exit:
434     g_strfreev(substrings);
435     return sock_fds;
436 }
437 
438 /*
439  * The exported init function.
440  *
441  * ... -netdev af-xdp,ifname="..."
442  */
443 int net_init_af_xdp(const Netdev *netdev,
444                     const char *name, NetClientState *peer, Error **errp)
445 {
446     const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
447     NetClientState *nc, *nc0 = NULL;
448     unsigned int ifindex;
449     uint32_t prog_id = 0;
450     int *sock_fds = NULL;
451     int64_t i, queues;
452     Error *err = NULL;
453     AFXDPState *s;
454 
455     ifindex = if_nametoindex(opts->ifname);
456     if (!ifindex) {
457         error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
458                          opts->ifname);
459         return -1;
460     }
461 
462     queues = opts->has_queues ? opts->queues : 1;
463     if (queues < 1) {
464         error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
465                    queues, opts->ifname);
466         return -1;
467     }
468 
469     if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
470         error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
471         return -1;
472     }
473 
474     if (opts->sock_fds) {
475         sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
476         if (!sock_fds) {
477             return -1;
478         }
479     }
480 
481     for (i = 0; i < queues; i++) {
482         nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
483         qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
484         nc->queue_index = i;
485 
486         if (!nc0) {
487             nc0 = nc;
488         }
489 
490         s = DO_UPCAST(AFXDPState, nc, nc);
491 
492         pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
493         s->ifindex = ifindex;
494         s->n_queues = queues;
495 
496         if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
497             || af_xdp_socket_create(s, opts, errp)) {
498             /* Make sure the XDP program will be removed. */
499             s->n_queues = i;
500             error_propagate(errp, err);
501             goto err;
502         }
503     }
504 
505     if (nc0) {
506         s = DO_UPCAST(AFXDPState, nc, nc0);
507         if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
508             error_setg_errno(errp, errno,
509                              "no XDP program loaded on '%s', ifindex: %d",
510                              s->ifname, s->ifindex);
511             goto err;
512         }
513     }
514 
515     af_xdp_read_poll(s, true); /* Initially only poll for reads. */
516 
517     return 0;
518 
519 err:
520     g_free(sock_fds);
521     if (nc0) {
522         qemu_del_net_client(nc0);
523     }
524 
525     return -1;
526 }
527