xref: /openbmc/qemu/block/linux-aio.c (revision 36c1febe3f34ae38db375865b7841165d76cdae4)
110fb6e06SPaolo Bonzini /*
210fb6e06SPaolo Bonzini  * Linux native AIO support.
310fb6e06SPaolo Bonzini  *
410fb6e06SPaolo Bonzini  * Copyright (C) 2009 IBM, Corp.
510fb6e06SPaolo Bonzini  * Copyright (C) 2009 Red Hat, Inc.
610fb6e06SPaolo Bonzini  *
710fb6e06SPaolo Bonzini  * This work is licensed under the terms of the GNU GPL, version 2 or later.
810fb6e06SPaolo Bonzini  * See the COPYING file in the top-level directory.
910fb6e06SPaolo Bonzini  */
1080c71a24SPeter Maydell #include "qemu/osdep.h"
11737e150eSPaolo Bonzini #include "block/aio.h"
121de7afc9SPaolo Bonzini #include "qemu/queue.h"
132174f12bSKevin Wolf #include "block/block.h"
1410fb6e06SPaolo Bonzini #include "block/raw-aio.h"
151de7afc9SPaolo Bonzini #include "qemu/event_notifier.h"
162174f12bSKevin Wolf #include "qemu/coroutine.h"
17433fcea4SStefan Hajnoczi #include "qemu/defer-call.h"
18ed6e2161SNishanth Aravamudan #include "qapi/error.h"
1907668288SStefan Hajnoczi #include "sysemu/block-backend.h"
2010fb6e06SPaolo Bonzini 
21ab50533bSEmanuele Giuseppe Esposito /* Only used for assertions.  */
22ab50533bSEmanuele Giuseppe Esposito #include "qemu/coroutine_int.h"
23ab50533bSEmanuele Giuseppe Esposito 
2410fb6e06SPaolo Bonzini #include <libaio.h>
2510fb6e06SPaolo Bonzini 
2610fb6e06SPaolo Bonzini /*
2710fb6e06SPaolo Bonzini  * Queue size (per-device).
2810fb6e06SPaolo Bonzini  *
2910fb6e06SPaolo Bonzini  * XXX: eventually we need to communicate this to the guest and/or make it
3010fb6e06SPaolo Bonzini  *      tunable by the guest.  If we get more outstanding requests at a time
3110fb6e06SPaolo Bonzini  *      than this we will get EAGAIN from io_submit which is communicated to
3210fb6e06SPaolo Bonzini  *      the guest as an I/O error.
3310fb6e06SPaolo Bonzini  */
342558cb8dSWangyong #define MAX_EVENTS 1024
3510fb6e06SPaolo Bonzini 
36d7ddd0a1SStefano Garzarella /* Maximum number of requests in a batch. (default value) */
37d7ddd0a1SStefano Garzarella #define DEFAULT_MAX_BATCH 32
38d7ddd0a1SStefano Garzarella 
3910fb6e06SPaolo Bonzini struct qemu_laiocb {
402174f12bSKevin Wolf     Coroutine *co;
41dd7f7ed1SPaolo Bonzini     LinuxAioState *ctx;
4210fb6e06SPaolo Bonzini     struct iocb iocb;
4310fb6e06SPaolo Bonzini     ssize_t ret;
4410fb6e06SPaolo Bonzini     size_t nbytes;
4510fb6e06SPaolo Bonzini     QEMUIOVector *qiov;
4610fb6e06SPaolo Bonzini     bool is_read;
4728b24087SPaolo Bonzini     QSIMPLEQ_ENTRY(qemu_laiocb) next;
4810fb6e06SPaolo Bonzini };
4910fb6e06SPaolo Bonzini 
501b3abdccSMing Lei typedef struct {
515e1b34a3SRoman Pen     unsigned int in_queue;
525e1b34a3SRoman Pen     unsigned int in_flight;
5343f2376eSPaolo Bonzini     bool blocked;
5428b24087SPaolo Bonzini     QSIMPLEQ_HEAD(, qemu_laiocb) pending;
551b3abdccSMing Lei } LaioQueue;
561b3abdccSMing Lei 
57dd7f7ed1SPaolo Bonzini struct LinuxAioState {
580187f5c9SPaolo Bonzini     AioContext *aio_context;
590187f5c9SPaolo Bonzini 
6010fb6e06SPaolo Bonzini     io_context_t ctx;
6110fb6e06SPaolo Bonzini     EventNotifier e;
621b3abdccSMing Lei 
63ab50533bSEmanuele Giuseppe Esposito     /* No locking required, only accessed from AioContext home thread */
641b3abdccSMing Lei     LaioQueue io_q;
652cdff7f6SStefan Hajnoczi     QEMUBH *completion_bh;
662cdff7f6SStefan Hajnoczi     int event_idx;
672cdff7f6SStefan Hajnoczi     int event_max;
6810fb6e06SPaolo Bonzini };
6910fb6e06SPaolo Bonzini 
70dd7f7ed1SPaolo Bonzini static void ioq_submit(LinuxAioState *s);
7128b24087SPaolo Bonzini 
io_event_ret(struct io_event * ev)7210fb6e06SPaolo Bonzini static inline ssize_t io_event_ret(struct io_event *ev)
7310fb6e06SPaolo Bonzini {
7410fb6e06SPaolo Bonzini     return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
7510fb6e06SPaolo Bonzini }
7610fb6e06SPaolo Bonzini 
7710fb6e06SPaolo Bonzini /*
782b02fd81SJulia Suvorova  * Completes an AIO request.
7910fb6e06SPaolo Bonzini  */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            /* The whole request transferred: success. */
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                    laiocb->qiov->size - ret);
            } else {
                /* Short writes have no EOF meaning; report out of space. */
                ret = -ENOSPC;
            }
        }
    }

    /* Publish the final status before (possibly) waking the coroutine. */
    laiocb->ret = ret;

    /*
     * If the coroutine is already entered it must be in ioq_submit() and
     * will notice laio->ret has been filled in when it eventually runs
     * later.  Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    assert(laiocb->co->ctx == laiocb->ctx->aio_context);
    if (!qemu_coroutine_entered(laiocb->co)) {
        aio_co_wake(laiocb->co);
    }
}
11210fb6e06SPaolo Bonzini 
1139e909a58SRoman Pen /**
1149e909a58SRoman Pen  * aio_ring buffer which is shared between userspace and kernel.
1159e909a58SRoman Pen  *
1169e909a58SRoman Pen  * This copied from linux/fs/aio.c, common header does not exist
1179e909a58SRoman Pen  * but AIO exists for ages so we assume ABI is stable.
1189e909a58SRoman Pen  */
struct aio_ring {
    unsigned    id;    /* kernel internal index number */
    unsigned    nr;    /* number of io_events */
    unsigned    head;  /* Written to by userland or by kernel. */
    unsigned    tail;  /* NOTE(review): presumably advanced by the kernel as
                        * events complete — confirm against linux/fs/aio.c */

    unsigned    magic;
    unsigned    compat_features;
    unsigned    incompat_features;
    unsigned    header_length;  /* size of aio_ring */

    /* Flexible array member: the event slots follow the header in the
     * same mapping. */
    struct io_event io_events[];
};
1329e909a58SRoman Pen 
1339e909a58SRoman Pen /**
1349e909a58SRoman Pen  * io_getevents_peek:
1359e909a58SRoman Pen  * @ctx: AIO context
1369e909a58SRoman Pen  * @events: pointer on events array, output value
1379e909a58SRoman Pen 
1389e909a58SRoman Pen  * Returns the number of completed events and sets a pointer
1399e909a58SRoman Pen  * on events array.  This function does not update the internal
1409e909a58SRoman Pen  * ring buffer, only reads head and tail.  When @events has been
1419e909a58SRoman Pen  * processed io_getevents_commit() must be called.
1429e909a58SRoman Pen  */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    /* The io_context_t handle is in fact a pointer to the aio_ring that
     * the kernel mapped into userspace (see struct aio_ring above). */
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    /* If the ring wrapped, only report the contiguous run up to the end
     * of the buffer; callers commit and peek again for the remainder. */
    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of s->events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}
1589e909a58SRoman Pen 
1599e909a58SRoman Pen /**
1609e909a58SRoman Pen  * io_getevents_commit:
1619e909a58SRoman Pen  * @ctx: AIO context
1629e909a58SRoman Pen  * @nr: the number of events on which head should be advanced
1639e909a58SRoman Pen  *
1649e909a58SRoman Pen  * Advances head of a ring buffer.
1659e909a58SRoman Pen  */
/* Consume @nr events by advancing the ring's head (with wrap-around). */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr == 0) {
        return;
    }
    ring->head = (ring->head + nr) % ring->nr;
}
1749e909a58SRoman Pen 
1759e909a58SRoman Pen /**
1769e909a58SRoman Pen  * io_getevents_advance_and_peek:
1779e909a58SRoman Pen  * @ctx: AIO context
1789e909a58SRoman Pen  * @events: pointer on events array, output value
1799e909a58SRoman Pen  * @nr: the number of events on which head should be advanced
1809e909a58SRoman Pen  *
1819e909a58SRoman Pen  * Advances head of a ring buffer and returns number of elements left.
1829e909a58SRoman Pen  */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    /* Retire the @nr events already processed, then report how many
     * completed events are still waiting (and where they start). */
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}
1919e909a58SRoman Pen 
1923407de57SRoman Pen /**
1933407de57SRoman Pen  * qemu_laio_process_completions:
1943407de57SRoman Pen  * @s: AIO state
1953407de57SRoman Pen  *
1963407de57SRoman Pen  * Fetches completed I/O requests and invokes their callbacks.
1972cdff7f6SStefan Hajnoczi  *
1982cdff7f6SStefan Hajnoczi  * The function is somewhat tricky because it supports nested event loops, for
1992cdff7f6SStefan Hajnoczi  * example when a request callback invokes aio_poll().  In order to do this,
2003407de57SRoman Pen  * indices are kept in LinuxAioState.  Function schedules BH completion so it
2013407de57SRoman Pen  * can be called again in a nested event loop.  When there are no events left
2023407de57SRoman Pen  * to complete the BH is being canceled.
2032cdff7f6SStefan Hajnoczi  */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Batch any submissions triggered from completion callbacks. */
    defer_call_begin();

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /* event_idx/event_max live in LinuxAioState (not locals) so that a
     * nested invocation from a callback's aio_poll() resumes where the
     * outer iteration stopped instead of re-consuming events. */
    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            /* May re-enter this function via a nested event loop. */
            qemu_laio_process_completion(laiocb);
        }
    }

    /* All events drained; the rescheduled BH is no longer needed. */
    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero, upper level will then jump out of it's
     * own `for` loop.  If we are the last all counters dropped to zero. */
    s->event_max = 0;
    s->event_idx = 0;

    defer_call_end();
}
23928b24087SPaolo Bonzini 
/* Reap finished requests, then push out anything still queued. */
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);

    if (QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        return;
    }
    ioq_submit(s);
}
2482cdff7f6SStefan Hajnoczi 
qemu_laio_completion_bh(void * opaque)2493407de57SRoman Pen static void qemu_laio_completion_bh(void *opaque)
2503407de57SRoman Pen {
2513407de57SRoman Pen     LinuxAioState *s = opaque;
2523407de57SRoman Pen 
2533407de57SRoman Pen     qemu_laio_process_completions_and_submit(s);
2543407de57SRoman Pen }
2553407de57SRoman Pen 
/* eventfd callback: fires when the kernel signals completed AIO events. */
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (!event_notifier_test_and_clear(&s->e)) {
        return;
    }
    qemu_laio_process_completions_and_submit(s);
}
26410fb6e06SPaolo Bonzini 
qemu_laio_poll_cb(void * opaque)265ee686975SStefan Hajnoczi static bool qemu_laio_poll_cb(void *opaque)
266ee686975SStefan Hajnoczi {
267ee686975SStefan Hajnoczi     EventNotifier *e = opaque;
268ee686975SStefan Hajnoczi     LinuxAioState *s = container_of(e, LinuxAioState, e);
269ee686975SStefan Hajnoczi     struct io_event *events;
270ee686975SStefan Hajnoczi 
271826cc324SStefan Hajnoczi     return io_getevents_peek(s->ctx, &events);
272ee686975SStefan Hajnoczi }
273ee686975SStefan Hajnoczi 
/* Invoked once qemu_laio_poll_cb() signalled that events are pending. */
static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    qemu_laio_process_completions_and_submit(
        container_of(opaque, LinuxAioState, e));
}
281ee686975SStefan Hajnoczi 
/* Reset a submission queue to its empty, unblocked initial state. */
static void ioq_init(LaioQueue *io_q)
{
    *io_q = (LaioQueue) {
        .in_queue  = 0,
        .in_flight = 0,
        .blocked   = false,
    };
    QSIMPLEQ_INIT(&io_q->pending);
}
2891b3abdccSMing Lei 
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        /* Never allow more than MAX_EVENTS requests in flight at once. */
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        /* Gather a batch of iocb pointers from the pending queue, capped
         * by the remaining in-flight budget. */
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        /* io_submit() may accept only a prefix of the batch (ret < len). */
        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            /* Kernel is out of resources; leave requests queued and retry
             * later (s->io_q.blocked is set below). */
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        /* Splice the accepted prefix off the pending queue. */
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    /* Anything left queued means submission is throttled until the next
     * completion makes room. */
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even we have completed everything (in_flight == 0), the queue can
         * have still pended requests (in_queue > 0).  We do not attempt to
         * repeat submission to avoid IO hang.  The reason is simple: s->e is
         * still set and completion callback will be called shortly and all
         * pended requests will be submitted from there.
         */
    }
}
3431b3abdccSMing Lei 
/* Compute how many requests may be batched before forcing a submission. */
static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t batch = s->aio_context->aio_max_batch;

    if (batch == 0) {
        batch = DEFAULT_MAX_BATCH;
    }

    /*
     * AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    batch = MIN_NON_ZERO(dev_max_batch, batch);

    /* limit the batch with the number of available events */
    batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, batch);

    return batch;
}
360512da211SStefano Garzarella 
laio_deferred_fn(void * opaque)361ccee48aaSStefan Hajnoczi static void laio_deferred_fn(void *opaque)
3621b3abdccSMing Lei {
36307668288SStefan Hajnoczi     LinuxAioState *s = opaque;
364ab50533bSEmanuele Giuseppe Esposito 
36507668288SStefan Hajnoczi     if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
366de354644SPaolo Bonzini         ioq_submit(s);
3671b3abdccSMing Lei     }
3681b3abdccSMing Lei }
3691b3abdccSMing Lei 
laio_do_submit(int fd,struct qemu_laiocb * laiocb,off_t offset,int type,uint64_t dev_max_batch)3702174f12bSKevin Wolf static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
371512da211SStefano Garzarella                           int type, uint64_t dev_max_batch)
37210fb6e06SPaolo Bonzini {
3732174f12bSKevin Wolf     LinuxAioState *s = laiocb->ctx;
3742174f12bSKevin Wolf     struct iocb *iocbs = &laiocb->iocb;
3752174f12bSKevin Wolf     QEMUIOVector *qiov = laiocb->qiov;
37610fb6e06SPaolo Bonzini 
37710fb6e06SPaolo Bonzini     switch (type) {
37810fb6e06SPaolo Bonzini     case QEMU_AIO_WRITE:
37910fb6e06SPaolo Bonzini         io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
38010fb6e06SPaolo Bonzini         break;
3814751d09aSSam Li     case QEMU_AIO_ZONE_APPEND:
3824751d09aSSam Li         io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
3834751d09aSSam Li         break;
38410fb6e06SPaolo Bonzini     case QEMU_AIO_READ:
38510fb6e06SPaolo Bonzini         io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
38610fb6e06SPaolo Bonzini         break;
387*24687abfSPrasad Pandit     case QEMU_AIO_FLUSH:
388*24687abfSPrasad Pandit         io_prep_fdsync(iocbs, fd);
389*24687abfSPrasad Pandit         break;
39010fb6e06SPaolo Bonzini     /* Currently Linux kernel does not support other operations */
39110fb6e06SPaolo Bonzini     default:
39210fb6e06SPaolo Bonzini         fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
39310fb6e06SPaolo Bonzini                         __func__, type);
3942174f12bSKevin Wolf         return -EIO;
39510fb6e06SPaolo Bonzini     }
39610fb6e06SPaolo Bonzini     io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
39710fb6e06SPaolo Bonzini 
39828b24087SPaolo Bonzini     QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
3995e1b34a3SRoman Pen     s->io_q.in_queue++;
40007668288SStefan Hajnoczi     if (!s->io_q.blocked) {
40107668288SStefan Hajnoczi         if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
40228b24087SPaolo Bonzini             ioq_submit(s);
40307668288SStefan Hajnoczi         } else {
404ccee48aaSStefan Hajnoczi             defer_call(laio_deferred_fn, s);
40507668288SStefan Hajnoczi         }
4061b3abdccSMing Lei     }
40710fb6e06SPaolo Bonzini 
4082174f12bSKevin Wolf     return 0;
4092174f12bSKevin Wolf }
4102174f12bSKevin Wolf 
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
                                int type, uint64_t dev_max_batch)
{
    int ret;
    AioContext *ctx = qemu_get_current_aio_context();
    struct qemu_laiocb laiocb = {
        .co         = qemu_coroutine_self(),
        /* qiov is NULL for requests without a payload (e.g. flush). */
        .nbytes     = qiov ? qiov->size : 0,
        .ctx        = aio_get_linux_aio(ctx),
        /* Sentinel distinguishing "not yet completed" from a real result. */
        .ret        = -EINPROGRESS,
        .is_read    = (type == QEMU_AIO_READ),
        .qiov       = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
    if (ret < 0) {
        return ret;
    }

    /* The request may already have completed inside laio_do_submit()
     * (ioq_submit() reaps completions); only yield if it is still
     * outstanding.  qemu_laio_process_completion() wakes us up. */
    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}
4352174f12bSKevin Wolf 
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    /* Stop monitoring the eventfd first so no callback can fire while the
     * completion BH is being torn down. */
    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}
442c2f3426cSStefan Hajnoczi 
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    /* BH used by qemu_laio_process_completions() for nested event loops. */
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    /* Register the eventfd with completion, poll and poll-ready handlers. */
    aio_set_event_notifier(new_context, &s->e,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}
452c2f3426cSStefan Hajnoczi 
laio_init(Error ** errp)453ed6e2161SNishanth Aravamudan LinuxAioState *laio_init(Error **errp)
45410fb6e06SPaolo Bonzini {
455ed6e2161SNishanth Aravamudan     int rc;
456dd7f7ed1SPaolo Bonzini     LinuxAioState *s;
45710fb6e06SPaolo Bonzini 
45810fb6e06SPaolo Bonzini     s = g_malloc0(sizeof(*s));
459ed6e2161SNishanth Aravamudan     rc = event_notifier_init(&s->e, false);
460ed6e2161SNishanth Aravamudan     if (rc < 0) {
4617a21bee2SDaniel P. Berrangé         error_setg_errno(errp, -rc, "failed to initialize event notifier");
46210fb6e06SPaolo Bonzini         goto out_free_state;
46310fb6e06SPaolo Bonzini     }
46410fb6e06SPaolo Bonzini 
465ed6e2161SNishanth Aravamudan     rc = io_setup(MAX_EVENTS, &s->ctx);
466ed6e2161SNishanth Aravamudan     if (rc < 0) {
467ed6e2161SNishanth Aravamudan         error_setg_errno(errp, -rc, "failed to create linux AIO context");
46810fb6e06SPaolo Bonzini         goto out_close_efd;
46910fb6e06SPaolo Bonzini     }
47010fb6e06SPaolo Bonzini 
4711b3abdccSMing Lei     ioq_init(&s->io_q);
4721b3abdccSMing Lei 
47310fb6e06SPaolo Bonzini     return s;
47410fb6e06SPaolo Bonzini 
47510fb6e06SPaolo Bonzini out_close_efd:
47610fb6e06SPaolo Bonzini     event_notifier_cleanup(&s->e);
47710fb6e06SPaolo Bonzini out_free_state:
47810fb6e06SPaolo Bonzini     g_free(s);
47910fb6e06SPaolo Bonzini     return NULL;
48010fb6e06SPaolo Bonzini }
481abd269b7SStefan Hajnoczi 
/* Release all resources owned by a LinuxAioState created by laio_init(). */
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    /* io_destroy() failure is reported but otherwise tolerated. */
    if (io_destroy(s->ctx)) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, &s->ctx);
    }

    g_free(s);
}
492*24687abfSPrasad Pandit 
laio_has_fdsync(int fd)493*24687abfSPrasad Pandit bool laio_has_fdsync(int fd)
494*24687abfSPrasad Pandit {
495*24687abfSPrasad Pandit     struct iocb cb;
496*24687abfSPrasad Pandit     struct iocb *cbs[] = {&cb, NULL};
497*24687abfSPrasad Pandit 
498*24687abfSPrasad Pandit     io_context_t ctx = 0;
499*24687abfSPrasad Pandit     io_setup(1, &ctx);
500*24687abfSPrasad Pandit 
501*24687abfSPrasad Pandit     /* check if host kernel supports IO_CMD_FDSYNC */
502*24687abfSPrasad Pandit     io_prep_fdsync(&cb, fd);
503*24687abfSPrasad Pandit     int ret = io_submit(ctx, 1, cbs);
504*24687abfSPrasad Pandit 
505*24687abfSPrasad Pandit     io_destroy(ctx);
506*24687abfSPrasad Pandit     return (ret == -EINVAL) ? false : true;
507*24687abfSPrasad Pandit }
508