/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qemu/defer-call.h"
#include "qapi/error.h"
#include "sysemu/block-backend.h"

/* Only used for assertions. */
#include "qemu/coroutine_int.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest. If we get more outstanding requests at a time
 * than this we will get EAGAIN from io_submit which is communicated to
 * the guest as an I/O error.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch (default value). */
#define DEFAULT_MAX_BATCH 32

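/*
 * State for a single in-flight request.  Submission happens in coroutine
 * context: the submitting coroutine is recorded in @co and is woken again
 * once @ret has been filled in with the completion status.
 */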
struct qemu_laiocb {
    Coroutine *co;
    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

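/*
 * Submission queue.  Requests wait in @pending until ioq_submit() passes
 * them to the kernel in batches: @in_queue counts queued-but-unsubmitted
 * requests, @in_flight counts requests the kernel has accepted, and
 * @blocked is set while not everything could be submitted (for example
 * io_submit() returned EAGAIN or MAX_EVENTS requests are in flight).
 */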
typedef struct {
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

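/*
 * Per-AioContext Linux AIO state.  @completion_bh together with the
 * @event_idx/@event_max cursors allows completion processing to be
 * re-entered from nested event loops (see qemu_laio_process_completions()).
 */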
struct LinuxAioState {
    AioContext *aio_context;

    io_context_t ctx;
    EventNotifier e;

    /* No locking required, only accessed from AioContext home thread */
    LaioQueue io_q;
    QEMUBH *completion_bh;
    int event_idx;
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

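/*
 * Fold the two result words of a completion event back into one signed
 * value.  The rest of this file treats the result as -errno on failure and
 * as the number of bytes transferred on success.
 */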
static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}

/*
 * Completes an AIO request.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                                  laiocb->qiov->size - ret);
            } else {
                ret = -ENOSPC;
            }
        }
    }

    laiocb->ret = ret;

    /*
     * If the coroutine is already entered it must be in ioq_submit() and
     * will notice laiocb->ret has been filled in when it eventually runs
     * later. Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    assert(laiocb->co->ctx == laiocb->ctx->aio_context);
    if (!qemu_coroutine_entered(laiocb->co)) {
        aio_co_wake(laiocb->co);
    }
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; a common header does not exist,
 * but AIO has existed for ages so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned id;    /* kernel internal index number */
    unsigned nr;    /* number of io_events */
    unsigned head;  /* Written to by userland or by kernel. */
    unsigned tail;

    unsigned magic;
    unsigned compat_features;
    unsigned incompat_features;
    unsigned header_length; /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer to the events array, output value
 *
 * Returns the number of completed events and sets a pointer
 * to the events array. This function does not update the internal
 * ring buffer, only reads head and tail. When @events has been
 * processed io_getevents_commit() must be called.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of s->events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr) {
        ring->head = (ring->head + nr) % ring->nr;
    }
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer to the events array, output value
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer and returns the number of elements
 * left to process.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll(). In order to do this,
 * indices are kept in LinuxAioState. The function schedules completion as a
 * BH so it can be called again from a nested event loop. When there are no
 * events left to complete, the BH is canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    defer_call_begin();

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero; the upper level will then jump out of
     * its own `for` loop. If we are the last level, all counters drop to
     * zero. */
    s->event_max = 0;
    s->event_idx = 0;

    defer_call_end();
}

static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);

    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_laio_process_completions_and_submit(s);
    }
}

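/*
 * Polling-mode hooks: qemu_laio_poll_cb() peeks at the completion ring
 * without a syscall to detect new completions, and qemu_laio_poll_ready()
 * processes them once the event loop sees the poll callback return true.
 */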
static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}

static void ioq_init(LaioQueue *io_q)
{
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
}

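/*
 * Submit pending requests to the kernel in batches, keeping at most
 * MAX_EVENTS requests in flight.  On EAGAIN the remainder stays queued
 * (io_q.blocked) until a completion frees up resources; on any other error
 * only the head request is failed and submission of the rest is retried.
 */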
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0). We do not attempt
         * to repeat submission to avoid an I/O hang. The reason is simple:
         * s->e is still set and the completion callback will be called
         * shortly, so all pending requests will be submitted from there.
         */
    }
}

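/*
 * For example, with aio_max_batch unset and no device-specific limit
 * (dev_max_batch == 0), the batch size is DEFAULT_MAX_BATCH (32), further
 * clamped to the number of free MAX_EVENTS slots; MIN_NON_ZERO() ignores
 * zero-valued limits.
 */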
static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;

    /*
     * AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);

    /* limit the batch to the number of available events */
    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);

    return max_batch;
}

static void laio_deferred_fn(void *opaque)
{
    LinuxAioState *s = opaque;

    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

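/*
 * Prepare the iocb for @type and queue the request.  It is submitted
 * immediately once a full batch has accumulated; otherwise submission is
 * deferred via defer_call() so that requests issued back-to-back can be
 * batched into a single io_submit() call.
 */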
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *iocbs = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_ZONE_APPEND:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_FLUSH:
        io_prep_fdsync(iocbs, fd);
        break;
    /* Currently the Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        return -EIO;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;
    if (!s->io_q.blocked) {
        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
            ioq_submit(s);
        } else {
            defer_call(laio_deferred_fn, s);
        }
    }

    return 0;
}

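/*
 * Called from coroutine context.  Yields until the request completes,
 * unless it already completed (or failed) during submission; @qiov may be
 * NULL for QEMU_AIO_FLUSH requests.
 */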
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
                                int type, uint64_t dev_max_batch)
{
    int ret;
    AioContext *ctx = qemu_get_current_aio_context();
    struct qemu_laiocb laiocb = {
        .co         = qemu_coroutine_self(),
        .nbytes     = qiov ? qiov->size : 0,
        .ctx        = aio_get_linux_aio(ctx),
        .ret        = -EINPROGRESS,
        .is_read    = (type == QEMU_AIO_READ),
        .qiov       = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
    if (ret < 0) {
        return ret;
    }

    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}

void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

LinuxAioState *laio_init(Error **errp)
{
    int rc;
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto out_free_state;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}

void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                __func__, &s->ctx);
    }
    g_free(s);
}

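/*
 * Probe for fdsync support by submitting a real fdsync request in a
 * throwaway AIO context: kernels without support for IO_CMD_FDSYNC reject
 * it with -EINVAL at io_submit() time.
 */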
bool laio_has_fdsync(int fd)
{
    struct iocb cb;
    struct iocb *cbs[] = {&cb, NULL};

    io_context_t ctx = 0;
    io_setup(1, &ctx);

    /* check if the host kernel supports IO_CMD_FDSYNC */
    io_prep_fdsync(&cb, fd);
    int ret = io_submit(ctx, 1, cbs);

    io_destroy(ctx);
    return ret != -EINVAL;
}