xref: /openbmc/qemu/block/export/vduse-blk.c (revision 19a989096e5d439c78a887bb51d4d9a5310557c9)
1 /*
2  * Export QEMU block device via VDUSE
3  *
4  * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
5  *
6  * Author:
7  *   Xie Yongji <xieyongji@bytedance.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or
10  * later.  See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include <sys/eventfd.h>
15 
16 #include "qapi/error.h"
17 #include "block/export.h"
18 #include "qemu/error-report.h"
19 #include "util/block-helpers.h"
20 #include "subprojects/libvduse/libvduse.h"
21 #include "virtio-blk-handler.h"
22 
23 #include "standard-headers/linux/virtio_blk.h"
24 
/* Number of virtqueues used when the "num-queues" option is not given. */
#define VDUSE_DEFAULT_NUM_QUEUE 1
/* Size of each virtqueue used when the "queue-size" option is not given. */
#define VDUSE_DEFAULT_QUEUE_SIZE 256
27 
28 typedef struct VduseBlkExport {
29     BlockExport export;
30     VirtioBlkHandler handler;
31     VduseDev *dev;
32     uint16_t num_queues;
33     char *recon_file;
34     unsigned int inflight; /* atomic */
35     bool vqs_started;
36 } VduseBlkExport;
37 
38 typedef struct VduseBlkReq {
39     VduseVirtqElement elem;
40     VduseVirtq *vq;
41 } VduseBlkReq;
42 
/*
 * Account for one more in-flight request.
 *
 * When the counter goes from 0 to 1 a reference on the export is taken, so
 * the export stays alive for as long as requests are pending.
 */
static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
{
    unsigned int previous = qatomic_fetch_inc(&vblk_exp->inflight);

    if (previous != 0) {
        /* Some other in-flight request already holds the reference */
        return;
    }

    /* Prevent export from being deleted */
    blk_exp_ref(&vblk_exp->export);
}
50 
/*
 * Account for one finished in-flight request.
 *
 * When the counter goes from 1 to 0, waiters are kicked and the export
 * reference taken by vduse_blk_inflight_inc() is dropped.
 */
static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
{
    unsigned int previous = qatomic_fetch_dec(&vblk_exp->inflight);

    if (previous != 1) {
        /* Other requests are still pending */
        return;
    }

    /* Wake AIO_WAIT_WHILE() */
    aio_wait_kick();

    /* Now the export can be deleted */
    blk_exp_unref(&vblk_exp->export);
}
61 
/*
 * Hand a finished request back to its virtqueue and release it.
 *
 * @req:    request to complete; freed by this function
 * @in_len: length passed to vduse_queue_push() for the element
 */
static void vduse_blk_req_complete(VduseBlkReq *req, size_t in_len)
{
    VduseVirtq *queue = req->vq;

    vduse_queue_push(queue, &req->elem, in_len);
    vduse_queue_notify(queue);

    free(req);
}
69 
vduse_blk_virtio_process_req(void * opaque)70 static void coroutine_fn vduse_blk_virtio_process_req(void *opaque)
71 {
72     VduseBlkReq *req = opaque;
73     VduseVirtq *vq = req->vq;
74     VduseDev *dev = vduse_queue_get_dev(vq);
75     VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
76     VirtioBlkHandler *handler = &vblk_exp->handler;
77     VduseVirtqElement *elem = &req->elem;
78     struct iovec *in_iov = elem->in_sg;
79     struct iovec *out_iov = elem->out_sg;
80     unsigned in_num = elem->in_num;
81     unsigned out_num = elem->out_num;
82     int in_len;
83 
84     in_len = virtio_blk_process_req(handler, in_iov,
85                                     out_iov, in_num, out_num);
86     if (in_len < 0) {
87         free(req);
88         return;
89     }
90 
91     vduse_blk_req_complete(req, in_len);
92     vduse_blk_inflight_dec(vblk_exp);
93 }
94 
/*
 * Pop every available request from @vq and start processing each of them in
 * its own coroutine.
 *
 * @dev: device owning @vq; its private pointer is the VduseBlkExport
 * @vq:  virtqueue to service
 */
static void vduse_blk_vq_handler(VduseDev *dev, VduseVirtq *vq)
{
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
    VduseBlkReq *req;

    while ((req = vduse_queue_pop(vq, sizeof(VduseBlkReq))) != NULL) {
        Coroutine *co;

        req->vq = vq;
        co = qemu_coroutine_create(vduse_blk_virtio_process_req, req);

        /* Counted before entering; the coroutine decrements when done */
        vduse_blk_inflight_inc(vblk_exp);
        qemu_coroutine_enter(co);
    }
}
115 
on_vduse_vq_kick(void * opaque)116 static void on_vduse_vq_kick(void *opaque)
117 {
118     VduseVirtq *vq = opaque;
119     VduseDev *dev = vduse_queue_get_dev(vq);
120     int fd = vduse_queue_get_fd(vq);
121     eventfd_t kick_data;
122 
123     if (eventfd_read(fd, &kick_data) == -1) {
124         error_report("failed to read data from eventfd");
125         return;
126     }
127 
128     vduse_blk_vq_handler(dev, vq);
129 }
130 
/*
 * Start servicing @vq: install the kick handler on the export's AioContext
 * and write to the eventfd so the handler runs at least once.
 *
 * Does nothing while the export's virtqueues are stopped
 * (vqs_started == false).
 */
static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
{
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
    int kick_fd;

    if (!vblk_exp->vqs_started) {
        return; /* vduse_blk_drained_end() will start vqs later */
    }

    kick_fd = vduse_queue_get_fd(vq);
    aio_set_fd_handler(vblk_exp->export.ctx, kick_fd,
                       on_vduse_vq_kick, NULL, NULL, NULL, vq);

    /* Make sure we don't miss any kick after reconnecting */
    eventfd_write(kick_fd, 1);
}
144 
/*
 * Stop servicing @vq by removing its kick handler from the export's
 * AioContext. A queue whose fd is negative has nothing to remove.
 */
static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
{
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
    int kick_fd = vduse_queue_get_fd(vq);

    if (kick_fd >= 0) {
        aio_set_fd_handler(vblk_exp->export.ctx, kick_fd,
                           NULL, NULL, NULL, NULL, NULL);
    }
}
157 
158 static const VduseOps vduse_blk_ops = {
159     .enable_queue = vduse_blk_enable_queue,
160     .disable_queue = vduse_blk_disable_queue,
161 };
162 
on_vduse_dev_kick(void * opaque)163 static void on_vduse_dev_kick(void *opaque)
164 {
165     VduseDev *dev = opaque;
166 
167     vduse_dev_handler(dev);
168 }
169 
/*
 * Install the device fd handler on the export's AioContext.
 *
 * Note that @ctx itself is not used: the handler is registered on
 * vblk_exp->export.ctx, which blk_aio_attached() sets to @ctx before
 * calling this function.
 */
static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
{
    VduseDev *vduse_dev = vblk_exp->dev;
    int dev_fd = vduse_dev_get_fd(vduse_dev);

    aio_set_fd_handler(vblk_exp->export.ctx, dev_fd,
                       on_vduse_dev_kick, NULL, NULL, NULL,
                       vduse_dev);

    /* Virtqueues are handled by vduse_blk_drained_end() */
}
178 
/*
 * Remove the device fd handler from the export's current AioContext.
 */
static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
{
    int dev_fd = vduse_dev_get_fd(vblk_exp->dev);

    aio_set_fd_handler(vblk_exp->export.ctx, dev_fd,
                       NULL, NULL, NULL, NULL, NULL);

    /* Virtqueues are handled by vduse_blk_drained_begin() */
}
186 
187 
/*
 * AioContext notifier: the BlockBackend was attached to @ctx.
 *
 * @ctx:    the new AioContext
 * @opaque: the VduseBlkExport registered with the notifier
 *
 * Records the new context in the export first, then installs the device fd
 * handler on it.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    VduseBlkExport *self = opaque;

    self->export.ctx = ctx;
    vduse_blk_attach_ctx(self, ctx);
}
195 
blk_aio_detach(void * opaque)196 static void blk_aio_detach(void *opaque)
197 {
198     VduseBlkExport *vblk_exp = opaque;
199 
200     vduse_blk_detach_ctx(vblk_exp);
201     vblk_exp->export.ctx = NULL;
202 }
203 
vduse_blk_resize(void * opaque)204 static void vduse_blk_resize(void *opaque)
205 {
206     BlockExport *exp = opaque;
207     VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
208     struct virtio_blk_config config;
209 
210     config.capacity =
211             cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
212     vduse_dev_update_config(vblk_exp->dev, sizeof(config.capacity),
213                             offsetof(struct virtio_blk_config, capacity),
214                             (char *)&config.capacity);
215 }
216 
/*
 * Remove the kick handler of every virtqueue, then mark the virtqueues as
 * stopped so vduse_blk_enable_queue() will not re-install handlers.
 */
static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
{
    uint16_t idx;

    for (idx = 0; idx < vblk_exp->num_queues; idx++) {
        vduse_blk_disable_queue(vblk_exp->dev,
                                vduse_dev_get_queue(vblk_exp->dev, idx));
    }

    vblk_exp->vqs_started = false;
}
226 
/*
 * Mark the virtqueues as started and install the kick handler of each one.
 *
 * The flag has to be set before the loop because vduse_blk_enable_queue()
 * returns early while vqs_started is false.
 */
static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
{
    uint16_t idx;

    vblk_exp->vqs_started = true;

    for (idx = 0; idx < vblk_exp->num_queues; idx++) {
        vduse_blk_enable_queue(vblk_exp->dev,
                               vduse_dev_get_queue(vblk_exp->dev, idx));
    }
}
236 
vduse_blk_drained_begin(void * opaque)237 static void vduse_blk_drained_begin(void *opaque)
238 {
239     BlockExport *exp = opaque;
240     VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
241 
242     vduse_blk_stop_virtqueues(vblk_exp);
243 }
244 
vduse_blk_drained_end(void * opaque)245 static void vduse_blk_drained_end(void *opaque)
246 {
247     BlockExport *exp = opaque;
248     VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
249 
250     vduse_blk_start_virtqueues(vblk_exp);
251 }
252 
vduse_blk_drained_poll(void * opaque)253 static bool vduse_blk_drained_poll(void *opaque)
254 {
255     BlockExport *exp = opaque;
256     VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
257 
258     return qatomic_read(&vblk_exp->inflight) > 0;
259 }
260 
261 static const BlockDevOps vduse_block_ops = {
262     .resize_cb     = vduse_blk_resize,
263     .drained_begin = vduse_blk_drained_begin,
264     .drained_end   = vduse_blk_drained_end,
265     .drained_poll  = vduse_blk_drained_poll,
266 };
267 
/*
 * Create a VDUSE block export.
 *
 * @exp:  generic export, embedded in a VduseBlkExport
 * @opts: export options; opts->u.vduse_blk holds the VDUSE specific ones
 * @errp: set on failure
 *
 * Validates the options, fills in the virtio-blk config space and feature
 * bits, creates the VDUSE device, sets up its reconnect log file and
 * virtqueues, and finally hooks the device fd and the BlockBackend callbacks
 * into the export's AioContext.
 *
 * Returns: 0 on success, a negative errno value on failure. On failure
 * everything allocated by this function has been released again.
 */
static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
                                Error **errp)
{
    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
    BlockExportOptionsVduseBlk *vblk_opts = &opts->u.vduse_blk;
    uint64_t logical_block_size = VIRTIO_BLK_SECTOR_SIZE;
    uint16_t num_queues = VDUSE_DEFAULT_NUM_QUEUE;
    uint16_t queue_size = VDUSE_DEFAULT_QUEUE_SIZE;
    struct virtio_blk_config config = { 0 };
    uint64_t features;
    int i, ret;

    if (vblk_opts->has_num_queues) {
        num_queues = vblk_opts->num_queues;
        if (num_queues == 0) {
            error_setg(errp, "num-queues must be greater than 0");
            return -EINVAL;
        }
    }

    if (vblk_opts->has_queue_size) {
        queue_size = vblk_opts->queue_size;
        /* seg_max below is queue_size - 2, so sizes up to 2 are rejected */
        if (queue_size <= 2 || !is_power_of_2(queue_size) ||
            queue_size > VIRTQUEUE_MAX_SIZE) {
            error_setg(errp, "queue-size is invalid");
            return -EINVAL;
        }
    }

    if (vblk_opts->has_logical_block_size) {
        logical_block_size = vblk_opts->logical_block_size;
        if (!check_block_size("logical-block-size", logical_block_size,
                              errp)) {
            return -EINVAL;
        }
    }
    vblk_exp->num_queues = num_queues;
    vblk_exp->handler.blk = exp->blk;
    vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
    vblk_exp->handler.logical_block_size = logical_block_size;
    vblk_exp->handler.writable = opts->writable;
    vblk_exp->vqs_started = true;

    /* All multi-byte config space fields are stored little-endian */
    config.capacity =
            cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
    config.seg_max = cpu_to_le32(queue_size - 2);
    config.min_io_size = cpu_to_le16(1);
    config.opt_io_size = cpu_to_le32(1);
    config.num_queues = cpu_to_le16(num_queues);
    config.blk_size = cpu_to_le32(logical_block_size);
    config.max_discard_sectors = cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS);
    config.max_discard_seg = cpu_to_le32(1);
    config.discard_sector_alignment =
        cpu_to_le32(logical_block_size >> VIRTIO_BLK_SECTOR_BITS);
    config.max_write_zeroes_sectors =
        cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS);
    config.max_write_zeroes_seg = cpu_to_le32(1);

    features = vduse_get_virtio_features() |
               (1ULL << VIRTIO_BLK_F_SEG_MAX) |
               (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
               (1ULL << VIRTIO_BLK_F_BLK_SIZE) |
               (1ULL << VIRTIO_BLK_F_FLUSH) |
               (1ULL << VIRTIO_BLK_F_DISCARD) |
               (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);

    if (num_queues > 1) {
        features |= 1ULL << VIRTIO_BLK_F_MQ;
    }
    if (!opts->writable) {
        features |= 1ULL << VIRTIO_BLK_F_RO;
    }

    vblk_exp->dev = vduse_dev_create(vblk_opts->name, VIRTIO_ID_BLOCK, 0,
                                     features, num_queues,
                                     sizeof(struct virtio_blk_config),
                                     (char *)&config, &vduse_blk_ops,
                                     vblk_exp);
    if (!vblk_exp->dev) {
        error_setg(errp, "failed to create vduse device");
        ret = -ENOMEM;
        goto err_dev;
    }

    vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
                                           g_get_tmp_dir(), vblk_opts->name);
    if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
        error_setg(errp, "failed to set reconnect log file");
        ret = -EINVAL;
        goto err;
    }

    for (i = 0; i < num_queues; i++) {
        /*
         * A queue that could not be set up would leave the export silently
         * unusable, so fail the creation instead of ignoring the error.
         */
        ret = vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
        if (ret < 0) {
            error_setg(errp, "failed to set up virtqueue %d", i);
            goto err;
        }
    }

    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev),
                       on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);

    blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                 vblk_exp);
    blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);

    /*
     * We handle draining ourselves using an in-flight counter and by disabling
     * virtqueue fd handlers. Do not queue BlockBackend requests, they need to
     * complete so the in-flight counter reaches zero.
     */
    blk_set_disable_request_queuing(exp->blk, true);

    return 0;
err:
    vduse_dev_destroy(vblk_exp->dev);
    g_free(vblk_exp->recon_file);
err_dev:
    g_free(vblk_exp->handler.serial);
    return ret;
}
386 
/*
 * Tear down a VDUSE block export.
 *
 * Must only be called once no request is in flight. Removes the device fd
 * handler and the AioContext notifier, destroys the VDUSE device and frees
 * the strings owned by the export.
 */
static void vduse_blk_exp_delete(BlockExport *exp)
{
    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);

    assert(qatomic_read(&vblk_exp->inflight) == 0);

    vduse_blk_detach_ctx(vblk_exp);
    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                    vblk_exp);

    /*
     * The reconnect log file is kept when destroying the device reports
     * -EBUSY.
     * NOTE(review): presumably the device is then still in use and may be
     * reconnected to later - confirm against libvduse.
     */
    if (vduse_dev_destroy(vblk_exp->dev) != -EBUSY) {
        unlink(vblk_exp->recon_file);
    }

    g_free(vblk_exp->recon_file);
    g_free(vblk_exp->handler.serial);
}
404 
405 /* Called with exp->ctx acquired */
vduse_blk_exp_request_shutdown(BlockExport * exp)406 static void vduse_blk_exp_request_shutdown(BlockExport *exp)
407 {
408     VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
409 
410     vduse_blk_stop_virtqueues(vblk_exp);
411 }
412 
413 const BlockExportDriver blk_exp_vduse_blk = {
414     .type               = BLOCK_EXPORT_TYPE_VDUSE_BLK,
415     .instance_size      = sizeof(VduseBlkExport),
416     .create             = vduse_blk_exp_create,
417     .delete             = vduse_blk_exp_delete,
418     .request_shutdown   = vduse_blk_exp_request_shutdown,
419 };
420