1 /* 2 * Export QEMU block device via VDUSE 3 * 4 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. 5 * 6 * Author: 7 * Xie Yongji <xieyongji@bytedance.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or 10 * later. See the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include <sys/eventfd.h> 15 16 #include "qapi/error.h" 17 #include "block/export.h" 18 #include "qemu/error-report.h" 19 #include "util/block-helpers.h" 20 #include "subprojects/libvduse/libvduse.h" 21 #include "virtio-blk-handler.h" 22 23 #include "standard-headers/linux/virtio_blk.h" 24 25 #define VDUSE_DEFAULT_NUM_QUEUE 1 26 #define VDUSE_DEFAULT_QUEUE_SIZE 256 27 28 typedef struct VduseBlkExport { 29 BlockExport export; 30 VirtioBlkHandler handler; 31 VduseDev *dev; 32 uint16_t num_queues; 33 char *recon_file; 34 unsigned int inflight; /* atomic */ 35 bool vqs_started; 36 } VduseBlkExport; 37 38 typedef struct VduseBlkReq { 39 VduseVirtqElement elem; 40 VduseVirtq *vq; 41 } VduseBlkReq; 42 43 static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp) 44 { 45 if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) { 46 /* Prevent export from being deleted */ 47 blk_exp_ref(&vblk_exp->export); 48 } 49 } 50 51 static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp) 52 { 53 if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) { 54 /* Wake AIO_WAIT_WHILE() */ 55 aio_wait_kick(); 56 57 /* Now the export can be deleted */ 58 blk_exp_unref(&vblk_exp->export); 59 } 60 } 61 62 static void vduse_blk_req_complete(VduseBlkReq *req, size_t in_len) 63 { 64 vduse_queue_push(req->vq, &req->elem, in_len); 65 vduse_queue_notify(req->vq); 66 67 free(req); 68 } 69 70 static void coroutine_fn vduse_blk_virtio_process_req(void *opaque) 71 { 72 VduseBlkReq *req = opaque; 73 VduseVirtq *vq = req->vq; 74 VduseDev *dev = vduse_queue_get_dev(vq); 75 VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); 76 VirtioBlkHandler *handler = &vblk_exp->handler; 77 VduseVirtqElement *elem = &req->elem; 78 struct iovec *in_iov = elem->in_sg; 79 struct iovec *out_iov = elem->out_sg; 80 unsigned in_num = elem->in_num; 81 unsigned out_num = elem->out_num; 82 int in_len; 83 84 in_len = virtio_blk_process_req(handler, in_iov, 85 out_iov, in_num, out_num); 86 if (in_len < 0) { 87 free(req); 88 return; 89 } 90 91 vduse_blk_req_complete(req, in_len); 92 vduse_blk_inflight_dec(vblk_exp); 93 } 94 95 static void vduse_blk_vq_handler(VduseDev *dev, VduseVirtq *vq) 96 { 97 VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); 98 99 while (1) { 100 VduseBlkReq *req; 101 102 req = vduse_queue_pop(vq, sizeof(VduseBlkReq)); 103 if (!req) { 104 break; 105 } 106 req->vq = vq; 107 108 Coroutine *co = 109 qemu_coroutine_create(vduse_blk_virtio_process_req, req); 110 111 vduse_blk_inflight_inc(vblk_exp); 112 qemu_coroutine_enter(co); 113 } 114 } 115 116 static void on_vduse_vq_kick(void *opaque) 117 { 118 VduseVirtq *vq = opaque; 119 VduseDev *dev = vduse_queue_get_dev(vq); 120 int fd = vduse_queue_get_fd(vq); 121 eventfd_t kick_data; 122 123 if (eventfd_read(fd, &kick_data) == -1) { 124 error_report("failed to read data from eventfd"); 125 return; 126 } 127 128 vduse_blk_vq_handler(dev, vq); 129 } 130 131 static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) 132 { 133 VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); 134 135 if (!vblk_exp->vqs_started) { 136 return; /* vduse_blk_drained_end() will start vqs later */ 137 } 138 139 aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), 140 on_vduse_vq_kick, NULL, NULL, NULL, vq); 141 /* Make sure we don't miss any kick after reconnecting */ 142 eventfd_write(vduse_queue_get_fd(vq), 1); 143 } 144 145 static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq) 146 { 147 VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); 148 int fd = vduse_queue_get_fd(vq); 149 150 if (fd < 0) { 151 return; 152 } 153 154 aio_set_fd_handler(vblk_exp->export.ctx, fd, 155 NULL, NULL, NULL, NULL, NULL); 156 } 157 158 static const VduseOps vduse_blk_ops = { 159 .enable_queue = vduse_blk_enable_queue, 160 .disable_queue = vduse_blk_disable_queue, 161 }; 162 163 static void on_vduse_dev_kick(void *opaque) 164 { 165 VduseDev *dev = opaque; 166 167 vduse_dev_handler(dev); 168 } 169 170 static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx) 171 { 172 aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), 173 on_vduse_dev_kick, NULL, NULL, NULL, 174 vblk_exp->dev); 175 176 /* Virtqueues are handled by vduse_blk_drained_end() */ 177 } 178 179 static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp) 180 { 181 aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), 182 NULL, NULL, NULL, NULL, NULL); 183 184 /* Virtqueues are handled by vduse_blk_drained_begin() */ 185 } 186 187 188 static void blk_aio_attached(AioContext *ctx, void *opaque) 189 { 190 VduseBlkExport *vblk_exp = opaque; 191 192 vblk_exp->export.ctx = ctx; 193 vduse_blk_attach_ctx(vblk_exp, ctx); 194 } 195 196 static void blk_aio_detach(void *opaque) 197 { 198 VduseBlkExport *vblk_exp = opaque; 199 200 vduse_blk_detach_ctx(vblk_exp); 201 vblk_exp->export.ctx = NULL; 202 } 203 204 static void vduse_blk_resize(void *opaque) 205 { 206 BlockExport *exp = opaque; 207 VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); 208 struct virtio_blk_config config; 209 210 config.capacity = 211 cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); 212 vduse_dev_update_config(vblk_exp->dev, sizeof(config.capacity), 213 offsetof(struct virtio_blk_config, capacity), 214 (char *)&config.capacity); 215 } 216 217 static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp) 218 { 219 for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { 220 VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); 221 vduse_blk_disable_queue(vblk_exp->dev, vq); 222 } 223 224 vblk_exp->vqs_started = false; 225 } 226 227 static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp) 228 { 229 vblk_exp->vqs_started = true; 230 231 for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { 232 VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); 233 vduse_blk_enable_queue(vblk_exp->dev, vq); 234 } 235 } 236 237 static void vduse_blk_drained_begin(void *opaque) 238 { 239 BlockExport *exp = opaque; 240 VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); 241 242 vduse_blk_stop_virtqueues(vblk_exp); 243 } 244 245 static void vduse_blk_drained_end(void *opaque) 246 { 247 BlockExport *exp = opaque; 248 VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); 249 250 vduse_blk_start_virtqueues(vblk_exp); 251 } 252 253 static bool vduse_blk_drained_poll(void *opaque) 254 { 255 BlockExport *exp = opaque; 256 VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); 257 258 return qatomic_read(&vblk_exp->inflight) > 0; 259 } 260 261 static const BlockDevOps vduse_block_ops = { 262 .resize_cb = vduse_blk_resize, 263 .drained_begin = vduse_blk_drained_begin, 264 .drained_end = vduse_blk_drained_end, 265 .drained_poll = vduse_blk_drained_poll, 266 }; 267 268 static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, 269 Error **errp) 270 { 271 VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); 272 BlockExportOptionsVduseBlk *vblk_opts = &opts->u.vduse_blk; 273 uint64_t logical_block_size = VIRTIO_BLK_SECTOR_SIZE; 274 uint16_t num_queues = VDUSE_DEFAULT_NUM_QUEUE; 275 uint16_t queue_size = VDUSE_DEFAULT_QUEUE_SIZE; 276 Error *local_err = NULL; 277 struct virtio_blk_config config = { 0 }; 278 uint64_t features; 279 int i, ret; 280 281 if (vblk_opts->has_num_queues) { 282 num_queues = vblk_opts->num_queues; 283 if (num_queues == 0) { 284 error_setg(errp, "num-queues must be greater than 0"); 285 return -EINVAL; 286 } 287 } 288 289 if (vblk_opts->has_queue_size) { 290 queue_size = vblk_opts->queue_size; 291 if (queue_size <= 2 || !is_power_of_2(queue_size) || 292 queue_size > VIRTQUEUE_MAX_SIZE) { 293 error_setg(errp, "queue-size is invalid"); 294 return -EINVAL; 295 } 296 } 297 298 if (vblk_opts->has_logical_block_size) { 299 logical_block_size = vblk_opts->logical_block_size; 300 check_block_size(exp->id, "logical-block-size", logical_block_size, 301 &local_err); 302 if (local_err) { 303 error_propagate(errp, local_err); 304 return -EINVAL; 305 } 306 } 307 vblk_exp->num_queues = num_queues; 308 vblk_exp->handler.blk = exp->blk; 309 vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: ""); 310 vblk_exp->handler.logical_block_size = logical_block_size; 311 vblk_exp->handler.writable = opts->writable; 312 vblk_exp->vqs_started = true; 313 314 config.capacity = 315 cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); 316 config.seg_max = cpu_to_le32(queue_size - 2); 317 config.min_io_size = cpu_to_le16(1); 318 config.opt_io_size = cpu_to_le32(1); 319 config.num_queues = cpu_to_le16(num_queues); 320 config.blk_size = cpu_to_le32(logical_block_size); 321 config.max_discard_sectors = cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS); 322 config.max_discard_seg = cpu_to_le32(1); 323 config.discard_sector_alignment = 324 cpu_to_le32(logical_block_size >> VIRTIO_BLK_SECTOR_BITS); 325 config.max_write_zeroes_sectors = 326 cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS); 327 config.max_write_zeroes_seg = cpu_to_le32(1); 328 329 features = vduse_get_virtio_features() | 330 (1ULL << VIRTIO_BLK_F_SEG_MAX) | 331 (1ULL << VIRTIO_BLK_F_TOPOLOGY) | 332 (1ULL << VIRTIO_BLK_F_BLK_SIZE) | 333 (1ULL << VIRTIO_BLK_F_FLUSH) | 334 (1ULL << VIRTIO_BLK_F_DISCARD) | 335 (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); 336 337 if (num_queues > 1) { 338 features |= 1ULL << VIRTIO_BLK_F_MQ; 339 } 340 if (!opts->writable) { 341 features |= 1ULL << VIRTIO_BLK_F_RO; 342 } 343 344 vblk_exp->dev = vduse_dev_create(vblk_opts->name, VIRTIO_ID_BLOCK, 0, 345 features, num_queues, 346 sizeof(struct virtio_blk_config), 347 (char *)&config, &vduse_blk_ops, 348 vblk_exp); 349 if (!vblk_exp->dev) { 350 error_setg(errp, "failed to create vduse device"); 351 ret = -ENOMEM; 352 goto err_dev; 353 } 354 355 vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s", 356 g_get_tmp_dir(), vblk_opts->name); 357 if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) { 358 error_setg(errp, "failed to set reconnect log file"); 359 ret = -EINVAL; 360 goto err; 361 } 362 363 for (i = 0; i < num_queues; i++) { 364 vduse_dev_setup_queue(vblk_exp->dev, i, queue_size); 365 } 366 367 aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), 368 on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev); 369 370 blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, 371 vblk_exp); 372 blk_set_dev_ops(exp->blk, &vduse_block_ops, exp); 373 374 /* 375 * We handle draining ourselves using an in-flight counter and by disabling 376 * virtqueue fd handlers. Do not queue BlockBackend requests, they need to 377 * complete so the in-flight counter reaches zero. 378 */ 379 blk_set_disable_request_queuing(exp->blk, true); 380 381 return 0; 382 err: 383 vduse_dev_destroy(vblk_exp->dev); 384 g_free(vblk_exp->recon_file); 385 err_dev: 386 g_free(vblk_exp->handler.serial); 387 return ret; 388 } 389 390 static void vduse_blk_exp_delete(BlockExport *exp) 391 { 392 VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); 393 int ret; 394 395 assert(qatomic_read(&vblk_exp->inflight) == 0); 396 397 vduse_blk_detach_ctx(vblk_exp); 398 blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, 399 vblk_exp); 400 ret = vduse_dev_destroy(vblk_exp->dev); 401 if (ret != -EBUSY) { 402 unlink(vblk_exp->recon_file); 403 } 404 g_free(vblk_exp->recon_file); 405 g_free(vblk_exp->handler.serial); 406 } 407 408 /* Called with exp->ctx acquired */ 409 static void vduse_blk_exp_request_shutdown(BlockExport *exp) 410 { 411 VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); 412 413 vduse_blk_stop_virtqueues(vblk_exp); 414 } 415 416 const BlockExportDriver blk_exp_vduse_blk = { 417 .type = BLOCK_EXPORT_TYPE_VDUSE_BLK, 418 .instance_size = sizeof(VduseBlkExport), 419 .create = vduse_blk_exp_create, 420 .delete = vduse_blk_exp_delete, 421 .request_shutdown = vduse_blk_exp_request_shutdown, 422 }; 423