xref: /openbmc/qemu/hw/block/virtio-blk.c (revision f7230e09b1ccfb7055b79dfee981e18d444a118a)
1 /*
2  * Virtio Block Device
3  *
4  * Copyright IBM, Corp. 2007
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qemu/defer-call.h"
16 #include "qapi/error.h"
17 #include "qemu/iov.h"
18 #include "qemu/module.h"
19 #include "qemu/error-report.h"
20 #include "qemu/main-loop.h"
21 #include "block/block_int.h"
22 #include "trace.h"
23 #include "hw/block/block.h"
24 #include "hw/qdev-properties.h"
25 #include "sysemu/blockdev.h"
26 #include "sysemu/block-ram-registrar.h"
27 #include "sysemu/sysemu.h"
28 #include "sysemu/runstate.h"
29 #include "hw/virtio/virtio-blk.h"
30 #include "scsi/constants.h"
31 #ifdef __linux__
32 # include <scsi/sg.h>
33 #endif
34 #include "hw/virtio/virtio-bus.h"
35 #include "migration/qemu-file-types.h"
36 #include "hw/virtio/virtio-access.h"
37 #include "hw/virtio/virtio-blk-common.h"
38 #include "qemu/coroutine.h"
39 
40 static void virtio_blk_ioeventfd_attach(VirtIOBlock *s);
41 
42 static void virtio_blk_init_request(VirtIOBlock *s, VirtQueue *vq,
43                                     VirtIOBlockReq *req)
44 {
45     req->dev = s;
46     req->vq = vq;
47     req->qiov.size = 0;
48     req->in_len = 0;
49     req->next = NULL;
50     req->mr_next = NULL;
51 }
52 
/* Release the memory of a request; @req must not be used afterwards. */
static void virtio_blk_free_request(VirtIOBlockReq *req)
{
    g_free(req);
}
57 
58 static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
59 {
60     VirtIOBlock *s = req->dev;
61     VirtIODevice *vdev = VIRTIO_DEVICE(s);
62 
63     trace_virtio_blk_req_complete(vdev, req, status);
64 
65     stb_p(&req->in->status, status);
66     iov_discard_undo(&req->inhdr_undo);
67     iov_discard_undo(&req->outhdr_undo);
68     virtqueue_push(req->vq, &req->elem, req->in_len);
69     if (qemu_in_iothread()) {
70         virtio_notify_irqfd(vdev, req->vq);
71     } else {
72         virtio_notify(vdev, req->vq);
73     }
74 }
75 
76 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
77     bool is_read, bool acct_failed)
78 {
79     VirtIOBlock *s = req->dev;
80     BlockErrorAction action = blk_get_error_action(s->blk, is_read, error);
81 
82     if (action == BLOCK_ERROR_ACTION_STOP) {
83         /* Break the link as the next request is going to be parsed from the
84          * ring again. Otherwise we may end up doing a double completion! */
85         req->mr_next = NULL;
86 
87         WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
88             req->next = s->rq;
89             s->rq = req;
90         }
91     } else if (action == BLOCK_ERROR_ACTION_REPORT) {
92         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
93         if (acct_failed) {
94             block_acct_failed(blk_get_stats(s->blk), &req->acct);
95         }
96         virtio_blk_free_request(req);
97     }
98 
99     blk_error_action(s->blk, action, is_read, error);
100     return action != BLOCK_ERROR_ACTION_IGNORE;
101 }
102 
103 static void virtio_blk_rw_complete(void *opaque, int ret)
104 {
105     VirtIOBlockReq *next = opaque;
106     VirtIOBlock *s = next->dev;
107     VirtIODevice *vdev = VIRTIO_DEVICE(s);
108 
109     while (next) {
110         VirtIOBlockReq *req = next;
111         next = req->mr_next;
112         trace_virtio_blk_rw_complete(vdev, req, ret);
113 
114         if (req->qiov.nalloc != -1) {
115             /* If nalloc is != -1 req->qiov is a local copy of the original
116              * external iovec. It was allocated in submit_requests to be
117              * able to merge requests. */
118             qemu_iovec_destroy(&req->qiov);
119         }
120 
121         if (ret) {
122             int p = virtio_ldl_p(VIRTIO_DEVICE(s), &req->out.type);
123             bool is_read = !(p & VIRTIO_BLK_T_OUT);
124             /* Note that memory may be dirtied on read failure.  If the
125              * virtio request is not completed here, as is the case for
126              * BLOCK_ERROR_ACTION_STOP, the memory may not be copied
127              * correctly during live migration.  While this is ugly,
128              * it is acceptable because the device is free to write to
129              * the memory until the request is completed (which will
130              * happen on the other side of the migration).
131              */
132             if (virtio_blk_handle_rw_error(req, -ret, is_read, true)) {
133                 continue;
134             }
135         }
136 
137         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
138         block_acct_done(blk_get_stats(s->blk), &req->acct);
139         virtio_blk_free_request(req);
140     }
141 }
142 
143 static void virtio_blk_flush_complete(void *opaque, int ret)
144 {
145     VirtIOBlockReq *req = opaque;
146     VirtIOBlock *s = req->dev;
147 
148     if (ret && virtio_blk_handle_rw_error(req, -ret, 0, true)) {
149         return;
150     }
151 
152     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
153     block_acct_done(blk_get_stats(s->blk), &req->acct);
154     virtio_blk_free_request(req);
155 }
156 
157 static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret)
158 {
159     VirtIOBlockReq *req = opaque;
160     VirtIOBlock *s = req->dev;
161     bool is_write_zeroes = (virtio_ldl_p(VIRTIO_DEVICE(s), &req->out.type) &
162                             ~VIRTIO_BLK_T_BARRIER) == VIRTIO_BLK_T_WRITE_ZEROES;
163 
164     if (ret && virtio_blk_handle_rw_error(req, -ret, false, is_write_zeroes)) {
165         return;
166     }
167 
168     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
169     if (is_write_zeroes) {
170         block_acct_done(blk_get_stats(s->blk), &req->acct);
171     }
172     virtio_blk_free_request(req);
173 }
174 
175 static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s, VirtQueue *vq)
176 {
177     VirtIOBlockReq *req = virtqueue_pop(vq, sizeof(VirtIOBlockReq));
178 
179     if (req) {
180         virtio_blk_init_request(s, vq, req);
181     }
182     return req;
183 }
184 
185 static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
186 {
187     int status;
188     struct virtio_scsi_inhdr *scsi;
189     VirtIOBlock *blk = req->dev;
190     VirtIODevice *vdev = VIRTIO_DEVICE(blk);
191     VirtQueueElement *elem = &req->elem;
192 
193     /*
194      * We require at least one output segment each for the virtio_blk_outhdr
195      * and the SCSI command block.
196      *
197      * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
198      * and the sense buffer pointer in the input segments.
199      */
200     if (elem->out_num < 2 || elem->in_num < 3) {
201         status = VIRTIO_BLK_S_IOERR;
202         goto fail;
203     }
204 
205     /*
206      * The scsi inhdr is placed in the second-to-last input segment, just
207      * before the regular inhdr.
208      *
209      * Just put anything nonzero so that the ioctl fails in the guest.
210      */
211     scsi = (void *)elem->in_sg[elem->in_num - 2].iov_base;
212     virtio_stl_p(vdev, &scsi->errors, 255);
213     status = VIRTIO_BLK_S_UNSUPP;
214 
215 fail:
216     virtio_blk_req_complete(req, status);
217     virtio_blk_free_request(req);
218 }
219 
220 static inline void submit_requests(VirtIOBlock *s, MultiReqBuffer *mrb,
221                                    int start, int num_reqs, int niov)
222 {
223     BlockBackend *blk = s->blk;
224     QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
225     int64_t sector_num = mrb->reqs[start]->sector_num;
226     bool is_write = mrb->is_write;
227     BdrvRequestFlags flags = 0;
228 
229     if (num_reqs > 1) {
230         int i;
231         struct iovec *tmp_iov = qiov->iov;
232         int tmp_niov = qiov->niov;
233 
234         /* mrb->reqs[start]->qiov was initialized from external so we can't
235          * modify it here. We need to initialize it locally and then add the
236          * external iovecs. */
237         qemu_iovec_init(qiov, niov);
238 
239         for (i = 0; i < tmp_niov; i++) {
240             qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
241         }
242 
243         for (i = start + 1; i < start + num_reqs; i++) {
244             qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
245                               mrb->reqs[i]->qiov.size);
246             mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
247         }
248 
249         trace_virtio_blk_submit_multireq(VIRTIO_DEVICE(mrb->reqs[start]->dev),
250                                          mrb, start, num_reqs,
251                                          sector_num << BDRV_SECTOR_BITS,
252                                          qiov->size, is_write);
253         block_acct_merge_done(blk_get_stats(blk),
254                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
255                               num_reqs - 1);
256     }
257 
258     if (blk_ram_registrar_ok(&s->blk_ram_registrar)) {
259         flags |= BDRV_REQ_REGISTERED_BUF;
260     }
261 
262     if (is_write) {
263         blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov,
264                         flags, virtio_blk_rw_complete,
265                         mrb->reqs[start]);
266     } else {
267         blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov,
268                        flags, virtio_blk_rw_complete,
269                        mrb->reqs[start]);
270     }
271 }
272 
273 static int multireq_compare(const void *a, const void *b)
274 {
275     const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
276                          *req2 = *(VirtIOBlockReq **)b;
277 
278     /*
279      * Note that we can't simply subtract sector_num1 from sector_num2
280      * here as that could overflow the return value.
281      */
282     if (req1->sector_num > req2->sector_num) {
283         return 1;
284     } else if (req1->sector_num < req2->sector_num) {
285         return -1;
286     } else {
287         return 0;
288     }
289 }
290 
/*
 * Submit every request queued in @mrb, merging runs of sequential requests
 * into a single backend I/O where possible, then empty the buffer.
 *
 * Callers in this file only invoke this with mrb->num_reqs > 0.
 */
static void virtio_blk_submit_multireq(VirtIOBlock *s, MultiReqBuffer *mrb)
{
    int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
    uint32_t max_transfer;
    int64_t sector_num = 0;

    /* Fast path: a single request needs neither sorting nor merging. */
    if (mrb->num_reqs == 1) {
        submit_requests(s, mrb, 0, 1, -1);
        mrb->num_reqs = 0;
        return;
    }

    max_transfer = blk_get_max_transfer(mrb->reqs[0]->dev->blk);

    /* Sort by start sector so that adjacent requests become neighbours. */
    qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
          &multireq_compare);

    /*
     * Walk the sorted requests while tracking the current run:
     * [start, start + num_reqs) with niov iovec entries in total, covering
     * nb_sectors sectors beginning at sector_num.
     */
    for (i = 0; i < mrb->num_reqs; i++) {
        VirtIOBlockReq *req = mrb->reqs[i];
        if (num_reqs > 0) {
            /*
             * NOTE: A request cannot be merged into the current run when:
             * 1. the requests are not sequential
             * 2. the merge would exceed the maximum number of IOVs
             * 3. the merge would exceed the maximum transfer length of the
             *    backend device
             * In those cases the current run is submitted and a new one is
             * started with this request.
             */
            if (sector_num + nb_sectors != req->sector_num ||
                niov > blk_get_max_iov(s->blk) - req->qiov.niov ||
                req->qiov.size > max_transfer ||
                nb_sectors > (max_transfer -
                              req->qiov.size) / BDRV_SECTOR_SIZE) {
                submit_requests(s, mrb, start, num_reqs, niov);
                num_reqs = 0;
            }
        }

        /* Start a new run at this request. */
        if (num_reqs == 0) {
            sector_num = req->sector_num;
            nb_sectors = niov = 0;
            start = i;
        }

        nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
        niov += req->qiov.niov;
        num_reqs++;
    }

    /* Submit the final run. */
    submit_requests(s, mrb, start, num_reqs, niov);
    mrb->num_reqs = 0;
}
341 
342 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
343 {
344     VirtIOBlock *s = req->dev;
345 
346     block_acct_start(blk_get_stats(s->blk), &req->acct, 0,
347                      BLOCK_ACCT_FLUSH);
348 
349     /*
350      * Make sure all outstanding writes are posted to the backing device.
351      */
352     if (mrb->is_write && mrb->num_reqs > 0) {
353         virtio_blk_submit_multireq(s, mrb);
354     }
355     blk_aio_flush(s->blk, virtio_blk_flush_complete, req);
356 }
357 
358 static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
359                                      uint64_t sector, size_t size)
360 {
361     uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
362     uint64_t total_sectors;
363 
364     if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
365         return false;
366     }
367     if (sector & dev->sector_mask) {
368         return false;
369     }
370     if (size % dev->conf.conf.logical_block_size) {
371         return false;
372     }
373     blk_get_geometry(dev->blk, &total_sectors);
374     if (sector > total_sectors || nb_sectors > total_sectors - sector) {
375         return false;
376     }
377     return true;
378 }
379 
380 static uint8_t virtio_blk_handle_discard_write_zeroes(VirtIOBlockReq *req,
381     struct virtio_blk_discard_write_zeroes *dwz_hdr, bool is_write_zeroes)
382 {
383     VirtIOBlock *s = req->dev;
384     VirtIODevice *vdev = VIRTIO_DEVICE(s);
385     uint64_t sector;
386     uint32_t num_sectors, flags, max_sectors;
387     uint8_t err_status;
388     int bytes;
389 
390     sector = virtio_ldq_p(vdev, &dwz_hdr->sector);
391     num_sectors = virtio_ldl_p(vdev, &dwz_hdr->num_sectors);
392     flags = virtio_ldl_p(vdev, &dwz_hdr->flags);
393     max_sectors = is_write_zeroes ? s->conf.max_write_zeroes_sectors :
394                   s->conf.max_discard_sectors;
395 
396     /*
397      * max_sectors is at most BDRV_REQUEST_MAX_SECTORS, this check
398      * make us sure that "num_sectors << BDRV_SECTOR_BITS" can fit in
399      * the integer variable.
400      */
401     if (unlikely(num_sectors > max_sectors)) {
402         err_status = VIRTIO_BLK_S_IOERR;
403         goto err;
404     }
405 
406     bytes = num_sectors << BDRV_SECTOR_BITS;
407 
408     if (unlikely(!virtio_blk_sect_range_ok(s, sector, bytes))) {
409         err_status = VIRTIO_BLK_S_IOERR;
410         goto err;
411     }
412 
413     /*
414      * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
415      * and write zeroes commands if any unknown flag is set.
416      */
417     if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
418         err_status = VIRTIO_BLK_S_UNSUPP;
419         goto err;
420     }
421 
422     if (is_write_zeroes) { /* VIRTIO_BLK_T_WRITE_ZEROES */
423         int blk_aio_flags = 0;
424 
425         if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
426             blk_aio_flags |= BDRV_REQ_MAY_UNMAP;
427         }
428 
429         block_acct_start(blk_get_stats(s->blk), &req->acct, bytes,
430                          BLOCK_ACCT_WRITE);
431 
432         blk_aio_pwrite_zeroes(s->blk, sector << BDRV_SECTOR_BITS,
433                               bytes, blk_aio_flags,
434                               virtio_blk_discard_write_zeroes_complete, req);
435     } else { /* VIRTIO_BLK_T_DISCARD */
436         /*
437          * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
438          * discard commands if the unmap flag is set.
439          */
440         if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
441             err_status = VIRTIO_BLK_S_UNSUPP;
442             goto err;
443         }
444 
445         blk_aio_pdiscard(s->blk, sector << BDRV_SECTOR_BITS, bytes,
446                          virtio_blk_discard_write_zeroes_complete, req);
447     }
448 
449     return VIRTIO_BLK_S_OK;
450 
451 err:
452     if (is_write_zeroes) {
453         block_acct_invalid(blk_get_stats(s->blk), BLOCK_ACCT_WRITE);
454     }
455     return err_status;
456 }
457 
/*
 * Per-command state handed as the opaque pointer to the zone report and
 * zone append completion callbacks, which free it with g_free().
 */
typedef struct ZoneCmdData {
    /* Request this command belongs to. */
    VirtIOBlockReq *req;
    /* Input iovec (and its entry count) the completion writes results to. */
    struct iovec *in_iov;
    unsigned in_num;
    union {
        /* Zone report: passed to blk_aio_zone_report(). */
        struct {
            /* Passed by pointer to the backend; read back on completion. */
            unsigned int nr_zones;
            /* g_malloc()ed descriptor array, freed on completion. */
            BlockZoneDescriptor *zones;
        } zone_report_data;
        /* Zone append: passed by pointer to blk_aio_zone_append(). */
        struct {
            /* Byte offset; converted to a sector for the guest afterwards. */
            int64_t offset;
        } zone_append_data;
    };
} ZoneCmdData;
472 
473 /*
474  * check zoned_request: error checking before issuing requests. If all checks
475  * passed, return true.
476  * append: true if only zone append requests issued.
477  */
478 static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
479                              bool append, uint8_t *status) {
480     BlockDriverState *bs = blk_bs(s->blk);
481     int index;
482 
483     if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
484         *status = VIRTIO_BLK_S_UNSUPP;
485         return false;
486     }
487 
488     if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS)
489         || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
490         *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
491         return false;
492     }
493 
494     if (append) {
495         if (bs->bl.write_granularity) {
496             if ((offset % bs->bl.write_granularity) != 0) {
497                 *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
498                 return false;
499             }
500         }
501 
502         index = offset / bs->bl.zone_size;
503         if (BDRV_ZT_IS_CONV(bs->wps->wp[index])) {
504             *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
505             return false;
506         }
507 
508         if (len / 512 > bs->bl.max_append_sectors) {
509             if (bs->bl.max_append_sectors == 0) {
510                 *status = VIRTIO_BLK_S_UNSUPP;
511             } else {
512                 *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
513             }
514             return false;
515         }
516     }
517     return true;
518 }
519 
520 static void virtio_blk_zone_report_complete(void *opaque, int ret)
521 {
522     ZoneCmdData *data = opaque;
523     VirtIOBlockReq *req = data->req;
524     VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
525     struct iovec *in_iov = data->in_iov;
526     unsigned in_num = data->in_num;
527     int64_t zrp_size, n, j = 0;
528     int64_t nz = data->zone_report_data.nr_zones;
529     int8_t err_status = VIRTIO_BLK_S_OK;
530     struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
531         .nr_zones = cpu_to_le64(nz),
532     };
533 
534     trace_virtio_blk_zone_report_complete(vdev, req, nz, ret);
535     if (ret) {
536         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
537         goto out;
538     }
539 
540     zrp_size = sizeof(struct virtio_blk_zone_report)
541                + sizeof(struct virtio_blk_zone_descriptor) * nz;
542     n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
543     if (n != sizeof(zrp_hdr)) {
544         virtio_error(vdev, "Driver provided input buffer that is too small!");
545         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
546         goto out;
547     }
548 
549     for (size_t i = sizeof(zrp_hdr); i < zrp_size;
550         i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
551         struct virtio_blk_zone_descriptor desc =
552             (struct virtio_blk_zone_descriptor) {
553                 .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
554                     >> BDRV_SECTOR_BITS),
555                 .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
556                     >> BDRV_SECTOR_BITS),
557                 .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
558                     >> BDRV_SECTOR_BITS),
559         };
560 
561         switch (data->zone_report_data.zones[j].type) {
562         case BLK_ZT_CONV:
563             desc.z_type = VIRTIO_BLK_ZT_CONV;
564             break;
565         case BLK_ZT_SWR:
566             desc.z_type = VIRTIO_BLK_ZT_SWR;
567             break;
568         case BLK_ZT_SWP:
569             desc.z_type = VIRTIO_BLK_ZT_SWP;
570             break;
571         default:
572             g_assert_not_reached();
573         }
574 
575         switch (data->zone_report_data.zones[j].state) {
576         case BLK_ZS_RDONLY:
577             desc.z_state = VIRTIO_BLK_ZS_RDONLY;
578             break;
579         case BLK_ZS_OFFLINE:
580             desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
581             break;
582         case BLK_ZS_EMPTY:
583             desc.z_state = VIRTIO_BLK_ZS_EMPTY;
584             break;
585         case BLK_ZS_CLOSED:
586             desc.z_state = VIRTIO_BLK_ZS_CLOSED;
587             break;
588         case BLK_ZS_FULL:
589             desc.z_state = VIRTIO_BLK_ZS_FULL;
590             break;
591         case BLK_ZS_EOPEN:
592             desc.z_state = VIRTIO_BLK_ZS_EOPEN;
593             break;
594         case BLK_ZS_IOPEN:
595             desc.z_state = VIRTIO_BLK_ZS_IOPEN;
596             break;
597         case BLK_ZS_NOT_WP:
598             desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
599             break;
600         default:
601             g_assert_not_reached();
602         }
603 
604         /* TODO: it takes O(n^2) time complexity. Optimizations required. */
605         n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
606         if (n != sizeof(desc)) {
607             virtio_error(vdev, "Driver provided input buffer "
608                                "for descriptors that is too small!");
609             err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
610         }
611     }
612 
613 out:
614     virtio_blk_req_complete(req, err_status);
615     virtio_blk_free_request(req);
616     g_free(data->zone_report_data.zones);
617     g_free(data);
618 }
619 
620 static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
621                                          struct iovec *in_iov,
622                                          unsigned in_num)
623 {
624     VirtIOBlock *s = req->dev;
625     VirtIODevice *vdev = VIRTIO_DEVICE(s);
626     unsigned int nr_zones;
627     ZoneCmdData *data;
628     int64_t zone_size, offset;
629     uint8_t err_status;
630 
631     if (req->in_len < sizeof(struct virtio_blk_inhdr) +
632             sizeof(struct virtio_blk_zone_report) +
633             sizeof(struct virtio_blk_zone_descriptor)) {
634         virtio_error(vdev, "in buffer too small for zone report");
635         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
636         goto out;
637     }
638 
639     /* start byte offset of the zone report */
640     offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
641     if (!check_zoned_request(s, offset, 0, false, &err_status)) {
642         goto out;
643     }
644     nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
645                 sizeof(struct virtio_blk_zone_report)) /
646                sizeof(struct virtio_blk_zone_descriptor);
647     trace_virtio_blk_handle_zone_report(vdev, req,
648                                         offset >> BDRV_SECTOR_BITS, nr_zones);
649 
650     zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
651     data = g_malloc(sizeof(ZoneCmdData));
652     data->req = req;
653     data->in_iov = in_iov;
654     data->in_num = in_num;
655     data->zone_report_data.nr_zones = nr_zones;
656     data->zone_report_data.zones = g_malloc(zone_size),
657 
658     blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
659                         data->zone_report_data.zones,
660                         virtio_blk_zone_report_complete, data);
661     return;
662 out:
663     virtio_blk_req_complete(req, err_status);
664     virtio_blk_free_request(req);
665 }
666 
667 static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
668 {
669     VirtIOBlockReq *req = opaque;
670     VirtIOBlock *s = req->dev;
671     VirtIODevice *vdev = VIRTIO_DEVICE(s);
672     int8_t err_status = VIRTIO_BLK_S_OK;
673     trace_virtio_blk_zone_mgmt_complete(vdev, req,ret);
674 
675     if (ret) {
676         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
677     }
678 
679     virtio_blk_req_complete(req, err_status);
680     virtio_blk_free_request(req);
681 }
682 
683 static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
684 {
685     VirtIOBlock *s = req->dev;
686     VirtIODevice *vdev = VIRTIO_DEVICE(s);
687     BlockDriverState *bs = blk_bs(s->blk);
688     int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
689     uint64_t len;
690     uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
691     uint8_t err_status = VIRTIO_BLK_S_OK;
692 
693     uint32_t type = virtio_ldl_p(vdev, &req->out.type);
694     if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
695         /* Entire drive capacity */
696         offset = 0;
697         len = capacity;
698         trace_virtio_blk_handle_zone_reset_all(vdev, req, 0,
699                                                bs->total_sectors);
700     } else {
701         if (bs->bl.zone_size > capacity - offset) {
702             /* The zoned device allows the last smaller zone. */
703             len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1ull);
704         } else {
705             len = bs->bl.zone_size;
706         }
707         trace_virtio_blk_handle_zone_mgmt(vdev, req, op,
708                                           offset >> BDRV_SECTOR_BITS,
709                                           len >> BDRV_SECTOR_BITS);
710     }
711 
712     if (!check_zoned_request(s, offset, len, false, &err_status)) {
713         goto out;
714     }
715 
716     blk_aio_zone_mgmt(s->blk, op, offset, len,
717                       virtio_blk_zone_mgmt_complete, req);
718 
719     return 0;
720 out:
721     virtio_blk_req_complete(req, err_status);
722     virtio_blk_free_request(req);
723     return err_status;
724 }
725 
726 static void virtio_blk_zone_append_complete(void *opaque, int ret)
727 {
728     ZoneCmdData *data = opaque;
729     VirtIOBlockReq *req = data->req;
730     VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
731     int64_t append_sector, n;
732     uint8_t err_status = VIRTIO_BLK_S_OK;
733 
734     if (ret) {
735         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
736         goto out;
737     }
738 
739     virtio_stq_p(vdev, &append_sector,
740                  data->zone_append_data.offset >> BDRV_SECTOR_BITS);
741     n = iov_from_buf(data->in_iov, data->in_num, 0, &append_sector,
742                      sizeof(append_sector));
743     if (n != sizeof(append_sector)) {
744         virtio_error(vdev, "Driver provided input buffer less than size of "
745                            "append_sector");
746         err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
747         goto out;
748     }
749     trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);
750 
751 out:
752     virtio_blk_req_complete(req, err_status);
753     virtio_blk_free_request(req);
754     g_free(data);
755 }
756 
757 static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
758                                          struct iovec *out_iov,
759                                          struct iovec *in_iov,
760                                          uint64_t out_num,
761                                          unsigned in_num) {
762     VirtIOBlock *s = req->dev;
763     VirtIODevice *vdev = VIRTIO_DEVICE(s);
764     uint8_t err_status = VIRTIO_BLK_S_OK;
765 
766     int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
767     int64_t len = iov_size(out_iov, out_num);
768     ZoneCmdData *data;
769 
770     trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS);
771     if (!check_zoned_request(s, offset, len, true, &err_status)) {
772         goto out;
773     }
774 
775     data = g_malloc(sizeof(ZoneCmdData));
776     data->req = req;
777     data->in_iov = in_iov;
778     data->in_num = in_num;
779     data->zone_append_data.offset = offset;
780     qemu_iovec_init_external(&req->qiov, out_iov, out_num);
781 
782     block_acct_start(blk_get_stats(s->blk), &req->acct, len,
783                      BLOCK_ACCT_ZONE_APPEND);
784 
785     blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
786                         virtio_blk_zone_append_complete, data);
787     return 0;
788 
789 out:
790     virtio_blk_req_complete(req, err_status);
791     virtio_blk_free_request(req);
792     return err_status;
793 }
794 
795 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
796 {
797     uint32_t type;
798     struct iovec *in_iov = req->elem.in_sg;
799     struct iovec *out_iov = req->elem.out_sg;
800     unsigned in_num = req->elem.in_num;
801     unsigned out_num = req->elem.out_num;
802     VirtIOBlock *s = req->dev;
803     VirtIODevice *vdev = VIRTIO_DEVICE(s);
804 
805     if (req->elem.out_num < 1 || req->elem.in_num < 1) {
806         virtio_error(vdev, "virtio-blk missing headers");
807         return -1;
808     }
809 
810     if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
811                             sizeof(req->out)) != sizeof(req->out))) {
812         virtio_error(vdev, "virtio-blk request outhdr too short");
813         return -1;
814     }
815 
816     iov_discard_front_undoable(&out_iov, &out_num, sizeof(req->out),
817                                &req->outhdr_undo);
818 
819     if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
820         virtio_error(vdev, "virtio-blk request inhdr too short");
821         iov_discard_undo(&req->outhdr_undo);
822         return -1;
823     }
824 
825     /* We always touch the last byte, so just see how big in_iov is.  */
826     req->in_len = iov_size(in_iov, in_num);
827     req->in = (void *)in_iov[in_num - 1].iov_base
828               + in_iov[in_num - 1].iov_len
829               - sizeof(struct virtio_blk_inhdr);
830     iov_discard_back_undoable(in_iov, &in_num, sizeof(struct virtio_blk_inhdr),
831                               &req->inhdr_undo);
832 
833     type = virtio_ldl_p(vdev, &req->out.type);
834 
835     /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
836      * is an optional flag. Although a guest should not send this flag if
837      * not negotiated we ignored it in the past. So keep ignoring it. */
838     switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
839     case VIRTIO_BLK_T_IN:
840     {
841         bool is_write = type & VIRTIO_BLK_T_OUT;
842         req->sector_num = virtio_ldq_p(vdev, &req->out.sector);
843 
844         if (is_write) {
845             qemu_iovec_init_external(&req->qiov, out_iov, out_num);
846             trace_virtio_blk_handle_write(vdev, req, req->sector_num,
847                                           req->qiov.size / BDRV_SECTOR_SIZE);
848         } else {
849             qemu_iovec_init_external(&req->qiov, in_iov, in_num);
850             trace_virtio_blk_handle_read(vdev, req, req->sector_num,
851                                          req->qiov.size / BDRV_SECTOR_SIZE);
852         }
853 
854         if (!virtio_blk_sect_range_ok(s, req->sector_num, req->qiov.size)) {
855             virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
856             block_acct_invalid(blk_get_stats(s->blk),
857                                is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
858             virtio_blk_free_request(req);
859             return 0;
860         }
861 
862         block_acct_start(blk_get_stats(s->blk), &req->acct, req->qiov.size,
863                          is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
864 
865         /* merge would exceed maximum number of requests or IO direction
866          * changes */
867         if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
868                                   is_write != mrb->is_write ||
869                                   !s->conf.request_merging)) {
870             virtio_blk_submit_multireq(s, mrb);
871         }
872 
873         assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
874         mrb->reqs[mrb->num_reqs++] = req;
875         mrb->is_write = is_write;
876         break;
877     }
878     case VIRTIO_BLK_T_FLUSH:
879         virtio_blk_handle_flush(req, mrb);
880         break;
881     case VIRTIO_BLK_T_ZONE_REPORT:
882         virtio_blk_handle_zone_report(req, in_iov, in_num);
883         break;
884     case VIRTIO_BLK_T_ZONE_OPEN:
885         virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
886         break;
887     case VIRTIO_BLK_T_ZONE_CLOSE:
888         virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
889         break;
890     case VIRTIO_BLK_T_ZONE_FINISH:
891         virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
892         break;
893     case VIRTIO_BLK_T_ZONE_RESET:
894         virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
895         break;
896     case VIRTIO_BLK_T_ZONE_RESET_ALL:
897         virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
898         break;
899     case VIRTIO_BLK_T_SCSI_CMD:
900         virtio_blk_handle_scsi(req);
901         break;
902     case VIRTIO_BLK_T_GET_ID:
903     {
904         /*
905          * NB: per existing s/n string convention the string is
906          * terminated by '\0' only when shorter than buffer.
907          */
908         const char *serial = s->conf.serial ? s->conf.serial : "";
909         size_t size = MIN(strlen(serial) + 1,
910                           MIN(iov_size(in_iov, in_num),
911                               VIRTIO_BLK_ID_BYTES));
912         iov_from_buf(in_iov, in_num, 0, serial, size);
913         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
914         virtio_blk_free_request(req);
915         break;
916     }
917     case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
918         /*
919          * Passing out_iov/out_num and in_iov/in_num is not safe
920          * to access req->elem.out_sg directly because it may be
921          * modified by virtio_blk_handle_request().
922          */
923         virtio_blk_handle_zone_append(req, out_iov, in_iov, out_num, in_num);
924         break;
925     /*
926      * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
927      * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
928      * so we must mask it for these requests, then we will check if it is set.
929      */
930     case VIRTIO_BLK_T_DISCARD & ~VIRTIO_BLK_T_OUT:
931     case VIRTIO_BLK_T_WRITE_ZEROES & ~VIRTIO_BLK_T_OUT:
932     {
933         struct virtio_blk_discard_write_zeroes dwz_hdr;
934         size_t out_len = iov_size(out_iov, out_num);
935         bool is_write_zeroes = (type & ~VIRTIO_BLK_T_BARRIER) ==
936                                VIRTIO_BLK_T_WRITE_ZEROES;
937         uint8_t err_status;
938 
939         /*
940          * Unsupported if VIRTIO_BLK_T_OUT is not set or the request contains
941          * more than one segment.
942          */
943         if (unlikely(!(type & VIRTIO_BLK_T_OUT) ||
944                      out_len > sizeof(dwz_hdr))) {
945             virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
946             virtio_blk_free_request(req);
947             return 0;
948         }
949 
950         if (unlikely(iov_to_buf(out_iov, out_num, 0, &dwz_hdr,
951                                 sizeof(dwz_hdr)) != sizeof(dwz_hdr))) {
952             iov_discard_undo(&req->inhdr_undo);
953             iov_discard_undo(&req->outhdr_undo);
954             virtio_error(vdev, "virtio-blk discard/write_zeroes header"
955                          " too short");
956             return -1;
957         }
958 
959         err_status = virtio_blk_handle_discard_write_zeroes(req, &dwz_hdr,
960                                                             is_write_zeroes);
961         if (err_status != VIRTIO_BLK_S_OK) {
962             virtio_blk_req_complete(req, err_status);
963             virtio_blk_free_request(req);
964         }
965 
966         break;
967     }
968     default:
969         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
970         virtio_blk_free_request(req);
971     }
972     return 0;
973 }
974 
975 void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
976 {
977     VirtIOBlockReq *req;
978     MultiReqBuffer mrb = {};
979     bool suppress_notifications = virtio_queue_get_notification(vq);
980 
981     defer_call_begin();
982 
983     do {
984         if (suppress_notifications) {
985             virtio_queue_set_notification(vq, 0);
986         }
987 
988         while ((req = virtio_blk_get_request(s, vq))) {
989             if (virtio_blk_handle_request(req, &mrb)) {
990                 virtqueue_detach_element(req->vq, &req->elem, 0);
991                 virtio_blk_free_request(req);
992                 break;
993             }
994         }
995 
996         if (suppress_notifications) {
997             virtio_queue_set_notification(vq, 1);
998         }
999     } while (!virtio_queue_empty(vq));
1000 
1001     if (mrb.num_reqs) {
1002         virtio_blk_submit_multireq(s, &mrb);
1003     }
1004 
1005     defer_call_end();
1006 }
1007 
1008 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
1009 {
1010     VirtIOBlock *s = (VirtIOBlock *)vdev;
1011 
1012     if (!s->ioeventfd_disabled && !s->ioeventfd_started) {
1013         /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
1014          * ioeventfd here instead of waiting for .set_status().
1015          */
1016         virtio_device_start_ioeventfd(vdev);
1017         if (!s->ioeventfd_disabled) {
1018             return;
1019         }
1020     }
1021 
1022     virtio_blk_handle_vq(s, vq);
1023 }
1024 
1025 static void virtio_blk_dma_restart_bh(void *opaque)
1026 {
1027     VirtIOBlockReq *req = opaque;
1028     VirtIOBlock *s = req->dev; /* we're called with at least one request */
1029 
1030     MultiReqBuffer mrb = {};
1031 
1032     while (req) {
1033         VirtIOBlockReq *next = req->next;
1034         if (virtio_blk_handle_request(req, &mrb)) {
1035             /* Device is now broken and won't do any processing until it gets
1036              * reset. Already queued requests will be lost: let's purge them.
1037              */
1038             while (req) {
1039                 next = req->next;
1040                 virtqueue_detach_element(req->vq, &req->elem, 0);
1041                 virtio_blk_free_request(req);
1042                 req = next;
1043             }
1044             break;
1045         }
1046         req = next;
1047     }
1048 
1049     if (mrb.num_reqs) {
1050         virtio_blk_submit_multireq(s, &mrb);
1051     }
1052 
1053     /* Paired with inc in virtio_blk_dma_restart_cb() */
1054     blk_dec_in_flight(s->conf.conf.blk);
1055 }
1056 
1057 static void virtio_blk_dma_restart_cb(void *opaque, bool running,
1058                                       RunState state)
1059 {
1060     VirtIOBlock *s = opaque;
1061     uint16_t num_queues = s->conf.num_queues;
1062     g_autofree VirtIOBlockReq **vq_rq = NULL;
1063     VirtIOBlockReq *rq = NULL;
1064 
1065     if (!running) {
1066         return;
1067     }
1068 
1069     /* Split the device-wide s->rq request list into per-vq request lists */
1070     vq_rq = g_new0(VirtIOBlockReq *, num_queues);
1071 
1072     WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
1073         rq = s->rq;
1074         s->rq = NULL;
1075     }
1076 
1077     while (rq) {
1078         VirtIOBlockReq *next = rq->next;
1079         uint16_t idx = virtio_get_queue_index(rq->vq);
1080 
1081         /* Only num_queues vqs were created so vq_rq[idx] is within bounds */
1082         assert(idx < num_queues);
1083         rq->next = vq_rq[idx];
1084         vq_rq[idx] = rq;
1085         rq = next;
1086     }
1087 
1088     /* Schedule a BH to submit the requests in each vq's AioContext */
1089     for (uint16_t i = 0; i < num_queues; i++) {
1090         if (!vq_rq[i]) {
1091             continue;
1092         }
1093 
1094         /* Paired with dec in virtio_blk_dma_restart_bh() */
1095         blk_inc_in_flight(s->conf.conf.blk);
1096 
1097         aio_bh_schedule_oneshot(s->vq_aio_context[i],
1098                                 virtio_blk_dma_restart_bh,
1099                                 vq_rq[i]);
1100     }
1101 }
1102 
1103 static void virtio_blk_reset(VirtIODevice *vdev)
1104 {
1105     VirtIOBlock *s = VIRTIO_BLK(vdev);
1106     VirtIOBlockReq *req;
1107 
1108     /* Dataplane has stopped... */
1109     assert(!s->ioeventfd_started);
1110 
1111     /* ...but requests may still be in flight. */
1112     blk_drain(s->blk);
1113 
1114     /* We drop queued requests after blk_drain() because blk_drain() itself can
1115      * produce them. */
1116     WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
1117         while (s->rq) {
1118             req = s->rq;
1119             s->rq = req->next;
1120 
1121             /* No other threads can access req->vq here */
1122             virtqueue_detach_element(req->vq, &req->elem, 0);
1123 
1124             virtio_blk_free_request(req);
1125         }
1126     }
1127 
1128     blk_set_enable_write_cache(s->blk, s->original_wce);
1129 }
1130 
1131 /* coalesce internal state, copy to pci i/o region 0
1132  */
1133 static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
1134 {
1135     VirtIOBlock *s = VIRTIO_BLK(vdev);
1136     BlockConf *conf = &s->conf.conf;
1137     BlockDriverState *bs = blk_bs(s->blk);
1138     struct virtio_blk_config blkcfg;
1139     uint64_t capacity;
1140     int64_t length;
1141     int blk_size = conf->logical_block_size;
1142 
1143     blk_get_geometry(s->blk, &capacity);
1144     memset(&blkcfg, 0, sizeof(blkcfg));
1145     virtio_stq_p(vdev, &blkcfg.capacity, capacity);
1146     virtio_stl_p(vdev, &blkcfg.seg_max,
1147                  s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
1148     virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
1149     virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
1150     virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
1151     virtio_stl_p(vdev, &blkcfg.opt_io_size, conf->opt_io_size / blk_size);
1152     blkcfg.geometry.heads = conf->heads;
1153     /*
1154      * We must ensure that the block device capacity is a multiple of
1155      * the logical block size. If that is not the case, let's use
1156      * sector_mask to adopt the geometry to have a correct picture.
1157      * For those devices where the capacity is ok for the given geometry
1158      * we don't touch the sector value of the geometry, since some devices
1159      * (like s390 dasd) need a specific value. Here the capacity is already
1160      * cyls*heads*secs*blk_size and the sector value is not block size
1161      * divided by 512 - instead it is the amount of blk_size blocks
1162      * per track (cylinder).
1163      */
1164     length = blk_getlength(s->blk);
1165     if (length > 0 && length / conf->heads / conf->secs % blk_size) {
1166         blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
1167     } else {
1168         blkcfg.geometry.sectors = conf->secs;
1169     }
1170     blkcfg.size_max = 0;
1171     blkcfg.physical_block_exp = get_physical_block_exp(conf);
1172     blkcfg.alignment_offset = 0;
1173     blkcfg.wce = blk_enable_write_cache(s->blk);
1174     virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
1175     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD)) {
1176         uint32_t discard_granularity = conf->discard_granularity;
1177         if (discard_granularity == -1 || !s->conf.report_discard_granularity) {
1178             discard_granularity = blk_size;
1179         }
1180         virtio_stl_p(vdev, &blkcfg.max_discard_sectors,
1181                      s->conf.max_discard_sectors);
1182         virtio_stl_p(vdev, &blkcfg.discard_sector_alignment,
1183                      discard_granularity >> BDRV_SECTOR_BITS);
1184         /*
1185          * We support only one segment per request since multiple segments
1186          * are not widely used and there are no userspace APIs that allow
1187          * applications to submit multiple segments in a single call.
1188          */
1189         virtio_stl_p(vdev, &blkcfg.max_discard_seg, 1);
1190     }
1191     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_WRITE_ZEROES)) {
1192         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_sectors,
1193                      s->conf.max_write_zeroes_sectors);
1194         blkcfg.write_zeroes_may_unmap = 1;
1195         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
1196     }
1197     if (bs->bl.zoned != BLK_Z_NONE) {
1198         switch (bs->bl.zoned) {
1199         case BLK_Z_HM:
1200             blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
1201             break;
1202         case BLK_Z_HA:
1203             blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
1204             break;
1205         default:
1206             g_assert_not_reached();
1207         }
1208 
1209         virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
1210                      bs->bl.zone_size / 512);
1211         virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
1212                      bs->bl.max_active_zones);
1213         virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
1214                      bs->bl.max_open_zones);
1215         virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
1216         virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
1217                      bs->bl.max_append_sectors);
1218     } else {
1219         blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
1220     }
1221     memcpy(config, &blkcfg, s->config_size);
1222 }
1223 
1224 static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
1225 {
1226     VirtIOBlock *s = VIRTIO_BLK(vdev);
1227     struct virtio_blk_config blkcfg;
1228 
1229     memcpy(&blkcfg, config, s->config_size);
1230 
1231     blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
1232 }
1233 
1234 static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
1235                                         Error **errp)
1236 {
1237     VirtIOBlock *s = VIRTIO_BLK(vdev);
1238 
1239     /* Firstly sync all virtio-blk possible supported features */
1240     features |= s->host_features;
1241 
1242     virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
1243     virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
1244     virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
1245     virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
1246     if (!virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
1247         virtio_clear_feature(&features, VIRTIO_F_ANY_LAYOUT);
1248         /* Added for historical reasons, removing it could break migration.  */
1249         virtio_add_feature(&features, VIRTIO_BLK_F_SCSI);
1250     }
1251 
1252     if (blk_enable_write_cache(s->blk) ||
1253         (s->conf.x_enable_wce_if_config_wce &&
1254          virtio_has_feature(features, VIRTIO_BLK_F_CONFIG_WCE))) {
1255         virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
1256     }
1257     if (!blk_is_writable(s->blk)) {
1258         virtio_add_feature(&features, VIRTIO_BLK_F_RO);
1259     }
1260     if (s->conf.num_queues > 1) {
1261         virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
1262     }
1263 
1264     return features;
1265 }
1266 
1267 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
1268 {
1269     VirtIOBlock *s = VIRTIO_BLK(vdev);
1270 
1271     if (!(status & (VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK))) {
1272         assert(!s->ioeventfd_started);
1273     }
1274 
1275     if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
1276         return;
1277     }
1278 
1279     /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
1280      * cache flushes.  Thus, the "auto writethrough" behavior is never
1281      * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
1282      * Leaving it enabled would break the following sequence:
1283      *
1284      *     Guest started with "-drive cache=writethrough"
1285      *     Guest sets status to 0
1286      *     Guest sets DRIVER bit in status field
1287      *     Guest reads host features (WCE=0, CONFIG_WCE=1)
1288      *     Guest writes guest features (WCE=0, CONFIG_WCE=1)
1289      *     Guest writes 1 to the WCE configuration field (writeback mode)
1290      *     Guest sets DRIVER_OK bit in status field
1291      *
1292      * s->blk would erroneously be placed in writethrough mode.
1293      */
1294     if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
1295         blk_set_enable_write_cache(s->blk,
1296                                    virtio_vdev_has_feature(vdev,
1297                                                            VIRTIO_BLK_F_WCE));
1298     }
1299 }
1300 
1301 static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
1302 {
1303     VirtIOBlock *s = VIRTIO_BLK(vdev);
1304 
1305     WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
1306         VirtIOBlockReq *req = s->rq;
1307 
1308         while (req) {
1309             qemu_put_sbyte(f, 1);
1310 
1311             if (s->conf.num_queues > 1) {
1312                 qemu_put_be32(f, virtio_get_queue_index(req->vq));
1313             }
1314 
1315             qemu_put_virtqueue_element(vdev, f, &req->elem);
1316             req = req->next;
1317         }
1318     }
1319 
1320     qemu_put_sbyte(f, 0);
1321 }
1322 
1323 static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
1324                                   int version_id)
1325 {
1326     VirtIOBlock *s = VIRTIO_BLK(vdev);
1327 
1328     while (qemu_get_sbyte(f)) {
1329         unsigned nvqs = s->conf.num_queues;
1330         unsigned vq_idx = 0;
1331         VirtIOBlockReq *req;
1332 
1333         if (nvqs > 1) {
1334             vq_idx = qemu_get_be32(f);
1335 
1336             if (vq_idx >= nvqs) {
1337                 error_report("Invalid virtqueue index in request list: %#x",
1338                              vq_idx);
1339                 return -EINVAL;
1340             }
1341         }
1342 
1343         req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq));
1344         virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
1345 
1346         WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
1347             req->next = s->rq;
1348             s->rq = req;
1349         }
1350     }
1351 
1352     return 0;
1353 }
1354 
1355 static void virtio_resize_cb(void *opaque)
1356 {
1357     VirtIODevice *vdev = opaque;
1358 
1359     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
1360     virtio_notify_config(vdev);
1361 }
1362 
1363 static void virtio_blk_resize(void *opaque)
1364 {
1365     VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
1366 
1367     /*
1368      * virtio_notify_config() needs to acquire the BQL,
1369      * so it can't be called from an iothread. Instead, schedule
1370      * it to be run in the main context BH.
1371      */
1372     aio_bh_schedule_oneshot(qemu_get_aio_context(), virtio_resize_cb, vdev);
1373 }
1374 
1375 static void virtio_blk_ioeventfd_detach(VirtIOBlock *s)
1376 {
1377     VirtIODevice *vdev = VIRTIO_DEVICE(s);
1378 
1379     for (uint16_t i = 0; i < s->conf.num_queues; i++) {
1380         VirtQueue *vq = virtio_get_queue(vdev, i);
1381         virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]);
1382     }
1383 }
1384 
1385 static void virtio_blk_ioeventfd_attach(VirtIOBlock *s)
1386 {
1387     VirtIODevice *vdev = VIRTIO_DEVICE(s);
1388 
1389     for (uint16_t i = 0; i < s->conf.num_queues; i++) {
1390         VirtQueue *vq = virtio_get_queue(vdev, i);
1391         virtio_queue_aio_attach_host_notifier(vq, s->vq_aio_context[i]);
1392     }
1393 }
1394 
1395 /* Suspend virtqueue ioeventfd processing during drain */
1396 static void virtio_blk_drained_begin(void *opaque)
1397 {
1398     VirtIOBlock *s = opaque;
1399 
1400     if (s->ioeventfd_started) {
1401         virtio_blk_ioeventfd_detach(s);
1402     }
1403 }
1404 
1405 /* Resume virtqueue ioeventfd processing after drain */
1406 static void virtio_blk_drained_end(void *opaque)
1407 {
1408     VirtIOBlock *s = opaque;
1409 
1410     if (s->ioeventfd_started) {
1411         virtio_blk_ioeventfd_attach(s);
1412     }
1413 }
1414 
1415 static const BlockDevOps virtio_block_ops = {
1416     .resize_cb     = virtio_blk_resize,
1417     .drained_begin = virtio_blk_drained_begin,
1418     .drained_end   = virtio_blk_drained_end,
1419 };
1420 
1421 static bool
1422 validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list,
1423         uint16_t num_queues, Error **errp)
1424 {
1425     g_autofree unsigned long *vqs = bitmap_new(num_queues);
1426     g_autoptr(GHashTable) iothreads =
1427         g_hash_table_new(g_str_hash, g_str_equal);
1428 
1429     for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
1430         const char *name = node->value->iothread;
1431         uint16List *vq;
1432 
1433         if (!iothread_by_id(name)) {
1434             error_setg(errp, "IOThread \"%s\" object does not exist", name);
1435             return false;
1436         }
1437 
1438         if (!g_hash_table_add(iothreads, (gpointer)name)) {
1439             error_setg(errp,
1440                     "duplicate IOThread name \"%s\" in iothread-vq-mapping",
1441                     name);
1442             return false;
1443         }
1444 
1445         if (node != list) {
1446             if (!!node->value->vqs != !!list->value->vqs) {
1447                 error_setg(errp, "either all items in iothread-vq-mapping "
1448                                  "must have vqs or none of them must have it");
1449                 return false;
1450             }
1451         }
1452 
1453         for (vq = node->value->vqs; vq; vq = vq->next) {
1454             if (vq->value >= num_queues) {
1455                 error_setg(errp, "vq index %u for IOThread \"%s\" must be "
1456                         "less than num_queues %u in iothread-vq-mapping",
1457                         vq->value, name, num_queues);
1458                 return false;
1459             }
1460 
1461             if (test_and_set_bit(vq->value, vqs)) {
1462                 error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
1463                         "because it is already assigned", vq->value, name);
1464                 return false;
1465             }
1466         }
1467     }
1468 
1469     if (list->value->vqs) {
1470         for (uint16_t i = 0; i < num_queues; i++) {
1471             if (!test_bit(i, vqs)) {
1472                 error_setg(errp,
1473                         "missing vq %u IOThread assignment in iothread-vq-mapping",
1474                         i);
1475                 return false;
1476             }
1477         }
1478     }
1479 
1480     return true;
1481 }
1482 
1483 /**
1484  * apply_iothread_vq_mapping:
1485  * @iothread_vq_mapping_list: The mapping of virtqueues to IOThreads.
1486  * @vq_aio_context: The array of AioContext pointers to fill in.
1487  * @num_queues: The length of @vq_aio_context.
1488  * @errp: If an error occurs, a pointer to the area to store the error.
1489  *
1490  * Fill in the AioContext for each virtqueue in the @vq_aio_context array given
1491  * the iothread-vq-mapping parameter in @iothread_vq_mapping_list.
1492  *
1493  * Returns: %true on success, %false on failure.
1494  **/
1495 static bool apply_iothread_vq_mapping(
1496         IOThreadVirtQueueMappingList *iothread_vq_mapping_list,
1497         AioContext **vq_aio_context,
1498         uint16_t num_queues,
1499         Error **errp)
1500 {
1501     IOThreadVirtQueueMappingList *node;
1502     size_t num_iothreads = 0;
1503     size_t cur_iothread = 0;
1504 
1505     if (!validate_iothread_vq_mapping_list(iothread_vq_mapping_list,
1506                                            num_queues, errp)) {
1507         return false;
1508     }
1509 
1510     for (node = iothread_vq_mapping_list; node; node = node->next) {
1511         num_iothreads++;
1512     }
1513 
1514     for (node = iothread_vq_mapping_list; node; node = node->next) {
1515         IOThread *iothread = iothread_by_id(node->value->iothread);
1516         AioContext *ctx = iothread_get_aio_context(iothread);
1517 
1518         /* Released in virtio_blk_vq_aio_context_cleanup() */
1519         object_ref(OBJECT(iothread));
1520 
1521         if (node->value->vqs) {
1522             uint16List *vq;
1523 
1524             /* Explicit vq:IOThread assignment */
1525             for (vq = node->value->vqs; vq; vq = vq->next) {
1526                 assert(vq->value < num_queues);
1527                 vq_aio_context[vq->value] = ctx;
1528             }
1529         } else {
1530             /* Round-robin vq:IOThread assignment */
1531             for (unsigned i = cur_iothread; i < num_queues;
1532                  i += num_iothreads) {
1533                 vq_aio_context[i] = ctx;
1534             }
1535         }
1536 
1537         cur_iothread++;
1538     }
1539 
1540     return true;
1541 }
1542 
1543 /* Context: BQL held */
1544 static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
1545 {
1546     ERRP_GUARD();
1547     VirtIODevice *vdev = VIRTIO_DEVICE(s);
1548     VirtIOBlkConf *conf = &s->conf;
1549     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1550     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1551 
1552     if (conf->iothread && conf->iothread_vq_mapping_list) {
1553         error_setg(errp,
1554                    "iothread and iothread-vq-mapping properties cannot be set "
1555                    "at the same time");
1556         return false;
1557     }
1558 
1559     if (conf->iothread || conf->iothread_vq_mapping_list) {
1560         if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
1561             error_setg(errp,
1562                        "device is incompatible with iothread "
1563                        "(transport does not support notifiers)");
1564             return false;
1565         }
1566         if (!virtio_device_ioeventfd_enabled(vdev)) {
1567             error_setg(errp, "ioeventfd is required for iothread");
1568             return false;
1569         }
1570 
1571         /*
1572          * If ioeventfd is (re-)enabled while the guest is running there could
1573          * be block jobs that can conflict.
1574          */
1575         if (blk_op_is_blocked(conf->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
1576             error_prepend(errp, "cannot start virtio-blk ioeventfd: ");
1577             return false;
1578         }
1579     }
1580 
1581     s->vq_aio_context = g_new(AioContext *, conf->num_queues);
1582 
1583     if (conf->iothread_vq_mapping_list) {
1584         if (!apply_iothread_vq_mapping(conf->iothread_vq_mapping_list,
1585                                        s->vq_aio_context,
1586                                        conf->num_queues,
1587                                        errp)) {
1588             g_free(s->vq_aio_context);
1589             s->vq_aio_context = NULL;
1590             return false;
1591         }
1592     } else if (conf->iothread) {
1593         AioContext *ctx = iothread_get_aio_context(conf->iothread);
1594         for (unsigned i = 0; i < conf->num_queues; i++) {
1595             s->vq_aio_context[i] = ctx;
1596         }
1597 
1598         /* Released in virtio_blk_vq_aio_context_cleanup() */
1599         object_ref(OBJECT(conf->iothread));
1600     } else {
1601         AioContext *ctx = qemu_get_aio_context();
1602         for (unsigned i = 0; i < conf->num_queues; i++) {
1603             s->vq_aio_context[i] = ctx;
1604         }
1605     }
1606 
1607     return true;
1608 }
1609 
1610 /* Context: BQL held */
1611 static void virtio_blk_vq_aio_context_cleanup(VirtIOBlock *s)
1612 {
1613     VirtIOBlkConf *conf = &s->conf;
1614 
1615     assert(!s->ioeventfd_started);
1616 
1617     if (conf->iothread_vq_mapping_list) {
1618         IOThreadVirtQueueMappingList *node;
1619 
1620         for (node = conf->iothread_vq_mapping_list; node; node = node->next) {
1621             IOThread *iothread = iothread_by_id(node->value->iothread);
1622             object_unref(OBJECT(iothread));
1623         }
1624     }
1625 
1626     if (conf->iothread) {
1627         object_unref(OBJECT(conf->iothread));
1628     }
1629 
1630     g_free(s->vq_aio_context);
1631     s->vq_aio_context = NULL;
1632 }
1633 
1634 /* Context: BQL held */
1635 static int virtio_blk_start_ioeventfd(VirtIODevice *vdev)
1636 {
1637     VirtIOBlock *s = VIRTIO_BLK(vdev);
1638     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
1639     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1640     unsigned i;
1641     unsigned nvqs = s->conf.num_queues;
1642     Error *local_err = NULL;
1643     int r;
1644 
1645     if (s->ioeventfd_started || s->ioeventfd_starting) {
1646         return 0;
1647     }
1648 
1649     s->ioeventfd_starting = true;
1650 
1651     /* Set up guest notifier (irq) */
1652     r = k->set_guest_notifiers(qbus->parent, nvqs, true);
1653     if (r != 0) {
1654         error_report("virtio-blk failed to set guest notifier (%d), "
1655                      "ensure -accel kvm is set.", r);
1656         goto fail_guest_notifiers;
1657     }
1658 
1659     /*
1660      * Batch all the host notifiers in a single transaction to avoid
1661      * quadratic time complexity in address_space_update_ioeventfds().
1662      */
1663     memory_region_transaction_begin();
1664 
1665     /* Set up virtqueue notify */
1666     for (i = 0; i < nvqs; i++) {
1667         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, true);
1668         if (r != 0) {
1669             int j = i;
1670 
1671             fprintf(stderr, "virtio-blk failed to set host notifier (%d)\n", r);
1672             while (i--) {
1673                 virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
1674             }
1675 
1676             /*
1677              * The transaction expects the ioeventfds to be open when it
1678              * commits. Do it now, before the cleanup loop.
1679              */
1680             memory_region_transaction_commit();
1681 
1682             while (j--) {
1683                 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), j);
1684             }
1685             goto fail_host_notifiers;
1686         }
1687     }
1688 
1689     memory_region_transaction_commit();
1690 
1691     /*
1692      * Try to change the AioContext so that block jobs and other operations can
1693      * co-locate their activity in the same AioContext. If it fails, nevermind.
1694      */
1695     assert(nvqs > 0); /* enforced during ->realize() */
1696     r = blk_set_aio_context(s->conf.conf.blk, s->vq_aio_context[0],
1697                             &local_err);
1698     if (r < 0) {
1699         warn_report_err(local_err);
1700     }
1701 
1702     /*
1703      * These fields must be visible to the IOThread when it processes the
1704      * virtqueue, otherwise it will think ioeventfd has not started yet.
1705      *
1706      * Make sure ->ioeventfd_started is false when blk_set_aio_context() is
1707      * called above so that draining does not cause the host notifier to be
1708      * detached/attached prematurely.
1709      */
1710     s->ioeventfd_starting = false;
1711     s->ioeventfd_started = true;
1712     smp_wmb(); /* paired with aio_notify_accept() on the read side */
1713 
1714     /*
1715      * Get this show started by hooking up our callbacks.  If drained now,
1716      * virtio_blk_drained_end() will do this later.
1717      * Attaching the notifier also kicks the virtqueues, processing any requests
1718      * they may already have.
1719      */
1720     if (!blk_in_drain(s->conf.conf.blk)) {
1721         virtio_blk_ioeventfd_attach(s);
1722     }
1723     return 0;
1724 
1725   fail_host_notifiers:
1726     k->set_guest_notifiers(qbus->parent, nvqs, false);
1727   fail_guest_notifiers:
1728     s->ioeventfd_disabled = true;
1729     s->ioeventfd_starting = false;
1730     return -ENOSYS;
1731 }
1732 
1733 /* Stop notifications for new requests from guest.
1734  *
1735  * Context: BH in IOThread
      *
      * @opaque: the VirtQueue whose host notifier is detached from the
      *          AioContext this bottom half is currently running in.
1736  */
1737 static void virtio_blk_ioeventfd_stop_vq_bh(void *opaque)
1738 {
1739     VirtQueue *vq = opaque;
1740     EventNotifier *host_notifier = virtio_queue_get_host_notifier(vq);
1741 
         /* Detach from whichever AioContext is executing this BH. */
1742     virtio_queue_aio_detach_host_notifier(vq, qemu_get_current_aio_context());
1743 
1744     /*
1745      * Test and clear notifier after disabling event, in case poll callback
1746      * didn't have time to run.
1747      */
1748     virtio_queue_host_notifier_read(host_notifier);
1749 }
1750 
1751 /* Context: BQL held
      *
      * Stop ioeventfd-based virtqueue processing for @vdev: detach and clean up
      * the per-queue host notifiers, drain the BlockBackend, try to move it back
      * to the main loop AioContext and release the guest notifiers.
      *
      * Returns immediately if ioeventfd is not started or if a stop is already
      * in progress (->ioeventfd_stopping is set for the duration of this call).
      */
1752 static void virtio_blk_stop_ioeventfd(VirtIODevice *vdev)
1753 {
1754     VirtIOBlock *s = VIRTIO_BLK(vdev);
1755     BusState *qbus = qdev_get_parent_bus(DEVICE(s));
1756     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1757     unsigned i;
1758     unsigned nvqs = s->conf.num_queues;
1759 
         /* Nothing to stop, or a stop is already running. */
1760     if (!s->ioeventfd_started || s->ioeventfd_stopping) {
1761         return;
1762     }
1763 
1764     /* Better luck next time. */
         /*
          * NOTE(review): ->ioeventfd_disabled is set by realize when the
          * transport lacks notifier support and by the failure path of the
          * start routine; presumably no notifiers were set up in that case, so
          * only the flags are reset here -- confirm.
          */
1765     if (s->ioeventfd_disabled) {
1766         s->ioeventfd_disabled = false;
1767         s->ioeventfd_started = false;
1768         return;
1769     }
1770     s->ioeventfd_stopping = true;
1771 
         /*
          * Detach each queue's host notifier from inside its own AioContext and
          * wait for that to finish before moving on to the next queue.
          * NOTE(review): this is skipped while the BlockBackend is drained,
          * presumably because the notifiers are already detached then --
          * confirm against the drained_begin callback.
          */
1772     if (!blk_in_drain(s->conf.conf.blk)) {
1773         for (i = 0; i < nvqs; i++) {
1774             VirtQueue *vq = virtio_get_queue(vdev, i);
1775             AioContext *ctx = s->vq_aio_context[i];
1776 
1777             aio_wait_bh_oneshot(ctx, virtio_blk_ioeventfd_stop_vq_bh, vq);
1778         }
1779     }
1780 
1781     /*
1782      * Batch all the host notifiers in a single transaction to avoid
1783      * quadratic time complexity in address_space_update_ioeventfds().
1784      */
1785     memory_region_transaction_begin();
1786 
1787     for (i = 0; i < nvqs; i++) {
1788         virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
1789     }
1790 
1791     /*
1792      * The transaction expects the ioeventfds to be open when it
1793      * commits. Do it now, before the cleanup loop.
1794      */
1795     memory_region_transaction_commit();
1796 
1797     for (i = 0; i < nvqs; i++) {
1798         virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
1799     }
1800 
1801     /*
1802      * Set ->ioeventfd_started to false before draining so that host notifiers
1803      * are not detached/attached anymore.
1804      */
1805     s->ioeventfd_started = false;
1806 
1807     /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
1808     blk_drain(s->conf.conf.blk);
1809 
1810     /*
1811      * Try to switch bs back to the QEMU main loop. If other users keep the
1812      * BlockBackend in the iothread, that's ok
          * (hence the return value is ignored and no Error object is requested).
1813      */
1814     blk_set_aio_context(s->conf.conf.blk, qemu_get_aio_context(), NULL);
1815 
1816     /* Clean up guest notifier (irq) */
1817     k->set_guest_notifiers(qbus->parent, nvqs, false);
1818 
         /* Stop sequence finished; allow a future stop request to run again. */
1819     s->ioeventfd_stopping = false;
1820 }
1821 
1822 static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
1823 {
         /*
          * Realize callback for the virtio-blk device.
          *
          * Validates the user-supplied properties, applies the block backend
          * configuration, derives the feature-dependent config size, creates
          * the virtqueues and sets up the per-queue AioContexts.
          *
          * @dev:  the device being realized
          * @errp: set on failure; the function then returns with everything it
          *        set up since virtio_init() released again
          */
1824     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
1825     VirtIOBlock *s = VIRTIO_BLK(dev);
1826     VirtIOBlkConf *conf = &s->conf;
1827     BlockDriverState *bs;
1828     Error *err = NULL;
1829     unsigned i;
1830 
1831     if (!conf->conf.blk) {
1832         error_setg(errp, "drive property not set");
1833         return;
1834     }
1835     if (!blk_is_inserted(conf->conf.blk)) {
1836         error_setg(errp, "Device needs media, but drive is empty");
1837         return;
1838     }
         /* "auto" queue count resolves to a single queue at this level. */
1839     if (conf->num_queues == VIRTIO_BLK_AUTO_NUM_QUEUES) {
1840         conf->num_queues = 1;
1841     }
1842     if (!conf->num_queues) {
1843         error_setg(errp, "num-queues property must be larger than 0");
1844         return;
1845     }
1846     if (conf->queue_size <= 2) {
1847         error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
1848                    "must be > 2", conf->queue_size);
1849         return;
1850     }
1851     if (!is_power_of_2(conf->queue_size) ||
1852         conf->queue_size > VIRTQUEUE_MAX_SIZE) {
1853         error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
1854                    "must be a power of 2 (max %d)",
1855                    conf->queue_size, VIRTQUEUE_MAX_SIZE);
1856         return;
1857     }
1858 
1859     if (!blkconf_apply_backend_options(&conf->conf,
1860                                        !blk_supports_write_perm(conf->conf.blk),
1861                                        true, errp)) {
1862         return;
1863     }
         /* Remember the backend's write-cache setting as configured by the user. */
1864     s->original_wce = blk_enable_write_cache(conf->conf.blk);
1865     if (!blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, errp)) {
1866         return;
1867     }
1868 
1869     if (!blkconf_blocksizes(&conf->conf, errp)) {
1870         return;
1871     }
1872 
         /*
          * Zoned backends advertise VIRTIO_BLK_F_ZONED; for the BLK_Z_HM model
          * the discard feature is withdrawn as well.
          */
1873     bs = blk_bs(conf->conf.blk);
1874     if (bs->bl.zoned != BLK_Z_NONE) {
1875         virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
1876         if (bs->bl.zoned == BLK_Z_HM) {
1877             virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
1878         }
1879     }
1880 
         /* The sector limits are only validated for features actually offered. */
1881     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
1882         (!conf->max_discard_sectors ||
1883          conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
1884         error_setg(errp, "invalid max-discard-sectors property (%" PRIu32 ")"
1885                    ", must be between 1 and %d",
1886                    conf->max_discard_sectors, (int)BDRV_REQUEST_MAX_SECTORS);
1887         return;
1888     }
1889 
1890     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_WRITE_ZEROES) &&
1891         (!conf->max_write_zeroes_sectors ||
1892          conf->max_write_zeroes_sectors > BDRV_REQUEST_MAX_SECTORS)) {
1893         error_setg(errp, "invalid max-write-zeroes-sectors property (%" PRIu32
1894                    "), must be between 1 and %d",
1895                    conf->max_write_zeroes_sectors,
1896                    (int)BDRV_REQUEST_MAX_SECTORS);
1897         return;
1898     }
1899 
         /*
          * All early returns above happen before virtio_init(), so they need no
          * virtio teardown. From here on, failures must undo the setup below.
          */
1900     s->config_size = virtio_get_config_size(&virtio_blk_cfg_size_params,
1901                                             s->host_features);
1902     virtio_init(vdev, VIRTIO_ID_BLOCK, s->config_size);
1903 
1904     qemu_mutex_init(&s->rq_lock);
1905 
1906     s->blk = conf->conf.blk;
1907     s->rq = NULL;
         /* Logical block size expressed in BDRV_SECTOR_SIZE units, minus one. */
1908     s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
1909 
1910     for (i = 0; i < conf->num_queues; i++) {
1911         virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
1912     }
         /* Balanced by qemu_coroutine_dec_pool_size() on failure and unrealize. */
1913     qemu_coroutine_inc_pool_size(conf->num_queues * conf->queue_size / 2);
1914 
1915     /* Don't start ioeventfd if transport does not support notifiers. */
1916     if (!virtio_device_ioeventfd_enabled(vdev)) {
1917         s->ioeventfd_disabled = true;
1918     }
1919 
1920     virtio_blk_vq_aio_context_init(s, &err);
1921     if (err != NULL) {
1922         error_propagate(errp, err);
             /*
              * Undo everything done since virtio_init(), in the same order as
              * virtio_blk_device_unrealize(): queues, coroutine pool
              * reservation, request lock, then the virtio core state.
              */
1923         for (i = 0; i < conf->num_queues; i++) {
1924             virtio_del_queue(vdev, i);
1925         }
             qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
             qemu_mutex_destroy(&s->rq_lock);
1926         virtio_cleanup(vdev);
1927         return;
1928     }
1929 
1930     /*
1931      * This must be after virtio_init() so virtio_blk_dma_restart_cb() gets
1932      * called after ->start_ioeventfd() has already set blk's AioContext.
1933      */
1934     s->change =
1935         qdev_add_vm_change_state_handler(dev, virtio_blk_dma_restart_cb, s);
1936 
1937     blk_ram_registrar_init(&s->blk_ram_registrar, s->blk);
1938     blk_set_dev_ops(s->blk, &virtio_block_ops, s);
1939 
1940     blk_iostatus_enable(s->blk);
1941 
1942     add_boot_device_lchs(dev, "/disk@0,0",
1943                          conf->conf.lcyls,
1944                          conf->conf.lheads,
1945                          conf->conf.lsecs);
1946 }
1947 
1948 static void virtio_blk_device_unrealize(DeviceState *dev)
1949 {
         /*
          * Unrealize callback: releases what virtio_blk_device_realize() set up
          * (boot device entry, per-queue AioContexts, virtqueues, coroutine
          * pool reservation, request lock, RAM registrar, VM change state
          * handler) and finally the virtio core state.
          */
1950     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
1951     VirtIOBlock *s = VIRTIO_BLK(dev);
1952     VirtIOBlkConf *conf = &s->conf;
1953     unsigned i;
1954 
         /* Drain the BlockBackend before any device state is torn down. */
1955     blk_drain(s->blk);
1956     del_boot_device_lchs(dev, "/disk@0,0");
1957     virtio_blk_vq_aio_context_cleanup(s);
1958     for (i = 0; i < conf->num_queues; i++) {
1959         virtio_del_queue(vdev, i);
1960     }
         /* Same amount that realize added with qemu_coroutine_inc_pool_size(). */
1961     qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
1962     qemu_mutex_destroy(&s->rq_lock);
1963     blk_ram_registrar_destroy(&s->blk_ram_registrar);
1964     qemu_del_vm_change_state_handler(s->change);
1965     blockdev_mark_auto_del(s->blk);
1966     virtio_cleanup(vdev);
1967 }
1968 
1969 static void virtio_blk_instance_init(Object *obj)
1970 {
         /*
          * Instance initializer: exposes a "bootindex" property backed by
          * conf.conf.bootindex, using the same "/disk@0,0" suffix that
          * realize/unrealize pass to add/del_boot_device_lchs().
          */
1971     VirtIOBlock *s = VIRTIO_BLK(obj);
1972 
1973     device_add_bootindex_property(obj, &s->conf.conf.bootindex,
1974                                   "bootindex", "/disk@0,0",
1975                                   DEVICE(obj));
1976 }
1977 
1978 static const VMStateDescription vmstate_virtio_blk = {
         /*
          * Migration description installed as dc->vmsd in
          * virtio_blk_class_init(). Current and minimum accepted version are
          * both 2. The only field is VMSTATE_VIRTIO_DEVICE; device-specific
          * state presumably goes through the vdc->save/->load callbacks --
          * TODO confirm.
          */
1979     .name = "virtio-blk",
1980     .minimum_version_id = 2,
1981     .version_id = 2,
1982     .fields = (const VMStateField[]) {
1983         VMSTATE_VIRTIO_DEVICE,
1984         VMSTATE_END_OF_LIST()
1985     },
1986 };
1987 
1988 static Property virtio_blk_properties[] = {
         /*
          * User-settable device properties, installed by
          * virtio_blk_class_init(). Most entries map onto fields of
          * VirtIOBlock.conf; "config-wce", "discard" and "write-zeroes" instead
          * toggle feature bits directly in VirtIOBlock.host_features.
          * NOTE(review): the trailing argument of the scalar/bit entries is
          * presumably the default value -- confirm against the DEFINE_PROP_*
          * macro definitions.
          */
1989     DEFINE_BLOCK_PROPERTIES(VirtIOBlock, conf.conf),
1990     DEFINE_BLOCK_ERROR_PROPERTIES(VirtIOBlock, conf.conf),
1991     DEFINE_BLOCK_CHS_PROPERTIES(VirtIOBlock, conf.conf),
1992     DEFINE_PROP_STRING("serial", VirtIOBlock, conf.serial),
1993     DEFINE_PROP_BIT64("config-wce", VirtIOBlock, host_features,
1994                       VIRTIO_BLK_F_CONFIG_WCE, true),
1995     DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
1996                     true),
         /* realize turns VIRTIO_BLK_AUTO_NUM_QUEUES into 1 if still unset. */
1997     DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues,
1998                        VIRTIO_BLK_AUTO_NUM_QUEUES),
         /* realize requires a power of 2, > 2 and <= VIRTQUEUE_MAX_SIZE. */
1999     DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
2000     DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true),
2001     DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
2002                      IOThread *),
2003     DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOBlock,
2004                                          conf.iothread_vq_mapping_list),
2005     DEFINE_PROP_BIT64("discard", VirtIOBlock, host_features,
2006                       VIRTIO_BLK_F_DISCARD, true),
2007     DEFINE_PROP_BOOL("report-discard-granularity", VirtIOBlock,
2008                      conf.report_discard_granularity, true),
2009     DEFINE_PROP_BIT64("write-zeroes", VirtIOBlock, host_features,
2010                       VIRTIO_BLK_F_WRITE_ZEROES, true),
         /* realize rejects 0 or > BDRV_REQUEST_MAX_SECTORS when feature is on. */
2011     DEFINE_PROP_UINT32("max-discard-sectors", VirtIOBlock,
2012                        conf.max_discard_sectors, BDRV_REQUEST_MAX_SECTORS),
2013     DEFINE_PROP_UINT32("max-write-zeroes-sectors", VirtIOBlock,
2014                        conf.max_write_zeroes_sectors, BDRV_REQUEST_MAX_SECTORS),
2015     DEFINE_PROP_BOOL("x-enable-wce-if-config-wce", VirtIOBlock,
2016                      conf.x_enable_wce_if_config_wce, true),
2017     DEFINE_PROP_END_OF_LIST(),
2018 };
2019 
2020 static void virtio_blk_class_init(ObjectClass *klass, void *data)
2021 {
         /*
          * Class initializer: installs the property list and migration
          * description on the DeviceClass, marks the device as a storage
          * device, and fills in the VirtioDeviceClass callbacks.
          * @data is unused.
          */
2022     DeviceClass *dc = DEVICE_CLASS(klass);
2023     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
2024 
2025     device_class_set_props(dc, virtio_blk_properties);
2026     dc->vmsd = &vmstate_virtio_blk;
2027     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
2028     vdc->realize = virtio_blk_device_realize;
2029     vdc->unrealize = virtio_blk_device_unrealize;
2030     vdc->get_config = virtio_blk_update_config;
2031     vdc->set_config = virtio_blk_set_config;
2032     vdc->get_features = virtio_blk_get_features;
2033     vdc->set_status = virtio_blk_set_status;
2034     vdc->reset = virtio_blk_reset;
2035     vdc->save = virtio_blk_save_device;
2036     vdc->load = virtio_blk_load_device;
2037     vdc->start_ioeventfd = virtio_blk_start_ioeventfd;
2038     vdc->stop_ioeventfd = virtio_blk_stop_ioeventfd;
2039 }
2040 
2041 static const TypeInfo virtio_blk_info = {
         /*
          * Type description for TYPE_VIRTIO_BLK: a child of
          * TYPE_VIRTIO_DEVICE whose instances are VirtIOBlock structures,
          * initialized per instance by virtio_blk_instance_init() and per
          * class by virtio_blk_class_init().
          */
2042     .name = TYPE_VIRTIO_BLK,
2043     .parent = TYPE_VIRTIO_DEVICE,
2044     .instance_size = sizeof(VirtIOBlock),
2045     .instance_init = virtio_blk_instance_init,
2046     .class_init = virtio_blk_class_init,
2047 };
2048 
2049 static void virtio_register_types(void)
2050 {
         /* Register the virtio-blk type described by virtio_blk_info. */
2051     type_register_static(&virtio_blk_info);
2052 }
2053 
     /*
      * Hand virtio_register_types() to type_init(); presumably this makes it
      * run during QEMU's module/type initialization -- see qemu/module.h.
      */
2054 type_init(virtio_register_types)
2055