xref: /openbmc/qemu/hw/block/virtio-blk.c (revision 09a274d8)
1 /*
2  * Virtio Block Device
3  *
4  * Copyright IBM, Corp. 2007
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qapi/error.h"
16 #include "qemu-common.h"
17 #include "qemu/iov.h"
18 #include "qemu/error-report.h"
19 #include "trace.h"
20 #include "hw/block/block.h"
21 #include "sysemu/blockdev.h"
22 #include "hw/virtio/virtio-blk.h"
23 #include "dataplane/virtio-blk.h"
24 #include "scsi/constants.h"
25 #ifdef __linux__
26 # include <scsi/sg.h>
27 #endif
28 #include "hw/virtio/virtio-bus.h"
29 #include "hw/virtio/virtio-access.h"
30 
31 /* We don't support discard yet, so hide the associated config fields. */
32 #define VIRTIO_BLK_CFG_SIZE offsetof(struct virtio_blk_config, \
33                                      max_discard_sectors)
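/*
 * Illustration (descriptive note, not used by the build): with the
 * virtio_blk_config layout from the virtio specification, VIRTIO_BLK_CFG_SIZE
 * covers capacity, size_max, seg_max, geometry, blk_size, the topology
 * fields, wce and num_queues, and stops just before max_discard_sectors, so
 * guests never see the discard/write-zeroes related fields while they are
 * unsupported.
 */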
34 
35 static void virtio_blk_init_request(VirtIOBlock *s, VirtQueue *vq,
36                                     VirtIOBlockReq *req)
37 {
38     req->dev = s;
39     req->vq = vq;
40     req->qiov.size = 0;
41     req->in_len = 0;
42     req->next = NULL;
43     req->mr_next = NULL;
44 }
45 
46 static void virtio_blk_free_request(VirtIOBlockReq *req)
47 {
48     g_free(req);
49 }
50 
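/*
 * Complete a request: store the status byte into the guest-supplied in-header,
 * push the descriptor chain back to the guest with in_len bytes marked as
 * used, and raise the completion notification either through the dataplane
 * (ioeventfd) path or through the regular virtio_notify() path.
 */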
51 static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
52 {
53     VirtIOBlock *s = req->dev;
54     VirtIODevice *vdev = VIRTIO_DEVICE(s);
55 
56     trace_virtio_blk_req_complete(vdev, req, status);
57 
58     stb_p(&req->in->status, status);
59     virtqueue_push(req->vq, &req->elem, req->in_len);
60     if (s->dataplane_started && !s->dataplane_disabled) {
61         virtio_blk_data_plane_notify(s->dataplane, req->vq);
62     } else {
63         virtio_notify(vdev, req->vq);
64     }
65 }
66 
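/*
 * Handle an I/O error according to the configured rerror/werror policy.
 * Returns nonzero when the request has been taken care of here (parked on
 * s->rq for retry after the VM stops, or completed with VIRTIO_BLK_S_IOERR),
 * so the caller must not complete it again; returns 0 for
 * BLOCK_ERROR_ACTION_IGNORE, in which case the caller completes the request
 * as usual.
 */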
67 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
68     bool is_read)
69 {
70     VirtIOBlock *s = req->dev;
71     BlockErrorAction action = blk_get_error_action(s->blk, is_read, error);
72 
73     if (action == BLOCK_ERROR_ACTION_STOP) {
74         /* Break the link as the next request is going to be parsed from the
75          * ring again. Otherwise we may end up doing a double completion! */
76         req->mr_next = NULL;
77         req->next = s->rq;
78         s->rq = req;
79     } else if (action == BLOCK_ERROR_ACTION_REPORT) {
80         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
81         block_acct_failed(blk_get_stats(s->blk), &req->acct);
82         virtio_blk_free_request(req);
83     }
84 
85     blk_error_action(s->blk, action, is_read, error);
86     return action != BLOCK_ERROR_ACTION_IGNORE;
87 }
88 
89 static void virtio_blk_rw_complete(void *opaque, int ret)
90 {
91     VirtIOBlockReq *next = opaque;
92     VirtIOBlock *s = next->dev;
93     VirtIODevice *vdev = VIRTIO_DEVICE(s);
94 
95     aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
96     while (next) {
97         VirtIOBlockReq *req = next;
98         next = req->mr_next;
99         trace_virtio_blk_rw_complete(vdev, req, ret);
100 
101         if (req->qiov.nalloc != -1) {
102             /* If nalloc is != -1, req->qiov is a local copy of the original
103              * external iovec. It was allocated in submit_requests to be
104              * able to merge requests. */
105             qemu_iovec_destroy(&req->qiov);
106         }
107 
108         if (ret) {
109             int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
110             bool is_read = !(p & VIRTIO_BLK_T_OUT);
111             /* Note that memory may be dirtied on read failure.  If the
112              * virtio request is not completed here, as is the case for
113              * BLOCK_ERROR_ACTION_STOP, the memory may not be copied
114              * correctly during live migration.  While this is ugly,
115              * it is acceptable because the device is free to write to
116              * the memory until the request is completed (which will
117              * happen on the other side of the migration).
118              */
119             if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
120                 continue;
121             }
122         }
123 
124         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
125         block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
126         virtio_blk_free_request(req);
127     }
128     aio_context_release(blk_get_aio_context(s->conf.conf.blk));
129 }
130 
131 static void virtio_blk_flush_complete(void *opaque, int ret)
132 {
133     VirtIOBlockReq *req = opaque;
134     VirtIOBlock *s = req->dev;
135 
136     aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
137     if (ret) {
138         if (virtio_blk_handle_rw_error(req, -ret, 0)) {
139             goto out;
140         }
141     }
142 
143     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
144     block_acct_done(blk_get_stats(s->blk), &req->acct);
145     virtio_blk_free_request(req);
146 
147 out:
148     aio_context_release(blk_get_aio_context(s->conf.conf.blk));
149 }
150 
151 #ifdef __linux__
152 
153 typedef struct {
154     VirtIOBlockReq *req;
155     struct sg_io_hdr hdr;
156 } VirtIOBlockIoctlReq;
157 
158 static void virtio_blk_ioctl_complete(void *opaque, int status)
159 {
160     VirtIOBlockIoctlReq *ioctl_req = opaque;
161     VirtIOBlockReq *req = ioctl_req->req;
162     VirtIOBlock *s = req->dev;
163     VirtIODevice *vdev = VIRTIO_DEVICE(s);
164     struct virtio_scsi_inhdr *scsi;
165     struct sg_io_hdr *hdr;
166 
167     scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;
168 
169     if (status) {
170         status = VIRTIO_BLK_S_UNSUPP;
171         virtio_stl_p(vdev, &scsi->errors, 255);
172         goto out;
173     }
174 
175     hdr = &ioctl_req->hdr;
176     /*
177      * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi)
178      * clear the masked_status field [hence status gets cleared too, see
179      * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED
180      * status has occurred.  However they do set DRIVER_SENSE in driver_status
181      * field. Also a (sb_len_wr > 0) indicates there is a sense buffer.
182      */
183     if (hdr->status == 0 && hdr->sb_len_wr > 0) {
184         hdr->status = CHECK_CONDITION;
185     }
186 
187     virtio_stl_p(vdev, &scsi->errors,
188                  hdr->status | (hdr->msg_status << 8) |
189                  (hdr->host_status << 16) | (hdr->driver_status << 24));
190     virtio_stl_p(vdev, &scsi->residual, hdr->resid);
191     virtio_stl_p(vdev, &scsi->sense_len, hdr->sb_len_wr);
192     virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
193 
194 out:
195     aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
196     virtio_blk_req_complete(req, status);
197     virtio_blk_free_request(req);
198     aio_context_release(blk_get_aio_context(s->conf.conf.blk));
199     g_free(ioctl_req);
200 }
201 
202 #endif
203 
204 static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s, VirtQueue *vq)
205 {
206     VirtIOBlockReq *req = virtqueue_pop(vq, sizeof(VirtIOBlockReq));
207 
208     if (req) {
209         virtio_blk_init_request(s, vq, req);
210     }
211     return req;
212 }
213 
214 static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
215 {
216     int status = VIRTIO_BLK_S_OK;
217     struct virtio_scsi_inhdr *scsi = NULL;
218     VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
219     VirtQueueElement *elem = &req->elem;
220     VirtIOBlock *blk = req->dev;
221 
222 #ifdef __linux__
223     int i;
224     VirtIOBlockIoctlReq *ioctl_req;
225     BlockAIOCB *acb;
226 #endif
227 
228     /*
229      * We require at least one output segment each for the virtio_blk_outhdr
230      * and the SCSI command block.
231      *
232      * We also require at least the virtio_blk_inhdr, the virtio_scsi_inhdr,
233      * and the sense buffer pointer in the input segments.
234      */
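    /*
     * Buffer layout sketch for the legacy VIRTIO_BLK_T_SCSI_CMD convention,
     * matching the index arithmetic used below (illustrative only):
     *
     *   out_sg[0]                virtio_blk_outhdr
     *   out_sg[1]                SCSI command block (CDB)
     *   out_sg[2 .. out_num-1]   write payload, if any
     *   in_sg[0 .. in_num-4]     read payload, if any
     *   in_sg[in_num-3]          sense buffer
     *   in_sg[in_num-2]          virtio_scsi_inhdr
     *   in_sg[in_num-1]          virtio_blk_inhdr (status byte)
     */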
235     if (elem->out_num < 2 || elem->in_num < 3) {
236         status = VIRTIO_BLK_S_IOERR;
237         goto fail;
238     }
239 
240     /*
241      * The scsi inhdr is placed in the second-to-last input segment, just
242      * before the regular inhdr.
243      */
244     scsi = (void *)elem->in_sg[elem->in_num - 2].iov_base;
245 
246     if (!blk->conf.scsi) {
247         status = VIRTIO_BLK_S_UNSUPP;
248         goto fail;
249     }
250 
251     /*
252      * No support for bidirectional commands yet.
253      */
254     if (elem->out_num > 2 && elem->in_num > 3) {
255         status = VIRTIO_BLK_S_UNSUPP;
256         goto fail;
257     }
258 
259 #ifdef __linux__
260     ioctl_req = g_new0(VirtIOBlockIoctlReq, 1);
261     ioctl_req->req = req;
262     ioctl_req->hdr.interface_id = 'S';
263     ioctl_req->hdr.cmd_len = elem->out_sg[1].iov_len;
264     ioctl_req->hdr.cmdp = elem->out_sg[1].iov_base;
265     ioctl_req->hdr.dxfer_len = 0;
266 
267     if (elem->out_num > 2) {
268         /*
269          * If there are more than the minimally required 2 output segments,
270          * there is a write payload starting from the third iovec.
271          */
272         ioctl_req->hdr.dxfer_direction = SG_DXFER_TO_DEV;
273         ioctl_req->hdr.iovec_count = elem->out_num - 2;
274 
275         for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
276             ioctl_req->hdr.dxfer_len += elem->out_sg[i + 2].iov_len;
277         }
278 
279         ioctl_req->hdr.dxferp = elem->out_sg + 2;
280 
281     } else if (elem->in_num > 3) {
282         /*
283          * If we have more than 3 input segments, the guest actually wants to
284          * read data.
285          */
286         ioctl_req->hdr.dxfer_direction = SG_DXFER_FROM_DEV;
287         ioctl_req->hdr.iovec_count = elem->in_num - 3;
288         for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
289             ioctl_req->hdr.dxfer_len += elem->in_sg[i].iov_len;
290         }
291 
292         ioctl_req->hdr.dxferp = elem->in_sg;
293     } else {
294         /*
295          * Some SCSI commands don't actually transfer any data.
296          */
297         ioctl_req->hdr.dxfer_direction = SG_DXFER_NONE;
298     }
299 
300     ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
301     ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;
302 
303     acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
304                         virtio_blk_ioctl_complete, ioctl_req);
305     if (!acb) {
306         g_free(ioctl_req);
307         status = VIRTIO_BLK_S_UNSUPP;
308         goto fail;
309     }
310     return -EINPROGRESS;
311 #else
312     abort();
313 #endif
314 
315 fail:
316     /* Just put anything nonzero so that the ioctl fails in the guest.  */
317     if (scsi) {
318         virtio_stl_p(vdev, &scsi->errors, 255);
319     }
320     return status;
321 }
322 
323 static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
324 {
325     int status;
326 
327     status = virtio_blk_handle_scsi_req(req);
328     if (status != -EINPROGRESS) {
329         virtio_blk_req_complete(req, status);
330         virtio_blk_free_request(req);
331     }
332 }
333 
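/*
 * Issue one read or write covering mrb->reqs[start .. start + num_reqs - 1].
 * When several requests are merged, the first request's qiov is
 * re-initialized as a locally owned vector (its nalloc becomes != -1, which
 * virtio_blk_rw_complete uses to know it must be destroyed) and the remaining
 * requests are chained through mr_next so a single completion callback can
 * finish all of them.
 */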
334 static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
335                                    int start, int num_reqs, int niov)
336 {
337     QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
338     int64_t sector_num = mrb->reqs[start]->sector_num;
339     bool is_write = mrb->is_write;
340 
341     if (num_reqs > 1) {
342         int i;
343         struct iovec *tmp_iov = qiov->iov;
344         int tmp_niov = qiov->niov;
345 
346         /* mrb->reqs[start]->qiov was initialized from an external iovec, so we
347          * can't modify it here. Initialize a local copy and then add the
348          * external iovecs. */
349         qemu_iovec_init(qiov, niov);
350 
351         for (i = 0; i < tmp_niov; i++) {
352             qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
353         }
354 
355         for (i = start + 1; i < start + num_reqs; i++) {
356             qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
357                               mrb->reqs[i]->qiov.size);
358             mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
359         }
360 
361         trace_virtio_blk_submit_multireq(VIRTIO_DEVICE(mrb->reqs[start]->dev),
362                                          mrb, start, num_reqs,
363                                          sector_num << BDRV_SECTOR_BITS,
364                                          qiov->size, is_write);
365         block_acct_merge_done(blk_get_stats(blk),
366                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
367                               num_reqs - 1);
368     }
369 
370     if (is_write) {
371         blk_aio_pwritev(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
372                         virtio_blk_rw_complete, mrb->reqs[start]);
373     } else {
374         blk_aio_preadv(blk, sector_num << BDRV_SECTOR_BITS, qiov, 0,
375                        virtio_blk_rw_complete, mrb->reqs[start]);
376     }
377 }
378 
379 static int multireq_compare(const void *a, const void *b)
380 {
381     const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
382                          *req2 = *(VirtIOBlockReq **)b;
383 
384     /*
385      * Note that we can't simply return the difference of the two sector
386      * numbers here, as that could overflow the int return value.
387      */
388     if (req1->sector_num > req2->sector_num) {
389         return 1;
390     } else if (req1->sector_num < req2->sector_num) {
391         return -1;
392     } else {
393         return 0;
394     }
395 }
396 
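/*
 * Sort the buffered requests by sector and submit them as the smallest
 * possible number of merged I/Os: consecutive requests are coalesced until
 * they stop being sequential, would exceed the backend's maximum iovec count,
 * or would exceed its maximum transfer length.
 *
 * Worked example (hypothetical numbers): three 4 KiB writes at sectors 0, 8
 * and 16 are sequential (4 KiB == 8 sectors), so they are submitted as one
 * 12 KiB write; a fourth write at sector 100 would start a new batch.
 */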
397 static void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
398 {
399     int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
400     uint32_t max_transfer;
401     int64_t sector_num = 0;
402 
403     if (mrb->num_reqs == 1) {
404         submit_requests(blk, mrb, 0, 1, -1);
405         mrb->num_reqs = 0;
406         return;
407     }
408 
409     max_transfer = blk_get_max_transfer(mrb->reqs[0]->dev->blk);
410 
411     qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
412           &multireq_compare);
413 
414     for (i = 0; i < mrb->num_reqs; i++) {
415         VirtIOBlockReq *req = mrb->reqs[i];
416         if (num_reqs > 0) {
417             /*
418              * NOTE: We cannot merge requests in the following situations:
419              * 1. requests are not sequential
420              * 2. merge would exceed maximum number of IOVs
421              * 3. merge would exceed maximum transfer length of backend device
422              */
423             if (sector_num + nb_sectors != req->sector_num ||
424                 niov > blk_get_max_iov(blk) - req->qiov.niov ||
425                 req->qiov.size > max_transfer ||
426                 nb_sectors > (max_transfer -
427                               req->qiov.size) / BDRV_SECTOR_SIZE) {
428                 submit_requests(blk, mrb, start, num_reqs, niov);
429                 num_reqs = 0;
430             }
431         }
432 
433         if (num_reqs == 0) {
434             sector_num = req->sector_num;
435             nb_sectors = niov = 0;
436             start = i;
437         }
438 
439         nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
440         niov += req->qiov.niov;
441         num_reqs++;
442     }
443 
444     submit_requests(blk, mrb, start, num_reqs, niov);
445     mrb->num_reqs = 0;
446 }
447 
448 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
449 {
450     block_acct_start(blk_get_stats(req->dev->blk), &req->acct, 0,
451                      BLOCK_ACCT_FLUSH);
452 
453     /*
454      * Make sure any batched writes reach the backend before the flush is issued.
455      */
456     if (mrb->is_write && mrb->num_reqs > 0) {
457         virtio_blk_submit_multireq(req->dev->blk, mrb);
458     }
459     blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
460 }
461 
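/*
 * Validate a guest-requested I/O range: it must not exceed
 * BDRV_REQUEST_MAX_SECTORS, both the starting sector and the byte length must
 * be aligned to the configured logical block size, and the range must lie
 * entirely within the backend's current size.
 */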
462 static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
463                                      uint64_t sector, size_t size)
464 {
465     uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
466     uint64_t total_sectors;
467 
468     if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
469         return false;
470     }
471     if (sector & dev->sector_mask) {
472         return false;
473     }
474     if (size % dev->conf.conf.logical_block_size) {
475         return false;
476     }
477     blk_get_geometry(dev->blk, &total_sectors);
478     if (sector > total_sectors || nb_sectors > total_sectors - sector) {
479         return false;
480     }
481     return true;
482 }
483 
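/*
 * Parse and dispatch one request popped from the virtqueue.  The descriptor
 * chain is framed as (sketch):
 *
 *   out: struct virtio_blk_outhdr, then the write payload for VIRTIO_BLK_T_OUT
 *   in:  the read payload for VIRTIO_BLK_T_IN, then a 1-byte virtio_blk_inhdr
 *
 * Returns -1 after calling virtio_error() when the request is malformed and
 * the device must stop processing the queue; returns 0 otherwise, including
 * when an individual request completes with an error status.
 */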
484 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
485 {
486     uint32_t type;
487     struct iovec *in_iov = req->elem.in_sg;
488     struct iovec *out_iov = req->elem.out_sg;
489     unsigned in_num = req->elem.in_num;
490     unsigned out_num = req->elem.out_num;
491     VirtIOBlock *s = req->dev;
492     VirtIODevice *vdev = VIRTIO_DEVICE(s);
493 
494     if (req->elem.out_num < 1 || req->elem.in_num < 1) {
495         virtio_error(vdev, "virtio-blk missing headers");
496         return -1;
497     }
498 
499     if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
500                             sizeof(req->out)) != sizeof(req->out))) {
501         virtio_error(vdev, "virtio-blk request outhdr too short");
502         return -1;
503     }
504 
505     iov_discard_front(&out_iov, &out_num, sizeof(req->out));
506 
507     if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
508         virtio_error(vdev, "virtio-blk request inhdr too short");
509         return -1;
510     }
511 
512     /* We always touch the last byte, so just see how big in_iov is.  */
513     req->in_len = iov_size(in_iov, in_num);
514     req->in = (void *)in_iov[in_num - 1].iov_base
515               + in_iov[in_num - 1].iov_len
516               - sizeof(struct virtio_blk_inhdr);
517     iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
518 
519     type = virtio_ldl_p(vdev, &req->out.type);
520 
521     /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
522      * is an optional flag. Although a guest should not send this flag if it
523      * was not negotiated, we ignored it in the past, so keep ignoring it. */
524     switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
525     case VIRTIO_BLK_T_IN:
526     {
527         bool is_write = type & VIRTIO_BLK_T_OUT;
528         req->sector_num = virtio_ldq_p(vdev, &req->out.sector);
529 
530         if (is_write) {
531             qemu_iovec_init_external(&req->qiov, out_iov, out_num);
532             trace_virtio_blk_handle_write(vdev, req, req->sector_num,
533                                           req->qiov.size / BDRV_SECTOR_SIZE);
534         } else {
535             qemu_iovec_init_external(&req->qiov, in_iov, in_num);
536             trace_virtio_blk_handle_read(vdev, req, req->sector_num,
537                                          req->qiov.size / BDRV_SECTOR_SIZE);
538         }
539 
540         if (!virtio_blk_sect_range_ok(s, req->sector_num, req->qiov.size)) {
541             virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
542             block_acct_invalid(blk_get_stats(s->blk),
543                                is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
544             virtio_blk_free_request(req);
545             return 0;
546         }
547 
548         block_acct_start(blk_get_stats(s->blk), &req->acct, req->qiov.size,
549                          is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
550 
551         /* Submit the pending batch if merging would exceed the maximum number
552          * of requests, the I/O direction changes, or merging is disabled. */
553         if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
554                                   is_write != mrb->is_write ||
555                                   !s->conf.request_merging)) {
556             virtio_blk_submit_multireq(s->blk, mrb);
557         }
558 
559         assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
560         mrb->reqs[mrb->num_reqs++] = req;
561         mrb->is_write = is_write;
562         break;
563     }
564     case VIRTIO_BLK_T_FLUSH:
565         virtio_blk_handle_flush(req, mrb);
566         break;
567     case VIRTIO_BLK_T_SCSI_CMD:
568         virtio_blk_handle_scsi(req);
569         break;
570     case VIRTIO_BLK_T_GET_ID:
571     {
572         /*
573          * NB: per the existing serial number string convention, the string is
574          * terminated by '\0' only when it is shorter than the buffer.
575          */
576         const char *serial = s->conf.serial ? s->conf.serial : "";
577         size_t size = MIN(strlen(serial) + 1,
578                           MIN(iov_size(in_iov, in_num),
579                               VIRTIO_BLK_ID_BYTES));
580         iov_from_buf(in_iov, in_num, 0, serial, size);
581         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
582         virtio_blk_free_request(req);
583         break;
584     }
585     default:
586         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
587         virtio_blk_free_request(req);
588     }
589     return 0;
590 }
591 
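/*
 * Drain the virtqueue: pop requests and dispatch them, batching sequential
 * reads and writes through the MultiReqBuffer.  Guest notifications are
 * disabled while the ring is being drained and re-enabled before the final
 * emptiness check, so kicks arriving in between are not lost.  Returns true
 * if at least one request was processed.  Called from the main loop as well
 * as from the dataplane iothread.
 */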
592 bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
593 {
594     VirtIOBlockReq *req;
595     MultiReqBuffer mrb = {};
596     bool progress = false;
597 
598     aio_context_acquire(blk_get_aio_context(s->blk));
599     blk_io_plug(s->blk);
600 
601     do {
602         virtio_queue_set_notification(vq, 0);
603 
604         while ((req = virtio_blk_get_request(s, vq))) {
605             progress = true;
606             if (virtio_blk_handle_request(req, &mrb)) {
607                 virtqueue_detach_element(req->vq, &req->elem, 0);
608                 virtio_blk_free_request(req);
609                 break;
610             }
611         }
612 
613         virtio_queue_set_notification(vq, 1);
614     } while (!virtio_queue_empty(vq));
615 
616     if (mrb.num_reqs) {
617         virtio_blk_submit_multireq(s->blk, &mrb);
618     }
619 
620     blk_io_unplug(s->blk);
621     aio_context_release(blk_get_aio_context(s->blk));
622     return progress;
623 }
624 
625 static void virtio_blk_handle_output_do(VirtIOBlock *s, VirtQueue *vq)
626 {
627     virtio_blk_handle_vq(s, vq);
628 }
629 
630 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
631 {
632     VirtIOBlock *s = (VirtIOBlock *)vdev;
633 
634     if (s->dataplane) {
635         /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK, so start
636          * the dataplane here instead of waiting for .set_status().
637          */
638         virtio_device_start_ioeventfd(vdev);
639         if (!s->dataplane_disabled) {
640             return;
641         }
642     }
643     virtio_blk_handle_output_do(s, vq);
644 }
645 
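/*
 * Bottom half scheduled when the VM (re)starts running: re-submit the
 * requests parked on s->rq, either because an I/O error occurred under the
 * "stop" error policy (see virtio_blk_handle_rw_error) or because they were
 * reloaded from a migration stream, batching them through a fresh
 * MultiReqBuffer.
 */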
646 static void virtio_blk_dma_restart_bh(void *opaque)
647 {
648     VirtIOBlock *s = opaque;
649     VirtIOBlockReq *req = s->rq;
650     MultiReqBuffer mrb = {};
651 
652     qemu_bh_delete(s->bh);
653     s->bh = NULL;
654 
655     s->rq = NULL;
656 
657     aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
658     while (req) {
659         VirtIOBlockReq *next = req->next;
660         if (virtio_blk_handle_request(req, &mrb)) {
661             /* Device is now broken and won't do any processing until it gets
662              * reset. Already queued requests will be lost: let's purge them.
663              */
664             while (req) {
665                 next = req->next;
666                 virtqueue_detach_element(req->vq, &req->elem, 0);
667                 virtio_blk_free_request(req);
668                 req = next;
669             }
670             break;
671         }
672         req = next;
673     }
674 
675     if (mrb.num_reqs) {
676         virtio_blk_submit_multireq(s->blk, &mrb);
677     }
678     aio_context_release(blk_get_aio_context(s->conf.conf.blk));
679 }
680 
681 static void virtio_blk_dma_restart_cb(void *opaque, int running,
682                                       RunState state)
683 {
684     VirtIOBlock *s = opaque;
685 
686     if (!running) {
687         return;
688     }
689 
690     if (!s->bh) {
691         s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk),
692                            virtio_blk_dma_restart_bh, s);
693         qemu_bh_schedule(s->bh);
694     }
695 }
696 
697 static void virtio_blk_reset(VirtIODevice *vdev)
698 {
699     VirtIOBlock *s = VIRTIO_BLK(vdev);
700     AioContext *ctx;
701     VirtIOBlockReq *req;
702 
703     ctx = blk_get_aio_context(s->blk);
704     aio_context_acquire(ctx);
705     blk_drain(s->blk);
706 
707     /* We drop queued requests after blk_drain() because blk_drain() itself can
708      * produce them. */
709     while (s->rq) {
710         req = s->rq;
711         s->rq = req->next;
712         virtqueue_detach_element(req->vq, &req->elem, 0);
713         virtio_blk_free_request(req);
714     }
715 
716     aio_context_release(ctx);
717 
718     assert(!s->dataplane_started);
719     blk_set_enable_write_cache(s->blk, s->original_wce);
720 }
721 
722 /* Coalesce internal state and copy it into the virtio config space
723  * (legacy PCI I/O region 0). */
724 static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
725 {
726     VirtIOBlock *s = VIRTIO_BLK(vdev);
727     BlockConf *conf = &s->conf.conf;
728     struct virtio_blk_config blkcfg;
729     uint64_t capacity;
730     int64_t length;
731     int blk_size = conf->logical_block_size;
732 
733     blk_get_geometry(s->blk, &capacity);
734     memset(&blkcfg, 0, sizeof(blkcfg));
735     virtio_stq_p(vdev, &blkcfg.capacity, capacity);
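    /* seg_max is conventionally the virtqueue size minus 2, leaving room for
     * the request's out-header and in-header descriptors; note the historical
     * default of 128 is hard-coded here rather than derived from the
     * queue-size property. */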
736     virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
737     virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
738     virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
739     virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
740     virtio_stw_p(vdev, &blkcfg.opt_io_size, conf->opt_io_size / blk_size);
741     blkcfg.geometry.heads = conf->heads;
742     /*
743      * We must ensure that the block device capacity is a multiple of
744      * the logical block size. If that is not the case, let's use
745      * sector_mask to adapt the geometry so that it presents a correct picture.
746      * For those devices where the capacity is OK for the given geometry,
747      * we don't touch the sectors value of the geometry, since some devices
748      * (like s390 DASD) need a specific value. Here the capacity is already
749      * cyls*heads*secs*blk_size and the sectors value is not the block size
750      * divided by 512 - instead it is the number of blk_size blocks
751      * per track (cylinder).
752      */
753     length = blk_getlength(s->blk);
754     if (length > 0 && length / conf->heads / conf->secs % blk_size) {
755         blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
756     } else {
757         blkcfg.geometry.sectors = conf->secs;
758     }
759     blkcfg.size_max = 0;
760     blkcfg.physical_block_exp = get_physical_block_exp(conf);
761     blkcfg.alignment_offset = 0;
762     blkcfg.wce = blk_enable_write_cache(s->blk);
763     virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
764     memcpy(config, &blkcfg, VIRTIO_BLK_CFG_SIZE);
765     QEMU_BUILD_BUG_ON(VIRTIO_BLK_CFG_SIZE > sizeof(blkcfg));
766 }
767 
768 static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
769 {
770     VirtIOBlock *s = VIRTIO_BLK(vdev);
771     struct virtio_blk_config blkcfg;
772 
773     memcpy(&blkcfg, config, VIRTIO_BLK_CFG_SIZE);
774     QEMU_BUILD_BUG_ON(VIRTIO_BLK_CFG_SIZE > sizeof(blkcfg));
775 
776     aio_context_acquire(blk_get_aio_context(s->blk));
777     blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
778     aio_context_release(blk_get_aio_context(s->blk));
779 }
780 
781 static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
782                                         Error **errp)
783 {
784     VirtIOBlock *s = VIRTIO_BLK(vdev);
785 
786     virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
787     virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
788     virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
789     virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
790     if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
791         if (s->conf.scsi) {
792             error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
793             return 0;
794         }
795     } else {
796         virtio_clear_feature(&features, VIRTIO_F_ANY_LAYOUT);
797         virtio_add_feature(&features, VIRTIO_BLK_F_SCSI);
798     }
799 
800     if (s->conf.config_wce) {
801         virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
802     }
803     if (blk_enable_write_cache(s->blk)) {
804         virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
805     }
806     if (blk_is_read_only(s->blk)) {
807         virtio_add_feature(&features, VIRTIO_BLK_F_RO);
808     }
809     if (s->conf.num_queues > 1) {
810         virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
811     }
812 
813     return features;
814 }
815 
816 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
817 {
818     VirtIOBlock *s = VIRTIO_BLK(vdev);
819 
820     if (!(status & (VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK))) {
821         assert(!s->dataplane_started);
822     }
823 
824     if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
825         return;
826     }
827 
828     /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
829      * cache flushes.  Thus, the "auto writethrough" behavior is never
830      * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
831      * Leaving it enabled would break the following sequence:
832      *
833      *     Guest started with "-drive cache=writethrough"
834      *     Guest sets status to 0
835      *     Guest sets DRIVER bit in status field
836      *     Guest reads host features (WCE=0, CONFIG_WCE=1)
837      *     Guest writes guest features (WCE=0, CONFIG_WCE=1)
838      *     Guest writes 1 to the WCE configuration field (writeback mode)
839      *     Guest sets DRIVER_OK bit in status field
840      *
841      * s->blk would erroneously be placed in writethrough mode.
842      */
843     if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
844         aio_context_acquire(blk_get_aio_context(s->blk));
845         blk_set_enable_write_cache(s->blk,
846                                    virtio_vdev_has_feature(vdev,
847                                                            VIRTIO_BLK_F_WCE));
848         aio_context_release(blk_get_aio_context(s->blk));
849     }
850 }
851 
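/*
 * Migration format for the in-flight request list (s->rq): a sequence of
 * records, each introduced by an sbyte 1 marker, optionally followed by the
 * virtqueue index when num-queues > 1, then the serialized VirtQueueElement;
 * the list is terminated by an sbyte 0.  virtio_blk_load_device() rebuilds
 * s->rq from the same stream.
 */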
852 static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
853 {
854     VirtIOBlock *s = VIRTIO_BLK(vdev);
855     VirtIOBlockReq *req = s->rq;
856 
857     while (req) {
858         qemu_put_sbyte(f, 1);
859 
860         if (s->conf.num_queues > 1) {
861             qemu_put_be32(f, virtio_get_queue_index(req->vq));
862         }
863 
864         qemu_put_virtqueue_element(f, &req->elem);
865         req = req->next;
866     }
867     qemu_put_sbyte(f, 0);
868 }
869 
870 static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
871                                   int version_id)
872 {
873     VirtIOBlock *s = VIRTIO_BLK(vdev);
874 
875     while (qemu_get_sbyte(f)) {
876         unsigned nvqs = s->conf.num_queues;
877         unsigned vq_idx = 0;
878         VirtIOBlockReq *req;
879 
880         if (nvqs > 1) {
881             vq_idx = qemu_get_be32(f);
882 
883             if (vq_idx >= nvqs) {
884                 error_report("Invalid virtqueue index in request list: %#x",
885                              vq_idx);
886                 return -EINVAL;
887             }
888         }
889 
890         req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq));
891         virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
892         req->next = s->rq;
893         s->rq = req;
894     }
895 
896     return 0;
897 }
898 
899 static void virtio_blk_resize(void *opaque)
900 {
901     VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
902 
903     virtio_notify_config(vdev);
904 }
905 
906 static const BlockDevOps virtio_block_ops = {
907     .resize_cb = virtio_blk_resize,
908 };
909 
910 static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
911 {
912     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
913     VirtIOBlock *s = VIRTIO_BLK(dev);
914     VirtIOBlkConf *conf = &s->conf;
915     Error *err = NULL;
916     unsigned i;
917 
918     if (!conf->conf.blk) {
919         error_setg(errp, "drive property not set");
920         return;
921     }
922     if (!blk_is_inserted(conf->conf.blk)) {
923         error_setg(errp, "Device needs media, but drive is empty");
924         return;
925     }
926     if (!conf->num_queues) {
927         error_setg(errp, "num-queues property must be larger than 0");
928         return;
929     }
930     if (!is_power_of_2(conf->queue_size) ||
931         conf->queue_size > VIRTQUEUE_MAX_SIZE) {
932         error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
933                    "must be a power of 2 (max %d)",
934                    conf->queue_size, VIRTQUEUE_MAX_SIZE);
935         return;
936     }
937 
938     if (!blkconf_apply_backend_options(&conf->conf,
939                                        blk_is_read_only(conf->conf.blk), true,
940                                        errp)) {
941         return;
942     }
943     s->original_wce = blk_enable_write_cache(conf->conf.blk);
944     if (!blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, errp)) {
945         return;
946     }
947 
948     blkconf_blocksizes(&conf->conf);
949 
950     if (conf->conf.logical_block_size >
951         conf->conf.physical_block_size) {
952         error_setg(errp,
953                    "logical_block_size > physical_block_size not supported");
954         return;
955     }
956 
957     virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, VIRTIO_BLK_CFG_SIZE);
958 
959     s->blk = conf->conf.blk;
960     s->rq = NULL;
961     s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
962 
963     for (i = 0; i < conf->num_queues; i++) {
964         virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
965     }
966     virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
967     if (err != NULL) {
968         error_propagate(errp, err);
969         virtio_cleanup(vdev);
970         return;
971     }
972 
973     s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
974     blk_set_dev_ops(s->blk, &virtio_block_ops, s);
975     blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);
976 
977     blk_iostatus_enable(s->blk);
978 }
979 
980 static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
981 {
982     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
983     VirtIOBlock *s = VIRTIO_BLK(dev);
984 
985     virtio_blk_data_plane_destroy(s->dataplane);
986     s->dataplane = NULL;
987     qemu_del_vm_change_state_handler(s->change);
988     blockdev_mark_auto_del(s->blk);
989     virtio_cleanup(vdev);
990 }
991 
992 static void virtio_blk_instance_init(Object *obj)
993 {
994     VirtIOBlock *s = VIRTIO_BLK(obj);
995 
996     device_add_bootindex_property(obj, &s->conf.conf.bootindex,
997                                   "bootindex", "/disk@0,0",
998                                   DEVICE(obj), NULL);
999 }
1000 
1001 static const VMStateDescription vmstate_virtio_blk = {
1002     .name = "virtio-blk",
1003     .minimum_version_id = 2,
1004     .version_id = 2,
1005     .fields = (VMStateField[]) {
1006         VMSTATE_VIRTIO_DEVICE,
1007         VMSTATE_END_OF_LIST()
1008     },
1009 };
1010 
1011 static Property virtio_blk_properties[] = {
1012     DEFINE_BLOCK_PROPERTIES(VirtIOBlock, conf.conf),
1013     DEFINE_BLOCK_ERROR_PROPERTIES(VirtIOBlock, conf.conf),
1014     DEFINE_BLOCK_CHS_PROPERTIES(VirtIOBlock, conf.conf),
1015     DEFINE_PROP_STRING("serial", VirtIOBlock, conf.serial),
1016     DEFINE_PROP_BIT("config-wce", VirtIOBlock, conf.config_wce, 0, true),
1017 #ifdef __linux__
1018     DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, false),
1019 #endif
1020     DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
1021                     true),
1022     DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
1023     DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
1024     DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
1025                      IOThread *),
1026     DEFINE_PROP_END_OF_LIST(),
1027 };
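/*
 * Example mapping of these properties onto a command line (hypothetical file
 * names and values, shown for illustration only; virtio-blk-pci wraps this
 * TYPE_VIRTIO_BLK device):
 *
 *   qemu-system-x86_64 \
 *     -object iothread,id=iothread0 \
 *     -drive if=none,id=drive0,file=disk.img,format=raw \
 *     -device virtio-blk-pci,drive=drive0,num-queues=4,queue-size=256,iothread=iothread0,serial=DISK0001
 */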
1028 
1029 static void virtio_blk_class_init(ObjectClass *klass, void *data)
1030 {
1031     DeviceClass *dc = DEVICE_CLASS(klass);
1032     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
1033 
1034     dc->props = virtio_blk_properties;
1035     dc->vmsd = &vmstate_virtio_blk;
1036     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
1037     vdc->realize = virtio_blk_device_realize;
1038     vdc->unrealize = virtio_blk_device_unrealize;
1039     vdc->get_config = virtio_blk_update_config;
1040     vdc->set_config = virtio_blk_set_config;
1041     vdc->get_features = virtio_blk_get_features;
1042     vdc->set_status = virtio_blk_set_status;
1043     vdc->reset = virtio_blk_reset;
1044     vdc->save = virtio_blk_save_device;
1045     vdc->load = virtio_blk_load_device;
1046     vdc->start_ioeventfd = virtio_blk_data_plane_start;
1047     vdc->stop_ioeventfd = virtio_blk_data_plane_stop;
1048 }
1049 
1050 static const TypeInfo virtio_blk_info = {
1051     .name = TYPE_VIRTIO_BLK,
1052     .parent = TYPE_VIRTIO_DEVICE,
1053     .instance_size = sizeof(VirtIOBlock),
1054     .instance_init = virtio_blk_instance_init,
1055     .class_init = virtio_blk_class_init,
1056 };
1057 
1058 static void virtio_register_types(void)
1059 {
1060     type_register_static(&virtio_blk_info);
1061 }
1062 
1063 type_init(virtio_register_types)
1064