xref: /openbmc/qemu/block/vpc.c (revision a6caeee8)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "block/qdict.h"
30 #include "sysemu/block-backend.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "migration/blocker.h"
34 #include "qemu/bswap.h"
35 #include "qemu/uuid.h"
36 #include "qemu/memalign.h"
37 #include "qapi/qmp/qdict.h"
38 #include "qapi/qobject-input-visitor.h"
39 #include "qapi/qapi-visit-block-core.h"
40 
41 /**************************************************************/
42 
43 //#define CACHE
44 
45 enum vhd_type {
46     VHD_FIXED           = 2,
47     VHD_DYNAMIC         = 3,
48     VHD_DIFFERENCING    = 4,
49 };
50 
51 /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
52 #define VHD_TIMESTAMP_BASE 946684800
53 
54 #define VHD_CHS_MAX_C   65535LL
55 #define VHD_CHS_MAX_H   16
56 #define VHD_CHS_MAX_S   255
57 
58 #define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
59 #define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)
60 
61 #define VPC_OPT_FORCE_SIZE "force_size"
62 
63 /* always big-endian */
64 typedef struct vhd_footer {
65     char        creator[8]; /* "conectix" */
66     uint32_t    features;
67     uint32_t    version;
68 
69     /* Offset of next header structure, 0xFFFFFFFF if none */
70     uint64_t    data_offset;
71 
72     /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
73     uint32_t    timestamp;
74 
75     char        creator_app[4]; /*  e.g., "vpc " */
76     uint16_t    major;
77     uint16_t    minor;
78     char        creator_os[4]; /* "Wi2k" */
79 
80     uint64_t    orig_size;
81     uint64_t    current_size;
82 
83     uint16_t    cyls;
84     uint8_t     heads;
85     uint8_t     secs_per_cyl;
86 
87     uint32_t    type;
88 
89     /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
90        the bytes in the footer without the checksum field") */
91     uint32_t    checksum;
92 
93     /* UUID used to identify a parent hard disk (backing file) */
94     QemuUUID    uuid;
95 
96     uint8_t     in_saved_state;
97     uint8_t     reserved[427];
98 } QEMU_PACKED VHDFooter;
99 
100 QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
101 
102 typedef struct vhd_dyndisk_header {
103     char        magic[8]; /* "cxsparse" */
104 
105     /* Offset of next header structure, 0xFFFFFFFF if none */
106     uint64_t    data_offset;
107 
108     /* Offset of the Block Allocation Table (BAT) */
109     uint64_t    table_offset;
110 
111     uint32_t    version;
112     uint32_t    max_table_entries; /* 32bit/entry */
113 
114     /* 2 MB by default, must be a power of two */
115     uint32_t    block_size;
116 
117     uint32_t    checksum;
118     uint8_t     parent_uuid[16];
119     uint32_t    parent_timestamp;
120     uint32_t    reserved;
121 
122     /* Backing file name (in UTF-16) */
123     uint8_t     parent_name[512];
124 
125     struct {
126         uint32_t    platform;
127         uint32_t    data_space;
128         uint32_t    data_length;
129         uint32_t    reserved;
130         uint64_t    data_offset;
131     } parent_locator[8];
132     uint8_t     reserved2[256];
133 } QEMU_PACKED VHDDynDiskHeader;
134 
135 QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
136 
137 typedef struct BDRVVPCState {
138     CoMutex lock;
139     VHDFooter footer;
140     uint64_t free_data_block_offset;
141     int max_table_entries;
142     uint32_t *pagetable;
143     uint64_t bat_offset;
144     uint64_t last_bitmap_offset;
145 
146     uint32_t block_size;
147     uint32_t bitmap_size;
148     bool force_use_chs;
149     bool force_use_sz;
150 
151 #ifdef CACHE
152     uint8_t *pageentry_u8;
153     uint32_t *pageentry_u32;
154     uint16_t *pageentry_u16;
155 
156     uint64_t last_bitmap;
157 #endif
158 
159     Error *migration_blocker;
160 } BDRVVPCState;
161 
162 #define VPC_OPT_SIZE_CALC "force_size_calc"
163 static QemuOptsList vpc_runtime_opts = {
164     .name = "vpc-runtime-opts",
165     .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
166     .desc = {
167         {
168             .name = VPC_OPT_SIZE_CALC,
169             .type = QEMU_OPT_STRING,
170             .help = "Force disk size calculation to use either CHS geometry, "
171                     "or use the disk current_size specified in the VHD footer. "
172                     "{chs, current_size}"
173         },
174         { /* end of list */ }
175     }
176 };
177 
178 static QemuOptsList vpc_create_opts;
179 
180 static uint32_t vpc_checksum(void *p, size_t size)
181 {
182     uint8_t *buf = p;
183     uint32_t res = 0;
184     int i;
185 
186     for (i = 0; i < size; i++)
187         res += buf[i];
188 
189     return ~res;
190 }
191 
192 
193 static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
194 {
195     if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
196         return 100;
197     return 0;
198 }
199 
200 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
201                               Error **errp)
202 {
203     BDRVVPCState *s = bs->opaque;
204     const char *size_calc;
205 
206     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
207 
208     if (!size_calc) {
209        /* no override, use autodetect only */
210     } else if (!strcmp(size_calc, "current_size")) {
211         s->force_use_sz = true;
212     } else if (!strcmp(size_calc, "chs")) {
213         s->force_use_chs = true;
214     } else {
215         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
216     }
217 }
218 
219 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
220                     Error **errp)
221 {
222     BDRVVPCState *s = bs->opaque;
223     int i;
224     VHDFooter *footer;
225     QemuOpts *opts = NULL;
226     Error *local_err = NULL;
227     bool use_chs;
228     VHDDynDiskHeader dyndisk_header;
229     uint32_t checksum;
230     uint64_t computed_size;
231     uint64_t pagetable_size;
232     int disk_type = VHD_DYNAMIC;
233     int ret;
234     int64_t bs_size;
235 
236     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
237                                BDRV_CHILD_IMAGE, false, errp);
238     if (!bs->file) {
239         return -EINVAL;
240     }
241 
242     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
243     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
244         ret = -EINVAL;
245         goto fail;
246     }
247 
248     vpc_parse_options(bs, opts, &local_err);
249     if (local_err) {
250         error_propagate(errp, local_err);
251         ret = -EINVAL;
252         goto fail;
253     }
254 
255     ret = bdrv_pread(bs->file, 0, &s->footer, sizeof(s->footer));
256     if (ret < 0) {
257         error_setg(errp, "Unable to read VHD header");
258         goto fail;
259     }
260 
261     footer = &s->footer;
262     if (strncmp(footer->creator, "conectix", 8)) {
263         int64_t offset = bdrv_getlength(bs->file->bs);
264         if (offset < 0) {
265             ret = offset;
266             error_setg(errp, "Invalid file size");
267             goto fail;
268         } else if (offset < sizeof(*footer)) {
269             ret = -EINVAL;
270             error_setg(errp, "File too small for a VHD header");
271             goto fail;
272         }
273 
274         /* If a fixed disk, the footer is found only at the end of the file */
275         ret = bdrv_pread(bs->file, offset - sizeof(*footer),
276                          footer, sizeof(*footer));
277         if (ret < 0) {
278             goto fail;
279         }
280         if (strncmp(footer->creator, "conectix", 8) ||
281             be32_to_cpu(footer->type) != VHD_FIXED) {
282             error_setg(errp, "invalid VPC image");
283             ret = -EINVAL;
284             goto fail;
285         }
286         disk_type = VHD_FIXED;
287     }
288 
289     checksum = be32_to_cpu(footer->checksum);
290     footer->checksum = 0;
291     if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
292         error_setg(errp, "Incorrect header checksum");
293         ret = -EINVAL;
294         goto fail;
295     }
296 
297     /* Write 'checksum' back to footer, or else will leave it with zero. */
298     footer->checksum = cpu_to_be32(checksum);
299 
300     /* The visible size of a image in Virtual PC depends on the geometry
301        rather than on the size stored in the footer (the size in the footer
302        is too large usually) */
303     bs->total_sectors = (int64_t)
304         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
305 
306     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
307      * VHD image sizes differently.  VPC will rely on CHS geometry,
308      * while Hyper-V and disk2vhd use the size specified in the footer.
309      *
310      * We use a couple of approaches to try and determine the correct method:
311      * look at the Creator App field, and look for images that have CHS
312      * geometry that is the maximum value.
313      *
314      * If the CHS geometry is the maximum CHS geometry, then we assume that
315      * the size is the footer->current_size to avoid truncation.  Otherwise,
316      * we follow the table based on footer->creator_app:
317      *
318      *  Known creator apps:
319      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
320      *      'qemu'  :  CHS              QEMU (uses disk geometry)
321      *      'qem2'  :  current_size     QEMU (uses current_size)
322      *      'win '  :  current_size     Hyper-V
323      *      'd2v '  :  current_size     Disk2vhd
324      *      'tap\0' :  current_size     XenServer
325      *      'CTXS'  :  current_size     XenConverter
326      *
327      *  The user can override the table values via drive options, however
328      *  even with an override we will still use current_size for images
329      *  that have CHS geometry of the maximum size.
330      */
331     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
332                !!strncmp(footer->creator_app, "qem2", 4) &&
333                !!strncmp(footer->creator_app, "d2v ", 4) &&
334                !!strncmp(footer->creator_app, "CTXS", 4) &&
335                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
336 
337     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
338         bs->total_sectors = be64_to_cpu(footer->current_size) /
339                                         BDRV_SECTOR_SIZE;
340     }
341 
342     /* Allow a maximum disk size of 2040 GiB */
343     if (bs->total_sectors > VHD_MAX_SECTORS) {
344         ret = -EFBIG;
345         goto fail;
346     }
347 
348     if (disk_type == VHD_DYNAMIC) {
349         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
350                          &dyndisk_header, sizeof(dyndisk_header));
351         if (ret < 0) {
352             error_setg(errp, "Error reading dynamic VHD header");
353             goto fail;
354         }
355 
356         if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
357             error_setg(errp, "Invalid header magic");
358             ret = -EINVAL;
359             goto fail;
360         }
361 
362         s->block_size = be32_to_cpu(dyndisk_header.block_size);
363         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
364             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
365             ret = -EINVAL;
366             goto fail;
367         }
368         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
369 
370         s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
371 
372         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
373             error_setg(errp, "Too many blocks");
374             ret = -EINVAL;
375             goto fail;
376         }
377 
378         computed_size = (uint64_t) s->max_table_entries * s->block_size;
379         if (computed_size < bs->total_sectors * 512) {
380             error_setg(errp, "Page table too small");
381             ret = -EINVAL;
382             goto fail;
383         }
384 
385         if (s->max_table_entries > SIZE_MAX / 4 ||
386             s->max_table_entries > (int) INT_MAX / 4) {
387             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
388                         s->max_table_entries);
389             ret = -EINVAL;
390             goto fail;
391         }
392 
393         pagetable_size = (uint64_t) s->max_table_entries * 4;
394 
395         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
396         if (s->pagetable == NULL) {
397             error_setg(errp, "Unable to allocate memory for page table");
398             ret = -ENOMEM;
399             goto fail;
400         }
401 
402         s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
403 
404         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
405                          pagetable_size);
406         if (ret < 0) {
407             error_setg(errp, "Error reading pagetable");
408             goto fail;
409         }
410 
411         s->free_data_block_offset =
412             ROUND_UP(s->bat_offset + pagetable_size, 512);
413 
414         for (i = 0; i < s->max_table_entries; i++) {
415             be32_to_cpus(&s->pagetable[i]);
416             if (s->pagetable[i] != 0xFFFFFFFF) {
417                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
418                     s->bitmap_size + s->block_size;
419 
420                 if (next > s->free_data_block_offset) {
421                     s->free_data_block_offset = next;
422                 }
423             }
424         }
425 
426         bs_size = bdrv_getlength(bs->file->bs);
427         if (bs_size < 0) {
428             error_setg_errno(errp, -bs_size, "Unable to learn image size");
429             ret = bs_size;
430             goto fail;
431         }
432         if (s->free_data_block_offset > bs_size) {
433             error_setg(errp, "block-vpc: free_data_block_offset points after "
434                              "the end of file. The image has been truncated.");
435             ret = -EINVAL;
436             goto fail;
437         }
438 
439         s->last_bitmap_offset = (int64_t) -1;
440 
441 #ifdef CACHE
442         s->pageentry_u8 = g_malloc(512);
443         s->pageentry_u32 = s->pageentry_u8;
444         s->pageentry_u16 = s->pageentry_u8;
445         s->last_pagetable = -1;
446 #endif
447     }
448 
449     /* Disable migration when VHD images are used */
450     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
451                "does not support live migration",
452                bdrv_get_device_or_node_name(bs));
453     ret = migrate_add_blocker(s->migration_blocker, errp);
454     if (ret < 0) {
455         error_free(s->migration_blocker);
456         goto fail;
457     }
458 
459     qemu_co_mutex_init(&s->lock);
460     qemu_opts_del(opts);
461 
462     return 0;
463 
464 fail:
465     qemu_opts_del(opts);
466     qemu_vfree(s->pagetable);
467 #ifdef CACHE
468     g_free(s->pageentry_u8);
469 #endif
470     return ret;
471 }
472 
473 static int vpc_reopen_prepare(BDRVReopenState *state,
474                               BlockReopenQueue *queue, Error **errp)
475 {
476     return 0;
477 }
478 
479 /*
480  * Returns the absolute byte offset of the given sector in the image file.
481  * If the sector is not allocated, -1 is returned instead.
482  * If an error occurred trying to write an updated block bitmap back to
483  * the file, -2 is returned, and the error value is written to *err.
484  * This can only happen for a write operation.
485  *
486  * The parameter write must be 1 if the offset will be used for a write
487  * operation (the block bitmaps is updated then), 0 otherwise.
488  * If write is true then err must not be NULL.
489  */
490 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
491                                        bool write, int *err)
492 {
493     BDRVVPCState *s = bs->opaque;
494     uint64_t bitmap_offset, block_offset;
495     uint32_t pagetable_index, offset_in_block;
496 
497     assert(!(write && err == NULL));
498 
499     pagetable_index = offset / s->block_size;
500     offset_in_block = offset % s->block_size;
501 
502     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
503         return -1; /* not allocated */
504 
505     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
506     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
507 
508     /* We must ensure that we don't write to any sectors which are marked as
509        unused in the bitmap. We get away with setting all bits in the block
510        bitmap each time we write to a new block. This might cause Virtual PC to
511        miss sparse read optimization, but it's not a problem in terms of
512        correctness. */
513     if (write && (s->last_bitmap_offset != bitmap_offset)) {
514         uint8_t bitmap[s->bitmap_size];
515         int r;
516 
517         s->last_bitmap_offset = bitmap_offset;
518         memset(bitmap, 0xff, s->bitmap_size);
519         r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
520         if (r < 0) {
521             *err = r;
522             return -2;
523         }
524     }
525 
526     return block_offset;
527 }
528 
529 /*
530  * Writes the footer to the end of the image file. This is needed when the
531  * file grows as it overwrites the old footer
532  *
533  * Returns 0 on success and < 0 on error
534  */
535 static int rewrite_footer(BlockDriverState *bs)
536 {
537     int ret;
538     BDRVVPCState *s = bs->opaque;
539     int64_t offset = s->free_data_block_offset;
540 
541     ret = bdrv_pwrite_sync(bs->file, offset, &s->footer, sizeof(s->footer));
542     if (ret < 0)
543         return ret;
544 
545     return 0;
546 }
547 
548 /*
549  * Allocates a new block. This involves writing a new footer and updating
550  * the Block Allocation Table to use the space at the old end of the image
551  * file (overwriting the old footer)
552  *
553  * Returns the sectors' offset in the image file on success and < 0 on error
554  */
555 static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
556 {
557     BDRVVPCState *s = bs->opaque;
558     int64_t bat_offset;
559     uint32_t index, bat_value;
560     int ret;
561     uint8_t bitmap[s->bitmap_size];
562 
563     /* Check if sector_num is valid */
564     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
565         return -EINVAL;
566     }
567 
568     /* Write entry into in-memory BAT */
569     index = offset / s->block_size;
570     assert(s->pagetable[index] == 0xFFFFFFFF);
571     s->pagetable[index] = s->free_data_block_offset / 512;
572 
573     /* Initialize the block's bitmap */
574     memset(bitmap, 0xff, s->bitmap_size);
575     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
576         s->bitmap_size);
577     if (ret < 0) {
578         return ret;
579     }
580 
581     /* Write new footer (the old one will be overwritten) */
582     s->free_data_block_offset += s->block_size + s->bitmap_size;
583     ret = rewrite_footer(bs);
584     if (ret < 0)
585         goto fail;
586 
587     /* Write BAT entry to disk */
588     bat_offset = s->bat_offset + (4 * index);
589     bat_value = cpu_to_be32(s->pagetable[index]);
590     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
591     if (ret < 0)
592         goto fail;
593 
594     return get_image_offset(bs, offset, false, NULL);
595 
596 fail:
597     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
598     return ret;
599 }
600 
601 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
602 {
603     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
604 
605     if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
606         bdi->cluster_size = s->block_size;
607     }
608 
609     return 0;
610 }
611 
612 static int coroutine_fn
613 vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
614               QEMUIOVector *qiov, BdrvRequestFlags flags)
615 {
616     BDRVVPCState *s = bs->opaque;
617     int ret;
618     int64_t image_offset;
619     int64_t n_bytes;
620     int64_t bytes_done = 0;
621     QEMUIOVector local_qiov;
622 
623     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
624         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
625     }
626 
627     qemu_co_mutex_lock(&s->lock);
628     qemu_iovec_init(&local_qiov, qiov->niov);
629 
630     while (bytes > 0) {
631         image_offset = get_image_offset(bs, offset, false, NULL);
632         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
633 
634         if (image_offset == -1) {
635             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
636         } else {
637             qemu_iovec_reset(&local_qiov);
638             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
639 
640             qemu_co_mutex_unlock(&s->lock);
641             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
642                                  &local_qiov, 0);
643             qemu_co_mutex_lock(&s->lock);
644             if (ret < 0) {
645                 goto fail;
646             }
647         }
648 
649         bytes -= n_bytes;
650         offset += n_bytes;
651         bytes_done += n_bytes;
652     }
653 
654     ret = 0;
655 fail:
656     qemu_iovec_destroy(&local_qiov);
657     qemu_co_mutex_unlock(&s->lock);
658 
659     return ret;
660 }
661 
662 static int coroutine_fn
663 vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
664                QEMUIOVector *qiov, BdrvRequestFlags flags)
665 {
666     BDRVVPCState *s = bs->opaque;
667     int64_t image_offset;
668     int64_t n_bytes;
669     int64_t bytes_done = 0;
670     int ret = 0;
671     QEMUIOVector local_qiov;
672 
673     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
674         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
675     }
676 
677     qemu_co_mutex_lock(&s->lock);
678     qemu_iovec_init(&local_qiov, qiov->niov);
679 
680     while (bytes > 0) {
681         image_offset = get_image_offset(bs, offset, true, &ret);
682         if (image_offset == -2) {
683             /* Failed to write block bitmap: can't proceed with write */
684             goto fail;
685         }
686         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
687 
688         if (image_offset == -1) {
689             image_offset = alloc_block(bs, offset);
690             if (image_offset < 0) {
691                 ret = image_offset;
692                 goto fail;
693             }
694         }
695 
696         qemu_iovec_reset(&local_qiov);
697         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
698 
699         qemu_co_mutex_unlock(&s->lock);
700         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
701                               &local_qiov, 0);
702         qemu_co_mutex_lock(&s->lock);
703         if (ret < 0) {
704             goto fail;
705         }
706 
707         bytes -= n_bytes;
708         offset += n_bytes;
709         bytes_done += n_bytes;
710     }
711 
712     ret = 0;
713 fail:
714     qemu_iovec_destroy(&local_qiov);
715     qemu_co_mutex_unlock(&s->lock);
716 
717     return ret;
718 }
719 
720 static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
721                                             bool want_zero,
722                                             int64_t offset, int64_t bytes,
723                                             int64_t *pnum, int64_t *map,
724                                             BlockDriverState **file)
725 {
726     BDRVVPCState *s = bs->opaque;
727     int64_t image_offset;
728     bool allocated;
729     int ret;
730     int64_t n;
731 
732     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
733         *pnum = bytes;
734         *map = offset;
735         *file = bs->file->bs;
736         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
737     }
738 
739     qemu_co_mutex_lock(&s->lock);
740 
741     image_offset = get_image_offset(bs, offset, false, NULL);
742     allocated = (image_offset != -1);
743     *pnum = 0;
744     ret = BDRV_BLOCK_ZERO;
745 
746     do {
747         /* All sectors in a block are contiguous (without using the bitmap) */
748         n = ROUND_UP(offset + 1, s->block_size) - offset;
749         n = MIN(n, bytes);
750 
751         *pnum += n;
752         offset += n;
753         bytes -= n;
754         /* *pnum can't be greater than one block for allocated
755          * sectors since there is always a bitmap in between. */
756         if (allocated) {
757             *file = bs->file->bs;
758             *map = image_offset;
759             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
760             break;
761         }
762         if (bytes == 0) {
763             break;
764         }
765         image_offset = get_image_offset(bs, offset, false, NULL);
766     } while (image_offset == -1);
767 
768     qemu_co_mutex_unlock(&s->lock);
769     return ret;
770 }
771 
772 /*
773  * Calculates the number of cylinders, heads and sectors per cylinder
774  * based on a given number of sectors. This is the algorithm described
775  * in the VHD specification.
776  *
777  * Note that the geometry doesn't always exactly match total_sectors but
778  * may round it down.
779  *
780  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
781  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
782  * and instead allow up to 255 heads.
783  */
784 static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
785     uint8_t *heads, uint8_t *secs_per_cyl)
786 {
787     uint32_t cyls_times_heads;
788 
789     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
790 
791     if (total_sectors >= 65535LL * 16 * 63) {
792         *secs_per_cyl = 255;
793         *heads = 16;
794         cyls_times_heads = total_sectors / *secs_per_cyl;
795     } else {
796         *secs_per_cyl = 17;
797         cyls_times_heads = total_sectors / *secs_per_cyl;
798         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
799 
800         if (*heads < 4) {
801             *heads = 4;
802         }
803 
804         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
805             *secs_per_cyl = 31;
806             *heads = 16;
807             cyls_times_heads = total_sectors / *secs_per_cyl;
808         }
809 
810         if (cyls_times_heads >= (*heads * 1024)) {
811             *secs_per_cyl = 63;
812             *heads = 16;
813             cyls_times_heads = total_sectors / *secs_per_cyl;
814         }
815     }
816 
817     *cyls = cyls_times_heads / *heads;
818 
819     return 0;
820 }
821 
822 static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
823                                int64_t total_sectors)
824 {
825     VHDDynDiskHeader dyndisk_header;
826     uint8_t bat_sector[512];
827     size_t block_size, num_bat_entries;
828     int i;
829     int ret;
830     int64_t offset = 0;
831 
832     /* Write the footer (twice: at the beginning and at the end) */
833     block_size = 0x200000;
834     num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
835 
836     ret = blk_pwrite(blk, offset, footer, sizeof(*footer), 0);
837     if (ret < 0) {
838         goto fail;
839     }
840 
841     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
842     ret = blk_pwrite(blk, offset, footer, sizeof(*footer), 0);
843     if (ret < 0) {
844         goto fail;
845     }
846 
847     /* Write the initial BAT */
848     offset = 3 * 512;
849 
850     memset(bat_sector, 0xFF, 512);
851     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
852         ret = blk_pwrite(blk, offset, bat_sector, 512, 0);
853         if (ret < 0) {
854             goto fail;
855         }
856         offset += 512;
857     }
858 
859     /* Prepare the Dynamic Disk Header */
860     memset(&dyndisk_header, 0, sizeof(dyndisk_header));
861 
862     memcpy(dyndisk_header.magic, "cxsparse", 8);
863 
864     /*
865      * Note: The spec is actually wrong here for data_offset, it says
866      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
867      */
868     dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
869     dyndisk_header.table_offset = cpu_to_be64(3 * 512);
870     dyndisk_header.version = cpu_to_be32(0x00010000);
871     dyndisk_header.block_size = cpu_to_be32(block_size);
872     dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
873 
874     dyndisk_header.checksum = cpu_to_be32(
875         vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
876 
877     /* Write the header */
878     offset = 512;
879 
880     ret = blk_pwrite(blk, offset, &dyndisk_header, sizeof(dyndisk_header), 0);
881     if (ret < 0) {
882         goto fail;
883     }
884 
885     ret = 0;
886  fail:
887     return ret;
888 }
889 
890 static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
891                              int64_t total_size, Error **errp)
892 {
893     int ret;
894 
895     /* Add footer to total size */
896     total_size += sizeof(*footer);
897 
898     ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
899     if (ret < 0) {
900         return ret;
901     }
902 
903     ret = blk_pwrite(blk, total_size - sizeof(*footer),
904                      footer, sizeof(*footer), 0);
905     if (ret < 0) {
906         error_setg_errno(errp, -ret, "Unable to write VHD header");
907         return ret;
908     }
909 
910     return 0;
911 }
912 
913 static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
914                                         uint16_t *out_cyls,
915                                         uint8_t *out_heads,
916                                         uint8_t *out_secs_per_cyl,
917                                         int64_t *out_total_sectors,
918                                         Error **errp)
919 {
920     int64_t total_size = vpc_opts->size;
921     uint16_t cyls = 0;
922     uint8_t heads = 0;
923     uint8_t secs_per_cyl = 0;
924     int64_t total_sectors;
925     int i;
926 
927     /*
928      * Calculate matching total_size and geometry. Increase the number of
929      * sectors requested until we get enough (or fail). This ensures that
930      * qemu-img convert doesn't truncate images, but rather rounds up.
931      *
932      * If the image size can't be represented by a spec conformant CHS geometry,
933      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
934      * the image size from the VHD footer to calculate total_sectors.
935      */
936     if (vpc_opts->force_size) {
937         /* This will force the use of total_size for sector count, below */
938         cyls         = VHD_CHS_MAX_C;
939         heads        = VHD_CHS_MAX_H;
940         secs_per_cyl = VHD_CHS_MAX_S;
941     } else {
942         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
943         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
944             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
945         }
946     }
947 
948     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
949         total_sectors = total_size / BDRV_SECTOR_SIZE;
950         /* Allow a maximum disk size of 2040 GiB */
951         if (total_sectors > VHD_MAX_SECTORS) {
952             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
953             return -EFBIG;
954         }
955     } else {
956         total_sectors = (int64_t) cyls * heads * secs_per_cyl;
957     }
958 
959     *out_total_sectors = total_sectors;
960     if (out_cyls) {
961         *out_cyls = cyls;
962         *out_heads = heads;
963         *out_secs_per_cyl = secs_per_cyl;
964     }
965 
966     return 0;
967 }
968 
969 static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
970                                       Error **errp)
971 {
972     BlockdevCreateOptionsVpc *vpc_opts;
973     BlockBackend *blk = NULL;
974     BlockDriverState *bs = NULL;
975 
976     VHDFooter footer;
977     uint16_t cyls = 0;
978     uint8_t heads = 0;
979     uint8_t secs_per_cyl = 0;
980     int64_t total_sectors;
981     int64_t total_size;
982     int disk_type;
983     int ret = -EIO;
984     QemuUUID uuid;
985 
986     assert(opts->driver == BLOCKDEV_DRIVER_VPC);
987     vpc_opts = &opts->u.vpc;
988 
989     /* Validate options and set default values */
990     total_size = vpc_opts->size;
991 
992     if (!vpc_opts->has_subformat) {
993         vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
994     }
995     switch (vpc_opts->subformat) {
996     case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
997         disk_type = VHD_DYNAMIC;
998         break;
999     case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1000         disk_type = VHD_FIXED;
1001         break;
1002     default:
1003         g_assert_not_reached();
1004     }
1005 
1006     /* Create BlockBackend to write to the image */
1007     bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1008     if (bs == NULL) {
1009         return -EIO;
1010     }
1011 
1012     blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1013                           errp);
1014     if (!blk) {
1015         ret = -EPERM;
1016         goto out;
1017     }
1018     blk_set_allow_write_beyond_eof(blk, true);
1019 
1020     /* Get geometry and check that it matches the image size*/
1021     ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1022                                        &total_sectors, errp);
1023     if (ret < 0) {
1024         goto out;
1025     }
1026 
1027     if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1028         error_setg(errp, "The requested image size cannot be represented in "
1029                          "CHS geometry");
1030         error_append_hint(errp, "Try size=%llu or force-size=on (the "
1031                                 "latter makes the image incompatible with "
1032                                 "Virtual PC)",
1033                           total_sectors * BDRV_SECTOR_SIZE);
1034         ret = -EINVAL;
1035         goto out;
1036     }
1037 
1038     /* Prepare the Hard Disk Footer */
1039     memset(&footer, 0, sizeof(footer));
1040 
1041     memcpy(footer.creator, "conectix", 8);
1042     if (vpc_opts->force_size) {
1043         memcpy(footer.creator_app, "qem2", 4);
1044     } else {
1045         memcpy(footer.creator_app, "qemu", 4);
1046     }
1047     memcpy(footer.creator_os, "Wi2k", 4);
1048 
1049     footer.features = cpu_to_be32(0x02);
1050     footer.version = cpu_to_be32(0x00010000);
1051     if (disk_type == VHD_DYNAMIC) {
1052         footer.data_offset = cpu_to_be64(sizeof(footer));
1053     } else {
1054         footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1055     }
1056     footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1057 
1058     /* Version of Virtual PC 2007 */
1059     footer.major = cpu_to_be16(0x0005);
1060     footer.minor = cpu_to_be16(0x0003);
1061     footer.orig_size = cpu_to_be64(total_size);
1062     footer.current_size = cpu_to_be64(total_size);
1063     footer.cyls = cpu_to_be16(cyls);
1064     footer.heads = heads;
1065     footer.secs_per_cyl = secs_per_cyl;
1066 
1067     footer.type = cpu_to_be32(disk_type);
1068 
1069     qemu_uuid_generate(&uuid);
1070     footer.uuid = uuid;
1071 
1072     footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1073 
1074     if (disk_type == VHD_DYNAMIC) {
1075         ret = create_dynamic_disk(blk, &footer, total_sectors);
1076         if (ret < 0) {
1077             error_setg(errp, "Unable to create or write VHD header");
1078         }
1079     } else {
1080         ret = create_fixed_disk(blk, &footer, total_size, errp);
1081     }
1082 
1083 out:
1084     blk_unref(blk);
1085     bdrv_unref(bs);
1086     return ret;
1087 }
1088 
1089 static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
1090                                            const char *filename,
1091                                            QemuOpts *opts,
1092                                            Error **errp)
1093 {
1094     BlockdevCreateOptions *create_options = NULL;
1095     QDict *qdict;
1096     Visitor *v;
1097     BlockDriverState *bs = NULL;
1098     int ret;
1099 
1100     static const QDictRenames opt_renames[] = {
1101         { VPC_OPT_FORCE_SIZE,           "force-size" },
1102         { NULL, NULL },
1103     };
1104 
1105     /* Parse options and convert legacy syntax */
1106     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1107 
1108     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1109         ret = -EINVAL;
1110         goto fail;
1111     }
1112 
1113     /* Create and open the file (protocol layer) */
1114     ret = bdrv_create_file(filename, opts, errp);
1115     if (ret < 0) {
1116         goto fail;
1117     }
1118 
1119     bs = bdrv_open(filename, NULL, NULL,
1120                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1121     if (bs == NULL) {
1122         ret = -EIO;
1123         goto fail;
1124     }
1125 
1126     /* Now get the QAPI type BlockdevCreateOptions */
1127     qdict_put_str(qdict, "driver", "vpc");
1128     qdict_put_str(qdict, "file", bs->node_name);
1129 
1130     v = qobject_input_visitor_new_flat_confused(qdict, errp);
1131     if (!v) {
1132         ret = -EINVAL;
1133         goto fail;
1134     }
1135 
1136     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1137     visit_free(v);
1138     if (!create_options) {
1139         ret = -EINVAL;
1140         goto fail;
1141     }
1142 
1143     /* Silently round up size */
1144     assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1145     create_options->u.vpc.size =
1146         ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1147 
1148     if (!create_options->u.vpc.force_size) {
1149         int64_t total_sectors;
1150         ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1151                                            NULL, &total_sectors, errp);
1152         if (ret < 0) {
1153             goto fail;
1154         }
1155 
1156         create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1157     }
1158 
1159 
1160     /* Create the vpc image (format layer) */
1161     ret = vpc_co_create(create_options, errp);
1162 
1163 fail:
1164     qobject_unref(qdict);
1165     bdrv_unref(bs);
1166     qapi_free_BlockdevCreateOptions(create_options);
1167     return ret;
1168 }
1169 
1170 
1171 static int vpc_has_zero_init(BlockDriverState *bs)
1172 {
1173     BDRVVPCState *s = bs->opaque;
1174 
1175     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1176         return bdrv_has_zero_init(bs->file->bs);
1177     } else {
1178         return 1;
1179     }
1180 }
1181 
1182 static void vpc_close(BlockDriverState *bs)
1183 {
1184     BDRVVPCState *s = bs->opaque;
1185     qemu_vfree(s->pagetable);
1186 #ifdef CACHE
1187     g_free(s->pageentry_u8);
1188 #endif
1189 
1190     migrate_del_blocker(s->migration_blocker);
1191     error_free(s->migration_blocker);
1192 }
1193 
1194 static QemuOptsList vpc_create_opts = {
1195     .name = "vpc-create-opts",
1196     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1197     .desc = {
1198         {
1199             .name = BLOCK_OPT_SIZE,
1200             .type = QEMU_OPT_SIZE,
1201             .help = "Virtual disk size"
1202         },
1203         {
1204             .name = BLOCK_OPT_SUBFMT,
1205             .type = QEMU_OPT_STRING,
1206             .help =
1207                 "Type of virtual hard disk format. Supported formats are "
1208                 "{dynamic (default) | fixed} "
1209         },
1210         {
1211             .name = VPC_OPT_FORCE_SIZE,
1212             .type = QEMU_OPT_BOOL,
1213             .help = "Force disk size calculation to use the actual size "
1214                     "specified, rather than using the nearest CHS-based "
1215                     "calculation"
1216         },
1217         { /* end of list */ }
1218     }
1219 };
1220 
1221 static const char *const vpc_strong_runtime_opts[] = {
1222     VPC_OPT_SIZE_CALC,
1223 
1224     NULL
1225 };
1226 
1227 static BlockDriver bdrv_vpc = {
1228     .format_name    = "vpc",
1229     .instance_size  = sizeof(BDRVVPCState),
1230 
1231     .bdrv_probe             = vpc_probe,
1232     .bdrv_open              = vpc_open,
1233     .bdrv_close             = vpc_close,
1234     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1235     .bdrv_child_perm        = bdrv_default_perms,
1236     .bdrv_co_create         = vpc_co_create,
1237     .bdrv_co_create_opts    = vpc_co_create_opts,
1238 
1239     .bdrv_co_preadv             = vpc_co_preadv,
1240     .bdrv_co_pwritev            = vpc_co_pwritev,
1241     .bdrv_co_block_status       = vpc_co_block_status,
1242 
1243     .bdrv_get_info          = vpc_get_info,
1244 
1245     .is_format              = true,
1246     .create_opts            = &vpc_create_opts,
1247     .bdrv_has_zero_init     = vpc_has_zero_init,
1248     .strong_runtime_opts    = vpc_strong_runtime_opts,
1249 };
1250 
1251 static void bdrv_vpc_init(void)
1252 {
1253     bdrv_register(&bdrv_vpc);
1254 }
1255 
1256 block_init(bdrv_vpc_init);
1257