xref: /openbmc/qemu/block/vpc.c (revision 5ef4a0cb)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "block/qdict.h"
30 #include "sysemu/block-backend.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "migration/blocker.h"
34 #include "qemu/bswap.h"
35 #include "qemu/uuid.h"
36 #include "qemu/memalign.h"
37 #include "qapi/qmp/qdict.h"
38 #include "qapi/qobject-input-visitor.h"
39 #include "qapi/qapi-visit-block-core.h"
40 
41 /**************************************************************/
42 
43 //#define CACHE
44 
/*
 * Disk type values as stored (big-endian) in the 'type' field of the
 * VHD footer.
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};
50 
/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC). VHD footer timestamps count
 * seconds from that instant, so this is the offset between the two epochs.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder/head/sector values used for the CHS geometry fields */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

/* Upper bound on the number of 512-byte sectors of an image */
#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Number of sectors addressable with the maximum CHS geometry */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the "force_size" option (presumably a creation option - its use
 * is outside this part of the file) */
#define VPC_OPT_FORCE_SIZE "force_size"
62 
/*
 * On-disk VHD footer, exactly 512 bytes. All multi-byte fields are always
 * big-endian and must be converted before use.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Disk size in bytes; used as the image size when CHS is not trusted */
    uint64_t    current_size;

    /* CHS disk geometry */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
    uint8_t     reserved[427];
} QEMU_PACKED VHDFooter;

/* The footer occupies exactly one 512-byte sector on disk */
QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
101 
/*
 * On-disk header of a dynamic VHD image, exactly 1024 bytes. It is read
 * from the file offset given by the footer's data_offset field, and its
 * multi-byte fields are converted from big-endian before use.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
    uint8_t     reserved2[256];
} QEMU_PACKED VHDDynDiskHeader;

/* The dynamic disk header occupies exactly two 512-byte sectors on disk */
QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
136 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Held by the read/write/block-status paths of dynamic images */
    CoMutex lock;
    /* Copy of the on-disk footer, kept in on-disk (big-endian) byte order */
    VHDFooter footer;
    /* Byte offset in the image file where the next new data block goes;
     * the footer is rewritten at this offset whenever a block is added */
    uint64_t free_data_block_offset;
    /* Number of BAT entries, from the dynamic disk header */
    int max_table_entries;
    /* In-memory BAT in host byte order; each entry is a sector number in
     * the image file, 0xFFFFFFFF meaning "block not allocated" */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* File offset of the block bitmap most recently rewritten for a write */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block (power of two) */
    uint32_t block_size;
    /* Size in bytes of the sector bitmap preceding each data block */
    uint32_t bitmap_size;
    /* Runtime overrides for how the virtual disk size is determined */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    /* NOTE(review): CACHE is not defined in this file; this code is dead */
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Migration blocker registered while the image is open */
    Error *migration_blocker;
} BDRVVPCState;
161 
/* Name of the runtime option selecting how the disk size is calculated */
#define VPC_OPT_SIZE_CALC "force_size_calc"

/*
 * Runtime (open-time) options understood by the driver. The only option is
 * a string that vpc_parse_options() accepts as "chs" or "current_size".
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};

/* Forward declaration; the option list is presumably defined later in the
 * file, outside this excerpt */
static QemuOptsList vpc_create_opts;
179 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * 32-bit sum of all its bytes.
 *
 * @p:    start of the buffer (only read, never modified)
 * @size: number of bytes to sum
 *
 * Callers that checksum a structure containing its own checksum field are
 * expected to zero that field first. An empty buffer yields 0xFFFFFFFF.
 */
static uint32_t vpc_checksum(void *p, size_t size)
{
    const uint8_t *buf = p;
    uint32_t res = 0;
    size_t i;   /* same type as @size: no signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
191 
192 
/*
 * Format probe: reports a score of 100 when the buffer starts with the
 * 8-byte VHD footer magic "conectix", and 0 otherwise (including when fewer
 * than 8 bytes are available). @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    if (buf_size < 8) {
        return 0;
    }

    /* The magic contains no NUL byte, so a plain byte compare is enough */
    return memcmp(buf, "conectix", 8) == 0 ? 100 : 0;
}
199 
200 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
201                               Error **errp)
202 {
203     BDRVVPCState *s = bs->opaque;
204     const char *size_calc;
205 
206     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
207 
208     if (!size_calc) {
209        /* no override, use autodetect only */
210     } else if (!strcmp(size_calc, "current_size")) {
211         s->force_use_sz = true;
212     } else if (!strcmp(size_calc, "chs")) {
213         s->force_use_chs = true;
214     } else {
215         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
216     }
217 }
218 
219 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
220                     Error **errp)
221 {
222     BDRVVPCState *s = bs->opaque;
223     int i;
224     VHDFooter *footer;
225     QemuOpts *opts = NULL;
226     Error *local_err = NULL;
227     bool use_chs;
228     VHDDynDiskHeader dyndisk_header;
229     uint32_t checksum;
230     uint64_t computed_size;
231     uint64_t pagetable_size;
232     int disk_type = VHD_DYNAMIC;
233     int ret;
234     int64_t bs_size;
235 
236     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
237     if (ret < 0) {
238         return ret;
239     }
240 
241     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
242     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
243         ret = -EINVAL;
244         goto fail;
245     }
246 
247     vpc_parse_options(bs, opts, &local_err);
248     if (local_err) {
249         error_propagate(errp, local_err);
250         ret = -EINVAL;
251         goto fail;
252     }
253 
254     ret = bdrv_pread(bs->file, 0, sizeof(s->footer), &s->footer, 0);
255     if (ret < 0) {
256         error_setg(errp, "Unable to read VHD header");
257         goto fail;
258     }
259 
260     footer = &s->footer;
261     if (strncmp(footer->creator, "conectix", 8)) {
262         int64_t offset = bdrv_getlength(bs->file->bs);
263         if (offset < 0) {
264             ret = offset;
265             error_setg(errp, "Invalid file size");
266             goto fail;
267         } else if (offset < sizeof(*footer)) {
268             ret = -EINVAL;
269             error_setg(errp, "File too small for a VHD header");
270             goto fail;
271         }
272 
273         /* If a fixed disk, the footer is found only at the end of the file */
274         ret = bdrv_pread(bs->file, offset - sizeof(*footer), sizeof(*footer),
275                          footer, 0);
276         if (ret < 0) {
277             goto fail;
278         }
279         if (strncmp(footer->creator, "conectix", 8) ||
280             be32_to_cpu(footer->type) != VHD_FIXED) {
281             error_setg(errp, "invalid VPC image");
282             ret = -EINVAL;
283             goto fail;
284         }
285         disk_type = VHD_FIXED;
286     }
287 
288     checksum = be32_to_cpu(footer->checksum);
289     footer->checksum = 0;
290     if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
291         error_setg(errp, "Incorrect header checksum");
292         ret = -EINVAL;
293         goto fail;
294     }
295 
296     /* Write 'checksum' back to footer, or else will leave it with zero. */
297     footer->checksum = cpu_to_be32(checksum);
298 
299     /* The visible size of a image in Virtual PC depends on the geometry
300        rather than on the size stored in the footer (the size in the footer
301        is too large usually) */
302     bs->total_sectors = (int64_t)
303         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
304 
305     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
306      * VHD image sizes differently.  VPC will rely on CHS geometry,
307      * while Hyper-V and disk2vhd use the size specified in the footer.
308      *
309      * We use a couple of approaches to try and determine the correct method:
310      * look at the Creator App field, and look for images that have CHS
311      * geometry that is the maximum value.
312      *
313      * If the CHS geometry is the maximum CHS geometry, then we assume that
314      * the size is the footer->current_size to avoid truncation.  Otherwise,
315      * we follow the table based on footer->creator_app:
316      *
317      *  Known creator apps:
318      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
319      *      'qemu'  :  CHS              QEMU (uses disk geometry)
320      *      'qem2'  :  current_size     QEMU (uses current_size)
321      *      'win '  :  current_size     Hyper-V
322      *      'd2v '  :  current_size     Disk2vhd
323      *      'tap\0' :  current_size     XenServer
324      *      'CTXS'  :  current_size     XenConverter
325      *
326      *  The user can override the table values via drive options, however
327      *  even with an override we will still use current_size for images
328      *  that have CHS geometry of the maximum size.
329      */
330     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
331                !!strncmp(footer->creator_app, "qem2", 4) &&
332                !!strncmp(footer->creator_app, "d2v ", 4) &&
333                !!strncmp(footer->creator_app, "CTXS", 4) &&
334                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
335 
336     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
337         bs->total_sectors = be64_to_cpu(footer->current_size) /
338                                         BDRV_SECTOR_SIZE;
339     }
340 
341     /* Allow a maximum disk size of 2040 GiB */
342     if (bs->total_sectors > VHD_MAX_SECTORS) {
343         ret = -EFBIG;
344         goto fail;
345     }
346 
347     if (disk_type == VHD_DYNAMIC) {
348         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
349                          sizeof(dyndisk_header), &dyndisk_header, 0);
350         if (ret < 0) {
351             error_setg(errp, "Error reading dynamic VHD header");
352             goto fail;
353         }
354 
355         if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
356             error_setg(errp, "Invalid header magic");
357             ret = -EINVAL;
358             goto fail;
359         }
360 
361         s->block_size = be32_to_cpu(dyndisk_header.block_size);
362         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
363             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
364             ret = -EINVAL;
365             goto fail;
366         }
367         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
368 
369         s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
370 
371         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
372             error_setg(errp, "Too many blocks");
373             ret = -EINVAL;
374             goto fail;
375         }
376 
377         computed_size = (uint64_t) s->max_table_entries * s->block_size;
378         if (computed_size < bs->total_sectors * 512) {
379             error_setg(errp, "Page table too small");
380             ret = -EINVAL;
381             goto fail;
382         }
383 
384         if (s->max_table_entries > SIZE_MAX / 4 ||
385             s->max_table_entries > (int) INT_MAX / 4) {
386             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
387                         s->max_table_entries);
388             ret = -EINVAL;
389             goto fail;
390         }
391 
392         pagetable_size = (uint64_t) s->max_table_entries * 4;
393 
394         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
395         if (s->pagetable == NULL) {
396             error_setg(errp, "Unable to allocate memory for page table");
397             ret = -ENOMEM;
398             goto fail;
399         }
400 
401         s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
402 
403         ret = bdrv_pread(bs->file, s->bat_offset, pagetable_size,
404                          s->pagetable, 0);
405         if (ret < 0) {
406             error_setg(errp, "Error reading pagetable");
407             goto fail;
408         }
409 
410         s->free_data_block_offset =
411             ROUND_UP(s->bat_offset + pagetable_size, 512);
412 
413         for (i = 0; i < s->max_table_entries; i++) {
414             be32_to_cpus(&s->pagetable[i]);
415             if (s->pagetable[i] != 0xFFFFFFFF) {
416                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
417                     s->bitmap_size + s->block_size;
418 
419                 if (next > s->free_data_block_offset) {
420                     s->free_data_block_offset = next;
421                 }
422             }
423         }
424 
425         bs_size = bdrv_getlength(bs->file->bs);
426         if (bs_size < 0) {
427             error_setg_errno(errp, -bs_size, "Unable to learn image size");
428             ret = bs_size;
429             goto fail;
430         }
431         if (s->free_data_block_offset > bs_size) {
432             error_setg(errp, "block-vpc: free_data_block_offset points after "
433                              "the end of file. The image has been truncated.");
434             ret = -EINVAL;
435             goto fail;
436         }
437 
438         s->last_bitmap_offset = (int64_t) -1;
439 
440 #ifdef CACHE
441         s->pageentry_u8 = g_malloc(512);
442         s->pageentry_u32 = s->pageentry_u8;
443         s->pageentry_u16 = s->pageentry_u8;
444         s->last_pagetable = -1;
445 #endif
446     }
447 
448     /* Disable migration when VHD images are used */
449     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
450                "does not support live migration",
451                bdrv_get_device_or_node_name(bs));
452     ret = migrate_add_blocker(s->migration_blocker, errp);
453     if (ret < 0) {
454         error_free(s->migration_blocker);
455         goto fail;
456     }
457 
458     qemu_co_mutex_init(&s->lock);
459     qemu_opts_del(opts);
460 
461     return 0;
462 
463 fail:
464     qemu_opts_del(opts);
465     qemu_vfree(s->pagetable);
466 #ifdef CACHE
467     g_free(s->pageentry_u8);
468 #endif
469     return ret;
470 }
471 
/*
 * Reopen preparation hook. Performs no checks and makes no changes: it
 * unconditionally returns 0 and never touches @state, @queue or @errp.
 */
static int vpc_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
477 
478 /*
479  * Returns the absolute byte offset of the given sector in the image file.
480  * If the sector is not allocated, -1 is returned instead.
481  * If an error occurred trying to write an updated block bitmap back to
482  * the file, -2 is returned, and the error value is written to *err.
483  * This can only happen for a write operation.
484  *
485  * The parameter write must be 1 if the offset will be used for a write
486  * operation (the block bitmaps is updated then), 0 otherwise.
487  * If write is true then err must not be NULL.
488  */
489 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
490                                        bool write, int *err)
491 {
492     BDRVVPCState *s = bs->opaque;
493     uint64_t bitmap_offset, block_offset;
494     uint32_t pagetable_index, offset_in_block;
495 
496     assert(!(write && err == NULL));
497 
498     pagetable_index = offset / s->block_size;
499     offset_in_block = offset % s->block_size;
500 
501     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
502         return -1; /* not allocated */
503 
504     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
505     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
506 
507     /* We must ensure that we don't write to any sectors which are marked as
508        unused in the bitmap. We get away with setting all bits in the block
509        bitmap each time we write to a new block. This might cause Virtual PC to
510        miss sparse read optimization, but it's not a problem in terms of
511        correctness. */
512     if (write && (s->last_bitmap_offset != bitmap_offset)) {
513         uint8_t bitmap[s->bitmap_size];
514         int r;
515 
516         s->last_bitmap_offset = bitmap_offset;
517         memset(bitmap, 0xff, s->bitmap_size);
518         r = bdrv_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap,
519                              0);
520         if (r < 0) {
521             *err = r;
522             return -2;
523         }
524     }
525 
526     return block_offset;
527 }
528 
529 /*
530  * Writes the footer to the end of the image file. This is needed when the
531  * file grows as it overwrites the old footer
532  *
533  * Returns 0 on success and < 0 on error
534  */
535 static int rewrite_footer(BlockDriverState *bs)
536 {
537     int ret;
538     BDRVVPCState *s = bs->opaque;
539     int64_t offset = s->free_data_block_offset;
540 
541     ret = bdrv_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
542     if (ret < 0)
543         return ret;
544 
545     return 0;
546 }
547 
548 /*
549  * Allocates a new block. This involves writing a new footer and updating
550  * the Block Allocation Table to use the space at the old end of the image
551  * file (overwriting the old footer)
552  *
553  * Returns the sectors' offset in the image file on success and < 0 on error
554  */
555 static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
556 {
557     BDRVVPCState *s = bs->opaque;
558     int64_t bat_offset;
559     uint32_t index, bat_value;
560     int ret;
561     uint8_t bitmap[s->bitmap_size];
562 
563     /* Check if sector_num is valid */
564     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
565         return -EINVAL;
566     }
567 
568     /* Write entry into in-memory BAT */
569     index = offset / s->block_size;
570     assert(s->pagetable[index] == 0xFFFFFFFF);
571     s->pagetable[index] = s->free_data_block_offset / 512;
572 
573     /* Initialize the block's bitmap */
574     memset(bitmap, 0xff, s->bitmap_size);
575     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset,
576                            s->bitmap_size, bitmap, 0);
577     if (ret < 0) {
578         return ret;
579     }
580 
581     /* Write new footer (the old one will be overwritten) */
582     s->free_data_block_offset += s->block_size + s->bitmap_size;
583     ret = rewrite_footer(bs);
584     if (ret < 0)
585         goto fail;
586 
587     /* Write BAT entry to disk */
588     bat_offset = s->bat_offset + (4 * index);
589     bat_value = cpu_to_be32(s->pagetable[index]);
590     ret = bdrv_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
591     if (ret < 0)
592         goto fail;
593 
594     return get_image_offset(bs, offset, false, NULL);
595 
596 fail:
597     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
598     return ret;
599 }
600 
601 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
602 {
603     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
604 
605     if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
606         bdi->cluster_size = s->block_size;
607     }
608 
609     return 0;
610 }
611 
612 static int coroutine_fn
613 vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
614               QEMUIOVector *qiov, BdrvRequestFlags flags)
615 {
616     BDRVVPCState *s = bs->opaque;
617     int ret;
618     int64_t image_offset;
619     int64_t n_bytes;
620     int64_t bytes_done = 0;
621     QEMUIOVector local_qiov;
622 
623     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
624         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
625     }
626 
627     qemu_co_mutex_lock(&s->lock);
628     qemu_iovec_init(&local_qiov, qiov->niov);
629 
630     while (bytes > 0) {
631         image_offset = get_image_offset(bs, offset, false, NULL);
632         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
633 
634         if (image_offset == -1) {
635             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
636         } else {
637             qemu_iovec_reset(&local_qiov);
638             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
639 
640             qemu_co_mutex_unlock(&s->lock);
641             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
642                                  &local_qiov, 0);
643             qemu_co_mutex_lock(&s->lock);
644             if (ret < 0) {
645                 goto fail;
646             }
647         }
648 
649         bytes -= n_bytes;
650         offset += n_bytes;
651         bytes_done += n_bytes;
652     }
653 
654     ret = 0;
655 fail:
656     qemu_iovec_destroy(&local_qiov);
657     qemu_co_mutex_unlock(&s->lock);
658 
659     return ret;
660 }
661 
662 static int coroutine_fn
663 vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
664                QEMUIOVector *qiov, BdrvRequestFlags flags)
665 {
666     BDRVVPCState *s = bs->opaque;
667     int64_t image_offset;
668     int64_t n_bytes;
669     int64_t bytes_done = 0;
670     int ret = 0;
671     QEMUIOVector local_qiov;
672 
673     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
674         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
675     }
676 
677     qemu_co_mutex_lock(&s->lock);
678     qemu_iovec_init(&local_qiov, qiov->niov);
679 
680     while (bytes > 0) {
681         image_offset = get_image_offset(bs, offset, true, &ret);
682         if (image_offset == -2) {
683             /* Failed to write block bitmap: can't proceed with write */
684             goto fail;
685         }
686         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
687 
688         if (image_offset == -1) {
689             image_offset = alloc_block(bs, offset);
690             if (image_offset < 0) {
691                 ret = image_offset;
692                 goto fail;
693             }
694         }
695 
696         qemu_iovec_reset(&local_qiov);
697         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
698 
699         qemu_co_mutex_unlock(&s->lock);
700         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
701                               &local_qiov, 0);
702         qemu_co_mutex_lock(&s->lock);
703         if (ret < 0) {
704             goto fail;
705         }
706 
707         bytes -= n_bytes;
708         offset += n_bytes;
709         bytes_done += n_bytes;
710     }
711 
712     ret = 0;
713 fail:
714     qemu_iovec_destroy(&local_qiov);
715     qemu_co_mutex_unlock(&s->lock);
716 
717     return ret;
718 }
719 
720 static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
721                                             bool want_zero,
722                                             int64_t offset, int64_t bytes,
723                                             int64_t *pnum, int64_t *map,
724                                             BlockDriverState **file)
725 {
726     BDRVVPCState *s = bs->opaque;
727     int64_t image_offset;
728     bool allocated;
729     int ret;
730     int64_t n;
731 
732     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
733         *pnum = bytes;
734         *map = offset;
735         *file = bs->file->bs;
736         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
737     }
738 
739     qemu_co_mutex_lock(&s->lock);
740 
741     image_offset = get_image_offset(bs, offset, false, NULL);
742     allocated = (image_offset != -1);
743     *pnum = 0;
744     ret = BDRV_BLOCK_ZERO;
745 
746     do {
747         /* All sectors in a block are contiguous (without using the bitmap) */
748         n = ROUND_UP(offset + 1, s->block_size) - offset;
749         n = MIN(n, bytes);
750 
751         *pnum += n;
752         offset += n;
753         bytes -= n;
754         /* *pnum can't be greater than one block for allocated
755          * sectors since there is always a bitmap in between. */
756         if (allocated) {
757             *file = bs->file->bs;
758             *map = image_offset;
759             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
760             break;
761         }
762         if (bytes == 0) {
763             break;
764         }
765         image_offset = get_image_offset(bs, offset, false, NULL);
766     } while (image_offset == -1);
767 
768     qemu_co_mutex_unlock(&s->lock);
769     return ret;
770 }
771 
772 /*
773  * Calculates the number of cylinders, heads and sectors per cylinder
774  * based on a given number of sectors. This is the algorithm described
775  * in the VHD specification.
776  *
777  * Note that the geometry doesn't always exactly match total_sectors but
778  * may round it down.
779  *
780  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
781  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
782  * and instead allow up to 255 heads.
783  */
784 static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
785     uint8_t *heads, uint8_t *secs_per_cyl)
786 {
787     uint32_t cyls_times_heads;
788 
789     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
790 
791     if (total_sectors >= 65535LL * 16 * 63) {
792         *secs_per_cyl = 255;
793         *heads = 16;
794         cyls_times_heads = total_sectors / *secs_per_cyl;
795     } else {
796         *secs_per_cyl = 17;
797         cyls_times_heads = total_sectors / *secs_per_cyl;
798         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
799 
800         if (*heads < 4) {
801             *heads = 4;
802         }
803 
804         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
805             *secs_per_cyl = 31;
806             *heads = 16;
807             cyls_times_heads = total_sectors / *secs_per_cyl;
808         }
809 
810         if (cyls_times_heads >= (*heads * 1024)) {
811             *secs_per_cyl = 63;
812             *heads = 16;
813             cyls_times_heads = total_sectors / *secs_per_cyl;
814         }
815     }
816 
817     *cyls = cyls_times_heads / *heads;
818 
819     return 0;
820 }
821 
822 static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
823                                int64_t total_sectors)
824 {
825     VHDDynDiskHeader dyndisk_header;
826     uint8_t bat_sector[512];
827     size_t block_size, num_bat_entries;
828     int i;
829     int ret;
830     int64_t offset = 0;
831 
832     /* Write the footer (twice: at the beginning and at the end) */
833     block_size = 0x200000;
834     num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
835 
836     ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
837     if (ret < 0) {
838         goto fail;
839     }
840 
841     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
842     ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
843     if (ret < 0) {
844         goto fail;
845     }
846 
847     /* Write the initial BAT */
848     offset = 3 * 512;
849 
850     memset(bat_sector, 0xFF, 512);
851     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
852         ret = blk_pwrite(blk, offset, 512, bat_sector, 0);
853         if (ret < 0) {
854             goto fail;
855         }
856         offset += 512;
857     }
858 
859     /* Prepare the Dynamic Disk Header */
860     memset(&dyndisk_header, 0, sizeof(dyndisk_header));
861 
862     memcpy(dyndisk_header.magic, "cxsparse", 8);
863 
864     /*
865      * Note: The spec is actually wrong here for data_offset, it says
866      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
867      */
868     dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
869     dyndisk_header.table_offset = cpu_to_be64(3 * 512);
870     dyndisk_header.version = cpu_to_be32(0x00010000);
871     dyndisk_header.block_size = cpu_to_be32(block_size);
872     dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
873 
874     dyndisk_header.checksum = cpu_to_be32(
875         vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
876 
877     /* Write the header */
878     offset = 512;
879 
880     ret = blk_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
881     if (ret < 0) {
882         goto fail;
883     }
884 
885     ret = 0;
886  fail:
887     return ret;
888 }
889 
890 static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
891                              int64_t total_size, Error **errp)
892 {
893     int ret;
894 
895     /* Add footer to total size */
896     total_size += sizeof(*footer);
897 
898     ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
899     if (ret < 0) {
900         return ret;
901     }
902 
903     ret = blk_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
904                      footer, 0);
905     if (ret < 0) {
906         error_setg_errno(errp, -ret, "Unable to write VHD header");
907         return ret;
908     }
909 
910     return 0;
911 }
912 
913 static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
914                                         uint16_t *out_cyls,
915                                         uint8_t *out_heads,
916                                         uint8_t *out_secs_per_cyl,
917                                         int64_t *out_total_sectors,
918                                         Error **errp)
919 {
920     int64_t total_size = vpc_opts->size;
921     uint16_t cyls = 0;
922     uint8_t heads = 0;
923     uint8_t secs_per_cyl = 0;
924     int64_t total_sectors;
925     int i;
926 
927     /*
928      * Calculate matching total_size and geometry. Increase the number of
929      * sectors requested until we get enough (or fail). This ensures that
930      * qemu-img convert doesn't truncate images, but rather rounds up.
931      *
932      * If the image size can't be represented by a spec conformant CHS geometry,
933      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
934      * the image size from the VHD footer to calculate total_sectors.
935      */
936     if (vpc_opts->force_size) {
937         /* This will force the use of total_size for sector count, below */
938         cyls         = VHD_CHS_MAX_C;
939         heads        = VHD_CHS_MAX_H;
940         secs_per_cyl = VHD_CHS_MAX_S;
941     } else {
942         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
943         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
944             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
945         }
946     }
947 
948     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
949         total_sectors = total_size / BDRV_SECTOR_SIZE;
950         /* Allow a maximum disk size of 2040 GiB */
951         if (total_sectors > VHD_MAX_SECTORS) {
952             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
953             return -EFBIG;
954         }
955     } else {
956         total_sectors = (int64_t) cyls * heads * secs_per_cyl;
957     }
958 
959     *out_total_sectors = total_sectors;
960     if (out_cyls) {
961         *out_cyls = cyls;
962         *out_heads = heads;
963         *out_secs_per_cyl = secs_per_cyl;
964     }
965 
966     return 0;
967 }
968 
969 static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
970                                       Error **errp)
971 {
972     BlockdevCreateOptionsVpc *vpc_opts;
973     BlockBackend *blk = NULL;
974     BlockDriverState *bs = NULL;
975 
976     VHDFooter footer;
977     uint16_t cyls = 0;
978     uint8_t heads = 0;
979     uint8_t secs_per_cyl = 0;
980     int64_t total_sectors;
981     int64_t total_size;
982     int disk_type;
983     int ret = -EIO;
984     QemuUUID uuid;
985 
986     assert(opts->driver == BLOCKDEV_DRIVER_VPC);
987     vpc_opts = &opts->u.vpc;
988 
989     /* Validate options and set default values */
990     total_size = vpc_opts->size;
991 
992     if (!vpc_opts->has_subformat) {
993         vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
994     }
995     switch (vpc_opts->subformat) {
996     case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
997         disk_type = VHD_DYNAMIC;
998         break;
999     case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1000         disk_type = VHD_FIXED;
1001         break;
1002     default:
1003         g_assert_not_reached();
1004     }
1005 
1006     /* Create BlockBackend to write to the image */
1007     bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1008     if (bs == NULL) {
1009         return -EIO;
1010     }
1011 
1012     blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1013                           errp);
1014     if (!blk) {
1015         ret = -EPERM;
1016         goto out;
1017     }
1018     blk_set_allow_write_beyond_eof(blk, true);
1019 
1020     /* Get geometry and check that it matches the image size*/
1021     ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1022                                        &total_sectors, errp);
1023     if (ret < 0) {
1024         goto out;
1025     }
1026 
1027     if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1028         error_setg(errp, "The requested image size cannot be represented in "
1029                          "CHS geometry");
1030         error_append_hint(errp, "Try size=%llu or force-size=on (the "
1031                                 "latter makes the image incompatible with "
1032                                 "Virtual PC)",
1033                           total_sectors * BDRV_SECTOR_SIZE);
1034         ret = -EINVAL;
1035         goto out;
1036     }
1037 
1038     /* Prepare the Hard Disk Footer */
1039     memset(&footer, 0, sizeof(footer));
1040 
1041     memcpy(footer.creator, "conectix", 8);
1042     if (vpc_opts->force_size) {
1043         memcpy(footer.creator_app, "qem2", 4);
1044     } else {
1045         memcpy(footer.creator_app, "qemu", 4);
1046     }
1047     memcpy(footer.creator_os, "Wi2k", 4);
1048 
1049     footer.features = cpu_to_be32(0x02);
1050     footer.version = cpu_to_be32(0x00010000);
1051     if (disk_type == VHD_DYNAMIC) {
1052         footer.data_offset = cpu_to_be64(sizeof(footer));
1053     } else {
1054         footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1055     }
1056     footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1057 
1058     /* Version of Virtual PC 2007 */
1059     footer.major = cpu_to_be16(0x0005);
1060     footer.minor = cpu_to_be16(0x0003);
1061     footer.orig_size = cpu_to_be64(total_size);
1062     footer.current_size = cpu_to_be64(total_size);
1063     footer.cyls = cpu_to_be16(cyls);
1064     footer.heads = heads;
1065     footer.secs_per_cyl = secs_per_cyl;
1066 
1067     footer.type = cpu_to_be32(disk_type);
1068 
1069     qemu_uuid_generate(&uuid);
1070     footer.uuid = uuid;
1071 
1072     footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1073 
1074     if (disk_type == VHD_DYNAMIC) {
1075         ret = create_dynamic_disk(blk, &footer, total_sectors);
1076         if (ret < 0) {
1077             error_setg(errp, "Unable to create or write VHD header");
1078         }
1079     } else {
1080         ret = create_fixed_disk(blk, &footer, total_size, errp);
1081     }
1082 
1083 out:
1084     blk_unref(blk);
1085     bdrv_unref(bs);
1086     return ret;
1087 }
1088 
1089 static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
1090                                            const char *filename,
1091                                            QemuOpts *opts,
1092                                            Error **errp)
1093 {
1094     BlockdevCreateOptions *create_options = NULL;
1095     QDict *qdict;
1096     Visitor *v;
1097     BlockDriverState *bs = NULL;
1098     int ret;
1099 
1100     static const QDictRenames opt_renames[] = {
1101         { VPC_OPT_FORCE_SIZE,           "force-size" },
1102         { NULL, NULL },
1103     };
1104 
1105     /* Parse options and convert legacy syntax */
1106     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1107 
1108     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1109         ret = -EINVAL;
1110         goto fail;
1111     }
1112 
1113     /* Create and open the file (protocol layer) */
1114     ret = bdrv_co_create_file(filename, opts, errp);
1115     if (ret < 0) {
1116         goto fail;
1117     }
1118 
1119     bs = bdrv_open(filename, NULL, NULL,
1120                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1121     if (bs == NULL) {
1122         ret = -EIO;
1123         goto fail;
1124     }
1125 
1126     /* Now get the QAPI type BlockdevCreateOptions */
1127     qdict_put_str(qdict, "driver", "vpc");
1128     qdict_put_str(qdict, "file", bs->node_name);
1129 
1130     v = qobject_input_visitor_new_flat_confused(qdict, errp);
1131     if (!v) {
1132         ret = -EINVAL;
1133         goto fail;
1134     }
1135 
1136     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1137     visit_free(v);
1138     if (!create_options) {
1139         ret = -EINVAL;
1140         goto fail;
1141     }
1142 
1143     /* Silently round up size */
1144     assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1145     create_options->u.vpc.size =
1146         ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1147 
1148     if (!create_options->u.vpc.force_size) {
1149         int64_t total_sectors;
1150         ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1151                                            NULL, &total_sectors, errp);
1152         if (ret < 0) {
1153             goto fail;
1154         }
1155 
1156         create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1157     }
1158 
1159 
1160     /* Create the vpc image (format layer) */
1161     ret = vpc_co_create(create_options, errp);
1162 
1163 fail:
1164     qobject_unref(qdict);
1165     bdrv_unref(bs);
1166     qapi_free_BlockdevCreateOptions(create_options);
1167     return ret;
1168 }
1169 
1170 
1171 static int vpc_has_zero_init(BlockDriverState *bs)
1172 {
1173     BDRVVPCState *s = bs->opaque;
1174 
1175     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1176         return bdrv_has_zero_init(bs->file->bs);
1177     } else {
1178         return 1;
1179     }
1180 }
1181 
1182 static void vpc_close(BlockDriverState *bs)
1183 {
1184     BDRVVPCState *s = bs->opaque;
1185     qemu_vfree(s->pagetable);
1186 #ifdef CACHE
1187     g_free(s->pageentry_u8);
1188 #endif
1189 
1190     migrate_del_blocker(s->migration_blocker);
1191     error_free(s->migration_blocker);
1192 }
1193 
1194 static QemuOptsList vpc_create_opts = {
1195     .name = "vpc-create-opts",
1196     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1197     .desc = {
1198         {
1199             .name = BLOCK_OPT_SIZE,
1200             .type = QEMU_OPT_SIZE,
1201             .help = "Virtual disk size"
1202         },
1203         {
1204             .name = BLOCK_OPT_SUBFMT,
1205             .type = QEMU_OPT_STRING,
1206             .help =
1207                 "Type of virtual hard disk format. Supported formats are "
1208                 "{dynamic (default) | fixed} "
1209         },
1210         {
1211             .name = VPC_OPT_FORCE_SIZE,
1212             .type = QEMU_OPT_BOOL,
1213             .help = "Force disk size calculation to use the actual size "
1214                     "specified, rather than using the nearest CHS-based "
1215                     "calculation"
1216         },
1217         { /* end of list */ }
1218     }
1219 };
1220 
1221 static const char *const vpc_strong_runtime_opts[] = {
1222     VPC_OPT_SIZE_CALC,
1223 
1224     NULL
1225 };
1226 
1227 static BlockDriver bdrv_vpc = {
1228     .format_name    = "vpc",
1229     .instance_size  = sizeof(BDRVVPCState),
1230 
1231     .bdrv_probe             = vpc_probe,
1232     .bdrv_open              = vpc_open,
1233     .bdrv_close             = vpc_close,
1234     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1235     .bdrv_child_perm        = bdrv_default_perms,
1236     .bdrv_co_create         = vpc_co_create,
1237     .bdrv_co_create_opts    = vpc_co_create_opts,
1238 
1239     .bdrv_co_preadv             = vpc_co_preadv,
1240     .bdrv_co_pwritev            = vpc_co_pwritev,
1241     .bdrv_co_block_status       = vpc_co_block_status,
1242 
1243     .bdrv_get_info          = vpc_get_info,
1244 
1245     .is_format              = true,
1246     .create_opts            = &vpc_create_opts,
1247     .bdrv_has_zero_init     = vpc_has_zero_init,
1248     .strong_runtime_opts    = vpc_strong_runtime_opts,
1249 };
1250 
1251 static void bdrv_vpc_init(void)
1252 {
1253     bdrv_register(&bdrv_vpc);
1254 }
1255 
1256 block_init(bdrv_vpc_init);
1257