xref: /openbmc/qemu/block/vpc.c (revision b14df228)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "block/qdict.h"
30 #include "sysemu/block-backend.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "migration/blocker.h"
34 #include "qemu/bswap.h"
35 #include "qemu/uuid.h"
36 #include "qemu/memalign.h"
37 #include "qapi/qmp/qdict.h"
38 #include "qapi/qobject-input-visitor.h"
39 #include "qapi/qapi-visit-block-core.h"
40 
41 /**************************************************************/
42 
43 //#define CACHE
44 
/*
 * Disk type values as stored (big-endian) in the "type" field of the VHD
 * footer.
 */
enum vhd_type {
    /* Raw data followed by a footer located only at the end of the file */
    VHD_FIXED           = 2,
    /* Sparse image: footer copy, dynamic disk header and BAT up front */
    VHD_DYNAMIC         = 3,
    /* NOTE(review): not referenced in the visible code; presumably an
     * image with a parent (backing) disk -- confirm against the VHD spec */
    VHD_DIFFERENCING    = 4,
};
50 
/*
 * Unix time (seconds since Jan 1, 1970 UTC) of the VHD epoch,
 * Jan 1, 2000 0:00:00 (UTC). VHD timestamps count seconds from that epoch.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder / head / sector counts used for the footer geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count of the largest geometry (65535 x 16 x 255) */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* NOTE(review): option name; presumably used by the image creation options
 * defined outside this section -- confirm */
#define VPC_OPT_FORCE_SIZE "force_size"
62 
/*
 * On-disk VHD hard disk footer, exactly 512 bytes.
 * All multi-byte fields are always big-endian.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /* Application that created the image; used to pick the size source */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Virtual disk size in bytes */
    uint64_t    current_size;

    /* Disk geometry; cyls * heads * secs_per_cyl gives the sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
    /* Pads the structure to 512 bytes */
    uint8_t     reserved[427];
} QEMU_PACKED VHDFooter;

/* The footer is read and written as one 512-byte unit */
QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
101 
/*
 * On-disk dynamic disk header, exactly 1024 bytes. For dynamic images it is
 * read from the offset given by the footer's data_offset field. Multi-byte
 * fields are big-endian (converted with be32/be64 helpers by the reader).
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    /* Number of BAT entries */
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Same checksum algorithm as the footer, computed with this field zero */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* NOTE(review): parent locator entries are not interpreted by the
     * visible code; field meanings per the VHD spec -- confirm */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
    /* Pads the structure to 1024 bytes */
    uint8_t     reserved2[256];
} QEMU_PACKED VHDDynDiskHeader;

/* The header is read and written as one 1024-byte unit */
QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
136 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Held by the read, write and block-status paths for dynamic images */
    CoMutex lock;
    /* In-memory copy of the footer, kept in on-disk (big-endian) form */
    VHDFooter footer;
    /* Byte offset where the next data block is allocated; the footer is
     * rewritten at this offset after the file grows */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /* BAT in host byte order; each entry is the block's 512-byte sector
     * number in the image file, 0xFFFFFFFF means not allocated */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently set to all-ones by the write
     * path; (uint64_t)-1 right after open */
    uint64_t last_bitmap_offset;

    /* Size of a data block in bytes (power of two) */
    uint32_t block_size;
    /* Size in bytes of the sector bitmap preceding each data block */
    uint32_t bitmap_size;
    /* Set from the "force_size_calc" runtime option ("chs") */
    bool force_use_chs;
    /* Set from the "force_size_calc" runtime option ("current_size") */
    bool force_use_sz;

#ifdef CACHE
    /* Only built when CACHE is defined (commented out at the top of file) */
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Registered at open time; this format does not support live migration */
    Error *migration_blocker;
} BDRVVPCState;
161 
/* Runtime option selecting how the virtual disk size is determined */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Options accepted when opening an image. They are absorbed from the
 * options QDict and evaluated by vpc_parse_options().
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};

/* Forward declaration; presumably defined further down in the file */
static QemuOptsList vpc_create_opts;
179 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * (32-bit, wrapping) sum of all its bytes.
 *
 * @p:    start of the buffer; it is only read
 * @size: number of bytes to sum
 *
 * Callers checksumming a footer or header must zero the structure's own
 * checksum field first, since it is not skipped here.
 *
 * Returns the checksum in host byte order (0xFFFFFFFF for an empty buffer).
 */
static uint32_t vpc_checksum(void *p, size_t size)
{
    const uint8_t *buf = p;
    uint32_t res = 0;
    size_t i;

    /* size_t index: matches the type of the bound, no signed overflow */
    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
191 
192 
/*
 * Format probe: reports a score of 100 when the buffer starts with the
 * 8-byte VHD footer magic "conectix", and 0 otherwise (including when fewer
 * than 8 bytes are available). @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char magic[] = "conectix";
    const int magic_len = 8;

    if (buf_size < magic_len) {
        return 0;
    }

    return memcmp(buf, magic, magic_len) == 0 ? 100 : 0;
}
199 
200 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
201                               Error **errp)
202 {
203     BDRVVPCState *s = bs->opaque;
204     const char *size_calc;
205 
206     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
207 
208     if (!size_calc) {
209        /* no override, use autodetect only */
210     } else if (!strcmp(size_calc, "current_size")) {
211         s->force_use_sz = true;
212     } else if (!strcmp(size_calc, "chs")) {
213         s->force_use_chs = true;
214     } else {
215         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
216     }
217 }
218 
219 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
220                     Error **errp)
221 {
222     BDRVVPCState *s = bs->opaque;
223     int i;
224     VHDFooter *footer;
225     QemuOpts *opts = NULL;
226     Error *local_err = NULL;
227     bool use_chs;
228     VHDDynDiskHeader dyndisk_header;
229     uint32_t checksum;
230     uint64_t computed_size;
231     uint64_t pagetable_size;
232     int disk_type = VHD_DYNAMIC;
233     int ret;
234     int64_t bs_size;
235 
236     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
237                                BDRV_CHILD_IMAGE, false, errp);
238     if (!bs->file) {
239         return -EINVAL;
240     }
241 
242     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
243     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
244         ret = -EINVAL;
245         goto fail;
246     }
247 
248     vpc_parse_options(bs, opts, &local_err);
249     if (local_err) {
250         error_propagate(errp, local_err);
251         ret = -EINVAL;
252         goto fail;
253     }
254 
255     ret = bdrv_pread(bs->file, 0, sizeof(s->footer), &s->footer, 0);
256     if (ret < 0) {
257         error_setg(errp, "Unable to read VHD header");
258         goto fail;
259     }
260 
261     footer = &s->footer;
262     if (strncmp(footer->creator, "conectix", 8)) {
263         int64_t offset = bdrv_getlength(bs->file->bs);
264         if (offset < 0) {
265             ret = offset;
266             error_setg(errp, "Invalid file size");
267             goto fail;
268         } else if (offset < sizeof(*footer)) {
269             ret = -EINVAL;
270             error_setg(errp, "File too small for a VHD header");
271             goto fail;
272         }
273 
274         /* If a fixed disk, the footer is found only at the end of the file */
275         ret = bdrv_pread(bs->file, offset - sizeof(*footer), sizeof(*footer),
276                          footer, 0);
277         if (ret < 0) {
278             goto fail;
279         }
280         if (strncmp(footer->creator, "conectix", 8) ||
281             be32_to_cpu(footer->type) != VHD_FIXED) {
282             error_setg(errp, "invalid VPC image");
283             ret = -EINVAL;
284             goto fail;
285         }
286         disk_type = VHD_FIXED;
287     }
288 
289     checksum = be32_to_cpu(footer->checksum);
290     footer->checksum = 0;
291     if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
292         error_setg(errp, "Incorrect header checksum");
293         ret = -EINVAL;
294         goto fail;
295     }
296 
297     /* Write 'checksum' back to footer, or else will leave it with zero. */
298     footer->checksum = cpu_to_be32(checksum);
299 
300     /* The visible size of a image in Virtual PC depends on the geometry
301        rather than on the size stored in the footer (the size in the footer
302        is too large usually) */
303     bs->total_sectors = (int64_t)
304         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
305 
306     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
307      * VHD image sizes differently.  VPC will rely on CHS geometry,
308      * while Hyper-V and disk2vhd use the size specified in the footer.
309      *
310      * We use a couple of approaches to try and determine the correct method:
311      * look at the Creator App field, and look for images that have CHS
312      * geometry that is the maximum value.
313      *
314      * If the CHS geometry is the maximum CHS geometry, then we assume that
315      * the size is the footer->current_size to avoid truncation.  Otherwise,
316      * we follow the table based on footer->creator_app:
317      *
318      *  Known creator apps:
319      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
320      *      'qemu'  :  CHS              QEMU (uses disk geometry)
321      *      'qem2'  :  current_size     QEMU (uses current_size)
322      *      'win '  :  current_size     Hyper-V
323      *      'd2v '  :  current_size     Disk2vhd
324      *      'tap\0' :  current_size     XenServer
325      *      'CTXS'  :  current_size     XenConverter
326      *
327      *  The user can override the table values via drive options, however
328      *  even with an override we will still use current_size for images
329      *  that have CHS geometry of the maximum size.
330      */
331     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
332                !!strncmp(footer->creator_app, "qem2", 4) &&
333                !!strncmp(footer->creator_app, "d2v ", 4) &&
334                !!strncmp(footer->creator_app, "CTXS", 4) &&
335                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
336 
337     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
338         bs->total_sectors = be64_to_cpu(footer->current_size) /
339                                         BDRV_SECTOR_SIZE;
340     }
341 
342     /* Allow a maximum disk size of 2040 GiB */
343     if (bs->total_sectors > VHD_MAX_SECTORS) {
344         ret = -EFBIG;
345         goto fail;
346     }
347 
348     if (disk_type == VHD_DYNAMIC) {
349         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
350                          sizeof(dyndisk_header), &dyndisk_header, 0);
351         if (ret < 0) {
352             error_setg(errp, "Error reading dynamic VHD header");
353             goto fail;
354         }
355 
356         if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
357             error_setg(errp, "Invalid header magic");
358             ret = -EINVAL;
359             goto fail;
360         }
361 
362         s->block_size = be32_to_cpu(dyndisk_header.block_size);
363         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
364             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
365             ret = -EINVAL;
366             goto fail;
367         }
368         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
369 
370         s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
371 
372         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
373             error_setg(errp, "Too many blocks");
374             ret = -EINVAL;
375             goto fail;
376         }
377 
378         computed_size = (uint64_t) s->max_table_entries * s->block_size;
379         if (computed_size < bs->total_sectors * 512) {
380             error_setg(errp, "Page table too small");
381             ret = -EINVAL;
382             goto fail;
383         }
384 
385         if (s->max_table_entries > SIZE_MAX / 4 ||
386             s->max_table_entries > (int) INT_MAX / 4) {
387             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
388                         s->max_table_entries);
389             ret = -EINVAL;
390             goto fail;
391         }
392 
393         pagetable_size = (uint64_t) s->max_table_entries * 4;
394 
395         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
396         if (s->pagetable == NULL) {
397             error_setg(errp, "Unable to allocate memory for page table");
398             ret = -ENOMEM;
399             goto fail;
400         }
401 
402         s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
403 
404         ret = bdrv_pread(bs->file, s->bat_offset, pagetable_size,
405                          s->pagetable, 0);
406         if (ret < 0) {
407             error_setg(errp, "Error reading pagetable");
408             goto fail;
409         }
410 
411         s->free_data_block_offset =
412             ROUND_UP(s->bat_offset + pagetable_size, 512);
413 
414         for (i = 0; i < s->max_table_entries; i++) {
415             be32_to_cpus(&s->pagetable[i]);
416             if (s->pagetable[i] != 0xFFFFFFFF) {
417                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
418                     s->bitmap_size + s->block_size;
419 
420                 if (next > s->free_data_block_offset) {
421                     s->free_data_block_offset = next;
422                 }
423             }
424         }
425 
426         bs_size = bdrv_getlength(bs->file->bs);
427         if (bs_size < 0) {
428             error_setg_errno(errp, -bs_size, "Unable to learn image size");
429             ret = bs_size;
430             goto fail;
431         }
432         if (s->free_data_block_offset > bs_size) {
433             error_setg(errp, "block-vpc: free_data_block_offset points after "
434                              "the end of file. The image has been truncated.");
435             ret = -EINVAL;
436             goto fail;
437         }
438 
439         s->last_bitmap_offset = (int64_t) -1;
440 
441 #ifdef CACHE
442         s->pageentry_u8 = g_malloc(512);
443         s->pageentry_u32 = s->pageentry_u8;
444         s->pageentry_u16 = s->pageentry_u8;
445         s->last_pagetable = -1;
446 #endif
447     }
448 
449     /* Disable migration when VHD images are used */
450     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
451                "does not support live migration",
452                bdrv_get_device_or_node_name(bs));
453     ret = migrate_add_blocker(s->migration_blocker, errp);
454     if (ret < 0) {
455         error_free(s->migration_blocker);
456         goto fail;
457     }
458 
459     qemu_co_mutex_init(&s->lock);
460     qemu_opts_del(opts);
461 
462     return 0;
463 
464 fail:
465     qemu_opts_del(opts);
466     qemu_vfree(s->pagetable);
467 #ifdef CACHE
468     g_free(s->pageentry_u8);
469 #endif
470     return ret;
471 }
472 
473 static int vpc_reopen_prepare(BDRVReopenState *state,
474                               BlockReopenQueue *queue, Error **errp)
475 {
476     return 0;
477 }
478 
479 /*
480  * Returns the absolute byte offset of the given sector in the image file.
481  * If the sector is not allocated, -1 is returned instead.
482  * If an error occurred trying to write an updated block bitmap back to
483  * the file, -2 is returned, and the error value is written to *err.
484  * This can only happen for a write operation.
485  *
486  * The parameter write must be 1 if the offset will be used for a write
487  * operation (the block bitmaps is updated then), 0 otherwise.
488  * If write is true then err must not be NULL.
489  */
490 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
491                                        bool write, int *err)
492 {
493     BDRVVPCState *s = bs->opaque;
494     uint64_t bitmap_offset, block_offset;
495     uint32_t pagetable_index, offset_in_block;
496 
497     assert(!(write && err == NULL));
498 
499     pagetable_index = offset / s->block_size;
500     offset_in_block = offset % s->block_size;
501 
502     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
503         return -1; /* not allocated */
504 
505     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
506     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
507 
508     /* We must ensure that we don't write to any sectors which are marked as
509        unused in the bitmap. We get away with setting all bits in the block
510        bitmap each time we write to a new block. This might cause Virtual PC to
511        miss sparse read optimization, but it's not a problem in terms of
512        correctness. */
513     if (write && (s->last_bitmap_offset != bitmap_offset)) {
514         uint8_t bitmap[s->bitmap_size];
515         int r;
516 
517         s->last_bitmap_offset = bitmap_offset;
518         memset(bitmap, 0xff, s->bitmap_size);
519         r = bdrv_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap,
520                              0);
521         if (r < 0) {
522             *err = r;
523             return -2;
524         }
525     }
526 
527     return block_offset;
528 }
529 
530 /*
531  * Writes the footer to the end of the image file. This is needed when the
532  * file grows as it overwrites the old footer
533  *
534  * Returns 0 on success and < 0 on error
535  */
536 static int rewrite_footer(BlockDriverState *bs)
537 {
538     int ret;
539     BDRVVPCState *s = bs->opaque;
540     int64_t offset = s->free_data_block_offset;
541 
542     ret = bdrv_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
543     if (ret < 0)
544         return ret;
545 
546     return 0;
547 }
548 
549 /*
550  * Allocates a new block. This involves writing a new footer and updating
551  * the Block Allocation Table to use the space at the old end of the image
552  * file (overwriting the old footer)
553  *
554  * Returns the sectors' offset in the image file on success and < 0 on error
555  */
556 static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
557 {
558     BDRVVPCState *s = bs->opaque;
559     int64_t bat_offset;
560     uint32_t index, bat_value;
561     int ret;
562     uint8_t bitmap[s->bitmap_size];
563 
564     /* Check if sector_num is valid */
565     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
566         return -EINVAL;
567     }
568 
569     /* Write entry into in-memory BAT */
570     index = offset / s->block_size;
571     assert(s->pagetable[index] == 0xFFFFFFFF);
572     s->pagetable[index] = s->free_data_block_offset / 512;
573 
574     /* Initialize the block's bitmap */
575     memset(bitmap, 0xff, s->bitmap_size);
576     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset,
577                            s->bitmap_size, bitmap, 0);
578     if (ret < 0) {
579         return ret;
580     }
581 
582     /* Write new footer (the old one will be overwritten) */
583     s->free_data_block_offset += s->block_size + s->bitmap_size;
584     ret = rewrite_footer(bs);
585     if (ret < 0)
586         goto fail;
587 
588     /* Write BAT entry to disk */
589     bat_offset = s->bat_offset + (4 * index);
590     bat_value = cpu_to_be32(s->pagetable[index]);
591     ret = bdrv_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
592     if (ret < 0)
593         goto fail;
594 
595     return get_image_offset(bs, offset, false, NULL);
596 
597 fail:
598     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
599     return ret;
600 }
601 
602 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
603 {
604     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
605 
606     if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
607         bdi->cluster_size = s->block_size;
608     }
609 
610     return 0;
611 }
612 
613 static int coroutine_fn
614 vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
615               QEMUIOVector *qiov, BdrvRequestFlags flags)
616 {
617     BDRVVPCState *s = bs->opaque;
618     int ret;
619     int64_t image_offset;
620     int64_t n_bytes;
621     int64_t bytes_done = 0;
622     QEMUIOVector local_qiov;
623 
624     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
625         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
626     }
627 
628     qemu_co_mutex_lock(&s->lock);
629     qemu_iovec_init(&local_qiov, qiov->niov);
630 
631     while (bytes > 0) {
632         image_offset = get_image_offset(bs, offset, false, NULL);
633         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
634 
635         if (image_offset == -1) {
636             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
637         } else {
638             qemu_iovec_reset(&local_qiov);
639             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
640 
641             qemu_co_mutex_unlock(&s->lock);
642             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
643                                  &local_qiov, 0);
644             qemu_co_mutex_lock(&s->lock);
645             if (ret < 0) {
646                 goto fail;
647             }
648         }
649 
650         bytes -= n_bytes;
651         offset += n_bytes;
652         bytes_done += n_bytes;
653     }
654 
655     ret = 0;
656 fail:
657     qemu_iovec_destroy(&local_qiov);
658     qemu_co_mutex_unlock(&s->lock);
659 
660     return ret;
661 }
662 
663 static int coroutine_fn
664 vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
665                QEMUIOVector *qiov, BdrvRequestFlags flags)
666 {
667     BDRVVPCState *s = bs->opaque;
668     int64_t image_offset;
669     int64_t n_bytes;
670     int64_t bytes_done = 0;
671     int ret = 0;
672     QEMUIOVector local_qiov;
673 
674     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
675         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
676     }
677 
678     qemu_co_mutex_lock(&s->lock);
679     qemu_iovec_init(&local_qiov, qiov->niov);
680 
681     while (bytes > 0) {
682         image_offset = get_image_offset(bs, offset, true, &ret);
683         if (image_offset == -2) {
684             /* Failed to write block bitmap: can't proceed with write */
685             goto fail;
686         }
687         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
688 
689         if (image_offset == -1) {
690             image_offset = alloc_block(bs, offset);
691             if (image_offset < 0) {
692                 ret = image_offset;
693                 goto fail;
694             }
695         }
696 
697         qemu_iovec_reset(&local_qiov);
698         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
699 
700         qemu_co_mutex_unlock(&s->lock);
701         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
702                               &local_qiov, 0);
703         qemu_co_mutex_lock(&s->lock);
704         if (ret < 0) {
705             goto fail;
706         }
707 
708         bytes -= n_bytes;
709         offset += n_bytes;
710         bytes_done += n_bytes;
711     }
712 
713     ret = 0;
714 fail:
715     qemu_iovec_destroy(&local_qiov);
716     qemu_co_mutex_unlock(&s->lock);
717 
718     return ret;
719 }
720 
721 static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
722                                             bool want_zero,
723                                             int64_t offset, int64_t bytes,
724                                             int64_t *pnum, int64_t *map,
725                                             BlockDriverState **file)
726 {
727     BDRVVPCState *s = bs->opaque;
728     int64_t image_offset;
729     bool allocated;
730     int ret;
731     int64_t n;
732 
733     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
734         *pnum = bytes;
735         *map = offset;
736         *file = bs->file->bs;
737         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
738     }
739 
740     qemu_co_mutex_lock(&s->lock);
741 
742     image_offset = get_image_offset(bs, offset, false, NULL);
743     allocated = (image_offset != -1);
744     *pnum = 0;
745     ret = BDRV_BLOCK_ZERO;
746 
747     do {
748         /* All sectors in a block are contiguous (without using the bitmap) */
749         n = ROUND_UP(offset + 1, s->block_size) - offset;
750         n = MIN(n, bytes);
751 
752         *pnum += n;
753         offset += n;
754         bytes -= n;
755         /* *pnum can't be greater than one block for allocated
756          * sectors since there is always a bitmap in between. */
757         if (allocated) {
758             *file = bs->file->bs;
759             *map = image_offset;
760             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
761             break;
762         }
763         if (bytes == 0) {
764             break;
765         }
766         image_offset = get_image_offset(bs, offset, false, NULL);
767     } while (image_offset == -1);
768 
769     qemu_co_mutex_unlock(&s->lock);
770     return ret;
771 }
772 
773 /*
774  * Calculates the number of cylinders, heads and sectors per cylinder
775  * based on a given number of sectors. This is the algorithm described
776  * in the VHD specification.
777  *
778  * Note that the geometry doesn't always exactly match total_sectors but
779  * may round it down.
780  *
781  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
782  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
783  * and instead allow up to 255 heads.
784  */
785 static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
786     uint8_t *heads, uint8_t *secs_per_cyl)
787 {
788     uint32_t cyls_times_heads;
789 
790     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
791 
792     if (total_sectors >= 65535LL * 16 * 63) {
793         *secs_per_cyl = 255;
794         *heads = 16;
795         cyls_times_heads = total_sectors / *secs_per_cyl;
796     } else {
797         *secs_per_cyl = 17;
798         cyls_times_heads = total_sectors / *secs_per_cyl;
799         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
800 
801         if (*heads < 4) {
802             *heads = 4;
803         }
804 
805         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
806             *secs_per_cyl = 31;
807             *heads = 16;
808             cyls_times_heads = total_sectors / *secs_per_cyl;
809         }
810 
811         if (cyls_times_heads >= (*heads * 1024)) {
812             *secs_per_cyl = 63;
813             *heads = 16;
814             cyls_times_heads = total_sectors / *secs_per_cyl;
815         }
816     }
817 
818     *cyls = cyls_times_heads / *heads;
819 
820     return 0;
821 }
822 
823 static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
824                                int64_t total_sectors)
825 {
826     VHDDynDiskHeader dyndisk_header;
827     uint8_t bat_sector[512];
828     size_t block_size, num_bat_entries;
829     int i;
830     int ret;
831     int64_t offset = 0;
832 
833     /* Write the footer (twice: at the beginning and at the end) */
834     block_size = 0x200000;
835     num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
836 
837     ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
838     if (ret < 0) {
839         goto fail;
840     }
841 
842     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
843     ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
844     if (ret < 0) {
845         goto fail;
846     }
847 
848     /* Write the initial BAT */
849     offset = 3 * 512;
850 
851     memset(bat_sector, 0xFF, 512);
852     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
853         ret = blk_pwrite(blk, offset, 512, bat_sector, 0);
854         if (ret < 0) {
855             goto fail;
856         }
857         offset += 512;
858     }
859 
860     /* Prepare the Dynamic Disk Header */
861     memset(&dyndisk_header, 0, sizeof(dyndisk_header));
862 
863     memcpy(dyndisk_header.magic, "cxsparse", 8);
864 
865     /*
866      * Note: The spec is actually wrong here for data_offset, it says
867      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
868      */
869     dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
870     dyndisk_header.table_offset = cpu_to_be64(3 * 512);
871     dyndisk_header.version = cpu_to_be32(0x00010000);
872     dyndisk_header.block_size = cpu_to_be32(block_size);
873     dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
874 
875     dyndisk_header.checksum = cpu_to_be32(
876         vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
877 
878     /* Write the header */
879     offset = 512;
880 
881     ret = blk_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
882     if (ret < 0) {
883         goto fail;
884     }
885 
886     ret = 0;
887  fail:
888     return ret;
889 }
890 
891 static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
892                              int64_t total_size, Error **errp)
893 {
894     int ret;
895 
896     /* Add footer to total size */
897     total_size += sizeof(*footer);
898 
899     ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
900     if (ret < 0) {
901         return ret;
902     }
903 
904     ret = blk_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
905                      footer, 0);
906     if (ret < 0) {
907         error_setg_errno(errp, -ret, "Unable to write VHD header");
908         return ret;
909     }
910 
911     return 0;
912 }
913 
914 static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
915                                         uint16_t *out_cyls,
916                                         uint8_t *out_heads,
917                                         uint8_t *out_secs_per_cyl,
918                                         int64_t *out_total_sectors,
919                                         Error **errp)
920 {
921     int64_t total_size = vpc_opts->size;
922     uint16_t cyls = 0;
923     uint8_t heads = 0;
924     uint8_t secs_per_cyl = 0;
925     int64_t total_sectors;
926     int i;
927 
928     /*
929      * Calculate matching total_size and geometry. Increase the number of
930      * sectors requested until we get enough (or fail). This ensures that
931      * qemu-img convert doesn't truncate images, but rather rounds up.
932      *
933      * If the image size can't be represented by a spec conformant CHS geometry,
934      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
935      * the image size from the VHD footer to calculate total_sectors.
936      */
937     if (vpc_opts->force_size) {
938         /* This will force the use of total_size for sector count, below */
939         cyls         = VHD_CHS_MAX_C;
940         heads        = VHD_CHS_MAX_H;
941         secs_per_cyl = VHD_CHS_MAX_S;
942     } else {
943         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
944         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
945             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
946         }
947     }
948 
949     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
950         total_sectors = total_size / BDRV_SECTOR_SIZE;
951         /* Allow a maximum disk size of 2040 GiB */
952         if (total_sectors > VHD_MAX_SECTORS) {
953             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
954             return -EFBIG;
955         }
956     } else {
957         total_sectors = (int64_t) cyls * heads * secs_per_cyl;
958     }
959 
960     *out_total_sectors = total_sectors;
961     if (out_cyls) {
962         *out_cyls = cyls;
963         *out_heads = heads;
964         *out_secs_per_cyl = secs_per_cyl;
965     }
966 
967     return 0;
968 }
969 
970 static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
971                                       Error **errp)
972 {
973     BlockdevCreateOptionsVpc *vpc_opts;
974     BlockBackend *blk = NULL;
975     BlockDriverState *bs = NULL;
976 
977     VHDFooter footer;
978     uint16_t cyls = 0;
979     uint8_t heads = 0;
980     uint8_t secs_per_cyl = 0;
981     int64_t total_sectors;
982     int64_t total_size;
983     int disk_type;
984     int ret = -EIO;
985     QemuUUID uuid;
986 
987     assert(opts->driver == BLOCKDEV_DRIVER_VPC);
988     vpc_opts = &opts->u.vpc;
989 
990     /* Validate options and set default values */
991     total_size = vpc_opts->size;
992 
993     if (!vpc_opts->has_subformat) {
994         vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
995     }
996     switch (vpc_opts->subformat) {
997     case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
998         disk_type = VHD_DYNAMIC;
999         break;
1000     case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1001         disk_type = VHD_FIXED;
1002         break;
1003     default:
1004         g_assert_not_reached();
1005     }
1006 
1007     /* Create BlockBackend to write to the image */
1008     bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1009     if (bs == NULL) {
1010         return -EIO;
1011     }
1012 
1013     blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1014                           errp);
1015     if (!blk) {
1016         ret = -EPERM;
1017         goto out;
1018     }
1019     blk_set_allow_write_beyond_eof(blk, true);
1020 
1021     /* Get geometry and check that it matches the image size*/
1022     ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1023                                        &total_sectors, errp);
1024     if (ret < 0) {
1025         goto out;
1026     }
1027 
1028     if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1029         error_setg(errp, "The requested image size cannot be represented in "
1030                          "CHS geometry");
1031         error_append_hint(errp, "Try size=%llu or force-size=on (the "
1032                                 "latter makes the image incompatible with "
1033                                 "Virtual PC)",
1034                           total_sectors * BDRV_SECTOR_SIZE);
1035         ret = -EINVAL;
1036         goto out;
1037     }
1038 
1039     /* Prepare the Hard Disk Footer */
1040     memset(&footer, 0, sizeof(footer));
1041 
1042     memcpy(footer.creator, "conectix", 8);
1043     if (vpc_opts->force_size) {
1044         memcpy(footer.creator_app, "qem2", 4);
1045     } else {
1046         memcpy(footer.creator_app, "qemu", 4);
1047     }
1048     memcpy(footer.creator_os, "Wi2k", 4);
1049 
1050     footer.features = cpu_to_be32(0x02);
1051     footer.version = cpu_to_be32(0x00010000);
1052     if (disk_type == VHD_DYNAMIC) {
1053         footer.data_offset = cpu_to_be64(sizeof(footer));
1054     } else {
1055         footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1056     }
1057     footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1058 
1059     /* Version of Virtual PC 2007 */
1060     footer.major = cpu_to_be16(0x0005);
1061     footer.minor = cpu_to_be16(0x0003);
1062     footer.orig_size = cpu_to_be64(total_size);
1063     footer.current_size = cpu_to_be64(total_size);
1064     footer.cyls = cpu_to_be16(cyls);
1065     footer.heads = heads;
1066     footer.secs_per_cyl = secs_per_cyl;
1067 
1068     footer.type = cpu_to_be32(disk_type);
1069 
1070     qemu_uuid_generate(&uuid);
1071     footer.uuid = uuid;
1072 
1073     footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1074 
1075     if (disk_type == VHD_DYNAMIC) {
1076         ret = create_dynamic_disk(blk, &footer, total_sectors);
1077         if (ret < 0) {
1078             error_setg(errp, "Unable to create or write VHD header");
1079         }
1080     } else {
1081         ret = create_fixed_disk(blk, &footer, total_size, errp);
1082     }
1083 
1084 out:
1085     blk_unref(blk);
1086     bdrv_unref(bs);
1087     return ret;
1088 }
1089 
1090 static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
1091                                            const char *filename,
1092                                            QemuOpts *opts,
1093                                            Error **errp)
1094 {
1095     BlockdevCreateOptions *create_options = NULL;
1096     QDict *qdict;
1097     Visitor *v;
1098     BlockDriverState *bs = NULL;
1099     int ret;
1100 
1101     static const QDictRenames opt_renames[] = {
1102         { VPC_OPT_FORCE_SIZE,           "force-size" },
1103         { NULL, NULL },
1104     };
1105 
1106     /* Parse options and convert legacy syntax */
1107     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1108 
1109     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1110         ret = -EINVAL;
1111         goto fail;
1112     }
1113 
1114     /* Create and open the file (protocol layer) */
1115     ret = bdrv_create_file(filename, opts, errp);
1116     if (ret < 0) {
1117         goto fail;
1118     }
1119 
1120     bs = bdrv_open(filename, NULL, NULL,
1121                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1122     if (bs == NULL) {
1123         ret = -EIO;
1124         goto fail;
1125     }
1126 
1127     /* Now get the QAPI type BlockdevCreateOptions */
1128     qdict_put_str(qdict, "driver", "vpc");
1129     qdict_put_str(qdict, "file", bs->node_name);
1130 
1131     v = qobject_input_visitor_new_flat_confused(qdict, errp);
1132     if (!v) {
1133         ret = -EINVAL;
1134         goto fail;
1135     }
1136 
1137     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1138     visit_free(v);
1139     if (!create_options) {
1140         ret = -EINVAL;
1141         goto fail;
1142     }
1143 
1144     /* Silently round up size */
1145     assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1146     create_options->u.vpc.size =
1147         ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1148 
1149     if (!create_options->u.vpc.force_size) {
1150         int64_t total_sectors;
1151         ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1152                                            NULL, &total_sectors, errp);
1153         if (ret < 0) {
1154             goto fail;
1155         }
1156 
1157         create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1158     }
1159 
1160 
1161     /* Create the vpc image (format layer) */
1162     ret = vpc_co_create(create_options, errp);
1163 
1164 fail:
1165     qobject_unref(qdict);
1166     bdrv_unref(bs);
1167     qapi_free_BlockdevCreateOptions(create_options);
1168     return ret;
1169 }
1170 
1171 
1172 static int vpc_has_zero_init(BlockDriverState *bs)
1173 {
1174     BDRVVPCState *s = bs->opaque;
1175 
1176     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1177         return bdrv_has_zero_init(bs->file->bs);
1178     } else {
1179         return 1;
1180     }
1181 }
1182 
1183 static void vpc_close(BlockDriverState *bs)
1184 {
1185     BDRVVPCState *s = bs->opaque;
1186     qemu_vfree(s->pagetable);
1187 #ifdef CACHE
1188     g_free(s->pageentry_u8);
1189 #endif
1190 
1191     migrate_del_blocker(s->migration_blocker);
1192     error_free(s->migration_blocker);
1193 }
1194 
1195 static QemuOptsList vpc_create_opts = {
1196     .name = "vpc-create-opts",
1197     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1198     .desc = {
1199         {
1200             .name = BLOCK_OPT_SIZE,
1201             .type = QEMU_OPT_SIZE,
1202             .help = "Virtual disk size"
1203         },
1204         {
1205             .name = BLOCK_OPT_SUBFMT,
1206             .type = QEMU_OPT_STRING,
1207             .help =
1208                 "Type of virtual hard disk format. Supported formats are "
1209                 "{dynamic (default) | fixed} "
1210         },
1211         {
1212             .name = VPC_OPT_FORCE_SIZE,
1213             .type = QEMU_OPT_BOOL,
1214             .help = "Force disk size calculation to use the actual size "
1215                     "specified, rather than using the nearest CHS-based "
1216                     "calculation"
1217         },
1218         { /* end of list */ }
1219     }
1220 };
1221 
1222 static const char *const vpc_strong_runtime_opts[] = {
1223     VPC_OPT_SIZE_CALC,
1224 
1225     NULL
1226 };
1227 
1228 static BlockDriver bdrv_vpc = {
1229     .format_name    = "vpc",
1230     .instance_size  = sizeof(BDRVVPCState),
1231 
1232     .bdrv_probe             = vpc_probe,
1233     .bdrv_open              = vpc_open,
1234     .bdrv_close             = vpc_close,
1235     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1236     .bdrv_child_perm        = bdrv_default_perms,
1237     .bdrv_co_create         = vpc_co_create,
1238     .bdrv_co_create_opts    = vpc_co_create_opts,
1239 
1240     .bdrv_co_preadv             = vpc_co_preadv,
1241     .bdrv_co_pwritev            = vpc_co_pwritev,
1242     .bdrv_co_block_status       = vpc_co_block_status,
1243 
1244     .bdrv_get_info          = vpc_get_info,
1245 
1246     .is_format              = true,
1247     .create_opts            = &vpc_create_opts,
1248     .bdrv_has_zero_init     = vpc_has_zero_init,
1249     .strong_runtime_opts    = vpc_strong_runtime_opts,
1250 };
1251 
1252 static void bdrv_vpc_init(void)
1253 {
1254     bdrv_register(&bdrv_vpc);
1255 }
1256 
1257 block_init(bdrv_vpc_init);
1258