xref: /openbmc/qemu/block/vpc.c (revision 0bdb12c7)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/migration.h"
32 #include "qemu/bswap.h"
33 #include "qemu/uuid.h"
34 
35 /**************************************************************/
36 
37 #define HEADER_SIZE 512
38 
39 //#define CACHE
40 
/*
 * Disk type values kept in the 'type' field of the footer. The field is
 * stored big-endian; this file converts it with be32_to_cpu()/cpu_to_be32()
 * before comparing against or storing these values.
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};
46 
/*
 * Offset subtracted from time(NULL) when filling in the footer timestamp,
 * which counts seconds since Jan 1, 2000 0:00:00 (UTC).
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder, head and sector values used for the footer geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count of the largest geometry; such images use current_size instead */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the boolean creation option read by vpc_create() */
#define VPC_OPT_FORCE_SIZE "force_size"
58 
/*
 * On-disk layout of the VHD footer. All multi-byte fields are always
 * big-endian and must be converted before use.
 *
 * vpc_open() looks for this structure in the first HEADER_SIZE bytes of the
 * image and, if the "conectix" signature is not found there, in the last
 * HEADER_SIZE bytes of the file (fixed disks).
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /* Creator application; selects CHS vs. current_size in vpc_open() */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    /* Image sizes in bytes */
    uint64_t    orig_size;
    uint64_t    current_size;

    /* Disk geometry; cyls * heads * secs_per_cyl gives the sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
94 
/*
 * On-disk layout of the dynamic disk header. vpc_open() reads it from the
 * file offset given by the footer's data_offset and identifies it by the
 * "cxsparse" magic. Multi-byte fields are converted from big-endian on use.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Computed with vpc_checksum() over the 1024-byte header on creation */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
126 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Held by vpc_co_preadv()/vpc_co_pwritev() while handling dynamic images */
    CoMutex lock;
    /* Raw (big-endian) copy of the footer as read from the image */
    uint8_t footer_buf[HEADER_SIZE];
    /* File offset at which the next data block, and the footer, is written */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /*
     * In-memory Block Allocation Table in host byte order. An entry is
     * 0xFFFFFFFF for an unallocated block, otherwise the block's file
     * offset in units of 512 bytes.
     */
    uint32_t *pagetable;
    /* File offset of the on-disk Block Allocation Table */
    uint64_t bat_offset;
    /* File offset of the block bitmap most recently filled by a write */
    uint64_t last_bitmap_offset;

    /* Size in bytes of the data part of one block */
    uint32_t block_size;
    /* Size in bytes of the bitmap that precedes each block's data */
    uint32_t bitmap_size;
    /* Set from the VPC_OPT_SIZE_CALC runtime option ("chs") */
    bool force_use_chs;
    /* Set from the VPC_OPT_SIZE_CALC runtime option ("current_size") */
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Blocker registered with migrate_add_blocker() in vpc_open() */
    Error *migration_blocker;
} BDRVVPCState;
151 
/* Name of the runtime option that overrides the disk size calculation */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime (open-time) options understood by this driver. vpc_open() absorbs
 * them from the options dictionary and vpc_parse_options() evaluates them.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
167 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the sum of
 * all of its bytes.
 *
 * buf:  bytes to sum; the buffer is only read
 * size: number of bytes in buf
 *
 * When checksumming a footer or header, the embedded checksum field must be
 * zero while the sum is taken.
 *
 * Returns the checksum in host byte order.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t res = 0;
    size_t i; /* same type as size, so the comparison never mixes signedness */

    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
178 
179 
/*
 * Format probe: returns a score of 100 when the buffer holds at least eight
 * bytes and starts with the "conectix" signature, and 0 otherwise.
 * filename is not examined.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char signature[] = "conectix";

    if (buf_size < 8) {
        return 0;
    }

    return strncmp((const char *)buf, signature, 8) == 0 ? 100 : 0;
}
186 
187 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
188                               Error **errp)
189 {
190     BDRVVPCState *s = bs->opaque;
191     const char *size_calc;
192 
193     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
194 
195     if (!size_calc) {
196        /* no override, use autodetect only */
197     } else if (!strcmp(size_calc, "current_size")) {
198         s->force_use_sz = true;
199     } else if (!strcmp(size_calc, "chs")) {
200         s->force_use_chs = true;
201     } else {
202         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
203     }
204 }
205 
206 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
207                     Error **errp)
208 {
209     BDRVVPCState *s = bs->opaque;
210     int i;
211     VHDFooter *footer;
212     VHDDynDiskHeader *dyndisk_header;
213     QemuOpts *opts = NULL;
214     Error *local_err = NULL;
215     bool use_chs;
216     uint8_t buf[HEADER_SIZE];
217     uint32_t checksum;
218     uint64_t computed_size;
219     uint64_t pagetable_size;
220     int disk_type = VHD_DYNAMIC;
221     int ret;
222 
223     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
224     qemu_opts_absorb_qdict(opts, options, &local_err);
225     if (local_err) {
226         error_propagate(errp, local_err);
227         ret = -EINVAL;
228         goto fail;
229     }
230 
231     vpc_parse_options(bs, opts, &local_err);
232     if (local_err) {
233         error_propagate(errp, local_err);
234         ret = -EINVAL;
235         goto fail;
236     }
237 
238     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
239     if (ret < 0) {
240         error_setg(errp, "Unable to read VHD header");
241         goto fail;
242     }
243 
244     footer = (VHDFooter *) s->footer_buf;
245     if (strncmp(footer->creator, "conectix", 8)) {
246         int64_t offset = bdrv_getlength(bs->file->bs);
247         if (offset < 0) {
248             ret = offset;
249             error_setg(errp, "Invalid file size");
250             goto fail;
251         } else if (offset < HEADER_SIZE) {
252             ret = -EINVAL;
253             error_setg(errp, "File too small for a VHD header");
254             goto fail;
255         }
256 
257         /* If a fixed disk, the footer is found only at the end of the file */
258         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
259                          HEADER_SIZE);
260         if (ret < 0) {
261             goto fail;
262         }
263         if (strncmp(footer->creator, "conectix", 8)) {
264             error_setg(errp, "invalid VPC image");
265             ret = -EINVAL;
266             goto fail;
267         }
268         disk_type = VHD_FIXED;
269     }
270 
271     checksum = be32_to_cpu(footer->checksum);
272     footer->checksum = 0;
273     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
274         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
275             "incorrect.\n", bs->filename);
276 
277     /* Write 'checksum' back to footer, or else will leave it with zero. */
278     footer->checksum = cpu_to_be32(checksum);
279 
280     /* The visible size of a image in Virtual PC depends on the geometry
281        rather than on the size stored in the footer (the size in the footer
282        is too large usually) */
283     bs->total_sectors = (int64_t)
284         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
285 
286     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
287      * VHD image sizes differently.  VPC will rely on CHS geometry,
288      * while Hyper-V and disk2vhd use the size specified in the footer.
289      *
290      * We use a couple of approaches to try and determine the correct method:
291      * look at the Creator App field, and look for images that have CHS
292      * geometry that is the maximum value.
293      *
294      * If the CHS geometry is the maximum CHS geometry, then we assume that
295      * the size is the footer->current_size to avoid truncation.  Otherwise,
296      * we follow the table based on footer->creator_app:
297      *
298      *  Known creator apps:
299      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
300      *      'qemu'  :  CHS              QEMU (uses disk geometry)
301      *      'qem2'  :  current_size     QEMU (uses current_size)
302      *      'win '  :  current_size     Hyper-V
303      *      'd2v '  :  current_size     Disk2vhd
304      *      'tap\0' :  current_size     XenServer
305      *      'CTXS'  :  current_size     XenConverter
306      *
307      *  The user can override the table values via drive options, however
308      *  even with an override we will still use current_size for images
309      *  that have CHS geometry of the maximum size.
310      */
311     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
312                !!strncmp(footer->creator_app, "qem2", 4) &&
313                !!strncmp(footer->creator_app, "d2v ", 4) &&
314                !!strncmp(footer->creator_app, "CTXS", 4) &&
315                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
316 
317     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
318         bs->total_sectors = be64_to_cpu(footer->current_size) /
319                                         BDRV_SECTOR_SIZE;
320     }
321 
322     /* Allow a maximum disk size of 2040 GiB */
323     if (bs->total_sectors > VHD_MAX_SECTORS) {
324         ret = -EFBIG;
325         goto fail;
326     }
327 
328     if (disk_type == VHD_DYNAMIC) {
329         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
330                          HEADER_SIZE);
331         if (ret < 0) {
332             error_setg(errp, "Error reading dynamic VHD header");
333             goto fail;
334         }
335 
336         dyndisk_header = (VHDDynDiskHeader *) buf;
337 
338         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
339             error_setg(errp, "Invalid header magic");
340             ret = -EINVAL;
341             goto fail;
342         }
343 
344         s->block_size = be32_to_cpu(dyndisk_header->block_size);
345         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
346             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
347             ret = -EINVAL;
348             goto fail;
349         }
350         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
351 
352         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
353 
354         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
355             error_setg(errp, "Too many blocks");
356             ret = -EINVAL;
357             goto fail;
358         }
359 
360         computed_size = (uint64_t) s->max_table_entries * s->block_size;
361         if (computed_size < bs->total_sectors * 512) {
362             error_setg(errp, "Page table too small");
363             ret = -EINVAL;
364             goto fail;
365         }
366 
367         if (s->max_table_entries > SIZE_MAX / 4 ||
368             s->max_table_entries > (int) INT_MAX / 4) {
369             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
370                         s->max_table_entries);
371             ret = -EINVAL;
372             goto fail;
373         }
374 
375         pagetable_size = (uint64_t) s->max_table_entries * 4;
376 
377         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
378         if (s->pagetable == NULL) {
379             error_setg(errp, "Unable to allocate memory for page table");
380             ret = -ENOMEM;
381             goto fail;
382         }
383 
384         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
385 
386         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
387                          pagetable_size);
388         if (ret < 0) {
389             error_setg(errp, "Error reading pagetable");
390             goto fail;
391         }
392 
393         s->free_data_block_offset =
394             ROUND_UP(s->bat_offset + pagetable_size, 512);
395 
396         for (i = 0; i < s->max_table_entries; i++) {
397             be32_to_cpus(&s->pagetable[i]);
398             if (s->pagetable[i] != 0xFFFFFFFF) {
399                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
400                     s->bitmap_size + s->block_size;
401 
402                 if (next > s->free_data_block_offset) {
403                     s->free_data_block_offset = next;
404                 }
405             }
406         }
407 
408         if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) {
409             error_setg(errp, "block-vpc: free_data_block_offset points after "
410                              "the end of file. The image has been truncated.");
411             ret = -EINVAL;
412             goto fail;
413         }
414 
415         s->last_bitmap_offset = (int64_t) -1;
416 
417 #ifdef CACHE
418         s->pageentry_u8 = g_malloc(512);
419         s->pageentry_u32 = s->pageentry_u8;
420         s->pageentry_u16 = s->pageentry_u8;
421         s->last_pagetable = -1;
422 #endif
423     }
424 
425     qemu_co_mutex_init(&s->lock);
426 
427     /* Disable migration when VHD images are used */
428     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
429                "does not support live migration",
430                bdrv_get_device_or_node_name(bs));
431     migrate_add_blocker(s->migration_blocker);
432 
433     return 0;
434 
435 fail:
436     qemu_vfree(s->pagetable);
437 #ifdef CACHE
438     g_free(s->pageentry_u8);
439 #endif
440     return ret;
441 }
442 
/*
 * Reopen-prepare callback. It performs no checks and makes no changes:
 * none of the arguments are used and the function always returns 0.
 */
static int vpc_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
448 
449 /*
450  * Returns the absolute byte offset of the given sector in the image file.
451  * If the sector is not allocated, -1 is returned instead.
452  *
453  * The parameter write must be 1 if the offset will be used for a write
454  * operation (the block bitmaps is updated then), 0 otherwise.
455  */
456 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
457                                        bool write)
458 {
459     BDRVVPCState *s = bs->opaque;
460     uint64_t bitmap_offset, block_offset;
461     uint32_t pagetable_index, offset_in_block;
462 
463     pagetable_index = offset / s->block_size;
464     offset_in_block = offset % s->block_size;
465 
466     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
467         return -1; /* not allocated */
468 
469     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
470     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
471 
472     /* We must ensure that we don't write to any sectors which are marked as
473        unused in the bitmap. We get away with setting all bits in the block
474        bitmap each time we write to a new block. This might cause Virtual PC to
475        miss sparse read optimization, but it's not a problem in terms of
476        correctness. */
477     if (write && (s->last_bitmap_offset != bitmap_offset)) {
478         uint8_t bitmap[s->bitmap_size];
479 
480         s->last_bitmap_offset = bitmap_offset;
481         memset(bitmap, 0xff, s->bitmap_size);
482         bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
483     }
484 
485     return block_offset;
486 }
487 
488 static inline int64_t get_sector_offset(BlockDriverState *bs,
489                                         int64_t sector_num, bool write)
490 {
491     return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
492 }
493 
494 /*
495  * Writes the footer to the end of the image file. This is needed when the
496  * file grows as it overwrites the old footer
497  *
498  * Returns 0 on success and < 0 on error
499  */
500 static int rewrite_footer(BlockDriverState* bs)
501 {
502     int ret;
503     BDRVVPCState *s = bs->opaque;
504     int64_t offset = s->free_data_block_offset;
505 
506     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
507     if (ret < 0)
508         return ret;
509 
510     return 0;
511 }
512 
513 /*
514  * Allocates a new block. This involves writing a new footer and updating
515  * the Block Allocation Table to use the space at the old end of the image
516  * file (overwriting the old footer)
517  *
518  * Returns the sectors' offset in the image file on success and < 0 on error
519  */
520 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
521 {
522     BDRVVPCState *s = bs->opaque;
523     int64_t bat_offset;
524     uint32_t index, bat_value;
525     int ret;
526     uint8_t bitmap[s->bitmap_size];
527 
528     /* Check if sector_num is valid */
529     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
530         return -EINVAL;
531     }
532 
533     /* Write entry into in-memory BAT */
534     index = offset / s->block_size;
535     assert(s->pagetable[index] == 0xFFFFFFFF);
536     s->pagetable[index] = s->free_data_block_offset / 512;
537 
538     /* Initialize the block's bitmap */
539     memset(bitmap, 0xff, s->bitmap_size);
540     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
541         s->bitmap_size);
542     if (ret < 0) {
543         return ret;
544     }
545 
546     /* Write new footer (the old one will be overwritten) */
547     s->free_data_block_offset += s->block_size + s->bitmap_size;
548     ret = rewrite_footer(bs);
549     if (ret < 0)
550         goto fail;
551 
552     /* Write BAT entry to disk */
553     bat_offset = s->bat_offset + (4 * index);
554     bat_value = cpu_to_be32(s->pagetable[index]);
555     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
556     if (ret < 0)
557         goto fail;
558 
559     return get_image_offset(bs, offset, false);
560 
561 fail:
562     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
563     return ret;
564 }
565 
566 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
567 {
568     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
569     VHDFooter *footer = (VHDFooter *) s->footer_buf;
570 
571     if (be32_to_cpu(footer->type) != VHD_FIXED) {
572         bdi->cluster_size = s->block_size;
573     }
574 
575     bdi->unallocated_blocks_are_zero = true;
576     return 0;
577 }
578 
579 static int coroutine_fn
580 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
581               QEMUIOVector *qiov, int flags)
582 {
583     BDRVVPCState *s = bs->opaque;
584     int ret;
585     int64_t image_offset;
586     int64_t n_bytes;
587     int64_t bytes_done = 0;
588     VHDFooter *footer = (VHDFooter *) s->footer_buf;
589     QEMUIOVector local_qiov;
590 
591     if (be32_to_cpu(footer->type) == VHD_FIXED) {
592         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
593     }
594 
595     qemu_co_mutex_lock(&s->lock);
596     qemu_iovec_init(&local_qiov, qiov->niov);
597 
598     while (bytes > 0) {
599         image_offset = get_image_offset(bs, offset, false);
600         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
601 
602         if (image_offset == -1) {
603             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
604         } else {
605             qemu_iovec_reset(&local_qiov);
606             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
607 
608             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
609                                  &local_qiov, 0);
610             if (ret < 0) {
611                 goto fail;
612             }
613         }
614 
615         bytes -= n_bytes;
616         offset += n_bytes;
617         bytes_done += n_bytes;
618     }
619 
620     ret = 0;
621 fail:
622     qemu_iovec_destroy(&local_qiov);
623     qemu_co_mutex_unlock(&s->lock);
624 
625     return ret;
626 }
627 
628 static int coroutine_fn
629 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
630                QEMUIOVector *qiov, int flags)
631 {
632     BDRVVPCState *s = bs->opaque;
633     int64_t image_offset;
634     int64_t n_bytes;
635     int64_t bytes_done = 0;
636     int ret;
637     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
638     QEMUIOVector local_qiov;
639 
640     if (be32_to_cpu(footer->type) == VHD_FIXED) {
641         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
642     }
643 
644     qemu_co_mutex_lock(&s->lock);
645     qemu_iovec_init(&local_qiov, qiov->niov);
646 
647     while (bytes > 0) {
648         image_offset = get_image_offset(bs, offset, true);
649         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
650 
651         if (image_offset == -1) {
652             image_offset = alloc_block(bs, offset);
653             if (image_offset < 0) {
654                 ret = image_offset;
655                 goto fail;
656             }
657         }
658 
659         qemu_iovec_reset(&local_qiov);
660         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
661 
662         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
663                               &local_qiov, 0);
664         if (ret < 0) {
665             goto fail;
666         }
667 
668         bytes -= n_bytes;
669         offset += n_bytes;
670         bytes_done += n_bytes;
671     }
672 
673     ret = 0;
674 fail:
675     qemu_iovec_destroy(&local_qiov);
676     qemu_co_mutex_unlock(&s->lock);
677 
678     return ret;
679 }
680 
681 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
682         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
683 {
684     BDRVVPCState *s = bs->opaque;
685     VHDFooter *footer = (VHDFooter*) s->footer_buf;
686     int64_t start, offset;
687     bool allocated;
688     int n;
689 
690     if (be32_to_cpu(footer->type) == VHD_FIXED) {
691         *pnum = nb_sectors;
692         *file = bs->file->bs;
693         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
694                (sector_num << BDRV_SECTOR_BITS);
695     }
696 
697     offset = get_sector_offset(bs, sector_num, 0);
698     start = offset;
699     allocated = (offset != -1);
700     *pnum = 0;
701 
702     do {
703         /* All sectors in a block are contiguous (without using the bitmap) */
704         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
705           - sector_num;
706         n = MIN(n, nb_sectors);
707 
708         *pnum += n;
709         sector_num += n;
710         nb_sectors -= n;
711         /* *pnum can't be greater than one block for allocated
712          * sectors since there is always a bitmap in between. */
713         if (allocated) {
714             *file = bs->file->bs;
715             return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
716         }
717         if (nb_sectors == 0) {
718             break;
719         }
720         offset = get_sector_offset(bs, sector_num, 0);
721     } while (offset == -1);
722 
723     return 0;
724 }
725 
726 /*
727  * Calculates the number of cylinders, heads and sectors per cylinder
728  * based on a given number of sectors. This is the algorithm described
729  * in the VHD specification.
730  *
731  * Note that the geometry doesn't always exactly match total_sectors but
732  * may round it down.
733  *
734  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
735  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
736  * and instead allow up to 255 heads.
737  */
738 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
739     uint8_t* heads, uint8_t* secs_per_cyl)
740 {
741     uint32_t cyls_times_heads;
742 
743     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
744 
745     if (total_sectors >= 65535LL * 16 * 63) {
746         *secs_per_cyl = 255;
747         *heads = 16;
748         cyls_times_heads = total_sectors / *secs_per_cyl;
749     } else {
750         *secs_per_cyl = 17;
751         cyls_times_heads = total_sectors / *secs_per_cyl;
752         *heads = (cyls_times_heads + 1023) / 1024;
753 
754         if (*heads < 4) {
755             *heads = 4;
756         }
757 
758         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
759             *secs_per_cyl = 31;
760             *heads = 16;
761             cyls_times_heads = total_sectors / *secs_per_cyl;
762         }
763 
764         if (cyls_times_heads >= (*heads * 1024)) {
765             *secs_per_cyl = 63;
766             *heads = 16;
767             cyls_times_heads = total_sectors / *secs_per_cyl;
768         }
769     }
770 
771     *cyls = cyls_times_heads / *heads;
772 
773     return 0;
774 }
775 
776 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
777                                int64_t total_sectors)
778 {
779     VHDDynDiskHeader *dyndisk_header =
780         (VHDDynDiskHeader *) buf;
781     size_t block_size, num_bat_entries;
782     int i;
783     int ret;
784     int64_t offset = 0;
785 
786     /* Write the footer (twice: at the beginning and at the end) */
787     block_size = 0x200000;
788     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
789 
790     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
791     if (ret < 0) {
792         goto fail;
793     }
794 
795     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
796     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
797     if (ret < 0) {
798         goto fail;
799     }
800 
801     /* Write the initial BAT */
802     offset = 3 * 512;
803 
804     memset(buf, 0xFF, 512);
805     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
806         ret = blk_pwrite(blk, offset, buf, 512, 0);
807         if (ret < 0) {
808             goto fail;
809         }
810         offset += 512;
811     }
812 
813     /* Prepare the Dynamic Disk Header */
814     memset(buf, 0, 1024);
815 
816     memcpy(dyndisk_header->magic, "cxsparse", 8);
817 
818     /*
819      * Note: The spec is actually wrong here for data_offset, it says
820      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
821      */
822     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
823     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
824     dyndisk_header->version = cpu_to_be32(0x00010000);
825     dyndisk_header->block_size = cpu_to_be32(block_size);
826     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
827 
828     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
829 
830     /* Write the header */
831     offset = 512;
832 
833     ret = blk_pwrite(blk, offset, buf, 1024, 0);
834     if (ret < 0) {
835         goto fail;
836     }
837 
838  fail:
839     return ret;
840 }
841 
842 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
843                              int64_t total_size)
844 {
845     int ret;
846 
847     /* Add footer to total size */
848     total_size += HEADER_SIZE;
849 
850     ret = blk_truncate(blk, total_size);
851     if (ret < 0) {
852         return ret;
853     }
854 
855     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
856     if (ret < 0) {
857         return ret;
858     }
859 
860     return ret;
861 }
862 
863 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
864 {
865     uint8_t buf[1024];
866     VHDFooter *footer = (VHDFooter *) buf;
867     char *disk_type_param;
868     int i;
869     uint16_t cyls = 0;
870     uint8_t heads = 0;
871     uint8_t secs_per_cyl = 0;
872     int64_t total_sectors;
873     int64_t total_size;
874     int disk_type;
875     int ret = -EIO;
876     bool force_size;
877     Error *local_err = NULL;
878     BlockBackend *blk = NULL;
879 
880     /* Read out options */
881     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
882                           BDRV_SECTOR_SIZE);
883     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
884     if (disk_type_param) {
885         if (!strcmp(disk_type_param, "dynamic")) {
886             disk_type = VHD_DYNAMIC;
887         } else if (!strcmp(disk_type_param, "fixed")) {
888             disk_type = VHD_FIXED;
889         } else {
890             error_setg(errp, "Invalid disk type, %s", disk_type_param);
891             ret = -EINVAL;
892             goto out;
893         }
894     } else {
895         disk_type = VHD_DYNAMIC;
896     }
897 
898     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
899 
900     ret = bdrv_create_file(filename, opts, &local_err);
901     if (ret < 0) {
902         error_propagate(errp, local_err);
903         goto out;
904     }
905 
906     blk = blk_new_open(filename, NULL, NULL,
907                        BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
908     if (blk == NULL) {
909         error_propagate(errp, local_err);
910         ret = -EIO;
911         goto out;
912     }
913 
914     blk_set_allow_write_beyond_eof(blk, true);
915 
916     /*
917      * Calculate matching total_size and geometry. Increase the number of
918      * sectors requested until we get enough (or fail). This ensures that
919      * qemu-img convert doesn't truncate images, but rather rounds up.
920      *
921      * If the image size can't be represented by a spec conformant CHS geometry,
922      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
923      * the image size from the VHD footer to calculate total_sectors.
924      */
925     if (force_size) {
926         /* This will force the use of total_size for sector count, below */
927         cyls         = VHD_CHS_MAX_C;
928         heads        = VHD_CHS_MAX_H;
929         secs_per_cyl = VHD_CHS_MAX_S;
930     } else {
931         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
932         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
933             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
934         }
935     }
936 
937     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
938         total_sectors = total_size / BDRV_SECTOR_SIZE;
939         /* Allow a maximum disk size of 2040 GiB */
940         if (total_sectors > VHD_MAX_SECTORS) {
941             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
942             ret = -EFBIG;
943             goto out;
944         }
945     } else {
946         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
947         total_size = total_sectors * BDRV_SECTOR_SIZE;
948     }
949 
950     /* Prepare the Hard Disk Footer */
951     memset(buf, 0, 1024);
952 
953     memcpy(footer->creator, "conectix", 8);
954     if (force_size) {
955         memcpy(footer->creator_app, "qem2", 4);
956     } else {
957         memcpy(footer->creator_app, "qemu", 4);
958     }
959     memcpy(footer->creator_os, "Wi2k", 4);
960 
961     footer->features = cpu_to_be32(0x02);
962     footer->version = cpu_to_be32(0x00010000);
963     if (disk_type == VHD_DYNAMIC) {
964         footer->data_offset = cpu_to_be64(HEADER_SIZE);
965     } else {
966         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
967     }
968     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
969 
970     /* Version of Virtual PC 2007 */
971     footer->major = cpu_to_be16(0x0005);
972     footer->minor = cpu_to_be16(0x0003);
973     footer->orig_size = cpu_to_be64(total_size);
974     footer->current_size = cpu_to_be64(total_size);
975     footer->cyls = cpu_to_be16(cyls);
976     footer->heads = heads;
977     footer->secs_per_cyl = secs_per_cyl;
978 
979     footer->type = cpu_to_be32(disk_type);
980 
981     qemu_uuid_generate(&footer->uuid);
982 
983     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
984 
985     if (disk_type == VHD_DYNAMIC) {
986         ret = create_dynamic_disk(blk, buf, total_sectors);
987     } else {
988         ret = create_fixed_disk(blk, buf, total_size);
989     }
990     if (ret < 0) {
991         error_setg(errp, "Unable to create or write VHD header");
992     }
993 
994 out:
995     blk_unref(blk);
996     g_free(disk_type_param);
997     return ret;
998 }
999 
1000 static int vpc_has_zero_init(BlockDriverState *bs)
1001 {
1002     BDRVVPCState *s = bs->opaque;
1003     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1004 
1005     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1006         return bdrv_has_zero_init(bs->file->bs);
1007     } else {
1008         return 1;
1009     }
1010 }
1011 
1012 static void vpc_close(BlockDriverState *bs)
1013 {
1014     BDRVVPCState *s = bs->opaque;
1015     qemu_vfree(s->pagetable);
1016 #ifdef CACHE
1017     g_free(s->pageentry_u8);
1018 #endif
1019 
1020     migrate_del_blocker(s->migration_blocker);
1021     error_free(s->migration_blocker);
1022 }
1023 
1024 static QemuOptsList vpc_create_opts = {
1025     .name = "vpc-create-opts",
1026     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1027     .desc = {
1028         {
1029             .name = BLOCK_OPT_SIZE,
1030             .type = QEMU_OPT_SIZE,
1031             .help = "Virtual disk size"
1032         },
1033         {
1034             .name = BLOCK_OPT_SUBFMT,
1035             .type = QEMU_OPT_STRING,
1036             .help =
1037                 "Type of virtual hard disk format. Supported formats are "
1038                 "{dynamic (default) | fixed} "
1039         },
1040         {
1041             .name = VPC_OPT_FORCE_SIZE,
1042             .type = QEMU_OPT_BOOL,
1043             .help = "Force disk size calculation to use the actual size "
1044                     "specified, rather than using the nearest CHS-based "
1045                     "calculation"
1046         },
1047         { /* end of list */ }
1048     }
1049 };
1050 
1051 static BlockDriver bdrv_vpc = {
1052     .format_name    = "vpc",
1053     .instance_size  = sizeof(BDRVVPCState),
1054 
1055     .bdrv_probe             = vpc_probe,
1056     .bdrv_open              = vpc_open,
1057     .bdrv_close             = vpc_close,
1058     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1059     .bdrv_create            = vpc_create,
1060 
1061     .bdrv_co_preadv             = vpc_co_preadv,
1062     .bdrv_co_pwritev            = vpc_co_pwritev,
1063     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1064 
1065     .bdrv_get_info          = vpc_get_info,
1066 
1067     .create_opts            = &vpc_create_opts,
1068     .bdrv_has_zero_init     = vpc_has_zero_init,
1069 };
1070 
1071 static void bdrv_vpc_init(void)
1072 {
1073     bdrv_register(&bdrv_vpc);
1074 }
1075 
1076 block_init(bdrv_vpc_init);
1077