xref: /openbmc/qemu/block/vpc.c (revision dc5bd18f)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "migration/blocker.h"
33 #include "qemu/bswap.h"
34 #include "qemu/uuid.h"
35 
36 /**************************************************************/
37 
/*
 * Size in bytes of the buffers this driver uses for the VHD footer and for
 * the first read of the dynamic disk header.
 */
#define HEADER_SIZE 512

/* Uncomment to compile the page entry cache fields and their setup code */
//#define CACHE
41 
/* Disk type values, as compared against the footer's 'type' field */
enum vhd_type {
    VHD_FIXED = 2,
    VHD_DYNAMIC = 3,
    VHD_DIFFERENCING = 4,
};
47 
/*
 * VHD timestamps count seconds since Jan 1, 2000 0:00:00 (UTC); this is
 * that instant expressed in seconds since the Unix epoch.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder, head and sector counts used for a VHD geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

/* Sector count limits: 2040 GiB max image size, and the largest CHS product */
#define VHD_MAX_SECTORS       0xff000000
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Creation option: take the sector count from the requested size, not CHS */
#define VPC_OPT_FORCE_SIZE "force_size"
59 
/*
 * VHD footer as stored in the image file. The driver keeps a raw copy in
 * BDRVVPCState.footer_buf and accesses it through this packed layout.
 *
 * Multi-byte fields are always big-endian; convert with be*_to_cpu() /
 * cpu_to_be*() on every access.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /* Creator identification; creator_app selects the size calculation */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    /* Disk sizes in bytes (current_size is divided by the sector size) */
    uint64_t    orig_size;
    uint64_t    current_size;

    /* Disk geometry; the product of the three is the CHS sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
95 
/*
 * Dynamic disk header as stored in the image file, located at the footer's
 * data_offset. Multi-byte fields are converted with be*_to_cpu() /
 * cpu_to_be*() wherever this driver reads or writes them.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Computed with vpc_checksum() over the header with this field zeroed */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
127 
/* Per-image driver state, stored in BlockDriverState.opaque */
typedef struct BDRVVPCState {
    /* Held around BAT lookups and updates in the read/write/status paths */
    CoMutex lock;
    /* Raw footer as read at open time; rewritten after each new block */
    uint8_t footer_buf[HEADER_SIZE];
    /* File offset where the next data block (and then the footer) goes */
    uint64_t free_data_block_offset;
    /* Number of BAT entries, from the dynamic disk header */
    int max_table_entries;
    /* In-memory BAT in host byte order; entries are in units of 512 bytes,
       0xFFFFFFFF marks an unallocated block */
    uint32_t *pagetable;
    /* File offset of the BAT */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently filled for a write */
    uint64_t last_bitmap_offset;

    /* Data block size and per-block bitmap size, both in bytes */
    uint32_t block_size;
    uint32_t bitmap_size;
    /* Size calculation overrides set from the runtime option */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Registered at open time to block live migration */
    Error *migration_blocker;
} BDRVVPCState;
152 
/* Runtime option choosing how the virtual disk size is derived at open */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Options accepted when opening an image. The only entry is the string
 * option VPC_OPT_SIZE_CALC, which vpc_parse_options() accepts as either
 * "chs" or "current_size".
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
168 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * 32-bit sum of all of its bytes.
 *
 * @buf:  bytes to sum; when checksumming a stored structure the caller
 *        zeroes the embedded checksum field first
 * @size: number of bytes in @buf
 *
 * Returns the checksum in host byte order (0xFFFFFFFF for an empty buffer).
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t res = 0;
    size_t i;

    /* Index with size_t so the comparison against @size is not mixed-sign */
    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
179 
180 
/*
 * Format probe: scores 100 when the buffer begins with the "conectix"
 * footer signature and 0 otherwise. @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char signature[] = "conectix";
    const int signature_len = 8;

    /* Too short to hold the signature: never read past the buffer */
    if (buf_size < signature_len) {
        return 0;
    }

    return strncmp((const char *)buf, signature, signature_len) == 0 ? 100 : 0;
}
187 
188 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
189                               Error **errp)
190 {
191     BDRVVPCState *s = bs->opaque;
192     const char *size_calc;
193 
194     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
195 
196     if (!size_calc) {
197        /* no override, use autodetect only */
198     } else if (!strcmp(size_calc, "current_size")) {
199         s->force_use_sz = true;
200     } else if (!strcmp(size_calc, "chs")) {
201         s->force_use_chs = true;
202     } else {
203         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
204     }
205 }
206 
207 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
208                     Error **errp)
209 {
210     BDRVVPCState *s = bs->opaque;
211     int i;
212     VHDFooter *footer;
213     VHDDynDiskHeader *dyndisk_header;
214     QemuOpts *opts = NULL;
215     Error *local_err = NULL;
216     bool use_chs;
217     uint8_t buf[HEADER_SIZE];
218     uint32_t checksum;
219     uint64_t computed_size;
220     uint64_t pagetable_size;
221     int disk_type = VHD_DYNAMIC;
222     int ret;
223     int64_t bs_size;
224 
225     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
226                                false, errp);
227     if (!bs->file) {
228         return -EINVAL;
229     }
230 
231     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
232     qemu_opts_absorb_qdict(opts, options, &local_err);
233     if (local_err) {
234         error_propagate(errp, local_err);
235         ret = -EINVAL;
236         goto fail;
237     }
238 
239     vpc_parse_options(bs, opts, &local_err);
240     if (local_err) {
241         error_propagate(errp, local_err);
242         ret = -EINVAL;
243         goto fail;
244     }
245 
246     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
247     if (ret < 0) {
248         error_setg(errp, "Unable to read VHD header");
249         goto fail;
250     }
251 
252     footer = (VHDFooter *) s->footer_buf;
253     if (strncmp(footer->creator, "conectix", 8)) {
254         int64_t offset = bdrv_getlength(bs->file->bs);
255         if (offset < 0) {
256             ret = offset;
257             error_setg(errp, "Invalid file size");
258             goto fail;
259         } else if (offset < HEADER_SIZE) {
260             ret = -EINVAL;
261             error_setg(errp, "File too small for a VHD header");
262             goto fail;
263         }
264 
265         /* If a fixed disk, the footer is found only at the end of the file */
266         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
267                          HEADER_SIZE);
268         if (ret < 0) {
269             goto fail;
270         }
271         if (strncmp(footer->creator, "conectix", 8)) {
272             error_setg(errp, "invalid VPC image");
273             ret = -EINVAL;
274             goto fail;
275         }
276         disk_type = VHD_FIXED;
277     }
278 
279     checksum = be32_to_cpu(footer->checksum);
280     footer->checksum = 0;
281     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
282         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
283             "incorrect.\n", bs->filename);
284 
285     /* Write 'checksum' back to footer, or else will leave it with zero. */
286     footer->checksum = cpu_to_be32(checksum);
287 
288     /* The visible size of a image in Virtual PC depends on the geometry
289        rather than on the size stored in the footer (the size in the footer
290        is too large usually) */
291     bs->total_sectors = (int64_t)
292         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
293 
294     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
295      * VHD image sizes differently.  VPC will rely on CHS geometry,
296      * while Hyper-V and disk2vhd use the size specified in the footer.
297      *
298      * We use a couple of approaches to try and determine the correct method:
299      * look at the Creator App field, and look for images that have CHS
300      * geometry that is the maximum value.
301      *
302      * If the CHS geometry is the maximum CHS geometry, then we assume that
303      * the size is the footer->current_size to avoid truncation.  Otherwise,
304      * we follow the table based on footer->creator_app:
305      *
306      *  Known creator apps:
307      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
308      *      'qemu'  :  CHS              QEMU (uses disk geometry)
309      *      'qem2'  :  current_size     QEMU (uses current_size)
310      *      'win '  :  current_size     Hyper-V
311      *      'd2v '  :  current_size     Disk2vhd
312      *      'tap\0' :  current_size     XenServer
313      *      'CTXS'  :  current_size     XenConverter
314      *
315      *  The user can override the table values via drive options, however
316      *  even with an override we will still use current_size for images
317      *  that have CHS geometry of the maximum size.
318      */
319     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
320                !!strncmp(footer->creator_app, "qem2", 4) &&
321                !!strncmp(footer->creator_app, "d2v ", 4) &&
322                !!strncmp(footer->creator_app, "CTXS", 4) &&
323                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
324 
325     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
326         bs->total_sectors = be64_to_cpu(footer->current_size) /
327                                         BDRV_SECTOR_SIZE;
328     }
329 
330     /* Allow a maximum disk size of 2040 GiB */
331     if (bs->total_sectors > VHD_MAX_SECTORS) {
332         ret = -EFBIG;
333         goto fail;
334     }
335 
336     if (disk_type == VHD_DYNAMIC) {
337         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
338                          HEADER_SIZE);
339         if (ret < 0) {
340             error_setg(errp, "Error reading dynamic VHD header");
341             goto fail;
342         }
343 
344         dyndisk_header = (VHDDynDiskHeader *) buf;
345 
346         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
347             error_setg(errp, "Invalid header magic");
348             ret = -EINVAL;
349             goto fail;
350         }
351 
352         s->block_size = be32_to_cpu(dyndisk_header->block_size);
353         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
354             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
355             ret = -EINVAL;
356             goto fail;
357         }
358         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
359 
360         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
361 
362         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
363             error_setg(errp, "Too many blocks");
364             ret = -EINVAL;
365             goto fail;
366         }
367 
368         computed_size = (uint64_t) s->max_table_entries * s->block_size;
369         if (computed_size < bs->total_sectors * 512) {
370             error_setg(errp, "Page table too small");
371             ret = -EINVAL;
372             goto fail;
373         }
374 
375         if (s->max_table_entries > SIZE_MAX / 4 ||
376             s->max_table_entries > (int) INT_MAX / 4) {
377             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
378                         s->max_table_entries);
379             ret = -EINVAL;
380             goto fail;
381         }
382 
383         pagetable_size = (uint64_t) s->max_table_entries * 4;
384 
385         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
386         if (s->pagetable == NULL) {
387             error_setg(errp, "Unable to allocate memory for page table");
388             ret = -ENOMEM;
389             goto fail;
390         }
391 
392         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
393 
394         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
395                          pagetable_size);
396         if (ret < 0) {
397             error_setg(errp, "Error reading pagetable");
398             goto fail;
399         }
400 
401         s->free_data_block_offset =
402             ROUND_UP(s->bat_offset + pagetable_size, 512);
403 
404         for (i = 0; i < s->max_table_entries; i++) {
405             be32_to_cpus(&s->pagetable[i]);
406             if (s->pagetable[i] != 0xFFFFFFFF) {
407                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
408                     s->bitmap_size + s->block_size;
409 
410                 if (next > s->free_data_block_offset) {
411                     s->free_data_block_offset = next;
412                 }
413             }
414         }
415 
416         bs_size = bdrv_getlength(bs->file->bs);
417         if (bs_size < 0) {
418             error_setg_errno(errp, -bs_size, "Unable to learn image size");
419             ret = bs_size;
420             goto fail;
421         }
422         if (s->free_data_block_offset > bs_size) {
423             error_setg(errp, "block-vpc: free_data_block_offset points after "
424                              "the end of file. The image has been truncated.");
425             ret = -EINVAL;
426             goto fail;
427         }
428 
429         s->last_bitmap_offset = (int64_t) -1;
430 
431 #ifdef CACHE
432         s->pageentry_u8 = g_malloc(512);
433         s->pageentry_u32 = s->pageentry_u8;
434         s->pageentry_u16 = s->pageentry_u8;
435         s->last_pagetable = -1;
436 #endif
437     }
438 
439     /* Disable migration when VHD images are used */
440     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
441                "does not support live migration",
442                bdrv_get_device_or_node_name(bs));
443     ret = migrate_add_blocker(s->migration_blocker, &local_err);
444     if (local_err) {
445         error_propagate(errp, local_err);
446         error_free(s->migration_blocker);
447         goto fail;
448     }
449 
450     qemu_co_mutex_init(&s->lock);
451 
452     return 0;
453 
454 fail:
455     qemu_vfree(s->pagetable);
456 #ifdef CACHE
457     g_free(s->pageentry_u8);
458 #endif
459     return ret;
460 }
461 
/*
 * Reopen preparation hook. Nothing is validated or changed here: every
 * reopen request is accepted by returning 0, and @errp is never set.
 */
static int vpc_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
467 
468 /*
469  * Returns the absolute byte offset of the given sector in the image file.
470  * If the sector is not allocated, -1 is returned instead.
471  * If an error occurred trying to write an updated block bitmap back to
472  * the file, -2 is returned, and the error value is written to *err.
473  * This can only happen for a write operation.
474  *
475  * The parameter write must be 1 if the offset will be used for a write
476  * operation (the block bitmaps is updated then), 0 otherwise.
477  * If write is true then err must not be NULL.
478  */
479 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
480                                        bool write, int *err)
481 {
482     BDRVVPCState *s = bs->opaque;
483     uint64_t bitmap_offset, block_offset;
484     uint32_t pagetable_index, offset_in_block;
485 
486     assert(!(write && err == NULL));
487 
488     pagetable_index = offset / s->block_size;
489     offset_in_block = offset % s->block_size;
490 
491     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
492         return -1; /* not allocated */
493 
494     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
495     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
496 
497     /* We must ensure that we don't write to any sectors which are marked as
498        unused in the bitmap. We get away with setting all bits in the block
499        bitmap each time we write to a new block. This might cause Virtual PC to
500        miss sparse read optimization, but it's not a problem in terms of
501        correctness. */
502     if (write && (s->last_bitmap_offset != bitmap_offset)) {
503         uint8_t bitmap[s->bitmap_size];
504         int r;
505 
506         s->last_bitmap_offset = bitmap_offset;
507         memset(bitmap, 0xff, s->bitmap_size);
508         r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
509         if (r < 0) {
510             *err = r;
511             return -2;
512         }
513     }
514 
515     return block_offset;
516 }
517 
518 /*
519  * Writes the footer to the end of the image file. This is needed when the
520  * file grows as it overwrites the old footer
521  *
522  * Returns 0 on success and < 0 on error
523  */
524 static int rewrite_footer(BlockDriverState* bs)
525 {
526     int ret;
527     BDRVVPCState *s = bs->opaque;
528     int64_t offset = s->free_data_block_offset;
529 
530     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
531     if (ret < 0)
532         return ret;
533 
534     return 0;
535 }
536 
537 /*
538  * Allocates a new block. This involves writing a new footer and updating
539  * the Block Allocation Table to use the space at the old end of the image
540  * file (overwriting the old footer)
541  *
542  * Returns the sectors' offset in the image file on success and < 0 on error
543  */
544 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
545 {
546     BDRVVPCState *s = bs->opaque;
547     int64_t bat_offset;
548     uint32_t index, bat_value;
549     int ret;
550     uint8_t bitmap[s->bitmap_size];
551 
552     /* Check if sector_num is valid */
553     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
554         return -EINVAL;
555     }
556 
557     /* Write entry into in-memory BAT */
558     index = offset / s->block_size;
559     assert(s->pagetable[index] == 0xFFFFFFFF);
560     s->pagetable[index] = s->free_data_block_offset / 512;
561 
562     /* Initialize the block's bitmap */
563     memset(bitmap, 0xff, s->bitmap_size);
564     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
565         s->bitmap_size);
566     if (ret < 0) {
567         return ret;
568     }
569 
570     /* Write new footer (the old one will be overwritten) */
571     s->free_data_block_offset += s->block_size + s->bitmap_size;
572     ret = rewrite_footer(bs);
573     if (ret < 0)
574         goto fail;
575 
576     /* Write BAT entry to disk */
577     bat_offset = s->bat_offset + (4 * index);
578     bat_value = cpu_to_be32(s->pagetable[index]);
579     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
580     if (ret < 0)
581         goto fail;
582 
583     return get_image_offset(bs, offset, false, NULL);
584 
585 fail:
586     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
587     return ret;
588 }
589 
590 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
591 {
592     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
593     VHDFooter *footer = (VHDFooter *) s->footer_buf;
594 
595     if (be32_to_cpu(footer->type) != VHD_FIXED) {
596         bdi->cluster_size = s->block_size;
597     }
598 
599     bdi->unallocated_blocks_are_zero = true;
600     return 0;
601 }
602 
603 static int coroutine_fn
604 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
605               QEMUIOVector *qiov, int flags)
606 {
607     BDRVVPCState *s = bs->opaque;
608     int ret;
609     int64_t image_offset;
610     int64_t n_bytes;
611     int64_t bytes_done = 0;
612     VHDFooter *footer = (VHDFooter *) s->footer_buf;
613     QEMUIOVector local_qiov;
614 
615     if (be32_to_cpu(footer->type) == VHD_FIXED) {
616         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
617     }
618 
619     qemu_co_mutex_lock(&s->lock);
620     qemu_iovec_init(&local_qiov, qiov->niov);
621 
622     while (bytes > 0) {
623         image_offset = get_image_offset(bs, offset, false, NULL);
624         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
625 
626         if (image_offset == -1) {
627             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
628         } else {
629             qemu_iovec_reset(&local_qiov);
630             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
631 
632             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
633                                  &local_qiov, 0);
634             if (ret < 0) {
635                 goto fail;
636             }
637         }
638 
639         bytes -= n_bytes;
640         offset += n_bytes;
641         bytes_done += n_bytes;
642     }
643 
644     ret = 0;
645 fail:
646     qemu_iovec_destroy(&local_qiov);
647     qemu_co_mutex_unlock(&s->lock);
648 
649     return ret;
650 }
651 
652 static int coroutine_fn
653 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
654                QEMUIOVector *qiov, int flags)
655 {
656     BDRVVPCState *s = bs->opaque;
657     int64_t image_offset;
658     int64_t n_bytes;
659     int64_t bytes_done = 0;
660     int ret = 0;
661     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
662     QEMUIOVector local_qiov;
663 
664     if (be32_to_cpu(footer->type) == VHD_FIXED) {
665         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
666     }
667 
668     qemu_co_mutex_lock(&s->lock);
669     qemu_iovec_init(&local_qiov, qiov->niov);
670 
671     while (bytes > 0) {
672         image_offset = get_image_offset(bs, offset, true, &ret);
673         if (image_offset == -2) {
674             /* Failed to write block bitmap: can't proceed with write */
675             goto fail;
676         }
677         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
678 
679         if (image_offset == -1) {
680             image_offset = alloc_block(bs, offset);
681             if (image_offset < 0) {
682                 ret = image_offset;
683                 goto fail;
684             }
685         }
686 
687         qemu_iovec_reset(&local_qiov);
688         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
689 
690         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
691                               &local_qiov, 0);
692         if (ret < 0) {
693             goto fail;
694         }
695 
696         bytes -= n_bytes;
697         offset += n_bytes;
698         bytes_done += n_bytes;
699     }
700 
701     ret = 0;
702 fail:
703     qemu_iovec_destroy(&local_qiov);
704     qemu_co_mutex_unlock(&s->lock);
705 
706     return ret;
707 }
708 
709 static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
710                                             bool want_zero,
711                                             int64_t offset, int64_t bytes,
712                                             int64_t *pnum, int64_t *map,
713                                             BlockDriverState **file)
714 {
715     BDRVVPCState *s = bs->opaque;
716     VHDFooter *footer = (VHDFooter*) s->footer_buf;
717     int64_t image_offset;
718     bool allocated;
719     int ret;
720     int64_t n;
721 
722     if (be32_to_cpu(footer->type) == VHD_FIXED) {
723         *pnum = bytes;
724         *map = offset;
725         *file = bs->file->bs;
726         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
727     }
728 
729     qemu_co_mutex_lock(&s->lock);
730 
731     image_offset = get_image_offset(bs, offset, false, NULL);
732     allocated = (image_offset != -1);
733     *pnum = 0;
734     ret = 0;
735 
736     do {
737         /* All sectors in a block are contiguous (without using the bitmap) */
738         n = ROUND_UP(offset + 1, s->block_size) - offset;
739         n = MIN(n, bytes);
740 
741         *pnum += n;
742         offset += n;
743         bytes -= n;
744         /* *pnum can't be greater than one block for allocated
745          * sectors since there is always a bitmap in between. */
746         if (allocated) {
747             *file = bs->file->bs;
748             *map = image_offset;
749             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
750             break;
751         }
752         if (bytes == 0) {
753             break;
754         }
755         image_offset = get_image_offset(bs, offset, false, NULL);
756     } while (image_offset == -1);
757 
758     qemu_co_mutex_unlock(&s->lock);
759     return ret;
760 }
761 
762 /*
763  * Calculates the number of cylinders, heads and sectors per cylinder
764  * based on a given number of sectors. This is the algorithm described
765  * in the VHD specification.
766  *
767  * Note that the geometry doesn't always exactly match total_sectors but
768  * may round it down.
769  *
770  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
771  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
772  * and instead allow up to 255 heads.
773  */
774 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
775     uint8_t* heads, uint8_t* secs_per_cyl)
776 {
777     uint32_t cyls_times_heads;
778 
779     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
780 
781     if (total_sectors >= 65535LL * 16 * 63) {
782         *secs_per_cyl = 255;
783         *heads = 16;
784         cyls_times_heads = total_sectors / *secs_per_cyl;
785     } else {
786         *secs_per_cyl = 17;
787         cyls_times_heads = total_sectors / *secs_per_cyl;
788         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
789 
790         if (*heads < 4) {
791             *heads = 4;
792         }
793 
794         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
795             *secs_per_cyl = 31;
796             *heads = 16;
797             cyls_times_heads = total_sectors / *secs_per_cyl;
798         }
799 
800         if (cyls_times_heads >= (*heads * 1024)) {
801             *secs_per_cyl = 63;
802             *heads = 16;
803             cyls_times_heads = total_sectors / *secs_per_cyl;
804         }
805     }
806 
807     *cyls = cyls_times_heads / *heads;
808 
809     return 0;
810 }
811 
812 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
813                                int64_t total_sectors)
814 {
815     VHDDynDiskHeader *dyndisk_header =
816         (VHDDynDiskHeader *) buf;
817     size_t block_size, num_bat_entries;
818     int i;
819     int ret;
820     int64_t offset = 0;
821 
822     /* Write the footer (twice: at the beginning and at the end) */
823     block_size = 0x200000;
824     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
825 
826     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
827     if (ret < 0) {
828         goto fail;
829     }
830 
831     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
832     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
833     if (ret < 0) {
834         goto fail;
835     }
836 
837     /* Write the initial BAT */
838     offset = 3 * 512;
839 
840     memset(buf, 0xFF, 512);
841     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
842         ret = blk_pwrite(blk, offset, buf, 512, 0);
843         if (ret < 0) {
844             goto fail;
845         }
846         offset += 512;
847     }
848 
849     /* Prepare the Dynamic Disk Header */
850     memset(buf, 0, 1024);
851 
852     memcpy(dyndisk_header->magic, "cxsparse", 8);
853 
854     /*
855      * Note: The spec is actually wrong here for data_offset, it says
856      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
857      */
858     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
859     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
860     dyndisk_header->version = cpu_to_be32(0x00010000);
861     dyndisk_header->block_size = cpu_to_be32(block_size);
862     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
863 
864     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
865 
866     /* Write the header */
867     offset = 512;
868 
869     ret = blk_pwrite(blk, offset, buf, 1024, 0);
870     if (ret < 0) {
871         goto fail;
872     }
873 
874  fail:
875     return ret;
876 }
877 
878 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
879                              int64_t total_size, Error **errp)
880 {
881     int ret;
882 
883     /* Add footer to total size */
884     total_size += HEADER_SIZE;
885 
886     ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
887     if (ret < 0) {
888         return ret;
889     }
890 
891     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
892     if (ret < 0) {
893         error_setg_errno(errp, -ret, "Unable to write VHD header");
894         return ret;
895     }
896 
897     return ret;
898 }
899 
900 static int coroutine_fn vpc_co_create_opts(const char *filename, QemuOpts *opts,
901                                            Error **errp)
902 {
903     uint8_t buf[1024];
904     VHDFooter *footer = (VHDFooter *) buf;
905     char *disk_type_param;
906     int i;
907     uint16_t cyls = 0;
908     uint8_t heads = 0;
909     uint8_t secs_per_cyl = 0;
910     int64_t total_sectors;
911     int64_t total_size;
912     int disk_type;
913     int ret = -EIO;
914     bool force_size;
915     Error *local_err = NULL;
916     BlockBackend *blk = NULL;
917 
918     /* Read out options */
919     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
920                           BDRV_SECTOR_SIZE);
921     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
922     if (disk_type_param) {
923         if (!strcmp(disk_type_param, "dynamic")) {
924             disk_type = VHD_DYNAMIC;
925         } else if (!strcmp(disk_type_param, "fixed")) {
926             disk_type = VHD_FIXED;
927         } else {
928             error_setg(errp, "Invalid disk type, %s", disk_type_param);
929             ret = -EINVAL;
930             goto out;
931         }
932     } else {
933         disk_type = VHD_DYNAMIC;
934     }
935 
936     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
937 
938     ret = bdrv_create_file(filename, opts, &local_err);
939     if (ret < 0) {
940         error_propagate(errp, local_err);
941         goto out;
942     }
943 
944     blk = blk_new_open(filename, NULL, NULL,
945                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
946                        &local_err);
947     if (blk == NULL) {
948         error_propagate(errp, local_err);
949         ret = -EIO;
950         goto out;
951     }
952 
953     blk_set_allow_write_beyond_eof(blk, true);
954 
955     /*
956      * Calculate matching total_size and geometry. Increase the number of
957      * sectors requested until we get enough (or fail). This ensures that
958      * qemu-img convert doesn't truncate images, but rather rounds up.
959      *
960      * If the image size can't be represented by a spec conformant CHS geometry,
961      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
962      * the image size from the VHD footer to calculate total_sectors.
963      */
964     if (force_size) {
965         /* This will force the use of total_size for sector count, below */
966         cyls         = VHD_CHS_MAX_C;
967         heads        = VHD_CHS_MAX_H;
968         secs_per_cyl = VHD_CHS_MAX_S;
969     } else {
970         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
971         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
972             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
973         }
974     }
975 
976     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
977         total_sectors = total_size / BDRV_SECTOR_SIZE;
978         /* Allow a maximum disk size of 2040 GiB */
979         if (total_sectors > VHD_MAX_SECTORS) {
980             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
981             ret = -EFBIG;
982             goto out;
983         }
984     } else {
985         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
986         total_size = total_sectors * BDRV_SECTOR_SIZE;
987     }
988 
989     /* Prepare the Hard Disk Footer */
990     memset(buf, 0, 1024);
991 
992     memcpy(footer->creator, "conectix", 8);
993     if (force_size) {
994         memcpy(footer->creator_app, "qem2", 4);
995     } else {
996         memcpy(footer->creator_app, "qemu", 4);
997     }
998     memcpy(footer->creator_os, "Wi2k", 4);
999 
1000     footer->features = cpu_to_be32(0x02);
1001     footer->version = cpu_to_be32(0x00010000);
1002     if (disk_type == VHD_DYNAMIC) {
1003         footer->data_offset = cpu_to_be64(HEADER_SIZE);
1004     } else {
1005         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1006     }
1007     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1008 
1009     /* Version of Virtual PC 2007 */
1010     footer->major = cpu_to_be16(0x0005);
1011     footer->minor = cpu_to_be16(0x0003);
1012     footer->orig_size = cpu_to_be64(total_size);
1013     footer->current_size = cpu_to_be64(total_size);
1014     footer->cyls = cpu_to_be16(cyls);
1015     footer->heads = heads;
1016     footer->secs_per_cyl = secs_per_cyl;
1017 
1018     footer->type = cpu_to_be32(disk_type);
1019 
1020     qemu_uuid_generate(&footer->uuid);
1021 
1022     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
1023 
1024     if (disk_type == VHD_DYNAMIC) {
1025         ret = create_dynamic_disk(blk, buf, total_sectors);
1026         if (ret < 0) {
1027             error_setg(errp, "Unable to create or write VHD header");
1028         }
1029     } else {
1030         ret = create_fixed_disk(blk, buf, total_size, errp);
1031     }
1032 
1033 out:
1034     blk_unref(blk);
1035     g_free(disk_type_param);
1036     return ret;
1037 }
1038 
1039 static int vpc_has_zero_init(BlockDriverState *bs)
1040 {
1041     BDRVVPCState *s = bs->opaque;
1042     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1043 
1044     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1045         return bdrv_has_zero_init(bs->file->bs);
1046     } else {
1047         return 1;
1048     }
1049 }
1050 
1051 static void vpc_close(BlockDriverState *bs)
1052 {
1053     BDRVVPCState *s = bs->opaque;
1054     qemu_vfree(s->pagetable);
1055 #ifdef CACHE
1056     g_free(s->pageentry_u8);
1057 #endif
1058 
1059     migrate_del_blocker(s->migration_blocker);
1060     error_free(s->migration_blocker);
1061 }
1062 
1063 static QemuOptsList vpc_create_opts = {
1064     .name = "vpc-create-opts",
1065     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1066     .desc = {
1067         {
1068             .name = BLOCK_OPT_SIZE,
1069             .type = QEMU_OPT_SIZE,
1070             .help = "Virtual disk size"
1071         },
1072         {
1073             .name = BLOCK_OPT_SUBFMT,
1074             .type = QEMU_OPT_STRING,
1075             .help =
1076                 "Type of virtual hard disk format. Supported formats are "
1077                 "{dynamic (default) | fixed} "
1078         },
1079         {
1080             .name = VPC_OPT_FORCE_SIZE,
1081             .type = QEMU_OPT_BOOL,
1082             .help = "Force disk size calculation to use the actual size "
1083                     "specified, rather than using the nearest CHS-based "
1084                     "calculation"
1085         },
1086         { /* end of list */ }
1087     }
1088 };
1089 
1090 static BlockDriver bdrv_vpc = {
1091     .format_name    = "vpc",
1092     .instance_size  = sizeof(BDRVVPCState),
1093 
1094     .bdrv_probe             = vpc_probe,
1095     .bdrv_open              = vpc_open,
1096     .bdrv_close             = vpc_close,
1097     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1098     .bdrv_child_perm        = bdrv_format_default_perms,
1099     .bdrv_co_create_opts    = vpc_co_create_opts,
1100 
1101     .bdrv_co_preadv             = vpc_co_preadv,
1102     .bdrv_co_pwritev            = vpc_co_pwritev,
1103     .bdrv_co_block_status       = vpc_co_block_status,
1104 
1105     .bdrv_get_info          = vpc_get_info,
1106 
1107     .create_opts            = &vpc_create_opts,
1108     .bdrv_has_zero_init     = vpc_has_zero_init,
1109 };
1110 
/* Register the "vpc" format driver with the block layer. */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/*
 * Hook bdrv_vpc_init() into the block module initialisation mechanism
 * (block_init() comes from qemu/module.h).
 */
block_init(bdrv_vpc_init);
1117