xref: /openbmc/qemu/block/vpc.c (revision 61b01bbc)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "migration/blocker.h"
33 #include "qemu/bswap.h"
34 #include "qemu/uuid.h"
35 
36 /**************************************************************/
37 
/*
 * Size in bytes of the buffers this driver uses to hold the VHD footer and
 * the first part of the dynamic disk header when reading or writing them.
 */
#define HEADER_SIZE 512

/*
 * Left disabled: when defined, additional page-entry cache fields are
 * compiled into BDRVVPCState.
 */
//#define CACHE

/*
 * Disk type values; compared against the (big-endian) 'type' field of the
 * footer to tell fixed images from dynamic ones.
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};

/*
 * Unix time (seconds since Jan 1, 1970 UTC) of Jan 1, 2000 0:00:00 (UTC),
 * the instant the footer's timestamp field is documented to count from.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder / head / sector values used for a CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

/* Upper bound on the image size, counted in 512-byte sectors */
#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count of the largest geometry (C * H * S at their maxima) */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the boolean image creation option read by vpc_create() */
#define VPC_OPT_FORCE_SIZE "force_size"
59 
/*
 * On-disk VHD footer. All multi-byte fields are always big-endian and must
 * be converted with the be*_to_cpu() / cpu_to_be*() helpers before use.
 * The struct is packed; the driver overlays it on a HEADER_SIZE byte buffer.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /* Four-character tag of the creating tool; selects the size policy */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Disk size in bytes; used instead of CHS for some creator apps */
    uint64_t    current_size;

    /* Geometry; cyls * heads * secs_per_cyl gives the CHS sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    /* NOTE(review): not referenced by the visible code; meaning per VHD spec */
    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
95 
/*
 * On-disk header of a dynamic (sparse) VHD image. The footer's data_offset
 * field gives its location. Multi-byte fields are converted from/to
 * big-endian by the driver. The struct is packed.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    /* Number of entries in the BAT */
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Computed with vpc_checksum() over the header when an image is created */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* NOTE(review): parent locator entries are not used by the visible code */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
127 
/* Per-image driver state, stored in BlockDriverState.opaque */
typedef struct BDRVVPCState {
    /* Taken around the dynamic-disk read, write and block-status paths */
    CoMutex lock;
    /* Raw copy of the footer as read from the image (fields big-endian) */
    uint8_t footer_buf[HEADER_SIZE];
    /*
     * Byte offset in the image file where the next data block will be
     * placed; also the position at which rewrite_footer() writes the footer.
     */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /*
     * In-memory BAT in host byte order. An entry is 0xFFFFFFFF when the
     * block is unallocated, otherwise the block's offset in 512-byte units.
     */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently rewritten on a write path */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block */
    uint32_t block_size;
    /* Size in bytes of the bitmap preceding each block (multiple of 512) */
    uint32_t bitmap_size;
    /* Size calculation overrides set by vpc_parse_options() */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Error registered with migrate_add_blocker() in vpc_open() */
    Error *migration_blocker;
} BDRVVPCState;
152 
/* Name of the runtime option selecting how the disk size is determined */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime (open-time) options understood by the driver. vpc_open() absorbs
 * them from the options QDict and vpc_parse_options() interprets them.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
168 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * (32-bit, wrapping) sum of all its bytes.
 *
 * @buf:  bytes to sum; not modified
 * @size: number of bytes in @buf
 *
 * Returns the checksum in host byte order. An empty buffer yields 0xFFFFFFFF.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t sum = 0;
    size_t i;

    /* size_t index: matches the bound's type, no signed overflow */
    for (i = 0; i < size; i++) {
        sum += buf[i];
    }

    return ~sum;
}
179 
180 
/*
 * Format probe: scores 100 when the buffer starts with the 8-byte VHD
 * creator cookie "conectix", and 0 otherwise (including when fewer than
 * 8 bytes are available). @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char cookie[] = "conectix";

    if (buf_size < 8) {
        return 0;
    }

    return memcmp(buf, cookie, 8) == 0 ? 100 : 0;
}
187 
188 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
189                               Error **errp)
190 {
191     BDRVVPCState *s = bs->opaque;
192     const char *size_calc;
193 
194     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
195 
196     if (!size_calc) {
197        /* no override, use autodetect only */
198     } else if (!strcmp(size_calc, "current_size")) {
199         s->force_use_sz = true;
200     } else if (!strcmp(size_calc, "chs")) {
201         s->force_use_chs = true;
202     } else {
203         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
204     }
205 }
206 
207 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
208                     Error **errp)
209 {
210     BDRVVPCState *s = bs->opaque;
211     int i;
212     VHDFooter *footer;
213     VHDDynDiskHeader *dyndisk_header;
214     QemuOpts *opts = NULL;
215     Error *local_err = NULL;
216     bool use_chs;
217     uint8_t buf[HEADER_SIZE];
218     uint32_t checksum;
219     uint64_t computed_size;
220     uint64_t pagetable_size;
221     int disk_type = VHD_DYNAMIC;
222     int ret;
223     int64_t bs_size;
224 
225     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
226                                false, errp);
227     if (!bs->file) {
228         return -EINVAL;
229     }
230 
231     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
232     qemu_opts_absorb_qdict(opts, options, &local_err);
233     if (local_err) {
234         error_propagate(errp, local_err);
235         ret = -EINVAL;
236         goto fail;
237     }
238 
239     vpc_parse_options(bs, opts, &local_err);
240     if (local_err) {
241         error_propagate(errp, local_err);
242         ret = -EINVAL;
243         goto fail;
244     }
245 
246     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
247     if (ret < 0) {
248         error_setg(errp, "Unable to read VHD header");
249         goto fail;
250     }
251 
252     footer = (VHDFooter *) s->footer_buf;
253     if (strncmp(footer->creator, "conectix", 8)) {
254         int64_t offset = bdrv_getlength(bs->file->bs);
255         if (offset < 0) {
256             ret = offset;
257             error_setg(errp, "Invalid file size");
258             goto fail;
259         } else if (offset < HEADER_SIZE) {
260             ret = -EINVAL;
261             error_setg(errp, "File too small for a VHD header");
262             goto fail;
263         }
264 
265         /* If a fixed disk, the footer is found only at the end of the file */
266         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
267                          HEADER_SIZE);
268         if (ret < 0) {
269             goto fail;
270         }
271         if (strncmp(footer->creator, "conectix", 8)) {
272             error_setg(errp, "invalid VPC image");
273             ret = -EINVAL;
274             goto fail;
275         }
276         disk_type = VHD_FIXED;
277     }
278 
279     checksum = be32_to_cpu(footer->checksum);
280     footer->checksum = 0;
281     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
282         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
283             "incorrect.\n", bs->filename);
284 
285     /* Write 'checksum' back to footer, or else will leave it with zero. */
286     footer->checksum = cpu_to_be32(checksum);
287 
288     /* The visible size of a image in Virtual PC depends on the geometry
289        rather than on the size stored in the footer (the size in the footer
290        is too large usually) */
291     bs->total_sectors = (int64_t)
292         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
293 
294     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
295      * VHD image sizes differently.  VPC will rely on CHS geometry,
296      * while Hyper-V and disk2vhd use the size specified in the footer.
297      *
298      * We use a couple of approaches to try and determine the correct method:
299      * look at the Creator App field, and look for images that have CHS
300      * geometry that is the maximum value.
301      *
302      * If the CHS geometry is the maximum CHS geometry, then we assume that
303      * the size is the footer->current_size to avoid truncation.  Otherwise,
304      * we follow the table based on footer->creator_app:
305      *
306      *  Known creator apps:
307      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
308      *      'qemu'  :  CHS              QEMU (uses disk geometry)
309      *      'qem2'  :  current_size     QEMU (uses current_size)
310      *      'win '  :  current_size     Hyper-V
311      *      'd2v '  :  current_size     Disk2vhd
312      *      'tap\0' :  current_size     XenServer
313      *      'CTXS'  :  current_size     XenConverter
314      *
315      *  The user can override the table values via drive options, however
316      *  even with an override we will still use current_size for images
317      *  that have CHS geometry of the maximum size.
318      */
319     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
320                !!strncmp(footer->creator_app, "qem2", 4) &&
321                !!strncmp(footer->creator_app, "d2v ", 4) &&
322                !!strncmp(footer->creator_app, "CTXS", 4) &&
323                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
324 
325     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
326         bs->total_sectors = be64_to_cpu(footer->current_size) /
327                                         BDRV_SECTOR_SIZE;
328     }
329 
330     /* Allow a maximum disk size of 2040 GiB */
331     if (bs->total_sectors > VHD_MAX_SECTORS) {
332         ret = -EFBIG;
333         goto fail;
334     }
335 
336     if (disk_type == VHD_DYNAMIC) {
337         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
338                          HEADER_SIZE);
339         if (ret < 0) {
340             error_setg(errp, "Error reading dynamic VHD header");
341             goto fail;
342         }
343 
344         dyndisk_header = (VHDDynDiskHeader *) buf;
345 
346         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
347             error_setg(errp, "Invalid header magic");
348             ret = -EINVAL;
349             goto fail;
350         }
351 
352         s->block_size = be32_to_cpu(dyndisk_header->block_size);
353         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
354             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
355             ret = -EINVAL;
356             goto fail;
357         }
358         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
359 
360         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
361 
362         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
363             error_setg(errp, "Too many blocks");
364             ret = -EINVAL;
365             goto fail;
366         }
367 
368         computed_size = (uint64_t) s->max_table_entries * s->block_size;
369         if (computed_size < bs->total_sectors * 512) {
370             error_setg(errp, "Page table too small");
371             ret = -EINVAL;
372             goto fail;
373         }
374 
375         if (s->max_table_entries > SIZE_MAX / 4 ||
376             s->max_table_entries > (int) INT_MAX / 4) {
377             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
378                         s->max_table_entries);
379             ret = -EINVAL;
380             goto fail;
381         }
382 
383         pagetable_size = (uint64_t) s->max_table_entries * 4;
384 
385         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
386         if (s->pagetable == NULL) {
387             error_setg(errp, "Unable to allocate memory for page table");
388             ret = -ENOMEM;
389             goto fail;
390         }
391 
392         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
393 
394         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
395                          pagetable_size);
396         if (ret < 0) {
397             error_setg(errp, "Error reading pagetable");
398             goto fail;
399         }
400 
401         s->free_data_block_offset =
402             ROUND_UP(s->bat_offset + pagetable_size, 512);
403 
404         for (i = 0; i < s->max_table_entries; i++) {
405             be32_to_cpus(&s->pagetable[i]);
406             if (s->pagetable[i] != 0xFFFFFFFF) {
407                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
408                     s->bitmap_size + s->block_size;
409 
410                 if (next > s->free_data_block_offset) {
411                     s->free_data_block_offset = next;
412                 }
413             }
414         }
415 
416         bs_size = bdrv_getlength(bs->file->bs);
417         if (bs_size < 0) {
418             error_setg_errno(errp, -bs_size, "Unable to learn image size");
419             ret = bs_size;
420             goto fail;
421         }
422         if (s->free_data_block_offset > bs_size) {
423             error_setg(errp, "block-vpc: free_data_block_offset points after "
424                              "the end of file. The image has been truncated.");
425             ret = -EINVAL;
426             goto fail;
427         }
428 
429         s->last_bitmap_offset = (int64_t) -1;
430 
431 #ifdef CACHE
432         s->pageentry_u8 = g_malloc(512);
433         s->pageentry_u32 = s->pageentry_u8;
434         s->pageentry_u16 = s->pageentry_u8;
435         s->last_pagetable = -1;
436 #endif
437     }
438 
439     /* Disable migration when VHD images are used */
440     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
441                "does not support live migration",
442                bdrv_get_device_or_node_name(bs));
443     ret = migrate_add_blocker(s->migration_blocker, &local_err);
444     if (local_err) {
445         error_propagate(errp, local_err);
446         error_free(s->migration_blocker);
447         goto fail;
448     }
449 
450     qemu_co_mutex_init(&s->lock);
451 
452     return 0;
453 
454 fail:
455     qemu_vfree(s->pagetable);
456 #ifdef CACHE
457     g_free(s->pageentry_u8);
458 #endif
459     return ret;
460 }
461 
462 static int vpc_reopen_prepare(BDRVReopenState *state,
463                               BlockReopenQueue *queue, Error **errp)
464 {
465     return 0;
466 }
467 
468 /*
469  * Returns the absolute byte offset of the given sector in the image file.
470  * If the sector is not allocated, -1 is returned instead.
471  * If an error occurred trying to write an updated block bitmap back to
472  * the file, -2 is returned, and the error value is written to *err.
473  * This can only happen for a write operation.
474  *
475  * The parameter write must be 1 if the offset will be used for a write
476  * operation (the block bitmaps is updated then), 0 otherwise.
477  * If write is true then err must not be NULL.
478  */
479 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
480                                        bool write, int *err)
481 {
482     BDRVVPCState *s = bs->opaque;
483     uint64_t bitmap_offset, block_offset;
484     uint32_t pagetable_index, offset_in_block;
485 
486     assert(!(write && err == NULL));
487 
488     pagetable_index = offset / s->block_size;
489     offset_in_block = offset % s->block_size;
490 
491     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
492         return -1; /* not allocated */
493 
494     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
495     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
496 
497     /* We must ensure that we don't write to any sectors which are marked as
498        unused in the bitmap. We get away with setting all bits in the block
499        bitmap each time we write to a new block. This might cause Virtual PC to
500        miss sparse read optimization, but it's not a problem in terms of
501        correctness. */
502     if (write && (s->last_bitmap_offset != bitmap_offset)) {
503         uint8_t bitmap[s->bitmap_size];
504         int r;
505 
506         s->last_bitmap_offset = bitmap_offset;
507         memset(bitmap, 0xff, s->bitmap_size);
508         r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
509         if (r < 0) {
510             *err = r;
511             return -2;
512         }
513     }
514 
515     return block_offset;
516 }
517 
518 /*
519  * Writes the footer to the end of the image file. This is needed when the
520  * file grows as it overwrites the old footer
521  *
522  * Returns 0 on success and < 0 on error
523  */
524 static int rewrite_footer(BlockDriverState* bs)
525 {
526     int ret;
527     BDRVVPCState *s = bs->opaque;
528     int64_t offset = s->free_data_block_offset;
529 
530     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
531     if (ret < 0)
532         return ret;
533 
534     return 0;
535 }
536 
537 /*
538  * Allocates a new block. This involves writing a new footer and updating
539  * the Block Allocation Table to use the space at the old end of the image
540  * file (overwriting the old footer)
541  *
542  * Returns the sectors' offset in the image file on success and < 0 on error
543  */
544 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
545 {
546     BDRVVPCState *s = bs->opaque;
547     int64_t bat_offset;
548     uint32_t index, bat_value;
549     int ret;
550     uint8_t bitmap[s->bitmap_size];
551 
552     /* Check if sector_num is valid */
553     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
554         return -EINVAL;
555     }
556 
557     /* Write entry into in-memory BAT */
558     index = offset / s->block_size;
559     assert(s->pagetable[index] == 0xFFFFFFFF);
560     s->pagetable[index] = s->free_data_block_offset / 512;
561 
562     /* Initialize the block's bitmap */
563     memset(bitmap, 0xff, s->bitmap_size);
564     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
565         s->bitmap_size);
566     if (ret < 0) {
567         return ret;
568     }
569 
570     /* Write new footer (the old one will be overwritten) */
571     s->free_data_block_offset += s->block_size + s->bitmap_size;
572     ret = rewrite_footer(bs);
573     if (ret < 0)
574         goto fail;
575 
576     /* Write BAT entry to disk */
577     bat_offset = s->bat_offset + (4 * index);
578     bat_value = cpu_to_be32(s->pagetable[index]);
579     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
580     if (ret < 0)
581         goto fail;
582 
583     return get_image_offset(bs, offset, false, NULL);
584 
585 fail:
586     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
587     return ret;
588 }
589 
590 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
591 {
592     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
593     VHDFooter *footer = (VHDFooter *) s->footer_buf;
594 
595     if (be32_to_cpu(footer->type) != VHD_FIXED) {
596         bdi->cluster_size = s->block_size;
597     }
598 
599     bdi->unallocated_blocks_are_zero = true;
600     return 0;
601 }
602 
603 static int coroutine_fn
604 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
605               QEMUIOVector *qiov, int flags)
606 {
607     BDRVVPCState *s = bs->opaque;
608     int ret;
609     int64_t image_offset;
610     int64_t n_bytes;
611     int64_t bytes_done = 0;
612     VHDFooter *footer = (VHDFooter *) s->footer_buf;
613     QEMUIOVector local_qiov;
614 
615     if (be32_to_cpu(footer->type) == VHD_FIXED) {
616         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
617     }
618 
619     qemu_co_mutex_lock(&s->lock);
620     qemu_iovec_init(&local_qiov, qiov->niov);
621 
622     while (bytes > 0) {
623         image_offset = get_image_offset(bs, offset, false, NULL);
624         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
625 
626         if (image_offset == -1) {
627             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
628         } else {
629             qemu_iovec_reset(&local_qiov);
630             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
631 
632             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
633                                  &local_qiov, 0);
634             if (ret < 0) {
635                 goto fail;
636             }
637         }
638 
639         bytes -= n_bytes;
640         offset += n_bytes;
641         bytes_done += n_bytes;
642     }
643 
644     ret = 0;
645 fail:
646     qemu_iovec_destroy(&local_qiov);
647     qemu_co_mutex_unlock(&s->lock);
648 
649     return ret;
650 }
651 
652 static int coroutine_fn
653 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
654                QEMUIOVector *qiov, int flags)
655 {
656     BDRVVPCState *s = bs->opaque;
657     int64_t image_offset;
658     int64_t n_bytes;
659     int64_t bytes_done = 0;
660     int ret = 0;
661     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
662     QEMUIOVector local_qiov;
663 
664     if (be32_to_cpu(footer->type) == VHD_FIXED) {
665         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
666     }
667 
668     qemu_co_mutex_lock(&s->lock);
669     qemu_iovec_init(&local_qiov, qiov->niov);
670 
671     while (bytes > 0) {
672         image_offset = get_image_offset(bs, offset, true, &ret);
673         if (image_offset == -2) {
674             /* Failed to write block bitmap: can't proceed with write */
675             goto fail;
676         }
677         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
678 
679         if (image_offset == -1) {
680             image_offset = alloc_block(bs, offset);
681             if (image_offset < 0) {
682                 ret = image_offset;
683                 goto fail;
684             }
685         }
686 
687         qemu_iovec_reset(&local_qiov);
688         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
689 
690         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
691                               &local_qiov, 0);
692         if (ret < 0) {
693             goto fail;
694         }
695 
696         bytes -= n_bytes;
697         offset += n_bytes;
698         bytes_done += n_bytes;
699     }
700 
701     ret = 0;
702 fail:
703     qemu_iovec_destroy(&local_qiov);
704     qemu_co_mutex_unlock(&s->lock);
705 
706     return ret;
707 }
708 
709 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
710         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
711 {
712     BDRVVPCState *s = bs->opaque;
713     VHDFooter *footer = (VHDFooter*) s->footer_buf;
714     int64_t start, offset;
715     bool allocated;
716     int64_t ret;
717     int n;
718 
719     if (be32_to_cpu(footer->type) == VHD_FIXED) {
720         *pnum = nb_sectors;
721         *file = bs->file->bs;
722         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
723                (sector_num << BDRV_SECTOR_BITS);
724     }
725 
726     qemu_co_mutex_lock(&s->lock);
727 
728     offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false, NULL);
729     start = offset;
730     allocated = (offset != -1);
731     *pnum = 0;
732     ret = 0;
733 
734     do {
735         /* All sectors in a block are contiguous (without using the bitmap) */
736         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
737           - sector_num;
738         n = MIN(n, nb_sectors);
739 
740         *pnum += n;
741         sector_num += n;
742         nb_sectors -= n;
743         /* *pnum can't be greater than one block for allocated
744          * sectors since there is always a bitmap in between. */
745         if (allocated) {
746             *file = bs->file->bs;
747             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
748             break;
749         }
750         if (nb_sectors == 0) {
751             break;
752         }
753         offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false,
754                                   NULL);
755     } while (offset == -1);
756 
757     qemu_co_mutex_unlock(&s->lock);
758     return ret;
759 }
760 
761 /*
762  * Calculates the number of cylinders, heads and sectors per cylinder
763  * based on a given number of sectors. This is the algorithm described
764  * in the VHD specification.
765  *
766  * Note that the geometry doesn't always exactly match total_sectors but
767  * may round it down.
768  *
769  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
770  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
771  * and instead allow up to 255 heads.
772  */
773 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
774     uint8_t* heads, uint8_t* secs_per_cyl)
775 {
776     uint32_t cyls_times_heads;
777 
778     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
779 
780     if (total_sectors >= 65535LL * 16 * 63) {
781         *secs_per_cyl = 255;
782         *heads = 16;
783         cyls_times_heads = total_sectors / *secs_per_cyl;
784     } else {
785         *secs_per_cyl = 17;
786         cyls_times_heads = total_sectors / *secs_per_cyl;
787         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
788 
789         if (*heads < 4) {
790             *heads = 4;
791         }
792 
793         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
794             *secs_per_cyl = 31;
795             *heads = 16;
796             cyls_times_heads = total_sectors / *secs_per_cyl;
797         }
798 
799         if (cyls_times_heads >= (*heads * 1024)) {
800             *secs_per_cyl = 63;
801             *heads = 16;
802             cyls_times_heads = total_sectors / *secs_per_cyl;
803         }
804     }
805 
806     *cyls = cyls_times_heads / *heads;
807 
808     return 0;
809 }
810 
811 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
812                                int64_t total_sectors)
813 {
814     VHDDynDiskHeader *dyndisk_header =
815         (VHDDynDiskHeader *) buf;
816     size_t block_size, num_bat_entries;
817     int i;
818     int ret;
819     int64_t offset = 0;
820 
821     /* Write the footer (twice: at the beginning and at the end) */
822     block_size = 0x200000;
823     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
824 
825     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
826     if (ret < 0) {
827         goto fail;
828     }
829 
830     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
831     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
832     if (ret < 0) {
833         goto fail;
834     }
835 
836     /* Write the initial BAT */
837     offset = 3 * 512;
838 
839     memset(buf, 0xFF, 512);
840     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
841         ret = blk_pwrite(blk, offset, buf, 512, 0);
842         if (ret < 0) {
843             goto fail;
844         }
845         offset += 512;
846     }
847 
848     /* Prepare the Dynamic Disk Header */
849     memset(buf, 0, 1024);
850 
851     memcpy(dyndisk_header->magic, "cxsparse", 8);
852 
853     /*
854      * Note: The spec is actually wrong here for data_offset, it says
855      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
856      */
857     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
858     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
859     dyndisk_header->version = cpu_to_be32(0x00010000);
860     dyndisk_header->block_size = cpu_to_be32(block_size);
861     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
862 
863     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
864 
865     /* Write the header */
866     offset = 512;
867 
868     ret = blk_pwrite(blk, offset, buf, 1024, 0);
869     if (ret < 0) {
870         goto fail;
871     }
872 
873  fail:
874     return ret;
875 }
876 
877 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
878                              int64_t total_size, Error **errp)
879 {
880     int ret;
881 
882     /* Add footer to total size */
883     total_size += HEADER_SIZE;
884 
885     ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
886     if (ret < 0) {
887         return ret;
888     }
889 
890     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
891     if (ret < 0) {
892         error_setg_errno(errp, -ret, "Unable to write VHD header");
893         return ret;
894     }
895 
896     return ret;
897 }
898 
899 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
900 {
901     uint8_t buf[1024];
902     VHDFooter *footer = (VHDFooter *) buf;
903     char *disk_type_param;
904     int i;
905     uint16_t cyls = 0;
906     uint8_t heads = 0;
907     uint8_t secs_per_cyl = 0;
908     int64_t total_sectors;
909     int64_t total_size;
910     int disk_type;
911     int ret = -EIO;
912     bool force_size;
913     Error *local_err = NULL;
914     BlockBackend *blk = NULL;
915 
916     /* Read out options */
917     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
918                           BDRV_SECTOR_SIZE);
919     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
920     if (disk_type_param) {
921         if (!strcmp(disk_type_param, "dynamic")) {
922             disk_type = VHD_DYNAMIC;
923         } else if (!strcmp(disk_type_param, "fixed")) {
924             disk_type = VHD_FIXED;
925         } else {
926             error_setg(errp, "Invalid disk type, %s", disk_type_param);
927             ret = -EINVAL;
928             goto out;
929         }
930     } else {
931         disk_type = VHD_DYNAMIC;
932     }
933 
934     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
935 
936     ret = bdrv_create_file(filename, opts, &local_err);
937     if (ret < 0) {
938         error_propagate(errp, local_err);
939         goto out;
940     }
941 
942     blk = blk_new_open(filename, NULL, NULL,
943                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
944                        &local_err);
945     if (blk == NULL) {
946         error_propagate(errp, local_err);
947         ret = -EIO;
948         goto out;
949     }
950 
951     blk_set_allow_write_beyond_eof(blk, true);
952 
953     /*
954      * Calculate matching total_size and geometry. Increase the number of
955      * sectors requested until we get enough (or fail). This ensures that
956      * qemu-img convert doesn't truncate images, but rather rounds up.
957      *
958      * If the image size can't be represented by a spec conformant CHS geometry,
959      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
960      * the image size from the VHD footer to calculate total_sectors.
961      */
962     if (force_size) {
963         /* This will force the use of total_size for sector count, below */
964         cyls         = VHD_CHS_MAX_C;
965         heads        = VHD_CHS_MAX_H;
966         secs_per_cyl = VHD_CHS_MAX_S;
967     } else {
968         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
969         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
970             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
971         }
972     }
973 
974     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
975         total_sectors = total_size / BDRV_SECTOR_SIZE;
976         /* Allow a maximum disk size of 2040 GiB */
977         if (total_sectors > VHD_MAX_SECTORS) {
978             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
979             ret = -EFBIG;
980             goto out;
981         }
982     } else {
983         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
984         total_size = total_sectors * BDRV_SECTOR_SIZE;
985     }
986 
987     /* Prepare the Hard Disk Footer */
988     memset(buf, 0, 1024);
989 
990     memcpy(footer->creator, "conectix", 8);
991     if (force_size) {
992         memcpy(footer->creator_app, "qem2", 4);
993     } else {
994         memcpy(footer->creator_app, "qemu", 4);
995     }
996     memcpy(footer->creator_os, "Wi2k", 4);
997 
998     footer->features = cpu_to_be32(0x02);
999     footer->version = cpu_to_be32(0x00010000);
1000     if (disk_type == VHD_DYNAMIC) {
1001         footer->data_offset = cpu_to_be64(HEADER_SIZE);
1002     } else {
1003         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1004     }
1005     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1006 
1007     /* Version of Virtual PC 2007 */
1008     footer->major = cpu_to_be16(0x0005);
1009     footer->minor = cpu_to_be16(0x0003);
1010     footer->orig_size = cpu_to_be64(total_size);
1011     footer->current_size = cpu_to_be64(total_size);
1012     footer->cyls = cpu_to_be16(cyls);
1013     footer->heads = heads;
1014     footer->secs_per_cyl = secs_per_cyl;
1015 
1016     footer->type = cpu_to_be32(disk_type);
1017 
1018     qemu_uuid_generate(&footer->uuid);
1019 
1020     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
1021 
1022     if (disk_type == VHD_DYNAMIC) {
1023         ret = create_dynamic_disk(blk, buf, total_sectors);
1024         if (ret < 0) {
1025             error_setg(errp, "Unable to create or write VHD header");
1026         }
1027     } else {
1028         ret = create_fixed_disk(blk, buf, total_size, errp);
1029     }
1030 
1031 out:
1032     blk_unref(blk);
1033     g_free(disk_type_param);
1034     return ret;
1035 }
1036 
1037 static int vpc_has_zero_init(BlockDriverState *bs)
1038 {
1039     BDRVVPCState *s = bs->opaque;
1040     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1041 
1042     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1043         return bdrv_has_zero_init(bs->file->bs);
1044     } else {
1045         return 1;
1046     }
1047 }
1048 
1049 static void vpc_close(BlockDriverState *bs)
1050 {
1051     BDRVVPCState *s = bs->opaque;
1052     qemu_vfree(s->pagetable);
1053 #ifdef CACHE
1054     g_free(s->pageentry_u8);
1055 #endif
1056 
1057     migrate_del_blocker(s->migration_blocker);
1058     error_free(s->migration_blocker);
1059 }
1060 
1061 static QemuOptsList vpc_create_opts = {
1062     .name = "vpc-create-opts",
1063     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1064     .desc = {
1065         {
1066             .name = BLOCK_OPT_SIZE,
1067             .type = QEMU_OPT_SIZE,
1068             .help = "Virtual disk size"
1069         },
1070         {
1071             .name = BLOCK_OPT_SUBFMT,
1072             .type = QEMU_OPT_STRING,
1073             .help =
1074                 "Type of virtual hard disk format. Supported formats are "
1075                 "{dynamic (default) | fixed} "
1076         },
1077         {
1078             .name = VPC_OPT_FORCE_SIZE,
1079             .type = QEMU_OPT_BOOL,
1080             .help = "Force disk size calculation to use the actual size "
1081                     "specified, rather than using the nearest CHS-based "
1082                     "calculation"
1083         },
1084         { /* end of list */ }
1085     }
1086 };
1087 
1088 static BlockDriver bdrv_vpc = {
1089     .format_name    = "vpc",
1090     .instance_size  = sizeof(BDRVVPCState),
1091 
1092     .bdrv_probe             = vpc_probe,
1093     .bdrv_open              = vpc_open,
1094     .bdrv_close             = vpc_close,
1095     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1096     .bdrv_child_perm        = bdrv_format_default_perms,
1097     .bdrv_create            = vpc_create,
1098 
1099     .bdrv_co_preadv             = vpc_co_preadv,
1100     .bdrv_co_pwritev            = vpc_co_pwritev,
1101     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1102 
1103     .bdrv_get_info          = vpc_get_info,
1104 
1105     .create_opts            = &vpc_create_opts,
1106     .bdrv_has_zero_init     = vpc_has_zero_init,
1107 };
1108 
/* Register the vpc block driver (bdrv_vpc) via bdrv_register(). */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/*
 * NOTE(review): block_init() presumably arranges for bdrv_vpc_init() to run
 * during block-layer module initialisation -- confirm in qemu/module.h.
 */
block_init(bdrv_vpc_init);
1115