xref: /openbmc/qemu/block/vpc.c (revision 12a6c15e)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/blocker.h"
32 #include "qemu/bswap.h"
33 #include "qemu/uuid.h"
34 
35 /**************************************************************/
36 
/* Size in bytes of the buffers used to read and write the VHD footer */
#define HEADER_SIZE 512

//#define CACHE

/* Disk type codes; compared against the footer's 'type' field */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};

/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC), i.e. the epoch that VHD
 * timestamps ("seconds since Jan 1, 2000") are counted from.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder / head / sector values used for the CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count of the largest geometry (C * H * S at their maxima) */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the image creation option read by vpc_create() */
#define VPC_OPT_FORCE_SIZE "force_size"
58 
/*
 * On-disk VHD footer.  All multi-byte fields are always big-endian and are
 * converted with be*_to_cpu()/cpu_to_be*() when accessed.  vpc_open() looks
 * for it at offset 0 first and, failing that, in the last HEADER_SIZE bytes
 * of the file (fixed disks).
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /* Selects CHS vs. current_size based sizing in vpc_open() */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Size in bytes; vpc_open() divides it by BDRV_SECTOR_SIZE */
    uint64_t    current_size;

    /* Disk geometry; cyls * heads * secs_per_cyl gives the sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
94 
/*
 * On-disk header of a dynamic disk.  vpc_open() reads it from the offset
 * given by the footer's data_offset field and converts its fields with
 * be*_to_cpu(); create_dynamic_disk() writes it at offset 512.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* Not read or written by the code visible in this file section */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
126 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Serializes the dynamic-disk read, write and block-status paths */
    CoMutex lock;
    /* Raw (big-endian) footer as read by vpc_open(); see VHDFooter */
    uint8_t footer_buf[HEADER_SIZE];
    /* Byte offset at which the next block is allocated; the footer is
     * rewritten at this offset after each allocation */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /* In-memory BAT in host byte order.  Each entry is the position of a
     * block's bitmap in units of 512 bytes; 0xFFFFFFFF means unallocated */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently written by
     * get_image_offset(); (uint64_t)-1 after open */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block (power of two) */
    uint32_t block_size;
    /* Size in bytes of the bitmap that precedes each data block */
    uint32_t bitmap_size;
    /* Overrides set from the "force_size_calc" runtime option */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Reason registered with migrate_add_blocker() in vpc_open() */
    Error *migration_blocker;
} BDRVVPCState;
151 
/* Name of the runtime option that selects how the disk size is derived */
#define VPC_OPT_SIZE_CALC "force_size_calc"

/*
 * Runtime options accepted when opening a vpc image.  vpc_open() absorbs
 * them from the options QDict and vpc_parse_options() interprets them.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            /* Accepted values are "chs" and "current_size" */
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
167 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * 32-bit sum of all its bytes.
 *
 * @buf:  bytes to sum (not modified)
 * @size: number of bytes in @buf; 0 yields 0xFFFFFFFF
 *
 * The caller is responsible for zeroing the stored checksum field before
 * calling this on a footer or header.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t res = 0;
    size_t i;

    /* size_t index: avoids the signed/unsigned comparison against size */
    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
178 
179 
/*
 * Format probe: scores a buffer holding the start of an image file.
 *
 * Returns 100 when at least 8 bytes are available and they are exactly the
 * footer creator cookie "conectix", 0 otherwise.  @filename is not used.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char cookie[] = "conectix";

    if (buf_size < 8) {
        return 0;
    }

    /* The cookie has no NUL in its first 8 bytes, so a byte-wise compare
     * gives the same answer as an 8-character strncmp(). */
    return memcmp(buf, cookie, 8) == 0 ? 100 : 0;
}
186 
187 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
188                               Error **errp)
189 {
190     BDRVVPCState *s = bs->opaque;
191     const char *size_calc;
192 
193     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
194 
195     if (!size_calc) {
196        /* no override, use autodetect only */
197     } else if (!strcmp(size_calc, "current_size")) {
198         s->force_use_sz = true;
199     } else if (!strcmp(size_calc, "chs")) {
200         s->force_use_chs = true;
201     } else {
202         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
203     }
204 }
205 
206 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
207                     Error **errp)
208 {
209     BDRVVPCState *s = bs->opaque;
210     int i;
211     VHDFooter *footer;
212     VHDDynDiskHeader *dyndisk_header;
213     QemuOpts *opts = NULL;
214     Error *local_err = NULL;
215     bool use_chs;
216     uint8_t buf[HEADER_SIZE];
217     uint32_t checksum;
218     uint64_t computed_size;
219     uint64_t pagetable_size;
220     int disk_type = VHD_DYNAMIC;
221     int ret;
222     int64_t bs_size;
223 
224     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
225                                false, errp);
226     if (!bs->file) {
227         return -EINVAL;
228     }
229 
230     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
231     qemu_opts_absorb_qdict(opts, options, &local_err);
232     if (local_err) {
233         error_propagate(errp, local_err);
234         ret = -EINVAL;
235         goto fail;
236     }
237 
238     vpc_parse_options(bs, opts, &local_err);
239     if (local_err) {
240         error_propagate(errp, local_err);
241         ret = -EINVAL;
242         goto fail;
243     }
244 
245     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
246     if (ret < 0) {
247         error_setg(errp, "Unable to read VHD header");
248         goto fail;
249     }
250 
251     footer = (VHDFooter *) s->footer_buf;
252     if (strncmp(footer->creator, "conectix", 8)) {
253         int64_t offset = bdrv_getlength(bs->file->bs);
254         if (offset < 0) {
255             ret = offset;
256             error_setg(errp, "Invalid file size");
257             goto fail;
258         } else if (offset < HEADER_SIZE) {
259             ret = -EINVAL;
260             error_setg(errp, "File too small for a VHD header");
261             goto fail;
262         }
263 
264         /* If a fixed disk, the footer is found only at the end of the file */
265         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
266                          HEADER_SIZE);
267         if (ret < 0) {
268             goto fail;
269         }
270         if (strncmp(footer->creator, "conectix", 8)) {
271             error_setg(errp, "invalid VPC image");
272             ret = -EINVAL;
273             goto fail;
274         }
275         disk_type = VHD_FIXED;
276     }
277 
278     checksum = be32_to_cpu(footer->checksum);
279     footer->checksum = 0;
280     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
281         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
282             "incorrect.\n", bs->filename);
283 
284     /* Write 'checksum' back to footer, or else will leave it with zero. */
285     footer->checksum = cpu_to_be32(checksum);
286 
287     /* The visible size of a image in Virtual PC depends on the geometry
288        rather than on the size stored in the footer (the size in the footer
289        is too large usually) */
290     bs->total_sectors = (int64_t)
291         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
292 
293     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
294      * VHD image sizes differently.  VPC will rely on CHS geometry,
295      * while Hyper-V and disk2vhd use the size specified in the footer.
296      *
297      * We use a couple of approaches to try and determine the correct method:
298      * look at the Creator App field, and look for images that have CHS
299      * geometry that is the maximum value.
300      *
301      * If the CHS geometry is the maximum CHS geometry, then we assume that
302      * the size is the footer->current_size to avoid truncation.  Otherwise,
303      * we follow the table based on footer->creator_app:
304      *
305      *  Known creator apps:
306      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
307      *      'qemu'  :  CHS              QEMU (uses disk geometry)
308      *      'qem2'  :  current_size     QEMU (uses current_size)
309      *      'win '  :  current_size     Hyper-V
310      *      'd2v '  :  current_size     Disk2vhd
311      *      'tap\0' :  current_size     XenServer
312      *      'CTXS'  :  current_size     XenConverter
313      *
314      *  The user can override the table values via drive options, however
315      *  even with an override we will still use current_size for images
316      *  that have CHS geometry of the maximum size.
317      */
318     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
319                !!strncmp(footer->creator_app, "qem2", 4) &&
320                !!strncmp(footer->creator_app, "d2v ", 4) &&
321                !!strncmp(footer->creator_app, "CTXS", 4) &&
322                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
323 
324     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
325         bs->total_sectors = be64_to_cpu(footer->current_size) /
326                                         BDRV_SECTOR_SIZE;
327     }
328 
329     /* Allow a maximum disk size of 2040 GiB */
330     if (bs->total_sectors > VHD_MAX_SECTORS) {
331         ret = -EFBIG;
332         goto fail;
333     }
334 
335     if (disk_type == VHD_DYNAMIC) {
336         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
337                          HEADER_SIZE);
338         if (ret < 0) {
339             error_setg(errp, "Error reading dynamic VHD header");
340             goto fail;
341         }
342 
343         dyndisk_header = (VHDDynDiskHeader *) buf;
344 
345         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
346             error_setg(errp, "Invalid header magic");
347             ret = -EINVAL;
348             goto fail;
349         }
350 
351         s->block_size = be32_to_cpu(dyndisk_header->block_size);
352         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
353             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
354             ret = -EINVAL;
355             goto fail;
356         }
357         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
358 
359         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
360 
361         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
362             error_setg(errp, "Too many blocks");
363             ret = -EINVAL;
364             goto fail;
365         }
366 
367         computed_size = (uint64_t) s->max_table_entries * s->block_size;
368         if (computed_size < bs->total_sectors * 512) {
369             error_setg(errp, "Page table too small");
370             ret = -EINVAL;
371             goto fail;
372         }
373 
374         if (s->max_table_entries > SIZE_MAX / 4 ||
375             s->max_table_entries > (int) INT_MAX / 4) {
376             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
377                         s->max_table_entries);
378             ret = -EINVAL;
379             goto fail;
380         }
381 
382         pagetable_size = (uint64_t) s->max_table_entries * 4;
383 
384         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
385         if (s->pagetable == NULL) {
386             error_setg(errp, "Unable to allocate memory for page table");
387             ret = -ENOMEM;
388             goto fail;
389         }
390 
391         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
392 
393         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
394                          pagetable_size);
395         if (ret < 0) {
396             error_setg(errp, "Error reading pagetable");
397             goto fail;
398         }
399 
400         s->free_data_block_offset =
401             ROUND_UP(s->bat_offset + pagetable_size, 512);
402 
403         for (i = 0; i < s->max_table_entries; i++) {
404             be32_to_cpus(&s->pagetable[i]);
405             if (s->pagetable[i] != 0xFFFFFFFF) {
406                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
407                     s->bitmap_size + s->block_size;
408 
409                 if (next > s->free_data_block_offset) {
410                     s->free_data_block_offset = next;
411                 }
412             }
413         }
414 
415         bs_size = bdrv_getlength(bs->file->bs);
416         if (bs_size < 0) {
417             error_setg_errno(errp, -bs_size, "Unable to learn image size");
418             ret = bs_size;
419             goto fail;
420         }
421         if (s->free_data_block_offset > bs_size) {
422             error_setg(errp, "block-vpc: free_data_block_offset points after "
423                              "the end of file. The image has been truncated.");
424             ret = -EINVAL;
425             goto fail;
426         }
427 
428         s->last_bitmap_offset = (int64_t) -1;
429 
430 #ifdef CACHE
431         s->pageentry_u8 = g_malloc(512);
432         s->pageentry_u32 = s->pageentry_u8;
433         s->pageentry_u16 = s->pageentry_u8;
434         s->last_pagetable = -1;
435 #endif
436     }
437 
438     /* Disable migration when VHD images are used */
439     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
440                "does not support live migration",
441                bdrv_get_device_or_node_name(bs));
442     ret = migrate_add_blocker(s->migration_blocker, &local_err);
443     if (local_err) {
444         error_propagate(errp, local_err);
445         error_free(s->migration_blocker);
446         goto fail;
447     }
448 
449     qemu_co_mutex_init(&s->lock);
450 
451     return 0;
452 
453 fail:
454     qemu_vfree(s->pagetable);
455 #ifdef CACHE
456     g_free(s->pageentry_u8);
457 #endif
458     return ret;
459 }
460 
/*
 * Reopen-prepare callback.  Performs no checks and stages no state: every
 * reopen request is accepted by returning 0.
 */
static int vpc_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
466 
467 /*
468  * Returns the absolute byte offset of the given sector in the image file.
469  * If the sector is not allocated, -1 is returned instead.
470  * If an error occurred trying to write an updated block bitmap back to
471  * the file, -2 is returned, and the error value is written to *err.
472  * This can only happen for a write operation.
473  *
474  * The parameter write must be 1 if the offset will be used for a write
475  * operation (the block bitmaps is updated then), 0 otherwise.
476  * If write is true then err must not be NULL.
477  */
478 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
479                                        bool write, int *err)
480 {
481     BDRVVPCState *s = bs->opaque;
482     uint64_t bitmap_offset, block_offset;
483     uint32_t pagetable_index, offset_in_block;
484 
485     assert(!(write && err == NULL));
486 
487     pagetable_index = offset / s->block_size;
488     offset_in_block = offset % s->block_size;
489 
490     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
491         return -1; /* not allocated */
492 
493     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
494     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
495 
496     /* We must ensure that we don't write to any sectors which are marked as
497        unused in the bitmap. We get away with setting all bits in the block
498        bitmap each time we write to a new block. This might cause Virtual PC to
499        miss sparse read optimization, but it's not a problem in terms of
500        correctness. */
501     if (write && (s->last_bitmap_offset != bitmap_offset)) {
502         uint8_t bitmap[s->bitmap_size];
503         int r;
504 
505         s->last_bitmap_offset = bitmap_offset;
506         memset(bitmap, 0xff, s->bitmap_size);
507         r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
508         if (r < 0) {
509             *err = r;
510             return -2;
511         }
512     }
513 
514     return block_offset;
515 }
516 
517 /*
518  * Writes the footer to the end of the image file. This is needed when the
519  * file grows as it overwrites the old footer
520  *
521  * Returns 0 on success and < 0 on error
522  */
523 static int rewrite_footer(BlockDriverState* bs)
524 {
525     int ret;
526     BDRVVPCState *s = bs->opaque;
527     int64_t offset = s->free_data_block_offset;
528 
529     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
530     if (ret < 0)
531         return ret;
532 
533     return 0;
534 }
535 
536 /*
537  * Allocates a new block. This involves writing a new footer and updating
538  * the Block Allocation Table to use the space at the old end of the image
539  * file (overwriting the old footer)
540  *
541  * Returns the sectors' offset in the image file on success and < 0 on error
542  */
543 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
544 {
545     BDRVVPCState *s = bs->opaque;
546     int64_t bat_offset;
547     uint32_t index, bat_value;
548     int ret;
549     uint8_t bitmap[s->bitmap_size];
550 
551     /* Check if sector_num is valid */
552     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
553         return -EINVAL;
554     }
555 
556     /* Write entry into in-memory BAT */
557     index = offset / s->block_size;
558     assert(s->pagetable[index] == 0xFFFFFFFF);
559     s->pagetable[index] = s->free_data_block_offset / 512;
560 
561     /* Initialize the block's bitmap */
562     memset(bitmap, 0xff, s->bitmap_size);
563     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
564         s->bitmap_size);
565     if (ret < 0) {
566         return ret;
567     }
568 
569     /* Write new footer (the old one will be overwritten) */
570     s->free_data_block_offset += s->block_size + s->bitmap_size;
571     ret = rewrite_footer(bs);
572     if (ret < 0)
573         goto fail;
574 
575     /* Write BAT entry to disk */
576     bat_offset = s->bat_offset + (4 * index);
577     bat_value = cpu_to_be32(s->pagetable[index]);
578     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
579     if (ret < 0)
580         goto fail;
581 
582     return get_image_offset(bs, offset, false, NULL);
583 
584 fail:
585     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
586     return ret;
587 }
588 
589 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
590 {
591     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
592     VHDFooter *footer = (VHDFooter *) s->footer_buf;
593 
594     if (be32_to_cpu(footer->type) != VHD_FIXED) {
595         bdi->cluster_size = s->block_size;
596     }
597 
598     bdi->unallocated_blocks_are_zero = true;
599     return 0;
600 }
601 
602 static int coroutine_fn
603 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
604               QEMUIOVector *qiov, int flags)
605 {
606     BDRVVPCState *s = bs->opaque;
607     int ret;
608     int64_t image_offset;
609     int64_t n_bytes;
610     int64_t bytes_done = 0;
611     VHDFooter *footer = (VHDFooter *) s->footer_buf;
612     QEMUIOVector local_qiov;
613 
614     if (be32_to_cpu(footer->type) == VHD_FIXED) {
615         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
616     }
617 
618     qemu_co_mutex_lock(&s->lock);
619     qemu_iovec_init(&local_qiov, qiov->niov);
620 
621     while (bytes > 0) {
622         image_offset = get_image_offset(bs, offset, false, NULL);
623         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
624 
625         if (image_offset == -1) {
626             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
627         } else {
628             qemu_iovec_reset(&local_qiov);
629             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
630 
631             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
632                                  &local_qiov, 0);
633             if (ret < 0) {
634                 goto fail;
635             }
636         }
637 
638         bytes -= n_bytes;
639         offset += n_bytes;
640         bytes_done += n_bytes;
641     }
642 
643     ret = 0;
644 fail:
645     qemu_iovec_destroy(&local_qiov);
646     qemu_co_mutex_unlock(&s->lock);
647 
648     return ret;
649 }
650 
651 static int coroutine_fn
652 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
653                QEMUIOVector *qiov, int flags)
654 {
655     BDRVVPCState *s = bs->opaque;
656     int64_t image_offset;
657     int64_t n_bytes;
658     int64_t bytes_done = 0;
659     int ret = 0;
660     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
661     QEMUIOVector local_qiov;
662 
663     if (be32_to_cpu(footer->type) == VHD_FIXED) {
664         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
665     }
666 
667     qemu_co_mutex_lock(&s->lock);
668     qemu_iovec_init(&local_qiov, qiov->niov);
669 
670     while (bytes > 0) {
671         image_offset = get_image_offset(bs, offset, true, &ret);
672         if (image_offset == -2) {
673             /* Failed to write block bitmap: can't proceed with write */
674             goto fail;
675         }
676         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
677 
678         if (image_offset == -1) {
679             image_offset = alloc_block(bs, offset);
680             if (image_offset < 0) {
681                 ret = image_offset;
682                 goto fail;
683             }
684         }
685 
686         qemu_iovec_reset(&local_qiov);
687         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
688 
689         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
690                               &local_qiov, 0);
691         if (ret < 0) {
692             goto fail;
693         }
694 
695         bytes -= n_bytes;
696         offset += n_bytes;
697         bytes_done += n_bytes;
698     }
699 
700     ret = 0;
701 fail:
702     qemu_iovec_destroy(&local_qiov);
703     qemu_co_mutex_unlock(&s->lock);
704 
705     return ret;
706 }
707 
708 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
709         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
710 {
711     BDRVVPCState *s = bs->opaque;
712     VHDFooter *footer = (VHDFooter*) s->footer_buf;
713     int64_t start, offset;
714     bool allocated;
715     int64_t ret;
716     int n;
717 
718     if (be32_to_cpu(footer->type) == VHD_FIXED) {
719         *pnum = nb_sectors;
720         *file = bs->file->bs;
721         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
722                (sector_num << BDRV_SECTOR_BITS);
723     }
724 
725     qemu_co_mutex_lock(&s->lock);
726 
727     offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false, NULL);
728     start = offset;
729     allocated = (offset != -1);
730     *pnum = 0;
731     ret = 0;
732 
733     do {
734         /* All sectors in a block are contiguous (without using the bitmap) */
735         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
736           - sector_num;
737         n = MIN(n, nb_sectors);
738 
739         *pnum += n;
740         sector_num += n;
741         nb_sectors -= n;
742         /* *pnum can't be greater than one block for allocated
743          * sectors since there is always a bitmap in between. */
744         if (allocated) {
745             *file = bs->file->bs;
746             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
747             break;
748         }
749         if (nb_sectors == 0) {
750             break;
751         }
752         offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false,
753                                   NULL);
754     } while (offset == -1);
755 
756     qemu_co_mutex_unlock(&s->lock);
757     return ret;
758 }
759 
760 /*
761  * Calculates the number of cylinders, heads and sectors per cylinder
762  * based on a given number of sectors. This is the algorithm described
763  * in the VHD specification.
764  *
765  * Note that the geometry doesn't always exactly match total_sectors but
766  * may round it down.
767  *
768  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
769  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
770  * and instead allow up to 255 heads.
771  */
772 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
773     uint8_t* heads, uint8_t* secs_per_cyl)
774 {
775     uint32_t cyls_times_heads;
776 
777     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
778 
779     if (total_sectors >= 65535LL * 16 * 63) {
780         *secs_per_cyl = 255;
781         *heads = 16;
782         cyls_times_heads = total_sectors / *secs_per_cyl;
783     } else {
784         *secs_per_cyl = 17;
785         cyls_times_heads = total_sectors / *secs_per_cyl;
786         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
787 
788         if (*heads < 4) {
789             *heads = 4;
790         }
791 
792         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
793             *secs_per_cyl = 31;
794             *heads = 16;
795             cyls_times_heads = total_sectors / *secs_per_cyl;
796         }
797 
798         if (cyls_times_heads >= (*heads * 1024)) {
799             *secs_per_cyl = 63;
800             *heads = 16;
801             cyls_times_heads = total_sectors / *secs_per_cyl;
802         }
803     }
804 
805     *cyls = cyls_times_heads / *heads;
806 
807     return 0;
808 }
809 
810 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
811                                int64_t total_sectors)
812 {
813     VHDDynDiskHeader *dyndisk_header =
814         (VHDDynDiskHeader *) buf;
815     size_t block_size, num_bat_entries;
816     int i;
817     int ret;
818     int64_t offset = 0;
819 
820     /* Write the footer (twice: at the beginning and at the end) */
821     block_size = 0x200000;
822     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
823 
824     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
825     if (ret < 0) {
826         goto fail;
827     }
828 
829     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
830     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
831     if (ret < 0) {
832         goto fail;
833     }
834 
835     /* Write the initial BAT */
836     offset = 3 * 512;
837 
838     memset(buf, 0xFF, 512);
839     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
840         ret = blk_pwrite(blk, offset, buf, 512, 0);
841         if (ret < 0) {
842             goto fail;
843         }
844         offset += 512;
845     }
846 
847     /* Prepare the Dynamic Disk Header */
848     memset(buf, 0, 1024);
849 
850     memcpy(dyndisk_header->magic, "cxsparse", 8);
851 
852     /*
853      * Note: The spec is actually wrong here for data_offset, it says
854      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
855      */
856     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
857     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
858     dyndisk_header->version = cpu_to_be32(0x00010000);
859     dyndisk_header->block_size = cpu_to_be32(block_size);
860     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
861 
862     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
863 
864     /* Write the header */
865     offset = 512;
866 
867     ret = blk_pwrite(blk, offset, buf, 1024, 0);
868     if (ret < 0) {
869         goto fail;
870     }
871 
872  fail:
873     return ret;
874 }
875 
876 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
877                              int64_t total_size, Error **errp)
878 {
879     int ret;
880 
881     /* Add footer to total size */
882     total_size += HEADER_SIZE;
883 
884     ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
885     if (ret < 0) {
886         return ret;
887     }
888 
889     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
890     if (ret < 0) {
891         error_setg_errno(errp, -ret, "Unable to write VHD header");
892         return ret;
893     }
894 
895     return ret;
896 }
897 
898 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
899 {
900     uint8_t buf[1024];
901     VHDFooter *footer = (VHDFooter *) buf;
902     char *disk_type_param;
903     int i;
904     uint16_t cyls = 0;
905     uint8_t heads = 0;
906     uint8_t secs_per_cyl = 0;
907     int64_t total_sectors;
908     int64_t total_size;
909     int disk_type;
910     int ret = -EIO;
911     bool force_size;
912     Error *local_err = NULL;
913     BlockBackend *blk = NULL;
914 
915     /* Read out options */
916     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
917                           BDRV_SECTOR_SIZE);
918     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
919     if (disk_type_param) {
920         if (!strcmp(disk_type_param, "dynamic")) {
921             disk_type = VHD_DYNAMIC;
922         } else if (!strcmp(disk_type_param, "fixed")) {
923             disk_type = VHD_FIXED;
924         } else {
925             error_setg(errp, "Invalid disk type, %s", disk_type_param);
926             ret = -EINVAL;
927             goto out;
928         }
929     } else {
930         disk_type = VHD_DYNAMIC;
931     }
932 
933     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
934 
935     ret = bdrv_create_file(filename, opts, &local_err);
936     if (ret < 0) {
937         error_propagate(errp, local_err);
938         goto out;
939     }
940 
941     blk = blk_new_open(filename, NULL, NULL,
942                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
943                        &local_err);
944     if (blk == NULL) {
945         error_propagate(errp, local_err);
946         ret = -EIO;
947         goto out;
948     }
949 
950     blk_set_allow_write_beyond_eof(blk, true);
951 
952     /*
953      * Calculate matching total_size and geometry. Increase the number of
954      * sectors requested until we get enough (or fail). This ensures that
955      * qemu-img convert doesn't truncate images, but rather rounds up.
956      *
957      * If the image size can't be represented by a spec conformant CHS geometry,
958      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
959      * the image size from the VHD footer to calculate total_sectors.
960      */
961     if (force_size) {
962         /* This will force the use of total_size for sector count, below */
963         cyls         = VHD_CHS_MAX_C;
964         heads        = VHD_CHS_MAX_H;
965         secs_per_cyl = VHD_CHS_MAX_S;
966     } else {
967         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
968         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
969             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
970         }
971     }
972 
973     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
974         total_sectors = total_size / BDRV_SECTOR_SIZE;
975         /* Allow a maximum disk size of 2040 GiB */
976         if (total_sectors > VHD_MAX_SECTORS) {
977             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
978             ret = -EFBIG;
979             goto out;
980         }
981     } else {
982         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
983         total_size = total_sectors * BDRV_SECTOR_SIZE;
984     }
985 
986     /* Prepare the Hard Disk Footer */
987     memset(buf, 0, 1024);
988 
989     memcpy(footer->creator, "conectix", 8);
990     if (force_size) {
991         memcpy(footer->creator_app, "qem2", 4);
992     } else {
993         memcpy(footer->creator_app, "qemu", 4);
994     }
995     memcpy(footer->creator_os, "Wi2k", 4);
996 
997     footer->features = cpu_to_be32(0x02);
998     footer->version = cpu_to_be32(0x00010000);
999     if (disk_type == VHD_DYNAMIC) {
1000         footer->data_offset = cpu_to_be64(HEADER_SIZE);
1001     } else {
1002         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1003     }
1004     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1005 
1006     /* Version of Virtual PC 2007 */
1007     footer->major = cpu_to_be16(0x0005);
1008     footer->minor = cpu_to_be16(0x0003);
1009     footer->orig_size = cpu_to_be64(total_size);
1010     footer->current_size = cpu_to_be64(total_size);
1011     footer->cyls = cpu_to_be16(cyls);
1012     footer->heads = heads;
1013     footer->secs_per_cyl = secs_per_cyl;
1014 
1015     footer->type = cpu_to_be32(disk_type);
1016 
1017     qemu_uuid_generate(&footer->uuid);
1018 
1019     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
1020 
1021     if (disk_type == VHD_DYNAMIC) {
1022         ret = create_dynamic_disk(blk, buf, total_sectors);
1023         if (ret < 0) {
1024             error_setg(errp, "Unable to create or write VHD header");
1025         }
1026     } else {
1027         ret = create_fixed_disk(blk, buf, total_size, errp);
1028     }
1029 
1030 out:
1031     blk_unref(blk);
1032     g_free(disk_type_param);
1033     return ret;
1034 }
1035 
1036 static int vpc_has_zero_init(BlockDriverState *bs)
1037 {
1038     BDRVVPCState *s = bs->opaque;
1039     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1040 
1041     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1042         return bdrv_has_zero_init(bs->file->bs);
1043     } else {
1044         return 1;
1045     }
1046 }
1047 
1048 static void vpc_close(BlockDriverState *bs)
1049 {
1050     BDRVVPCState *s = bs->opaque;
1051     qemu_vfree(s->pagetable);
1052 #ifdef CACHE
1053     g_free(s->pageentry_u8);
1054 #endif
1055 
1056     migrate_del_blocker(s->migration_blocker);
1057     error_free(s->migration_blocker);
1058 }
1059 
1060 static QemuOptsList vpc_create_opts = {
1061     .name = "vpc-create-opts",
1062     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1063     .desc = {
1064         {
1065             .name = BLOCK_OPT_SIZE,
1066             .type = QEMU_OPT_SIZE,
1067             .help = "Virtual disk size"
1068         },
1069         {
1070             .name = BLOCK_OPT_SUBFMT,
1071             .type = QEMU_OPT_STRING,
1072             .help =
1073                 "Type of virtual hard disk format. Supported formats are "
1074                 "{dynamic (default) | fixed} "
1075         },
1076         {
1077             .name = VPC_OPT_FORCE_SIZE,
1078             .type = QEMU_OPT_BOOL,
1079             .help = "Force disk size calculation to use the actual size "
1080                     "specified, rather than using the nearest CHS-based "
1081                     "calculation"
1082         },
1083         { /* end of list */ }
1084     }
1085 };
1086 
1087 static BlockDriver bdrv_vpc = {
1088     .format_name    = "vpc",
1089     .instance_size  = sizeof(BDRVVPCState),
1090 
1091     .bdrv_probe             = vpc_probe,
1092     .bdrv_open              = vpc_open,
1093     .bdrv_close             = vpc_close,
1094     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1095     .bdrv_child_perm        = bdrv_format_default_perms,
1096     .bdrv_create            = vpc_create,
1097 
1098     .bdrv_co_preadv             = vpc_co_preadv,
1099     .bdrv_co_pwritev            = vpc_co_pwritev,
1100     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1101 
1102     .bdrv_get_info          = vpc_get_info,
1103 
1104     .create_opts            = &vpc_create_opts,
1105     .bdrv_has_zero_init     = vpc_has_zero_init,
1106 };
1107 
1108 static void bdrv_vpc_init(void)
1109 {
1110     bdrv_register(&bdrv_vpc);
1111 }
1112 
1113 block_init(bdrv_vpc_init);
1114