xref: /openbmc/qemu/block/vpc.c (revision 4b4629d9)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/migration.h"
32 #if defined(CONFIG_UUID)
33 #include <uuid/uuid.h>
34 #endif
35 
36 /**************************************************************/
37 
/*
 * Size in bytes of the VHD footer as read and written by this driver; also
 * the size of the scratch buffer used to read the dynamic disk header.
 */
#define HEADER_SIZE 512

/*
 * Build-time switch guarding the page entry cache members of BDRVVPCState
 * and their setup in vpc_open(); disabled by default.
 */
//#define CACHE
41 
/*
 * Disk type values stored (big-endian) in the 'type' field of the VHD
 * footer.
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};
47 
/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC). VHD timestamps count seconds from
 * this instant, so it is subtracted from time() when a footer is created.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder/head/sector values this driver stores in a footer */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count described by the maximum CHS geometry above */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the image creation option that forces the requested size */
#define VPC_OPT_FORCE_SIZE "force_size"
59 
/*
 * VHD hard disk footer. The driver overlays this structure on the start of
 * a HEADER_SIZE byte buffer read from the image.
 *
 * All multi-byte fields are always big-endian.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    /* Image sizes in bytes */
    uint64_t    orig_size;
    uint64_t    current_size;

    /* Disk geometry; cyls * heads * secs_per_cyl gives the sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    uint8_t     uuid[16];

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
95 
/*
 * Dynamic disk header of a VHD image. vpc_open() reads it from the offset
 * given by the footer's data_offset field; fields are converted from
 * big-endian on access.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Computed with vpc_checksum() over the 1024 byte header on creation */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* Parent locator entries; not interpreted by the code shown here */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
127 
/* Per-image driver state, reached through bs->opaque. */
typedef struct BDRVVPCState {
    /* Serializes dynamic-disk reads and writes */
    CoMutex lock;
    /* Raw footer as read from the image (fields still big-endian) */
    uint8_t footer_buf[HEADER_SIZE];
    /* Byte offset where the next data block (and then the footer) goes */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /* In-memory BAT in host byte order; 0xFFFFFFFF marks an unallocated block */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently filled by get_image_offset() */
    uint64_t last_bitmap_offset;

    /* Data block size in bytes, taken from the dynamic disk header */
    uint32_t block_size;
    /* Size in bytes of the bitmap that precedes each data block */
    uint32_t bitmap_size;
    /* Size calculation overrides set by vpc_parse_options() */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Blocker registered in vpc_open() to prevent live migration */
    Error *migration_blocker;
} BDRVVPCState;
152 
/* Name of the runtime option selecting how the virtual disk size is derived */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime options accepted by vpc_open(). The single string option is
 * interpreted by vpc_parse_options(), which accepts "chs" or "current_size".
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
168 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the 32-bit
 * sum of all of its bytes.
 *
 * @buf:  bytes to sum (not modified)
 * @size: number of bytes in @buf
 *
 * The caller is responsible for zeroing the checksum field inside @buf
 * beforehand when checksumming a footer or header.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t res = 0;
    size_t i;

    /* size_t index: matches the type of 'size', no signed/unsigned mix */
    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
179 
180 
/*
 * Format probe: scores 100 when the buffer starts with the 8-byte
 * "conectix" footer cookie, 0 otherwise. The filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char cookie[] = "conectix";
    const int cookie_len = 8;

    if (buf_size < cookie_len) {
        return 0;
    }

    return memcmp(buf, cookie, cookie_len) == 0 ? 100 : 0;
}
187 
188 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
189                               Error **errp)
190 {
191     BDRVVPCState *s = bs->opaque;
192     const char *size_calc;
193 
194     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
195 
196     if (!size_calc) {
197        /* no override, use autodetect only */
198     } else if (!strcmp(size_calc, "current_size")) {
199         s->force_use_sz = true;
200     } else if (!strcmp(size_calc, "chs")) {
201         s->force_use_chs = true;
202     } else {
203         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
204     }
205 }
206 
207 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
208                     Error **errp)
209 {
210     BDRVVPCState *s = bs->opaque;
211     int i;
212     VHDFooter *footer;
213     VHDDynDiskHeader *dyndisk_header;
214     QemuOpts *opts = NULL;
215     Error *local_err = NULL;
216     bool use_chs;
217     uint8_t buf[HEADER_SIZE];
218     uint32_t checksum;
219     uint64_t computed_size;
220     uint64_t pagetable_size;
221     int disk_type = VHD_DYNAMIC;
222     int ret;
223 
224     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
225     qemu_opts_absorb_qdict(opts, options, &local_err);
226     if (local_err) {
227         error_propagate(errp, local_err);
228         ret = -EINVAL;
229         goto fail;
230     }
231 
232     vpc_parse_options(bs, opts, &local_err);
233     if (local_err) {
234         error_propagate(errp, local_err);
235         ret = -EINVAL;
236         goto fail;
237     }
238 
239     ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE);
240     if (ret < 0) {
241         error_setg(errp, "Unable to read VHD header");
242         goto fail;
243     }
244 
245     footer = (VHDFooter *) s->footer_buf;
246     if (strncmp(footer->creator, "conectix", 8)) {
247         int64_t offset = bdrv_getlength(bs->file->bs);
248         if (offset < 0) {
249             ret = offset;
250             error_setg(errp, "Invalid file size");
251             goto fail;
252         } else if (offset < HEADER_SIZE) {
253             ret = -EINVAL;
254             error_setg(errp, "File too small for a VHD header");
255             goto fail;
256         }
257 
258         /* If a fixed disk, the footer is found only at the end of the file */
259         ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf,
260                          HEADER_SIZE);
261         if (ret < 0) {
262             goto fail;
263         }
264         if (strncmp(footer->creator, "conectix", 8)) {
265             error_setg(errp, "invalid VPC image");
266             ret = -EINVAL;
267             goto fail;
268         }
269         disk_type = VHD_FIXED;
270     }
271 
272     checksum = be32_to_cpu(footer->checksum);
273     footer->checksum = 0;
274     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
275         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
276             "incorrect.\n", bs->filename);
277 
278     /* Write 'checksum' back to footer, or else will leave it with zero. */
279     footer->checksum = cpu_to_be32(checksum);
280 
281     /* The visible size of a image in Virtual PC depends on the geometry
282        rather than on the size stored in the footer (the size in the footer
283        is too large usually) */
284     bs->total_sectors = (int64_t)
285         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
286 
287     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
288      * VHD image sizes differently.  VPC will rely on CHS geometry,
289      * while Hyper-V and disk2vhd use the size specified in the footer.
290      *
291      * We use a couple of approaches to try and determine the correct method:
292      * look at the Creator App field, and look for images that have CHS
293      * geometry that is the maximum value.
294      *
295      * If the CHS geometry is the maximum CHS geometry, then we assume that
296      * the size is the footer->current_size to avoid truncation.  Otherwise,
297      * we follow the table based on footer->creator_app:
298      *
299      *  Known creator apps:
300      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
301      *      'qemu'  :  CHS              QEMU (uses disk geometry)
302      *      'qem2'  :  current_size     QEMU (uses current_size)
303      *      'win '  :  current_size     Hyper-V
304      *      'd2v '  :  current_size     Disk2vhd
305      *      'tap\0' :  current_size     XenServer
306      *      'CTXS'  :  current_size     XenConverter
307      *
308      *  The user can override the table values via drive options, however
309      *  even with an override we will still use current_size for images
310      *  that have CHS geometry of the maximum size.
311      */
312     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
313                !!strncmp(footer->creator_app, "qem2", 4) &&
314                !!strncmp(footer->creator_app, "d2v ", 4) &&
315                !!strncmp(footer->creator_app, "CTXS", 4) &&
316                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
317 
318     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
319         bs->total_sectors = be64_to_cpu(footer->current_size) /
320                                         BDRV_SECTOR_SIZE;
321     }
322 
323     /* Allow a maximum disk size of 2040 GiB */
324     if (bs->total_sectors > VHD_MAX_SECTORS) {
325         ret = -EFBIG;
326         goto fail;
327     }
328 
329     if (disk_type == VHD_DYNAMIC) {
330         ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf,
331                          HEADER_SIZE);
332         if (ret < 0) {
333             error_setg(errp, "Error reading dynamic VHD header");
334             goto fail;
335         }
336 
337         dyndisk_header = (VHDDynDiskHeader *) buf;
338 
339         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
340             error_setg(errp, "Invalid header magic");
341             ret = -EINVAL;
342             goto fail;
343         }
344 
345         s->block_size = be32_to_cpu(dyndisk_header->block_size);
346         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
347             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
348             ret = -EINVAL;
349             goto fail;
350         }
351         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
352 
353         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
354 
355         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
356             error_setg(errp, "Too many blocks");
357             ret = -EINVAL;
358             goto fail;
359         }
360 
361         computed_size = (uint64_t) s->max_table_entries * s->block_size;
362         if (computed_size < bs->total_sectors * 512) {
363             error_setg(errp, "Page table too small");
364             ret = -EINVAL;
365             goto fail;
366         }
367 
368         if (s->max_table_entries > SIZE_MAX / 4 ||
369             s->max_table_entries > (int) INT_MAX / 4) {
370             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
371                         s->max_table_entries);
372             ret = -EINVAL;
373             goto fail;
374         }
375 
376         pagetable_size = (uint64_t) s->max_table_entries * 4;
377 
378         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
379         if (s->pagetable == NULL) {
380             error_setg(errp, "Unable to allocate memory for page table");
381             ret = -ENOMEM;
382             goto fail;
383         }
384 
385         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
386 
387         ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable,
388                          pagetable_size);
389         if (ret < 0) {
390             error_setg(errp, "Error reading pagetable");
391             goto fail;
392         }
393 
394         s->free_data_block_offset =
395             ROUND_UP(s->bat_offset + pagetable_size, 512);
396 
397         for (i = 0; i < s->max_table_entries; i++) {
398             be32_to_cpus(&s->pagetable[i]);
399             if (s->pagetable[i] != 0xFFFFFFFF) {
400                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
401                     s->bitmap_size + s->block_size;
402 
403                 if (next > s->free_data_block_offset) {
404                     s->free_data_block_offset = next;
405                 }
406             }
407         }
408 
409         if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) {
410             error_setg(errp, "block-vpc: free_data_block_offset points after "
411                              "the end of file. The image has been truncated.");
412             ret = -EINVAL;
413             goto fail;
414         }
415 
416         s->last_bitmap_offset = (int64_t) -1;
417 
418 #ifdef CACHE
419         s->pageentry_u8 = g_malloc(512);
420         s->pageentry_u32 = s->pageentry_u8;
421         s->pageentry_u16 = s->pageentry_u8;
422         s->last_pagetable = -1;
423 #endif
424     }
425 
426     qemu_co_mutex_init(&s->lock);
427 
428     /* Disable migration when VHD images are used */
429     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
430                "does not support live migration",
431                bdrv_get_device_or_node_name(bs));
432     migrate_add_blocker(s->migration_blocker);
433 
434     return 0;
435 
436 fail:
437     qemu_vfree(s->pagetable);
438 #ifdef CACHE
439     g_free(s->pageentry_u8);
440 #endif
441     return ret;
442 }
443 
444 static int vpc_reopen_prepare(BDRVReopenState *state,
445                               BlockReopenQueue *queue, Error **errp)
446 {
447     return 0;
448 }
449 
450 /*
451  * Returns the absolute byte offset of the given sector in the image file.
452  * If the sector is not allocated, -1 is returned instead.
453  *
454  * The parameter write must be 1 if the offset will be used for a write
455  * operation (the block bitmaps is updated then), 0 otherwise.
456  */
457 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
458                                        bool write)
459 {
460     BDRVVPCState *s = bs->opaque;
461     uint64_t bitmap_offset, block_offset;
462     uint32_t pagetable_index, offset_in_block;
463 
464     pagetable_index = offset / s->block_size;
465     offset_in_block = offset % s->block_size;
466 
467     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
468         return -1; /* not allocated */
469 
470     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
471     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
472 
473     /* We must ensure that we don't write to any sectors which are marked as
474        unused in the bitmap. We get away with setting all bits in the block
475        bitmap each time we write to a new block. This might cause Virtual PC to
476        miss sparse read optimization, but it's not a problem in terms of
477        correctness. */
478     if (write && (s->last_bitmap_offset != bitmap_offset)) {
479         uint8_t bitmap[s->bitmap_size];
480 
481         s->last_bitmap_offset = bitmap_offset;
482         memset(bitmap, 0xff, s->bitmap_size);
483         bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size);
484     }
485 
486     return block_offset;
487 }
488 
489 static inline int64_t get_sector_offset(BlockDriverState *bs,
490                                         int64_t sector_num, bool write)
491 {
492     return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
493 }
494 
495 /*
496  * Writes the footer to the end of the image file. This is needed when the
497  * file grows as it overwrites the old footer
498  *
499  * Returns 0 on success and < 0 on error
500  */
501 static int rewrite_footer(BlockDriverState* bs)
502 {
503     int ret;
504     BDRVVPCState *s = bs->opaque;
505     int64_t offset = s->free_data_block_offset;
506 
507     ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE);
508     if (ret < 0)
509         return ret;
510 
511     return 0;
512 }
513 
514 /*
515  * Allocates a new block. This involves writing a new footer and updating
516  * the Block Allocation Table to use the space at the old end of the image
517  * file (overwriting the old footer)
518  *
519  * Returns the sectors' offset in the image file on success and < 0 on error
520  */
521 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
522 {
523     BDRVVPCState *s = bs->opaque;
524     int64_t bat_offset;
525     uint32_t index, bat_value;
526     int ret;
527     uint8_t bitmap[s->bitmap_size];
528 
529     /* Check if sector_num is valid */
530     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
531         return -EINVAL;
532     }
533 
534     /* Write entry into in-memory BAT */
535     index = offset / s->block_size;
536     assert(s->pagetable[index] == 0xFFFFFFFF);
537     s->pagetable[index] = s->free_data_block_offset / 512;
538 
539     /* Initialize the block's bitmap */
540     memset(bitmap, 0xff, s->bitmap_size);
541     ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap,
542         s->bitmap_size);
543     if (ret < 0) {
544         return ret;
545     }
546 
547     /* Write new footer (the old one will be overwritten) */
548     s->free_data_block_offset += s->block_size + s->bitmap_size;
549     ret = rewrite_footer(bs);
550     if (ret < 0)
551         goto fail;
552 
553     /* Write BAT entry to disk */
554     bat_offset = s->bat_offset + (4 * index);
555     bat_value = cpu_to_be32(s->pagetable[index]);
556     ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4);
557     if (ret < 0)
558         goto fail;
559 
560     return get_image_offset(bs, offset, false);
561 
562 fail:
563     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
564     return ret;
565 }
566 
567 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
568 {
569     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
570     VHDFooter *footer = (VHDFooter *) s->footer_buf;
571 
572     if (be32_to_cpu(footer->type) != VHD_FIXED) {
573         bdi->cluster_size = s->block_size;
574     }
575 
576     bdi->unallocated_blocks_are_zero = true;
577     return 0;
578 }
579 
580 static int coroutine_fn
581 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
582               QEMUIOVector *qiov, int flags)
583 {
584     BDRVVPCState *s = bs->opaque;
585     int ret;
586     int64_t image_offset;
587     int64_t n_bytes;
588     int64_t bytes_done = 0;
589     VHDFooter *footer = (VHDFooter *) s->footer_buf;
590     QEMUIOVector local_qiov;
591 
592     if (be32_to_cpu(footer->type) == VHD_FIXED) {
593         return bdrv_co_preadv(bs->file->bs, offset, bytes, qiov, 0);
594     }
595 
596     qemu_co_mutex_lock(&s->lock);
597     qemu_iovec_init(&local_qiov, qiov->niov);
598 
599     while (bytes > 0) {
600         image_offset = get_image_offset(bs, offset, false);
601         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
602 
603         if (image_offset == -1) {
604             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
605         } else {
606             qemu_iovec_reset(&local_qiov);
607             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
608 
609             ret = bdrv_co_preadv(bs->file->bs, image_offset, n_bytes,
610                                  &local_qiov, 0);
611             if (ret < 0) {
612                 goto fail;
613             }
614         }
615 
616         bytes -= n_bytes;
617         offset += n_bytes;
618         bytes_done += n_bytes;
619     }
620 
621     ret = 0;
622 fail:
623     qemu_iovec_destroy(&local_qiov);
624     qemu_co_mutex_unlock(&s->lock);
625 
626     return ret;
627 }
628 
629 static int coroutine_fn
630 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
631                QEMUIOVector *qiov, int flags)
632 {
633     BDRVVPCState *s = bs->opaque;
634     int64_t image_offset;
635     int64_t n_bytes;
636     int64_t bytes_done = 0;
637     int ret;
638     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
639     QEMUIOVector local_qiov;
640 
641     if (be32_to_cpu(footer->type) == VHD_FIXED) {
642         return bdrv_co_pwritev(bs->file->bs, offset, bytes, qiov, 0);
643     }
644 
645     qemu_co_mutex_lock(&s->lock);
646     qemu_iovec_init(&local_qiov, qiov->niov);
647 
648     while (bytes > 0) {
649         image_offset = get_image_offset(bs, offset, true);
650         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
651 
652         if (image_offset == -1) {
653             image_offset = alloc_block(bs, offset);
654             if (image_offset < 0) {
655                 ret = image_offset;
656                 goto fail;
657             }
658         }
659 
660         qemu_iovec_reset(&local_qiov);
661         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
662 
663         ret = bdrv_co_pwritev(bs->file->bs, image_offset, n_bytes,
664                               &local_qiov, 0);
665         if (ret < 0) {
666             goto fail;
667         }
668 
669         bytes -= n_bytes;
670         offset += n_bytes;
671         bytes_done += n_bytes;
672     }
673 
674     ret = 0;
675 fail:
676     qemu_iovec_destroy(&local_qiov);
677     qemu_co_mutex_unlock(&s->lock);
678 
679     return ret;
680 }
681 
682 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
683         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
684 {
685     BDRVVPCState *s = bs->opaque;
686     VHDFooter *footer = (VHDFooter*) s->footer_buf;
687     int64_t start, offset;
688     bool allocated;
689     int n;
690 
691     if (be32_to_cpu(footer->type) == VHD_FIXED) {
692         *pnum = nb_sectors;
693         *file = bs->file->bs;
694         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
695                (sector_num << BDRV_SECTOR_BITS);
696     }
697 
698     offset = get_sector_offset(bs, sector_num, 0);
699     start = offset;
700     allocated = (offset != -1);
701     *pnum = 0;
702 
703     do {
704         /* All sectors in a block are contiguous (without using the bitmap) */
705         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
706           - sector_num;
707         n = MIN(n, nb_sectors);
708 
709         *pnum += n;
710         sector_num += n;
711         nb_sectors -= n;
712         /* *pnum can't be greater than one block for allocated
713          * sectors since there is always a bitmap in between. */
714         if (allocated) {
715             *file = bs->file->bs;
716             return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
717         }
718         if (nb_sectors == 0) {
719             break;
720         }
721         offset = get_sector_offset(bs, sector_num, 0);
722     } while (offset == -1);
723 
724     return 0;
725 }
726 
727 /*
728  * Calculates the number of cylinders, heads and sectors per cylinder
729  * based on a given number of sectors. This is the algorithm described
730  * in the VHD specification.
731  *
732  * Note that the geometry doesn't always exactly match total_sectors but
733  * may round it down.
734  *
735  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
736  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
737  * and instead allow up to 255 heads.
738  */
739 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
740     uint8_t* heads, uint8_t* secs_per_cyl)
741 {
742     uint32_t cyls_times_heads;
743 
744     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
745 
746     if (total_sectors >= 65535LL * 16 * 63) {
747         *secs_per_cyl = 255;
748         *heads = 16;
749         cyls_times_heads = total_sectors / *secs_per_cyl;
750     } else {
751         *secs_per_cyl = 17;
752         cyls_times_heads = total_sectors / *secs_per_cyl;
753         *heads = (cyls_times_heads + 1023) / 1024;
754 
755         if (*heads < 4) {
756             *heads = 4;
757         }
758 
759         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
760             *secs_per_cyl = 31;
761             *heads = 16;
762             cyls_times_heads = total_sectors / *secs_per_cyl;
763         }
764 
765         if (cyls_times_heads >= (*heads * 1024)) {
766             *secs_per_cyl = 63;
767             *heads = 16;
768             cyls_times_heads = total_sectors / *secs_per_cyl;
769         }
770     }
771 
772     *cyls = cyls_times_heads / *heads;
773 
774     return 0;
775 }
776 
777 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
778                                int64_t total_sectors)
779 {
780     VHDDynDiskHeader *dyndisk_header =
781         (VHDDynDiskHeader *) buf;
782     size_t block_size, num_bat_entries;
783     int i;
784     int ret;
785     int64_t offset = 0;
786 
787     /* Write the footer (twice: at the beginning and at the end) */
788     block_size = 0x200000;
789     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
790 
791     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
792     if (ret < 0) {
793         goto fail;
794     }
795 
796     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
797     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
798     if (ret < 0) {
799         goto fail;
800     }
801 
802     /* Write the initial BAT */
803     offset = 3 * 512;
804 
805     memset(buf, 0xFF, 512);
806     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
807         ret = blk_pwrite(blk, offset, buf, 512, 0);
808         if (ret < 0) {
809             goto fail;
810         }
811         offset += 512;
812     }
813 
814     /* Prepare the Dynamic Disk Header */
815     memset(buf, 0, 1024);
816 
817     memcpy(dyndisk_header->magic, "cxsparse", 8);
818 
819     /*
820      * Note: The spec is actually wrong here for data_offset, it says
821      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
822      */
823     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
824     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
825     dyndisk_header->version = cpu_to_be32(0x00010000);
826     dyndisk_header->block_size = cpu_to_be32(block_size);
827     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
828 
829     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
830 
831     /* Write the header */
832     offset = 512;
833 
834     ret = blk_pwrite(blk, offset, buf, 1024, 0);
835     if (ret < 0) {
836         goto fail;
837     }
838 
839  fail:
840     return ret;
841 }
842 
843 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
844                              int64_t total_size)
845 {
846     int ret;
847 
848     /* Add footer to total size */
849     total_size += HEADER_SIZE;
850 
851     ret = blk_truncate(blk, total_size);
852     if (ret < 0) {
853         return ret;
854     }
855 
856     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
857     if (ret < 0) {
858         return ret;
859     }
860 
861     return ret;
862 }
863 
864 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
865 {
866     uint8_t buf[1024];
867     VHDFooter *footer = (VHDFooter *) buf;
868     char *disk_type_param;
869     int i;
870     uint16_t cyls = 0;
871     uint8_t heads = 0;
872     uint8_t secs_per_cyl = 0;
873     int64_t total_sectors;
874     int64_t total_size;
875     int disk_type;
876     int ret = -EIO;
877     bool force_size;
878     Error *local_err = NULL;
879     BlockBackend *blk = NULL;
880 
881     /* Read out options */
882     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
883                           BDRV_SECTOR_SIZE);
884     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
885     if (disk_type_param) {
886         if (!strcmp(disk_type_param, "dynamic")) {
887             disk_type = VHD_DYNAMIC;
888         } else if (!strcmp(disk_type_param, "fixed")) {
889             disk_type = VHD_FIXED;
890         } else {
891             error_setg(errp, "Invalid disk type, %s", disk_type_param);
892             ret = -EINVAL;
893             goto out;
894         }
895     } else {
896         disk_type = VHD_DYNAMIC;
897     }
898 
899     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
900 
901     ret = bdrv_create_file(filename, opts, &local_err);
902     if (ret < 0) {
903         error_propagate(errp, local_err);
904         goto out;
905     }
906 
907     blk = blk_new_open(filename, NULL, NULL,
908                        BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
909     if (blk == NULL) {
910         error_propagate(errp, local_err);
911         ret = -EIO;
912         goto out;
913     }
914 
915     blk_set_allow_write_beyond_eof(blk, true);
916 
917     /*
918      * Calculate matching total_size and geometry. Increase the number of
919      * sectors requested until we get enough (or fail). This ensures that
920      * qemu-img convert doesn't truncate images, but rather rounds up.
921      *
922      * If the image size can't be represented by a spec conformant CHS geometry,
923      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
924      * the image size from the VHD footer to calculate total_sectors.
925      */
926     if (force_size) {
927         /* This will force the use of total_size for sector count, below */
928         cyls         = VHD_CHS_MAX_C;
929         heads        = VHD_CHS_MAX_H;
930         secs_per_cyl = VHD_CHS_MAX_S;
931     } else {
932         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
933         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
934             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
935         }
936     }
937 
938     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
939         total_sectors = total_size / BDRV_SECTOR_SIZE;
940         /* Allow a maximum disk size of 2040 GiB */
941         if (total_sectors > VHD_MAX_SECTORS) {
942             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
943             ret = -EFBIG;
944             goto out;
945         }
946     } else {
947         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
948         total_size = total_sectors * BDRV_SECTOR_SIZE;
949     }
950 
951     /* Prepare the Hard Disk Footer */
952     memset(buf, 0, 1024);
953 
954     memcpy(footer->creator, "conectix", 8);
955     if (force_size) {
956         memcpy(footer->creator_app, "qem2", 4);
957     } else {
958         memcpy(footer->creator_app, "qemu", 4);
959     }
960     memcpy(footer->creator_os, "Wi2k", 4);
961 
962     footer->features = cpu_to_be32(0x02);
963     footer->version = cpu_to_be32(0x00010000);
964     if (disk_type == VHD_DYNAMIC) {
965         footer->data_offset = cpu_to_be64(HEADER_SIZE);
966     } else {
967         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
968     }
969     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
970 
971     /* Version of Virtual PC 2007 */
972     footer->major = cpu_to_be16(0x0005);
973     footer->minor = cpu_to_be16(0x0003);
974     footer->orig_size = cpu_to_be64(total_size);
975     footer->current_size = cpu_to_be64(total_size);
976     footer->cyls = cpu_to_be16(cyls);
977     footer->heads = heads;
978     footer->secs_per_cyl = secs_per_cyl;
979 
980     footer->type = cpu_to_be32(disk_type);
981 
982 #if defined(CONFIG_UUID)
983     uuid_generate(footer->uuid);
984 #endif
985 
986     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
987 
988     if (disk_type == VHD_DYNAMIC) {
989         ret = create_dynamic_disk(blk, buf, total_sectors);
990     } else {
991         ret = create_fixed_disk(blk, buf, total_size);
992     }
993     if (ret < 0) {
994         error_setg(errp, "Unable to create or write VHD header");
995     }
996 
997 out:
998     blk_unref(blk);
999     g_free(disk_type_param);
1000     return ret;
1001 }
1002 
1003 static int vpc_has_zero_init(BlockDriverState *bs)
1004 {
1005     BDRVVPCState *s = bs->opaque;
1006     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1007 
1008     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1009         return bdrv_has_zero_init(bs->file->bs);
1010     } else {
1011         return 1;
1012     }
1013 }
1014 
1015 static void vpc_close(BlockDriverState *bs)
1016 {
1017     BDRVVPCState *s = bs->opaque;
1018     qemu_vfree(s->pagetable);
1019 #ifdef CACHE
1020     g_free(s->pageentry_u8);
1021 #endif
1022 
1023     migrate_del_blocker(s->migration_blocker);
1024     error_free(s->migration_blocker);
1025 }
1026 
1027 static QemuOptsList vpc_create_opts = {
1028     .name = "vpc-create-opts",
1029     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1030     .desc = {
1031         {
1032             .name = BLOCK_OPT_SIZE,
1033             .type = QEMU_OPT_SIZE,
1034             .help = "Virtual disk size"
1035         },
1036         {
1037             .name = BLOCK_OPT_SUBFMT,
1038             .type = QEMU_OPT_STRING,
1039             .help =
1040                 "Type of virtual hard disk format. Supported formats are "
1041                 "{dynamic (default) | fixed} "
1042         },
1043         {
1044             .name = VPC_OPT_FORCE_SIZE,
1045             .type = QEMU_OPT_BOOL,
1046             .help = "Force disk size calculation to use the actual size "
1047                     "specified, rather than using the nearest CHS-based "
1048                     "calculation"
1049         },
1050         { /* end of list */ }
1051     }
1052 };
1053 
1054 static BlockDriver bdrv_vpc = {
1055     .format_name    = "vpc",
1056     .instance_size  = sizeof(BDRVVPCState),
1057 
1058     .bdrv_probe             = vpc_probe,
1059     .bdrv_open              = vpc_open,
1060     .bdrv_close             = vpc_close,
1061     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1062     .bdrv_create            = vpc_create,
1063 
1064     .bdrv_co_preadv             = vpc_co_preadv,
1065     .bdrv_co_pwritev            = vpc_co_pwritev,
1066     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1067 
1068     .bdrv_get_info          = vpc_get_info,
1069 
1070     .create_opts            = &vpc_create_opts,
1071     .bdrv_has_zero_init     = vpc_has_zero_init,
1072 };
1073 
1074 static void bdrv_vpc_init(void)
1075 {
1076     bdrv_register(&bdrv_vpc);
1077 }
1078 
1079 block_init(bdrv_vpc_init);
1080