xref: /openbmc/qemu/block/vpc.c (revision 4a09d0bb)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/migration.h"
32 #include "qemu/bswap.h"
33 #include "qemu/uuid.h"
34 
35 /**************************************************************/
36 
37 #define HEADER_SIZE 512
38 
39 //#define CACHE
40 
/* Disk type codes as stored in the "type" field of the VHD footer. */
enum vhd_type {
    /* Footer is located only at the end of the file */
    VHD_FIXED           = 2,
    /* Sparse image addressed through a Block Allocation Table */
    VHD_DYNAMIC         = 3,
    /* Differencing image */
    VHD_DIFFERENCING    = 4,
};
46 
/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC). Footer timestamps are stored as
 * seconds elapsed since that instant.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Maximum cylinder, head and sector values used for the footer geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

/* Largest accepted image size in 512-byte sectors (2040 GiB) */
#define VHD_MAX_SECTORS       0xff000000
/* Sector count described by the maximum CHS geometry */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the boolean image creation option read by vpc_create() */
#define VPC_OPT_FORCE_SIZE "force_size"
58 
59 /* always big-endian */
60 typedef struct vhd_footer {
61     char        creator[8]; /* "conectix" */
62     uint32_t    features;
63     uint32_t    version;
64 
65     /* Offset of next header structure, 0xFFFFFFFF if none */
66     uint64_t    data_offset;
67 
68     /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
69     uint32_t    timestamp;
70 
71     char        creator_app[4]; /*  e.g., "vpc " */
72     uint16_t    major;
73     uint16_t    minor;
74     char        creator_os[4]; /* "Wi2k" */
75 
76     uint64_t    orig_size;
77     uint64_t    current_size;
78 
79     uint16_t    cyls;
80     uint8_t     heads;
81     uint8_t     secs_per_cyl;
82 
83     uint32_t    type;
84 
85     /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
86        the bytes in the footer without the checksum field") */
87     uint32_t    checksum;
88 
89     /* UUID used to identify a parent hard disk (backing file) */
90     QemuUUID    uuid;
91 
92     uint8_t     in_saved_state;
93 } QEMU_PACKED VHDFooter;
94 
95 typedef struct vhd_dyndisk_header {
96     char        magic[8]; /* "cxsparse" */
97 
98     /* Offset of next header structure, 0xFFFFFFFF if none */
99     uint64_t    data_offset;
100 
101     /* Offset of the Block Allocation Table (BAT) */
102     uint64_t    table_offset;
103 
104     uint32_t    version;
105     uint32_t    max_table_entries; /* 32bit/entry */
106 
107     /* 2 MB by default, must be a power of two */
108     uint32_t    block_size;
109 
110     uint32_t    checksum;
111     uint8_t     parent_uuid[16];
112     uint32_t    parent_timestamp;
113     uint32_t    reserved;
114 
115     /* Backing file name (in UTF-16) */
116     uint8_t     parent_name[512];
117 
118     struct {
119         uint32_t    platform;
120         uint32_t    data_space;
121         uint32_t    data_length;
122         uint32_t    reserved;
123         uint64_t    data_offset;
124     } parent_locator[8];
125 } QEMU_PACKED VHDDynDiskHeader;
126 
127 typedef struct BDRVVPCState {
128     CoMutex lock;
129     uint8_t footer_buf[HEADER_SIZE];
130     uint64_t free_data_block_offset;
131     int max_table_entries;
132     uint32_t *pagetable;
133     uint64_t bat_offset;
134     uint64_t last_bitmap_offset;
135 
136     uint32_t block_size;
137     uint32_t bitmap_size;
138     bool force_use_chs;
139     bool force_use_sz;
140 
141 #ifdef CACHE
142     uint8_t *pageentry_u8;
143     uint32_t *pageentry_u32;
144     uint16_t *pageentry_u16;
145 
146     uint64_t last_bitmap;
147 #endif
148 
149     Error *migration_blocker;
150 } BDRVVPCState;
151 
152 #define VPC_OPT_SIZE_CALC "force_size_calc"
153 static QemuOptsList vpc_runtime_opts = {
154     .name = "vpc-runtime-opts",
155     .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
156     .desc = {
157         {
158             .name = VPC_OPT_SIZE_CALC,
159             .type = QEMU_OPT_STRING,
160             .help = "Force disk size calculation to use either CHS geometry, "
161                     "or use the disk current_size specified in the VHD footer. "
162                     "{chs, current_size}"
163         },
164         { /* end of list */ }
165     }
166 };
167 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the sum of
 * all its bytes, accumulated in 32 bits.
 *
 * buf:  bytes to sum up (not modified)
 * size: number of bytes in buf
 *
 * The checksum field inside the buffer has to be zeroed by the caller before
 * the checksum is calculated.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t sum = 0;
    size_t i;

    for (i = 0; i < size; i++) {
        sum += buf[i];
    }

    return ~sum;
}
178 
179 
/*
 * Format probe: returns 100 when the buffer starts with the VHD footer
 * magic "conectix" and 0 otherwise. filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    if (buf_size < 8) {
        return 0;
    }
    if (strncmp((char *)buf, "conectix", 8) != 0) {
        return 0;
    }
    return 100;
}
186 
187 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
188                               Error **errp)
189 {
190     BDRVVPCState *s = bs->opaque;
191     const char *size_calc;
192 
193     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
194 
195     if (!size_calc) {
196        /* no override, use autodetect only */
197     } else if (!strcmp(size_calc, "current_size")) {
198         s->force_use_sz = true;
199     } else if (!strcmp(size_calc, "chs")) {
200         s->force_use_chs = true;
201     } else {
202         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
203     }
204 }
205 
206 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
207                     Error **errp)
208 {
209     BDRVVPCState *s = bs->opaque;
210     int i;
211     VHDFooter *footer;
212     VHDDynDiskHeader *dyndisk_header;
213     QemuOpts *opts = NULL;
214     Error *local_err = NULL;
215     bool use_chs;
216     uint8_t buf[HEADER_SIZE];
217     uint32_t checksum;
218     uint64_t computed_size;
219     uint64_t pagetable_size;
220     int disk_type = VHD_DYNAMIC;
221     int ret;
222 
223     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
224     qemu_opts_absorb_qdict(opts, options, &local_err);
225     if (local_err) {
226         error_propagate(errp, local_err);
227         ret = -EINVAL;
228         goto fail;
229     }
230 
231     vpc_parse_options(bs, opts, &local_err);
232     if (local_err) {
233         error_propagate(errp, local_err);
234         ret = -EINVAL;
235         goto fail;
236     }
237 
238     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
239     if (ret < 0) {
240         error_setg(errp, "Unable to read VHD header");
241         goto fail;
242     }
243 
244     footer = (VHDFooter *) s->footer_buf;
245     if (strncmp(footer->creator, "conectix", 8)) {
246         int64_t offset = bdrv_getlength(bs->file->bs);
247         if (offset < 0) {
248             ret = offset;
249             error_setg(errp, "Invalid file size");
250             goto fail;
251         } else if (offset < HEADER_SIZE) {
252             ret = -EINVAL;
253             error_setg(errp, "File too small for a VHD header");
254             goto fail;
255         }
256 
257         /* If a fixed disk, the footer is found only at the end of the file */
258         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
259                          HEADER_SIZE);
260         if (ret < 0) {
261             goto fail;
262         }
263         if (strncmp(footer->creator, "conectix", 8)) {
264             error_setg(errp, "invalid VPC image");
265             ret = -EINVAL;
266             goto fail;
267         }
268         disk_type = VHD_FIXED;
269     }
270 
271     checksum = be32_to_cpu(footer->checksum);
272     footer->checksum = 0;
273     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
274         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
275             "incorrect.\n", bs->filename);
276 
277     /* Write 'checksum' back to footer, or else will leave it with zero. */
278     footer->checksum = cpu_to_be32(checksum);
279 
280     /* The visible size of a image in Virtual PC depends on the geometry
281        rather than on the size stored in the footer (the size in the footer
282        is too large usually) */
283     bs->total_sectors = (int64_t)
284         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
285 
286     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
287      * VHD image sizes differently.  VPC will rely on CHS geometry,
288      * while Hyper-V and disk2vhd use the size specified in the footer.
289      *
290      * We use a couple of approaches to try and determine the correct method:
291      * look at the Creator App field, and look for images that have CHS
292      * geometry that is the maximum value.
293      *
294      * If the CHS geometry is the maximum CHS geometry, then we assume that
295      * the size is the footer->current_size to avoid truncation.  Otherwise,
296      * we follow the table based on footer->creator_app:
297      *
298      *  Known creator apps:
299      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
300      *      'qemu'  :  CHS              QEMU (uses disk geometry)
301      *      'qem2'  :  current_size     QEMU (uses current_size)
302      *      'win '  :  current_size     Hyper-V
303      *      'd2v '  :  current_size     Disk2vhd
304      *      'tap\0' :  current_size     XenServer
305      *      'CTXS'  :  current_size     XenConverter
306      *
307      *  The user can override the table values via drive options, however
308      *  even with an override we will still use current_size for images
309      *  that have CHS geometry of the maximum size.
310      */
311     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
312                !!strncmp(footer->creator_app, "qem2", 4) &&
313                !!strncmp(footer->creator_app, "d2v ", 4) &&
314                !!strncmp(footer->creator_app, "CTXS", 4) &&
315                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
316 
317     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
318         bs->total_sectors = be64_to_cpu(footer->current_size) /
319                                         BDRV_SECTOR_SIZE;
320     }
321 
322     /* Allow a maximum disk size of 2040 GiB */
323     if (bs->total_sectors > VHD_MAX_SECTORS) {
324         ret = -EFBIG;
325         goto fail;
326     }
327 
328     if (disk_type == VHD_DYNAMIC) {
329         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
330                          HEADER_SIZE);
331         if (ret < 0) {
332             error_setg(errp, "Error reading dynamic VHD header");
333             goto fail;
334         }
335 
336         dyndisk_header = (VHDDynDiskHeader *) buf;
337 
338         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
339             error_setg(errp, "Invalid header magic");
340             ret = -EINVAL;
341             goto fail;
342         }
343 
344         s->block_size = be32_to_cpu(dyndisk_header->block_size);
345         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
346             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
347             ret = -EINVAL;
348             goto fail;
349         }
350         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
351 
352         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
353 
354         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
355             error_setg(errp, "Too many blocks");
356             ret = -EINVAL;
357             goto fail;
358         }
359 
360         computed_size = (uint64_t) s->max_table_entries * s->block_size;
361         if (computed_size < bs->total_sectors * 512) {
362             error_setg(errp, "Page table too small");
363             ret = -EINVAL;
364             goto fail;
365         }
366 
367         if (s->max_table_entries > SIZE_MAX / 4 ||
368             s->max_table_entries > (int) INT_MAX / 4) {
369             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
370                         s->max_table_entries);
371             ret = -EINVAL;
372             goto fail;
373         }
374 
375         pagetable_size = (uint64_t) s->max_table_entries * 4;
376 
377         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
378         if (s->pagetable == NULL) {
379             error_setg(errp, "Unable to allocate memory for page table");
380             ret = -ENOMEM;
381             goto fail;
382         }
383 
384         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
385 
386         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
387                          pagetable_size);
388         if (ret < 0) {
389             error_setg(errp, "Error reading pagetable");
390             goto fail;
391         }
392 
393         s->free_data_block_offset =
394             ROUND_UP(s->bat_offset + pagetable_size, 512);
395 
396         for (i = 0; i < s->max_table_entries; i++) {
397             be32_to_cpus(&s->pagetable[i]);
398             if (s->pagetable[i] != 0xFFFFFFFF) {
399                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
400                     s->bitmap_size + s->block_size;
401 
402                 if (next > s->free_data_block_offset) {
403                     s->free_data_block_offset = next;
404                 }
405             }
406         }
407 
408         if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) {
409             error_setg(errp, "block-vpc: free_data_block_offset points after "
410                              "the end of file. The image has been truncated.");
411             ret = -EINVAL;
412             goto fail;
413         }
414 
415         s->last_bitmap_offset = (int64_t) -1;
416 
417 #ifdef CACHE
418         s->pageentry_u8 = g_malloc(512);
419         s->pageentry_u32 = s->pageentry_u8;
420         s->pageentry_u16 = s->pageentry_u8;
421         s->last_pagetable = -1;
422 #endif
423     }
424 
425     /* Disable migration when VHD images are used */
426     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
427                "does not support live migration",
428                bdrv_get_device_or_node_name(bs));
429     ret = migrate_add_blocker(s->migration_blocker, &local_err);
430     if (local_err) {
431         error_propagate(errp, local_err);
432         error_free(s->migration_blocker);
433         goto fail;
434     }
435 
436     qemu_co_mutex_init(&s->lock);
437 
438     return 0;
439 
440 fail:
441     qemu_vfree(s->pagetable);
442 #ifdef CACHE
443     g_free(s->pageentry_u8);
444 #endif
445     return ret;
446 }
447 
448 static int vpc_reopen_prepare(BDRVReopenState *state,
449                               BlockReopenQueue *queue, Error **errp)
450 {
451     return 0;
452 }
453 
454 /*
455  * Returns the absolute byte offset of the given sector in the image file.
456  * If the sector is not allocated, -1 is returned instead.
457  *
458  * The parameter write must be 1 if the offset will be used for a write
459  * operation (the block bitmaps is updated then), 0 otherwise.
460  */
461 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
462                                        bool write)
463 {
464     BDRVVPCState *s = bs->opaque;
465     uint64_t bitmap_offset, block_offset;
466     uint32_t pagetable_index, offset_in_block;
467 
468     pagetable_index = offset / s->block_size;
469     offset_in_block = offset % s->block_size;
470 
471     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
472         return -1; /* not allocated */
473 
474     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
475     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
476 
477     /* We must ensure that we don't write to any sectors which are marked as
478        unused in the bitmap. We get away with setting all bits in the block
479        bitmap each time we write to a new block. This might cause Virtual PC to
480        miss sparse read optimization, but it's not a problem in terms of
481        correctness. */
482     if (write && (s->last_bitmap_offset != bitmap_offset)) {
483         uint8_t bitmap[s->bitmap_size];
484 
485         s->last_bitmap_offset = bitmap_offset;
486         memset(bitmap, 0xff, s->bitmap_size);
487         bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
488     }
489 
490     return block_offset;
491 }
492 
493 static inline int64_t get_sector_offset(BlockDriverState *bs,
494                                         int64_t sector_num, bool write)
495 {
496     return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
497 }
498 
499 /*
500  * Writes the footer to the end of the image file. This is needed when the
501  * file grows as it overwrites the old footer
502  *
503  * Returns 0 on success and < 0 on error
504  */
505 static int rewrite_footer(BlockDriverState* bs)
506 {
507     int ret;
508     BDRVVPCState *s = bs->opaque;
509     int64_t offset = s->free_data_block_offset;
510 
511     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
512     if (ret < 0)
513         return ret;
514 
515     return 0;
516 }
517 
518 /*
519  * Allocates a new block. This involves writing a new footer and updating
520  * the Block Allocation Table to use the space at the old end of the image
521  * file (overwriting the old footer)
522  *
523  * Returns the sectors' offset in the image file on success and < 0 on error
524  */
525 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
526 {
527     BDRVVPCState *s = bs->opaque;
528     int64_t bat_offset;
529     uint32_t index, bat_value;
530     int ret;
531     uint8_t bitmap[s->bitmap_size];
532 
533     /* Check if sector_num is valid */
534     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
535         return -EINVAL;
536     }
537 
538     /* Write entry into in-memory BAT */
539     index = offset / s->block_size;
540     assert(s->pagetable[index] == 0xFFFFFFFF);
541     s->pagetable[index] = s->free_data_block_offset / 512;
542 
543     /* Initialize the block's bitmap */
544     memset(bitmap, 0xff, s->bitmap_size);
545     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
546         s->bitmap_size);
547     if (ret < 0) {
548         return ret;
549     }
550 
551     /* Write new footer (the old one will be overwritten) */
552     s->free_data_block_offset += s->block_size + s->bitmap_size;
553     ret = rewrite_footer(bs);
554     if (ret < 0)
555         goto fail;
556 
557     /* Write BAT entry to disk */
558     bat_offset = s->bat_offset + (4 * index);
559     bat_value = cpu_to_be32(s->pagetable[index]);
560     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
561     if (ret < 0)
562         goto fail;
563 
564     return get_image_offset(bs, offset, false);
565 
566 fail:
567     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
568     return ret;
569 }
570 
571 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
572 {
573     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
574     VHDFooter *footer = (VHDFooter *) s->footer_buf;
575 
576     if (be32_to_cpu(footer->type) != VHD_FIXED) {
577         bdi->cluster_size = s->block_size;
578     }
579 
580     bdi->unallocated_blocks_are_zero = true;
581     return 0;
582 }
583 
584 static int coroutine_fn
585 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
586               QEMUIOVector *qiov, int flags)
587 {
588     BDRVVPCState *s = bs->opaque;
589     int ret;
590     int64_t image_offset;
591     int64_t n_bytes;
592     int64_t bytes_done = 0;
593     VHDFooter *footer = (VHDFooter *) s->footer_buf;
594     QEMUIOVector local_qiov;
595 
596     if (be32_to_cpu(footer->type) == VHD_FIXED) {
597         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
598     }
599 
600     qemu_co_mutex_lock(&s->lock);
601     qemu_iovec_init(&local_qiov, qiov->niov);
602 
603     while (bytes > 0) {
604         image_offset = get_image_offset(bs, offset, false);
605         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
606 
607         if (image_offset == -1) {
608             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
609         } else {
610             qemu_iovec_reset(&local_qiov);
611             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
612 
613             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
614                                  &local_qiov, 0);
615             if (ret < 0) {
616                 goto fail;
617             }
618         }
619 
620         bytes -= n_bytes;
621         offset += n_bytes;
622         bytes_done += n_bytes;
623     }
624 
625     ret = 0;
626 fail:
627     qemu_iovec_destroy(&local_qiov);
628     qemu_co_mutex_unlock(&s->lock);
629 
630     return ret;
631 }
632 
633 static int coroutine_fn
634 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
635                QEMUIOVector *qiov, int flags)
636 {
637     BDRVVPCState *s = bs->opaque;
638     int64_t image_offset;
639     int64_t n_bytes;
640     int64_t bytes_done = 0;
641     int ret;
642     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
643     QEMUIOVector local_qiov;
644 
645     if (be32_to_cpu(footer->type) == VHD_FIXED) {
646         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
647     }
648 
649     qemu_co_mutex_lock(&s->lock);
650     qemu_iovec_init(&local_qiov, qiov->niov);
651 
652     while (bytes > 0) {
653         image_offset = get_image_offset(bs, offset, true);
654         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
655 
656         if (image_offset == -1) {
657             image_offset = alloc_block(bs, offset);
658             if (image_offset < 0) {
659                 ret = image_offset;
660                 goto fail;
661             }
662         }
663 
664         qemu_iovec_reset(&local_qiov);
665         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
666 
667         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
668                               &local_qiov, 0);
669         if (ret < 0) {
670             goto fail;
671         }
672 
673         bytes -= n_bytes;
674         offset += n_bytes;
675         bytes_done += n_bytes;
676     }
677 
678     ret = 0;
679 fail:
680     qemu_iovec_destroy(&local_qiov);
681     qemu_co_mutex_unlock(&s->lock);
682 
683     return ret;
684 }
685 
686 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
687         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
688 {
689     BDRVVPCState *s = bs->opaque;
690     VHDFooter *footer = (VHDFooter*) s->footer_buf;
691     int64_t start, offset;
692     bool allocated;
693     int n;
694 
695     if (be32_to_cpu(footer->type) == VHD_FIXED) {
696         *pnum = nb_sectors;
697         *file = bs->file->bs;
698         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
699                (sector_num << BDRV_SECTOR_BITS);
700     }
701 
702     offset = get_sector_offset(bs, sector_num, 0);
703     start = offset;
704     allocated = (offset != -1);
705     *pnum = 0;
706 
707     do {
708         /* All sectors in a block are contiguous (without using the bitmap) */
709         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
710           - sector_num;
711         n = MIN(n, nb_sectors);
712 
713         *pnum += n;
714         sector_num += n;
715         nb_sectors -= n;
716         /* *pnum can't be greater than one block for allocated
717          * sectors since there is always a bitmap in between. */
718         if (allocated) {
719             *file = bs->file->bs;
720             return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
721         }
722         if (nb_sectors == 0) {
723             break;
724         }
725         offset = get_sector_offset(bs, sector_num, 0);
726     } while (offset == -1);
727 
728     return 0;
729 }
730 
731 /*
732  * Calculates the number of cylinders, heads and sectors per cylinder
733  * based on a given number of sectors. This is the algorithm described
734  * in the VHD specification.
735  *
736  * Note that the geometry doesn't always exactly match total_sectors but
737  * may round it down.
738  *
739  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
740  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
741  * and instead allow up to 255 heads.
742  */
743 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
744     uint8_t* heads, uint8_t* secs_per_cyl)
745 {
746     uint32_t cyls_times_heads;
747 
748     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
749 
750     if (total_sectors >= 65535LL * 16 * 63) {
751         *secs_per_cyl = 255;
752         *heads = 16;
753         cyls_times_heads = total_sectors / *secs_per_cyl;
754     } else {
755         *secs_per_cyl = 17;
756         cyls_times_heads = total_sectors / *secs_per_cyl;
757         *heads = (cyls_times_heads + 1023) / 1024;
758 
759         if (*heads < 4) {
760             *heads = 4;
761         }
762 
763         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
764             *secs_per_cyl = 31;
765             *heads = 16;
766             cyls_times_heads = total_sectors / *secs_per_cyl;
767         }
768 
769         if (cyls_times_heads >= (*heads * 1024)) {
770             *secs_per_cyl = 63;
771             *heads = 16;
772             cyls_times_heads = total_sectors / *secs_per_cyl;
773         }
774     }
775 
776     *cyls = cyls_times_heads / *heads;
777 
778     return 0;
779 }
780 
781 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
782                                int64_t total_sectors)
783 {
784     VHDDynDiskHeader *dyndisk_header =
785         (VHDDynDiskHeader *) buf;
786     size_t block_size, num_bat_entries;
787     int i;
788     int ret;
789     int64_t offset = 0;
790 
791     /* Write the footer (twice: at the beginning and at the end) */
792     block_size = 0x200000;
793     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
794 
795     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
796     if (ret < 0) {
797         goto fail;
798     }
799 
800     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
801     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
802     if (ret < 0) {
803         goto fail;
804     }
805 
806     /* Write the initial BAT */
807     offset = 3 * 512;
808 
809     memset(buf, 0xFF, 512);
810     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
811         ret = blk_pwrite(blk, offset, buf, 512, 0);
812         if (ret < 0) {
813             goto fail;
814         }
815         offset += 512;
816     }
817 
818     /* Prepare the Dynamic Disk Header */
819     memset(buf, 0, 1024);
820 
821     memcpy(dyndisk_header->magic, "cxsparse", 8);
822 
823     /*
824      * Note: The spec is actually wrong here for data_offset, it says
825      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
826      */
827     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
828     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
829     dyndisk_header->version = cpu_to_be32(0x00010000);
830     dyndisk_header->block_size = cpu_to_be32(block_size);
831     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
832 
833     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
834 
835     /* Write the header */
836     offset = 512;
837 
838     ret = blk_pwrite(blk, offset, buf, 1024, 0);
839     if (ret < 0) {
840         goto fail;
841     }
842 
843  fail:
844     return ret;
845 }
846 
847 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
848                              int64_t total_size)
849 {
850     int ret;
851 
852     /* Add footer to total size */
853     total_size += HEADER_SIZE;
854 
855     ret = blk_truncate(blk, total_size);
856     if (ret < 0) {
857         return ret;
858     }
859 
860     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
861     if (ret < 0) {
862         return ret;
863     }
864 
865     return ret;
866 }
867 
868 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
869 {
870     uint8_t buf[1024];
871     VHDFooter *footer = (VHDFooter *) buf;
872     char *disk_type_param;
873     int i;
874     uint16_t cyls = 0;
875     uint8_t heads = 0;
876     uint8_t secs_per_cyl = 0;
877     int64_t total_sectors;
878     int64_t total_size;
879     int disk_type;
880     int ret = -EIO;
881     bool force_size;
882     Error *local_err = NULL;
883     BlockBackend *blk = NULL;
884 
885     /* Read out options */
886     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
887                           BDRV_SECTOR_SIZE);
888     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
889     if (disk_type_param) {
890         if (!strcmp(disk_type_param, "dynamic")) {
891             disk_type = VHD_DYNAMIC;
892         } else if (!strcmp(disk_type_param, "fixed")) {
893             disk_type = VHD_FIXED;
894         } else {
895             error_setg(errp, "Invalid disk type, %s", disk_type_param);
896             ret = -EINVAL;
897             goto out;
898         }
899     } else {
900         disk_type = VHD_DYNAMIC;
901     }
902 
903     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
904 
905     ret = bdrv_create_file(filename, opts, &local_err);
906     if (ret < 0) {
907         error_propagate(errp, local_err);
908         goto out;
909     }
910 
911     blk = blk_new_open(filename, NULL, NULL,
912                        BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
913     if (blk == NULL) {
914         error_propagate(errp, local_err);
915         ret = -EIO;
916         goto out;
917     }
918 
919     blk_set_allow_write_beyond_eof(blk, true);
920 
921     /*
922      * Calculate matching total_size and geometry. Increase the number of
923      * sectors requested until we get enough (or fail). This ensures that
924      * qemu-img convert doesn't truncate images, but rather rounds up.
925      *
926      * If the image size can't be represented by a spec conformant CHS geometry,
927      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
928      * the image size from the VHD footer to calculate total_sectors.
929      */
930     if (force_size) {
931         /* This will force the use of total_size for sector count, below */
932         cyls         = VHD_CHS_MAX_C;
933         heads        = VHD_CHS_MAX_H;
934         secs_per_cyl = VHD_CHS_MAX_S;
935     } else {
936         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
937         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
938             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
939         }
940     }
941 
942     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
943         total_sectors = total_size / BDRV_SECTOR_SIZE;
944         /* Allow a maximum disk size of 2040 GiB */
945         if (total_sectors > VHD_MAX_SECTORS) {
946             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
947             ret = -EFBIG;
948             goto out;
949         }
950     } else {
951         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
952         total_size = total_sectors * BDRV_SECTOR_SIZE;
953     }
954 
955     /* Prepare the Hard Disk Footer */
956     memset(buf, 0, 1024);
957 
958     memcpy(footer->creator, "conectix", 8);
959     if (force_size) {
960         memcpy(footer->creator_app, "qem2", 4);
961     } else {
962         memcpy(footer->creator_app, "qemu", 4);
963     }
964     memcpy(footer->creator_os, "Wi2k", 4);
965 
966     footer->features = cpu_to_be32(0x02);
967     footer->version = cpu_to_be32(0x00010000);
968     if (disk_type == VHD_DYNAMIC) {
969         footer->data_offset = cpu_to_be64(HEADER_SIZE);
970     } else {
971         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
972     }
973     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
974 
975     /* Version of Virtual PC 2007 */
976     footer->major = cpu_to_be16(0x0005);
977     footer->minor = cpu_to_be16(0x0003);
978     footer->orig_size = cpu_to_be64(total_size);
979     footer->current_size = cpu_to_be64(total_size);
980     footer->cyls = cpu_to_be16(cyls);
981     footer->heads = heads;
982     footer->secs_per_cyl = secs_per_cyl;
983 
984     footer->type = cpu_to_be32(disk_type);
985 
986     qemu_uuid_generate(&footer->uuid);
987 
988     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
989 
990     if (disk_type == VHD_DYNAMIC) {
991         ret = create_dynamic_disk(blk, buf, total_sectors);
992     } else {
993         ret = create_fixed_disk(blk, buf, total_size);
994     }
995     if (ret < 0) {
996         error_setg(errp, "Unable to create or write VHD header");
997     }
998 
999 out:
1000     blk_unref(blk);
1001     g_free(disk_type_param);
1002     return ret;
1003 }
1004 
1005 static int vpc_has_zero_init(BlockDriverState *bs)
1006 {
1007     BDRVVPCState *s = bs->opaque;
1008     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1009 
1010     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1011         return bdrv_has_zero_init(bs->file->bs);
1012     } else {
1013         return 1;
1014     }
1015 }
1016 
1017 static void vpc_close(BlockDriverState *bs)
1018 {
1019     BDRVVPCState *s = bs->opaque;
1020     qemu_vfree(s->pagetable);
1021 #ifdef CACHE
1022     g_free(s->pageentry_u8);
1023 #endif
1024 
1025     migrate_del_blocker(s->migration_blocker);
1026     error_free(s->migration_blocker);
1027 }
1028 
1029 static QemuOptsList vpc_create_opts = {
1030     .name = "vpc-create-opts",
1031     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1032     .desc = {
1033         {
1034             .name = BLOCK_OPT_SIZE,
1035             .type = QEMU_OPT_SIZE,
1036             .help = "Virtual disk size"
1037         },
1038         {
1039             .name = BLOCK_OPT_SUBFMT,
1040             .type = QEMU_OPT_STRING,
1041             .help =
1042                 "Type of virtual hard disk format. Supported formats are "
1043                 "{dynamic (default) | fixed} "
1044         },
1045         {
1046             .name = VPC_OPT_FORCE_SIZE,
1047             .type = QEMU_OPT_BOOL,
1048             .help = "Force disk size calculation to use the actual size "
1049                     "specified, rather than using the nearest CHS-based "
1050                     "calculation"
1051         },
1052         { /* end of list */ }
1053     }
1054 };
1055 
1056 static BlockDriver bdrv_vpc = {
1057     .format_name    = "vpc",
1058     .instance_size  = sizeof(BDRVVPCState),
1059 
1060     .bdrv_probe             = vpc_probe,
1061     .bdrv_open              = vpc_open,
1062     .bdrv_close             = vpc_close,
1063     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1064     .bdrv_create            = vpc_create,
1065 
1066     .bdrv_co_preadv             = vpc_co_preadv,
1067     .bdrv_co_pwritev            = vpc_co_pwritev,
1068     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1069 
1070     .bdrv_get_info          = vpc_get_info,
1071 
1072     .create_opts            = &vpc_create_opts,
1073     .bdrv_has_zero_init     = vpc_has_zero_init,
1074 };
1075 
1076 static void bdrv_vpc_init(void)
1077 {
1078     bdrv_register(&bdrv_vpc);
1079 }
1080 
1081 block_init(bdrv_vpc_init);
1082