xref: /openbmc/qemu/block/vpc.c (revision 864a2178)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/blocker.h"
32 #include "qemu/bswap.h"
33 #include "qemu/uuid.h"
34 
35 /**************************************************************/
36 
/* Size in bytes of the footer buffer and of one header read */
#define HEADER_SIZE 512

/* Uncomment to compile the (experimental) page entry cache fields/code */
//#define CACHE

/* Disk type values compared against the footer's 'type' field */
enum vhd_type {
    VHD_FIXED        = 2,
    VHD_DYNAMIC      = 3,
    VHD_DIFFERENCING = 4,
};

/*
 * VHD timestamps count seconds since Jan 1, 2000 0:00:00 (UTC); this is
 * that instant expressed as seconds since the Unix epoch.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder / head / sector values used for the CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

/* 2040 GiB max image size, expressed in 512-byte sectors */
#define VHD_MAX_SECTORS       0xff000000
/* Number of sectors described by the maximum CHS geometry */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the image creation option that forces use of the exact size */
#define VPC_OPT_FORCE_SIZE "force_size"
58 
59 /* always big-endian */
60 typedef struct vhd_footer {
61     char        creator[8]; /* "conectix" */
62     uint32_t    features;
63     uint32_t    version;
64 
65     /* Offset of next header structure, 0xFFFFFFFF if none */
66     uint64_t    data_offset;
67 
68     /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
69     uint32_t    timestamp;
70 
71     char        creator_app[4]; /*  e.g., "vpc " */
72     uint16_t    major;
73     uint16_t    minor;
74     char        creator_os[4]; /* "Wi2k" */
75 
76     uint64_t    orig_size;
77     uint64_t    current_size;
78 
79     uint16_t    cyls;
80     uint8_t     heads;
81     uint8_t     secs_per_cyl;
82 
83     uint32_t    type;
84 
85     /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
86        the bytes in the footer without the checksum field") */
87     uint32_t    checksum;
88 
89     /* UUID used to identify a parent hard disk (backing file) */
90     QemuUUID    uuid;
91 
92     uint8_t     in_saved_state;
93 } QEMU_PACKED VHDFooter;
94 
95 typedef struct vhd_dyndisk_header {
96     char        magic[8]; /* "cxsparse" */
97 
98     /* Offset of next header structure, 0xFFFFFFFF if none */
99     uint64_t    data_offset;
100 
101     /* Offset of the Block Allocation Table (BAT) */
102     uint64_t    table_offset;
103 
104     uint32_t    version;
105     uint32_t    max_table_entries; /* 32bit/entry */
106 
107     /* 2 MB by default, must be a power of two */
108     uint32_t    block_size;
109 
110     uint32_t    checksum;
111     uint8_t     parent_uuid[16];
112     uint32_t    parent_timestamp;
113     uint32_t    reserved;
114 
115     /* Backing file name (in UTF-16) */
116     uint8_t     parent_name[512];
117 
118     struct {
119         uint32_t    platform;
120         uint32_t    data_space;
121         uint32_t    data_length;
122         uint32_t    reserved;
123         uint64_t    data_offset;
124     } parent_locator[8];
125 } QEMU_PACKED VHDDynDiskHeader;
126 
127 typedef struct BDRVVPCState {
128     CoMutex lock;
129     uint8_t footer_buf[HEADER_SIZE];
130     uint64_t free_data_block_offset;
131     int max_table_entries;
132     uint32_t *pagetable;
133     uint64_t bat_offset;
134     uint64_t last_bitmap_offset;
135 
136     uint32_t block_size;
137     uint32_t bitmap_size;
138     bool force_use_chs;
139     bool force_use_sz;
140 
141 #ifdef CACHE
142     uint8_t *pageentry_u8;
143     uint32_t *pageentry_u32;
144     uint16_t *pageentry_u16;
145 
146     uint64_t last_bitmap;
147 #endif
148 
149     Error *migration_blocker;
150 } BDRVVPCState;
151 
152 #define VPC_OPT_SIZE_CALC "force_size_calc"
153 static QemuOptsList vpc_runtime_opts = {
154     .name = "vpc-runtime-opts",
155     .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
156     .desc = {
157         {
158             .name = VPC_OPT_SIZE_CALC,
159             .type = QEMU_OPT_STRING,
160             .help = "Force disk size calculation to use either CHS geometry, "
161                     "or use the disk current_size specified in the VHD footer. "
162                     "{chs, current_size}"
163         },
164         { /* end of list */ }
165     }
166 };
167 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * (32-bit, wrapping) sum of all @size bytes starting at @buf.
 *
 * Every byte passed in is summed, so a caller checksumming a structure
 * that embeds its own checksum field has to zero that field first.
 *
 * Returns the checksum in host byte order; for @size == 0 this is
 * 0xFFFFFFFF.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t res = 0;
    size_t i;   /* same type as the bound: no signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
178 
179 
/*
 * Format probe: reports a score of 100 when the buffer starts with the
 * 8-byte "conectix" cookie, 0 otherwise (including when fewer than 8 bytes
 * are available). @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    if (buf_size < 8) {
        return 0;
    }

    return strncmp((char *)buf, "conectix", 8) == 0 ? 100 : 0;
}
186 
187 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
188                               Error **errp)
189 {
190     BDRVVPCState *s = bs->opaque;
191     const char *size_calc;
192 
193     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
194 
195     if (!size_calc) {
196        /* no override, use autodetect only */
197     } else if (!strcmp(size_calc, "current_size")) {
198         s->force_use_sz = true;
199     } else if (!strcmp(size_calc, "chs")) {
200         s->force_use_chs = true;
201     } else {
202         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
203     }
204 }
205 
206 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
207                     Error **errp)
208 {
209     BDRVVPCState *s = bs->opaque;
210     int i;
211     VHDFooter *footer;
212     VHDDynDiskHeader *dyndisk_header;
213     QemuOpts *opts = NULL;
214     Error *local_err = NULL;
215     bool use_chs;
216     uint8_t buf[HEADER_SIZE];
217     uint32_t checksum;
218     uint64_t computed_size;
219     uint64_t pagetable_size;
220     int disk_type = VHD_DYNAMIC;
221     int ret;
222 
223     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
224                                false, errp);
225     if (!bs->file) {
226         return -EINVAL;
227     }
228 
229     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
230     qemu_opts_absorb_qdict(opts, options, &local_err);
231     if (local_err) {
232         error_propagate(errp, local_err);
233         ret = -EINVAL;
234         goto fail;
235     }
236 
237     vpc_parse_options(bs, opts, &local_err);
238     if (local_err) {
239         error_propagate(errp, local_err);
240         ret = -EINVAL;
241         goto fail;
242     }
243 
244     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
245     if (ret < 0) {
246         error_setg(errp, "Unable to read VHD header");
247         goto fail;
248     }
249 
250     footer = (VHDFooter *) s->footer_buf;
251     if (strncmp(footer->creator, "conectix", 8)) {
252         int64_t offset = bdrv_getlength(bs->file->bs);
253         if (offset < 0) {
254             ret = offset;
255             error_setg(errp, "Invalid file size");
256             goto fail;
257         } else if (offset < HEADER_SIZE) {
258             ret = -EINVAL;
259             error_setg(errp, "File too small for a VHD header");
260             goto fail;
261         }
262 
263         /* If a fixed disk, the footer is found only at the end of the file */
264         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
265                          HEADER_SIZE);
266         if (ret < 0) {
267             goto fail;
268         }
269         if (strncmp(footer->creator, "conectix", 8)) {
270             error_setg(errp, "invalid VPC image");
271             ret = -EINVAL;
272             goto fail;
273         }
274         disk_type = VHD_FIXED;
275     }
276 
277     checksum = be32_to_cpu(footer->checksum);
278     footer->checksum = 0;
279     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
280         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
281             "incorrect.\n", bs->filename);
282 
283     /* Write 'checksum' back to footer, or else will leave it with zero. */
284     footer->checksum = cpu_to_be32(checksum);
285 
286     /* The visible size of a image in Virtual PC depends on the geometry
287        rather than on the size stored in the footer (the size in the footer
288        is too large usually) */
289     bs->total_sectors = (int64_t)
290         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
291 
292     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
293      * VHD image sizes differently.  VPC will rely on CHS geometry,
294      * while Hyper-V and disk2vhd use the size specified in the footer.
295      *
296      * We use a couple of approaches to try and determine the correct method:
297      * look at the Creator App field, and look for images that have CHS
298      * geometry that is the maximum value.
299      *
300      * If the CHS geometry is the maximum CHS geometry, then we assume that
301      * the size is the footer->current_size to avoid truncation.  Otherwise,
302      * we follow the table based on footer->creator_app:
303      *
304      *  Known creator apps:
305      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
306      *      'qemu'  :  CHS              QEMU (uses disk geometry)
307      *      'qem2'  :  current_size     QEMU (uses current_size)
308      *      'win '  :  current_size     Hyper-V
309      *      'd2v '  :  current_size     Disk2vhd
310      *      'tap\0' :  current_size     XenServer
311      *      'CTXS'  :  current_size     XenConverter
312      *
313      *  The user can override the table values via drive options, however
314      *  even with an override we will still use current_size for images
315      *  that have CHS geometry of the maximum size.
316      */
317     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
318                !!strncmp(footer->creator_app, "qem2", 4) &&
319                !!strncmp(footer->creator_app, "d2v ", 4) &&
320                !!strncmp(footer->creator_app, "CTXS", 4) &&
321                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
322 
323     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
324         bs->total_sectors = be64_to_cpu(footer->current_size) /
325                                         BDRV_SECTOR_SIZE;
326     }
327 
328     /* Allow a maximum disk size of 2040 GiB */
329     if (bs->total_sectors > VHD_MAX_SECTORS) {
330         ret = -EFBIG;
331         goto fail;
332     }
333 
334     if (disk_type == VHD_DYNAMIC) {
335         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
336                          HEADER_SIZE);
337         if (ret < 0) {
338             error_setg(errp, "Error reading dynamic VHD header");
339             goto fail;
340         }
341 
342         dyndisk_header = (VHDDynDiskHeader *) buf;
343 
344         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
345             error_setg(errp, "Invalid header magic");
346             ret = -EINVAL;
347             goto fail;
348         }
349 
350         s->block_size = be32_to_cpu(dyndisk_header->block_size);
351         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
352             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
353             ret = -EINVAL;
354             goto fail;
355         }
356         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
357 
358         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
359 
360         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
361             error_setg(errp, "Too many blocks");
362             ret = -EINVAL;
363             goto fail;
364         }
365 
366         computed_size = (uint64_t) s->max_table_entries * s->block_size;
367         if (computed_size < bs->total_sectors * 512) {
368             error_setg(errp, "Page table too small");
369             ret = -EINVAL;
370             goto fail;
371         }
372 
373         if (s->max_table_entries > SIZE_MAX / 4 ||
374             s->max_table_entries > (int) INT_MAX / 4) {
375             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
376                         s->max_table_entries);
377             ret = -EINVAL;
378             goto fail;
379         }
380 
381         pagetable_size = (uint64_t) s->max_table_entries * 4;
382 
383         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
384         if (s->pagetable == NULL) {
385             error_setg(errp, "Unable to allocate memory for page table");
386             ret = -ENOMEM;
387             goto fail;
388         }
389 
390         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
391 
392         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
393                          pagetable_size);
394         if (ret < 0) {
395             error_setg(errp, "Error reading pagetable");
396             goto fail;
397         }
398 
399         s->free_data_block_offset =
400             ROUND_UP(s->bat_offset + pagetable_size, 512);
401 
402         for (i = 0; i < s->max_table_entries; i++) {
403             be32_to_cpus(&s->pagetable[i]);
404             if (s->pagetable[i] != 0xFFFFFFFF) {
405                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
406                     s->bitmap_size + s->block_size;
407 
408                 if (next > s->free_data_block_offset) {
409                     s->free_data_block_offset = next;
410                 }
411             }
412         }
413 
414         if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) {
415             error_setg(errp, "block-vpc: free_data_block_offset points after "
416                              "the end of file. The image has been truncated.");
417             ret = -EINVAL;
418             goto fail;
419         }
420 
421         s->last_bitmap_offset = (int64_t) -1;
422 
423 #ifdef CACHE
424         s->pageentry_u8 = g_malloc(512);
425         s->pageentry_u32 = s->pageentry_u8;
426         s->pageentry_u16 = s->pageentry_u8;
427         s->last_pagetable = -1;
428 #endif
429     }
430 
431     /* Disable migration when VHD images are used */
432     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
433                "does not support live migration",
434                bdrv_get_device_or_node_name(bs));
435     ret = migrate_add_blocker(s->migration_blocker, &local_err);
436     if (local_err) {
437         error_propagate(errp, local_err);
438         error_free(s->migration_blocker);
439         goto fail;
440     }
441 
442     qemu_co_mutex_init(&s->lock);
443 
444     return 0;
445 
446 fail:
447     qemu_vfree(s->pagetable);
448 #ifdef CACHE
449     g_free(s->pageentry_u8);
450 #endif
451     return ret;
452 }
453 
454 static int vpc_reopen_prepare(BDRVReopenState *state,
455                               BlockReopenQueue *queue, Error **errp)
456 {
457     return 0;
458 }
459 
460 /*
461  * Returns the absolute byte offset of the given sector in the image file.
462  * If the sector is not allocated, -1 is returned instead.
463  *
464  * The parameter write must be 1 if the offset will be used for a write
465  * operation (the block bitmaps is updated then), 0 otherwise.
466  */
467 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
468                                        bool write)
469 {
470     BDRVVPCState *s = bs->opaque;
471     uint64_t bitmap_offset, block_offset;
472     uint32_t pagetable_index, offset_in_block;
473 
474     pagetable_index = offset / s->block_size;
475     offset_in_block = offset % s->block_size;
476 
477     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
478         return -1; /* not allocated */
479 
480     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
481     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
482 
483     /* We must ensure that we don't write to any sectors which are marked as
484        unused in the bitmap. We get away with setting all bits in the block
485        bitmap each time we write to a new block. This might cause Virtual PC to
486        miss sparse read optimization, but it's not a problem in terms of
487        correctness. */
488     if (write && (s->last_bitmap_offset != bitmap_offset)) {
489         uint8_t bitmap[s->bitmap_size];
490 
491         s->last_bitmap_offset = bitmap_offset;
492         memset(bitmap, 0xff, s->bitmap_size);
493         bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
494     }
495 
496     return block_offset;
497 }
498 
499 static inline int64_t get_sector_offset(BlockDriverState *bs,
500                                         int64_t sector_num, bool write)
501 {
502     return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
503 }
504 
505 /*
506  * Writes the footer to the end of the image file. This is needed when the
507  * file grows as it overwrites the old footer
508  *
509  * Returns 0 on success and < 0 on error
510  */
511 static int rewrite_footer(BlockDriverState* bs)
512 {
513     int ret;
514     BDRVVPCState *s = bs->opaque;
515     int64_t offset = s->free_data_block_offset;
516 
517     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
518     if (ret < 0)
519         return ret;
520 
521     return 0;
522 }
523 
524 /*
525  * Allocates a new block. This involves writing a new footer and updating
526  * the Block Allocation Table to use the space at the old end of the image
527  * file (overwriting the old footer)
528  *
529  * Returns the sectors' offset in the image file on success and < 0 on error
530  */
531 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
532 {
533     BDRVVPCState *s = bs->opaque;
534     int64_t bat_offset;
535     uint32_t index, bat_value;
536     int ret;
537     uint8_t bitmap[s->bitmap_size];
538 
539     /* Check if sector_num is valid */
540     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
541         return -EINVAL;
542     }
543 
544     /* Write entry into in-memory BAT */
545     index = offset / s->block_size;
546     assert(s->pagetable[index] == 0xFFFFFFFF);
547     s->pagetable[index] = s->free_data_block_offset / 512;
548 
549     /* Initialize the block's bitmap */
550     memset(bitmap, 0xff, s->bitmap_size);
551     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
552         s->bitmap_size);
553     if (ret < 0) {
554         return ret;
555     }
556 
557     /* Write new footer (the old one will be overwritten) */
558     s->free_data_block_offset += s->block_size + s->bitmap_size;
559     ret = rewrite_footer(bs);
560     if (ret < 0)
561         goto fail;
562 
563     /* Write BAT entry to disk */
564     bat_offset = s->bat_offset + (4 * index);
565     bat_value = cpu_to_be32(s->pagetable[index]);
566     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
567     if (ret < 0)
568         goto fail;
569 
570     return get_image_offset(bs, offset, false);
571 
572 fail:
573     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
574     return ret;
575 }
576 
577 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
578 {
579     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
580     VHDFooter *footer = (VHDFooter *) s->footer_buf;
581 
582     if (be32_to_cpu(footer->type) != VHD_FIXED) {
583         bdi->cluster_size = s->block_size;
584     }
585 
586     bdi->unallocated_blocks_are_zero = true;
587     return 0;
588 }
589 
590 static int coroutine_fn
591 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
592               QEMUIOVector *qiov, int flags)
593 {
594     BDRVVPCState *s = bs->opaque;
595     int ret;
596     int64_t image_offset;
597     int64_t n_bytes;
598     int64_t bytes_done = 0;
599     VHDFooter *footer = (VHDFooter *) s->footer_buf;
600     QEMUIOVector local_qiov;
601 
602     if (be32_to_cpu(footer->type) == VHD_FIXED) {
603         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
604     }
605 
606     qemu_co_mutex_lock(&s->lock);
607     qemu_iovec_init(&local_qiov, qiov->niov);
608 
609     while (bytes > 0) {
610         image_offset = get_image_offset(bs, offset, false);
611         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
612 
613         if (image_offset == -1) {
614             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
615         } else {
616             qemu_iovec_reset(&local_qiov);
617             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
618 
619             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
620                                  &local_qiov, 0);
621             if (ret < 0) {
622                 goto fail;
623             }
624         }
625 
626         bytes -= n_bytes;
627         offset += n_bytes;
628         bytes_done += n_bytes;
629     }
630 
631     ret = 0;
632 fail:
633     qemu_iovec_destroy(&local_qiov);
634     qemu_co_mutex_unlock(&s->lock);
635 
636     return ret;
637 }
638 
639 static int coroutine_fn
640 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
641                QEMUIOVector *qiov, int flags)
642 {
643     BDRVVPCState *s = bs->opaque;
644     int64_t image_offset;
645     int64_t n_bytes;
646     int64_t bytes_done = 0;
647     int ret;
648     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
649     QEMUIOVector local_qiov;
650 
651     if (be32_to_cpu(footer->type) == VHD_FIXED) {
652         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
653     }
654 
655     qemu_co_mutex_lock(&s->lock);
656     qemu_iovec_init(&local_qiov, qiov->niov);
657 
658     while (bytes > 0) {
659         image_offset = get_image_offset(bs, offset, true);
660         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
661 
662         if (image_offset == -1) {
663             image_offset = alloc_block(bs, offset);
664             if (image_offset < 0) {
665                 ret = image_offset;
666                 goto fail;
667             }
668         }
669 
670         qemu_iovec_reset(&local_qiov);
671         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
672 
673         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
674                               &local_qiov, 0);
675         if (ret < 0) {
676             goto fail;
677         }
678 
679         bytes -= n_bytes;
680         offset += n_bytes;
681         bytes_done += n_bytes;
682     }
683 
684     ret = 0;
685 fail:
686     qemu_iovec_destroy(&local_qiov);
687     qemu_co_mutex_unlock(&s->lock);
688 
689     return ret;
690 }
691 
692 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
693         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
694 {
695     BDRVVPCState *s = bs->opaque;
696     VHDFooter *footer = (VHDFooter*) s->footer_buf;
697     int64_t start, offset;
698     bool allocated;
699     int n;
700 
701     if (be32_to_cpu(footer->type) == VHD_FIXED) {
702         *pnum = nb_sectors;
703         *file = bs->file->bs;
704         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
705                (sector_num << BDRV_SECTOR_BITS);
706     }
707 
708     offset = get_sector_offset(bs, sector_num, 0);
709     start = offset;
710     allocated = (offset != -1);
711     *pnum = 0;
712 
713     do {
714         /* All sectors in a block are contiguous (without using the bitmap) */
715         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
716           - sector_num;
717         n = MIN(n, nb_sectors);
718 
719         *pnum += n;
720         sector_num += n;
721         nb_sectors -= n;
722         /* *pnum can't be greater than one block for allocated
723          * sectors since there is always a bitmap in between. */
724         if (allocated) {
725             *file = bs->file->bs;
726             return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
727         }
728         if (nb_sectors == 0) {
729             break;
730         }
731         offset = get_sector_offset(bs, sector_num, 0);
732     } while (offset == -1);
733 
734     return 0;
735 }
736 
737 /*
738  * Calculates the number of cylinders, heads and sectors per cylinder
739  * based on a given number of sectors. This is the algorithm described
740  * in the VHD specification.
741  *
742  * Note that the geometry doesn't always exactly match total_sectors but
743  * may round it down.
744  *
745  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
746  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
747  * and instead allow up to 255 heads.
748  */
749 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
750     uint8_t* heads, uint8_t* secs_per_cyl)
751 {
752     uint32_t cyls_times_heads;
753 
754     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
755 
756     if (total_sectors >= 65535LL * 16 * 63) {
757         *secs_per_cyl = 255;
758         *heads = 16;
759         cyls_times_heads = total_sectors / *secs_per_cyl;
760     } else {
761         *secs_per_cyl = 17;
762         cyls_times_heads = total_sectors / *secs_per_cyl;
763         *heads = (cyls_times_heads + 1023) / 1024;
764 
765         if (*heads < 4) {
766             *heads = 4;
767         }
768 
769         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
770             *secs_per_cyl = 31;
771             *heads = 16;
772             cyls_times_heads = total_sectors / *secs_per_cyl;
773         }
774 
775         if (cyls_times_heads >= (*heads * 1024)) {
776             *secs_per_cyl = 63;
777             *heads = 16;
778             cyls_times_heads = total_sectors / *secs_per_cyl;
779         }
780     }
781 
782     *cyls = cyls_times_heads / *heads;
783 
784     return 0;
785 }
786 
787 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
788                                int64_t total_sectors)
789 {
790     VHDDynDiskHeader *dyndisk_header =
791         (VHDDynDiskHeader *) buf;
792     size_t block_size, num_bat_entries;
793     int i;
794     int ret;
795     int64_t offset = 0;
796 
797     /* Write the footer (twice: at the beginning and at the end) */
798     block_size = 0x200000;
799     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
800 
801     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
802     if (ret < 0) {
803         goto fail;
804     }
805 
806     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
807     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
808     if (ret < 0) {
809         goto fail;
810     }
811 
812     /* Write the initial BAT */
813     offset = 3 * 512;
814 
815     memset(buf, 0xFF, 512);
816     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
817         ret = blk_pwrite(blk, offset, buf, 512, 0);
818         if (ret < 0) {
819             goto fail;
820         }
821         offset += 512;
822     }
823 
824     /* Prepare the Dynamic Disk Header */
825     memset(buf, 0, 1024);
826 
827     memcpy(dyndisk_header->magic, "cxsparse", 8);
828 
829     /*
830      * Note: The spec is actually wrong here for data_offset, it says
831      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
832      */
833     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
834     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
835     dyndisk_header->version = cpu_to_be32(0x00010000);
836     dyndisk_header->block_size = cpu_to_be32(block_size);
837     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
838 
839     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
840 
841     /* Write the header */
842     offset = 512;
843 
844     ret = blk_pwrite(blk, offset, buf, 1024, 0);
845     if (ret < 0) {
846         goto fail;
847     }
848 
849  fail:
850     return ret;
851 }
852 
853 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
854                              int64_t total_size, Error **errp)
855 {
856     int ret;
857 
858     /* Add footer to total size */
859     total_size += HEADER_SIZE;
860 
861     ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
862     if (ret < 0) {
863         return ret;
864     }
865 
866     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
867     if (ret < 0) {
868         error_setg_errno(errp, -ret, "Unable to write VHD header");
869         return ret;
870     }
871 
872     return ret;
873 }
874 
875 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
876 {
877     uint8_t buf[1024];
878     VHDFooter *footer = (VHDFooter *) buf;
879     char *disk_type_param;
880     int i;
881     uint16_t cyls = 0;
882     uint8_t heads = 0;
883     uint8_t secs_per_cyl = 0;
884     int64_t total_sectors;
885     int64_t total_size;
886     int disk_type;
887     int ret = -EIO;
888     bool force_size;
889     Error *local_err = NULL;
890     BlockBackend *blk = NULL;
891 
892     /* Read out options */
893     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
894                           BDRV_SECTOR_SIZE);
895     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
896     if (disk_type_param) {
897         if (!strcmp(disk_type_param, "dynamic")) {
898             disk_type = VHD_DYNAMIC;
899         } else if (!strcmp(disk_type_param, "fixed")) {
900             disk_type = VHD_FIXED;
901         } else {
902             error_setg(errp, "Invalid disk type, %s", disk_type_param);
903             ret = -EINVAL;
904             goto out;
905         }
906     } else {
907         disk_type = VHD_DYNAMIC;
908     }
909 
910     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
911 
912     ret = bdrv_create_file(filename, opts, &local_err);
913     if (ret < 0) {
914         error_propagate(errp, local_err);
915         goto out;
916     }
917 
918     blk = blk_new_open(filename, NULL, NULL,
919                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
920                        &local_err);
921     if (blk == NULL) {
922         error_propagate(errp, local_err);
923         ret = -EIO;
924         goto out;
925     }
926 
927     blk_set_allow_write_beyond_eof(blk, true);
928 
929     /*
930      * Calculate matching total_size and geometry. Increase the number of
931      * sectors requested until we get enough (or fail). This ensures that
932      * qemu-img convert doesn't truncate images, but rather rounds up.
933      *
934      * If the image size can't be represented by a spec conformant CHS geometry,
935      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
936      * the image size from the VHD footer to calculate total_sectors.
937      */
938     if (force_size) {
939         /* This will force the use of total_size for sector count, below */
940         cyls         = VHD_CHS_MAX_C;
941         heads        = VHD_CHS_MAX_H;
942         secs_per_cyl = VHD_CHS_MAX_S;
943     } else {
944         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
945         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
946             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
947         }
948     }
949 
950     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
951         total_sectors = total_size / BDRV_SECTOR_SIZE;
952         /* Allow a maximum disk size of 2040 GiB */
953         if (total_sectors > VHD_MAX_SECTORS) {
954             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
955             ret = -EFBIG;
956             goto out;
957         }
958     } else {
959         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
960         total_size = total_sectors * BDRV_SECTOR_SIZE;
961     }
962 
963     /* Prepare the Hard Disk Footer */
964     memset(buf, 0, 1024);
965 
966     memcpy(footer->creator, "conectix", 8);
967     if (force_size) {
968         memcpy(footer->creator_app, "qem2", 4);
969     } else {
970         memcpy(footer->creator_app, "qemu", 4);
971     }
972     memcpy(footer->creator_os, "Wi2k", 4);
973 
974     footer->features = cpu_to_be32(0x02);
975     footer->version = cpu_to_be32(0x00010000);
976     if (disk_type == VHD_DYNAMIC) {
977         footer->data_offset = cpu_to_be64(HEADER_SIZE);
978     } else {
979         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
980     }
981     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
982 
983     /* Version of Virtual PC 2007 */
984     footer->major = cpu_to_be16(0x0005);
985     footer->minor = cpu_to_be16(0x0003);
986     footer->orig_size = cpu_to_be64(total_size);
987     footer->current_size = cpu_to_be64(total_size);
988     footer->cyls = cpu_to_be16(cyls);
989     footer->heads = heads;
990     footer->secs_per_cyl = secs_per_cyl;
991 
992     footer->type = cpu_to_be32(disk_type);
993 
994     qemu_uuid_generate(&footer->uuid);
995 
996     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
997 
998     if (disk_type == VHD_DYNAMIC) {
999         ret = create_dynamic_disk(blk, buf, total_sectors);
1000         if (ret < 0) {
1001             error_setg(errp, "Unable to create or write VHD header");
1002         }
1003     } else {
1004         ret = create_fixed_disk(blk, buf, total_size, errp);
1005     }
1006 
1007 out:
1008     blk_unref(blk);
1009     g_free(disk_type_param);
1010     return ret;
1011 }
1012 
1013 static int vpc_has_zero_init(BlockDriverState *bs)
1014 {
1015     BDRVVPCState *s = bs->opaque;
1016     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1017 
1018     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1019         return bdrv_has_zero_init(bs->file->bs);
1020     } else {
1021         return 1;
1022     }
1023 }
1024 
1025 static void vpc_close(BlockDriverState *bs)
1026 {
1027     BDRVVPCState *s = bs->opaque;
1028     qemu_vfree(s->pagetable);
1029 #ifdef CACHE
1030     g_free(s->pageentry_u8);
1031 #endif
1032 
1033     migrate_del_blocker(s->migration_blocker);
1034     error_free(s->migration_blocker);
1035 }
1036 
1037 static QemuOptsList vpc_create_opts = {
1038     .name = "vpc-create-opts",
1039     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1040     .desc = {
1041         {
1042             .name = BLOCK_OPT_SIZE,
1043             .type = QEMU_OPT_SIZE,
1044             .help = "Virtual disk size"
1045         },
1046         {
1047             .name = BLOCK_OPT_SUBFMT,
1048             .type = QEMU_OPT_STRING,
1049             .help =
1050                 "Type of virtual hard disk format. Supported formats are "
1051                 "{dynamic (default) | fixed} "
1052         },
1053         {
1054             .name = VPC_OPT_FORCE_SIZE,
1055             .type = QEMU_OPT_BOOL,
1056             .help = "Force disk size calculation to use the actual size "
1057                     "specified, rather than using the nearest CHS-based "
1058                     "calculation"
1059         },
1060         { /* end of list */ }
1061     }
1062 };
1063 
1064 static BlockDriver bdrv_vpc = {
1065     .format_name    = "vpc",
1066     .instance_size  = sizeof(BDRVVPCState),
1067 
1068     .bdrv_probe             = vpc_probe,
1069     .bdrv_open              = vpc_open,
1070     .bdrv_close             = vpc_close,
1071     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1072     .bdrv_child_perm        = bdrv_format_default_perms,
1073     .bdrv_create            = vpc_create,
1074 
1075     .bdrv_co_preadv             = vpc_co_preadv,
1076     .bdrv_co_pwritev            = vpc_co_pwritev,
1077     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1078 
1079     .bdrv_get_info          = vpc_get_info,
1080 
1081     .create_opts            = &vpc_create_opts,
1082     .bdrv_has_zero_init     = vpc_has_zero_init,
1083 };
1084 
/* Hand the "vpc" driver description (bdrv_vpc) to bdrv_register(). */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/*
 * NOTE(review): presumably schedules bdrv_vpc_init() to run during block
 * module initialisation -- confirm against the block_init() definition.
 */
block_init(bdrv_vpc_init);
1091