xref: /openbmc/qemu/block/vpc.c (revision dbdf841b)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "block/qdict.h"
30 #include "sysemu/block-backend.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "migration/blocker.h"
34 #include "qemu/bswap.h"
35 #include "qemu/uuid.h"
36 #include "qemu/memalign.h"
37 #include "qapi/qmp/qdict.h"
38 #include "qapi/qobject-input-visitor.h"
39 #include "qapi/qapi-visit-block-core.h"
40 
41 /**************************************************************/
42 
43 //#define CACHE
44 
/*
 * Disk type codes. vpc_open() compares the footer's big-endian 'type' field
 * against these values after converting it with be32_to_cpu().
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};
50 
/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC). VHD timestamps count seconds
 * from that instant (see the 'timestamp' field of the footer).
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder / head / sectors-per-track values of a VHD CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/*
 * Sector count of the maximum CHS geometry. An image whose geometry equals
 * this value takes its size from the footer's current_size instead.
 */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/*
 * NOTE(review): presumably the name of the image creation option matching
 * vpc_opts->force_size; its use is outside the code visible here - confirm.
 */
#define VPC_OPT_FORCE_SIZE "force_size"
62 
/*
 * VHD footer as stored in the image file; all multi-byte fields are always
 * big-endian. vpc_open() looks for it at offset 0 first and, failing that,
 * in the last 512 bytes of the file (fixed disks).
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Disk size in bytes; vpc_open() divides it by BDRV_SECTOR_SIZE */
    uint64_t    current_size;

    /* CHS geometry; vpc_open() multiplies the three to get a sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
    uint8_t     reserved[427];
} QEMU_PACKED VHDFooter;

/* The packed footer must be exactly 512 bytes */
QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
101 
/*
 * Dynamic disk header. vpc_open() reads it from the file offset given by
 * the footer's data_offset and converts its fields with be32_to_cpu() /
 * be64_to_cpu(), i.e. the stored representation is big-endian.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
    uint8_t     reserved2[256];
} QEMU_PACKED VHDDynDiskHeader;

/* The packed header must be exactly 1024 bytes */
QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
136 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Taken by the read, write and block-status paths of dynamic images */
    CoMutex lock;
    /* Copy of the image footer, kept in its big-endian file representation */
    VHDFooter footer;
    /*
     * File offset at which alloc_block() places the next data block and at
     * which rewrite_footer() writes the footer.
     */
    uint64_t free_data_block_offset;
    /* Number of BAT entries, from the dynamic disk header */
    int max_table_entries;
    /* BAT in host byte order; 0xFFFFFFFF marks an unallocated block */
    uint32_t *pagetable;
    /* File offset of the BAT */
    uint64_t bat_offset;
    /* Offset of the block bitmap last rewritten by get_image_offset() */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block and of the bitmap preceding it */
    uint32_t block_size;
    uint32_t bitmap_size;
    /* Set by vpc_parse_options() from the "force_size_calc" option */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    /*
     * NOTE(review): CACHE is not defined in this file. The CACHE code in
     * vpc_open() assigns s->last_pagetable, which is not declared here.
     */
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Blocker registered with migrate_add_blocker() in vpc_open() */
    Error *migration_blocker;
} BDRVVPCState;
161 
/* Name of the runtime option read by vpc_parse_options() */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime (open-time) options of the driver. vpc_open() absorbs them from
 * the options QDict before parsing.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};

/* Forward declaration; the initialized definition is not in the code shown */
static QemuOptsList vpc_create_opts;
179 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * (wrapping, 32-bit) sum of all bytes.
 *
 * @p:    start of the buffer; it is only read
 * @size: number of bytes to sum
 *
 * Returns the checksum in host byte order. For the footer, the caller has
 * to zero the checksum field before calling this.
 */
static uint32_t vpc_checksum(void *p, size_t size)
{
    const uint8_t *buf = p;
    uint32_t res = 0;
    size_t i; /* same type as the bound: no signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
191 
192 
/*
 * Format probe: scores 100 when the first 8 bytes of @buf are the VHD
 * footer magic "conectix", and 0 otherwise (including when fewer than
 * 8 bytes are available). @filename is not used.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    if (buf_size < 8) {
        return 0;
    }

    return strncmp((char *)buf, "conectix", 8) == 0 ? 100 : 0;
}
199 
200 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
201                               Error **errp)
202 {
203     BDRVVPCState *s = bs->opaque;
204     const char *size_calc;
205 
206     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
207 
208     if (!size_calc) {
209        /* no override, use autodetect only */
210     } else if (!strcmp(size_calc, "current_size")) {
211         s->force_use_sz = true;
212     } else if (!strcmp(size_calc, "chs")) {
213         s->force_use_chs = true;
214     } else {
215         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
216     }
217 }
218 
219 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
220                     Error **errp)
221 {
222     BDRVVPCState *s = bs->opaque;
223     int i;
224     VHDFooter *footer;
225     QemuOpts *opts = NULL;
226     Error *local_err = NULL;
227     bool use_chs;
228     VHDDynDiskHeader dyndisk_header;
229     uint32_t checksum;
230     uint64_t computed_size;
231     uint64_t pagetable_size;
232     int disk_type = VHD_DYNAMIC;
233     int ret;
234     int64_t bs_size;
235 
236     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
237     if (ret < 0) {
238         return ret;
239     }
240 
241     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
242     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
243         ret = -EINVAL;
244         goto fail;
245     }
246 
247     vpc_parse_options(bs, opts, &local_err);
248     if (local_err) {
249         error_propagate(errp, local_err);
250         ret = -EINVAL;
251         goto fail;
252     }
253 
254     ret = bdrv_pread(bs->file, 0, sizeof(s->footer), &s->footer, 0);
255     if (ret < 0) {
256         error_setg(errp, "Unable to read VHD header");
257         goto fail;
258     }
259 
260     footer = &s->footer;
261     if (strncmp(footer->creator, "conectix", 8)) {
262         int64_t offset = bdrv_getlength(bs->file->bs);
263         if (offset < 0) {
264             ret = offset;
265             error_setg(errp, "Invalid file size");
266             goto fail;
267         } else if (offset < sizeof(*footer)) {
268             ret = -EINVAL;
269             error_setg(errp, "File too small for a VHD header");
270             goto fail;
271         }
272 
273         /* If a fixed disk, the footer is found only at the end of the file */
274         ret = bdrv_pread(bs->file, offset - sizeof(*footer), sizeof(*footer),
275                          footer, 0);
276         if (ret < 0) {
277             goto fail;
278         }
279         if (strncmp(footer->creator, "conectix", 8) ||
280             be32_to_cpu(footer->type) != VHD_FIXED) {
281             error_setg(errp, "invalid VPC image");
282             ret = -EINVAL;
283             goto fail;
284         }
285         disk_type = VHD_FIXED;
286     }
287 
288     checksum = be32_to_cpu(footer->checksum);
289     footer->checksum = 0;
290     if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
291         error_setg(errp, "Incorrect header checksum");
292         ret = -EINVAL;
293         goto fail;
294     }
295 
296     /* Write 'checksum' back to footer, or else will leave it with zero. */
297     footer->checksum = cpu_to_be32(checksum);
298 
299     /* The visible size of a image in Virtual PC depends on the geometry
300        rather than on the size stored in the footer (the size in the footer
301        is too large usually) */
302     bs->total_sectors = (int64_t)
303         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
304 
305     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
306      * VHD image sizes differently.  VPC will rely on CHS geometry,
307      * while Hyper-V and disk2vhd use the size specified in the footer.
308      *
309      * We use a couple of approaches to try and determine the correct method:
310      * look at the Creator App field, and look for images that have CHS
311      * geometry that is the maximum value.
312      *
313      * If the CHS geometry is the maximum CHS geometry, then we assume that
314      * the size is the footer->current_size to avoid truncation.  Otherwise,
315      * we follow the table based on footer->creator_app:
316      *
317      *  Known creator apps:
318      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
319      *      'qemu'  :  CHS              QEMU (uses disk geometry)
320      *      'qem2'  :  current_size     QEMU (uses current_size)
321      *      'win '  :  current_size     Hyper-V
322      *      'd2v '  :  current_size     Disk2vhd
323      *      'tap\0' :  current_size     XenServer
324      *      'CTXS'  :  current_size     XenConverter
325      *
326      *  The user can override the table values via drive options, however
327      *  even with an override we will still use current_size for images
328      *  that have CHS geometry of the maximum size.
329      */
330     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
331                !!strncmp(footer->creator_app, "qem2", 4) &&
332                !!strncmp(footer->creator_app, "d2v ", 4) &&
333                !!strncmp(footer->creator_app, "CTXS", 4) &&
334                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
335 
336     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
337         bs->total_sectors = be64_to_cpu(footer->current_size) /
338                                         BDRV_SECTOR_SIZE;
339     }
340 
341     /* Allow a maximum disk size of 2040 GiB */
342     if (bs->total_sectors > VHD_MAX_SECTORS) {
343         ret = -EFBIG;
344         goto fail;
345     }
346 
347     if (disk_type == VHD_DYNAMIC) {
348         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
349                          sizeof(dyndisk_header), &dyndisk_header, 0);
350         if (ret < 0) {
351             error_setg(errp, "Error reading dynamic VHD header");
352             goto fail;
353         }
354 
355         if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
356             error_setg(errp, "Invalid header magic");
357             ret = -EINVAL;
358             goto fail;
359         }
360 
361         s->block_size = be32_to_cpu(dyndisk_header.block_size);
362         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
363             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
364             ret = -EINVAL;
365             goto fail;
366         }
367         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
368 
369         s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
370 
371         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
372             error_setg(errp, "Too many blocks");
373             ret = -EINVAL;
374             goto fail;
375         }
376 
377         computed_size = (uint64_t) s->max_table_entries * s->block_size;
378         if (computed_size < bs->total_sectors * 512) {
379             error_setg(errp, "Page table too small");
380             ret = -EINVAL;
381             goto fail;
382         }
383 
384         if (s->max_table_entries > SIZE_MAX / 4 ||
385             s->max_table_entries > (int) INT_MAX / 4) {
386             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
387                         s->max_table_entries);
388             ret = -EINVAL;
389             goto fail;
390         }
391 
392         pagetable_size = (uint64_t) s->max_table_entries * 4;
393 
394         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
395         if (s->pagetable == NULL) {
396             error_setg(errp, "Unable to allocate memory for page table");
397             ret = -ENOMEM;
398             goto fail;
399         }
400 
401         s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
402 
403         ret = bdrv_pread(bs->file, s->bat_offset, pagetable_size,
404                          s->pagetable, 0);
405         if (ret < 0) {
406             error_setg(errp, "Error reading pagetable");
407             goto fail;
408         }
409 
410         s->free_data_block_offset =
411             ROUND_UP(s->bat_offset + pagetable_size, 512);
412 
413         for (i = 0; i < s->max_table_entries; i++) {
414             be32_to_cpus(&s->pagetable[i]);
415             if (s->pagetable[i] != 0xFFFFFFFF) {
416                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
417                     s->bitmap_size + s->block_size;
418 
419                 if (next > s->free_data_block_offset) {
420                     s->free_data_block_offset = next;
421                 }
422             }
423         }
424 
425         bs_size = bdrv_getlength(bs->file->bs);
426         if (bs_size < 0) {
427             error_setg_errno(errp, -bs_size, "Unable to learn image size");
428             ret = bs_size;
429             goto fail;
430         }
431         if (s->free_data_block_offset > bs_size) {
432             error_setg(errp, "block-vpc: free_data_block_offset points after "
433                              "the end of file. The image has been truncated.");
434             ret = -EINVAL;
435             goto fail;
436         }
437 
438         s->last_bitmap_offset = (int64_t) -1;
439 
440 #ifdef CACHE
441         s->pageentry_u8 = g_malloc(512);
442         s->pageentry_u32 = s->pageentry_u8;
443         s->pageentry_u16 = s->pageentry_u8;
444         s->last_pagetable = -1;
445 #endif
446     }
447 
448     /* Disable migration when VHD images are used */
449     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
450                "does not support live migration",
451                bdrv_get_device_or_node_name(bs));
452     ret = migrate_add_blocker(s->migration_blocker, errp);
453     if (ret < 0) {
454         error_free(s->migration_blocker);
455         goto fail;
456     }
457 
458     qemu_co_mutex_init(&s->lock);
459     qemu_opts_del(opts);
460 
461     return 0;
462 
463 fail:
464     qemu_opts_del(opts);
465     qemu_vfree(s->pagetable);
466 #ifdef CACHE
467     g_free(s->pageentry_u8);
468 #endif
469     return ret;
470 }
471 
472 static int vpc_reopen_prepare(BDRVReopenState *state,
473                               BlockReopenQueue *queue, Error **errp)
474 {
475     return 0;
476 }
477 
478 /*
479  * Returns the absolute byte offset of the given sector in the image file.
480  * If the sector is not allocated, -1 is returned instead.
481  * If an error occurred trying to write an updated block bitmap back to
482  * the file, -2 is returned, and the error value is written to *err.
483  * This can only happen for a write operation.
484  *
485  * The parameter write must be 1 if the offset will be used for a write
486  * operation (the block bitmaps is updated then), 0 otherwise.
487  * If write is true then err must not be NULL.
488  */
489 static int64_t coroutine_fn GRAPH_RDLOCK
490 get_image_offset(BlockDriverState *bs, uint64_t offset, bool write, int *err)
491 {
492     BDRVVPCState *s = bs->opaque;
493     uint64_t bitmap_offset, block_offset;
494     uint32_t pagetable_index, offset_in_block;
495 
496     assert(!(write && err == NULL));
497 
498     pagetable_index = offset / s->block_size;
499     offset_in_block = offset % s->block_size;
500 
501     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
502         return -1; /* not allocated */
503 
504     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
505     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
506 
507     /* We must ensure that we don't write to any sectors which are marked as
508        unused in the bitmap. We get away with setting all bits in the block
509        bitmap each time we write to a new block. This might cause Virtual PC to
510        miss sparse read optimization, but it's not a problem in terms of
511        correctness. */
512     if (write && (s->last_bitmap_offset != bitmap_offset)) {
513         uint8_t bitmap[s->bitmap_size];
514         int r;
515 
516         s->last_bitmap_offset = bitmap_offset;
517         memset(bitmap, 0xff, s->bitmap_size);
518         r = bdrv_co_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap, 0);
519         if (r < 0) {
520             *err = r;
521             return -2;
522         }
523     }
524 
525     return block_offset;
526 }
527 
528 /*
529  * Writes the footer to the end of the image file. This is needed when the
530  * file grows as it overwrites the old footer
531  *
532  * Returns 0 on success and < 0 on error
533  */
534 static int coroutine_fn GRAPH_RDLOCK rewrite_footer(BlockDriverState *bs)
535 {
536     int ret;
537     BDRVVPCState *s = bs->opaque;
538     int64_t offset = s->free_data_block_offset;
539 
540     ret = bdrv_co_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
541     if (ret < 0)
542         return ret;
543 
544     return 0;
545 }
546 
547 /*
548  * Allocates a new block. This involves writing a new footer and updating
549  * the Block Allocation Table to use the space at the old end of the image
550  * file (overwriting the old footer)
551  *
552  * Returns the sectors' offset in the image file on success and < 0 on error
553  */
554 static int64_t coroutine_fn GRAPH_RDLOCK
555 alloc_block(BlockDriverState *bs, int64_t offset)
556 {
557     BDRVVPCState *s = bs->opaque;
558     int64_t bat_offset;
559     uint32_t index, bat_value;
560     int ret;
561     uint8_t bitmap[s->bitmap_size];
562 
563     /* Check if sector_num is valid */
564     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
565         return -EINVAL;
566     }
567 
568     /* Write entry into in-memory BAT */
569     index = offset / s->block_size;
570     assert(s->pagetable[index] == 0xFFFFFFFF);
571     s->pagetable[index] = s->free_data_block_offset / 512;
572 
573     /* Initialize the block's bitmap */
574     memset(bitmap, 0xff, s->bitmap_size);
575     ret = bdrv_co_pwrite_sync(bs->file, s->free_data_block_offset,
576                               s->bitmap_size, bitmap, 0);
577     if (ret < 0) {
578         return ret;
579     }
580 
581     /* Write new footer (the old one will be overwritten) */
582     s->free_data_block_offset += s->block_size + s->bitmap_size;
583     ret = rewrite_footer(bs);
584     if (ret < 0)
585         goto fail;
586 
587     /* Write BAT entry to disk */
588     bat_offset = s->bat_offset + (4 * index);
589     bat_value = cpu_to_be32(s->pagetable[index]);
590     ret = bdrv_co_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
591     if (ret < 0)
592         goto fail;
593 
594     return get_image_offset(bs, offset, false, NULL);
595 
596 fail:
597     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
598     return ret;
599 }
600 
601 static int coroutine_fn
602 vpc_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
603 {
604     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
605 
606     if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
607         bdi->cluster_size = s->block_size;
608     }
609 
610     return 0;
611 }
612 
613 static int coroutine_fn GRAPH_RDLOCK
614 vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
615               QEMUIOVector *qiov, BdrvRequestFlags flags)
616 {
617     BDRVVPCState *s = bs->opaque;
618     int ret;
619     int64_t image_offset;
620     int64_t n_bytes;
621     int64_t bytes_done = 0;
622     QEMUIOVector local_qiov;
623 
624     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
625         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
626     }
627 
628     qemu_co_mutex_lock(&s->lock);
629     qemu_iovec_init(&local_qiov, qiov->niov);
630 
631     while (bytes > 0) {
632         image_offset = get_image_offset(bs, offset, false, NULL);
633         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
634 
635         if (image_offset == -1) {
636             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
637         } else {
638             qemu_iovec_reset(&local_qiov);
639             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
640 
641             qemu_co_mutex_unlock(&s->lock);
642             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
643                                  &local_qiov, 0);
644             qemu_co_mutex_lock(&s->lock);
645             if (ret < 0) {
646                 goto fail;
647             }
648         }
649 
650         bytes -= n_bytes;
651         offset += n_bytes;
652         bytes_done += n_bytes;
653     }
654 
655     ret = 0;
656 fail:
657     qemu_iovec_destroy(&local_qiov);
658     qemu_co_mutex_unlock(&s->lock);
659 
660     return ret;
661 }
662 
663 static int coroutine_fn GRAPH_RDLOCK
664 vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
665                QEMUIOVector *qiov, BdrvRequestFlags flags)
666 {
667     BDRVVPCState *s = bs->opaque;
668     int64_t image_offset;
669     int64_t n_bytes;
670     int64_t bytes_done = 0;
671     int ret = 0;
672     QEMUIOVector local_qiov;
673 
674     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
675         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
676     }
677 
678     qemu_co_mutex_lock(&s->lock);
679     qemu_iovec_init(&local_qiov, qiov->niov);
680 
681     while (bytes > 0) {
682         image_offset = get_image_offset(bs, offset, true, &ret);
683         if (image_offset == -2) {
684             /* Failed to write block bitmap: can't proceed with write */
685             goto fail;
686         }
687         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
688 
689         if (image_offset == -1) {
690             image_offset = alloc_block(bs, offset);
691             if (image_offset < 0) {
692                 ret = image_offset;
693                 goto fail;
694             }
695         }
696 
697         qemu_iovec_reset(&local_qiov);
698         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
699 
700         qemu_co_mutex_unlock(&s->lock);
701         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
702                               &local_qiov, 0);
703         qemu_co_mutex_lock(&s->lock);
704         if (ret < 0) {
705             goto fail;
706         }
707 
708         bytes -= n_bytes;
709         offset += n_bytes;
710         bytes_done += n_bytes;
711     }
712 
713     ret = 0;
714 fail:
715     qemu_iovec_destroy(&local_qiov);
716     qemu_co_mutex_unlock(&s->lock);
717 
718     return ret;
719 }
720 
721 static int coroutine_fn GRAPH_RDLOCK
722 vpc_co_block_status(BlockDriverState *bs, bool want_zero,
723                     int64_t offset, int64_t bytes,
724                     int64_t *pnum, int64_t *map,
725                     BlockDriverState **file)
726 {
727     BDRVVPCState *s = bs->opaque;
728     int64_t image_offset;
729     bool allocated;
730     int ret;
731     int64_t n;
732 
733     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
734         *pnum = bytes;
735         *map = offset;
736         *file = bs->file->bs;
737         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
738     }
739 
740     qemu_co_mutex_lock(&s->lock);
741 
742     image_offset = get_image_offset(bs, offset, false, NULL);
743     allocated = (image_offset != -1);
744     *pnum = 0;
745     ret = BDRV_BLOCK_ZERO;
746 
747     do {
748         /* All sectors in a block are contiguous (without using the bitmap) */
749         n = ROUND_UP(offset + 1, s->block_size) - offset;
750         n = MIN(n, bytes);
751 
752         *pnum += n;
753         offset += n;
754         bytes -= n;
755         /* *pnum can't be greater than one block for allocated
756          * sectors since there is always a bitmap in between. */
757         if (allocated) {
758             *file = bs->file->bs;
759             *map = image_offset;
760             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
761             break;
762         }
763         if (bytes == 0) {
764             break;
765         }
766         image_offset = get_image_offset(bs, offset, false, NULL);
767     } while (image_offset == -1);
768 
769     qemu_co_mutex_unlock(&s->lock);
770     return ret;
771 }
772 
773 /*
774  * Calculates the number of cylinders, heads and sectors per cylinder
775  * based on a given number of sectors. This is the algorithm described
776  * in the VHD specification.
777  *
778  * Note that the geometry doesn't always exactly match total_sectors but
779  * may round it down.
780  *
781  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
782  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
783  * and instead allow up to 255 heads.
784  */
785 static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
786     uint8_t *heads, uint8_t *secs_per_cyl)
787 {
788     uint32_t cyls_times_heads;
789 
790     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
791 
792     if (total_sectors >= 65535LL * 16 * 63) {
793         *secs_per_cyl = 255;
794         *heads = 16;
795         cyls_times_heads = total_sectors / *secs_per_cyl;
796     } else {
797         *secs_per_cyl = 17;
798         cyls_times_heads = total_sectors / *secs_per_cyl;
799         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
800 
801         if (*heads < 4) {
802             *heads = 4;
803         }
804 
805         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
806             *secs_per_cyl = 31;
807             *heads = 16;
808             cyls_times_heads = total_sectors / *secs_per_cyl;
809         }
810 
811         if (cyls_times_heads >= (*heads * 1024)) {
812             *secs_per_cyl = 63;
813             *heads = 16;
814             cyls_times_heads = total_sectors / *secs_per_cyl;
815         }
816     }
817 
818     *cyls = cyls_times_heads / *heads;
819 
820     return 0;
821 }
822 
823 static int coroutine_fn create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
824                                             int64_t total_sectors)
825 {
826     VHDDynDiskHeader dyndisk_header;
827     uint8_t bat_sector[512];
828     size_t block_size, num_bat_entries;
829     int i;
830     int ret;
831     int64_t offset = 0;
832 
833     /* Write the footer (twice: at the beginning and at the end) */
834     block_size = 0x200000;
835     num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
836 
837     ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
838     if (ret < 0) {
839         goto fail;
840     }
841 
842     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
843     ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
844     if (ret < 0) {
845         goto fail;
846     }
847 
848     /* Write the initial BAT */
849     offset = 3 * 512;
850 
851     memset(bat_sector, 0xFF, 512);
852     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
853         ret = blk_co_pwrite(blk, offset, 512, bat_sector, 0);
854         if (ret < 0) {
855             goto fail;
856         }
857         offset += 512;
858     }
859 
860     /* Prepare the Dynamic Disk Header */
861     memset(&dyndisk_header, 0, sizeof(dyndisk_header));
862 
863     memcpy(dyndisk_header.magic, "cxsparse", 8);
864 
865     /*
866      * Note: The spec is actually wrong here for data_offset, it says
867      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
868      */
869     dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
870     dyndisk_header.table_offset = cpu_to_be64(3 * 512);
871     dyndisk_header.version = cpu_to_be32(0x00010000);
872     dyndisk_header.block_size = cpu_to_be32(block_size);
873     dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
874 
875     dyndisk_header.checksum = cpu_to_be32(
876         vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
877 
878     /* Write the header */
879     offset = 512;
880 
881     ret = blk_co_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
882     if (ret < 0) {
883         goto fail;
884     }
885 
886     ret = 0;
887  fail:
888     return ret;
889 }
890 
891 static int coroutine_fn create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
892                                           int64_t total_size, Error **errp)
893 {
894     int ret;
895 
896     /* Add footer to total size */
897     total_size += sizeof(*footer);
898 
899     ret = blk_co_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
900     if (ret < 0) {
901         return ret;
902     }
903 
904     ret = blk_co_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
905                         footer, 0);
906     if (ret < 0) {
907         error_setg_errno(errp, -ret, "Unable to write VHD header");
908         return ret;
909     }
910 
911     return 0;
912 }
913 
914 static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
915                                         uint16_t *out_cyls,
916                                         uint8_t *out_heads,
917                                         uint8_t *out_secs_per_cyl,
918                                         int64_t *out_total_sectors,
919                                         Error **errp)
920 {
921     int64_t total_size = vpc_opts->size;
922     uint16_t cyls = 0;
923     uint8_t heads = 0;
924     uint8_t secs_per_cyl = 0;
925     int64_t total_sectors;
926     int i;
927 
928     /*
929      * Calculate matching total_size and geometry. Increase the number of
930      * sectors requested until we get enough (or fail). This ensures that
931      * qemu-img convert doesn't truncate images, but rather rounds up.
932      *
933      * If the image size can't be represented by a spec conformant CHS geometry,
934      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
935      * the image size from the VHD footer to calculate total_sectors.
936      */
937     if (vpc_opts->force_size) {
938         /* This will force the use of total_size for sector count, below */
939         cyls         = VHD_CHS_MAX_C;
940         heads        = VHD_CHS_MAX_H;
941         secs_per_cyl = VHD_CHS_MAX_S;
942     } else {
943         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
944         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
945             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
946         }
947     }
948 
949     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
950         total_sectors = total_size / BDRV_SECTOR_SIZE;
951         /* Allow a maximum disk size of 2040 GiB */
952         if (total_sectors > VHD_MAX_SECTORS) {
953             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
954             return -EFBIG;
955         }
956     } else {
957         total_sectors = (int64_t) cyls * heads * secs_per_cyl;
958     }
959 
960     *out_total_sectors = total_sectors;
961     if (out_cyls) {
962         *out_cyls = cyls;
963         *out_heads = heads;
964         *out_secs_per_cyl = secs_per_cyl;
965     }
966 
967     return 0;
968 }
969 
970 static int coroutine_fn GRAPH_UNLOCKED
971 vpc_co_create(BlockdevCreateOptions *opts, Error **errp)
972 {
973     BlockdevCreateOptionsVpc *vpc_opts;
974     BlockBackend *blk = NULL;
975     BlockDriverState *bs = NULL;
976 
977     VHDFooter footer;
978     uint16_t cyls = 0;
979     uint8_t heads = 0;
980     uint8_t secs_per_cyl = 0;
981     int64_t total_sectors;
982     int64_t total_size;
983     int disk_type;
984     int ret = -EIO;
985     QemuUUID uuid;
986 
987     assert(opts->driver == BLOCKDEV_DRIVER_VPC);
988     vpc_opts = &opts->u.vpc;
989 
990     /* Validate options and set default values */
991     total_size = vpc_opts->size;
992 
993     if (!vpc_opts->has_subformat) {
994         vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
995     }
996     switch (vpc_opts->subformat) {
997     case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
998         disk_type = VHD_DYNAMIC;
999         break;
1000     case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1001         disk_type = VHD_FIXED;
1002         break;
1003     default:
1004         g_assert_not_reached();
1005     }
1006 
1007     /* Create BlockBackend to write to the image */
1008     bs = bdrv_co_open_blockdev_ref(vpc_opts->file, errp);
1009     if (bs == NULL) {
1010         return -EIO;
1011     }
1012 
1013     blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1014                              errp);
1015     if (!blk) {
1016         ret = -EPERM;
1017         goto out;
1018     }
1019     blk_set_allow_write_beyond_eof(blk, true);
1020 
1021     /* Get geometry and check that it matches the image size*/
1022     ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1023                                        &total_sectors, errp);
1024     if (ret < 0) {
1025         goto out;
1026     }
1027 
1028     if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1029         error_setg(errp, "The requested image size cannot be represented in "
1030                          "CHS geometry");
1031         error_append_hint(errp, "Try size=%llu or force-size=on (the "
1032                                 "latter makes the image incompatible with "
1033                                 "Virtual PC)",
1034                           total_sectors * BDRV_SECTOR_SIZE);
1035         ret = -EINVAL;
1036         goto out;
1037     }
1038 
1039     /* Prepare the Hard Disk Footer */
1040     memset(&footer, 0, sizeof(footer));
1041 
1042     memcpy(footer.creator, "conectix", 8);
1043     if (vpc_opts->force_size) {
1044         memcpy(footer.creator_app, "qem2", 4);
1045     } else {
1046         memcpy(footer.creator_app, "qemu", 4);
1047     }
1048     memcpy(footer.creator_os, "Wi2k", 4);
1049 
1050     footer.features = cpu_to_be32(0x02);
1051     footer.version = cpu_to_be32(0x00010000);
1052     if (disk_type == VHD_DYNAMIC) {
1053         footer.data_offset = cpu_to_be64(sizeof(footer));
1054     } else {
1055         footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1056     }
1057     footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1058 
1059     /* Version of Virtual PC 2007 */
1060     footer.major = cpu_to_be16(0x0005);
1061     footer.minor = cpu_to_be16(0x0003);
1062     footer.orig_size = cpu_to_be64(total_size);
1063     footer.current_size = cpu_to_be64(total_size);
1064     footer.cyls = cpu_to_be16(cyls);
1065     footer.heads = heads;
1066     footer.secs_per_cyl = secs_per_cyl;
1067 
1068     footer.type = cpu_to_be32(disk_type);
1069 
1070     qemu_uuid_generate(&uuid);
1071     footer.uuid = uuid;
1072 
1073     footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1074 
1075     if (disk_type == VHD_DYNAMIC) {
1076         ret = create_dynamic_disk(blk, &footer, total_sectors);
1077         if (ret < 0) {
1078             error_setg(errp, "Unable to create or write VHD header");
1079         }
1080     } else {
1081         ret = create_fixed_disk(blk, &footer, total_size, errp);
1082     }
1083 
1084 out:
1085     blk_co_unref(blk);
1086     bdrv_co_unref(bs);
1087     return ret;
1088 }
1089 
1090 static int coroutine_fn GRAPH_UNLOCKED
1091 vpc_co_create_opts(BlockDriver *drv, const char *filename,
1092                    QemuOpts *opts, Error **errp)
1093 {
1094     BlockdevCreateOptions *create_options = NULL;
1095     QDict *qdict;
1096     Visitor *v;
1097     BlockDriverState *bs = NULL;
1098     int ret;
1099 
1100     static const QDictRenames opt_renames[] = {
1101         { VPC_OPT_FORCE_SIZE,           "force-size" },
1102         { NULL, NULL },
1103     };
1104 
1105     /* Parse options and convert legacy syntax */
1106     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1107 
1108     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1109         ret = -EINVAL;
1110         goto fail;
1111     }
1112 
1113     /* Create and open the file (protocol layer) */
1114     ret = bdrv_co_create_file(filename, opts, errp);
1115     if (ret < 0) {
1116         goto fail;
1117     }
1118 
1119     bs = bdrv_co_open(filename, NULL, NULL,
1120                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1121     if (bs == NULL) {
1122         ret = -EIO;
1123         goto fail;
1124     }
1125 
1126     /* Now get the QAPI type BlockdevCreateOptions */
1127     qdict_put_str(qdict, "driver", "vpc");
1128     qdict_put_str(qdict, "file", bs->node_name);
1129 
1130     v = qobject_input_visitor_new_flat_confused(qdict, errp);
1131     if (!v) {
1132         ret = -EINVAL;
1133         goto fail;
1134     }
1135 
1136     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1137     visit_free(v);
1138     if (!create_options) {
1139         ret = -EINVAL;
1140         goto fail;
1141     }
1142 
1143     /* Silently round up size */
1144     assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1145     create_options->u.vpc.size =
1146         ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1147 
1148     if (!create_options->u.vpc.force_size) {
1149         int64_t total_sectors;
1150         ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1151                                            NULL, &total_sectors, errp);
1152         if (ret < 0) {
1153             goto fail;
1154         }
1155 
1156         create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1157     }
1158 
1159 
1160     /* Create the vpc image (format layer) */
1161     ret = vpc_co_create(create_options, errp);
1162 
1163 fail:
1164     qobject_unref(qdict);
1165     bdrv_co_unref(bs);
1166     qapi_free_BlockdevCreateOptions(create_options);
1167     return ret;
1168 }
1169 
1170 
1171 static int vpc_has_zero_init(BlockDriverState *bs)
1172 {
1173     BDRVVPCState *s = bs->opaque;
1174 
1175     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1176         return bdrv_has_zero_init(bs->file->bs);
1177     } else {
1178         return 1;
1179     }
1180 }
1181 
1182 static void vpc_close(BlockDriverState *bs)
1183 {
1184     BDRVVPCState *s = bs->opaque;
1185     qemu_vfree(s->pagetable);
1186 #ifdef CACHE
1187     g_free(s->pageentry_u8);
1188 #endif
1189 
1190     migrate_del_blocker(s->migration_blocker);
1191     error_free(s->migration_blocker);
1192 }
1193 
1194 static QemuOptsList vpc_create_opts = {
1195     .name = "vpc-create-opts",
1196     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1197     .desc = {
1198         {
1199             .name = BLOCK_OPT_SIZE,
1200             .type = QEMU_OPT_SIZE,
1201             .help = "Virtual disk size"
1202         },
1203         {
1204             .name = BLOCK_OPT_SUBFMT,
1205             .type = QEMU_OPT_STRING,
1206             .help =
1207                 "Type of virtual hard disk format. Supported formats are "
1208                 "{dynamic (default) | fixed} "
1209         },
1210         {
1211             .name = VPC_OPT_FORCE_SIZE,
1212             .type = QEMU_OPT_BOOL,
1213             .help = "Force disk size calculation to use the actual size "
1214                     "specified, rather than using the nearest CHS-based "
1215                     "calculation"
1216         },
1217         { /* end of list */ }
1218     }
1219 };
1220 
1221 static const char *const vpc_strong_runtime_opts[] = {
1222     VPC_OPT_SIZE_CALC,
1223 
1224     NULL
1225 };
1226 
1227 static BlockDriver bdrv_vpc = {
1228     .format_name    = "vpc",
1229     .instance_size  = sizeof(BDRVVPCState),
1230 
1231     .bdrv_probe             = vpc_probe,
1232     .bdrv_open              = vpc_open,
1233     .bdrv_close             = vpc_close,
1234     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1235     .bdrv_child_perm        = bdrv_default_perms,
1236     .bdrv_co_create         = vpc_co_create,
1237     .bdrv_co_create_opts    = vpc_co_create_opts,
1238 
1239     .bdrv_co_preadv             = vpc_co_preadv,
1240     .bdrv_co_pwritev            = vpc_co_pwritev,
1241     .bdrv_co_block_status       = vpc_co_block_status,
1242 
1243     .bdrv_co_get_info       = vpc_co_get_info,
1244 
1245     .is_format              = true,
1246     .create_opts            = &vpc_create_opts,
1247     .bdrv_has_zero_init     = vpc_has_zero_init,
1248     .strong_runtime_opts    = vpc_strong_runtime_opts,
1249 };
1250 
1251 static void bdrv_vpc_init(void)
1252 {
1253     bdrv_register(&bdrv_vpc);
1254 }
1255 
1256 block_init(bdrv_vpc_init);
1257