xref: /openbmc/qemu/block/vpc.c (revision 89854803)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "migration/blocker.h"
33 #include "qemu/bswap.h"
34 #include "qemu/uuid.h"
35 #include "qapi/qmp/qdict.h"
36 #include "qapi/qobject-input-visitor.h"
37 #include "qapi/qapi-visit-block-core.h"
38 
39 /**************************************************************/
40 
41 #define HEADER_SIZE 512
42 
43 //#define CACHE
44 
/*
 * Disk type values. The driver compares the big-endian 'type' field of the
 * footer against VHD_FIXED to tell fixed images apart from the other kinds.
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};
50 
/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC). The footer's timestamp field is
 * documented as seconds since that instant.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder / head / sectors-per-track values used for the geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Number of sectors described by the largest CHS geometry */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* NOTE(review): presumably the image creation option name - confirm at use */
#define VPC_OPT_FORCE_SIZE "force_size"
62 
/*
 * On-disk VHD footer. All multi-byte fields are always big-endian and must
 * be converted with the be*_to_cpu() helpers before use. The structure is
 * packed so that it can be overlaid directly on the raw footer buffer.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    /* Image sizes in bytes; current_size is divided by the sector size */
    uint64_t    orig_size;
    uint64_t    current_size;

    /* Disk geometry; cyls * heads * secs_per_cyl gives the sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
98 
/*
 * On-disk header of a dynamic disk, found at the footer's data_offset.
 * Fields are read with be*_to_cpu() and written with cpu_to_be*(), i.e. they
 * are stored big-endian like the footer. Packed so it can be overlaid on a
 * raw buffer.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Computed with vpc_checksum() over the header when an image is created */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
130 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Held by the read, write and block-status paths of dynamic images */
    CoMutex lock;
    /* Raw (big-endian) footer as read from the image file */
    uint8_t footer_buf[HEADER_SIZE];
    /* File offset where the next block is allocated; the trailing footer
     * is rewritten at this offset after an allocation */
    uint64_t free_data_block_offset;
    /* Number of BAT entries, taken from the dynamic disk header */
    int max_table_entries;
    /* In-memory BAT in host byte order. Each entry is the file offset of a
     * block's bitmap in 512-byte units, or 0xFFFFFFFF if unallocated */
    uint32_t *pagetable;
    /* File offset of the BAT */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently filled by a write lookup */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block */
    uint32_t block_size;
    /* Size in bytes of the per-block bitmap, rounded up to 512 bytes */
    uint32_t bitmap_size;
    /* Runtime overrides for how the virtual disk size is determined */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Registered in vpc_open() to block live migration */
    Error *migration_blocker;
} BDRVVPCState;
155 
/* Name of the runtime option selecting the size calculation method */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime options accepted when opening an image. The only option is a
 * string that vpc_parse_options() maps to force_use_chs / force_use_sz.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
171 
172 static QemuOptsList vpc_create_opts;
173 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * 32-bit sum of all bytes in @buf.
 *
 * @buf:  bytes to checksum (not modified)
 * @size: number of bytes in @buf
 *
 * The caller is responsible for zeroing the checksum field inside the
 * buffer before calling this function.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t res = 0;
    size_t i;   /* same type as size: no signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
184 
185 
/*
 * Format probe: scores 100 when the buffer starts with the "conectix"
 * footer magic and 0 otherwise. @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char magic[] = "conectix";

    if (buf_size < 8) {
        return 0;
    }

    /* The magic contains no NUL byte, so a byte compare is equivalent */
    return memcmp(buf, magic, 8) == 0 ? 100 : 0;
}
192 
193 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
194                               Error **errp)
195 {
196     BDRVVPCState *s = bs->opaque;
197     const char *size_calc;
198 
199     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
200 
201     if (!size_calc) {
202        /* no override, use autodetect only */
203     } else if (!strcmp(size_calc, "current_size")) {
204         s->force_use_sz = true;
205     } else if (!strcmp(size_calc, "chs")) {
206         s->force_use_chs = true;
207     } else {
208         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
209     }
210 }
211 
212 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
213                     Error **errp)
214 {
215     BDRVVPCState *s = bs->opaque;
216     int i;
217     VHDFooter *footer;
218     VHDDynDiskHeader *dyndisk_header;
219     QemuOpts *opts = NULL;
220     Error *local_err = NULL;
221     bool use_chs;
222     uint8_t buf[HEADER_SIZE];
223     uint32_t checksum;
224     uint64_t computed_size;
225     uint64_t pagetable_size;
226     int disk_type = VHD_DYNAMIC;
227     int ret;
228     int64_t bs_size;
229 
230     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
231                                false, errp);
232     if (!bs->file) {
233         return -EINVAL;
234     }
235 
236     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
237     qemu_opts_absorb_qdict(opts, options, &local_err);
238     if (local_err) {
239         error_propagate(errp, local_err);
240         ret = -EINVAL;
241         goto fail;
242     }
243 
244     vpc_parse_options(bs, opts, &local_err);
245     if (local_err) {
246         error_propagate(errp, local_err);
247         ret = -EINVAL;
248         goto fail;
249     }
250 
251     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
252     if (ret < 0) {
253         error_setg(errp, "Unable to read VHD header");
254         goto fail;
255     }
256 
257     footer = (VHDFooter *) s->footer_buf;
258     if (strncmp(footer->creator, "conectix", 8)) {
259         int64_t offset = bdrv_getlength(bs->file->bs);
260         if (offset < 0) {
261             ret = offset;
262             error_setg(errp, "Invalid file size");
263             goto fail;
264         } else if (offset < HEADER_SIZE) {
265             ret = -EINVAL;
266             error_setg(errp, "File too small for a VHD header");
267             goto fail;
268         }
269 
270         /* If a fixed disk, the footer is found only at the end of the file */
271         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
272                          HEADER_SIZE);
273         if (ret < 0) {
274             goto fail;
275         }
276         if (strncmp(footer->creator, "conectix", 8)) {
277             error_setg(errp, "invalid VPC image");
278             ret = -EINVAL;
279             goto fail;
280         }
281         disk_type = VHD_FIXED;
282     }
283 
284     checksum = be32_to_cpu(footer->checksum);
285     footer->checksum = 0;
286     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
287         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
288             "incorrect.\n", bs->filename);
289 
290     /* Write 'checksum' back to footer, or else will leave it with zero. */
291     footer->checksum = cpu_to_be32(checksum);
292 
293     /* The visible size of a image in Virtual PC depends on the geometry
294        rather than on the size stored in the footer (the size in the footer
295        is too large usually) */
296     bs->total_sectors = (int64_t)
297         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
298 
299     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
300      * VHD image sizes differently.  VPC will rely on CHS geometry,
301      * while Hyper-V and disk2vhd use the size specified in the footer.
302      *
303      * We use a couple of approaches to try and determine the correct method:
304      * look at the Creator App field, and look for images that have CHS
305      * geometry that is the maximum value.
306      *
307      * If the CHS geometry is the maximum CHS geometry, then we assume that
308      * the size is the footer->current_size to avoid truncation.  Otherwise,
309      * we follow the table based on footer->creator_app:
310      *
311      *  Known creator apps:
312      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
313      *      'qemu'  :  CHS              QEMU (uses disk geometry)
314      *      'qem2'  :  current_size     QEMU (uses current_size)
315      *      'win '  :  current_size     Hyper-V
316      *      'd2v '  :  current_size     Disk2vhd
317      *      'tap\0' :  current_size     XenServer
318      *      'CTXS'  :  current_size     XenConverter
319      *
320      *  The user can override the table values via drive options, however
321      *  even with an override we will still use current_size for images
322      *  that have CHS geometry of the maximum size.
323      */
324     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
325                !!strncmp(footer->creator_app, "qem2", 4) &&
326                !!strncmp(footer->creator_app, "d2v ", 4) &&
327                !!strncmp(footer->creator_app, "CTXS", 4) &&
328                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
329 
330     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
331         bs->total_sectors = be64_to_cpu(footer->current_size) /
332                                         BDRV_SECTOR_SIZE;
333     }
334 
335     /* Allow a maximum disk size of 2040 GiB */
336     if (bs->total_sectors > VHD_MAX_SECTORS) {
337         ret = -EFBIG;
338         goto fail;
339     }
340 
341     if (disk_type == VHD_DYNAMIC) {
342         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
343                          HEADER_SIZE);
344         if (ret < 0) {
345             error_setg(errp, "Error reading dynamic VHD header");
346             goto fail;
347         }
348 
349         dyndisk_header = (VHDDynDiskHeader *) buf;
350 
351         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
352             error_setg(errp, "Invalid header magic");
353             ret = -EINVAL;
354             goto fail;
355         }
356 
357         s->block_size = be32_to_cpu(dyndisk_header->block_size);
358         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
359             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
360             ret = -EINVAL;
361             goto fail;
362         }
363         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
364 
365         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
366 
367         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
368             error_setg(errp, "Too many blocks");
369             ret = -EINVAL;
370             goto fail;
371         }
372 
373         computed_size = (uint64_t) s->max_table_entries * s->block_size;
374         if (computed_size < bs->total_sectors * 512) {
375             error_setg(errp, "Page table too small");
376             ret = -EINVAL;
377             goto fail;
378         }
379 
380         if (s->max_table_entries > SIZE_MAX / 4 ||
381             s->max_table_entries > (int) INT_MAX / 4) {
382             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
383                         s->max_table_entries);
384             ret = -EINVAL;
385             goto fail;
386         }
387 
388         pagetable_size = (uint64_t) s->max_table_entries * 4;
389 
390         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
391         if (s->pagetable == NULL) {
392             error_setg(errp, "Unable to allocate memory for page table");
393             ret = -ENOMEM;
394             goto fail;
395         }
396 
397         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
398 
399         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
400                          pagetable_size);
401         if (ret < 0) {
402             error_setg(errp, "Error reading pagetable");
403             goto fail;
404         }
405 
406         s->free_data_block_offset =
407             ROUND_UP(s->bat_offset + pagetable_size, 512);
408 
409         for (i = 0; i < s->max_table_entries; i++) {
410             be32_to_cpus(&s->pagetable[i]);
411             if (s->pagetable[i] != 0xFFFFFFFF) {
412                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
413                     s->bitmap_size + s->block_size;
414 
415                 if (next > s->free_data_block_offset) {
416                     s->free_data_block_offset = next;
417                 }
418             }
419         }
420 
421         bs_size = bdrv_getlength(bs->file->bs);
422         if (bs_size < 0) {
423             error_setg_errno(errp, -bs_size, "Unable to learn image size");
424             ret = bs_size;
425             goto fail;
426         }
427         if (s->free_data_block_offset > bs_size) {
428             error_setg(errp, "block-vpc: free_data_block_offset points after "
429                              "the end of file. The image has been truncated.");
430             ret = -EINVAL;
431             goto fail;
432         }
433 
434         s->last_bitmap_offset = (int64_t) -1;
435 
436 #ifdef CACHE
437         s->pageentry_u8 = g_malloc(512);
438         s->pageentry_u32 = s->pageentry_u8;
439         s->pageentry_u16 = s->pageentry_u8;
440         s->last_pagetable = -1;
441 #endif
442     }
443 
444     /* Disable migration when VHD images are used */
445     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
446                "does not support live migration",
447                bdrv_get_device_or_node_name(bs));
448     ret = migrate_add_blocker(s->migration_blocker, &local_err);
449     if (local_err) {
450         error_propagate(errp, local_err);
451         error_free(s->migration_blocker);
452         goto fail;
453     }
454 
455     qemu_co_mutex_init(&s->lock);
456 
457     return 0;
458 
459 fail:
460     qemu_vfree(s->pagetable);
461 #ifdef CACHE
462     g_free(s->pageentry_u8);
463 #endif
464     return ret;
465 }
466 
467 static int vpc_reopen_prepare(BDRVReopenState *state,
468                               BlockReopenQueue *queue, Error **errp)
469 {
470     return 0;
471 }
472 
473 /*
474  * Returns the absolute byte offset of the given sector in the image file.
475  * If the sector is not allocated, -1 is returned instead.
476  * If an error occurred trying to write an updated block bitmap back to
477  * the file, -2 is returned, and the error value is written to *err.
478  * This can only happen for a write operation.
479  *
480  * The parameter write must be 1 if the offset will be used for a write
481  * operation (the block bitmaps is updated then), 0 otherwise.
482  * If write is true then err must not be NULL.
483  */
484 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
485                                        bool write, int *err)
486 {
487     BDRVVPCState *s = bs->opaque;
488     uint64_t bitmap_offset, block_offset;
489     uint32_t pagetable_index, offset_in_block;
490 
491     assert(!(write && err == NULL));
492 
493     pagetable_index = offset / s->block_size;
494     offset_in_block = offset % s->block_size;
495 
496     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
497         return -1; /* not allocated */
498 
499     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
500     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
501 
502     /* We must ensure that we don't write to any sectors which are marked as
503        unused in the bitmap. We get away with setting all bits in the block
504        bitmap each time we write to a new block. This might cause Virtual PC to
505        miss sparse read optimization, but it's not a problem in terms of
506        correctness. */
507     if (write && (s->last_bitmap_offset != bitmap_offset)) {
508         uint8_t bitmap[s->bitmap_size];
509         int r;
510 
511         s->last_bitmap_offset = bitmap_offset;
512         memset(bitmap, 0xff, s->bitmap_size);
513         r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
514         if (r < 0) {
515             *err = r;
516             return -2;
517         }
518     }
519 
520     return block_offset;
521 }
522 
523 /*
524  * Writes the footer to the end of the image file. This is needed when the
525  * file grows as it overwrites the old footer
526  *
527  * Returns 0 on success and < 0 on error
528  */
529 static int rewrite_footer(BlockDriverState* bs)
530 {
531     int ret;
532     BDRVVPCState *s = bs->opaque;
533     int64_t offset = s->free_data_block_offset;
534 
535     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
536     if (ret < 0)
537         return ret;
538 
539     return 0;
540 }
541 
542 /*
543  * Allocates a new block. This involves writing a new footer and updating
544  * the Block Allocation Table to use the space at the old end of the image
545  * file (overwriting the old footer)
546  *
547  * Returns the sectors' offset in the image file on success and < 0 on error
548  */
549 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
550 {
551     BDRVVPCState *s = bs->opaque;
552     int64_t bat_offset;
553     uint32_t index, bat_value;
554     int ret;
555     uint8_t bitmap[s->bitmap_size];
556 
557     /* Check if sector_num is valid */
558     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
559         return -EINVAL;
560     }
561 
562     /* Write entry into in-memory BAT */
563     index = offset / s->block_size;
564     assert(s->pagetable[index] == 0xFFFFFFFF);
565     s->pagetable[index] = s->free_data_block_offset / 512;
566 
567     /* Initialize the block's bitmap */
568     memset(bitmap, 0xff, s->bitmap_size);
569     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
570         s->bitmap_size);
571     if (ret < 0) {
572         return ret;
573     }
574 
575     /* Write new footer (the old one will be overwritten) */
576     s->free_data_block_offset += s->block_size + s->bitmap_size;
577     ret = rewrite_footer(bs);
578     if (ret < 0)
579         goto fail;
580 
581     /* Write BAT entry to disk */
582     bat_offset = s->bat_offset + (4 * index);
583     bat_value = cpu_to_be32(s->pagetable[index]);
584     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
585     if (ret < 0)
586         goto fail;
587 
588     return get_image_offset(bs, offset, false, NULL);
589 
590 fail:
591     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
592     return ret;
593 }
594 
595 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
596 {
597     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
598     VHDFooter *footer = (VHDFooter *) s->footer_buf;
599 
600     if (be32_to_cpu(footer->type) != VHD_FIXED) {
601         bdi->cluster_size = s->block_size;
602     }
603 
604     bdi->unallocated_blocks_are_zero = true;
605     return 0;
606 }
607 
608 static int coroutine_fn
609 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
610               QEMUIOVector *qiov, int flags)
611 {
612     BDRVVPCState *s = bs->opaque;
613     int ret;
614     int64_t image_offset;
615     int64_t n_bytes;
616     int64_t bytes_done = 0;
617     VHDFooter *footer = (VHDFooter *) s->footer_buf;
618     QEMUIOVector local_qiov;
619 
620     if (be32_to_cpu(footer->type) == VHD_FIXED) {
621         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
622     }
623 
624     qemu_co_mutex_lock(&s->lock);
625     qemu_iovec_init(&local_qiov, qiov->niov);
626 
627     while (bytes > 0) {
628         image_offset = get_image_offset(bs, offset, false, NULL);
629         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
630 
631         if (image_offset == -1) {
632             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
633         } else {
634             qemu_iovec_reset(&local_qiov);
635             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
636 
637             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
638                                  &local_qiov, 0);
639             if (ret < 0) {
640                 goto fail;
641             }
642         }
643 
644         bytes -= n_bytes;
645         offset += n_bytes;
646         bytes_done += n_bytes;
647     }
648 
649     ret = 0;
650 fail:
651     qemu_iovec_destroy(&local_qiov);
652     qemu_co_mutex_unlock(&s->lock);
653 
654     return ret;
655 }
656 
657 static int coroutine_fn
658 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
659                QEMUIOVector *qiov, int flags)
660 {
661     BDRVVPCState *s = bs->opaque;
662     int64_t image_offset;
663     int64_t n_bytes;
664     int64_t bytes_done = 0;
665     int ret = 0;
666     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
667     QEMUIOVector local_qiov;
668 
669     if (be32_to_cpu(footer->type) == VHD_FIXED) {
670         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
671     }
672 
673     qemu_co_mutex_lock(&s->lock);
674     qemu_iovec_init(&local_qiov, qiov->niov);
675 
676     while (bytes > 0) {
677         image_offset = get_image_offset(bs, offset, true, &ret);
678         if (image_offset == -2) {
679             /* Failed to write block bitmap: can't proceed with write */
680             goto fail;
681         }
682         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
683 
684         if (image_offset == -1) {
685             image_offset = alloc_block(bs, offset);
686             if (image_offset < 0) {
687                 ret = image_offset;
688                 goto fail;
689             }
690         }
691 
692         qemu_iovec_reset(&local_qiov);
693         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
694 
695         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
696                               &local_qiov, 0);
697         if (ret < 0) {
698             goto fail;
699         }
700 
701         bytes -= n_bytes;
702         offset += n_bytes;
703         bytes_done += n_bytes;
704     }
705 
706     ret = 0;
707 fail:
708     qemu_iovec_destroy(&local_qiov);
709     qemu_co_mutex_unlock(&s->lock);
710 
711     return ret;
712 }
713 
714 static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
715                                             bool want_zero,
716                                             int64_t offset, int64_t bytes,
717                                             int64_t *pnum, int64_t *map,
718                                             BlockDriverState **file)
719 {
720     BDRVVPCState *s = bs->opaque;
721     VHDFooter *footer = (VHDFooter*) s->footer_buf;
722     int64_t image_offset;
723     bool allocated;
724     int ret;
725     int64_t n;
726 
727     if (be32_to_cpu(footer->type) == VHD_FIXED) {
728         *pnum = bytes;
729         *map = offset;
730         *file = bs->file->bs;
731         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
732     }
733 
734     qemu_co_mutex_lock(&s->lock);
735 
736     image_offset = get_image_offset(bs, offset, false, NULL);
737     allocated = (image_offset != -1);
738     *pnum = 0;
739     ret = 0;
740 
741     do {
742         /* All sectors in a block are contiguous (without using the bitmap) */
743         n = ROUND_UP(offset + 1, s->block_size) - offset;
744         n = MIN(n, bytes);
745 
746         *pnum += n;
747         offset += n;
748         bytes -= n;
749         /* *pnum can't be greater than one block for allocated
750          * sectors since there is always a bitmap in between. */
751         if (allocated) {
752             *file = bs->file->bs;
753             *map = image_offset;
754             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
755             break;
756         }
757         if (bytes == 0) {
758             break;
759         }
760         image_offset = get_image_offset(bs, offset, false, NULL);
761     } while (image_offset == -1);
762 
763     qemu_co_mutex_unlock(&s->lock);
764     return ret;
765 }
766 
767 /*
768  * Calculates the number of cylinders, heads and sectors per cylinder
769  * based on a given number of sectors. This is the algorithm described
770  * in the VHD specification.
771  *
772  * Note that the geometry doesn't always exactly match total_sectors but
773  * may round it down.
774  *
775  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
776  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
777  * and instead allow up to 255 heads.
778  */
779 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
780     uint8_t* heads, uint8_t* secs_per_cyl)
781 {
782     uint32_t cyls_times_heads;
783 
784     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
785 
786     if (total_sectors >= 65535LL * 16 * 63) {
787         *secs_per_cyl = 255;
788         *heads = 16;
789         cyls_times_heads = total_sectors / *secs_per_cyl;
790     } else {
791         *secs_per_cyl = 17;
792         cyls_times_heads = total_sectors / *secs_per_cyl;
793         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
794 
795         if (*heads < 4) {
796             *heads = 4;
797         }
798 
799         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
800             *secs_per_cyl = 31;
801             *heads = 16;
802             cyls_times_heads = total_sectors / *secs_per_cyl;
803         }
804 
805         if (cyls_times_heads >= (*heads * 1024)) {
806             *secs_per_cyl = 63;
807             *heads = 16;
808             cyls_times_heads = total_sectors / *secs_per_cyl;
809         }
810     }
811 
812     *cyls = cyls_times_heads / *heads;
813 
814     return 0;
815 }
816 
817 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
818                                int64_t total_sectors)
819 {
820     VHDDynDiskHeader *dyndisk_header =
821         (VHDDynDiskHeader *) buf;
822     size_t block_size, num_bat_entries;
823     int i;
824     int ret;
825     int64_t offset = 0;
826 
827     /* Write the footer (twice: at the beginning and at the end) */
828     block_size = 0x200000;
829     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
830 
831     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
832     if (ret < 0) {
833         goto fail;
834     }
835 
836     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
837     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
838     if (ret < 0) {
839         goto fail;
840     }
841 
842     /* Write the initial BAT */
843     offset = 3 * 512;
844 
845     memset(buf, 0xFF, 512);
846     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
847         ret = blk_pwrite(blk, offset, buf, 512, 0);
848         if (ret < 0) {
849             goto fail;
850         }
851         offset += 512;
852     }
853 
854     /* Prepare the Dynamic Disk Header */
855     memset(buf, 0, 1024);
856 
857     memcpy(dyndisk_header->magic, "cxsparse", 8);
858 
859     /*
860      * Note: The spec is actually wrong here for data_offset, it says
861      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
862      */
863     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
864     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
865     dyndisk_header->version = cpu_to_be32(0x00010000);
866     dyndisk_header->block_size = cpu_to_be32(block_size);
867     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
868 
869     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
870 
871     /* Write the header */
872     offset = 512;
873 
874     ret = blk_pwrite(blk, offset, buf, 1024, 0);
875     if (ret < 0) {
876         goto fail;
877     }
878 
879  fail:
880     return ret;
881 }
882 
883 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
884                              int64_t total_size, Error **errp)
885 {
886     int ret;
887 
888     /* Add footer to total size */
889     total_size += HEADER_SIZE;
890 
891     ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
892     if (ret < 0) {
893         return ret;
894     }
895 
896     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
897     if (ret < 0) {
898         error_setg_errno(errp, -ret, "Unable to write VHD header");
899         return ret;
900     }
901 
902     return ret;
903 }
904 
905 static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
906                                         uint16_t *out_cyls,
907                                         uint8_t *out_heads,
908                                         uint8_t *out_secs_per_cyl,
909                                         int64_t *out_total_sectors,
910                                         Error **errp)
911 {
912     int64_t total_size = vpc_opts->size;
913     uint16_t cyls = 0;
914     uint8_t heads = 0;
915     uint8_t secs_per_cyl = 0;
916     int64_t total_sectors;
917     int i;
918 
919     /*
920      * Calculate matching total_size and geometry. Increase the number of
921      * sectors requested until we get enough (or fail). This ensures that
922      * qemu-img convert doesn't truncate images, but rather rounds up.
923      *
924      * If the image size can't be represented by a spec conformant CHS geometry,
925      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
926      * the image size from the VHD footer to calculate total_sectors.
927      */
928     if (vpc_opts->force_size) {
929         /* This will force the use of total_size for sector count, below */
930         cyls         = VHD_CHS_MAX_C;
931         heads        = VHD_CHS_MAX_H;
932         secs_per_cyl = VHD_CHS_MAX_S;
933     } else {
934         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
935         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
936             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
937         }
938     }
939 
940     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
941         total_sectors = total_size / BDRV_SECTOR_SIZE;
942         /* Allow a maximum disk size of 2040 GiB */
943         if (total_sectors > VHD_MAX_SECTORS) {
944             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
945             return -EFBIG;
946         }
947     } else {
948         total_sectors = (int64_t) cyls * heads * secs_per_cyl;
949     }
950 
951     *out_total_sectors = total_sectors;
952     if (out_cyls) {
953         *out_cyls = cyls;
954         *out_heads = heads;
955         *out_secs_per_cyl = secs_per_cyl;
956     }
957 
958     return 0;
959 }
960 
961 static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
962                                       Error **errp)
963 {
964     BlockdevCreateOptionsVpc *vpc_opts;
965     BlockBackend *blk = NULL;
966     BlockDriverState *bs = NULL;
967 
968     uint8_t buf[1024];
969     VHDFooter *footer = (VHDFooter *) buf;
970     uint16_t cyls = 0;
971     uint8_t heads = 0;
972     uint8_t secs_per_cyl = 0;
973     int64_t total_sectors;
974     int64_t total_size;
975     int disk_type;
976     int ret = -EIO;
977 
978     assert(opts->driver == BLOCKDEV_DRIVER_VPC);
979     vpc_opts = &opts->u.vpc;
980 
981     /* Validate options and set default values */
982     total_size = vpc_opts->size;
983 
984     if (!vpc_opts->has_subformat) {
985         vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
986     }
987     switch (vpc_opts->subformat) {
988     case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
989         disk_type = VHD_DYNAMIC;
990         break;
991     case BLOCKDEV_VPC_SUBFORMAT_FIXED:
992         disk_type = VHD_FIXED;
993         break;
994     default:
995         g_assert_not_reached();
996     }
997 
998     /* Create BlockBackend to write to the image */
999     bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1000     if (bs == NULL) {
1001         return -EIO;
1002     }
1003 
1004     blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
1005     ret = blk_insert_bs(blk, bs, errp);
1006     if (ret < 0) {
1007         goto out;
1008     }
1009     blk_set_allow_write_beyond_eof(blk, true);
1010 
1011     /* Get geometry and check that it matches the image size*/
1012     ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1013                                        &total_sectors, errp);
1014     if (ret < 0) {
1015         goto out;
1016     }
1017 
1018     if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1019         error_setg(errp, "The requested image size cannot be represented in "
1020                          "CHS geometry");
1021         error_append_hint(errp, "Try size=%llu or force-size=on (the "
1022                                 "latter makes the image incompatible with "
1023                                 "Virtual PC)",
1024                           total_sectors * BDRV_SECTOR_SIZE);
1025         ret = -EINVAL;
1026         goto out;
1027     }
1028 
1029     /* Prepare the Hard Disk Footer */
1030     memset(buf, 0, 1024);
1031 
1032     memcpy(footer->creator, "conectix", 8);
1033     if (vpc_opts->force_size) {
1034         memcpy(footer->creator_app, "qem2", 4);
1035     } else {
1036         memcpy(footer->creator_app, "qemu", 4);
1037     }
1038     memcpy(footer->creator_os, "Wi2k", 4);
1039 
1040     footer->features = cpu_to_be32(0x02);
1041     footer->version = cpu_to_be32(0x00010000);
1042     if (disk_type == VHD_DYNAMIC) {
1043         footer->data_offset = cpu_to_be64(HEADER_SIZE);
1044     } else {
1045         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1046     }
1047     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1048 
1049     /* Version of Virtual PC 2007 */
1050     footer->major = cpu_to_be16(0x0005);
1051     footer->minor = cpu_to_be16(0x0003);
1052     footer->orig_size = cpu_to_be64(total_size);
1053     footer->current_size = cpu_to_be64(total_size);
1054     footer->cyls = cpu_to_be16(cyls);
1055     footer->heads = heads;
1056     footer->secs_per_cyl = secs_per_cyl;
1057 
1058     footer->type = cpu_to_be32(disk_type);
1059 
1060     qemu_uuid_generate(&footer->uuid);
1061 
1062     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
1063 
1064     if (disk_type == VHD_DYNAMIC) {
1065         ret = create_dynamic_disk(blk, buf, total_sectors);
1066         if (ret < 0) {
1067             error_setg(errp, "Unable to create or write VHD header");
1068         }
1069     } else {
1070         ret = create_fixed_disk(blk, buf, total_size, errp);
1071     }
1072 
1073 out:
1074     blk_unref(blk);
1075     bdrv_unref(bs);
1076     return ret;
1077 }
1078 
1079 static int coroutine_fn vpc_co_create_opts(const char *filename,
1080                                            QemuOpts *opts, Error **errp)
1081 {
1082     BlockdevCreateOptions *create_options = NULL;
1083     QDict *qdict = NULL;
1084     QObject *qobj;
1085     Visitor *v;
1086     BlockDriverState *bs = NULL;
1087     Error *local_err = NULL;
1088     int ret;
1089 
1090     static const QDictRenames opt_renames[] = {
1091         { VPC_OPT_FORCE_SIZE,           "force-size" },
1092         { NULL, NULL },
1093     };
1094 
1095     /* Parse options and convert legacy syntax */
1096     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1097 
1098     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1099         ret = -EINVAL;
1100         goto fail;
1101     }
1102 
1103     /* Create and open the file (protocol layer) */
1104     ret = bdrv_create_file(filename, opts, &local_err);
1105     if (ret < 0) {
1106         error_propagate(errp, local_err);
1107         goto fail;
1108     }
1109 
1110     bs = bdrv_open(filename, NULL, NULL,
1111                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1112     if (bs == NULL) {
1113         ret = -EIO;
1114         goto fail;
1115     }
1116 
1117     /* Now get the QAPI type BlockdevCreateOptions */
1118     qdict_put_str(qdict, "driver", "vpc");
1119     qdict_put_str(qdict, "file", bs->node_name);
1120 
1121     qobj = qdict_crumple(qdict, errp);
1122     qobject_unref(qdict);
1123     qdict = qobject_to(QDict, qobj);
1124     if (qdict == NULL) {
1125         ret = -EINVAL;
1126         goto fail;
1127     }
1128 
1129     v = qobject_input_visitor_new_keyval(QOBJECT(qdict));
1130     visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
1131     visit_free(v);
1132 
1133     if (local_err) {
1134         error_propagate(errp, local_err);
1135         ret = -EINVAL;
1136         goto fail;
1137     }
1138 
1139     /* Silently round up size */
1140     assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1141     create_options->u.vpc.size =
1142         ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1143 
1144     if (!create_options->u.vpc.force_size) {
1145         int64_t total_sectors;
1146         ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1147                                            NULL, &total_sectors, errp);
1148         if (ret < 0) {
1149             goto fail;
1150         }
1151 
1152         create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1153     }
1154 
1155 
1156     /* Create the vpc image (format layer) */
1157     ret = vpc_co_create(create_options, errp);
1158 
1159 fail:
1160     qobject_unref(qdict);
1161     bdrv_unref(bs);
1162     qapi_free_BlockdevCreateOptions(create_options);
1163     return ret;
1164 }
1165 
1166 
1167 static int vpc_has_zero_init(BlockDriverState *bs)
1168 {
1169     BDRVVPCState *s = bs->opaque;
1170     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1171 
1172     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1173         return bdrv_has_zero_init(bs->file->bs);
1174     } else {
1175         return 1;
1176     }
1177 }
1178 
1179 static void vpc_close(BlockDriverState *bs)
1180 {
1181     BDRVVPCState *s = bs->opaque;
1182     qemu_vfree(s->pagetable);
1183 #ifdef CACHE
1184     g_free(s->pageentry_u8);
1185 #endif
1186 
1187     migrate_del_blocker(s->migration_blocker);
1188     error_free(s->migration_blocker);
1189 }
1190 
1191 static QemuOptsList vpc_create_opts = {
1192     .name = "vpc-create-opts",
1193     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1194     .desc = {
1195         {
1196             .name = BLOCK_OPT_SIZE,
1197             .type = QEMU_OPT_SIZE,
1198             .help = "Virtual disk size"
1199         },
1200         {
1201             .name = BLOCK_OPT_SUBFMT,
1202             .type = QEMU_OPT_STRING,
1203             .help =
1204                 "Type of virtual hard disk format. Supported formats are "
1205                 "{dynamic (default) | fixed} "
1206         },
1207         {
1208             .name = VPC_OPT_FORCE_SIZE,
1209             .type = QEMU_OPT_BOOL,
1210             .help = "Force disk size calculation to use the actual size "
1211                     "specified, rather than using the nearest CHS-based "
1212                     "calculation"
1213         },
1214         { /* end of list */ }
1215     }
1216 };
1217 
1218 static BlockDriver bdrv_vpc = {
1219     .format_name    = "vpc",
1220     .instance_size  = sizeof(BDRVVPCState),
1221 
1222     .bdrv_probe             = vpc_probe,
1223     .bdrv_open              = vpc_open,
1224     .bdrv_close             = vpc_close,
1225     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1226     .bdrv_child_perm        = bdrv_format_default_perms,
1227     .bdrv_co_create         = vpc_co_create,
1228     .bdrv_co_create_opts    = vpc_co_create_opts,
1229 
1230     .bdrv_co_preadv             = vpc_co_preadv,
1231     .bdrv_co_pwritev            = vpc_co_pwritev,
1232     .bdrv_co_block_status       = vpc_co_block_status,
1233 
1234     .bdrv_get_info          = vpc_get_info,
1235 
1236     .create_opts            = &vpc_create_opts,
1237     .bdrv_has_zero_init     = vpc_has_zero_init,
1238 };
1239 
/* Register the vpc driver description with the block layer. */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/* Hook the registration function into the block module init mechanism. */
block_init(bdrv_vpc_init);
1246