xref: /openbmc/qemu/block/vpc.c (revision cfc58cf37362a931990efc75f3f580dfec49ac1e)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/migration.h"
32 #include "qemu/bswap.h"
33 #if defined(CONFIG_UUID)
34 #include <uuid/uuid.h>
35 #endif
36 
37 /**************************************************************/
38 
39 #define HEADER_SIZE 512
40 
41 //#define CACHE
42 
/* Disk type codes; compared against the (big-endian) footer "type" field */
enum vhd_type {
    VHD_FIXED        = 2,   /* footer is found only at the end of the file */
    VHD_DYNAMIC      = 3,   /* footer copy at offset 0, followed by header/BAT */
    VHD_DIFFERENCING = 4,
};
48 
/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC). VHD timestamps count seconds from
 * that instant, so this value is subtracted from time() when a footer is
 * created.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Upper bounds of the cylinder / head / sector fields of the CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

/* Largest supported image, in sectors (2040 GiB) */
#define VHD_MAX_SECTORS       0xff000000
/* Sector count described by the largest possible CHS geometry */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the image creation option that forces use of the footer size */
#define VPC_OPT_FORCE_SIZE "force_size"
61 /* always big-endian */
62 typedef struct vhd_footer {
63     char        creator[8]; /* "conectix" */
64     uint32_t    features;
65     uint32_t    version;
66 
67     /* Offset of next header structure, 0xFFFFFFFF if none */
68     uint64_t    data_offset;
69 
70     /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
71     uint32_t    timestamp;
72 
73     char        creator_app[4]; /*  e.g., "vpc " */
74     uint16_t    major;
75     uint16_t    minor;
76     char        creator_os[4]; /* "Wi2k" */
77 
78     uint64_t    orig_size;
79     uint64_t    current_size;
80 
81     uint16_t    cyls;
82     uint8_t     heads;
83     uint8_t     secs_per_cyl;
84 
85     uint32_t    type;
86 
87     /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
88        the bytes in the footer without the checksum field") */
89     uint32_t    checksum;
90 
91     /* UUID used to identify a parent hard disk (backing file) */
92     uint8_t     uuid[16];
93 
94     uint8_t     in_saved_state;
95 } QEMU_PACKED VHDFooter;
96 
97 typedef struct vhd_dyndisk_header {
98     char        magic[8]; /* "cxsparse" */
99 
100     /* Offset of next header structure, 0xFFFFFFFF if none */
101     uint64_t    data_offset;
102 
103     /* Offset of the Block Allocation Table (BAT) */
104     uint64_t    table_offset;
105 
106     uint32_t    version;
107     uint32_t    max_table_entries; /* 32bit/entry */
108 
109     /* 2 MB by default, must be a power of two */
110     uint32_t    block_size;
111 
112     uint32_t    checksum;
113     uint8_t     parent_uuid[16];
114     uint32_t    parent_timestamp;
115     uint32_t    reserved;
116 
117     /* Backing file name (in UTF-16) */
118     uint8_t     parent_name[512];
119 
120     struct {
121         uint32_t    platform;
122         uint32_t    data_space;
123         uint32_t    data_length;
124         uint32_t    reserved;
125         uint64_t    data_offset;
126     } parent_locator[8];
127 } QEMU_PACKED VHDDynDiskHeader;
128 
129 typedef struct BDRVVPCState {
130     CoMutex lock;
131     uint8_t footer_buf[HEADER_SIZE];
132     uint64_t free_data_block_offset;
133     int max_table_entries;
134     uint32_t *pagetable;
135     uint64_t bat_offset;
136     uint64_t last_bitmap_offset;
137 
138     uint32_t block_size;
139     uint32_t bitmap_size;
140     bool force_use_chs;
141     bool force_use_sz;
142 
143 #ifdef CACHE
144     uint8_t *pageentry_u8;
145     uint32_t *pageentry_u32;
146     uint16_t *pageentry_u16;
147 
148     uint64_t last_bitmap;
149 #endif
150 
151     Error *migration_blocker;
152 } BDRVVPCState;
153 
154 #define VPC_OPT_SIZE_CALC "force_size_calc"
155 static QemuOptsList vpc_runtime_opts = {
156     .name = "vpc-runtime-opts",
157     .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
158     .desc = {
159         {
160             .name = VPC_OPT_SIZE_CALC,
161             .type = QEMU_OPT_STRING,
162             .help = "Force disk size calculation to use either CHS geometry, "
163                     "or use the disk current_size specified in the VHD footer. "
164                     "{chs, current_size}"
165         },
166         { /* end of list */ }
167     }
168 };
169 
/*
 * Computes the checksum used by VHD structures: the one's complement of the
 * 32-bit sum of all bytes in the buffer.
 *
 * @buf:  bytes to sum; the caller must have zeroed the checksum field itself
 * @size: number of bytes in @buf
 *
 * Returns the checksum in host byte order (0xFFFFFFFF for an empty buffer).
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t sum = 0;
    size_t i;   /* same type as @size: no signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        sum += buf[i];
    }

    return ~sum;
}
180 
181 
/*
 * Format probe: scores 100 when the buffer starts with the 8-byte footer
 * magic "conectix" and 0 otherwise. @filename is not looked at.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char magic[] = "conectix";

    if (buf_size < 8) {
        return 0;
    }

    return memcmp(buf, magic, 8) == 0 ? 100 : 0;
}
188 
189 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
190                               Error **errp)
191 {
192     BDRVVPCState *s = bs->opaque;
193     const char *size_calc;
194 
195     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
196 
197     if (!size_calc) {
198        /* no override, use autodetect only */
199     } else if (!strcmp(size_calc, "current_size")) {
200         s->force_use_sz = true;
201     } else if (!strcmp(size_calc, "chs")) {
202         s->force_use_chs = true;
203     } else {
204         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
205     }
206 }
207 
208 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
209                     Error **errp)
210 {
211     BDRVVPCState *s = bs->opaque;
212     int i;
213     VHDFooter *footer;
214     VHDDynDiskHeader *dyndisk_header;
215     QemuOpts *opts = NULL;
216     Error *local_err = NULL;
217     bool use_chs;
218     uint8_t buf[HEADER_SIZE];
219     uint32_t checksum;
220     uint64_t computed_size;
221     uint64_t pagetable_size;
222     int disk_type = VHD_DYNAMIC;
223     int ret;
224 
225     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
226     qemu_opts_absorb_qdict(opts, options, &local_err);
227     if (local_err) {
228         error_propagate(errp, local_err);
229         ret = -EINVAL;
230         goto fail;
231     }
232 
233     vpc_parse_options(bs, opts, &local_err);
234     if (local_err) {
235         error_propagate(errp, local_err);
236         ret = -EINVAL;
237         goto fail;
238     }
239 
240     ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE);
241     if (ret < 0) {
242         error_setg(errp, "Unable to read VHD header");
243         goto fail;
244     }
245 
246     footer = (VHDFooter *) s->footer_buf;
247     if (strncmp(footer->creator, "conectix", 8)) {
248         int64_t offset = bdrv_getlength(bs->file->bs);
249         if (offset < 0) {
250             ret = offset;
251             error_setg(errp, "Invalid file size");
252             goto fail;
253         } else if (offset < HEADER_SIZE) {
254             ret = -EINVAL;
255             error_setg(errp, "File too small for a VHD header");
256             goto fail;
257         }
258 
259         /* If a fixed disk, the footer is found only at the end of the file */
260         ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf,
261                          HEADER_SIZE);
262         if (ret < 0) {
263             goto fail;
264         }
265         if (strncmp(footer->creator, "conectix", 8)) {
266             error_setg(errp, "invalid VPC image");
267             ret = -EINVAL;
268             goto fail;
269         }
270         disk_type = VHD_FIXED;
271     }
272 
273     checksum = be32_to_cpu(footer->checksum);
274     footer->checksum = 0;
275     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
276         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
277             "incorrect.\n", bs->filename);
278 
279     /* Write 'checksum' back to footer, or else will leave it with zero. */
280     footer->checksum = cpu_to_be32(checksum);
281 
282     /* The visible size of a image in Virtual PC depends on the geometry
283        rather than on the size stored in the footer (the size in the footer
284        is too large usually) */
285     bs->total_sectors = (int64_t)
286         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
287 
288     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
289      * VHD image sizes differently.  VPC will rely on CHS geometry,
290      * while Hyper-V and disk2vhd use the size specified in the footer.
291      *
292      * We use a couple of approaches to try and determine the correct method:
293      * look at the Creator App field, and look for images that have CHS
294      * geometry that is the maximum value.
295      *
296      * If the CHS geometry is the maximum CHS geometry, then we assume that
297      * the size is the footer->current_size to avoid truncation.  Otherwise,
298      * we follow the table based on footer->creator_app:
299      *
300      *  Known creator apps:
301      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
302      *      'qemu'  :  CHS              QEMU (uses disk geometry)
303      *      'qem2'  :  current_size     QEMU (uses current_size)
304      *      'win '  :  current_size     Hyper-V
305      *      'd2v '  :  current_size     Disk2vhd
306      *      'tap\0' :  current_size     XenServer
307      *      'CTXS'  :  current_size     XenConverter
308      *
309      *  The user can override the table values via drive options, however
310      *  even with an override we will still use current_size for images
311      *  that have CHS geometry of the maximum size.
312      */
313     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
314                !!strncmp(footer->creator_app, "qem2", 4) &&
315                !!strncmp(footer->creator_app, "d2v ", 4) &&
316                !!strncmp(footer->creator_app, "CTXS", 4) &&
317                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
318 
319     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
320         bs->total_sectors = be64_to_cpu(footer->current_size) /
321                                         BDRV_SECTOR_SIZE;
322     }
323 
324     /* Allow a maximum disk size of 2040 GiB */
325     if (bs->total_sectors > VHD_MAX_SECTORS) {
326         ret = -EFBIG;
327         goto fail;
328     }
329 
330     if (disk_type == VHD_DYNAMIC) {
331         ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf,
332                          HEADER_SIZE);
333         if (ret < 0) {
334             error_setg(errp, "Error reading dynamic VHD header");
335             goto fail;
336         }
337 
338         dyndisk_header = (VHDDynDiskHeader *) buf;
339 
340         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
341             error_setg(errp, "Invalid header magic");
342             ret = -EINVAL;
343             goto fail;
344         }
345 
346         s->block_size = be32_to_cpu(dyndisk_header->block_size);
347         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
348             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
349             ret = -EINVAL;
350             goto fail;
351         }
352         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
353 
354         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
355 
356         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
357             error_setg(errp, "Too many blocks");
358             ret = -EINVAL;
359             goto fail;
360         }
361 
362         computed_size = (uint64_t) s->max_table_entries * s->block_size;
363         if (computed_size < bs->total_sectors * 512) {
364             error_setg(errp, "Page table too small");
365             ret = -EINVAL;
366             goto fail;
367         }
368 
369         if (s->max_table_entries > SIZE_MAX / 4 ||
370             s->max_table_entries > (int) INT_MAX / 4) {
371             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
372                         s->max_table_entries);
373             ret = -EINVAL;
374             goto fail;
375         }
376 
377         pagetable_size = (uint64_t) s->max_table_entries * 4;
378 
379         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
380         if (s->pagetable == NULL) {
381             error_setg(errp, "Unable to allocate memory for page table");
382             ret = -ENOMEM;
383             goto fail;
384         }
385 
386         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
387 
388         ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable,
389                          pagetable_size);
390         if (ret < 0) {
391             error_setg(errp, "Error reading pagetable");
392             goto fail;
393         }
394 
395         s->free_data_block_offset =
396             ROUND_UP(s->bat_offset + pagetable_size, 512);
397 
398         for (i = 0; i < s->max_table_entries; i++) {
399             be32_to_cpus(&s->pagetable[i]);
400             if (s->pagetable[i] != 0xFFFFFFFF) {
401                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
402                     s->bitmap_size + s->block_size;
403 
404                 if (next > s->free_data_block_offset) {
405                     s->free_data_block_offset = next;
406                 }
407             }
408         }
409 
410         if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) {
411             error_setg(errp, "block-vpc: free_data_block_offset points after "
412                              "the end of file. The image has been truncated.");
413             ret = -EINVAL;
414             goto fail;
415         }
416 
417         s->last_bitmap_offset = (int64_t) -1;
418 
419 #ifdef CACHE
420         s->pageentry_u8 = g_malloc(512);
421         s->pageentry_u32 = s->pageentry_u8;
422         s->pageentry_u16 = s->pageentry_u8;
423         s->last_pagetable = -1;
424 #endif
425     }
426 
427     qemu_co_mutex_init(&s->lock);
428 
429     /* Disable migration when VHD images are used */
430     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
431                "does not support live migration",
432                bdrv_get_device_or_node_name(bs));
433     migrate_add_blocker(s->migration_blocker);
434 
435     return 0;
436 
437 fail:
438     qemu_vfree(s->pagetable);
439 #ifdef CACHE
440     g_free(s->pageentry_u8);
441 #endif
442     return ret;
443 }
444 
445 static int vpc_reopen_prepare(BDRVReopenState *state,
446                               BlockReopenQueue *queue, Error **errp)
447 {
448     return 0;
449 }
450 
451 /*
452  * Returns the absolute byte offset of the given sector in the image file.
453  * If the sector is not allocated, -1 is returned instead.
454  *
455  * The parameter write must be 1 if the offset will be used for a write
456  * operation (the block bitmaps is updated then), 0 otherwise.
457  */
458 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
459                                        bool write)
460 {
461     BDRVVPCState *s = bs->opaque;
462     uint64_t bitmap_offset, block_offset;
463     uint32_t pagetable_index, offset_in_block;
464 
465     pagetable_index = offset / s->block_size;
466     offset_in_block = offset % s->block_size;
467 
468     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
469         return -1; /* not allocated */
470 
471     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
472     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
473 
474     /* We must ensure that we don't write to any sectors which are marked as
475        unused in the bitmap. We get away with setting all bits in the block
476        bitmap each time we write to a new block. This might cause Virtual PC to
477        miss sparse read optimization, but it's not a problem in terms of
478        correctness. */
479     if (write && (s->last_bitmap_offset != bitmap_offset)) {
480         uint8_t bitmap[s->bitmap_size];
481 
482         s->last_bitmap_offset = bitmap_offset;
483         memset(bitmap, 0xff, s->bitmap_size);
484         bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size);
485     }
486 
487     return block_offset;
488 }
489 
490 static inline int64_t get_sector_offset(BlockDriverState *bs,
491                                         int64_t sector_num, bool write)
492 {
493     return get_image_offset(bs, sector_num * BDRV_SECTOR_SIZE, write);
494 }
495 
496 /*
497  * Writes the footer to the end of the image file. This is needed when the
498  * file grows as it overwrites the old footer
499  *
500  * Returns 0 on success and < 0 on error
501  */
502 static int rewrite_footer(BlockDriverState* bs)
503 {
504     int ret;
505     BDRVVPCState *s = bs->opaque;
506     int64_t offset = s->free_data_block_offset;
507 
508     ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE);
509     if (ret < 0)
510         return ret;
511 
512     return 0;
513 }
514 
515 /*
516  * Allocates a new block. This involves writing a new footer and updating
517  * the Block Allocation Table to use the space at the old end of the image
518  * file (overwriting the old footer)
519  *
520  * Returns the sectors' offset in the image file on success and < 0 on error
521  */
522 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
523 {
524     BDRVVPCState *s = bs->opaque;
525     int64_t bat_offset;
526     uint32_t index, bat_value;
527     int ret;
528     uint8_t bitmap[s->bitmap_size];
529 
530     /* Check if sector_num is valid */
531     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
532         return -EINVAL;
533     }
534 
535     /* Write entry into in-memory BAT */
536     index = offset / s->block_size;
537     assert(s->pagetable[index] == 0xFFFFFFFF);
538     s->pagetable[index] = s->free_data_block_offset / 512;
539 
540     /* Initialize the block's bitmap */
541     memset(bitmap, 0xff, s->bitmap_size);
542     ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap,
543         s->bitmap_size);
544     if (ret < 0) {
545         return ret;
546     }
547 
548     /* Write new footer (the old one will be overwritten) */
549     s->free_data_block_offset += s->block_size + s->bitmap_size;
550     ret = rewrite_footer(bs);
551     if (ret < 0)
552         goto fail;
553 
554     /* Write BAT entry to disk */
555     bat_offset = s->bat_offset + (4 * index);
556     bat_value = cpu_to_be32(s->pagetable[index]);
557     ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4);
558     if (ret < 0)
559         goto fail;
560 
561     return get_image_offset(bs, offset, false);
562 
563 fail:
564     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
565     return ret;
566 }
567 
568 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
569 {
570     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
571     VHDFooter *footer = (VHDFooter *) s->footer_buf;
572 
573     if (be32_to_cpu(footer->type) != VHD_FIXED) {
574         bdi->cluster_size = s->block_size;
575     }
576 
577     bdi->unallocated_blocks_are_zero = true;
578     return 0;
579 }
580 
581 static int coroutine_fn
582 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
583               QEMUIOVector *qiov, int flags)
584 {
585     BDRVVPCState *s = bs->opaque;
586     int ret;
587     int64_t image_offset;
588     int64_t n_bytes;
589     int64_t bytes_done = 0;
590     VHDFooter *footer = (VHDFooter *) s->footer_buf;
591     QEMUIOVector local_qiov;
592 
593     if (be32_to_cpu(footer->type) == VHD_FIXED) {
594         return bdrv_co_preadv(bs->file->bs, offset, bytes, qiov, 0);
595     }
596 
597     qemu_co_mutex_lock(&s->lock);
598     qemu_iovec_init(&local_qiov, qiov->niov);
599 
600     while (bytes > 0) {
601         image_offset = get_image_offset(bs, offset, false);
602         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
603 
604         if (image_offset == -1) {
605             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
606         } else {
607             qemu_iovec_reset(&local_qiov);
608             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
609 
610             ret = bdrv_co_preadv(bs->file->bs, image_offset, n_bytes,
611                                  &local_qiov, 0);
612             if (ret < 0) {
613                 goto fail;
614             }
615         }
616 
617         bytes -= n_bytes;
618         offset += n_bytes;
619         bytes_done += n_bytes;
620     }
621 
622     ret = 0;
623 fail:
624     qemu_iovec_destroy(&local_qiov);
625     qemu_co_mutex_unlock(&s->lock);
626 
627     return ret;
628 }
629 
630 static int coroutine_fn
631 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
632                QEMUIOVector *qiov, int flags)
633 {
634     BDRVVPCState *s = bs->opaque;
635     int64_t image_offset;
636     int64_t n_bytes;
637     int64_t bytes_done = 0;
638     int ret;
639     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
640     QEMUIOVector local_qiov;
641 
642     if (be32_to_cpu(footer->type) == VHD_FIXED) {
643         return bdrv_co_pwritev(bs->file->bs, offset, bytes, qiov, 0);
644     }
645 
646     qemu_co_mutex_lock(&s->lock);
647     qemu_iovec_init(&local_qiov, qiov->niov);
648 
649     while (bytes > 0) {
650         image_offset = get_image_offset(bs, offset, true);
651         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
652 
653         if (image_offset == -1) {
654             image_offset = alloc_block(bs, offset);
655             if (image_offset < 0) {
656                 ret = image_offset;
657                 goto fail;
658             }
659         }
660 
661         qemu_iovec_reset(&local_qiov);
662         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
663 
664         ret = bdrv_co_pwritev(bs->file->bs, image_offset, n_bytes,
665                               &local_qiov, 0);
666         if (ret < 0) {
667             goto fail;
668         }
669 
670         bytes -= n_bytes;
671         offset += n_bytes;
672         bytes_done += n_bytes;
673     }
674 
675     ret = 0;
676 fail:
677     qemu_iovec_destroy(&local_qiov);
678     qemu_co_mutex_unlock(&s->lock);
679 
680     return ret;
681 }
682 
683 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
684         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
685 {
686     BDRVVPCState *s = bs->opaque;
687     VHDFooter *footer = (VHDFooter*) s->footer_buf;
688     int64_t start, offset;
689     bool allocated;
690     int n;
691 
692     if (be32_to_cpu(footer->type) == VHD_FIXED) {
693         *pnum = nb_sectors;
694         *file = bs->file->bs;
695         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
696                (sector_num << BDRV_SECTOR_BITS);
697     }
698 
699     offset = get_sector_offset(bs, sector_num, 0);
700     start = offset;
701     allocated = (offset != -1);
702     *pnum = 0;
703 
704     do {
705         /* All sectors in a block are contiguous (without using the bitmap) */
706         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
707           - sector_num;
708         n = MIN(n, nb_sectors);
709 
710         *pnum += n;
711         sector_num += n;
712         nb_sectors -= n;
713         /* *pnum can't be greater than one block for allocated
714          * sectors since there is always a bitmap in between. */
715         if (allocated) {
716             *file = bs->file->bs;
717             return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
718         }
719         if (nb_sectors == 0) {
720             break;
721         }
722         offset = get_sector_offset(bs, sector_num, 0);
723     } while (offset == -1);
724 
725     return 0;
726 }
727 
728 /*
729  * Calculates the number of cylinders, heads and sectors per cylinder
730  * based on a given number of sectors. This is the algorithm described
731  * in the VHD specification.
732  *
733  * Note that the geometry doesn't always exactly match total_sectors but
734  * may round it down.
735  *
736  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
737  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
738  * and instead allow up to 255 heads.
739  */
740 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
741     uint8_t* heads, uint8_t* secs_per_cyl)
742 {
743     uint32_t cyls_times_heads;
744 
745     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
746 
747     if (total_sectors >= 65535LL * 16 * 63) {
748         *secs_per_cyl = 255;
749         *heads = 16;
750         cyls_times_heads = total_sectors / *secs_per_cyl;
751     } else {
752         *secs_per_cyl = 17;
753         cyls_times_heads = total_sectors / *secs_per_cyl;
754         *heads = (cyls_times_heads + 1023) / 1024;
755 
756         if (*heads < 4) {
757             *heads = 4;
758         }
759 
760         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
761             *secs_per_cyl = 31;
762             *heads = 16;
763             cyls_times_heads = total_sectors / *secs_per_cyl;
764         }
765 
766         if (cyls_times_heads >= (*heads * 1024)) {
767             *secs_per_cyl = 63;
768             *heads = 16;
769             cyls_times_heads = total_sectors / *secs_per_cyl;
770         }
771     }
772 
773     *cyls = cyls_times_heads / *heads;
774 
775     return 0;
776 }
777 
778 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
779                                int64_t total_sectors)
780 {
781     VHDDynDiskHeader *dyndisk_header =
782         (VHDDynDiskHeader *) buf;
783     size_t block_size, num_bat_entries;
784     int i;
785     int ret;
786     int64_t offset = 0;
787 
788     /* Write the footer (twice: at the beginning and at the end) */
789     block_size = 0x200000;
790     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
791 
792     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
793     if (ret < 0) {
794         goto fail;
795     }
796 
797     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
798     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
799     if (ret < 0) {
800         goto fail;
801     }
802 
803     /* Write the initial BAT */
804     offset = 3 * 512;
805 
806     memset(buf, 0xFF, 512);
807     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
808         ret = blk_pwrite(blk, offset, buf, 512, 0);
809         if (ret < 0) {
810             goto fail;
811         }
812         offset += 512;
813     }
814 
815     /* Prepare the Dynamic Disk Header */
816     memset(buf, 0, 1024);
817 
818     memcpy(dyndisk_header->magic, "cxsparse", 8);
819 
820     /*
821      * Note: The spec is actually wrong here for data_offset, it says
822      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
823      */
824     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
825     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
826     dyndisk_header->version = cpu_to_be32(0x00010000);
827     dyndisk_header->block_size = cpu_to_be32(block_size);
828     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
829 
830     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
831 
832     /* Write the header */
833     offset = 512;
834 
835     ret = blk_pwrite(blk, offset, buf, 1024, 0);
836     if (ret < 0) {
837         goto fail;
838     }
839 
840  fail:
841     return ret;
842 }
843 
844 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
845                              int64_t total_size)
846 {
847     int ret;
848 
849     /* Add footer to total size */
850     total_size += HEADER_SIZE;
851 
852     ret = blk_truncate(blk, total_size);
853     if (ret < 0) {
854         return ret;
855     }
856 
857     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
858     if (ret < 0) {
859         return ret;
860     }
861 
862     return ret;
863 }
864 
865 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
866 {
867     uint8_t buf[1024];
868     VHDFooter *footer = (VHDFooter *) buf;
869     char *disk_type_param;
870     int i;
871     uint16_t cyls = 0;
872     uint8_t heads = 0;
873     uint8_t secs_per_cyl = 0;
874     int64_t total_sectors;
875     int64_t total_size;
876     int disk_type;
877     int ret = -EIO;
878     bool force_size;
879     Error *local_err = NULL;
880     BlockBackend *blk = NULL;
881 
882     /* Read out options */
883     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
884                           BDRV_SECTOR_SIZE);
885     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
886     if (disk_type_param) {
887         if (!strcmp(disk_type_param, "dynamic")) {
888             disk_type = VHD_DYNAMIC;
889         } else if (!strcmp(disk_type_param, "fixed")) {
890             disk_type = VHD_FIXED;
891         } else {
892             error_setg(errp, "Invalid disk type, %s", disk_type_param);
893             ret = -EINVAL;
894             goto out;
895         }
896     } else {
897         disk_type = VHD_DYNAMIC;
898     }
899 
900     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
901 
902     ret = bdrv_create_file(filename, opts, &local_err);
903     if (ret < 0) {
904         error_propagate(errp, local_err);
905         goto out;
906     }
907 
908     blk = blk_new_open(filename, NULL, NULL,
909                        BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
910     if (blk == NULL) {
911         error_propagate(errp, local_err);
912         ret = -EIO;
913         goto out;
914     }
915 
916     blk_set_allow_write_beyond_eof(blk, true);
917 
918     /*
919      * Calculate matching total_size and geometry. Increase the number of
920      * sectors requested until we get enough (or fail). This ensures that
921      * qemu-img convert doesn't truncate images, but rather rounds up.
922      *
923      * If the image size can't be represented by a spec conformant CHS geometry,
924      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
925      * the image size from the VHD footer to calculate total_sectors.
926      */
927     if (force_size) {
928         /* This will force the use of total_size for sector count, below */
929         cyls         = VHD_CHS_MAX_C;
930         heads        = VHD_CHS_MAX_H;
931         secs_per_cyl = VHD_CHS_MAX_S;
932     } else {
933         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
934         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
935             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
936         }
937     }
938 
939     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
940         total_sectors = total_size / BDRV_SECTOR_SIZE;
941         /* Allow a maximum disk size of 2040 GiB */
942         if (total_sectors > VHD_MAX_SECTORS) {
943             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
944             ret = -EFBIG;
945             goto out;
946         }
947     } else {
948         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
949         total_size = total_sectors * BDRV_SECTOR_SIZE;
950     }
951 
952     /* Prepare the Hard Disk Footer */
953     memset(buf, 0, 1024);
954 
955     memcpy(footer->creator, "conectix", 8);
956     if (force_size) {
957         memcpy(footer->creator_app, "qem2", 4);
958     } else {
959         memcpy(footer->creator_app, "qemu", 4);
960     }
961     memcpy(footer->creator_os, "Wi2k", 4);
962 
963     footer->features = cpu_to_be32(0x02);
964     footer->version = cpu_to_be32(0x00010000);
965     if (disk_type == VHD_DYNAMIC) {
966         footer->data_offset = cpu_to_be64(HEADER_SIZE);
967     } else {
968         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
969     }
970     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
971 
972     /* Version of Virtual PC 2007 */
973     footer->major = cpu_to_be16(0x0005);
974     footer->minor = cpu_to_be16(0x0003);
975     footer->orig_size = cpu_to_be64(total_size);
976     footer->current_size = cpu_to_be64(total_size);
977     footer->cyls = cpu_to_be16(cyls);
978     footer->heads = heads;
979     footer->secs_per_cyl = secs_per_cyl;
980 
981     footer->type = cpu_to_be32(disk_type);
982 
983 #if defined(CONFIG_UUID)
984     uuid_generate(footer->uuid);
985 #endif
986 
987     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
988 
989     if (disk_type == VHD_DYNAMIC) {
990         ret = create_dynamic_disk(blk, buf, total_sectors);
991     } else {
992         ret = create_fixed_disk(blk, buf, total_size);
993     }
994     if (ret < 0) {
995         error_setg(errp, "Unable to create or write VHD header");
996     }
997 
998 out:
999     blk_unref(blk);
1000     g_free(disk_type_param);
1001     return ret;
1002 }
1003 
1004 static int vpc_has_zero_init(BlockDriverState *bs)
1005 {
1006     BDRVVPCState *s = bs->opaque;
1007     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1008 
1009     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1010         return bdrv_has_zero_init(bs->file->bs);
1011     } else {
1012         return 1;
1013     }
1014 }
1015 
1016 static void vpc_close(BlockDriverState *bs)
1017 {
1018     BDRVVPCState *s = bs->opaque;
1019     qemu_vfree(s->pagetable);
1020 #ifdef CACHE
1021     g_free(s->pageentry_u8);
1022 #endif
1023 
1024     migrate_del_blocker(s->migration_blocker);
1025     error_free(s->migration_blocker);
1026 }
1027 
/*
 * Creation-time options understood by the vpc driver.  The list is exposed
 * through bdrv_vpc.create_opts and its entries are consumed by vpc_create().
 */
static QemuOptsList vpc_create_opts = {
    .name = "vpc-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
    .desc = {
        {
            /* vpc_create() rounds this value up to BDRV_SECTOR_SIZE */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            /*
             * vpc_create() accepts "dynamic" or "fixed"; any other value is
             * rejected with -EINVAL, and an absent option means dynamic.
             */
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help =
                "Type of virtual hard disk format. Supported formats are "
                "{dynamic (default) | fixed} "
        },
        {
            /*
             * Defaults to false in vpc_create().  When set, the maximum CHS
             * geometry is written and the footer's creator_app is "qem2"
             * instead of "qemu".
             */
            .name = VPC_OPT_FORCE_SIZE,
            .type = QEMU_OPT_BOOL,
            .help = "Force disk size calculation to use the actual size "
                    "specified, rather than using the nearest CHS-based "
                    "calculation"
        },
        { /* end of list */ }
    }
};
1054 
/*
 * Driver description for the "vpc" image format.  It collects the callbacks
 * implemented in this file and is handed to bdrv_register() by
 * bdrv_vpc_init().
 */
static BlockDriver bdrv_vpc = {
    .format_name    = "vpc",
    /* Size of the per-image driver state (BDRVVPCState) */
    .instance_size  = sizeof(BDRVVPCState),

    /* Image detection, lifecycle and creation */
    .bdrv_probe             = vpc_probe,
    .bdrv_open              = vpc_open,
    .bdrv_close             = vpc_close,
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_create            = vpc_create,

    /* Coroutine-based data path and block status query */
    .bdrv_co_preadv             = vpc_co_preadv,
    .bdrv_co_pwritev            = vpc_co_pwritev,
    .bdrv_co_get_block_status   = vpc_co_get_block_status,

    .bdrv_get_info          = vpc_get_info,

    /* Options accepted by vpc_create(), see vpc_create_opts */
    .create_opts            = &vpc_create_opts,
    .bdrv_has_zero_init     = vpc_has_zero_init,
};
1074 
/* Register the vpc block driver description with bdrv_register(). */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/*
 * Hook bdrv_vpc_init() into the block-driver init mechanism.
 * NOTE(review): presumably run once during module initialization --
 * confirm against the block_init() definition in qemu/module.h.
 */
block_init(bdrv_vpc_init);
1081