xref: /openbmc/qemu/block/vpc.c (revision ea1ff54f)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/blocker.h"
32 #include "qemu/bswap.h"
33 #include "qemu/uuid.h"
34 
35 /**************************************************************/
36 
/*
 * Size in bytes of the buffers used in this file to hold a VHD footer
 * (BDRVVPCState.footer_buf) or the first sector of a dynamic disk header.
 */
#define HEADER_SIZE 512

/*
 * Disabled by default.  Defining CACHE compiles in the pageentry_* fields of
 * BDRVVPCState and their setup/teardown in vpc_open().
 */
//#define CACHE
40 
/*
 * Disk type values; the drivers below compare the (big-endian) footer
 * 'type' field against these after converting it to host byte order.
 */
enum vhd_type {
    VHD_FIXED           = 2,    /* footer is found only at the end of file */
    VHD_DYNAMIC         = 3,    /* has a dynamic disk header and a BAT */
    VHD_DIFFERENCING    = 4,    /* NOTE(review): presumably an image with a
                                   parent disk; not referenced in this part
                                   of the file - confirm */
};
46 
/*
 * Unix time (seconds since Jan 1, 1970 UTC) of Jan 1, 2000 0:00:00 (UTC),
 * which is the origin that VHD footer timestamps are counted from.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder, head and sector values used for a VHD CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count of the largest CHS geometry (product of the maxima above) */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the boolean creation option read by vpc_create() */
#define VPC_OPT_FORCE_SIZE "force_size"
58 
/*
 * VHD hard disk footer as stored in the image file.
 *
 * All multi-byte integer fields are always big-endian and must be converted
 * with be16/be32/be64_to_cpu() before use.  The struct is packed because it
 * is overlaid directly on the raw bytes read from the file.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /* Identifies the creating tool; vpc_open() uses it to choose between
       CHS geometry and current_size for the disk size */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Disk size in bytes; vpc_open() divides it by BDRV_SECTOR_SIZE */
    uint64_t    current_size;

    /* CHS geometry; vpc_open() multiplies the three values to obtain the
       number of sectors, i.e. secs_per_cyl is used as sectors per track */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
94 
/*
 * Dynamic disk header as stored in the image file.  vpc_open() reads it from
 * the offset given by the footer's data_offset field and converts its
 * integer fields from big-endian.  Packed because it is overlaid on raw
 * file bytes.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    /* Number of entries in the BAT */
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* create_dynamic_disk() stores vpc_checksum() of the 1024-byte header
       here, computed while this field is still zero */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* NOTE(review): presumably describes where the parent (backing) file
       can be found; not used in this part of the file - confirm */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
126 
/* Per-image driver state, stored in BlockDriverState.opaque */
typedef struct BDRVVPCState {
    /* Taken around the dynamic-disk read, write and block-status paths */
    CoMutex lock;
    /* Raw footer as read from the image (fields still big-endian) */
    uint8_t footer_buf[HEADER_SIZE];
    /* Byte offset in the image file where the next data block is placed;
       rewrite_footer() also writes the footer at this offset */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /* In-memory copy of the BAT in host byte order.  Each entry is the
       position of a block in 512-byte units, 0xFFFFFFFF if unallocated */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently filled by
       get_image_offset(); initialised to -1 by vpc_open() */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block (without its bitmap) */
    uint32_t block_size;
    /* Size in bytes of the bitmap stored in front of each data block */
    uint32_t bitmap_size;
    /* Set by vpc_parse_options() for "chs" / "current_size" respectively */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Registered with migrate_add_blocker() in vpc_open() */
    Error *migration_blocker;
} BDRVVPCState;
151 
/* Name of the runtime option that overrides the disk size calculation */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime (open-time) options understood by this driver.  vpc_open() absorbs
 * them from the options dictionary and vpc_parse_options() interprets them.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
167 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the sum of
 * all of its bytes.
 *
 * The caller is responsible for zeroing the checksum field inside the buffer
 * before calling this, as vpc_open() and create_dynamic_disk() do.
 *
 * @buf:  bytes to sum (not modified)
 * @size: number of bytes in @buf; 0 yields ~0
 *
 * Returns the checksum in host byte order.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t sum = 0;
    size_t i;   /* same type as @size, avoids a signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        sum += buf[i];
    }

    return ~sum;
}
178 
179 
/*
 * Format probe: reports a score of 100 when the buffer starts with the
 * 8-byte footer signature "conectix", and 0 otherwise (including when fewer
 * than 8 bytes are available).  @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char signature[] = "conectix";

    if (buf_size < 8) {
        return 0;
    }
    if (strncmp((const char *)buf, signature, 8) != 0) {
        return 0;
    }
    return 100;
}
186 
187 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
188                               Error **errp)
189 {
190     BDRVVPCState *s = bs->opaque;
191     const char *size_calc;
192 
193     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
194 
195     if (!size_calc) {
196        /* no override, use autodetect only */
197     } else if (!strcmp(size_calc, "current_size")) {
198         s->force_use_sz = true;
199     } else if (!strcmp(size_calc, "chs")) {
200         s->force_use_chs = true;
201     } else {
202         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
203     }
204 }
205 
206 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
207                     Error **errp)
208 {
209     BDRVVPCState *s = bs->opaque;
210     int i;
211     VHDFooter *footer;
212     VHDDynDiskHeader *dyndisk_header;
213     QemuOpts *opts = NULL;
214     Error *local_err = NULL;
215     bool use_chs;
216     uint8_t buf[HEADER_SIZE];
217     uint32_t checksum;
218     uint64_t computed_size;
219     uint64_t pagetable_size;
220     int disk_type = VHD_DYNAMIC;
221     int ret;
222 
223     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
224                                false, errp);
225     if (!bs->file) {
226         return -EINVAL;
227     }
228 
229     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
230     qemu_opts_absorb_qdict(opts, options, &local_err);
231     if (local_err) {
232         error_propagate(errp, local_err);
233         ret = -EINVAL;
234         goto fail;
235     }
236 
237     vpc_parse_options(bs, opts, &local_err);
238     if (local_err) {
239         error_propagate(errp, local_err);
240         ret = -EINVAL;
241         goto fail;
242     }
243 
244     ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
245     if (ret < 0) {
246         error_setg(errp, "Unable to read VHD header");
247         goto fail;
248     }
249 
250     footer = (VHDFooter *) s->footer_buf;
251     if (strncmp(footer->creator, "conectix", 8)) {
252         int64_t offset = bdrv_getlength(bs->file->bs);
253         if (offset < 0) {
254             ret = offset;
255             error_setg(errp, "Invalid file size");
256             goto fail;
257         } else if (offset < HEADER_SIZE) {
258             ret = -EINVAL;
259             error_setg(errp, "File too small for a VHD header");
260             goto fail;
261         }
262 
263         /* If a fixed disk, the footer is found only at the end of the file */
264         ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
265                          HEADER_SIZE);
266         if (ret < 0) {
267             goto fail;
268         }
269         if (strncmp(footer->creator, "conectix", 8)) {
270             error_setg(errp, "invalid VPC image");
271             ret = -EINVAL;
272             goto fail;
273         }
274         disk_type = VHD_FIXED;
275     }
276 
277     checksum = be32_to_cpu(footer->checksum);
278     footer->checksum = 0;
279     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
280         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
281             "incorrect.\n", bs->filename);
282 
283     /* Write 'checksum' back to footer, or else will leave it with zero. */
284     footer->checksum = cpu_to_be32(checksum);
285 
286     /* The visible size of a image in Virtual PC depends on the geometry
287        rather than on the size stored in the footer (the size in the footer
288        is too large usually) */
289     bs->total_sectors = (int64_t)
290         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
291 
292     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
293      * VHD image sizes differently.  VPC will rely on CHS geometry,
294      * while Hyper-V and disk2vhd use the size specified in the footer.
295      *
296      * We use a couple of approaches to try and determine the correct method:
297      * look at the Creator App field, and look for images that have CHS
298      * geometry that is the maximum value.
299      *
300      * If the CHS geometry is the maximum CHS geometry, then we assume that
301      * the size is the footer->current_size to avoid truncation.  Otherwise,
302      * we follow the table based on footer->creator_app:
303      *
304      *  Known creator apps:
305      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
306      *      'qemu'  :  CHS              QEMU (uses disk geometry)
307      *      'qem2'  :  current_size     QEMU (uses current_size)
308      *      'win '  :  current_size     Hyper-V
309      *      'd2v '  :  current_size     Disk2vhd
310      *      'tap\0' :  current_size     XenServer
311      *      'CTXS'  :  current_size     XenConverter
312      *
313      *  The user can override the table values via drive options, however
314      *  even with an override we will still use current_size for images
315      *  that have CHS geometry of the maximum size.
316      */
317     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
318                !!strncmp(footer->creator_app, "qem2", 4) &&
319                !!strncmp(footer->creator_app, "d2v ", 4) &&
320                !!strncmp(footer->creator_app, "CTXS", 4) &&
321                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
322 
323     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
324         bs->total_sectors = be64_to_cpu(footer->current_size) /
325                                         BDRV_SECTOR_SIZE;
326     }
327 
328     /* Allow a maximum disk size of 2040 GiB */
329     if (bs->total_sectors > VHD_MAX_SECTORS) {
330         ret = -EFBIG;
331         goto fail;
332     }
333 
334     if (disk_type == VHD_DYNAMIC) {
335         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
336                          HEADER_SIZE);
337         if (ret < 0) {
338             error_setg(errp, "Error reading dynamic VHD header");
339             goto fail;
340         }
341 
342         dyndisk_header = (VHDDynDiskHeader *) buf;
343 
344         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
345             error_setg(errp, "Invalid header magic");
346             ret = -EINVAL;
347             goto fail;
348         }
349 
350         s->block_size = be32_to_cpu(dyndisk_header->block_size);
351         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
352             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
353             ret = -EINVAL;
354             goto fail;
355         }
356         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
357 
358         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
359 
360         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
361             error_setg(errp, "Too many blocks");
362             ret = -EINVAL;
363             goto fail;
364         }
365 
366         computed_size = (uint64_t) s->max_table_entries * s->block_size;
367         if (computed_size < bs->total_sectors * 512) {
368             error_setg(errp, "Page table too small");
369             ret = -EINVAL;
370             goto fail;
371         }
372 
373         if (s->max_table_entries > SIZE_MAX / 4 ||
374             s->max_table_entries > (int) INT_MAX / 4) {
375             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
376                         s->max_table_entries);
377             ret = -EINVAL;
378             goto fail;
379         }
380 
381         pagetable_size = (uint64_t) s->max_table_entries * 4;
382 
383         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
384         if (s->pagetable == NULL) {
385             error_setg(errp, "Unable to allocate memory for page table");
386             ret = -ENOMEM;
387             goto fail;
388         }
389 
390         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
391 
392         ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
393                          pagetable_size);
394         if (ret < 0) {
395             error_setg(errp, "Error reading pagetable");
396             goto fail;
397         }
398 
399         s->free_data_block_offset =
400             ROUND_UP(s->bat_offset + pagetable_size, 512);
401 
402         for (i = 0; i < s->max_table_entries; i++) {
403             be32_to_cpus(&s->pagetable[i]);
404             if (s->pagetable[i] != 0xFFFFFFFF) {
405                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
406                     s->bitmap_size + s->block_size;
407 
408                 if (next > s->free_data_block_offset) {
409                     s->free_data_block_offset = next;
410                 }
411             }
412         }
413 
414         if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) {
415             error_setg(errp, "block-vpc: free_data_block_offset points after "
416                              "the end of file. The image has been truncated.");
417             ret = -EINVAL;
418             goto fail;
419         }
420 
421         s->last_bitmap_offset = (int64_t) -1;
422 
423 #ifdef CACHE
424         s->pageentry_u8 = g_malloc(512);
425         s->pageentry_u32 = s->pageentry_u8;
426         s->pageentry_u16 = s->pageentry_u8;
427         s->last_pagetable = -1;
428 #endif
429     }
430 
431     /* Disable migration when VHD images are used */
432     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
433                "does not support live migration",
434                bdrv_get_device_or_node_name(bs));
435     ret = migrate_add_blocker(s->migration_blocker, &local_err);
436     if (local_err) {
437         error_propagate(errp, local_err);
438         error_free(s->migration_blocker);
439         goto fail;
440     }
441 
442     qemu_co_mutex_init(&s->lock);
443 
444     return 0;
445 
446 fail:
447     qemu_vfree(s->pagetable);
448 #ifdef CACHE
449     g_free(s->pageentry_u8);
450 #endif
451     return ret;
452 }
453 
/*
 * Reopen hook.  Performs no checks and no work: every reopen request is
 * accepted and 0 is returned unconditionally; @errp is never set.
 */
static int vpc_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
459 
460 /*
461  * Returns the absolute byte offset of the given sector in the image file.
462  * If the sector is not allocated, -1 is returned instead.
463  * If an error occurred trying to write an updated block bitmap back to
464  * the file, -2 is returned, and the error value is written to *err.
465  * This can only happen for a write operation.
466  *
467  * The parameter write must be 1 if the offset will be used for a write
468  * operation (the block bitmaps is updated then), 0 otherwise.
469  * If write is true then err must not be NULL.
470  */
471 static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
472                                        bool write, int *err)
473 {
474     BDRVVPCState *s = bs->opaque;
475     uint64_t bitmap_offset, block_offset;
476     uint32_t pagetable_index, offset_in_block;
477 
478     assert(!(write && err == NULL));
479 
480     pagetable_index = offset / s->block_size;
481     offset_in_block = offset % s->block_size;
482 
483     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
484         return -1; /* not allocated */
485 
486     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
487     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
488 
489     /* We must ensure that we don't write to any sectors which are marked as
490        unused in the bitmap. We get away with setting all bits in the block
491        bitmap each time we write to a new block. This might cause Virtual PC to
492        miss sparse read optimization, but it's not a problem in terms of
493        correctness. */
494     if (write && (s->last_bitmap_offset != bitmap_offset)) {
495         uint8_t bitmap[s->bitmap_size];
496         int r;
497 
498         s->last_bitmap_offset = bitmap_offset;
499         memset(bitmap, 0xff, s->bitmap_size);
500         r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
501         if (r < 0) {
502             *err = r;
503             return -2;
504         }
505     }
506 
507     return block_offset;
508 }
509 
510 /*
511  * Writes the footer to the end of the image file. This is needed when the
512  * file grows as it overwrites the old footer
513  *
514  * Returns 0 on success and < 0 on error
515  */
516 static int rewrite_footer(BlockDriverState* bs)
517 {
518     int ret;
519     BDRVVPCState *s = bs->opaque;
520     int64_t offset = s->free_data_block_offset;
521 
522     ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
523     if (ret < 0)
524         return ret;
525 
526     return 0;
527 }
528 
529 /*
530  * Allocates a new block. This involves writing a new footer and updating
531  * the Block Allocation Table to use the space at the old end of the image
532  * file (overwriting the old footer)
533  *
534  * Returns the sectors' offset in the image file on success and < 0 on error
535  */
536 static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
537 {
538     BDRVVPCState *s = bs->opaque;
539     int64_t bat_offset;
540     uint32_t index, bat_value;
541     int ret;
542     uint8_t bitmap[s->bitmap_size];
543 
544     /* Check if sector_num is valid */
545     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
546         return -EINVAL;
547     }
548 
549     /* Write entry into in-memory BAT */
550     index = offset / s->block_size;
551     assert(s->pagetable[index] == 0xFFFFFFFF);
552     s->pagetable[index] = s->free_data_block_offset / 512;
553 
554     /* Initialize the block's bitmap */
555     memset(bitmap, 0xff, s->bitmap_size);
556     ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
557         s->bitmap_size);
558     if (ret < 0) {
559         return ret;
560     }
561 
562     /* Write new footer (the old one will be overwritten) */
563     s->free_data_block_offset += s->block_size + s->bitmap_size;
564     ret = rewrite_footer(bs);
565     if (ret < 0)
566         goto fail;
567 
568     /* Write BAT entry to disk */
569     bat_offset = s->bat_offset + (4 * index);
570     bat_value = cpu_to_be32(s->pagetable[index]);
571     ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
572     if (ret < 0)
573         goto fail;
574 
575     return get_image_offset(bs, offset, false, NULL);
576 
577 fail:
578     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
579     return ret;
580 }
581 
582 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
583 {
584     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
585     VHDFooter *footer = (VHDFooter *) s->footer_buf;
586 
587     if (be32_to_cpu(footer->type) != VHD_FIXED) {
588         bdi->cluster_size = s->block_size;
589     }
590 
591     bdi->unallocated_blocks_are_zero = true;
592     return 0;
593 }
594 
595 static int coroutine_fn
596 vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
597               QEMUIOVector *qiov, int flags)
598 {
599     BDRVVPCState *s = bs->opaque;
600     int ret;
601     int64_t image_offset;
602     int64_t n_bytes;
603     int64_t bytes_done = 0;
604     VHDFooter *footer = (VHDFooter *) s->footer_buf;
605     QEMUIOVector local_qiov;
606 
607     if (be32_to_cpu(footer->type) == VHD_FIXED) {
608         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
609     }
610 
611     qemu_co_mutex_lock(&s->lock);
612     qemu_iovec_init(&local_qiov, qiov->niov);
613 
614     while (bytes > 0) {
615         image_offset = get_image_offset(bs, offset, false, NULL);
616         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
617 
618         if (image_offset == -1) {
619             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
620         } else {
621             qemu_iovec_reset(&local_qiov);
622             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
623 
624             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
625                                  &local_qiov, 0);
626             if (ret < 0) {
627                 goto fail;
628             }
629         }
630 
631         bytes -= n_bytes;
632         offset += n_bytes;
633         bytes_done += n_bytes;
634     }
635 
636     ret = 0;
637 fail:
638     qemu_iovec_destroy(&local_qiov);
639     qemu_co_mutex_unlock(&s->lock);
640 
641     return ret;
642 }
643 
644 static int coroutine_fn
645 vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
646                QEMUIOVector *qiov, int flags)
647 {
648     BDRVVPCState *s = bs->opaque;
649     int64_t image_offset;
650     int64_t n_bytes;
651     int64_t bytes_done = 0;
652     int ret = 0;
653     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
654     QEMUIOVector local_qiov;
655 
656     if (be32_to_cpu(footer->type) == VHD_FIXED) {
657         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
658     }
659 
660     qemu_co_mutex_lock(&s->lock);
661     qemu_iovec_init(&local_qiov, qiov->niov);
662 
663     while (bytes > 0) {
664         image_offset = get_image_offset(bs, offset, true, &ret);
665         if (image_offset == -2) {
666             /* Failed to write block bitmap: can't proceed with write */
667             goto fail;
668         }
669         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
670 
671         if (image_offset == -1) {
672             image_offset = alloc_block(bs, offset);
673             if (image_offset < 0) {
674                 ret = image_offset;
675                 goto fail;
676             }
677         }
678 
679         qemu_iovec_reset(&local_qiov);
680         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
681 
682         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
683                               &local_qiov, 0);
684         if (ret < 0) {
685             goto fail;
686         }
687 
688         bytes -= n_bytes;
689         offset += n_bytes;
690         bytes_done += n_bytes;
691     }
692 
693     ret = 0;
694 fail:
695     qemu_iovec_destroy(&local_qiov);
696     qemu_co_mutex_unlock(&s->lock);
697 
698     return ret;
699 }
700 
701 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
702         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
703 {
704     BDRVVPCState *s = bs->opaque;
705     VHDFooter *footer = (VHDFooter*) s->footer_buf;
706     int64_t start, offset;
707     bool allocated;
708     int64_t ret;
709     int n;
710 
711     if (be32_to_cpu(footer->type) == VHD_FIXED) {
712         *pnum = nb_sectors;
713         *file = bs->file->bs;
714         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
715                (sector_num << BDRV_SECTOR_BITS);
716     }
717 
718     qemu_co_mutex_lock(&s->lock);
719 
720     offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false, NULL);
721     start = offset;
722     allocated = (offset != -1);
723     *pnum = 0;
724     ret = 0;
725 
726     do {
727         /* All sectors in a block are contiguous (without using the bitmap) */
728         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
729           - sector_num;
730         n = MIN(n, nb_sectors);
731 
732         *pnum += n;
733         sector_num += n;
734         nb_sectors -= n;
735         /* *pnum can't be greater than one block for allocated
736          * sectors since there is always a bitmap in between. */
737         if (allocated) {
738             *file = bs->file->bs;
739             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
740             break;
741         }
742         if (nb_sectors == 0) {
743             break;
744         }
745         offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false,
746                                   NULL);
747     } while (offset == -1);
748 
749     qemu_co_mutex_unlock(&s->lock);
750     return ret;
751 }
752 
753 /*
754  * Calculates the number of cylinders, heads and sectors per cylinder
755  * based on a given number of sectors. This is the algorithm described
756  * in the VHD specification.
757  *
758  * Note that the geometry doesn't always exactly match total_sectors but
759  * may round it down.
760  *
761  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
762  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
763  * and instead allow up to 255 heads.
764  */
765 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
766     uint8_t* heads, uint8_t* secs_per_cyl)
767 {
768     uint32_t cyls_times_heads;
769 
770     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
771 
772     if (total_sectors >= 65535LL * 16 * 63) {
773         *secs_per_cyl = 255;
774         *heads = 16;
775         cyls_times_heads = total_sectors / *secs_per_cyl;
776     } else {
777         *secs_per_cyl = 17;
778         cyls_times_heads = total_sectors / *secs_per_cyl;
779         *heads = (cyls_times_heads + 1023) / 1024;
780 
781         if (*heads < 4) {
782             *heads = 4;
783         }
784 
785         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
786             *secs_per_cyl = 31;
787             *heads = 16;
788             cyls_times_heads = total_sectors / *secs_per_cyl;
789         }
790 
791         if (cyls_times_heads >= (*heads * 1024)) {
792             *secs_per_cyl = 63;
793             *heads = 16;
794             cyls_times_heads = total_sectors / *secs_per_cyl;
795         }
796     }
797 
798     *cyls = cyls_times_heads / *heads;
799 
800     return 0;
801 }
802 
803 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
804                                int64_t total_sectors)
805 {
806     VHDDynDiskHeader *dyndisk_header =
807         (VHDDynDiskHeader *) buf;
808     size_t block_size, num_bat_entries;
809     int i;
810     int ret;
811     int64_t offset = 0;
812 
813     /* Write the footer (twice: at the beginning and at the end) */
814     block_size = 0x200000;
815     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
816 
817     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
818     if (ret < 0) {
819         goto fail;
820     }
821 
822     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
823     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
824     if (ret < 0) {
825         goto fail;
826     }
827 
828     /* Write the initial BAT */
829     offset = 3 * 512;
830 
831     memset(buf, 0xFF, 512);
832     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
833         ret = blk_pwrite(blk, offset, buf, 512, 0);
834         if (ret < 0) {
835             goto fail;
836         }
837         offset += 512;
838     }
839 
840     /* Prepare the Dynamic Disk Header */
841     memset(buf, 0, 1024);
842 
843     memcpy(dyndisk_header->magic, "cxsparse", 8);
844 
845     /*
846      * Note: The spec is actually wrong here for data_offset, it says
847      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
848      */
849     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
850     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
851     dyndisk_header->version = cpu_to_be32(0x00010000);
852     dyndisk_header->block_size = cpu_to_be32(block_size);
853     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
854 
855     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
856 
857     /* Write the header */
858     offset = 512;
859 
860     ret = blk_pwrite(blk, offset, buf, 1024, 0);
861     if (ret < 0) {
862         goto fail;
863     }
864 
865  fail:
866     return ret;
867 }
868 
869 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
870                              int64_t total_size, Error **errp)
871 {
872     int ret;
873 
874     /* Add footer to total size */
875     total_size += HEADER_SIZE;
876 
877     ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
878     if (ret < 0) {
879         return ret;
880     }
881 
882     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
883     if (ret < 0) {
884         error_setg_errno(errp, -ret, "Unable to write VHD header");
885         return ret;
886     }
887 
888     return ret;
889 }
890 
891 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
892 {
893     uint8_t buf[1024];
894     VHDFooter *footer = (VHDFooter *) buf;
895     char *disk_type_param;
896     int i;
897     uint16_t cyls = 0;
898     uint8_t heads = 0;
899     uint8_t secs_per_cyl = 0;
900     int64_t total_sectors;
901     int64_t total_size;
902     int disk_type;
903     int ret = -EIO;
904     bool force_size;
905     Error *local_err = NULL;
906     BlockBackend *blk = NULL;
907 
908     /* Read out options */
909     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
910                           BDRV_SECTOR_SIZE);
911     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
912     if (disk_type_param) {
913         if (!strcmp(disk_type_param, "dynamic")) {
914             disk_type = VHD_DYNAMIC;
915         } else if (!strcmp(disk_type_param, "fixed")) {
916             disk_type = VHD_FIXED;
917         } else {
918             error_setg(errp, "Invalid disk type, %s", disk_type_param);
919             ret = -EINVAL;
920             goto out;
921         }
922     } else {
923         disk_type = VHD_DYNAMIC;
924     }
925 
926     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
927 
928     ret = bdrv_create_file(filename, opts, &local_err);
929     if (ret < 0) {
930         error_propagate(errp, local_err);
931         goto out;
932     }
933 
934     blk = blk_new_open(filename, NULL, NULL,
935                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
936                        &local_err);
937     if (blk == NULL) {
938         error_propagate(errp, local_err);
939         ret = -EIO;
940         goto out;
941     }
942 
943     blk_set_allow_write_beyond_eof(blk, true);
944 
945     /*
946      * Calculate matching total_size and geometry. Increase the number of
947      * sectors requested until we get enough (or fail). This ensures that
948      * qemu-img convert doesn't truncate images, but rather rounds up.
949      *
950      * If the image size can't be represented by a spec conformant CHS geometry,
951      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
952      * the image size from the VHD footer to calculate total_sectors.
953      */
954     if (force_size) {
955         /* This will force the use of total_size for sector count, below */
956         cyls         = VHD_CHS_MAX_C;
957         heads        = VHD_CHS_MAX_H;
958         secs_per_cyl = VHD_CHS_MAX_S;
959     } else {
960         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
961         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
962             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
963         }
964     }
965 
966     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
967         total_sectors = total_size / BDRV_SECTOR_SIZE;
968         /* Allow a maximum disk size of 2040 GiB */
969         if (total_sectors > VHD_MAX_SECTORS) {
970             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
971             ret = -EFBIG;
972             goto out;
973         }
974     } else {
975         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
976         total_size = total_sectors * BDRV_SECTOR_SIZE;
977     }
978 
979     /* Prepare the Hard Disk Footer */
980     memset(buf, 0, 1024);
981 
982     memcpy(footer->creator, "conectix", 8);
983     if (force_size) {
984         memcpy(footer->creator_app, "qem2", 4);
985     } else {
986         memcpy(footer->creator_app, "qemu", 4);
987     }
988     memcpy(footer->creator_os, "Wi2k", 4);
989 
990     footer->features = cpu_to_be32(0x02);
991     footer->version = cpu_to_be32(0x00010000);
992     if (disk_type == VHD_DYNAMIC) {
993         footer->data_offset = cpu_to_be64(HEADER_SIZE);
994     } else {
995         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
996     }
997     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
998 
999     /* Version of Virtual PC 2007 */
1000     footer->major = cpu_to_be16(0x0005);
1001     footer->minor = cpu_to_be16(0x0003);
1002     footer->orig_size = cpu_to_be64(total_size);
1003     footer->current_size = cpu_to_be64(total_size);
1004     footer->cyls = cpu_to_be16(cyls);
1005     footer->heads = heads;
1006     footer->secs_per_cyl = secs_per_cyl;
1007 
1008     footer->type = cpu_to_be32(disk_type);
1009 
1010     qemu_uuid_generate(&footer->uuid);
1011 
1012     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
1013 
1014     if (disk_type == VHD_DYNAMIC) {
1015         ret = create_dynamic_disk(blk, buf, total_sectors);
1016         if (ret < 0) {
1017             error_setg(errp, "Unable to create or write VHD header");
1018         }
1019     } else {
1020         ret = create_fixed_disk(blk, buf, total_size, errp);
1021     }
1022 
1023 out:
1024     blk_unref(blk);
1025     g_free(disk_type_param);
1026     return ret;
1027 }
1028 
1029 static int vpc_has_zero_init(BlockDriverState *bs)
1030 {
1031     BDRVVPCState *s = bs->opaque;
1032     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1033 
1034     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1035         return bdrv_has_zero_init(bs->file->bs);
1036     } else {
1037         return 1;
1038     }
1039 }
1040 
1041 static void vpc_close(BlockDriverState *bs)
1042 {
1043     BDRVVPCState *s = bs->opaque;
1044     qemu_vfree(s->pagetable);
1045 #ifdef CACHE
1046     g_free(s->pageentry_u8);
1047 #endif
1048 
1049     migrate_del_blocker(s->migration_blocker);
1050     error_free(s->migration_blocker);
1051 }
1052 
1053 static QemuOptsList vpc_create_opts = {
1054     .name = "vpc-create-opts",
1055     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1056     .desc = {
1057         {
1058             .name = BLOCK_OPT_SIZE,
1059             .type = QEMU_OPT_SIZE,
1060             .help = "Virtual disk size"
1061         },
1062         {
1063             .name = BLOCK_OPT_SUBFMT,
1064             .type = QEMU_OPT_STRING,
1065             .help =
1066                 "Type of virtual hard disk format. Supported formats are "
1067                 "{dynamic (default) | fixed} "
1068         },
1069         {
1070             .name = VPC_OPT_FORCE_SIZE,
1071             .type = QEMU_OPT_BOOL,
1072             .help = "Force disk size calculation to use the actual size "
1073                     "specified, rather than using the nearest CHS-based "
1074                     "calculation"
1075         },
1076         { /* end of list */ }
1077     }
1078 };
1079 
1080 static BlockDriver bdrv_vpc = {
1081     .format_name    = "vpc",
1082     .instance_size  = sizeof(BDRVVPCState),
1083 
1084     .bdrv_probe             = vpc_probe,
1085     .bdrv_open              = vpc_open,
1086     .bdrv_close             = vpc_close,
1087     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1088     .bdrv_child_perm        = bdrv_format_default_perms,
1089     .bdrv_create            = vpc_create,
1090 
1091     .bdrv_co_preadv             = vpc_co_preadv,
1092     .bdrv_co_pwritev            = vpc_co_pwritev,
1093     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1094 
1095     .bdrv_get_info          = vpc_get_info,
1096 
1097     .create_opts            = &vpc_create_opts,
1098     .bdrv_has_zero_init     = vpc_has_zero_init,
1099 };
1100 
1101 static void bdrv_vpc_init(void)
1102 {
1103     bdrv_register(&bdrv_vpc);
1104 }
1105 
1106 block_init(bdrv_vpc_init);
1107