xref: /openbmc/qemu/block/vpc.c (revision bcd82a96)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "migration/migration.h"
32 #if defined(CONFIG_UUID)
33 #include <uuid/uuid.h>
34 #endif
35 
36 /**************************************************************/
37 
/* Size in bytes of the VHD footer as read, cached and rewritten here */
#define HEADER_SIZE 512

/*
 * Uncomment to compile the experimental "#ifdef CACHE" members and
 * initialisation.  NOTE(review): that code refers to a last_pagetable
 * member which the state structure does not declare, so it does not look
 * buildable as written - confirm before enabling.
 */
//#define CACHE

/* Values of the footer "type" field that this driver distinguishes */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};

/*
 * Unix time of Jan 1, 2000 0:00:00 (UTC).  Footer timestamps are stored as
 * seconds relative to this instant (time(NULL) - VHD_TIMESTAMP_BASE).
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder / head / sector counts used for the CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Sector count of the maximum geometry (65535 x 16 x 255) */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the boolean image creation option read by vpc_create() */
#define VPC_OPT_FORCE_SIZE "force_size"
59 
/*
 * On-disk VHD footer.  All multi-byte fields are always big-endian; the
 * structure is packed and is accessed by casting the raw HEADER_SIZE byte
 * buffer, so the field order and widths must not change.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /* Consulted on open to choose between CHS and current_size sizing */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    /* Image sizes in bytes */
    uint64_t    orig_size;
    uint64_t    current_size;

    /* Disk geometry; cyls * heads * secs_per_cyl is used as a sector count */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    uint8_t     uuid[16];

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
95 
/*
 * On-disk header of a dynamic VHD image, located at the footer's
 * data_offset.  Fields are converted from/to big-endian wherever this
 * driver reads or writes them.  The structure is packed and is accessed by
 * casting a raw byte buffer, so the layout must not change.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* Not interpreted by the code visible in this file */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
127 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Taken by vpc_co_read()/vpc_co_write() around the actual I/O */
    CoMutex lock;
    /* Raw footer as read from the image (fields still big-endian) */
    uint8_t footer_buf[HEADER_SIZE];
    /* Byte offset at which the next block is allocated; the footer is
       rewritten at this position whenever the file grows */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable */
    int max_table_entries;
    /* In-memory BAT in host byte order: sector offset of each block's
       bitmap, or 0xFFFFFFFF if the block is not allocated */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Byte offset of the block bitmap most recently filled with 0xff by
       get_sector_offset(), used to skip redundant bitmap writes */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block and of its sector bitmap */
    uint32_t block_size;
    uint32_t bitmap_size;
    /* Set from the "force_size_calc" runtime option ("chs") */
    bool force_use_chs;
    /* Set from the "force_size_calc" runtime option ("current_size") */
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Registered on open; live migration is not supported for vpc */
    Error *migration_blocker;
} BDRVVPCState;
152 
/* Name of the runtime option that overrides how the disk size is derived */
#define VPC_OPT_SIZE_CALC "force_size_calc"

/*
 * Runtime (open-time) options understood by the vpc driver.  The only
 * entry is a string option whose accepted values, "chs" and
 * "current_size", are validated in vpc_parse_options().
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};
168 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * 32-bit (wrapping) sum of all @size bytes starting at @buf.
 *
 * Callers must zero the checksum field inside the buffer beforehand so
 * that it does not contribute to the sum.  An empty buffer yields
 * 0xFFFFFFFF.
 */
static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
{
    uint32_t res = 0;
    size_t i;

    /* Index with size_t so the loop bound is never compared as signed */
    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
179 
180 
/*
 * Format probe.  Scores 100 when the buffer holds at least 8 bytes and
 * starts with the VHD footer cookie "conectix", 0 otherwise.  @filename
 * is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char cookie[] = "conectix";

    if (buf_size < 8) {
        return 0;
    }

    /* The cookie contains no NUL byte, so a plain byte compare suffices */
    return memcmp(buf, cookie, 8) == 0 ? 100 : 0;
}
187 
188 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
189                               Error **errp)
190 {
191     BDRVVPCState *s = bs->opaque;
192     const char *size_calc;
193 
194     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
195 
196     if (!size_calc) {
197        /* no override, use autodetect only */
198     } else if (!strcmp(size_calc, "current_size")) {
199         s->force_use_sz = true;
200     } else if (!strcmp(size_calc, "chs")) {
201         s->force_use_chs = true;
202     } else {
203         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
204     }
205 }
206 
207 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
208                     Error **errp)
209 {
210     BDRVVPCState *s = bs->opaque;
211     int i;
212     VHDFooter *footer;
213     VHDDynDiskHeader *dyndisk_header;
214     QemuOpts *opts = NULL;
215     Error *local_err = NULL;
216     bool use_chs;
217     uint8_t buf[HEADER_SIZE];
218     uint32_t checksum;
219     uint64_t computed_size;
220     uint64_t pagetable_size;
221     int disk_type = VHD_DYNAMIC;
222     int ret;
223 
224     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
225     qemu_opts_absorb_qdict(opts, options, &local_err);
226     if (local_err) {
227         error_propagate(errp, local_err);
228         ret = -EINVAL;
229         goto fail;
230     }
231 
232     vpc_parse_options(bs, opts, &local_err);
233     if (local_err) {
234         error_propagate(errp, local_err);
235         ret = -EINVAL;
236         goto fail;
237     }
238 
239     ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE);
240     if (ret < 0) {
241         error_setg(errp, "Unable to read VHD header");
242         goto fail;
243     }
244 
245     footer = (VHDFooter *) s->footer_buf;
246     if (strncmp(footer->creator, "conectix", 8)) {
247         int64_t offset = bdrv_getlength(bs->file->bs);
248         if (offset < 0) {
249             ret = offset;
250             error_setg(errp, "Invalid file size");
251             goto fail;
252         } else if (offset < HEADER_SIZE) {
253             ret = -EINVAL;
254             error_setg(errp, "File too small for a VHD header");
255             goto fail;
256         }
257 
258         /* If a fixed disk, the footer is found only at the end of the file */
259         ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf,
260                          HEADER_SIZE);
261         if (ret < 0) {
262             goto fail;
263         }
264         if (strncmp(footer->creator, "conectix", 8)) {
265             error_setg(errp, "invalid VPC image");
266             ret = -EINVAL;
267             goto fail;
268         }
269         disk_type = VHD_FIXED;
270     }
271 
272     checksum = be32_to_cpu(footer->checksum);
273     footer->checksum = 0;
274     if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
275         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
276             "incorrect.\n", bs->filename);
277 
278     /* Write 'checksum' back to footer, or else will leave it with zero. */
279     footer->checksum = cpu_to_be32(checksum);
280 
281     /* The visible size of a image in Virtual PC depends on the geometry
282        rather than on the size stored in the footer (the size in the footer
283        is too large usually) */
284     bs->total_sectors = (int64_t)
285         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
286 
287     /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
288      * VHD image sizes differently.  VPC will rely on CHS geometry,
289      * while Hyper-V and disk2vhd use the size specified in the footer.
290      *
291      * We use a couple of approaches to try and determine the correct method:
292      * look at the Creator App field, and look for images that have CHS
293      * geometry that is the maximum value.
294      *
295      * If the CHS geometry is the maximum CHS geometry, then we assume that
296      * the size is the footer->current_size to avoid truncation.  Otherwise,
297      * we follow the table based on footer->creator_app:
298      *
299      *  Known creator apps:
300      *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
301      *      'qemu'  :  CHS              QEMU (uses disk geometry)
302      *      'qem2'  :  current_size     QEMU (uses current_size)
303      *      'win '  :  current_size     Hyper-V
304      *      'd2v '  :  current_size     Disk2vhd
305      *      'tap\0' :  current_size     XenServer
306      *      'CTXS'  :  current_size     XenConverter
307      *
308      *  The user can override the table values via drive options, however
309      *  even with an override we will still use current_size for images
310      *  that have CHS geometry of the maximum size.
311      */
312     use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
313                !!strncmp(footer->creator_app, "qem2", 4) &&
314                !!strncmp(footer->creator_app, "d2v ", 4) &&
315                !!strncmp(footer->creator_app, "CTXS", 4) &&
316                !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
317 
318     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
319         bs->total_sectors = be64_to_cpu(footer->current_size) /
320                                         BDRV_SECTOR_SIZE;
321     }
322 
323     /* Allow a maximum disk size of 2040 GiB */
324     if (bs->total_sectors > VHD_MAX_SECTORS) {
325         ret = -EFBIG;
326         goto fail;
327     }
328 
329     if (disk_type == VHD_DYNAMIC) {
330         ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf,
331                          HEADER_SIZE);
332         if (ret < 0) {
333             error_setg(errp, "Error reading dynamic VHD header");
334             goto fail;
335         }
336 
337         dyndisk_header = (VHDDynDiskHeader *) buf;
338 
339         if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
340             error_setg(errp, "Invalid header magic");
341             ret = -EINVAL;
342             goto fail;
343         }
344 
345         s->block_size = be32_to_cpu(dyndisk_header->block_size);
346         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
347             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
348             ret = -EINVAL;
349             goto fail;
350         }
351         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
352 
353         s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
354 
355         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
356             error_setg(errp, "Too many blocks");
357             ret = -EINVAL;
358             goto fail;
359         }
360 
361         computed_size = (uint64_t) s->max_table_entries * s->block_size;
362         if (computed_size < bs->total_sectors * 512) {
363             error_setg(errp, "Page table too small");
364             ret = -EINVAL;
365             goto fail;
366         }
367 
368         if (s->max_table_entries > SIZE_MAX / 4 ||
369             s->max_table_entries > (int) INT_MAX / 4) {
370             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
371                         s->max_table_entries);
372             ret = -EINVAL;
373             goto fail;
374         }
375 
376         pagetable_size = (uint64_t) s->max_table_entries * 4;
377 
378         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
379         if (s->pagetable == NULL) {
380             error_setg(errp, "Unable to allocate memory for page table");
381             ret = -ENOMEM;
382             goto fail;
383         }
384 
385         s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
386 
387         ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable,
388                          pagetable_size);
389         if (ret < 0) {
390             error_setg(errp, "Error reading pagetable");
391             goto fail;
392         }
393 
394         s->free_data_block_offset =
395             ROUND_UP(s->bat_offset + pagetable_size, 512);
396 
397         for (i = 0; i < s->max_table_entries; i++) {
398             be32_to_cpus(&s->pagetable[i]);
399             if (s->pagetable[i] != 0xFFFFFFFF) {
400                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
401                     s->bitmap_size + s->block_size;
402 
403                 if (next > s->free_data_block_offset) {
404                     s->free_data_block_offset = next;
405                 }
406             }
407         }
408 
409         if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) {
410             error_setg(errp, "block-vpc: free_data_block_offset points after "
411                              "the end of file. The image has been truncated.");
412             ret = -EINVAL;
413             goto fail;
414         }
415 
416         s->last_bitmap_offset = (int64_t) -1;
417 
418 #ifdef CACHE
419         s->pageentry_u8 = g_malloc(512);
420         s->pageentry_u32 = s->pageentry_u8;
421         s->pageentry_u16 = s->pageentry_u8;
422         s->last_pagetable = -1;
423 #endif
424     }
425 
426     qemu_co_mutex_init(&s->lock);
427 
428     /* Disable migration when VHD images are used */
429     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
430                "does not support live migration",
431                bdrv_get_device_or_node_name(bs));
432     migrate_add_blocker(s->migration_blocker);
433 
434     return 0;
435 
436 fail:
437     qemu_vfree(s->pagetable);
438 #ifdef CACHE
439     g_free(s->pageentry_u8);
440 #endif
441     return ret;
442 }
443 
/*
 * Reopen "prepare" hook of the vpc driver.  Nothing is checked or staged:
 * the function ignores @state, @queue and @errp and always reports
 * success by returning 0.
 */
static int vpc_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
449 
450 /*
451  * Returns the absolute byte offset of the given sector in the image file.
452  * If the sector is not allocated, -1 is returned instead.
453  *
454  * The parameter write must be 1 if the offset will be used for a write
455  * operation (the block bitmaps is updated then), 0 otherwise.
456  */
457 static inline int64_t get_sector_offset(BlockDriverState *bs,
458     int64_t sector_num, int write)
459 {
460     BDRVVPCState *s = bs->opaque;
461     uint64_t offset = sector_num * 512;
462     uint64_t bitmap_offset, block_offset;
463     uint32_t pagetable_index, pageentry_index;
464 
465     pagetable_index = offset / s->block_size;
466     pageentry_index = (offset % s->block_size) / 512;
467 
468     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
469         return -1; /* not allocated */
470 
471     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
472     block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
473 
474     /* We must ensure that we don't write to any sectors which are marked as
475        unused in the bitmap. We get away with setting all bits in the block
476        bitmap each time we write to a new block. This might cause Virtual PC to
477        miss sparse read optimization, but it's not a problem in terms of
478        correctness. */
479     if (write && (s->last_bitmap_offset != bitmap_offset)) {
480         uint8_t bitmap[s->bitmap_size];
481 
482         s->last_bitmap_offset = bitmap_offset;
483         memset(bitmap, 0xff, s->bitmap_size);
484         bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size);
485     }
486 
487     return block_offset;
488 }
489 
490 /*
491  * Writes the footer to the end of the image file. This is needed when the
492  * file grows as it overwrites the old footer
493  *
494  * Returns 0 on success and < 0 on error
495  */
496 static int rewrite_footer(BlockDriverState* bs)
497 {
498     int ret;
499     BDRVVPCState *s = bs->opaque;
500     int64_t offset = s->free_data_block_offset;
501 
502     ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE);
503     if (ret < 0)
504         return ret;
505 
506     return 0;
507 }
508 
509 /*
510  * Allocates a new block. This involves writing a new footer and updating
511  * the Block Allocation Table to use the space at the old end of the image
512  * file (overwriting the old footer)
513  *
514  * Returns the sectors' offset in the image file on success and < 0 on error
515  */
516 static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
517 {
518     BDRVVPCState *s = bs->opaque;
519     int64_t bat_offset;
520     uint32_t index, bat_value;
521     int ret;
522     uint8_t bitmap[s->bitmap_size];
523 
524     /* Check if sector_num is valid */
525     if ((sector_num < 0) || (sector_num > bs->total_sectors))
526         return -1;
527 
528     /* Write entry into in-memory BAT */
529     index = (sector_num * 512) / s->block_size;
530     if (s->pagetable[index] != 0xFFFFFFFF)
531         return -1;
532 
533     s->pagetable[index] = s->free_data_block_offset / 512;
534 
535     /* Initialize the block's bitmap */
536     memset(bitmap, 0xff, s->bitmap_size);
537     ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap,
538         s->bitmap_size);
539     if (ret < 0) {
540         return ret;
541     }
542 
543     /* Write new footer (the old one will be overwritten) */
544     s->free_data_block_offset += s->block_size + s->bitmap_size;
545     ret = rewrite_footer(bs);
546     if (ret < 0)
547         goto fail;
548 
549     /* Write BAT entry to disk */
550     bat_offset = s->bat_offset + (4 * index);
551     bat_value = cpu_to_be32(s->pagetable[index]);
552     ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4);
553     if (ret < 0)
554         goto fail;
555 
556     return get_sector_offset(bs, sector_num, 0);
557 
558 fail:
559     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
560     return -1;
561 }
562 
563 static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
564 {
565     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
566     VHDFooter *footer = (VHDFooter *) s->footer_buf;
567 
568     if (be32_to_cpu(footer->type) != VHD_FIXED) {
569         bdi->cluster_size = s->block_size;
570     }
571 
572     bdi->unallocated_blocks_are_zero = true;
573     return 0;
574 }
575 
576 static int vpc_read(BlockDriverState *bs, int64_t sector_num,
577                     uint8_t *buf, int nb_sectors)
578 {
579     BDRVVPCState *s = bs->opaque;
580     int ret;
581     int64_t offset;
582     int64_t sectors, sectors_per_block;
583     VHDFooter *footer = (VHDFooter *) s->footer_buf;
584 
585     if (be32_to_cpu(footer->type) == VHD_FIXED) {
586         return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors);
587     }
588     while (nb_sectors > 0) {
589         offset = get_sector_offset(bs, sector_num, 0);
590 
591         sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
592         sectors = sectors_per_block - (sector_num % sectors_per_block);
593         if (sectors > nb_sectors) {
594             sectors = nb_sectors;
595         }
596 
597         if (offset == -1) {
598             memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
599         } else {
600             ret = bdrv_pread(bs->file->bs, offset, buf,
601                 sectors * BDRV_SECTOR_SIZE);
602             if (ret != sectors * BDRV_SECTOR_SIZE) {
603                 return -1;
604             }
605         }
606 
607         nb_sectors -= sectors;
608         sector_num += sectors;
609         buf += sectors * BDRV_SECTOR_SIZE;
610     }
611     return 0;
612 }
613 
614 static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num,
615                                     uint8_t *buf, int nb_sectors)
616 {
617     int ret;
618     BDRVVPCState *s = bs->opaque;
619     qemu_co_mutex_lock(&s->lock);
620     ret = vpc_read(bs, sector_num, buf, nb_sectors);
621     qemu_co_mutex_unlock(&s->lock);
622     return ret;
623 }
624 
625 static int vpc_write(BlockDriverState *bs, int64_t sector_num,
626     const uint8_t *buf, int nb_sectors)
627 {
628     BDRVVPCState *s = bs->opaque;
629     int64_t offset;
630     int64_t sectors, sectors_per_block;
631     int ret;
632     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
633 
634     if (be32_to_cpu(footer->type) == VHD_FIXED) {
635         return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors);
636     }
637     while (nb_sectors > 0) {
638         offset = get_sector_offset(bs, sector_num, 1);
639 
640         sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
641         sectors = sectors_per_block - (sector_num % sectors_per_block);
642         if (sectors > nb_sectors) {
643             sectors = nb_sectors;
644         }
645 
646         if (offset == -1) {
647             offset = alloc_block(bs, sector_num);
648             if (offset < 0)
649                 return -1;
650         }
651 
652         ret = bdrv_pwrite(bs->file->bs, offset, buf,
653                           sectors * BDRV_SECTOR_SIZE);
654         if (ret != sectors * BDRV_SECTOR_SIZE) {
655             return -1;
656         }
657 
658         nb_sectors -= sectors;
659         sector_num += sectors;
660         buf += sectors * BDRV_SECTOR_SIZE;
661     }
662 
663     return 0;
664 }
665 
666 static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
667                                      const uint8_t *buf, int nb_sectors)
668 {
669     int ret;
670     BDRVVPCState *s = bs->opaque;
671     qemu_co_mutex_lock(&s->lock);
672     ret = vpc_write(bs, sector_num, buf, nb_sectors);
673     qemu_co_mutex_unlock(&s->lock);
674     return ret;
675 }
676 
677 static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
678         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
679 {
680     BDRVVPCState *s = bs->opaque;
681     VHDFooter *footer = (VHDFooter*) s->footer_buf;
682     int64_t start, offset;
683     bool allocated;
684     int n;
685 
686     if (be32_to_cpu(footer->type) == VHD_FIXED) {
687         *pnum = nb_sectors;
688         *file = bs->file->bs;
689         return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
690                (sector_num << BDRV_SECTOR_BITS);
691     }
692 
693     offset = get_sector_offset(bs, sector_num, 0);
694     start = offset;
695     allocated = (offset != -1);
696     *pnum = 0;
697 
698     do {
699         /* All sectors in a block are contiguous (without using the bitmap) */
700         n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
701           - sector_num;
702         n = MIN(n, nb_sectors);
703 
704         *pnum += n;
705         sector_num += n;
706         nb_sectors -= n;
707         /* *pnum can't be greater than one block for allocated
708          * sectors since there is always a bitmap in between. */
709         if (allocated) {
710             *file = bs->file->bs;
711             return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
712         }
713         if (nb_sectors == 0) {
714             break;
715         }
716         offset = get_sector_offset(bs, sector_num, 0);
717     } while (offset == -1);
718 
719     return 0;
720 }
721 
722 /*
723  * Calculates the number of cylinders, heads and sectors per cylinder
724  * based on a given number of sectors. This is the algorithm described
725  * in the VHD specification.
726  *
727  * Note that the geometry doesn't always exactly match total_sectors but
728  * may round it down.
729  *
730  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
731  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
732  * and instead allow up to 255 heads.
733  */
734 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
735     uint8_t* heads, uint8_t* secs_per_cyl)
736 {
737     uint32_t cyls_times_heads;
738 
739     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
740 
741     if (total_sectors >= 65535LL * 16 * 63) {
742         *secs_per_cyl = 255;
743         *heads = 16;
744         cyls_times_heads = total_sectors / *secs_per_cyl;
745     } else {
746         *secs_per_cyl = 17;
747         cyls_times_heads = total_sectors / *secs_per_cyl;
748         *heads = (cyls_times_heads + 1023) / 1024;
749 
750         if (*heads < 4) {
751             *heads = 4;
752         }
753 
754         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
755             *secs_per_cyl = 31;
756             *heads = 16;
757             cyls_times_heads = total_sectors / *secs_per_cyl;
758         }
759 
760         if (cyls_times_heads >= (*heads * 1024)) {
761             *secs_per_cyl = 63;
762             *heads = 16;
763             cyls_times_heads = total_sectors / *secs_per_cyl;
764         }
765     }
766 
767     *cyls = cyls_times_heads / *heads;
768 
769     return 0;
770 }
771 
772 static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
773                                int64_t total_sectors)
774 {
775     VHDDynDiskHeader *dyndisk_header =
776         (VHDDynDiskHeader *) buf;
777     size_t block_size, num_bat_entries;
778     int i;
779     int ret;
780     int64_t offset = 0;
781 
782     /* Write the footer (twice: at the beginning and at the end) */
783     block_size = 0x200000;
784     num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
785 
786     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
787     if (ret < 0) {
788         goto fail;
789     }
790 
791     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
792     ret = blk_pwrite(blk, offset, buf, HEADER_SIZE);
793     if (ret < 0) {
794         goto fail;
795     }
796 
797     /* Write the initial BAT */
798     offset = 3 * 512;
799 
800     memset(buf, 0xFF, 512);
801     for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) {
802         ret = blk_pwrite(blk, offset, buf, 512);
803         if (ret < 0) {
804             goto fail;
805         }
806         offset += 512;
807     }
808 
809     /* Prepare the Dynamic Disk Header */
810     memset(buf, 0, 1024);
811 
812     memcpy(dyndisk_header->magic, "cxsparse", 8);
813 
814     /*
815      * Note: The spec is actually wrong here for data_offset, it says
816      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
817      */
818     dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
819     dyndisk_header->table_offset = cpu_to_be64(3 * 512);
820     dyndisk_header->version = cpu_to_be32(0x00010000);
821     dyndisk_header->block_size = cpu_to_be32(block_size);
822     dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
823 
824     dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
825 
826     /* Write the header */
827     offset = 512;
828 
829     ret = blk_pwrite(blk, offset, buf, 1024);
830     if (ret < 0) {
831         goto fail;
832     }
833 
834  fail:
835     return ret;
836 }
837 
838 static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
839                              int64_t total_size)
840 {
841     int ret;
842 
843     /* Add footer to total size */
844     total_size += HEADER_SIZE;
845 
846     ret = blk_truncate(blk, total_size);
847     if (ret < 0) {
848         return ret;
849     }
850 
851     ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE);
852     if (ret < 0) {
853         return ret;
854     }
855 
856     return ret;
857 }
858 
859 static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
860 {
861     uint8_t buf[1024];
862     VHDFooter *footer = (VHDFooter *) buf;
863     char *disk_type_param;
864     int i;
865     uint16_t cyls = 0;
866     uint8_t heads = 0;
867     uint8_t secs_per_cyl = 0;
868     int64_t total_sectors;
869     int64_t total_size;
870     int disk_type;
871     int ret = -EIO;
872     bool force_size;
873     Error *local_err = NULL;
874     BlockBackend *blk = NULL;
875 
876     /* Read out options */
877     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
878                           BDRV_SECTOR_SIZE);
879     disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
880     if (disk_type_param) {
881         if (!strcmp(disk_type_param, "dynamic")) {
882             disk_type = VHD_DYNAMIC;
883         } else if (!strcmp(disk_type_param, "fixed")) {
884             disk_type = VHD_FIXED;
885         } else {
886             error_setg(errp, "Invalid disk type, %s", disk_type_param);
887             ret = -EINVAL;
888             goto out;
889         }
890     } else {
891         disk_type = VHD_DYNAMIC;
892     }
893 
894     force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false);
895 
896     ret = bdrv_create_file(filename, opts, &local_err);
897     if (ret < 0) {
898         error_propagate(errp, local_err);
899         goto out;
900     }
901 
902     blk = blk_new_open(filename, NULL, NULL,
903                        BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
904     if (blk == NULL) {
905         error_propagate(errp, local_err);
906         ret = -EIO;
907         goto out;
908     }
909 
910     blk_set_allow_write_beyond_eof(blk, true);
911 
912     /*
913      * Calculate matching total_size and geometry. Increase the number of
914      * sectors requested until we get enough (or fail). This ensures that
915      * qemu-img convert doesn't truncate images, but rather rounds up.
916      *
917      * If the image size can't be represented by a spec conformant CHS geometry,
918      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
919      * the image size from the VHD footer to calculate total_sectors.
920      */
921     if (force_size) {
922         /* This will force the use of total_size for sector count, below */
923         cyls         = VHD_CHS_MAX_C;
924         heads        = VHD_CHS_MAX_H;
925         secs_per_cyl = VHD_CHS_MAX_S;
926     } else {
927         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
928         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
929             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
930         }
931     }
932 
933     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
934         total_sectors = total_size / BDRV_SECTOR_SIZE;
935         /* Allow a maximum disk size of 2040 GiB */
936         if (total_sectors > VHD_MAX_SECTORS) {
937             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
938             ret = -EFBIG;
939             goto out;
940         }
941     } else {
942         total_sectors = (int64_t)cyls * heads * secs_per_cyl;
943         total_size = total_sectors * BDRV_SECTOR_SIZE;
944     }
945 
946     /* Prepare the Hard Disk Footer */
947     memset(buf, 0, 1024);
948 
949     memcpy(footer->creator, "conectix", 8);
950     if (force_size) {
951         memcpy(footer->creator_app, "qem2", 4);
952     } else {
953         memcpy(footer->creator_app, "qemu", 4);
954     }
955     memcpy(footer->creator_os, "Wi2k", 4);
956 
957     footer->features = cpu_to_be32(0x02);
958     footer->version = cpu_to_be32(0x00010000);
959     if (disk_type == VHD_DYNAMIC) {
960         footer->data_offset = cpu_to_be64(HEADER_SIZE);
961     } else {
962         footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
963     }
964     footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
965 
966     /* Version of Virtual PC 2007 */
967     footer->major = cpu_to_be16(0x0005);
968     footer->minor = cpu_to_be16(0x0003);
969     footer->orig_size = cpu_to_be64(total_size);
970     footer->current_size = cpu_to_be64(total_size);
971     footer->cyls = cpu_to_be16(cyls);
972     footer->heads = heads;
973     footer->secs_per_cyl = secs_per_cyl;
974 
975     footer->type = cpu_to_be32(disk_type);
976 
977 #if defined(CONFIG_UUID)
978     uuid_generate(footer->uuid);
979 #endif
980 
981     footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
982 
983     if (disk_type == VHD_DYNAMIC) {
984         ret = create_dynamic_disk(blk, buf, total_sectors);
985     } else {
986         ret = create_fixed_disk(blk, buf, total_size);
987     }
988     if (ret < 0) {
989         error_setg(errp, "Unable to create or write VHD header");
990     }
991 
992 out:
993     blk_unref(blk);
994     g_free(disk_type_param);
995     return ret;
996 }
997 
998 static int vpc_has_zero_init(BlockDriverState *bs)
999 {
1000     BDRVVPCState *s = bs->opaque;
1001     VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1002 
1003     if (be32_to_cpu(footer->type) == VHD_FIXED) {
1004         return bdrv_has_zero_init(bs->file->bs);
1005     } else {
1006         return 1;
1007     }
1008 }
1009 
1010 static void vpc_close(BlockDriverState *bs)
1011 {
1012     BDRVVPCState *s = bs->opaque;
1013     qemu_vfree(s->pagetable);
1014 #ifdef CACHE
1015     g_free(s->pageentry_u8);
1016 #endif
1017 
1018     migrate_del_blocker(s->migration_blocker);
1019     error_free(s->migration_blocker);
1020 }
1021 
1022 static QemuOptsList vpc_create_opts = {
1023     .name = "vpc-create-opts",
1024     .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1025     .desc = {
1026         {
1027             .name = BLOCK_OPT_SIZE,
1028             .type = QEMU_OPT_SIZE,
1029             .help = "Virtual disk size"
1030         },
1031         {
1032             .name = BLOCK_OPT_SUBFMT,
1033             .type = QEMU_OPT_STRING,
1034             .help =
1035                 "Type of virtual hard disk format. Supported formats are "
1036                 "{dynamic (default) | fixed} "
1037         },
1038         {
1039             .name = VPC_OPT_FORCE_SIZE,
1040             .type = QEMU_OPT_BOOL,
1041             .help = "Force disk size calculation to use the actual size "
1042                     "specified, rather than using the nearest CHS-based "
1043                     "calculation"
1044         },
1045         { /* end of list */ }
1046     }
1047 };
1048 
1049 static BlockDriver bdrv_vpc = {
1050     .format_name    = "vpc",
1051     .instance_size  = sizeof(BDRVVPCState),
1052 
1053     .bdrv_probe             = vpc_probe,
1054     .bdrv_open              = vpc_open,
1055     .bdrv_close             = vpc_close,
1056     .bdrv_reopen_prepare    = vpc_reopen_prepare,
1057     .bdrv_create            = vpc_create,
1058 
1059     .bdrv_read                  = vpc_co_read,
1060     .bdrv_write                 = vpc_co_write,
1061     .bdrv_co_get_block_status   = vpc_co_get_block_status,
1062 
1063     .bdrv_get_info          = vpc_get_info,
1064 
1065     .create_opts            = &vpc_create_opts,
1066     .bdrv_has_zero_init     = vpc_has_zero_init,
1067 };
1068 
1069 static void bdrv_vpc_init(void)
1070 {
1071     bdrv_register(&bdrv_vpc);
1072 }
1073 
1074 block_init(bdrv_vpc_init);
1075