xref: /openbmc/qemu/block/vpc.c (revision d76aa73fad1f64c192856e1420ad0756f5e3b778)
1  /*
2   * Block driver for Connectix / Microsoft Virtual PC images
3   *
4   * Copyright (c) 2005 Alex Beregszaszi
5   * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6   *
7   * Permission is hereby granted, free of charge, to any person obtaining a copy
8   * of this software and associated documentation files (the "Software"), to deal
9   * in the Software without restriction, including without limitation the rights
10   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11   * copies of the Software, and to permit persons to whom the Software is
12   * furnished to do so, subject to the following conditions:
13   *
14   * The above copyright notice and this permission notice shall be included in
15   * all copies or substantial portions of the Software.
16   *
17   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23   * THE SOFTWARE.
24   */
25  
26  #include "qemu/osdep.h"
27  #include "qapi/error.h"
28  #include "block/block_int.h"
29  #include "block/qdict.h"
30  #include "sysemu/block-backend.h"
31  #include "qemu/module.h"
32  #include "qemu/option.h"
33  #include "migration/blocker.h"
34  #include "qemu/bswap.h"
35  #include "qemu/uuid.h"
36  #include "qemu/memalign.h"
37  #include "qapi/qmp/qdict.h"
38  #include "qapi/qobject-input-visitor.h"
39  #include "qapi/qapi-visit-block-core.h"
40  
41  /**************************************************************/
42  
43  //#define CACHE
44  
/*
 * Disk type values as stored (big-endian) in the "type" field of the
 * VHD footer.
 */
enum vhd_type {
    VHD_FIXED           = 2,
    VHD_DYNAMIC         = 3,
    VHD_DIFFERENCING    = 4,
};
50  
/*
 * Unix time (seconds since Jan 1, 1970 UTC) of the VHD timestamp epoch,
 * Jan 1, 2000 0:00:00 (UTC).
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder, head and sector counts used for the footer geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Number of sectors addressable with the maximum CHS geometry */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* Name of the "force_size" option string */
#define VPC_OPT_FORCE_SIZE "force_size"
62  
/*
 * On-disk VHD footer (exactly 512 bytes, see the build-time check below).
 * All multi-byte fields are always big-endian and must be converted with
 * the beXX_to_cpu()/cpu_to_beXX() helpers before use.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    /* Image sizes in bytes; current_size is divided by the sector size
       when it is used as the virtual disk size */
    uint64_t    orig_size;
    uint64_t    current_size;

    /* Disk geometry; cyls * heads * secs_per_cyl gives the sector count
       used for CHS-sized images */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
    uint8_t     reserved[427];
} QEMU_PACKED VHDFooter;

/* The footer occupies exactly one 512-byte sector in the image file */
QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
101  
/*
 * On-disk header of a dynamic VHD image (exactly 1024 bytes, see the
 * build-time check below). It is read from the offset given by the
 * footer's data_offset field and, like the footer, is stored big-endian.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
    uint8_t     reserved2[256];
} QEMU_PACKED VHDDynDiskHeader;

/* The dynamic disk header occupies exactly two 512-byte sectors */
QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
136  
/* Per-image driver state, stored in BlockDriverState.opaque */
typedef struct BDRVVPCState {
    /* Taken by the read, write and block-status paths around BAT lookups */
    CoMutex lock;
    /* Footer as read at open time (still big-endian); rewritten on growth */
    VHDFooter footer;
    /* Byte offset in the image file where the next data block is placed;
       the footer copy at the end of the file is written here */
    uint64_t free_data_block_offset;
    /* Number of entries in pagetable, taken from the dynamic disk header */
    int max_table_entries;
    /* In-memory BAT in host byte order; each entry is the block's position
       in 512-byte units, 0xFFFFFFFF marks an unallocated block */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Offset of the block bitmap that was most recently filled with 0xff */
    uint64_t last_bitmap_offset;

    /* Size in bytes of one data block (power of two) */
    uint32_t block_size;
    /* Size in bytes of the per-block bitmap preceding each data block */
    uint32_t bitmap_size;
    /* Size calculation overrides selected by the "force_size_calc" option */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Blocker registered at open time to disable live migration */
    Error *migration_blocker;
} BDRVVPCState;
161  
/* Name of the runtime option that selects how the disk size is derived */
#define VPC_OPT_SIZE_CALC "force_size_calc"

/*
 * Runtime options accepted when opening an image. vpc_open() absorbs them
 * from the options dictionary and vpc_parse_options() interprets them.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};

/* Forward declaration; the definition is not part of this section */
static QemuOptsList vpc_create_opts;
179  
/*
 * Computes the VHD checksum of a buffer: the one's complement of the
 * 32-bit sum of all bytes in it.
 *
 * @p:    start of the buffer to sum (not modified)
 * @size: number of bytes to include
 *
 * Returns the checksum in host byte order. For a footer or header the
 * caller is expected to zero the embedded checksum field beforehand.
 */
static uint32_t vpc_checksum(const void *p, size_t size)
{
    const uint8_t *buf = p;
    uint32_t res = 0;
    size_t i;   /* same type as @size: no signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
191  
192  
/*
 * Format probe: checks whether the buffer starts with the 8-byte VHD
 * creator signature "conectix".
 *
 * @buf:      first bytes of the image
 * @buf_size: number of valid bytes in @buf
 * @filename: unused
 *
 * Returns 100 when the signature is present, 0 otherwise (including when
 * fewer than 8 bytes are available).
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char signature[8] = {
        'c', 'o', 'n', 'e', 'c', 't', 'i', 'x'
    };

    if (buf_size < (int)sizeof(signature)) {
        return 0;
    }

    /* Raw byte comparison: the probe buffer is binary data, not a string */
    if (memcmp(buf, signature, sizeof(signature)) == 0) {
        return 100;
    }
    return 0;
}
199  
200  static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
201                                Error **errp)
202  {
203      BDRVVPCState *s = bs->opaque;
204      const char *size_calc;
205  
206      size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
207  
208      if (!size_calc) {
209         /* no override, use autodetect only */
210      } else if (!strcmp(size_calc, "current_size")) {
211          s->force_use_sz = true;
212      } else if (!strcmp(size_calc, "chs")) {
213          s->force_use_chs = true;
214      } else {
215          error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
216      }
217  }
218  
219  static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
220                      Error **errp)
221  {
222      BDRVVPCState *s = bs->opaque;
223      int i;
224      VHDFooter *footer;
225      QemuOpts *opts = NULL;
226      Error *local_err = NULL;
227      bool use_chs;
228      VHDDynDiskHeader dyndisk_header;
229      uint32_t checksum;
230      uint64_t computed_size;
231      uint64_t pagetable_size;
232      int disk_type = VHD_DYNAMIC;
233      int ret;
234      int64_t bs_size;
235  
236      ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
237      if (ret < 0) {
238          return ret;
239      }
240  
241      opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
242      if (!qemu_opts_absorb_qdict(opts, options, errp)) {
243          ret = -EINVAL;
244          goto fail;
245      }
246  
247      vpc_parse_options(bs, opts, &local_err);
248      if (local_err) {
249          error_propagate(errp, local_err);
250          ret = -EINVAL;
251          goto fail;
252      }
253  
254      ret = bdrv_pread(bs->file, 0, sizeof(s->footer), &s->footer, 0);
255      if (ret < 0) {
256          error_setg(errp, "Unable to read VHD header");
257          goto fail;
258      }
259  
260      footer = &s->footer;
261      if (strncmp(footer->creator, "conectix", 8)) {
262          int64_t offset = bdrv_getlength(bs->file->bs);
263          if (offset < 0) {
264              ret = offset;
265              error_setg(errp, "Invalid file size");
266              goto fail;
267          } else if (offset < sizeof(*footer)) {
268              ret = -EINVAL;
269              error_setg(errp, "File too small for a VHD header");
270              goto fail;
271          }
272  
273          /* If a fixed disk, the footer is found only at the end of the file */
274          ret = bdrv_pread(bs->file, offset - sizeof(*footer), sizeof(*footer),
275                           footer, 0);
276          if (ret < 0) {
277              goto fail;
278          }
279          if (strncmp(footer->creator, "conectix", 8) ||
280              be32_to_cpu(footer->type) != VHD_FIXED) {
281              error_setg(errp, "invalid VPC image");
282              ret = -EINVAL;
283              goto fail;
284          }
285          disk_type = VHD_FIXED;
286      }
287  
288      checksum = be32_to_cpu(footer->checksum);
289      footer->checksum = 0;
290      if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
291          error_setg(errp, "Incorrect header checksum");
292          ret = -EINVAL;
293          goto fail;
294      }
295  
296      /* Write 'checksum' back to footer, or else will leave it with zero. */
297      footer->checksum = cpu_to_be32(checksum);
298  
299      /* The visible size of a image in Virtual PC depends on the geometry
300         rather than on the size stored in the footer (the size in the footer
301         is too large usually) */
302      bs->total_sectors = (int64_t)
303          be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
304  
305      /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
306       * VHD image sizes differently.  VPC will rely on CHS geometry,
307       * while Hyper-V and disk2vhd use the size specified in the footer.
308       *
309       * We use a couple of approaches to try and determine the correct method:
310       * look at the Creator App field, and look for images that have CHS
311       * geometry that is the maximum value.
312       *
313       * If the CHS geometry is the maximum CHS geometry, then we assume that
314       * the size is the footer->current_size to avoid truncation.  Otherwise,
315       * we follow the table based on footer->creator_app:
316       *
317       *  Known creator apps:
318       *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
319       *      'qemu'  :  CHS              QEMU (uses disk geometry)
320       *      'qem2'  :  current_size     QEMU (uses current_size)
321       *      'win '  :  current_size     Hyper-V
322       *      'd2v '  :  current_size     Disk2vhd
323       *      'tap\0' :  current_size     XenServer
324       *      'CTXS'  :  current_size     XenConverter
325       *
326       *  The user can override the table values via drive options, however
327       *  even with an override we will still use current_size for images
328       *  that have CHS geometry of the maximum size.
329       */
330      use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
331                 !!strncmp(footer->creator_app, "qem2", 4) &&
332                 !!strncmp(footer->creator_app, "d2v ", 4) &&
333                 !!strncmp(footer->creator_app, "CTXS", 4) &&
334                 !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
335  
336      if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
337          bs->total_sectors = be64_to_cpu(footer->current_size) /
338                                          BDRV_SECTOR_SIZE;
339      }
340  
341      /* Allow a maximum disk size of 2040 GiB */
342      if (bs->total_sectors > VHD_MAX_SECTORS) {
343          ret = -EFBIG;
344          goto fail;
345      }
346  
347      if (disk_type == VHD_DYNAMIC) {
348          ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
349                           sizeof(dyndisk_header), &dyndisk_header, 0);
350          if (ret < 0) {
351              error_setg(errp, "Error reading dynamic VHD header");
352              goto fail;
353          }
354  
355          if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
356              error_setg(errp, "Invalid header magic");
357              ret = -EINVAL;
358              goto fail;
359          }
360  
361          s->block_size = be32_to_cpu(dyndisk_header.block_size);
362          if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
363              error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
364              ret = -EINVAL;
365              goto fail;
366          }
367          s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
368  
369          s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
370  
371          if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
372              error_setg(errp, "Too many blocks");
373              ret = -EINVAL;
374              goto fail;
375          }
376  
377          computed_size = (uint64_t) s->max_table_entries * s->block_size;
378          if (computed_size < bs->total_sectors * 512) {
379              error_setg(errp, "Page table too small");
380              ret = -EINVAL;
381              goto fail;
382          }
383  
384          if (s->max_table_entries > SIZE_MAX / 4 ||
385              s->max_table_entries > (int) INT_MAX / 4) {
386              error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
387                          s->max_table_entries);
388              ret = -EINVAL;
389              goto fail;
390          }
391  
392          pagetable_size = (uint64_t) s->max_table_entries * 4;
393  
394          s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
395          if (s->pagetable == NULL) {
396              error_setg(errp, "Unable to allocate memory for page table");
397              ret = -ENOMEM;
398              goto fail;
399          }
400  
401          s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
402  
403          ret = bdrv_pread(bs->file, s->bat_offset, pagetable_size,
404                           s->pagetable, 0);
405          if (ret < 0) {
406              error_setg(errp, "Error reading pagetable");
407              goto fail;
408          }
409  
410          s->free_data_block_offset =
411              ROUND_UP(s->bat_offset + pagetable_size, 512);
412  
413          for (i = 0; i < s->max_table_entries; i++) {
414              be32_to_cpus(&s->pagetable[i]);
415              if (s->pagetable[i] != 0xFFFFFFFF) {
416                  int64_t next = (512 * (int64_t) s->pagetable[i]) +
417                      s->bitmap_size + s->block_size;
418  
419                  if (next > s->free_data_block_offset) {
420                      s->free_data_block_offset = next;
421                  }
422              }
423          }
424  
425          bs_size = bdrv_getlength(bs->file->bs);
426          if (bs_size < 0) {
427              error_setg_errno(errp, -bs_size, "Unable to learn image size");
428              ret = bs_size;
429              goto fail;
430          }
431          if (s->free_data_block_offset > bs_size) {
432              error_setg(errp, "block-vpc: free_data_block_offset points after "
433                               "the end of file. The image has been truncated.");
434              ret = -EINVAL;
435              goto fail;
436          }
437  
438          s->last_bitmap_offset = (int64_t) -1;
439  
440  #ifdef CACHE
441          s->pageentry_u8 = g_malloc(512);
442          s->pageentry_u32 = s->pageentry_u8;
443          s->pageentry_u16 = s->pageentry_u8;
444          s->last_pagetable = -1;
445  #endif
446      }
447  
448      /* Disable migration when VHD images are used */
449      error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
450                 "does not support live migration",
451                 bdrv_get_device_or_node_name(bs));
452      ret = migrate_add_blocker(s->migration_blocker, errp);
453      if (ret < 0) {
454          error_free(s->migration_blocker);
455          goto fail;
456      }
457  
458      qemu_co_mutex_init(&s->lock);
459      qemu_opts_del(opts);
460  
461      return 0;
462  
463  fail:
464      qemu_opts_del(opts);
465      qemu_vfree(s->pagetable);
466  #ifdef CACHE
467      g_free(s->pageentry_u8);
468  #endif
469      return ret;
470  }
471  
472  static int vpc_reopen_prepare(BDRVReopenState *state,
473                                BlockReopenQueue *queue, Error **errp)
474  {
475      return 0;
476  }
477  
478  /*
479   * Returns the absolute byte offset of the given sector in the image file.
480   * If the sector is not allocated, -1 is returned instead.
481   * If an error occurred trying to write an updated block bitmap back to
482   * the file, -2 is returned, and the error value is written to *err.
483   * This can only happen for a write operation.
484   *
485   * The parameter write must be 1 if the offset will be used for a write
486   * operation (the block bitmaps is updated then), 0 otherwise.
487   * If write is true then err must not be NULL.
488   */
489  static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
490                                         bool write, int *err)
491  {
492      BDRVVPCState *s = bs->opaque;
493      uint64_t bitmap_offset, block_offset;
494      uint32_t pagetable_index, offset_in_block;
495  
496      assert(!(write && err == NULL));
497  
498      pagetable_index = offset / s->block_size;
499      offset_in_block = offset % s->block_size;
500  
501      if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
502          return -1; /* not allocated */
503  
504      bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
505      block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
506  
507      /* We must ensure that we don't write to any sectors which are marked as
508         unused in the bitmap. We get away with setting all bits in the block
509         bitmap each time we write to a new block. This might cause Virtual PC to
510         miss sparse read optimization, but it's not a problem in terms of
511         correctness. */
512      if (write && (s->last_bitmap_offset != bitmap_offset)) {
513          uint8_t bitmap[s->bitmap_size];
514          int r;
515  
516          s->last_bitmap_offset = bitmap_offset;
517          memset(bitmap, 0xff, s->bitmap_size);
518          r = bdrv_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap,
519                               0);
520          if (r < 0) {
521              *err = r;
522              return -2;
523          }
524      }
525  
526      return block_offset;
527  }
528  
529  /*
530   * Writes the footer to the end of the image file. This is needed when the
531   * file grows as it overwrites the old footer
532   *
533   * Returns 0 on success and < 0 on error
534   */
535  static int rewrite_footer(BlockDriverState *bs)
536  {
537      int ret;
538      BDRVVPCState *s = bs->opaque;
539      int64_t offset = s->free_data_block_offset;
540  
541      ret = bdrv_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
542      if (ret < 0)
543          return ret;
544  
545      return 0;
546  }
547  
548  /*
549   * Allocates a new block. This involves writing a new footer and updating
550   * the Block Allocation Table to use the space at the old end of the image
551   * file (overwriting the old footer)
552   *
553   * Returns the sectors' offset in the image file on success and < 0 on error
554   */
555  static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
556  {
557      BDRVVPCState *s = bs->opaque;
558      int64_t bat_offset;
559      uint32_t index, bat_value;
560      int ret;
561      uint8_t bitmap[s->bitmap_size];
562  
563      /* Check if sector_num is valid */
564      if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
565          return -EINVAL;
566      }
567  
568      /* Write entry into in-memory BAT */
569      index = offset / s->block_size;
570      assert(s->pagetable[index] == 0xFFFFFFFF);
571      s->pagetable[index] = s->free_data_block_offset / 512;
572  
573      /* Initialize the block's bitmap */
574      memset(bitmap, 0xff, s->bitmap_size);
575      ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset,
576                             s->bitmap_size, bitmap, 0);
577      if (ret < 0) {
578          return ret;
579      }
580  
581      /* Write new footer (the old one will be overwritten) */
582      s->free_data_block_offset += s->block_size + s->bitmap_size;
583      ret = rewrite_footer(bs);
584      if (ret < 0)
585          goto fail;
586  
587      /* Write BAT entry to disk */
588      bat_offset = s->bat_offset + (4 * index);
589      bat_value = cpu_to_be32(s->pagetable[index]);
590      ret = bdrv_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
591      if (ret < 0)
592          goto fail;
593  
594      return get_image_offset(bs, offset, false, NULL);
595  
596  fail:
597      s->free_data_block_offset -= (s->block_size + s->bitmap_size);
598      return ret;
599  }
600  
601  static int coroutine_fn
602  vpc_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
603  {
604      BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
605  
606      if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
607          bdi->cluster_size = s->block_size;
608      }
609  
610      return 0;
611  }
612  
613  static int coroutine_fn
614  vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
615                QEMUIOVector *qiov, BdrvRequestFlags flags)
616  {
617      BDRVVPCState *s = bs->opaque;
618      int ret;
619      int64_t image_offset;
620      int64_t n_bytes;
621      int64_t bytes_done = 0;
622      QEMUIOVector local_qiov;
623  
624      if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
625          return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
626      }
627  
628      qemu_co_mutex_lock(&s->lock);
629      qemu_iovec_init(&local_qiov, qiov->niov);
630  
631      while (bytes > 0) {
632          image_offset = get_image_offset(bs, offset, false, NULL);
633          n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
634  
635          if (image_offset == -1) {
636              qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
637          } else {
638              qemu_iovec_reset(&local_qiov);
639              qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
640  
641              qemu_co_mutex_unlock(&s->lock);
642              ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
643                                   &local_qiov, 0);
644              qemu_co_mutex_lock(&s->lock);
645              if (ret < 0) {
646                  goto fail;
647              }
648          }
649  
650          bytes -= n_bytes;
651          offset += n_bytes;
652          bytes_done += n_bytes;
653      }
654  
655      ret = 0;
656  fail:
657      qemu_iovec_destroy(&local_qiov);
658      qemu_co_mutex_unlock(&s->lock);
659  
660      return ret;
661  }
662  
663  static int coroutine_fn
664  vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
665                 QEMUIOVector *qiov, BdrvRequestFlags flags)
666  {
667      BDRVVPCState *s = bs->opaque;
668      int64_t image_offset;
669      int64_t n_bytes;
670      int64_t bytes_done = 0;
671      int ret = 0;
672      QEMUIOVector local_qiov;
673  
674      if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
675          return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
676      }
677  
678      qemu_co_mutex_lock(&s->lock);
679      qemu_iovec_init(&local_qiov, qiov->niov);
680  
681      while (bytes > 0) {
682          image_offset = get_image_offset(bs, offset, true, &ret);
683          if (image_offset == -2) {
684              /* Failed to write block bitmap: can't proceed with write */
685              goto fail;
686          }
687          n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
688  
689          if (image_offset == -1) {
690              image_offset = alloc_block(bs, offset);
691              if (image_offset < 0) {
692                  ret = image_offset;
693                  goto fail;
694              }
695          }
696  
697          qemu_iovec_reset(&local_qiov);
698          qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
699  
700          qemu_co_mutex_unlock(&s->lock);
701          ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
702                                &local_qiov, 0);
703          qemu_co_mutex_lock(&s->lock);
704          if (ret < 0) {
705              goto fail;
706          }
707  
708          bytes -= n_bytes;
709          offset += n_bytes;
710          bytes_done += n_bytes;
711      }
712  
713      ret = 0;
714  fail:
715      qemu_iovec_destroy(&local_qiov);
716      qemu_co_mutex_unlock(&s->lock);
717  
718      return ret;
719  }
720  
721  static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
722                                              bool want_zero,
723                                              int64_t offset, int64_t bytes,
724                                              int64_t *pnum, int64_t *map,
725                                              BlockDriverState **file)
726  {
727      BDRVVPCState *s = bs->opaque;
728      int64_t image_offset;
729      bool allocated;
730      int ret;
731      int64_t n;
732  
733      if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
734          *pnum = bytes;
735          *map = offset;
736          *file = bs->file->bs;
737          return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
738      }
739  
740      qemu_co_mutex_lock(&s->lock);
741  
742      image_offset = get_image_offset(bs, offset, false, NULL);
743      allocated = (image_offset != -1);
744      *pnum = 0;
745      ret = BDRV_BLOCK_ZERO;
746  
747      do {
748          /* All sectors in a block are contiguous (without using the bitmap) */
749          n = ROUND_UP(offset + 1, s->block_size) - offset;
750          n = MIN(n, bytes);
751  
752          *pnum += n;
753          offset += n;
754          bytes -= n;
755          /* *pnum can't be greater than one block for allocated
756           * sectors since there is always a bitmap in between. */
757          if (allocated) {
758              *file = bs->file->bs;
759              *map = image_offset;
760              ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
761              break;
762          }
763          if (bytes == 0) {
764              break;
765          }
766          image_offset = get_image_offset(bs, offset, false, NULL);
767      } while (image_offset == -1);
768  
769      qemu_co_mutex_unlock(&s->lock);
770      return ret;
771  }
772  
773  /*
774   * Calculates the number of cylinders, heads and sectors per cylinder
775   * based on a given number of sectors. This is the algorithm described
776   * in the VHD specification.
777   *
778   * Note that the geometry doesn't always exactly match total_sectors but
779   * may round it down.
780   *
781   * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
782   * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
783   * and instead allow up to 255 heads.
784   */
785  static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
786      uint8_t *heads, uint8_t *secs_per_cyl)
787  {
788      uint32_t cyls_times_heads;
789  
790      total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
791  
792      if (total_sectors >= 65535LL * 16 * 63) {
793          *secs_per_cyl = 255;
794          *heads = 16;
795          cyls_times_heads = total_sectors / *secs_per_cyl;
796      } else {
797          *secs_per_cyl = 17;
798          cyls_times_heads = total_sectors / *secs_per_cyl;
799          *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
800  
801          if (*heads < 4) {
802              *heads = 4;
803          }
804  
805          if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
806              *secs_per_cyl = 31;
807              *heads = 16;
808              cyls_times_heads = total_sectors / *secs_per_cyl;
809          }
810  
811          if (cyls_times_heads >= (*heads * 1024)) {
812              *secs_per_cyl = 63;
813              *heads = 16;
814              cyls_times_heads = total_sectors / *secs_per_cyl;
815          }
816      }
817  
818      *cyls = cyls_times_heads / *heads;
819  
820      return 0;
821  }
822  
823  static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
824                                 int64_t total_sectors)
825  {
826      VHDDynDiskHeader dyndisk_header;
827      uint8_t bat_sector[512];
828      size_t block_size, num_bat_entries;
829      int i;
830      int ret;
831      int64_t offset = 0;
832  
833      /* Write the footer (twice: at the beginning and at the end) */
834      block_size = 0x200000;
835      num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
836  
837      ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
838      if (ret < 0) {
839          goto fail;
840      }
841  
842      offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
843      ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0);
844      if (ret < 0) {
845          goto fail;
846      }
847  
848      /* Write the initial BAT */
849      offset = 3 * 512;
850  
851      memset(bat_sector, 0xFF, 512);
852      for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
853          ret = blk_pwrite(blk, offset, 512, bat_sector, 0);
854          if (ret < 0) {
855              goto fail;
856          }
857          offset += 512;
858      }
859  
860      /* Prepare the Dynamic Disk Header */
861      memset(&dyndisk_header, 0, sizeof(dyndisk_header));
862  
863      memcpy(dyndisk_header.magic, "cxsparse", 8);
864  
865      /*
866       * Note: The spec is actually wrong here for data_offset, it says
867       * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
868       */
869      dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
870      dyndisk_header.table_offset = cpu_to_be64(3 * 512);
871      dyndisk_header.version = cpu_to_be32(0x00010000);
872      dyndisk_header.block_size = cpu_to_be32(block_size);
873      dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
874  
875      dyndisk_header.checksum = cpu_to_be32(
876          vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
877  
878      /* Write the header */
879      offset = 512;
880  
881      ret = blk_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
882      if (ret < 0) {
883          goto fail;
884      }
885  
886      ret = 0;
887   fail:
888      return ret;
889  }
890  
891  static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
892                               int64_t total_size, Error **errp)
893  {
894      int ret;
895  
896      /* Add footer to total size */
897      total_size += sizeof(*footer);
898  
899      ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
900      if (ret < 0) {
901          return ret;
902      }
903  
904      ret = blk_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
905                       footer, 0);
906      if (ret < 0) {
907          error_setg_errno(errp, -ret, "Unable to write VHD header");
908          return ret;
909      }
910  
911      return 0;
912  }
913  
914  static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
915                                          uint16_t *out_cyls,
916                                          uint8_t *out_heads,
917                                          uint8_t *out_secs_per_cyl,
918                                          int64_t *out_total_sectors,
919                                          Error **errp)
920  {
921      int64_t total_size = vpc_opts->size;
922      uint16_t cyls = 0;
923      uint8_t heads = 0;
924      uint8_t secs_per_cyl = 0;
925      int64_t total_sectors;
926      int i;
927  
928      /*
929       * Calculate matching total_size and geometry. Increase the number of
930       * sectors requested until we get enough (or fail). This ensures that
931       * qemu-img convert doesn't truncate images, but rather rounds up.
932       *
933       * If the image size can't be represented by a spec conformant CHS geometry,
934       * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
935       * the image size from the VHD footer to calculate total_sectors.
936       */
937      if (vpc_opts->force_size) {
938          /* This will force the use of total_size for sector count, below */
939          cyls         = VHD_CHS_MAX_C;
940          heads        = VHD_CHS_MAX_H;
941          secs_per_cyl = VHD_CHS_MAX_S;
942      } else {
943          total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
944          for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
945              calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
946          }
947      }
948  
949      if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
950          total_sectors = total_size / BDRV_SECTOR_SIZE;
951          /* Allow a maximum disk size of 2040 GiB */
952          if (total_sectors > VHD_MAX_SECTORS) {
953              error_setg(errp, "Disk size is too large, max size is 2040 GiB");
954              return -EFBIG;
955          }
956      } else {
957          total_sectors = (int64_t) cyls * heads * secs_per_cyl;
958      }
959  
960      *out_total_sectors = total_sectors;
961      if (out_cyls) {
962          *out_cyls = cyls;
963          *out_heads = heads;
964          *out_secs_per_cyl = secs_per_cyl;
965      }
966  
967      return 0;
968  }
969  
970  static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
971                                        Error **errp)
972  {
973      BlockdevCreateOptionsVpc *vpc_opts;
974      BlockBackend *blk = NULL;
975      BlockDriverState *bs = NULL;
976  
977      VHDFooter footer;
978      uint16_t cyls = 0;
979      uint8_t heads = 0;
980      uint8_t secs_per_cyl = 0;
981      int64_t total_sectors;
982      int64_t total_size;
983      int disk_type;
984      int ret = -EIO;
985      QemuUUID uuid;
986  
987      assert(opts->driver == BLOCKDEV_DRIVER_VPC);
988      vpc_opts = &opts->u.vpc;
989  
990      /* Validate options and set default values */
991      total_size = vpc_opts->size;
992  
993      if (!vpc_opts->has_subformat) {
994          vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
995      }
996      switch (vpc_opts->subformat) {
997      case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
998          disk_type = VHD_DYNAMIC;
999          break;
1000      case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1001          disk_type = VHD_FIXED;
1002          break;
1003      default:
1004          g_assert_not_reached();
1005      }
1006  
1007      /* Create BlockBackend to write to the image */
1008      bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1009      if (bs == NULL) {
1010          return -EIO;
1011      }
1012  
1013      blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1014                            errp);
1015      if (!blk) {
1016          ret = -EPERM;
1017          goto out;
1018      }
1019      blk_set_allow_write_beyond_eof(blk, true);
1020  
1021      /* Get geometry and check that it matches the image size*/
1022      ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1023                                         &total_sectors, errp);
1024      if (ret < 0) {
1025          goto out;
1026      }
1027  
1028      if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1029          error_setg(errp, "The requested image size cannot be represented in "
1030                           "CHS geometry");
1031          error_append_hint(errp, "Try size=%llu or force-size=on (the "
1032                                  "latter makes the image incompatible with "
1033                                  "Virtual PC)",
1034                            total_sectors * BDRV_SECTOR_SIZE);
1035          ret = -EINVAL;
1036          goto out;
1037      }
1038  
1039      /* Prepare the Hard Disk Footer */
1040      memset(&footer, 0, sizeof(footer));
1041  
1042      memcpy(footer.creator, "conectix", 8);
1043      if (vpc_opts->force_size) {
1044          memcpy(footer.creator_app, "qem2", 4);
1045      } else {
1046          memcpy(footer.creator_app, "qemu", 4);
1047      }
1048      memcpy(footer.creator_os, "Wi2k", 4);
1049  
1050      footer.features = cpu_to_be32(0x02);
1051      footer.version = cpu_to_be32(0x00010000);
1052      if (disk_type == VHD_DYNAMIC) {
1053          footer.data_offset = cpu_to_be64(sizeof(footer));
1054      } else {
1055          footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1056      }
1057      footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1058  
1059      /* Version of Virtual PC 2007 */
1060      footer.major = cpu_to_be16(0x0005);
1061      footer.minor = cpu_to_be16(0x0003);
1062      footer.orig_size = cpu_to_be64(total_size);
1063      footer.current_size = cpu_to_be64(total_size);
1064      footer.cyls = cpu_to_be16(cyls);
1065      footer.heads = heads;
1066      footer.secs_per_cyl = secs_per_cyl;
1067  
1068      footer.type = cpu_to_be32(disk_type);
1069  
1070      qemu_uuid_generate(&uuid);
1071      footer.uuid = uuid;
1072  
1073      footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1074  
1075      if (disk_type == VHD_DYNAMIC) {
1076          ret = create_dynamic_disk(blk, &footer, total_sectors);
1077          if (ret < 0) {
1078              error_setg(errp, "Unable to create or write VHD header");
1079          }
1080      } else {
1081          ret = create_fixed_disk(blk, &footer, total_size, errp);
1082      }
1083  
1084  out:
1085      blk_unref(blk);
1086      bdrv_unref(bs);
1087      return ret;
1088  }
1089  
1090  static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
1091                                             const char *filename,
1092                                             QemuOpts *opts,
1093                                             Error **errp)
1094  {
1095      BlockdevCreateOptions *create_options = NULL;
1096      QDict *qdict;
1097      Visitor *v;
1098      BlockDriverState *bs = NULL;
1099      int ret;
1100  
1101      static const QDictRenames opt_renames[] = {
1102          { VPC_OPT_FORCE_SIZE,           "force-size" },
1103          { NULL, NULL },
1104      };
1105  
1106      /* Parse options and convert legacy syntax */
1107      qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1108  
1109      if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1110          ret = -EINVAL;
1111          goto fail;
1112      }
1113  
1114      /* Create and open the file (protocol layer) */
1115      ret = bdrv_co_create_file(filename, opts, errp);
1116      if (ret < 0) {
1117          goto fail;
1118      }
1119  
1120      bs = bdrv_open(filename, NULL, NULL,
1121                     BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1122      if (bs == NULL) {
1123          ret = -EIO;
1124          goto fail;
1125      }
1126  
1127      /* Now get the QAPI type BlockdevCreateOptions */
1128      qdict_put_str(qdict, "driver", "vpc");
1129      qdict_put_str(qdict, "file", bs->node_name);
1130  
1131      v = qobject_input_visitor_new_flat_confused(qdict, errp);
1132      if (!v) {
1133          ret = -EINVAL;
1134          goto fail;
1135      }
1136  
1137      visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1138      visit_free(v);
1139      if (!create_options) {
1140          ret = -EINVAL;
1141          goto fail;
1142      }
1143  
1144      /* Silently round up size */
1145      assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1146      create_options->u.vpc.size =
1147          ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1148  
1149      if (!create_options->u.vpc.force_size) {
1150          int64_t total_sectors;
1151          ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1152                                             NULL, &total_sectors, errp);
1153          if (ret < 0) {
1154              goto fail;
1155          }
1156  
1157          create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1158      }
1159  
1160  
1161      /* Create the vpc image (format layer) */
1162      ret = vpc_co_create(create_options, errp);
1163  
1164  fail:
1165      qobject_unref(qdict);
1166      bdrv_unref(bs);
1167      qapi_free_BlockdevCreateOptions(create_options);
1168      return ret;
1169  }
1170  
1171  
1172  static int vpc_has_zero_init(BlockDriverState *bs)
1173  {
1174      BDRVVPCState *s = bs->opaque;
1175  
1176      if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1177          return bdrv_has_zero_init(bs->file->bs);
1178      } else {
1179          return 1;
1180      }
1181  }
1182  
1183  static void vpc_close(BlockDriverState *bs)
1184  {
1185      BDRVVPCState *s = bs->opaque;
1186      qemu_vfree(s->pagetable);
1187  #ifdef CACHE
1188      g_free(s->pageentry_u8);
1189  #endif
1190  
1191      migrate_del_blocker(s->migration_blocker);
1192      error_free(s->migration_blocker);
1193  }
1194  
/*
 * Creation options accepted for vpc images.
 *
 * Installed as bdrv_vpc.create_opts and used by vpc_co_create_opts() to
 * filter the user-supplied QemuOpts before they are converted to QAPI.
 */
static QemuOptsList vpc_create_opts = {
    .name = "vpc-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
    .desc = {
        {
            /* Requested virtual disk size */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            /* Image subformat: dynamic or fixed */
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help =
                "Type of virtual hard disk format. Supported formats are "
                "{dynamic (default) | fixed} "
        },
        {
            /* Renamed to "force-size" by vpc_co_create_opts() */
            .name = VPC_OPT_FORCE_SIZE,
            .type = QEMU_OPT_BOOL,
            .help = "Force disk size calculation to use the actual size "
                    "specified, rather than using the nearest CHS-based "
                    "calculation"
        },
        { /* end of list */ }
    }
};
1221  
/*
 * NULL-terminated list of option names installed as
 * bdrv_vpc.strong_runtime_opts; it currently holds only VPC_OPT_SIZE_CALC.
 */
static const char *const vpc_strong_runtime_opts[] = {
    VPC_OPT_SIZE_CALC,

    NULL
};
1227  
/*
 * Block driver description for the "vpc" format.  It is handed to
 * bdrv_register() by bdrv_vpc_init().
 */
static BlockDriver bdrv_vpc = {
    .format_name    = "vpc",
    .instance_size  = sizeof(BDRVVPCState),

    /* Probing, open/close and image creation */
    .bdrv_probe             = vpc_probe,
    .bdrv_open              = vpc_open,
    .bdrv_close             = vpc_close,
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_child_perm        = bdrv_default_perms,
    .bdrv_co_create         = vpc_co_create,
    .bdrv_co_create_opts    = vpc_co_create_opts,

    /* Data path */
    .bdrv_co_preadv             = vpc_co_preadv,
    .bdrv_co_pwritev            = vpc_co_pwritev,
    .bdrv_co_block_status       = vpc_co_block_status,

    .bdrv_co_get_info       = vpc_co_get_info,

    /* Format properties and option tables */
    .is_format              = true,
    .create_opts            = &vpc_create_opts,
    .bdrv_has_zero_init     = vpc_has_zero_init,
    .strong_runtime_opts    = vpc_strong_runtime_opts,
};
1251  
/* Register the vpc driver description with bdrv_register(). */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/*
 * Hand bdrv_vpc_init to block_init(); presumably this arranges for it to run
 * during block-module initialisation -- confirm against qemu/module.h.
 */
block_init(bdrv_vpc_init);
1258