xref: /openbmc/qemu/block/vhdx.c (revision 88f62c2b)
1 /*
2  * Block driver for Hyper-V VHDX Images
3  *
4  * Copyright (c) 2013 Red Hat, Inc.,
5  *
6  * Authors:
7  *  Jeff Cody <jcody@redhat.com>
8  *
9  *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
10  *  by Microsoft:
11  *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
12  *
13  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
14  * See the COPYING.LIB file in the top-level directory.
15  *
16  */
17 
18 #include "qemu-common.h"
19 #include "block/block_int.h"
20 #include "qemu/module.h"
21 #include "qemu/crc32c.h"
22 #include "block/vhdx.h"
23 
24 
/* Several metadata and region table data entries are identified by
 * GUIDs in an MS-specific GUID format. */
27 
28 
29 /* ------- Known Region Table GUIDs ---------------------- */
30 static const MSGUID bat_guid =      { .data1 = 0x2dc27766,
31                                       .data2 = 0xf623,
32                                       .data3 = 0x4200,
33                                       .data4 = { 0x9d, 0x64, 0x11, 0x5e,
34                                                  0x9b, 0xfd, 0x4a, 0x08} };
35 
36 static const MSGUID metadata_guid = { .data1 = 0x8b7ca206,
37                                       .data2 = 0x4790,
38                                       .data3 = 0x4b9a,
39                                       .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
40                                                  0x05, 0x0f, 0x88, 0x6e} };
41 
42 
43 
44 /* ------- Known Metadata Entry GUIDs ---------------------- */
45 static const MSGUID file_param_guid =   { .data1 = 0xcaa16737,
46                                           .data2 = 0xfa36,
47                                           .data3 = 0x4d43,
48                                           .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
49                                                      0xaa, 0x44, 0xe7, 0x6b} };
50 
51 static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224,
52                                           .data2 = 0xcd1b,
53                                           .data3 = 0x4876,
54                                           .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
55                                                      0xd8, 0x3b, 0xf4, 0xb8} };
56 
57 static const MSGUID page83_guid =       { .data1 = 0xbeca12ab,
58                                           .data2 = 0xb2e6,
59                                           .data3 = 0x4523,
60                                           .data4 = { 0x93, 0xef, 0xc3, 0x09,
61                                                      0xe0, 0x00, 0xc7, 0x46} };
62 
63 
64 static const MSGUID phys_sector_guid =  { .data1 = 0xcda348c7,
65                                           .data2 = 0x445d,
66                                           .data3 = 0x4471,
67                                           .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
68                                                      0x52, 0x51, 0xc5, 0x56} };
69 
70 static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d,
71                                             .data2 = 0xb30b,
72                                             .data3 = 0x454d,
73                                             .data4 = { 0xab, 0xf7, 0xd3,
74                                                        0xd8, 0x48, 0x34,
75                                                        0xab, 0x0c} };
76 
77 static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
78                                             .data2 = 0xa96f,
79                                             .data3 = 0x4709,
80                                             .data4 = { 0xba, 0x47, 0xf2,
81                                                        0x33, 0xa8, 0xfa,
82                                                        0xab, 0x5f} };
83 
84 /* Each parent type must have a valid GUID; this is for parent images
85  * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
86  * need to make up our own QCOW2 GUID type */
87 static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7,
88                                          .data2 = 0xd19e,
89                                          .data3 = 0x4a81,
90                                          .data4 = { 0xb7, 0x89, 0x25, 0xb8,
91                                                     0xe9, 0x44, 0x59, 0x13} };
92 
93 
/* One bit per known metadata item; the bits are OR-ed together to record
 * which items have been seen while walking the metadata table. */
#define META_FILE_PARAMETER_PRESENT      (1 << 0)
#define META_VIRTUAL_DISK_SIZE_PRESENT   (1 << 1)
#define META_PAGE_83_PRESENT             (1 << 2)
#define META_LOGICAL_SECTOR_SIZE_PRESENT (1 << 3)
#define META_PHYS_SECTOR_SIZE_PRESENT    (1 << 4)
#define META_PARENT_LOCATOR_PRESENT      (1 << 5)

/* All items except the parent locator */
#define META_ALL_PRESENT                   \
    (META_FILE_PARAMETER_PRESENT         | \
     META_VIRTUAL_DISK_SIZE_PRESENT      | \
     META_PAGE_83_PRESENT                | \
     META_LOGICAL_SECTOR_SIZE_PRESENT    | \
     META_PHYS_SECTOR_SIZE_PRESENT)
105 
106 typedef struct VHDXMetadataEntries {
107     VHDXMetadataTableEntry file_parameters_entry;
108     VHDXMetadataTableEntry virtual_disk_size_entry;
109     VHDXMetadataTableEntry page83_data_entry;
110     VHDXMetadataTableEntry logical_sector_size_entry;
111     VHDXMetadataTableEntry phys_sector_size_entry;
112     VHDXMetadataTableEntry parent_locator_entry;
113     uint16_t present;
114 } VHDXMetadataEntries;
115 
116 
/* Result of translating a (sector number, sector count) request into
 * positions inside the image; filled in by vhdx_block_translate(). */
typedef struct VHDXSectorInfo {
    /* BAT entry index */
    uint32_t bat_idx;
    /* sectors available in payload block */
    uint32_t sectors_avail;
    /* bytes left in the block after data to r/w */
    uint32_t bytes_left;
    /* bytes available in payload block */
    uint32_t bytes_avail;
    /* absolute offset in bytes, in file */
    uint64_t file_offset;
    /* block offset, in bytes */
    uint64_t block_offset;
} VHDXSectorInfo;
125 
126 
127 
128 typedef struct BDRVVHDXState {
129     CoMutex lock;
130 
131     int curr_header;
132     VHDXHeader *headers[2];
133 
134     VHDXRegionTableHeader rt;
135     VHDXRegionTableEntry bat_rt;         /* region table for the BAT */
136     VHDXRegionTableEntry metadata_rt;    /* region table for the metadata */
137 
138     VHDXMetadataTableHeader metadata_hdr;
139     VHDXMetadataEntries metadata_entries;
140 
141     VHDXFileParameters params;
142     uint32_t block_size;
143     uint32_t block_size_bits;
144     uint32_t sectors_per_block;
145     uint32_t sectors_per_block_bits;
146 
147     uint64_t virtual_disk_size;
148     uint32_t logical_sector_size;
149     uint32_t physical_sector_size;
150 
151     uint64_t chunk_ratio;
152     uint32_t chunk_ratio_bits;
153     uint32_t logical_sector_size_bits;
154 
155     uint32_t bat_entries;
156     VHDXBatEntry *bat;
157     uint64_t bat_offset;
158 
159     VHDXParentLocatorHeader parent_header;
160     VHDXParentLocatorEntry *parent_entries;
161 
162 } BDRVVHDXState;
163 
/* Computes the crc32c of a buffer, optionally treating an embedded
 * 4-byte checksum field as zero while doing so.
 *
 * crc:        seed value handed to crc32c()
 * buf:        buffer pointer (must not be NULL)
 * size:       number of bytes of buf to checksum
 * crc_offset: byte offset in buf of the embedded checksum field; when it
 *             is greater than zero the field is zeroed for the duration of
 *             the calculation and put back afterwards.  Values <= 0 leave
 *             the buffer untouched.
 *
 * returns the computed crc32c value
 */
uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
                            int crc_offset)
{
    uint8_t *crc_field = NULL;
    uint32_t saved_crc;
    uint32_t result;

    assert(buf != NULL);

    if (crc_offset > 0) {
        /* stash the stored checksum and blank it out in the buffer */
        crc_field = buf + crc_offset;
        memcpy(&saved_crc, crc_field, sizeof(saved_crc));
        memset(crc_field, 0, sizeof(saved_crc));
    }

    result = crc32c(crc, buf, size);

    if (crc_field != NULL) {
        /* put the stored checksum back where it was */
        memcpy(crc_field, &saved_crc, sizeof(saved_crc));
    }

    return result;
}
183 
/* Validates the checksum of the buffer, with an in-place CRC.
 *
 * Zero is substituted for the stored crc field while the checksum is
 * calculated, and the field is restored afterwards.  Because the buffer
 * is temporarily modified during the calculation, this may not be
 * suitable for multi-threaded use.
 *
 * crc_offset: byte offset in buf of the buffer crc
 * buf: buffer pointer
 * size: size of buffer (must be > crc_offset+4)
 *
 * returns true if checksum is valid, false otherwise
 */
bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
{
    uint32_t stored;
    uint32_t computed;

    assert(buf != NULL);
    assert(size > (crc_offset + 4));

    /* the stored checksum is little endian in the buffer */
    memcpy(&stored, buf + crc_offset, sizeof(stored));
    stored = le32_to_cpu(stored);

    computed = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
    if (computed != stored) {
        return false;
    }
    return true;
}
212 
213 
/*
 * Per the MS VHDX Specification, for every VHDX file:
 *      - The header section is fixed size - 1 MB
 *      - The header section is always the first "object"
 *      - The first 64KB of the header is the File Identifier
 *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
 *      - The following 512 bytes constitute a UTF-16 string identifying the
 *        software that created the file, and is optional and diagnostic only.
 *
 *  Probing therefore only looks for the "vhdxfile" signature at the very
 *  start of the buffer.
 *
 *  buf:      first bytes of the candidate image
 *  buf_size: number of valid bytes in buf
 *  filename: not used
 *
 *  returns 100 when the signature matches, 0 otherwise
 */
static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    static const char signature[] = "vhdxfile";
    const int signature_len = 8;

    if (buf_size < signature_len) {
        return 0;
    }
    return memcmp(buf, signature, signature_len) == 0 ? 100 : 0;
}
232 
233 /* All VHDX structures on disk are little endian */
234 static void vhdx_header_le_import(VHDXHeader *h)
235 {
236     assert(h != NULL);
237 
238     le32_to_cpus(&h->signature);
239     le32_to_cpus(&h->checksum);
240     le64_to_cpus(&h->sequence_number);
241 
242     leguid_to_cpus(&h->file_write_guid);
243     leguid_to_cpus(&h->data_write_guid);
244     leguid_to_cpus(&h->log_guid);
245 
246     le16_to_cpus(&h->log_version);
247     le16_to_cpus(&h->version);
248     le32_to_cpus(&h->log_length);
249     le64_to_cpus(&h->log_offset);
250 }
251 
252 
253 /* opens the specified header block from the VHDX file header section */
254 static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
255 {
256     int ret = 0;
257     VHDXHeader *header1;
258     VHDXHeader *header2;
259     bool h1_valid = false;
260     bool h2_valid = false;
261     uint64_t h1_seq = 0;
262     uint64_t h2_seq = 0;
263     uint8_t *buffer;
264 
265     header1 = qemu_blockalign(bs, sizeof(VHDXHeader));
266     header2 = qemu_blockalign(bs, sizeof(VHDXHeader));
267 
268     buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
269 
270     s->headers[0] = header1;
271     s->headers[1] = header2;
272 
273     /* We have to read the whole VHDX_HEADER_SIZE instead of
274      * sizeof(VHDXHeader), because the checksum is over the whole
275      * region */
276     ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
277     if (ret < 0) {
278         goto fail;
279     }
280     /* copy over just the relevant portion that we need */
281     memcpy(header1, buffer, sizeof(VHDXHeader));
282     vhdx_header_le_import(header1);
283 
284     if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
285         !memcmp(&header1->signature, "head", 4)             &&
286         header1->version == 1) {
287         h1_seq = header1->sequence_number;
288         h1_valid = true;
289     }
290 
291     ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
292     if (ret < 0) {
293         goto fail;
294     }
295     /* copy over just the relevant portion that we need */
296     memcpy(header2, buffer, sizeof(VHDXHeader));
297     vhdx_header_le_import(header2);
298 
299     if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
300         !memcmp(&header2->signature, "head", 4)             &&
301         header2->version == 1) {
302         h2_seq = header2->sequence_number;
303         h2_valid = true;
304     }
305 
306     /* If there is only 1 valid header (or no valid headers), we
307      * don't care what the sequence numbers are */
308     if (h1_valid && !h2_valid) {
309         s->curr_header = 0;
310     } else if (!h1_valid && h2_valid) {
311         s->curr_header = 1;
312     } else if (!h1_valid && !h2_valid) {
313         ret = -EINVAL;
314         goto fail;
315     } else {
316         /* If both headers are valid, then we choose the active one by the
317          * highest sequence number.  If the sequence numbers are equal, that is
318          * invalid */
319         if (h1_seq > h2_seq) {
320             s->curr_header = 0;
321         } else if (h2_seq > h1_seq) {
322             s->curr_header = 1;
323         } else {
324             ret = -EINVAL;
325             goto fail;
326         }
327     }
328 
329     ret = 0;
330 
331     goto exit;
332 
333 fail:
334     qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found");
335     qemu_vfree(header1);
336     qemu_vfree(header2);
337     s->headers[0] = NULL;
338     s->headers[1] = NULL;
339 exit:
340     qemu_vfree(buffer);
341     return ret;
342 }
343 
344 
345 static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
346 {
347     int ret = 0;
348     uint8_t *buffer;
349     int offset = 0;
350     VHDXRegionTableEntry rt_entry;
351     uint32_t i;
352     bool bat_rt_found = false;
353     bool metadata_rt_found = false;
354 
355     /* We have to read the whole 64KB block, because the crc32 is over the
356      * whole block */
357     buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
358 
359     ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
360                      VHDX_HEADER_BLOCK_SIZE);
361     if (ret < 0) {
362         goto fail;
363     }
364     memcpy(&s->rt, buffer, sizeof(s->rt));
365     le32_to_cpus(&s->rt.signature);
366     le32_to_cpus(&s->rt.checksum);
367     le32_to_cpus(&s->rt.entry_count);
368     le32_to_cpus(&s->rt.reserved);
369     offset += sizeof(s->rt);
370 
371     if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
372         memcmp(&s->rt.signature, "regi", 4)) {
373         ret = -EINVAL;
374         goto fail;
375     }
376 
377     /* Per spec, maximum region table entry count is 2047 */
378     if (s->rt.entry_count > 2047) {
379         ret = -EINVAL;
380         goto fail;
381     }
382 
383     for (i = 0; i < s->rt.entry_count; i++) {
384         memcpy(&rt_entry, buffer + offset, sizeof(rt_entry));
385         offset += sizeof(rt_entry);
386 
387         leguid_to_cpus(&rt_entry.guid);
388         le64_to_cpus(&rt_entry.file_offset);
389         le32_to_cpus(&rt_entry.length);
390         le32_to_cpus(&rt_entry.data_bits);
391 
392         /* see if we recognize the entry */
393         if (guid_eq(rt_entry.guid, bat_guid)) {
394             /* must be unique; if we have already found it this is invalid */
395             if (bat_rt_found) {
396                 ret = -EINVAL;
397                 goto fail;
398             }
399             bat_rt_found = true;
400             s->bat_rt = rt_entry;
401             continue;
402         }
403 
404         if (guid_eq(rt_entry.guid, metadata_guid)) {
405             /* must be unique; if we have already found it this is invalid */
406             if (metadata_rt_found) {
407                 ret = -EINVAL;
408                 goto fail;
409             }
410             metadata_rt_found = true;
411             s->metadata_rt = rt_entry;
412             continue;
413         }
414 
415         if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
416             /* cannot read vhdx file - required region table entry that
417              * we do not understand.  per spec, we must fail to open */
418             ret = -ENOTSUP;
419             goto fail;
420         }
421     }
422     ret = 0;
423 
424 fail:
425     qemu_vfree(buffer);
426     return ret;
427 }
428 
429 
430 
431 /* Metadata initial parser
432  *
433  * This loads all the metadata entry fields.  This may cause additional
434  * fields to be processed (e.g. parent locator, etc..).
435  *
436  * There are 5 Metadata items that are always required:
437  *      - File Parameters (block size, has a parent)
438  *      - Virtual Disk Size (size, in bytes, of the virtual drive)
439  *      - Page 83 Data (scsi page 83 guid)
440  *      - Logical Sector Size (logical sector size in bytes, either 512 or
441  *                             4096.  We only support 512 currently)
442  *      - Physical Sector Size (512 or 4096)
443  *
444  * Also, if the File Parameters indicate this is a differencing file,
445  * we must also look for the Parent Locator metadata item.
446  */
447 static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
448 {
449     int ret = 0;
450     uint8_t *buffer;
451     int offset = 0;
452     uint32_t i = 0;
453     VHDXMetadataTableEntry md_entry;
454 
455     buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
456 
457     ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
458                      VHDX_METADATA_TABLE_MAX_SIZE);
459     if (ret < 0) {
460         goto exit;
461     }
462     memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
463     offset += sizeof(s->metadata_hdr);
464 
465     le64_to_cpus(&s->metadata_hdr.signature);
466     le16_to_cpus(&s->metadata_hdr.reserved);
467     le16_to_cpus(&s->metadata_hdr.entry_count);
468 
469     if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
470         ret = -EINVAL;
471         goto exit;
472     }
473 
474     s->metadata_entries.present = 0;
475 
476     if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
477         (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
478         ret = -EINVAL;
479         goto exit;
480     }
481 
482     for (i = 0; i < s->metadata_hdr.entry_count; i++) {
483         memcpy(&md_entry, buffer + offset, sizeof(md_entry));
484         offset += sizeof(md_entry);
485 
486         leguid_to_cpus(&md_entry.item_id);
487         le32_to_cpus(&md_entry.offset);
488         le32_to_cpus(&md_entry.length);
489         le32_to_cpus(&md_entry.data_bits);
490         le32_to_cpus(&md_entry.reserved2);
491 
492         if (guid_eq(md_entry.item_id, file_param_guid)) {
493             if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) {
494                 ret = -EINVAL;
495                 goto exit;
496             }
497             s->metadata_entries.file_parameters_entry = md_entry;
498             s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
499             continue;
500         }
501 
502         if (guid_eq(md_entry.item_id, virtual_size_guid)) {
503             if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) {
504                 ret = -EINVAL;
505                 goto exit;
506             }
507             s->metadata_entries.virtual_disk_size_entry = md_entry;
508             s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
509             continue;
510         }
511 
512         if (guid_eq(md_entry.item_id, page83_guid)) {
513             if (s->metadata_entries.present & META_PAGE_83_PRESENT) {
514                 ret = -EINVAL;
515                 goto exit;
516             }
517             s->metadata_entries.page83_data_entry = md_entry;
518             s->metadata_entries.present |= META_PAGE_83_PRESENT;
519             continue;
520         }
521 
522         if (guid_eq(md_entry.item_id, logical_sector_guid)) {
523             if (s->metadata_entries.present &
524                 META_LOGICAL_SECTOR_SIZE_PRESENT) {
525                 ret = -EINVAL;
526                 goto exit;
527             }
528             s->metadata_entries.logical_sector_size_entry = md_entry;
529             s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
530             continue;
531         }
532 
533         if (guid_eq(md_entry.item_id, phys_sector_guid)) {
534             if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) {
535                 ret = -EINVAL;
536                 goto exit;
537             }
538             s->metadata_entries.phys_sector_size_entry = md_entry;
539             s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
540             continue;
541         }
542 
543         if (guid_eq(md_entry.item_id, parent_locator_guid)) {
544             if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
545                 ret = -EINVAL;
546                 goto exit;
547             }
548             s->metadata_entries.parent_locator_entry = md_entry;
549             s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
550             continue;
551         }
552 
553         if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
554             /* cannot read vhdx file - required region table entry that
555              * we do not understand.  per spec, we must fail to open */
556             ret = -ENOTSUP;
557             goto exit;
558         }
559     }
560 
561     if (s->metadata_entries.present != META_ALL_PRESENT) {
562         ret = -ENOTSUP;
563         goto exit;
564     }
565 
566     ret = bdrv_pread(bs->file,
567                      s->metadata_entries.file_parameters_entry.offset
568                                          + s->metadata_rt.file_offset,
569                      &s->params,
570                      sizeof(s->params));
571 
572     if (ret < 0) {
573         goto exit;
574     }
575 
576     le32_to_cpus(&s->params.block_size);
577     le32_to_cpus(&s->params.data_bits);
578 
579 
580     /* We now have the file parameters, so we can tell if this is a
581      * differencing file (i.e.. has_parent), is dynamic or fixed
582      * sized (leave_blocks_allocated), and the block size */
583 
584     /* The parent locator required iff the file parameters has_parent set */
585     if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
586         if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
587             /* TODO: parse  parent locator fields */
588             ret = -ENOTSUP; /* temp, until differencing files are supported */
589             goto exit;
590         } else {
591             /* if has_parent is set, but there is not parent locator present,
592              * then that is an invalid combination */
593             ret = -EINVAL;
594             goto exit;
595         }
596     }
597 
598     /* determine virtual disk size, logical sector size,
599      * and phys sector size */
600 
601     ret = bdrv_pread(bs->file,
602                      s->metadata_entries.virtual_disk_size_entry.offset
603                                            + s->metadata_rt.file_offset,
604                      &s->virtual_disk_size,
605                      sizeof(uint64_t));
606     if (ret < 0) {
607         goto exit;
608     }
609     ret = bdrv_pread(bs->file,
610                      s->metadata_entries.logical_sector_size_entry.offset
611                                              + s->metadata_rt.file_offset,
612                      &s->logical_sector_size,
613                      sizeof(uint32_t));
614     if (ret < 0) {
615         goto exit;
616     }
617     ret = bdrv_pread(bs->file,
618                      s->metadata_entries.phys_sector_size_entry.offset
619                                           + s->metadata_rt.file_offset,
620                      &s->physical_sector_size,
621                      sizeof(uint32_t));
622     if (ret < 0) {
623         goto exit;
624     }
625 
626     le64_to_cpus(&s->virtual_disk_size);
627     le32_to_cpus(&s->logical_sector_size);
628     le32_to_cpus(&s->physical_sector_size);
629 
630     if (s->logical_sector_size == 0 || s->params.block_size == 0) {
631         ret = -EINVAL;
632         goto exit;
633     }
634 
635     /* both block_size and sector_size are guaranteed powers of 2 */
636     s->sectors_per_block = s->params.block_size / s->logical_sector_size;
637     s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
638                      (uint64_t)s->logical_sector_size /
639                      (uint64_t)s->params.block_size;
640 
641     /* These values are ones we will want to use for division / multiplication
642      * later on, and they are all guaranteed (per the spec) to be powers of 2,
643      * so we can take advantage of that for shift operations during
644      * reads/writes */
645     if (s->logical_sector_size & (s->logical_sector_size - 1)) {
646         ret = -EINVAL;
647         goto exit;
648     }
649     if (s->sectors_per_block & (s->sectors_per_block - 1)) {
650         ret = -EINVAL;
651         goto exit;
652     }
653     if (s->chunk_ratio & (s->chunk_ratio - 1)) {
654         ret = -EINVAL;
655         goto exit;
656     }
657     s->block_size = s->params.block_size;
658     if (s->block_size & (s->block_size - 1)) {
659         ret = -EINVAL;
660         goto exit;
661     }
662 
663     s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size);
664     s->sectors_per_block_bits =   31 - clz32(s->sectors_per_block);
665     s->chunk_ratio_bits =         63 - clz64(s->chunk_ratio);
666     s->block_size_bits =          31 - clz32(s->block_size);
667 
668     ret = 0;
669 
670 exit:
671     qemu_vfree(buffer);
672     return ret;
673 }
674 
675 /* Parse the replay log.  Per the VHDX spec, if the log is present
676  * it must be replayed prior to opening the file, even read-only.
677  *
678  * If read-only, we must replay the log in RAM (or refuse to open
679  * a dirty VHDX file read-only */
680 static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
681 {
682     int ret = 0;
683     int i;
684     VHDXHeader *hdr;
685 
686     hdr = s->headers[s->curr_header];
687 
688     /* either the log guid, or log length is zero,
689      * then a replay log is present */
690     for (i = 0; i < sizeof(hdr->log_guid.data4); i++) {
691         ret |= hdr->log_guid.data4[i];
692     }
693     if (hdr->log_guid.data1 == 0 &&
694         hdr->log_guid.data2 == 0 &&
695         hdr->log_guid.data3 == 0 &&
696         ret == 0) {
697         goto exit;
698     }
699 
700     /* per spec, only log version of 0 is supported */
701     if (hdr->log_version != 0) {
702         ret = -EINVAL;
703         goto exit;
704     }
705 
706     if (hdr->log_length == 0) {
707         goto exit;
708     }
709 
710     /* We currently do not support images with logs to replay */
711     ret = -ENOTSUP;
712 
713 exit:
714     return ret;
715 }
716 
717 
718 static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
719 {
720     BDRVVHDXState *s = bs->opaque;
721     int ret = 0;
722     uint32_t i;
723     uint64_t signature;
724     uint32_t data_blocks_cnt, bitmap_blocks_cnt;
725 
726 
727     s->bat = NULL;
728 
729     qemu_co_mutex_init(&s->lock);
730 
731     /* validate the file signature */
732     ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t));
733     if (ret < 0) {
734         goto fail;
735     }
736     if (memcmp(&signature, "vhdxfile", 8)) {
737         ret = -EINVAL;
738         goto fail;
739     }
740 
741     ret = vhdx_parse_header(bs, s);
742     if (ret) {
743         goto fail;
744     }
745 
746     ret = vhdx_parse_log(bs, s);
747     if (ret) {
748         goto fail;
749     }
750 
751     ret = vhdx_open_region_tables(bs, s);
752     if (ret) {
753         goto fail;
754     }
755 
756     ret = vhdx_parse_metadata(bs, s);
757     if (ret) {
758         goto fail;
759     }
760     s->block_size = s->params.block_size;
761 
762     /* the VHDX spec dictates that virtual_disk_size is always a multiple of
763      * logical_sector_size */
764     bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits;
765 
766     data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits;
767     if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) {
768         data_blocks_cnt++;
769     }
770     bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits;
771     if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) {
772         bitmap_blocks_cnt++;
773     }
774 
775     if (s->parent_entries) {
776         s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
777     } else {
778         s->bat_entries = data_blocks_cnt +
779                          ((data_blocks_cnt - 1) >> s->chunk_ratio_bits);
780     }
781 
782     s->bat_offset = s->bat_rt.file_offset;
783 
784     if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) {
785         /* BAT allocation is not large enough for all entries */
786         ret = -EINVAL;
787         goto fail;
788     }
789 
790     s->bat = qemu_blockalign(bs, s->bat_rt.length);
791 
792     ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
793     if (ret < 0) {
794         goto fail;
795     }
796 
797     for (i = 0; i < s->bat_entries; i++) {
798         le64_to_cpus(&s->bat[i]);
799     }
800 
801     if (flags & BDRV_O_RDWR) {
802         ret = -ENOTSUP;
803         goto fail;
804     }
805 
806     /* TODO: differencing files, write */
807 
808     return 0;
809 fail:
810     qemu_vfree(s->headers[0]);
811     qemu_vfree(s->headers[1]);
812     qemu_vfree(s->bat);
813     qemu_vfree(s->parent_entries);
814     return ret;
815 }
816 
817 static int vhdx_reopen_prepare(BDRVReopenState *state,
818                                BlockReopenQueue *queue, Error **errp)
819 {
820     return 0;
821 }
822 
823 
824 /*
825  * Perform sector to block offset translations, to get various
826  * sector and file offsets into the image.  See VHDXSectorInfo
827  */
828 static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num,
829                                  int nb_sectors, VHDXSectorInfo *sinfo)
830 {
831     uint32_t block_offset;
832 
833     sinfo->bat_idx = sector_num >> s->sectors_per_block_bits;
834     /* effectively a modulo - this gives us the offset into the block
835      * (in sector sizes) for our sector number */
836     block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits);
837     /* the chunk ratio gives us the interleaving of the sector
838      * bitmaps, so we need to advance our page block index by the
839      * sector bitmaps entry number */
840     sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits;
841 
842     /* the number of sectors we can read/write in this cycle */
843     sinfo->sectors_avail = s->sectors_per_block - block_offset;
844 
845     sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits;
846 
847     if (sinfo->sectors_avail > nb_sectors) {
848         sinfo->sectors_avail = nb_sectors;
849     }
850 
851     sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits;
852 
853     sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS;
854 
855     sinfo->block_offset = block_offset << s->logical_sector_size_bits;
856 
857     /* The file offset must be past the header section, so must be > 0 */
858     if (sinfo->file_offset == 0) {
859         return;
860     }
861 
862     /* block offset is the offset in vhdx logical sectors, in
863      * the payload data block. Convert that to a byte offset
864      * in the block, and add in the payload data block offset
865      * in the file, in bytes, to get the final read address */
866 
867     sinfo->file_offset <<= 20;  /* now in bytes, rather than 1MB units */
868     sinfo->file_offset += sinfo->block_offset;
869 }
870 
871 
872 
873 static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
874                                       int nb_sectors, QEMUIOVector *qiov)
875 {
876     BDRVVHDXState *s = bs->opaque;
877     int ret = 0;
878     VHDXSectorInfo sinfo;
879     uint64_t bytes_done = 0;
880     QEMUIOVector hd_qiov;
881 
882     qemu_iovec_init(&hd_qiov, qiov->niov);
883 
884     qemu_co_mutex_lock(&s->lock);
885 
886     while (nb_sectors > 0) {
887         /* We are a differencing file, so we need to inspect the sector bitmap
888          * to see if we have the data or not */
889         if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
890             /* not supported yet */
891             ret = -ENOTSUP;
892             goto exit;
893         } else {
894             vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
895 
896             qemu_iovec_reset(&hd_qiov);
897             qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, sinfo.bytes_avail);
898 
899             /* check the payload block state */
900             switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) {
901             case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
902             case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
903             case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
904             case PAYLOAD_BLOCK_ZERO:
905                 /* return zero */
906                 qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
907                 break;
908             case PAYLOAD_BLOCK_FULL_PRESENT:
909                 qemu_co_mutex_unlock(&s->lock);
910                 ret = bdrv_co_readv(bs->file,
911                                     sinfo.file_offset >> BDRV_SECTOR_BITS,
912                                     sinfo.sectors_avail, &hd_qiov);
913                 qemu_co_mutex_lock(&s->lock);
914                 if (ret < 0) {
915                     goto exit;
916                 }
917                 break;
918             case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
919                 /* we don't yet support difference files, fall through
920                  * to error */
921             default:
922                 ret = -EIO;
923                 goto exit;
924                 break;
925             }
926             nb_sectors -= sinfo.sectors_avail;
927             sector_num += sinfo.sectors_avail;
928             bytes_done += sinfo.bytes_avail;
929         }
930     }
931     ret = 0;
932 exit:
933     qemu_co_mutex_unlock(&s->lock);
934     qemu_iovec_destroy(&hd_qiov);
935     return ret;
936 }
937 
938 
939 
940 static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
941                                       int nb_sectors, QEMUIOVector *qiov)
942 {
943     return -ENOTSUP;
944 }
945 
946 
947 static void vhdx_close(BlockDriverState *bs)
948 {
949     BDRVVHDXState *s = bs->opaque;
950     qemu_vfree(s->headers[0]);
951     qemu_vfree(s->headers[1]);
952     qemu_vfree(s->bat);
953     qemu_vfree(s->parent_entries);
954 }
955 
956 static BlockDriver bdrv_vhdx = {
957     .format_name            = "vhdx",
958     .instance_size          = sizeof(BDRVVHDXState),
959     .bdrv_probe             = vhdx_probe,
960     .bdrv_open              = vhdx_open,
961     .bdrv_close             = vhdx_close,
962     .bdrv_reopen_prepare    = vhdx_reopen_prepare,
963     .bdrv_co_readv          = vhdx_co_readv,
964     .bdrv_co_writev         = vhdx_co_writev,
965 };
966 
967 static void bdrv_vhdx_init(void)
968 {
969     bdrv_register(&bdrv_vhdx);
970 }
971 
972 block_init(bdrv_vhdx_init);
973