xref: /openbmc/qemu/block/vmdk.c (revision 8ffe04ed2ed44b32f97575bc3cb7c29eefdd70da)
1 /*
2  * Block driver for the VMDK format
3  *
4  * Copyright (c) 2004 Fabrice Bellard
5  * Copyright (c) 2005 Filip Navara
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu-common.h"
27 #include "block/block_int.h"
28 #include "qemu/module.h"
29 #include "migration/migration.h"
30 #include <zlib.h>
31 
/* Magic values of sparse extents.  The first four bytes of the file are
 * interpreted as a big-endian word: "COWD" selects the VMDK3 (VMFS sparse)
 * parser, "KDMV" the VMDK4 parser. */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

/* VMDK4Header.compressAlgorithm value that marks an extent as compressed */
#define VMDK4_COMPRESSION_DEFLATE 1

/* Bits of VMDK4Header.flags */
#define VMDK4_FLAG_NL_DETECT (1 << 0)
/* rgd_offset holds a backup L1 table */
#define VMDK4_FLAG_RGD (1 << 1)
/* Zeroed-grain enable bit */
#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
#define VMDK4_FLAG_COMPRESS (1 << 16)
#define VMDK4_FLAG_MARKER (1 << 17)

/* gd_offset value telling the reader to take the header from the footer */
#define VMDK4_GD_AT_END 0xffffffffffffffffULL

/* Grain table entry value that denotes a zeroed grain (only honoured when
 * the extent has VMDK4_FLAG_ZERO_GRAIN set) */
#define VMDK_GTE_ZEROED 0x1

/* VMDK internal error codes */
#define VMDK_OK      0
#define VMDK_ERROR   (-1)
/* Cluster not allocated */
#define VMDK_UNALLOC (-2)
#define VMDK_ZEROED  (-3)

/* Name of the "zeroed_grain" block option */
#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
53 
54 typedef struct {
55     uint32_t version;
56     uint32_t flags;
57     uint32_t disk_sectors;
58     uint32_t granularity;
59     uint32_t l1dir_offset;
60     uint32_t l1dir_size;
61     uint32_t file_sectors;
62     uint32_t cylinders;
63     uint32_t heads;
64     uint32_t sectors_per_track;
65 } QEMU_PACKED VMDK3Header;
66 
67 typedef struct {
68     uint32_t version;
69     uint32_t flags;
70     uint64_t capacity;
71     uint64_t granularity;
72     uint64_t desc_offset;
73     uint64_t desc_size;
74     /* Number of GrainTableEntries per GrainTable */
75     uint32_t num_gtes_per_gt;
76     uint64_t rgd_offset;
77     uint64_t gd_offset;
78     uint64_t grain_offset;
79     char filler[1];
80     char check_bytes[4];
81     uint16_t compressAlgorithm;
82 } QEMU_PACKED VMDK4Header;
83 
84 #define L2_CACHE_SIZE 16
85 
86 typedef struct VmdkExtent {
87     BlockDriverState *file;
88     bool flat;
89     bool compressed;
90     bool has_marker;
91     bool has_zero_grain;
92     int version;
93     int64_t sectors;
94     int64_t end_sector;
95     int64_t flat_start_offset;
96     int64_t l1_table_offset;
97     int64_t l1_backup_table_offset;
98     uint32_t *l1_table;
99     uint32_t *l1_backup_table;
100     unsigned int l1_size;
101     uint32_t l1_entry_sectors;
102 
103     unsigned int l2_size;
104     uint32_t *l2_cache;
105     uint32_t l2_cache_offsets[L2_CACHE_SIZE];
106     uint32_t l2_cache_counts[L2_CACHE_SIZE];
107 
108     int64_t cluster_sectors;
109 } VmdkExtent;
110 
111 typedef struct BDRVVmdkState {
112     CoMutex lock;
113     uint64_t desc_offset;
114     bool cid_updated;
115     bool cid_checked;
116     uint32_t parent_cid;
117     int num_extents;
118     /* Extent array with num_extents entries, ascend ordered by address */
119     VmdkExtent *extents;
120     Error *migration_blocker;
121 } BDRVVmdkState;
122 
/* Result of a cluster lookup, kept so that the L2 entry can be rewritten
 * afterwards (see vmdk_L2update) */
typedef struct VmdkMetaData {
    uint32_t offset;            /* value of the L2 entry */
    unsigned int l1_index;      /* index into the L1 table */
    unsigned int l2_index;      /* index into the L2 table */
    unsigned int l2_offset;     /* position of the L2 table, in sectors */
    int valid;                  /* non-zero once the fields are filled in */
    uint32_t *l2_cache_entry;   /* cached copy of the L2 entry */
} VmdkMetaData;
131 
132 typedef struct VmdkGrainMarker {
133     uint64_t lba;
134     uint32_t size;
135     uint8_t  data[0];
136 } QEMU_PACKED VmdkGrainMarker;
137 
/* Marker types; the footer check in vmdk_open_vmdk4 compares the type field
 * of the footer and end-of-stream markers against these values. */
enum {
    MARKER_END_OF_STREAM   = 0,
    MARKER_GRAIN_TABLE     = 1,
    MARKER_GRAIN_DIRECTORY = 2,
    MARKER_FOOTER          = 3,
};
144 
145 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
146 {
147     uint32_t magic;
148 
149     if (buf_size < 4) {
150         return 0;
151     }
152     magic = be32_to_cpu(*(uint32_t *)buf);
153     if (magic == VMDK3_MAGIC ||
154         magic == VMDK4_MAGIC) {
155         return 100;
156     } else {
157         const char *p = (const char *)buf;
158         const char *end = p + buf_size;
159         while (p < end) {
160             if (*p == '#') {
161                 /* skip comment line */
162                 while (p < end && *p != '\n') {
163                     p++;
164                 }
165                 p++;
166                 continue;
167             }
168             if (*p == ' ') {
169                 while (p < end && *p == ' ') {
170                     p++;
171                 }
172                 /* skip '\r' if windows line endings used. */
173                 if (p < end && *p == '\r') {
174                     p++;
175                 }
176                 /* only accept blank lines before 'version=' line */
177                 if (p == end || *p != '\n') {
178                     return 0;
179                 }
180                 p++;
181                 continue;
182             }
183             if (end - p >= strlen("version=X\n")) {
184                 if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
185                     strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
186                     return 100;
187                 }
188             }
189             if (end - p >= strlen("version=X\r\n")) {
190                 if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
191                     strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
192                     return 100;
193                 }
194             }
195             return 0;
196         }
197         return 0;
198     }
199 }
200 
#define SECTOR_SIZE 512
/* Size of the buffer the descriptor is read into: 20 sectors of 512 bytes */
#define DESC_SIZE (20 * SECTOR_SIZE)
#define BUF_SIZE 4096
/* first sector of 512 bytes */
#define HEADER_SIZE 512
205 
206 static void vmdk_free_extents(BlockDriverState *bs)
207 {
208     int i;
209     BDRVVmdkState *s = bs->opaque;
210     VmdkExtent *e;
211 
212     for (i = 0; i < s->num_extents; i++) {
213         e = &s->extents[i];
214         g_free(e->l1_table);
215         g_free(e->l2_cache);
216         g_free(e->l1_backup_table);
217         if (e->file != bs->file) {
218             bdrv_unref(e->file);
219         }
220     }
221     g_free(s->extents);
222 }
223 
224 static void vmdk_free_last_extent(BlockDriverState *bs)
225 {
226     BDRVVmdkState *s = bs->opaque;
227 
228     if (s->num_extents == 0) {
229         return;
230     }
231     s->num_extents--;
232     s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent));
233 }
234 
235 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
236 {
237     char desc[DESC_SIZE];
238     uint32_t cid = 0xffffffff;
239     const char *p_name, *cid_str;
240     size_t cid_str_size;
241     BDRVVmdkState *s = bs->opaque;
242     int ret;
243 
244     ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
245     if (ret < 0) {
246         return 0;
247     }
248 
249     if (parent) {
250         cid_str = "parentCID";
251         cid_str_size = sizeof("parentCID");
252     } else {
253         cid_str = "CID";
254         cid_str_size = sizeof("CID");
255     }
256 
257     desc[DESC_SIZE - 1] = '\0';
258     p_name = strstr(desc, cid_str);
259     if (p_name != NULL) {
260         p_name += cid_str_size;
261         sscanf(p_name, "%x", &cid);
262     }
263 
264     return cid;
265 }
266 
267 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
268 {
269     char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
270     char *p_name, *tmp_str;
271     BDRVVmdkState *s = bs->opaque;
272     int ret;
273 
274     ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
275     if (ret < 0) {
276         return ret;
277     }
278 
279     desc[DESC_SIZE - 1] = '\0';
280     tmp_str = strstr(desc, "parentCID");
281     if (tmp_str == NULL) {
282         return -EINVAL;
283     }
284 
285     pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
286     p_name = strstr(desc, "CID");
287     if (p_name != NULL) {
288         p_name += sizeof("CID");
289         snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
290         pstrcat(desc, sizeof(desc), tmp_desc);
291     }
292 
293     ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
294     if (ret < 0) {
295         return ret;
296     }
297 
298     return 0;
299 }
300 
301 static int vmdk_is_cid_valid(BlockDriverState *bs)
302 {
303     BDRVVmdkState *s = bs->opaque;
304     BlockDriverState *p_bs = bs->backing_hd;
305     uint32_t cur_pcid;
306 
307     if (!s->cid_checked && p_bs) {
308         cur_pcid = vmdk_read_cid(p_bs, 0);
309         if (s->parent_cid != cur_pcid) {
310             /* CID not valid */
311             return 0;
312         }
313     }
314     s->cid_checked = true;
315     /* CID valid */
316     return 1;
317 }
318 
319 /* Queue extents, if any, for reopen() */
320 static int vmdk_reopen_prepare(BDRVReopenState *state,
321                                BlockReopenQueue *queue, Error **errp)
322 {
323     BDRVVmdkState *s;
324     int ret = -1;
325     int i;
326     VmdkExtent *e;
327 
328     assert(state != NULL);
329     assert(state->bs != NULL);
330 
331     if (queue == NULL) {
332         error_setg(errp, "No reopen queue for VMDK extents");
333         goto exit;
334     }
335 
336     s = state->bs->opaque;
337 
338     assert(s != NULL);
339 
340     for (i = 0; i < s->num_extents; i++) {
341         e = &s->extents[i];
342         if (e->file != state->bs->file) {
343             bdrv_reopen_queue(queue, e->file, state->flags);
344         }
345     }
346     ret = 0;
347 
348 exit:
349     return ret;
350 }
351 
352 static int vmdk_parent_open(BlockDriverState *bs)
353 {
354     char *p_name;
355     char desc[DESC_SIZE + 1];
356     BDRVVmdkState *s = bs->opaque;
357     int ret;
358 
359     desc[DESC_SIZE] = '\0';
360     ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
361     if (ret < 0) {
362         return ret;
363     }
364 
365     p_name = strstr(desc, "parentFileNameHint");
366     if (p_name != NULL) {
367         char *end_name;
368 
369         p_name += sizeof("parentFileNameHint") + 1;
370         end_name = strchr(p_name, '\"');
371         if (end_name == NULL) {
372             return -EINVAL;
373         }
374         if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
375             return -EINVAL;
376         }
377 
378         pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
379     }
380 
381     return 0;
382 }
383 
384 /* Create and append extent to the extent array. Return the added VmdkExtent
385  * address. return NULL if allocation failed. */
386 static int vmdk_add_extent(BlockDriverState *bs,
387                            BlockDriverState *file, bool flat, int64_t sectors,
388                            int64_t l1_offset, int64_t l1_backup_offset,
389                            uint32_t l1_size,
390                            int l2_size, uint64_t cluster_sectors,
391                            VmdkExtent **new_extent,
392                            Error **errp)
393 {
394     VmdkExtent *extent;
395     BDRVVmdkState *s = bs->opaque;
396 
397     if (cluster_sectors > 0x200000) {
398         /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
399         error_setg(errp, "Invalid granularity, image may be corrupt");
400         return -EFBIG;
401     }
402     if (l1_size > 512 * 1024 * 1024) {
403         /* Although with big capacity and small l1_entry_sectors, we can get a
404          * big l1_size, we don't want unbounded value to allocate the table.
405          * Limit it to 512M, which is 16PB for default cluster and L2 table
406          * size */
407         error_setg(errp, "L1 size too big");
408         return -EFBIG;
409     }
410 
411     s->extents = g_realloc(s->extents,
412                               (s->num_extents + 1) * sizeof(VmdkExtent));
413     extent = &s->extents[s->num_extents];
414     s->num_extents++;
415 
416     memset(extent, 0, sizeof(VmdkExtent));
417     extent->file = file;
418     extent->flat = flat;
419     extent->sectors = sectors;
420     extent->l1_table_offset = l1_offset;
421     extent->l1_backup_table_offset = l1_backup_offset;
422     extent->l1_size = l1_size;
423     extent->l1_entry_sectors = l2_size * cluster_sectors;
424     extent->l2_size = l2_size;
425     extent->cluster_sectors = flat ? sectors : cluster_sectors;
426 
427     if (s->num_extents > 1) {
428         extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
429     } else {
430         extent->end_sector = extent->sectors;
431     }
432     bs->total_sectors = extent->end_sector;
433     if (new_extent) {
434         *new_extent = extent;
435     }
436     return 0;
437 }
438 
439 static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
440                             Error **errp)
441 {
442     int ret;
443     int l1_size, i;
444 
445     /* read the L1 table */
446     l1_size = extent->l1_size * sizeof(uint32_t);
447     extent->l1_table = g_malloc(l1_size);
448     ret = bdrv_pread(extent->file,
449                      extent->l1_table_offset,
450                      extent->l1_table,
451                      l1_size);
452     if (ret < 0) {
453         error_setg_errno(errp, -ret,
454                          "Could not read l1 table from extent '%s'",
455                          extent->file->filename);
456         goto fail_l1;
457     }
458     for (i = 0; i < extent->l1_size; i++) {
459         le32_to_cpus(&extent->l1_table[i]);
460     }
461 
462     if (extent->l1_backup_table_offset) {
463         extent->l1_backup_table = g_malloc(l1_size);
464         ret = bdrv_pread(extent->file,
465                          extent->l1_backup_table_offset,
466                          extent->l1_backup_table,
467                          l1_size);
468         if (ret < 0) {
469             error_setg_errno(errp, -ret,
470                              "Could not read l1 backup table from extent '%s'",
471                              extent->file->filename);
472             goto fail_l1b;
473         }
474         for (i = 0; i < extent->l1_size; i++) {
475             le32_to_cpus(&extent->l1_backup_table[i]);
476         }
477     }
478 
479     extent->l2_cache =
480         g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
481     return 0;
482  fail_l1b:
483     g_free(extent->l1_backup_table);
484  fail_l1:
485     g_free(extent->l1_table);
486     return ret;
487 }
488 
489 static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
490                                  BlockDriverState *file,
491                                  int flags, Error **errp)
492 {
493     int ret;
494     uint32_t magic;
495     VMDK3Header header;
496     VmdkExtent *extent;
497 
498     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
499     if (ret < 0) {
500         error_setg_errno(errp, -ret,
501                          "Could not read header from file '%s'",
502                          file->filename);
503         return ret;
504     }
505     ret = vmdk_add_extent(bs, file, false,
506                           le32_to_cpu(header.disk_sectors),
507                           le32_to_cpu(header.l1dir_offset) << 9,
508                           0,
509                           le32_to_cpu(header.l1dir_size),
510                           4096,
511                           le32_to_cpu(header.granularity),
512                           &extent,
513                           errp);
514     if (ret < 0) {
515         return ret;
516     }
517     ret = vmdk_init_tables(bs, extent, errp);
518     if (ret) {
519         /* free extent allocated by vmdk_add_extent */
520         vmdk_free_last_extent(bs);
521     }
522     return ret;
523 }
524 
525 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
526                                uint64_t desc_offset, Error **errp);
527 
528 static int vmdk_open_vmdk4(BlockDriverState *bs,
529                            BlockDriverState *file,
530                            int flags, Error **errp)
531 {
532     int ret;
533     uint32_t magic;
534     uint32_t l1_size, l1_entry_sectors;
535     VMDK4Header header;
536     VmdkExtent *extent;
537     int64_t l1_backup_offset = 0;
538 
539     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
540     if (ret < 0) {
541         error_setg_errno(errp, -ret,
542                          "Could not read header from file '%s'",
543                          file->filename);
544     }
545     if (header.capacity == 0) {
546         uint64_t desc_offset = le64_to_cpu(header.desc_offset);
547         if (desc_offset) {
548             return vmdk_open_desc_file(bs, flags, desc_offset << 9, errp);
549         }
550     }
551 
552     if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
553         /*
554          * The footer takes precedence over the header, so read it in. The
555          * footer starts at offset -1024 from the end: One sector for the
556          * footer, and another one for the end-of-stream marker.
557          */
558         struct {
559             struct {
560                 uint64_t val;
561                 uint32_t size;
562                 uint32_t type;
563                 uint8_t pad[512 - 16];
564             } QEMU_PACKED footer_marker;
565 
566             uint32_t magic;
567             VMDK4Header header;
568             uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
569 
570             struct {
571                 uint64_t val;
572                 uint32_t size;
573                 uint32_t type;
574                 uint8_t pad[512 - 16];
575             } QEMU_PACKED eos_marker;
576         } QEMU_PACKED footer;
577 
578         ret = bdrv_pread(file,
579             bs->file->total_sectors * 512 - 1536,
580             &footer, sizeof(footer));
581         if (ret < 0) {
582             return ret;
583         }
584 
585         /* Some sanity checks for the footer */
586         if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
587             le32_to_cpu(footer.footer_marker.size) != 0  ||
588             le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
589             le64_to_cpu(footer.eos_marker.val) != 0  ||
590             le32_to_cpu(footer.eos_marker.size) != 0  ||
591             le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
592         {
593             return -EINVAL;
594         }
595 
596         header = footer.header;
597     }
598 
599     if (le32_to_cpu(header.version) >= 3) {
600         char buf[64];
601         snprintf(buf, sizeof(buf), "VMDK version %d",
602                  le32_to_cpu(header.version));
603         qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
604                 bs->device_name, "vmdk", buf);
605         return -ENOTSUP;
606     }
607 
608     if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
609         error_report("L2 table size too big");
610         return -EINVAL;
611     }
612 
613     l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
614                         * le64_to_cpu(header.granularity);
615     if (l1_entry_sectors == 0) {
616         return -EINVAL;
617     }
618     l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
619                 / l1_entry_sectors;
620     if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
621         l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
622     }
623     ret = vmdk_add_extent(bs, file, false,
624                           le64_to_cpu(header.capacity),
625                           le64_to_cpu(header.gd_offset) << 9,
626                           l1_backup_offset,
627                           l1_size,
628                           le32_to_cpu(header.num_gtes_per_gt),
629                           le64_to_cpu(header.granularity),
630                           &extent,
631                           errp);
632     if (ret < 0) {
633         return ret;
634     }
635     extent->compressed =
636         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
637     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
638     extent->version = le32_to_cpu(header.version);
639     extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
640     ret = vmdk_init_tables(bs, extent, errp);
641     if (ret) {
642         /* free extent allocated by vmdk_add_extent */
643         vmdk_free_last_extent(bs);
644     }
645     return ret;
646 }
647 
648 /* find an option value out of descriptor file */
649 static int vmdk_parse_description(const char *desc, const char *opt_name,
650         char *buf, int buf_size)
651 {
652     char *opt_pos, *opt_end;
653     const char *end = desc + strlen(desc);
654 
655     opt_pos = strstr(desc, opt_name);
656     if (!opt_pos) {
657         return VMDK_ERROR;
658     }
659     /* Skip "=\"" following opt_name */
660     opt_pos += strlen(opt_name) + 2;
661     if (opt_pos >= end) {
662         return VMDK_ERROR;
663     }
664     opt_end = opt_pos;
665     while (opt_end < end && *opt_end != '"') {
666         opt_end++;
667     }
668     if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
669         return VMDK_ERROR;
670     }
671     pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
672     return VMDK_OK;
673 }
674 
675 /* Open an extent file and append to bs array */
676 static int vmdk_open_sparse(BlockDriverState *bs,
677                             BlockDriverState *file,
678                             int flags, Error **errp)
679 {
680     uint32_t magic;
681 
682     if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) {
683         return -EIO;
684     }
685 
686     magic = be32_to_cpu(magic);
687     switch (magic) {
688         case VMDK3_MAGIC:
689             return vmdk_open_vmfs_sparse(bs, file, flags, errp);
690             break;
691         case VMDK4_MAGIC:
692             return vmdk_open_vmdk4(bs, file, flags, errp);
693             break;
694         default:
695             return -EMEDIUMTYPE;
696             break;
697     }
698 }
699 
700 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
701                               const char *desc_file_path, Error **errp)
702 {
703     int ret;
704     char access[11];
705     char type[11];
706     char fname[512];
707     const char *p = desc;
708     int64_t sectors = 0;
709     int64_t flat_offset;
710     char extent_path[PATH_MAX];
711     BlockDriverState *extent_file;
712 
713     while (*p) {
714         /* parse extent line:
715          * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
716          * or
717          * RW [size in sectors] SPARSE "file-name.vmdk"
718          */
719         flat_offset = -1;
720         ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
721                 access, &sectors, type, fname, &flat_offset);
722         if (ret < 4 || strcmp(access, "RW")) {
723             goto next_line;
724         } else if (!strcmp(type, "FLAT")) {
725             if (ret != 5 || flat_offset < 0) {
726                 error_setg(errp, "Invalid extent lines: \n%s", p);
727                 return -EINVAL;
728             }
729         } else if (!strcmp(type, "VMFS")) {
730             flat_offset = 0;
731         } else if (ret != 4) {
732             error_setg(errp, "Invalid extent lines: \n%s", p);
733             return -EINVAL;
734         }
735 
736         if (sectors <= 0 ||
737             (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
738              strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) ||
739             (strcmp(access, "RW"))) {
740             goto next_line;
741         }
742 
743         path_combine(extent_path, sizeof(extent_path),
744                 desc_file_path, fname);
745         ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags,
746                              errp);
747         if (ret) {
748             return ret;
749         }
750 
751         /* save to extents array */
752         if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
753             /* FLAT extent */
754             VmdkExtent *extent;
755 
756             ret = vmdk_add_extent(bs, extent_file, true, sectors,
757                             0, 0, 0, 0, 0, &extent, errp);
758             if (ret < 0) {
759                 return ret;
760             }
761             extent->flat_start_offset = flat_offset << 9;
762         } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
763             /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
764             ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, errp);
765             if (ret) {
766                 bdrv_unref(extent_file);
767                 return ret;
768             }
769         } else {
770             error_setg(errp, "Unsupported extent type '%s'", type);
771             return -ENOTSUP;
772         }
773 next_line:
774         /* move to next line */
775         while (*p) {
776             if (*p == '\n') {
777                 p++;
778                 break;
779             }
780             p++;
781         }
782     }
783     return 0;
784 }
785 
786 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
787                                uint64_t desc_offset, Error **errp)
788 {
789     int ret;
790     char *buf = NULL;
791     char ct[128];
792     BDRVVmdkState *s = bs->opaque;
793     int64_t size;
794 
795     size = bdrv_getlength(bs->file);
796     if (size < 0) {
797         return -EINVAL;
798     }
799 
800     size = MIN(size, 1 << 20);  /* avoid unbounded allocation */
801     buf = g_malloc0(size + 1);
802 
803     ret = bdrv_pread(bs->file, desc_offset, buf, size);
804     if (ret < 0) {
805         goto exit;
806     }
807     if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
808         ret = -EMEDIUMTYPE;
809         goto exit;
810     }
811     if (strcmp(ct, "monolithicFlat") &&
812         strcmp(ct, "vmfs") &&
813         strcmp(ct, "vmfsSparse") &&
814         strcmp(ct, "twoGbMaxExtentSparse") &&
815         strcmp(ct, "twoGbMaxExtentFlat")) {
816         error_setg(errp, "Unsupported image type '%s'", ct);
817         ret = -ENOTSUP;
818         goto exit;
819     }
820     s->desc_offset = 0;
821     ret = vmdk_parse_extents(buf, bs, bs->file->filename, errp);
822 exit:
823     g_free(buf);
824     return ret;
825 }
826 
827 static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
828                      Error **errp)
829 {
830     int ret;
831     BDRVVmdkState *s = bs->opaque;
832 
833     if (vmdk_open_sparse(bs, bs->file, flags, errp) == 0) {
834         s->desc_offset = 0x200;
835     } else {
836         ret = vmdk_open_desc_file(bs, flags, 0, errp);
837         if (ret) {
838             goto fail;
839         }
840     }
841     /* try to open parent images, if exist */
842     ret = vmdk_parent_open(bs);
843     if (ret) {
844         goto fail;
845     }
846     s->parent_cid = vmdk_read_cid(bs, 1);
847     qemu_co_mutex_init(&s->lock);
848 
849     /* Disable migration when VMDK images are used */
850     error_set(&s->migration_blocker,
851               QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
852               "vmdk", bs->device_name, "live migration");
853     migrate_add_blocker(s->migration_blocker);
854 
855     return 0;
856 
857 fail:
858     vmdk_free_extents(bs);
859     return ret;
860 }
861 
862 static int get_whole_cluster(BlockDriverState *bs,
863                 VmdkExtent *extent,
864                 uint64_t cluster_offset,
865                 uint64_t offset,
866                 bool allocate)
867 {
868     int ret = VMDK_OK;
869     uint8_t *whole_grain = NULL;
870 
871     /* we will be here if it's first write on non-exist grain(cluster).
872      * try to read from parent image, if exist */
873     if (bs->backing_hd) {
874         whole_grain =
875             qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
876         if (!vmdk_is_cid_valid(bs)) {
877             ret = VMDK_ERROR;
878             goto exit;
879         }
880 
881         /* floor offset to cluster */
882         offset -= offset % (extent->cluster_sectors * 512);
883         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
884                 extent->cluster_sectors);
885         if (ret < 0) {
886             ret = VMDK_ERROR;
887             goto exit;
888         }
889 
890         /* Write grain only into the active image */
891         ret = bdrv_write(extent->file, cluster_offset, whole_grain,
892                 extent->cluster_sectors);
893         if (ret < 0) {
894             ret = VMDK_ERROR;
895             goto exit;
896         }
897     }
898 exit:
899     qemu_vfree(whole_grain);
900     return ret;
901 }
902 
903 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
904 {
905     uint32_t offset;
906     QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
907     offset = cpu_to_le32(m_data->offset);
908     /* update L2 table */
909     if (bdrv_pwrite_sync(
910                 extent->file,
911                 ((int64_t)m_data->l2_offset * 512)
912                     + (m_data->l2_index * sizeof(m_data->offset)),
913                 &offset, sizeof(offset)) < 0) {
914         return VMDK_ERROR;
915     }
916     /* update backup L2 table */
917     if (extent->l1_backup_table_offset != 0) {
918         m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
919         if (bdrv_pwrite_sync(
920                     extent->file,
921                     ((int64_t)m_data->l2_offset * 512)
922                         + (m_data->l2_index * sizeof(m_data->offset)),
923                     &offset, sizeof(offset)) < 0) {
924             return VMDK_ERROR;
925         }
926     }
927     if (m_data->l2_cache_entry) {
928         *m_data->l2_cache_entry = offset;
929     }
930 
931     return VMDK_OK;
932 }
933 
934 static int get_cluster_offset(BlockDriverState *bs,
935                                     VmdkExtent *extent,
936                                     VmdkMetaData *m_data,
937                                     uint64_t offset,
938                                     int allocate,
939                                     uint64_t *cluster_offset)
940 {
941     unsigned int l1_index, l2_offset, l2_index;
942     int min_index, i, j;
943     uint32_t min_count, *l2_table;
944     bool zeroed = false;
945 
946     if (m_data) {
947         m_data->valid = 0;
948     }
949     if (extent->flat) {
950         *cluster_offset = extent->flat_start_offset;
951         return VMDK_OK;
952     }
953 
954     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
955     l1_index = (offset >> 9) / extent->l1_entry_sectors;
956     if (l1_index >= extent->l1_size) {
957         return VMDK_ERROR;
958     }
959     l2_offset = extent->l1_table[l1_index];
960     if (!l2_offset) {
961         return VMDK_UNALLOC;
962     }
963     for (i = 0; i < L2_CACHE_SIZE; i++) {
964         if (l2_offset == extent->l2_cache_offsets[i]) {
965             /* increment the hit count */
966             if (++extent->l2_cache_counts[i] == 0xffffffff) {
967                 for (j = 0; j < L2_CACHE_SIZE; j++) {
968                     extent->l2_cache_counts[j] >>= 1;
969                 }
970             }
971             l2_table = extent->l2_cache + (i * extent->l2_size);
972             goto found;
973         }
974     }
975     /* not found: load a new entry in the least used one */
976     min_index = 0;
977     min_count = 0xffffffff;
978     for (i = 0; i < L2_CACHE_SIZE; i++) {
979         if (extent->l2_cache_counts[i] < min_count) {
980             min_count = extent->l2_cache_counts[i];
981             min_index = i;
982         }
983     }
984     l2_table = extent->l2_cache + (min_index * extent->l2_size);
985     if (bdrv_pread(
986                 extent->file,
987                 (int64_t)l2_offset * 512,
988                 l2_table,
989                 extent->l2_size * sizeof(uint32_t)
990             ) != extent->l2_size * sizeof(uint32_t)) {
991         return VMDK_ERROR;
992     }
993 
994     extent->l2_cache_offsets[min_index] = l2_offset;
995     extent->l2_cache_counts[min_index] = 1;
996  found:
997     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
998     *cluster_offset = le32_to_cpu(l2_table[l2_index]);
999 
1000     if (m_data) {
1001         m_data->valid = 1;
1002         m_data->l1_index = l1_index;
1003         m_data->l2_index = l2_index;
1004         m_data->offset = *cluster_offset;
1005         m_data->l2_offset = l2_offset;
1006         m_data->l2_cache_entry = &l2_table[l2_index];
1007     }
1008     if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
1009         zeroed = true;
1010     }
1011 
1012     if (!*cluster_offset || zeroed) {
1013         if (!allocate) {
1014             return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
1015         }
1016 
1017         /* Avoid the L2 tables update for the images that have snapshots. */
1018         *cluster_offset = bdrv_getlength(extent->file);
1019         if (!extent->compressed) {
1020             bdrv_truncate(
1021                 extent->file,
1022                 *cluster_offset + (extent->cluster_sectors << 9)
1023             );
1024         }
1025 
1026         *cluster_offset >>= 9;
1027         l2_table[l2_index] = cpu_to_le32(*cluster_offset);
1028 
1029         /* First of all we write grain itself, to avoid race condition
1030          * that may to corrupt the image.
1031          * This problem may occur because of insufficient space on host disk
1032          * or inappropriate VM shutdown.
1033          */
1034         if (get_whole_cluster(
1035                 bs, extent, *cluster_offset, offset, allocate) == -1) {
1036             return VMDK_ERROR;
1037         }
1038 
1039         if (m_data) {
1040             m_data->offset = *cluster_offset;
1041         }
1042     }
1043     *cluster_offset <<= 9;
1044     return VMDK_OK;
1045 }
1046 
1047 static VmdkExtent *find_extent(BDRVVmdkState *s,
1048                                 int64_t sector_num, VmdkExtent *start_hint)
1049 {
1050     VmdkExtent *extent = start_hint;
1051 
1052     if (!extent) {
1053         extent = &s->extents[0];
1054     }
1055     while (extent < &s->extents[s->num_extents]) {
1056         if (sector_num < extent->end_sector) {
1057             return extent;
1058         }
1059         extent++;
1060     }
1061     return NULL;
1062 }
1063 
1064 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
1065         int64_t sector_num, int nb_sectors, int *pnum)
1066 {
1067     BDRVVmdkState *s = bs->opaque;
1068     int64_t index_in_cluster, n, ret;
1069     uint64_t offset;
1070     VmdkExtent *extent;
1071 
1072     extent = find_extent(s, sector_num, NULL);
1073     if (!extent) {
1074         return 0;
1075     }
1076     qemu_co_mutex_lock(&s->lock);
1077     ret = get_cluster_offset(bs, extent, NULL,
1078                             sector_num * 512, 0, &offset);
1079     qemu_co_mutex_unlock(&s->lock);
1080 
1081     switch (ret) {
1082     case VMDK_ERROR:
1083         ret = -EIO;
1084         break;
1085     case VMDK_UNALLOC:
1086         ret = 0;
1087         break;
1088     case VMDK_ZEROED:
1089         ret = BDRV_BLOCK_ZERO;
1090         break;
1091     case VMDK_OK:
1092         ret = BDRV_BLOCK_DATA;
1093         if (extent->file == bs->file) {
1094             ret |= BDRV_BLOCK_OFFSET_VALID | offset;
1095         }
1096 
1097         break;
1098     }
1099 
1100     index_in_cluster = sector_num % extent->cluster_sectors;
1101     n = extent->cluster_sectors - index_in_cluster;
1102     if (n > nb_sectors) {
1103         n = nb_sectors;
1104     }
1105     *pnum = n;
1106     return ret;
1107 }
1108 
1109 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
1110                             int64_t offset_in_cluster, const uint8_t *buf,
1111                             int nb_sectors, int64_t sector_num)
1112 {
1113     int ret;
1114     VmdkGrainMarker *data = NULL;
1115     uLongf buf_len;
1116     const uint8_t *write_buf = buf;
1117     int write_len = nb_sectors * 512;
1118 
1119     if (extent->compressed) {
1120         if (!extent->has_marker) {
1121             ret = -EINVAL;
1122             goto out;
1123         }
1124         buf_len = (extent->cluster_sectors << 9) * 2;
1125         data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
1126         if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
1127                 buf_len == 0) {
1128             ret = -EINVAL;
1129             goto out;
1130         }
1131         data->lba = sector_num;
1132         data->size = buf_len;
1133         write_buf = (uint8_t *)data;
1134         write_len = buf_len + sizeof(VmdkGrainMarker);
1135     }
1136     ret = bdrv_pwrite(extent->file,
1137                         cluster_offset + offset_in_cluster,
1138                         write_buf,
1139                         write_len);
1140     if (ret != write_len) {
1141         ret = ret < 0 ? ret : -EIO;
1142         goto out;
1143     }
1144     ret = 0;
1145  out:
1146     g_free(data);
1147     return ret;
1148 }
1149 
1150 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
1151                             int64_t offset_in_cluster, uint8_t *buf,
1152                             int nb_sectors)
1153 {
1154     int ret;
1155     int cluster_bytes, buf_bytes;
1156     uint8_t *cluster_buf, *compressed_data;
1157     uint8_t *uncomp_buf;
1158     uint32_t data_len;
1159     VmdkGrainMarker *marker;
1160     uLongf buf_len;
1161 
1162 
1163     if (!extent->compressed) {
1164         ret = bdrv_pread(extent->file,
1165                           cluster_offset + offset_in_cluster,
1166                           buf, nb_sectors * 512);
1167         if (ret == nb_sectors * 512) {
1168             return 0;
1169         } else {
1170             return -EIO;
1171         }
1172     }
1173     cluster_bytes = extent->cluster_sectors * 512;
1174     /* Read two clusters in case GrainMarker + compressed data > one cluster */
1175     buf_bytes = cluster_bytes * 2;
1176     cluster_buf = g_malloc(buf_bytes);
1177     uncomp_buf = g_malloc(cluster_bytes);
1178     ret = bdrv_pread(extent->file,
1179                 cluster_offset,
1180                 cluster_buf, buf_bytes);
1181     if (ret < 0) {
1182         goto out;
1183     }
1184     compressed_data = cluster_buf;
1185     buf_len = cluster_bytes;
1186     data_len = cluster_bytes;
1187     if (extent->has_marker) {
1188         marker = (VmdkGrainMarker *)cluster_buf;
1189         compressed_data = marker->data;
1190         data_len = le32_to_cpu(marker->size);
1191     }
1192     if (!data_len || data_len > buf_bytes) {
1193         ret = -EINVAL;
1194         goto out;
1195     }
1196     ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
1197     if (ret != Z_OK) {
1198         ret = -EINVAL;
1199         goto out;
1200 
1201     }
1202     if (offset_in_cluster < 0 ||
1203             offset_in_cluster + nb_sectors * 512 > buf_len) {
1204         ret = -EINVAL;
1205         goto out;
1206     }
1207     memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
1208     ret = 0;
1209 
1210  out:
1211     g_free(uncomp_buf);
1212     g_free(cluster_buf);
1213     return ret;
1214 }
1215 
1216 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
1217                     uint8_t *buf, int nb_sectors)
1218 {
1219     BDRVVmdkState *s = bs->opaque;
1220     int ret;
1221     uint64_t n, index_in_cluster;
1222     uint64_t extent_begin_sector, extent_relative_sector_num;
1223     VmdkExtent *extent = NULL;
1224     uint64_t cluster_offset;
1225 
1226     while (nb_sectors > 0) {
1227         extent = find_extent(s, sector_num, extent);
1228         if (!extent) {
1229             return -EIO;
1230         }
1231         ret = get_cluster_offset(
1232                             bs, extent, NULL,
1233                             sector_num << 9, 0, &cluster_offset);
1234         extent_begin_sector = extent->end_sector - extent->sectors;
1235         extent_relative_sector_num = sector_num - extent_begin_sector;
1236         index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
1237         n = extent->cluster_sectors - index_in_cluster;
1238         if (n > nb_sectors) {
1239             n = nb_sectors;
1240         }
1241         if (ret != VMDK_OK) {
1242             /* if not allocated, try to read from parent image, if exist */
1243             if (bs->backing_hd && ret != VMDK_ZEROED) {
1244                 if (!vmdk_is_cid_valid(bs)) {
1245                     return -EINVAL;
1246                 }
1247                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
1248                 if (ret < 0) {
1249                     return ret;
1250                 }
1251             } else {
1252                 memset(buf, 0, 512 * n);
1253             }
1254         } else {
1255             ret = vmdk_read_extent(extent,
1256                             cluster_offset, index_in_cluster * 512,
1257                             buf, n);
1258             if (ret) {
1259                 return ret;
1260             }
1261         }
1262         nb_sectors -= n;
1263         sector_num += n;
1264         buf += n * 512;
1265     }
1266     return 0;
1267 }
1268 
1269 static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
1270                                      uint8_t *buf, int nb_sectors)
1271 {
1272     int ret;
1273     BDRVVmdkState *s = bs->opaque;
1274     qemu_co_mutex_lock(&s->lock);
1275     ret = vmdk_read(bs, sector_num, buf, nb_sectors);
1276     qemu_co_mutex_unlock(&s->lock);
1277     return ret;
1278 }
1279 
1280 /**
1281  * vmdk_write:
1282  * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
1283  *                if possible, otherwise return -ENOTSUP.
1284  * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
1285  *                with each cluster. By dry run we can find if the zero write
1286  *                is possible without modifying image data.
1287  *
1288  * Returns: error code with 0 for success.
1289  */
1290 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
1291                       const uint8_t *buf, int nb_sectors,
1292                       bool zeroed, bool zero_dry_run)
1293 {
1294     BDRVVmdkState *s = bs->opaque;
1295     VmdkExtent *extent = NULL;
1296     int n, ret;
1297     int64_t index_in_cluster;
1298     uint64_t extent_begin_sector, extent_relative_sector_num;
1299     uint64_t cluster_offset;
1300     VmdkMetaData m_data;
1301 
1302     if (sector_num > bs->total_sectors) {
1303         error_report("Wrong offset: sector_num=0x%" PRIx64
1304                 " total_sectors=0x%" PRIx64 "\n",
1305                 sector_num, bs->total_sectors);
1306         return -EIO;
1307     }
1308 
1309     while (nb_sectors > 0) {
1310         extent = find_extent(s, sector_num, extent);
1311         if (!extent) {
1312             return -EIO;
1313         }
1314         ret = get_cluster_offset(
1315                                 bs,
1316                                 extent,
1317                                 &m_data,
1318                                 sector_num << 9, !extent->compressed,
1319                                 &cluster_offset);
1320         if (extent->compressed) {
1321             if (ret == VMDK_OK) {
1322                 /* Refuse write to allocated cluster for streamOptimized */
1323                 error_report("Could not write to allocated cluster"
1324                               " for streamOptimized");
1325                 return -EIO;
1326             } else {
1327                 /* allocate */
1328                 ret = get_cluster_offset(
1329                                         bs,
1330                                         extent,
1331                                         &m_data,
1332                                         sector_num << 9, 1,
1333                                         &cluster_offset);
1334             }
1335         }
1336         if (ret == VMDK_ERROR) {
1337             return -EINVAL;
1338         }
1339         extent_begin_sector = extent->end_sector - extent->sectors;
1340         extent_relative_sector_num = sector_num - extent_begin_sector;
1341         index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
1342         n = extent->cluster_sectors - index_in_cluster;
1343         if (n > nb_sectors) {
1344             n = nb_sectors;
1345         }
1346         if (zeroed) {
1347             /* Do zeroed write, buf is ignored */
1348             if (extent->has_zero_grain &&
1349                     index_in_cluster == 0 &&
1350                     n >= extent->cluster_sectors) {
1351                 n = extent->cluster_sectors;
1352                 if (!zero_dry_run) {
1353                     m_data.offset = VMDK_GTE_ZEROED;
1354                     /* update L2 tables */
1355                     if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
1356                         return -EIO;
1357                     }
1358                 }
1359             } else {
1360                 return -ENOTSUP;
1361             }
1362         } else {
1363             ret = vmdk_write_extent(extent,
1364                             cluster_offset, index_in_cluster * 512,
1365                             buf, n, sector_num);
1366             if (ret) {
1367                 return ret;
1368             }
1369             if (m_data.valid) {
1370                 /* update L2 tables */
1371                 if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
1372                     return -EIO;
1373                 }
1374             }
1375         }
1376         nb_sectors -= n;
1377         sector_num += n;
1378         buf += n * 512;
1379 
1380         /* update CID on the first write every time the virtual disk is
1381          * opened */
1382         if (!s->cid_updated) {
1383             ret = vmdk_write_cid(bs, time(NULL));
1384             if (ret < 0) {
1385                 return ret;
1386             }
1387             s->cid_updated = true;
1388         }
1389     }
1390     return 0;
1391 }
1392 
1393 static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
1394                                       const uint8_t *buf, int nb_sectors)
1395 {
1396     int ret;
1397     BDRVVmdkState *s = bs->opaque;
1398     qemu_co_mutex_lock(&s->lock);
1399     ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
1400     qemu_co_mutex_unlock(&s->lock);
1401     return ret;
1402 }
1403 
1404 static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
1405                                              int64_t sector_num,
1406                                              int nb_sectors)
1407 {
1408     int ret;
1409     BDRVVmdkState *s = bs->opaque;
1410     qemu_co_mutex_lock(&s->lock);
1411     /* write zeroes could fail if sectors not aligned to cluster, test it with
1412      * dry_run == true before really updating image */
1413     ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
1414     if (!ret) {
1415         ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
1416     }
1417     qemu_co_mutex_unlock(&s->lock);
1418     return ret;
1419 }
1420 
1421 static int vmdk_create_extent(const char *filename, int64_t filesize,
1422                               bool flat, bool compress, bool zeroed_grain)
1423 {
1424     int ret, i;
1425     int fd = 0;
1426     VMDK4Header header;
1427     uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
1428 
1429     fd = qemu_open(filename,
1430                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1431                    0644);
1432     if (fd < 0) {
1433         return -errno;
1434     }
1435     if (flat) {
1436         ret = ftruncate(fd, filesize);
1437         if (ret < 0) {
1438             ret = -errno;
1439         }
1440         goto exit;
1441     }
1442     magic = cpu_to_be32(VMDK4_MAGIC);
1443     memset(&header, 0, sizeof(header));
1444     header.version = zeroed_grain ? 2 : 1;
1445     header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
1446                    | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
1447                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
1448     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1449     header.capacity = filesize / 512;
1450     header.granularity = 128;
1451     header.num_gtes_per_gt = 512;
1452 
1453     grains = (filesize / 512 + header.granularity - 1) / header.granularity;
1454     gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9;
1455     gt_count =
1456         (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt;
1457     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
1458 
1459     header.desc_offset = 1;
1460     header.desc_size = 20;
1461     header.rgd_offset = header.desc_offset + header.desc_size;
1462     header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
1463     header.grain_offset =
1464        ((header.gd_offset + gd_size + (gt_size * gt_count) +
1465          header.granularity - 1) / header.granularity) *
1466         header.granularity;
1467     /* swap endianness for all header fields */
1468     header.version = cpu_to_le32(header.version);
1469     header.flags = cpu_to_le32(header.flags);
1470     header.capacity = cpu_to_le64(header.capacity);
1471     header.granularity = cpu_to_le64(header.granularity);
1472     header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
1473     header.desc_offset = cpu_to_le64(header.desc_offset);
1474     header.desc_size = cpu_to_le64(header.desc_size);
1475     header.rgd_offset = cpu_to_le64(header.rgd_offset);
1476     header.gd_offset = cpu_to_le64(header.gd_offset);
1477     header.grain_offset = cpu_to_le64(header.grain_offset);
1478     header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1479 
1480     header.check_bytes[0] = 0xa;
1481     header.check_bytes[1] = 0x20;
1482     header.check_bytes[2] = 0xd;
1483     header.check_bytes[3] = 0xa;
1484 
1485     /* write all the data */
1486     ret = qemu_write_full(fd, &magic, sizeof(magic));
1487     if (ret != sizeof(magic)) {
1488         ret = -errno;
1489         goto exit;
1490     }
1491     ret = qemu_write_full(fd, &header, sizeof(header));
1492     if (ret != sizeof(header)) {
1493         ret = -errno;
1494         goto exit;
1495     }
1496 
1497     ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
1498     if (ret < 0) {
1499         ret = -errno;
1500         goto exit;
1501     }
1502 
1503     /* write grain directory */
1504     lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
1505     for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
1506          i < gt_count; i++, tmp += gt_size) {
1507         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1508         if (ret != sizeof(tmp)) {
1509             ret = -errno;
1510             goto exit;
1511         }
1512     }
1513 
1514     /* write backup grain directory */
1515     lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
1516     for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
1517          i < gt_count; i++, tmp += gt_size) {
1518         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1519         if (ret != sizeof(tmp)) {
1520             ret = -errno;
1521             goto exit;
1522         }
1523     }
1524 
1525     ret = 0;
1526  exit:
1527     qemu_close(fd);
1528     return ret;
1529 }
1530 
1531 static int filename_decompose(const char *filename, char *path, char *prefix,
1532                               char *postfix, size_t buf_len, Error **errp)
1533 {
1534     const char *p, *q;
1535 
1536     if (filename == NULL || !strlen(filename)) {
1537         error_setg(errp, "No filename provided");
1538         return VMDK_ERROR;
1539     }
1540     p = strrchr(filename, '/');
1541     if (p == NULL) {
1542         p = strrchr(filename, '\\');
1543     }
1544     if (p == NULL) {
1545         p = strrchr(filename, ':');
1546     }
1547     if (p != NULL) {
1548         p++;
1549         if (p - filename >= buf_len) {
1550             return VMDK_ERROR;
1551         }
1552         pstrcpy(path, p - filename + 1, filename);
1553     } else {
1554         p = filename;
1555         path[0] = '\0';
1556     }
1557     q = strrchr(p, '.');
1558     if (q == NULL) {
1559         pstrcpy(prefix, buf_len, p);
1560         postfix[0] = '\0';
1561     } else {
1562         if (q - p >= buf_len) {
1563             return VMDK_ERROR;
1564         }
1565         pstrcpy(prefix, q - p + 1, p);
1566         pstrcpy(postfix, buf_len, q);
1567     }
1568     return VMDK_OK;
1569 }
1570 
1571 static int vmdk_create(const char *filename, QEMUOptionParameter *options,
1572                        Error **errp)
1573 {
1574     int fd, idx = 0;
1575     char desc[BUF_SIZE];
1576     int64_t total_size = 0, filesize;
1577     const char *adapter_type = NULL;
1578     const char *backing_file = NULL;
1579     const char *fmt = NULL;
1580     int flags = 0;
1581     int ret = 0;
1582     bool flat, split, compress;
1583     char ext_desc_lines[BUF_SIZE] = "";
1584     char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX];
1585     const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
1586     const char *desc_extent_line;
1587     char parent_desc_line[BUF_SIZE] = "";
1588     uint32_t parent_cid = 0xffffffff;
1589     uint32_t number_heads = 16;
1590     bool zeroed_grain = false;
1591     const char desc_template[] =
1592         "# Disk DescriptorFile\n"
1593         "version=1\n"
1594         "CID=%x\n"
1595         "parentCID=%x\n"
1596         "createType=\"%s\"\n"
1597         "%s"
1598         "\n"
1599         "# Extent description\n"
1600         "%s"
1601         "\n"
1602         "# The Disk Data Base\n"
1603         "#DDB\n"
1604         "\n"
1605         "ddb.virtualHWVersion = \"%d\"\n"
1606         "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
1607         "ddb.geometry.heads = \"%d\"\n"
1608         "ddb.geometry.sectors = \"63\"\n"
1609         "ddb.adapterType = \"%s\"\n";
1610 
1611     if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
1612         return -EINVAL;
1613     }
1614     /* Read out options */
1615     while (options && options->name) {
1616         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1617             total_size = options->value.n;
1618         } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) {
1619             adapter_type = options->value.s;
1620         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1621             backing_file = options->value.s;
1622         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
1623             flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
1624         } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
1625             fmt = options->value.s;
1626         } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) {
1627             zeroed_grain |= options->value.n;
1628         }
1629         options++;
1630     }
1631     if (!adapter_type) {
1632         adapter_type = "ide";
1633     } else if (strcmp(adapter_type, "ide") &&
1634                strcmp(adapter_type, "buslogic") &&
1635                strcmp(adapter_type, "lsilogic") &&
1636                strcmp(adapter_type, "legacyESX")) {
1637         error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
1638         return -EINVAL;
1639     }
1640     if (strcmp(adapter_type, "ide") != 0) {
1641         /* that's the number of heads with which vmware operates when
1642            creating, exporting, etc. vmdk files with a non-ide adapter type */
1643         number_heads = 255;
1644     }
1645     if (!fmt) {
1646         /* Default format to monolithicSparse */
1647         fmt = "monolithicSparse";
1648     } else if (strcmp(fmt, "monolithicFlat") &&
1649                strcmp(fmt, "monolithicSparse") &&
1650                strcmp(fmt, "twoGbMaxExtentSparse") &&
1651                strcmp(fmt, "twoGbMaxExtentFlat") &&
1652                strcmp(fmt, "streamOptimized")) {
1653         error_setg(errp, "Unknown subformat: '%s'", fmt);
1654         return -EINVAL;
1655     }
1656     split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
1657               strcmp(fmt, "twoGbMaxExtentSparse"));
1658     flat = !(strcmp(fmt, "monolithicFlat") &&
1659              strcmp(fmt, "twoGbMaxExtentFlat"));
1660     compress = !strcmp(fmt, "streamOptimized");
1661     if (flat) {
1662         desc_extent_line = "RW %lld FLAT \"%s\" 0\n";
1663     } else {
1664         desc_extent_line = "RW %lld SPARSE \"%s\"\n";
1665     }
1666     if (flat && backing_file) {
1667         error_setg(errp, "Flat image can't have backing file");
1668         return -ENOTSUP;
1669     }
1670     if (flat && zeroed_grain) {
1671         error_setg(errp, "Flat image can't enable zeroed grain");
1672         return -ENOTSUP;
1673     }
1674     if (backing_file) {
1675         BlockDriverState *bs = bdrv_new("");
1676         ret = bdrv_open(bs, backing_file, NULL, 0, NULL, errp);
1677         if (ret != 0) {
1678             bdrv_unref(bs);
1679             return ret;
1680         }
1681         if (strcmp(bs->drv->format_name, "vmdk")) {
1682             bdrv_unref(bs);
1683             return -EINVAL;
1684         }
1685         parent_cid = vmdk_read_cid(bs, 0);
1686         bdrv_unref(bs);
1687         snprintf(parent_desc_line, sizeof(parent_desc_line),
1688                 "parentFileNameHint=\"%s\"", backing_file);
1689     }
1690 
1691     /* Create extents */
1692     filesize = total_size;
1693     while (filesize > 0) {
1694         char desc_line[BUF_SIZE];
1695         char ext_filename[PATH_MAX];
1696         char desc_filename[PATH_MAX];
1697         int64_t size = filesize;
1698 
1699         if (split && size > split_size) {
1700             size = split_size;
1701         }
1702         if (split) {
1703             snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s",
1704                     prefix, flat ? 'f' : 's', ++idx, postfix);
1705         } else if (flat) {
1706             snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s",
1707                     prefix, postfix);
1708         } else {
1709             snprintf(desc_filename, sizeof(desc_filename), "%s%s",
1710                     prefix, postfix);
1711         }
1712         snprintf(ext_filename, sizeof(ext_filename), "%s%s",
1713                 path, desc_filename);
1714 
1715         if (vmdk_create_extent(ext_filename, size,
1716                                flat, compress, zeroed_grain)) {
1717             return -EINVAL;
1718         }
1719         filesize -= size;
1720 
1721         /* Format description line */
1722         snprintf(desc_line, sizeof(desc_line),
1723                     desc_extent_line, size / 512, desc_filename);
1724         pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line);
1725     }
1726     /* generate descriptor file */
1727     snprintf(desc, sizeof(desc), desc_template,
1728             (unsigned int)time(NULL),
1729             parent_cid,
1730             fmt,
1731             parent_desc_line,
1732             ext_desc_lines,
1733             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
1734             total_size / (int64_t)(63 * number_heads * 512), number_heads,
1735                 adapter_type);
1736     if (split || flat) {
1737         fd = qemu_open(filename,
1738                        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1739                        0644);
1740     } else {
1741         fd = qemu_open(filename,
1742                        O_WRONLY | O_BINARY | O_LARGEFILE,
1743                        0644);
1744     }
1745     if (fd < 0) {
1746         return -errno;
1747     }
1748     /* the descriptor offset = 0x200 */
1749     if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
1750         ret = -errno;
1751         goto exit;
1752     }
1753     ret = qemu_write_full(fd, desc, strlen(desc));
1754     if (ret != strlen(desc)) {
1755         ret = -errno;
1756         goto exit;
1757     }
1758     ret = 0;
1759 exit:
1760     qemu_close(fd);
1761     return ret;
1762 }
1763 
1764 static void vmdk_close(BlockDriverState *bs)
1765 {
1766     BDRVVmdkState *s = bs->opaque;
1767 
1768     vmdk_free_extents(bs);
1769 
1770     migrate_del_blocker(s->migration_blocker);
1771     error_free(s->migration_blocker);
1772 }
1773 
1774 static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
1775 {
1776     BDRVVmdkState *s = bs->opaque;
1777     int i, err;
1778     int ret = 0;
1779 
1780     for (i = 0; i < s->num_extents; i++) {
1781         err = bdrv_co_flush(s->extents[i].file);
1782         if (err < 0) {
1783             ret = err;
1784         }
1785     }
1786     return ret;
1787 }
1788 
1789 static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
1790 {
1791     int i;
1792     int64_t ret = 0;
1793     int64_t r;
1794     BDRVVmdkState *s = bs->opaque;
1795 
1796     ret = bdrv_get_allocated_file_size(bs->file);
1797     if (ret < 0) {
1798         return ret;
1799     }
1800     for (i = 0; i < s->num_extents; i++) {
1801         if (s->extents[i].file == bs->file) {
1802             continue;
1803         }
1804         r = bdrv_get_allocated_file_size(s->extents[i].file);
1805         if (r < 0) {
1806             return r;
1807         }
1808         ret += r;
1809     }
1810     return ret;
1811 }
1812 
1813 static int vmdk_has_zero_init(BlockDriverState *bs)
1814 {
1815     int i;
1816     BDRVVmdkState *s = bs->opaque;
1817 
1818     /* If has a flat extent and its underlying storage doesn't have zero init,
1819      * return 0. */
1820     for (i = 0; i < s->num_extents; i++) {
1821         if (s->extents[i].flat) {
1822             if (!bdrv_has_zero_init(s->extents[i].file)) {
1823                 return 0;
1824             }
1825         }
1826     }
1827     return 1;
1828 }
1829 
1830 static QEMUOptionParameter vmdk_create_options[] = {
1831     {
1832         .name = BLOCK_OPT_SIZE,
1833         .type = OPT_SIZE,
1834         .help = "Virtual disk size"
1835     },
1836     {
1837         .name = BLOCK_OPT_ADAPTER_TYPE,
1838         .type = OPT_STRING,
1839         .help = "Virtual adapter type, can be one of "
1840                 "ide (default), lsilogic, buslogic or legacyESX"
1841     },
1842     {
1843         .name = BLOCK_OPT_BACKING_FILE,
1844         .type = OPT_STRING,
1845         .help = "File name of a base image"
1846     },
1847     {
1848         .name = BLOCK_OPT_COMPAT6,
1849         .type = OPT_FLAG,
1850         .help = "VMDK version 6 image"
1851     },
1852     {
1853         .name = BLOCK_OPT_SUBFMT,
1854         .type = OPT_STRING,
1855         .help =
1856             "VMDK flat extent format, can be one of "
1857             "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
1858     },
1859     {
1860         .name = BLOCK_OPT_ZEROED_GRAIN,
1861         .type = OPT_FLAG,
1862         .help = "Enable efficient zero writes using the zeroed-grain GTE feature"
1863     },
1864     { NULL }
1865 };
1866 
1867 static BlockDriver bdrv_vmdk = {
1868     .format_name                  = "vmdk",
1869     .instance_size                = sizeof(BDRVVmdkState),
1870     .bdrv_probe                   = vmdk_probe,
1871     .bdrv_open                    = vmdk_open,
1872     .bdrv_reopen_prepare          = vmdk_reopen_prepare,
1873     .bdrv_read                    = vmdk_co_read,
1874     .bdrv_write                   = vmdk_co_write,
1875     .bdrv_co_write_zeroes         = vmdk_co_write_zeroes,
1876     .bdrv_close                   = vmdk_close,
1877     .bdrv_create                  = vmdk_create,
1878     .bdrv_co_flush_to_disk        = vmdk_co_flush,
1879     .bdrv_co_get_block_status     = vmdk_co_get_block_status,
1880     .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
1881     .bdrv_has_zero_init           = vmdk_has_zero_init,
1882 
1883     .create_options               = vmdk_create_options,
1884 };
1885 
1886 static void bdrv_vmdk_init(void)
1887 {
1888     bdrv_register(&bdrv_vmdk);
1889 }
1890 
1891 block_init(bdrv_vmdk_init);
1892