xref: /openbmc/qemu/block/vmdk.c (revision f24582d6)
1 /*
2  * Block driver for the VMDK format
3  *
4  * Copyright (c) 2004 Fabrice Bellard
5  * Copyright (c) 2005 Filip Navara
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu-common.h"
27 #include "block_int.h"
28 #include "module.h"
29 #include "migration.h"
30 #include <zlib.h>
31 
/* Magic numbers of the two sparse header layouts.  vmdk_probe() and
 * vmdk_open_sparse() compare them against the first four bytes of the
 * image interpreted as a big-endian 32-bit value. */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
/* VMDK4Header.compressAlgorithm value that marks an extent as compressed */
#define VMDK4_COMPRESSION_DEFLATE 1
/* Bits tested in VMDK4Header.flags */
/* rgd_offset is used as the backup L1 table location */
#define VMDK4_FLAG_RGD (1 << 1)
#define VMDK4_FLAG_COMPRESS (1 << 16)
/* the extent is opened with has_marker set (grain data follows a marker) */
#define VMDK4_FLAG_MARKER (1 << 17)
38 
/* On-disk header of a VMDK3 ("COWD") sparse image.  vmdk_open_vmdk3() reads
 * it from the file right after the 4-byte magic and decodes the fields it
 * uses with le32_to_cpu(). */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* used as the extent size in sectors */
    uint32_t granularity;       /* used as the cluster size in sectors */
    uint32_t l1dir_offset;      /* L1 table position, in 512-byte sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
51 
/* On-disk header of a VMDK4 ("KDMV") sparse image.  vmdk_open_vmdk4() reads
 * it from the file right after the 4-byte magic; the struct is packed so
 * that it can be filled directly from the file contents. */
typedef struct {
    uint32_t version;
    uint32_t flags;             /* VMDK4_FLAG_* bits */
    int64_t capacity;           /* used as the extent size in sectors */
    int64_t granularity;        /* used as the cluster size in sectors */
    int64_t desc_offset;        /* embedded descriptor position, in sectors */
    int64_t desc_size;
    int32_t num_gtes_per_gte;   /* used as the number of entries per L2 table */
    int64_t gd_offset;          /* L1 table position, in sectors */
    int64_t rgd_offset;         /* backup L1 table position, in sectors */
    int64_t grain_offset;
    char filler[1];
    char check_bytes[4];
    uint16_t compressAlgorithm; /* VMDK4_COMPRESSION_DEFLATE => compressed */
} QEMU_PACKED VMDK4Header;
67 
/* Number of L2 tables kept in each extent's in-memory cache */
#define L2_CACHE_SIZE 16

/* In-memory state of one extent (one backing file region) of the image. */
typedef struct VmdkExtent {
    BlockDriverState *file;     /* file the extent's data and tables live in */
    bool flat;                  /* FLAT extent: no L1/L2 lookup is done */
    bool compressed;            /* header selected deflate compression */
    bool has_marker;            /* header had VMDK4_FLAG_MARKER set */
    int64_t sectors;            /* size of this extent, in sectors */
    int64_t end_sector;         /* previous extent's end_sector + sectors */
    int64_t flat_start_offset;  /* byte offset of the data of a FLAT extent */
    int64_t l1_table_offset;    /* byte offset of the L1 table in file */
    int64_t l1_backup_table_offset; /* byte offset of backup L1, 0 if none */
    uint32_t *l1_table;         /* L1 table, converted to host byte order */
    uint32_t *l1_backup_table;  /* backup L1 table, host byte order */
    unsigned int l1_size;       /* number of L1 entries */
    uint32_t l1_entry_sectors;  /* l2_size * cluster_sectors */

    unsigned int l2_size;       /* number of entries in one L2 table */
    uint32_t *l2_cache;         /* L2_CACHE_SIZE tables of l2_size entries */
    /* L1 entry value (L2 table location) held by each cache slot */
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
    /* hit counter of each cache slot; the lowest one is replaced on a miss */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

    unsigned int cluster_sectors;   /* cluster (grain) size, in sectors */
} VmdkExtent;
92 
/* Per-image driver state, stored in bs->opaque. */
typedef struct BDRVVmdkState {
    CoMutex lock;               /* taken around get_cluster_offset() lookups */
    /* byte offset of the text descriptor inside bs->file: vmdk_open() uses
     * 0x200 for sparse images, vmdk_open_desc_file() sets 0 */
    int desc_offset;
    bool cid_updated;
    uint32_t parent_cid;        /* parentCID read from the descriptor at open */
    int num_extents;
    /* Extent array with num_extents entries, ascend ordered by address */
    VmdkExtent *extents;
    Error *migration_blocker;   /* registered at open to disable migration */
} BDRVVmdkState;
103 
/* Description of a freshly allocated cluster, filled in by
 * get_cluster_offset() and consumed by vmdk_L2update() to write the new
 * L2 entry to disk. */
typedef struct VmdkMetaData {
    uint32_t offset;            /* new L2 entry value, already little-endian */
    unsigned int l1_index;      /* index into the L1 table */
    unsigned int l2_index;      /* index into the L2 table */
    unsigned int l2_offset;     /* L2 table location, in 512-byte units */
    int valid;                  /* 1 only when a cluster was allocated */
} VmdkMetaData;
111 
112 typedef struct VmdkGrainMarker {
113     uint64_t lba;
114     uint32_t size;
115     uint8_t  data[0];
116 } VmdkGrainMarker;
117 
118 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
119 {
120     uint32_t magic;
121 
122     if (buf_size < 4) {
123         return 0;
124     }
125     magic = be32_to_cpu(*(uint32_t *)buf);
126     if (magic == VMDK3_MAGIC ||
127         magic == VMDK4_MAGIC) {
128         return 100;
129     } else {
130         const char *p = (const char *)buf;
131         const char *end = p + buf_size;
132         while (p < end) {
133             if (*p == '#') {
134                 /* skip comment line */
135                 while (p < end && *p != '\n') {
136                     p++;
137                 }
138                 p++;
139                 continue;
140             }
141             if (*p == ' ') {
142                 while (p < end && *p == ' ') {
143                     p++;
144                 }
145                 /* skip '\r' if windows line endings used. */
146                 if (p < end && *p == '\r') {
147                     p++;
148                 }
149                 /* only accept blank lines before 'version=' line */
150                 if (p == end || *p != '\n') {
151                     return 0;
152                 }
153                 p++;
154                 continue;
155             }
156             if (end - p >= strlen("version=X\n")) {
157                 if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
158                     strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
159                     return 100;
160                 }
161             }
162             if (end - p >= strlen("version=X\r\n")) {
163                 if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
164                     strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
165                     return 100;
166                 }
167             }
168             return 0;
169         }
170         return 0;
171     }
172 }
173 
/* When defined, vmdk_is_cid_valid() compares the stored parent CID with the
 * CID read from the backing image; otherwise it always reports "valid". */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/* Size of the buffer used to read/write the text descriptor */
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define BUF_SIZE 4096
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
180 
181 static void vmdk_free_extents(BlockDriverState *bs)
182 {
183     int i;
184     BDRVVmdkState *s = bs->opaque;
185     VmdkExtent *e;
186 
187     for (i = 0; i < s->num_extents; i++) {
188         e = &s->extents[i];
189         g_free(e->l1_table);
190         g_free(e->l2_cache);
191         g_free(e->l1_backup_table);
192         if (e->file != bs->file) {
193             bdrv_delete(e->file);
194         }
195     }
196     g_free(s->extents);
197 }
198 
199 static void vmdk_free_last_extent(BlockDriverState *bs)
200 {
201     BDRVVmdkState *s = bs->opaque;
202 
203     if (s->num_extents == 0) {
204         return;
205     }
206     s->num_extents--;
207     s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent));
208 }
209 
210 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
211 {
212     char desc[DESC_SIZE];
213     uint32_t cid = 0xffffffff;
214     const char *p_name, *cid_str;
215     size_t cid_str_size;
216     BDRVVmdkState *s = bs->opaque;
217     int ret;
218 
219     ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
220     if (ret < 0) {
221         return 0;
222     }
223 
224     if (parent) {
225         cid_str = "parentCID";
226         cid_str_size = sizeof("parentCID");
227     } else {
228         cid_str = "CID";
229         cid_str_size = sizeof("CID");
230     }
231 
232     desc[DESC_SIZE - 1] = '\0';
233     p_name = strstr(desc, cid_str);
234     if (p_name != NULL) {
235         p_name += cid_str_size;
236         sscanf(p_name, "%x", &cid);
237     }
238 
239     return cid;
240 }
241 
242 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
243 {
244     char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
245     char *p_name, *tmp_str;
246     BDRVVmdkState *s = bs->opaque;
247     int ret;
248 
249     ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
250     if (ret < 0) {
251         return ret;
252     }
253 
254     desc[DESC_SIZE - 1] = '\0';
255     tmp_str = strstr(desc, "parentCID");
256     if (tmp_str == NULL) {
257         return -EINVAL;
258     }
259 
260     pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
261     p_name = strstr(desc, "CID");
262     if (p_name != NULL) {
263         p_name += sizeof("CID");
264         snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
265         pstrcat(desc, sizeof(desc), tmp_desc);
266     }
267 
268     ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
269     if (ret < 0) {
270         return ret;
271     }
272 
273     return 0;
274 }
275 
276 static int vmdk_is_cid_valid(BlockDriverState *bs)
277 {
278 #ifdef CHECK_CID
279     BDRVVmdkState *s = bs->opaque;
280     BlockDriverState *p_bs = bs->backing_hd;
281     uint32_t cur_pcid;
282 
283     if (p_bs) {
284         cur_pcid = vmdk_read_cid(p_bs, 0);
285         if (s->parent_cid != cur_pcid) {
286             /* CID not valid */
287             return 0;
288         }
289     }
290 #endif
291     /* CID valid */
292     return 1;
293 }
294 
295 static int vmdk_parent_open(BlockDriverState *bs)
296 {
297     char *p_name;
298     char desc[DESC_SIZE + 1];
299     BDRVVmdkState *s = bs->opaque;
300     int ret;
301 
302     desc[DESC_SIZE] = '\0';
303     ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
304     if (ret < 0) {
305         return ret;
306     }
307 
308     p_name = strstr(desc, "parentFileNameHint");
309     if (p_name != NULL) {
310         char *end_name;
311 
312         p_name += sizeof("parentFileNameHint") + 1;
313         end_name = strchr(p_name, '\"');
314         if (end_name == NULL) {
315             return -EINVAL;
316         }
317         if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
318             return -EINVAL;
319         }
320 
321         pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
322     }
323 
324     return 0;
325 }
326 
327 /* Create and append extent to the extent array. Return the added VmdkExtent
328  * address. return NULL if allocation failed. */
329 static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
330                            BlockDriverState *file, bool flat, int64_t sectors,
331                            int64_t l1_offset, int64_t l1_backup_offset,
332                            uint32_t l1_size,
333                            int l2_size, unsigned int cluster_sectors)
334 {
335     VmdkExtent *extent;
336     BDRVVmdkState *s = bs->opaque;
337 
338     s->extents = g_realloc(s->extents,
339                               (s->num_extents + 1) * sizeof(VmdkExtent));
340     extent = &s->extents[s->num_extents];
341     s->num_extents++;
342 
343     memset(extent, 0, sizeof(VmdkExtent));
344     extent->file = file;
345     extent->flat = flat;
346     extent->sectors = sectors;
347     extent->l1_table_offset = l1_offset;
348     extent->l1_backup_table_offset = l1_backup_offset;
349     extent->l1_size = l1_size;
350     extent->l1_entry_sectors = l2_size * cluster_sectors;
351     extent->l2_size = l2_size;
352     extent->cluster_sectors = cluster_sectors;
353 
354     if (s->num_extents > 1) {
355         extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
356     } else {
357         extent->end_sector = extent->sectors;
358     }
359     bs->total_sectors = extent->end_sector;
360     return extent;
361 }
362 
363 static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent)
364 {
365     int ret;
366     int l1_size, i;
367 
368     /* read the L1 table */
369     l1_size = extent->l1_size * sizeof(uint32_t);
370     extent->l1_table = g_malloc(l1_size);
371     ret = bdrv_pread(extent->file,
372                     extent->l1_table_offset,
373                     extent->l1_table,
374                     l1_size);
375     if (ret < 0) {
376         goto fail_l1;
377     }
378     for (i = 0; i < extent->l1_size; i++) {
379         le32_to_cpus(&extent->l1_table[i]);
380     }
381 
382     if (extent->l1_backup_table_offset) {
383         extent->l1_backup_table = g_malloc(l1_size);
384         ret = bdrv_pread(extent->file,
385                         extent->l1_backup_table_offset,
386                         extent->l1_backup_table,
387                         l1_size);
388         if (ret < 0) {
389             goto fail_l1b;
390         }
391         for (i = 0; i < extent->l1_size; i++) {
392             le32_to_cpus(&extent->l1_backup_table[i]);
393         }
394     }
395 
396     extent->l2_cache =
397         g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
398     return 0;
399  fail_l1b:
400     g_free(extent->l1_backup_table);
401  fail_l1:
402     g_free(extent->l1_table);
403     return ret;
404 }
405 
406 static int vmdk_open_vmdk3(BlockDriverState *bs,
407                            BlockDriverState *file,
408                            int flags)
409 {
410     int ret;
411     uint32_t magic;
412     VMDK3Header header;
413     VmdkExtent *extent;
414 
415     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
416     if (ret < 0) {
417         return ret;
418     }
419     extent = vmdk_add_extent(bs,
420                              bs->file, false,
421                              le32_to_cpu(header.disk_sectors),
422                              le32_to_cpu(header.l1dir_offset) << 9,
423                              0, 1 << 6, 1 << 9,
424                              le32_to_cpu(header.granularity));
425     ret = vmdk_init_tables(bs, extent);
426     if (ret) {
427         /* free extent allocated by vmdk_add_extent */
428         vmdk_free_last_extent(bs);
429     }
430     return ret;
431 }
432 
433 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
434                                int64_t desc_offset);
435 
436 static int vmdk_open_vmdk4(BlockDriverState *bs,
437                            BlockDriverState *file,
438                            int flags)
439 {
440     int ret;
441     uint32_t magic;
442     uint32_t l1_size, l1_entry_sectors;
443     VMDK4Header header;
444     VmdkExtent *extent;
445     int64_t l1_backup_offset = 0;
446 
447     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
448     if (ret < 0) {
449         return ret;
450     }
451     if (header.capacity == 0 && header.desc_offset) {
452         return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
453     }
454     l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
455                         * le64_to_cpu(header.granularity);
456     if (l1_entry_sectors == 0) {
457         return -EINVAL;
458     }
459     l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
460                 / l1_entry_sectors;
461     if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
462         l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
463     }
464     extent = vmdk_add_extent(bs, file, false,
465                           le64_to_cpu(header.capacity),
466                           le64_to_cpu(header.gd_offset) << 9,
467                           l1_backup_offset,
468                           l1_size,
469                           le32_to_cpu(header.num_gtes_per_gte),
470                           le64_to_cpu(header.granularity));
471     extent->compressed =
472         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
473     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
474     ret = vmdk_init_tables(bs, extent);
475     if (ret) {
476         /* free extent allocated by vmdk_add_extent */
477         vmdk_free_last_extent(bs);
478     }
479     return ret;
480 }
481 
/*
 * Find an option value in descriptor text.
 *
 * Looks for the first occurrence of opt_name in desc, skips the two
 * characters that follow it (the `="` of `name="value"`), and copies
 * everything up to the next double quote into buf as a NUL-terminated
 * string.
 *
 * Returns 0 on success and -1 when the option is missing, when the text
 * ends before a value starts, when there is no closing quote, or when the
 * value plus its terminator does not fit in buf_size bytes.
 */
static int vmdk_parse_description(const char *desc, const char *opt_name,
        char *buf, int buf_size)
{
    const char *opt_pos, *opt_end;
    size_t skip = strlen(opt_name) + 2;     /* name plus `="` */
    size_t value_len;

    opt_pos = strstr(desc, opt_name);
    if (!opt_pos) {
        return -1;
    }
    /* Compare lengths instead of advancing first: the pointer must never
     * be moved beyond the terminating NUL of desc. */
    if (strlen(opt_pos) <= skip) {
        return -1;
    }
    opt_pos += skip;

    opt_end = strchr(opt_pos, '"');
    if (!opt_end) {
        return -1;
    }
    value_len = (size_t)(opt_end - opt_pos);
    if (buf_size <= 0 || value_len >= (size_t)buf_size) {
        return -1;
    }
    memcpy(buf, opt_pos, value_len);
    buf[value_len] = '\0';
    return 0;
}
508 
509 /* Open an extent file and append to bs array */
510 static int vmdk_open_sparse(BlockDriverState *bs,
511                             BlockDriverState *file,
512                             int flags)
513 {
514     uint32_t magic;
515 
516     if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) {
517         return -EIO;
518     }
519 
520     magic = be32_to_cpu(magic);
521     switch (magic) {
522         case VMDK3_MAGIC:
523             return vmdk_open_vmdk3(bs, file, flags);
524             break;
525         case VMDK4_MAGIC:
526             return vmdk_open_vmdk4(bs, file, flags);
527             break;
528         default:
529             return -EINVAL;
530             break;
531     }
532 }
533 
534 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
535         const char *desc_file_path)
536 {
537     int ret;
538     char access[11];
539     char type[11];
540     char fname[512];
541     const char *p = desc;
542     int64_t sectors = 0;
543     int64_t flat_offset;
544     char extent_path[PATH_MAX];
545     BlockDriverState *extent_file;
546 
547     while (*p) {
548         /* parse extent line:
549          * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
550          * or
551          * RW [size in sectors] SPARSE "file-name.vmdk"
552          */
553         flat_offset = -1;
554         ret = sscanf(p, "%10s %" SCNd64 " %10s %511s %" SCNd64,
555                 access, &sectors, type, fname, &flat_offset);
556         if (ret < 4 || strcmp(access, "RW")) {
557             goto next_line;
558         } else if (!strcmp(type, "FLAT")) {
559             if (ret != 5 || flat_offset < 0) {
560                 return -EINVAL;
561             }
562         } else if (ret != 4) {
563             return -EINVAL;
564         }
565 
566         /* trim the quotation marks around */
567         if (fname[0] == '"') {
568             memmove(fname, fname + 1, strlen(fname));
569             if (strlen(fname) <= 1 || fname[strlen(fname) - 1] != '"') {
570                 return -EINVAL;
571             }
572             fname[strlen(fname) - 1] = '\0';
573         }
574         if (sectors <= 0 ||
575             (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) ||
576             (strcmp(access, "RW"))) {
577             goto next_line;
578         }
579 
580         path_combine(extent_path, sizeof(extent_path),
581                 desc_file_path, fname);
582         ret = bdrv_file_open(&extent_file, extent_path, bs->open_flags);
583         if (ret) {
584             return ret;
585         }
586 
587         /* save to extents array */
588         if (!strcmp(type, "FLAT")) {
589             /* FLAT extent */
590             VmdkExtent *extent;
591 
592             extent = vmdk_add_extent(bs, extent_file, true, sectors,
593                             0, 0, 0, 0, sectors);
594             extent->flat_start_offset = flat_offset << 9;
595         } else if (!strcmp(type, "SPARSE")) {
596             /* SPARSE extent */
597             ret = vmdk_open_sparse(bs, extent_file, bs->open_flags);
598             if (ret) {
599                 bdrv_delete(extent_file);
600                 return ret;
601             }
602         } else {
603             fprintf(stderr,
604                 "VMDK: Not supported extent type \"%s\""".\n", type);
605             return -ENOTSUP;
606         }
607 next_line:
608         /* move to next line */
609         while (*p && *p != '\n') {
610             p++;
611         }
612         p++;
613     }
614     return 0;
615 }
616 
617 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
618                                int64_t desc_offset)
619 {
620     int ret;
621     char buf[2048];
622     char ct[128];
623     BDRVVmdkState *s = bs->opaque;
624 
625     ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf));
626     if (ret < 0) {
627         return ret;
628     }
629     buf[2047] = '\0';
630     if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
631         return -EINVAL;
632     }
633     if (strcmp(ct, "monolithicFlat") &&
634         strcmp(ct, "twoGbMaxExtentSparse") &&
635         strcmp(ct, "twoGbMaxExtentFlat")) {
636         fprintf(stderr,
637                 "VMDK: Not supported image type \"%s\""".\n", ct);
638         return -ENOTSUP;
639     }
640     s->desc_offset = 0;
641     return vmdk_parse_extents(buf, bs, bs->file->filename);
642 }
643 
644 static int vmdk_open(BlockDriverState *bs, int flags)
645 {
646     int ret;
647     BDRVVmdkState *s = bs->opaque;
648 
649     if (vmdk_open_sparse(bs, bs->file, flags) == 0) {
650         s->desc_offset = 0x200;
651     } else {
652         ret = vmdk_open_desc_file(bs, flags, 0);
653         if (ret) {
654             goto fail;
655         }
656     }
657     /* try to open parent images, if exist */
658     ret = vmdk_parent_open(bs);
659     if (ret) {
660         goto fail;
661     }
662     s->parent_cid = vmdk_read_cid(bs, 1);
663     qemu_co_mutex_init(&s->lock);
664 
665     /* Disable migration when VMDK images are used */
666     error_set(&s->migration_blocker,
667               QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
668               "vmdk", bs->device_name, "live migration");
669     migrate_add_blocker(s->migration_blocker);
670 
671     return 0;
672 
673 fail:
674     vmdk_free_extents(bs);
675     return ret;
676 }
677 
678 static int get_whole_cluster(BlockDriverState *bs,
679                 VmdkExtent *extent,
680                 uint64_t cluster_offset,
681                 uint64_t offset,
682                 bool allocate)
683 {
684     /* 128 sectors * 512 bytes each = grain size 64KB */
685     uint8_t  whole_grain[extent->cluster_sectors * 512];
686 
687     /* we will be here if it's first write on non-exist grain(cluster).
688      * try to read from parent image, if exist */
689     if (bs->backing_hd) {
690         int ret;
691 
692         if (!vmdk_is_cid_valid(bs)) {
693             return -1;
694         }
695 
696         /* floor offset to cluster */
697         offset -= offset % (extent->cluster_sectors * 512);
698         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
699                 extent->cluster_sectors);
700         if (ret < 0) {
701             return -1;
702         }
703 
704         /* Write grain only into the active image */
705         ret = bdrv_write(extent->file, cluster_offset, whole_grain,
706                 extent->cluster_sectors);
707         if (ret < 0) {
708             return -1;
709         }
710     }
711     return 0;
712 }
713 
714 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
715 {
716     /* update L2 table */
717     if (bdrv_pwrite_sync(
718                 extent->file,
719                 ((int64_t)m_data->l2_offset * 512)
720                     + (m_data->l2_index * sizeof(m_data->offset)),
721                 &(m_data->offset),
722                 sizeof(m_data->offset)
723             ) < 0) {
724         return -1;
725     }
726     /* update backup L2 table */
727     if (extent->l1_backup_table_offset != 0) {
728         m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
729         if (bdrv_pwrite_sync(
730                     extent->file,
731                     ((int64_t)m_data->l2_offset * 512)
732                         + (m_data->l2_index * sizeof(m_data->offset)),
733                     &(m_data->offset), sizeof(m_data->offset)
734                 ) < 0) {
735             return -1;
736         }
737     }
738 
739     return 0;
740 }
741 
/*
 * Translate a guest byte offset into a byte offset inside the extent file.
 *
 * m_data:         optional; marked invalid on entry and filled in (valid = 1)
 *                 only when a new cluster is allocated, so the caller can
 *                 write the L2 entry with vmdk_L2update().
 * offset:         guest byte offset within the whole image.
 * allocate:       non-zero to allocate a cluster when none is mapped.
 * cluster_offset: receives the byte offset of the cluster (or the start
 *                 offset of a flat extent).
 *
 * Returns 0 on success and -1 when the offset is outside the L1 table, the
 * L1 entry is empty, the L2 table cannot be read, the cluster is unmapped
 * and allocate is zero, or populating a new cluster fails.
 */
static int get_cluster_offset(BlockDriverState *bs,
                                    VmdkExtent *extent,
                                    VmdkMetaData *m_data,
                                    uint64_t offset,
                                    int allocate,
                                    uint64_t *cluster_offset)
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
    uint32_t min_count, *l2_table, tmp = 0;

    if (m_data) {
        m_data->valid = 0;
    }
    /* flat extents have no tables: report where their data starts */
    if (extent->flat) {
        *cluster_offset = extent->flat_start_offset;
        return 0;
    }

    /* make offset relative to the start of this extent */
    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
    if (l1_index >= extent->l1_size) {
        return -1;
    }
    l2_offset = extent->l1_table[l1_index];
    if (!l2_offset) {
        return -1;
    }
    /* look for the L2 table in the cache */
    for (i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == extent->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
                /* counter saturated: halve all counters */
                for (j = 0; j < L2_CACHE_SIZE; j++) {
                    extent->l2_cache_counts[j] >>= 1;
                }
            }
            l2_table = extent->l2_cache + (i * extent->l2_size);
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
    for (i = 0; i < L2_CACHE_SIZE; i++) {
        if (extent->l2_cache_counts[i] < min_count) {
            min_count = extent->l2_cache_counts[i];
            min_index = i;
        }
    }
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
    /* NOTE(review): the slot is read into before its offset tag is updated;
     * on a failed read the old tag stays while the contents may have
     * changed -- confirm whether that can leave a stale cache entry. */
    if (bdrv_pread(
                extent->file,
                (int64_t)l2_offset * 512,
                l2_table,
                extent->l2_size * sizeof(uint32_t)
            ) != extent->l2_size * sizeof(uint32_t)) {
        return -1;
    }

    extent->l2_cache_offsets[min_index] = l2_offset;
    extent->l2_cache_counts[min_index] = 1;
 found:
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
    /* cached L2 entries are kept little-endian; value is in sectors */
    *cluster_offset = le32_to_cpu(l2_table[l2_index]);

    if (!*cluster_offset) {
        if (!allocate) {
            return -1;
        }

        /* Avoid the L2 tables update for the images that have snapshots. */
        /* The new cluster is placed at the current end of the extent file.
         * The results of bdrv_getlength() and bdrv_truncate() are not
         * checked here. */
        *cluster_offset = bdrv_getlength(extent->file);
        if (!extent->compressed) {
            bdrv_truncate(
                extent->file,
                *cluster_offset + (extent->cluster_sectors << 9)
            );
        }

        /* bytes -> sectors; the cached L2 entry is updated right away */
        *cluster_offset >>= 9;
        tmp = cpu_to_le32(*cluster_offset);
        l2_table[l2_index] = tmp;

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
        if (get_whole_cluster(
                bs, extent, *cluster_offset, offset, allocate) == -1) {
            return -1;
        }

        /* hand the new mapping to the caller for the on-disk L2 update */
        if (m_data) {
            m_data->offset = tmp;
            m_data->l1_index = l1_index;
            m_data->l2_index = l2_index;
            m_data->l2_offset = l2_offset;
            m_data->valid = 1;
        }
    }
    /* sectors -> bytes */
    *cluster_offset <<= 9;
    return 0;
}
846 
847 static VmdkExtent *find_extent(BDRVVmdkState *s,
848                                 int64_t sector_num, VmdkExtent *start_hint)
849 {
850     VmdkExtent *extent = start_hint;
851 
852     if (!extent) {
853         extent = &s->extents[0];
854     }
855     while (extent < &s->extents[s->num_extents]) {
856         if (sector_num < extent->end_sector) {
857             return extent;
858         }
859         extent++;
860     }
861     return NULL;
862 }
863 
864 static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
865         int64_t sector_num, int nb_sectors, int *pnum)
866 {
867     BDRVVmdkState *s = bs->opaque;
868     int64_t index_in_cluster, n, ret;
869     uint64_t offset;
870     VmdkExtent *extent;
871 
872     extent = find_extent(s, sector_num, NULL);
873     if (!extent) {
874         return 0;
875     }
876     qemu_co_mutex_lock(&s->lock);
877     ret = get_cluster_offset(bs, extent, NULL,
878                             sector_num * 512, 0, &offset);
879     qemu_co_mutex_unlock(&s->lock);
880     /* get_cluster_offset returning 0 means success */
881     ret = !ret;
882 
883     index_in_cluster = sector_num % extent->cluster_sectors;
884     n = extent->cluster_sectors - index_in_cluster;
885     if (n > nb_sectors) {
886         n = nb_sectors;
887     }
888     *pnum = n;
889     return ret;
890 }
891 
892 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
893                             int64_t offset_in_cluster, const uint8_t *buf,
894                             int nb_sectors, int64_t sector_num)
895 {
896     int ret;
897     VmdkGrainMarker *data = NULL;
898     uLongf buf_len;
899     const uint8_t *write_buf = buf;
900     int write_len = nb_sectors * 512;
901 
902     if (extent->compressed) {
903         if (!extent->has_marker) {
904             ret = -EINVAL;
905             goto out;
906         }
907         buf_len = (extent->cluster_sectors << 9) * 2;
908         data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
909         if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
910                 buf_len == 0) {
911             ret = -EINVAL;
912             goto out;
913         }
914         data->lba = sector_num;
915         data->size = buf_len;
916         write_buf = (uint8_t *)data;
917         write_len = buf_len + sizeof(VmdkGrainMarker);
918     }
919     ret = bdrv_pwrite(extent->file,
920                         cluster_offset + offset_in_cluster,
921                         write_buf,
922                         write_len);
923     if (ret != write_len) {
924         ret = ret < 0 ? ret : -EIO;
925         goto out;
926     }
927     ret = 0;
928  out:
929     g_free(data);
930     return ret;
931 }
932 
933 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
934                             int64_t offset_in_cluster, uint8_t *buf,
935                             int nb_sectors)
936 {
937     int ret;
938     int cluster_bytes, buf_bytes;
939     uint8_t *cluster_buf, *compressed_data;
940     uint8_t *uncomp_buf;
941     uint32_t data_len;
942     VmdkGrainMarker *marker;
943     uLongf buf_len;
944 
945 
946     if (!extent->compressed) {
947         ret = bdrv_pread(extent->file,
948                           cluster_offset + offset_in_cluster,
949                           buf, nb_sectors * 512);
950         if (ret == nb_sectors * 512) {
951             return 0;
952         } else {
953             return -EIO;
954         }
955     }
956     cluster_bytes = extent->cluster_sectors * 512;
957     /* Read two clusters in case GrainMarker + compressed data > one cluster */
958     buf_bytes = cluster_bytes * 2;
959     cluster_buf = g_malloc(buf_bytes);
960     uncomp_buf = g_malloc(cluster_bytes);
961     ret = bdrv_pread(extent->file,
962                 cluster_offset,
963                 cluster_buf, buf_bytes);
964     if (ret < 0) {
965         goto out;
966     }
967     compressed_data = cluster_buf;
968     buf_len = cluster_bytes;
969     data_len = cluster_bytes;
970     if (extent->has_marker) {
971         marker = (VmdkGrainMarker *)cluster_buf;
972         compressed_data = marker->data;
973         data_len = le32_to_cpu(marker->size);
974     }
975     if (!data_len || data_len > buf_bytes) {
976         ret = -EINVAL;
977         goto out;
978     }
979     ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
980     if (ret != Z_OK) {
981         ret = -EINVAL;
982         goto out;
983 
984     }
985     if (offset_in_cluster < 0 ||
986             offset_in_cluster + nb_sectors * 512 > buf_len) {
987         ret = -EINVAL;
988         goto out;
989     }
990     memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
991     ret = 0;
992 
993  out:
994     g_free(uncomp_buf);
995     g_free(cluster_buf);
996     return ret;
997 }
998 
999 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
1000                     uint8_t *buf, int nb_sectors)
1001 {
1002     BDRVVmdkState *s = bs->opaque;
1003     int ret;
1004     uint64_t n, index_in_cluster;
1005     VmdkExtent *extent = NULL;
1006     uint64_t cluster_offset;
1007 
1008     while (nb_sectors > 0) {
1009         extent = find_extent(s, sector_num, extent);
1010         if (!extent) {
1011             return -EIO;
1012         }
1013         ret = get_cluster_offset(
1014                             bs, extent, NULL,
1015                             sector_num << 9, 0, &cluster_offset);
1016         index_in_cluster = sector_num % extent->cluster_sectors;
1017         n = extent->cluster_sectors - index_in_cluster;
1018         if (n > nb_sectors) {
1019             n = nb_sectors;
1020         }
1021         if (ret) {
1022             /* if not allocated, try to read from parent image, if exist */
1023             if (bs->backing_hd) {
1024                 if (!vmdk_is_cid_valid(bs)) {
1025                     return -EINVAL;
1026                 }
1027                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
1028                 if (ret < 0) {
1029                     return ret;
1030                 }
1031             } else {
1032                 memset(buf, 0, 512 * n);
1033             }
1034         } else {
1035             ret = vmdk_read_extent(extent,
1036                             cluster_offset, index_in_cluster * 512,
1037                             buf, n);
1038             if (ret) {
1039                 return ret;
1040             }
1041         }
1042         nb_sectors -= n;
1043         sector_num += n;
1044         buf += n * 512;
1045     }
1046     return 0;
1047 }
1048 
1049 static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
1050                                      uint8_t *buf, int nb_sectors)
1051 {
1052     int ret;
1053     BDRVVmdkState *s = bs->opaque;
1054     qemu_co_mutex_lock(&s->lock);
1055     ret = vmdk_read(bs, sector_num, buf, nb_sectors);
1056     qemu_co_mutex_unlock(&s->lock);
1057     return ret;
1058 }
1059 
1060 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
1061                      const uint8_t *buf, int nb_sectors)
1062 {
1063     BDRVVmdkState *s = bs->opaque;
1064     VmdkExtent *extent = NULL;
1065     int n, ret;
1066     int64_t index_in_cluster;
1067     uint64_t cluster_offset;
1068     VmdkMetaData m_data;
1069 
1070     if (sector_num > bs->total_sectors) {
1071         fprintf(stderr,
1072                 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
1073                 " total_sectors=0x%" PRIx64 "\n",
1074                 sector_num, bs->total_sectors);
1075         return -EIO;
1076     }
1077 
1078     while (nb_sectors > 0) {
1079         extent = find_extent(s, sector_num, extent);
1080         if (!extent) {
1081             return -EIO;
1082         }
1083         ret = get_cluster_offset(
1084                                 bs,
1085                                 extent,
1086                                 &m_data,
1087                                 sector_num << 9, !extent->compressed,
1088                                 &cluster_offset);
1089         if (extent->compressed) {
1090             if (ret == 0) {
1091                 /* Refuse write to allocated cluster for streamOptimized */
1092                 fprintf(stderr,
1093                         "VMDK: can't write to allocated cluster"
1094                         " for streamOptimized\n");
1095                 return -EIO;
1096             } else {
1097                 /* allocate */
1098                 ret = get_cluster_offset(
1099                                         bs,
1100                                         extent,
1101                                         &m_data,
1102                                         sector_num << 9, 1,
1103                                         &cluster_offset);
1104             }
1105         }
1106         if (ret) {
1107             return -EINVAL;
1108         }
1109         index_in_cluster = sector_num % extent->cluster_sectors;
1110         n = extent->cluster_sectors - index_in_cluster;
1111         if (n > nb_sectors) {
1112             n = nb_sectors;
1113         }
1114 
1115         ret = vmdk_write_extent(extent,
1116                         cluster_offset, index_in_cluster * 512,
1117                         buf, n, sector_num);
1118         if (ret) {
1119             return ret;
1120         }
1121         if (m_data.valid) {
1122             /* update L2 tables */
1123             if (vmdk_L2update(extent, &m_data) == -1) {
1124                 return -EIO;
1125             }
1126         }
1127         nb_sectors -= n;
1128         sector_num += n;
1129         buf += n * 512;
1130 
1131         /* update CID on the first write every time the virtual disk is
1132          * opened */
1133         if (!s->cid_updated) {
1134             ret = vmdk_write_cid(bs, time(NULL));
1135             if (ret < 0) {
1136                 return ret;
1137             }
1138             s->cid_updated = true;
1139         }
1140     }
1141     return 0;
1142 }
1143 
1144 static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
1145                                       const uint8_t *buf, int nb_sectors)
1146 {
1147     int ret;
1148     BDRVVmdkState *s = bs->opaque;
1149     qemu_co_mutex_lock(&s->lock);
1150     ret = vmdk_write(bs, sector_num, buf, nb_sectors);
1151     qemu_co_mutex_unlock(&s->lock);
1152     return ret;
1153 }
1154 
1155 
1156 static int vmdk_create_extent(const char *filename, int64_t filesize,
1157                               bool flat, bool compress)
1158 {
1159     int ret, i;
1160     int fd = 0;
1161     VMDK4Header header;
1162     uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
1163 
1164     fd = open(
1165         filename,
1166         O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1167         0644);
1168     if (fd < 0) {
1169         return -errno;
1170     }
1171     if (flat) {
1172         ret = ftruncate(fd, filesize);
1173         if (ret < 0) {
1174             ret = -errno;
1175         }
1176         goto exit;
1177     }
1178     magic = cpu_to_be32(VMDK4_MAGIC);
1179     memset(&header, 0, sizeof(header));
1180     header.version = 1;
1181     header.flags =
1182         3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0);
1183     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1184     header.capacity = filesize / 512;
1185     header.granularity = 128;
1186     header.num_gtes_per_gte = 512;
1187 
1188     grains = (filesize / 512 + header.granularity - 1) / header.granularity;
1189     gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
1190     gt_count =
1191         (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
1192     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
1193 
1194     header.desc_offset = 1;
1195     header.desc_size = 20;
1196     header.rgd_offset = header.desc_offset + header.desc_size;
1197     header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
1198     header.grain_offset =
1199        ((header.gd_offset + gd_size + (gt_size * gt_count) +
1200          header.granularity - 1) / header.granularity) *
1201         header.granularity;
1202     /* swap endianness for all header fields */
1203     header.version = cpu_to_le32(header.version);
1204     header.flags = cpu_to_le32(header.flags);
1205     header.capacity = cpu_to_le64(header.capacity);
1206     header.granularity = cpu_to_le64(header.granularity);
1207     header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
1208     header.desc_offset = cpu_to_le64(header.desc_offset);
1209     header.desc_size = cpu_to_le64(header.desc_size);
1210     header.rgd_offset = cpu_to_le64(header.rgd_offset);
1211     header.gd_offset = cpu_to_le64(header.gd_offset);
1212     header.grain_offset = cpu_to_le64(header.grain_offset);
1213     header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1214 
1215     header.check_bytes[0] = 0xa;
1216     header.check_bytes[1] = 0x20;
1217     header.check_bytes[2] = 0xd;
1218     header.check_bytes[3] = 0xa;
1219 
1220     /* write all the data */
1221     ret = qemu_write_full(fd, &magic, sizeof(magic));
1222     if (ret != sizeof(magic)) {
1223         ret = -errno;
1224         goto exit;
1225     }
1226     ret = qemu_write_full(fd, &header, sizeof(header));
1227     if (ret != sizeof(header)) {
1228         ret = -errno;
1229         goto exit;
1230     }
1231 
1232     ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
1233     if (ret < 0) {
1234         ret = -errno;
1235         goto exit;
1236     }
1237 
1238     /* write grain directory */
1239     lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
1240     for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
1241          i < gt_count; i++, tmp += gt_size) {
1242         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1243         if (ret != sizeof(tmp)) {
1244             ret = -errno;
1245             goto exit;
1246         }
1247     }
1248 
1249     /* write backup grain directory */
1250     lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
1251     for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
1252          i < gt_count; i++, tmp += gt_size) {
1253         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1254         if (ret != sizeof(tmp)) {
1255             ret = -errno;
1256             goto exit;
1257         }
1258     }
1259 
1260     ret = 0;
1261  exit:
1262     close(fd);
1263     return ret;
1264 }
1265 
/*
 * Split filename into directory, base name and extension.
 *
 * path    receives everything up to and including the last separator
 *         ('/' is searched first, then '\\', then ':'), or "" if none
 * prefix  receives the file name without its extension
 * postfix receives the extension including the leading '.', or "" if none
 * buf_len size of each of the three output buffers
 *
 * Returns 0 on success, -1 if filename is NULL/empty or a component does
 * not fit into buf_len bytes.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
        char *postfix, size_t buf_len)
{
    const char *name, *dot;

    if (filename == NULL || filename[0] == '\0') {
        fprintf(stderr, "Vmdk: no filename provided.\n");
        return -1;
    }

    /* Locate the last directory separator, trying each kind in turn */
    name = strrchr(filename, '/');
    if (!name) {
        name = strrchr(filename, '\\');
    }
    if (!name) {
        name = strrchr(filename, ':');
    }

    if (!name) {
        /* No directory part at all */
        name = filename;
        path[0] = '\0';
    } else {
        name++;
        if (name - filename >= buf_len) {
            return -1;
        }
        /* pstrcpy copies at most size - 1 characters: the directory part */
        pstrcpy(path, name - filename + 1, filename);
    }

    dot = strrchr(name, '.');
    if (dot) {
        if (dot - name >= buf_len) {
            return -1;
        }
        pstrcpy(prefix, dot - name + 1, name);
        pstrcpy(postfix, buf_len, dot);
    } else {
        pstrcpy(prefix, buf_len, name);
        postfix[0] = '\0';
    }
    return 0;
}
1305 
/*
 * Compute the path of target relative to the directory containing base.
 *
 * dest      output buffer
 * dest_size size of dest in bytes
 * base      path of the file the result will be relative to
 * target    path to express relatively
 *
 * An absolute target is copied unchanged.  Otherwise the directory
 * components shared by base and target are dropped, and one ".." is emitted
 * for every directory level that remains in base.
 *
 * Returns 0 on success, -1 on invalid arguments.
 */
static int relative_path(char *dest, int dest_size,
        const char *base, const char *target)
{
    int i = 0;
    int n = 0;
    const char *p, *q;
#ifdef _WIN32
    const char *sep = "\\";
#else
    const char *sep = "/";
#endif

    if (!(dest && base && target) || dest_size <= 0) {
        return -1;
    }
    if (path_is_absolute(target)) {
        pstrcpy(dest, dest_size, target);
        return 0;
    }
    /* Length of the common prefix; stop at the terminator so that identical
     * strings do not make us read past the end of both buffers */
    while (base[i] && base[i] == target[i]) {
        i++;
    }
    /* Only whole path components can be shared ("abc/x" and "abd/y" have
     * nothing in common): back up to just after the last separator */
    while (i > 0 && base[i - 1] != *sep) {
        i--;
    }
    p = &base[i];
    q = &target[i];
    /* Each separator left in base is one directory level to climb */
    while (*p) {
        if (*p == *sep) {
            n++;
        }
        p++;
    }
    dest[0] = '\0';
    for (; n; n--) {
        pstrcat(dest, dest_size, "..");
        pstrcat(dest, dest_size, sep);
    }
    pstrcat(dest, dest_size, q);
    return 0;
}
1345 
1346 static int vmdk_create(const char *filename, QEMUOptionParameter *options)
1347 {
1348     int fd, idx = 0;
1349     char desc[BUF_SIZE];
1350     int64_t total_size = 0, filesize;
1351     const char *backing_file = NULL;
1352     const char *fmt = NULL;
1353     int flags = 0;
1354     int ret = 0;
1355     bool flat, split, compress;
1356     char ext_desc_lines[BUF_SIZE] = "";
1357     char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX];
1358     const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
1359     const char *desc_extent_line;
1360     char parent_desc_line[BUF_SIZE] = "";
1361     uint32_t parent_cid = 0xffffffff;
1362     const char desc_template[] =
1363         "# Disk DescriptorFile\n"
1364         "version=1\n"
1365         "CID=%x\n"
1366         "parentCID=%x\n"
1367         "createType=\"%s\"\n"
1368         "%s"
1369         "\n"
1370         "# Extent description\n"
1371         "%s"
1372         "\n"
1373         "# The Disk Data Base\n"
1374         "#DDB\n"
1375         "\n"
1376         "ddb.virtualHWVersion = \"%d\"\n"
1377         "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
1378         "ddb.geometry.heads = \"16\"\n"
1379         "ddb.geometry.sectors = \"63\"\n"
1380         "ddb.adapterType = \"ide\"\n";
1381 
1382     if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) {
1383         return -EINVAL;
1384     }
1385     /* Read out options */
1386     while (options && options->name) {
1387         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1388             total_size = options->value.n;
1389         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1390             backing_file = options->value.s;
1391         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
1392             flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
1393         } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
1394             fmt = options->value.s;
1395         }
1396         options++;
1397     }
1398     if (!fmt) {
1399         /* Default format to monolithicSparse */
1400         fmt = "monolithicSparse";
1401     } else if (strcmp(fmt, "monolithicFlat") &&
1402                strcmp(fmt, "monolithicSparse") &&
1403                strcmp(fmt, "twoGbMaxExtentSparse") &&
1404                strcmp(fmt, "twoGbMaxExtentFlat") &&
1405                strcmp(fmt, "streamOptimized")) {
1406         fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt);
1407         return -EINVAL;
1408     }
1409     split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
1410               strcmp(fmt, "twoGbMaxExtentSparse"));
1411     flat = !(strcmp(fmt, "monolithicFlat") &&
1412              strcmp(fmt, "twoGbMaxExtentFlat"));
1413     compress = !strcmp(fmt, "streamOptimized");
1414     if (flat) {
1415         desc_extent_line = "RW %lld FLAT \"%s\" 0\n";
1416     } else {
1417         desc_extent_line = "RW %lld SPARSE \"%s\"\n";
1418     }
1419     if (flat && backing_file) {
1420         /* not supporting backing file for flat image */
1421         return -ENOTSUP;
1422     }
1423     if (backing_file) {
1424         char parent_filename[PATH_MAX];
1425         BlockDriverState *bs = bdrv_new("");
1426         ret = bdrv_open(bs, backing_file, 0, NULL);
1427         if (ret != 0) {
1428             bdrv_delete(bs);
1429             return ret;
1430         }
1431         if (strcmp(bs->drv->format_name, "vmdk")) {
1432             bdrv_delete(bs);
1433             return -EINVAL;
1434         }
1435         parent_cid = vmdk_read_cid(bs, 0);
1436         bdrv_delete(bs);
1437         relative_path(parent_filename, sizeof(parent_filename),
1438                       filename, backing_file);
1439         snprintf(parent_desc_line, sizeof(parent_desc_line),
1440                 "parentFileNameHint=\"%s\"", parent_filename);
1441     }
1442 
1443     /* Create extents */
1444     filesize = total_size;
1445     while (filesize > 0) {
1446         char desc_line[BUF_SIZE];
1447         char ext_filename[PATH_MAX];
1448         char desc_filename[PATH_MAX];
1449         int64_t size = filesize;
1450 
1451         if (split && size > split_size) {
1452             size = split_size;
1453         }
1454         if (split) {
1455             snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s",
1456                     prefix, flat ? 'f' : 's', ++idx, postfix);
1457         } else if (flat) {
1458             snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s",
1459                     prefix, postfix);
1460         } else {
1461             snprintf(desc_filename, sizeof(desc_filename), "%s%s",
1462                     prefix, postfix);
1463         }
1464         snprintf(ext_filename, sizeof(ext_filename), "%s%s",
1465                 path, desc_filename);
1466 
1467         if (vmdk_create_extent(ext_filename, size, flat, compress)) {
1468             return -EINVAL;
1469         }
1470         filesize -= size;
1471 
1472         /* Format description line */
1473         snprintf(desc_line, sizeof(desc_line),
1474                     desc_extent_line, size / 512, desc_filename);
1475         pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line);
1476     }
1477     /* generate descriptor file */
1478     snprintf(desc, sizeof(desc), desc_template,
1479             (unsigned int)time(NULL),
1480             parent_cid,
1481             fmt,
1482             parent_desc_line,
1483             ext_desc_lines,
1484             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
1485             total_size / (int64_t)(63 * 16 * 512));
1486     if (split || flat) {
1487         fd = open(
1488                 filename,
1489                 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1490                 0644);
1491     } else {
1492         fd = open(
1493                 filename,
1494                 O_WRONLY | O_BINARY | O_LARGEFILE,
1495                 0644);
1496     }
1497     if (fd < 0) {
1498         return -errno;
1499     }
1500     /* the descriptor offset = 0x200 */
1501     if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
1502         ret = -errno;
1503         goto exit;
1504     }
1505     ret = qemu_write_full(fd, desc, strlen(desc));
1506     if (ret != strlen(desc)) {
1507         ret = -errno;
1508         goto exit;
1509     }
1510     ret = 0;
1511 exit:
1512     close(fd);
1513     return ret;
1514 }
1515 
1516 static void vmdk_close(BlockDriverState *bs)
1517 {
1518     BDRVVmdkState *s = bs->opaque;
1519 
1520     vmdk_free_extents(bs);
1521 
1522     migrate_del_blocker(s->migration_blocker);
1523     error_free(s->migration_blocker);
1524 }
1525 
1526 static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
1527 {
1528     int i, ret, err;
1529     BDRVVmdkState *s = bs->opaque;
1530 
1531     ret = bdrv_co_flush(bs->file);
1532     for (i = 0; i < s->num_extents; i++) {
1533         err = bdrv_co_flush(s->extents[i].file);
1534         if (err < 0) {
1535             ret = err;
1536         }
1537     }
1538     return ret;
1539 }
1540 
1541 static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
1542 {
1543     int i;
1544     int64_t ret = 0;
1545     int64_t r;
1546     BDRVVmdkState *s = bs->opaque;
1547 
1548     ret = bdrv_get_allocated_file_size(bs->file);
1549     if (ret < 0) {
1550         return ret;
1551     }
1552     for (i = 0; i < s->num_extents; i++) {
1553         if (s->extents[i].file == bs->file) {
1554             continue;
1555         }
1556         r = bdrv_get_allocated_file_size(s->extents[i].file);
1557         if (r < 0) {
1558             return r;
1559         }
1560         ret += r;
1561     }
1562     return ret;
1563 }
1564 
/*
 * Options accepted when creating a VMDK image.  These are the option names
 * that vmdk_create() looks for; the table ends with a NULL-named entry.
 */
static QEMUOptionParameter vmdk_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    {
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    },
    {
        .name = BLOCK_OPT_COMPAT6,
        .type = OPT_FLAG,
        .help = "VMDK version 6 image"
    },
    {
        /* Sub-format; validated against the same list in vmdk_create() */
        .name = BLOCK_OPT_SUBFMT,
        .type = OPT_STRING,
        .help =
            "VMDK flat extent format, can be one of "
            "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
    },
    { NULL }
};
1590 
/*
 * Driver description for the "vmdk" format: wires the functions of this
 * file into the BlockDriver callbacks.  Reads and writes go through the
 * coroutine wrappers, which take the per-image lock.
 */
static BlockDriver bdrv_vmdk = {
    .format_name    = "vmdk",
    .instance_size  = sizeof(BDRVVmdkState),
    .bdrv_probe     = vmdk_probe,
    .bdrv_open      = vmdk_open,
    .bdrv_read      = vmdk_co_read,
    .bdrv_write     = vmdk_co_write,
    .bdrv_close     = vmdk_close,
    .bdrv_create    = vmdk_create,
    .bdrv_co_flush_to_disk  = vmdk_co_flush,
    .bdrv_co_is_allocated   = vmdk_co_is_allocated,
    .bdrv_get_allocated_file_size  = vmdk_get_allocated_file_size,

    .create_options = vmdk_create_options,
};
1606 
/* Register the VMDK driver description with the block layer. */
static void bdrv_vmdk_init(void)
{
    bdrv_register(&bdrv_vmdk);
}

/* NOTE(review): block_init() presumably arranges for bdrv_vmdk_init() to run
 * during block-module initialisation - confirm against module.h */
block_init(bdrv_vmdk_init);
1613