xref: /openbmc/qemu/block/vmdk.c (revision 3789985f406ecb99f7d3e6521bb4310228f0577c)
1 /*
2  * Block driver for the VMDK format
3  *
4  * Copyright (c) 2004 Fabrice Bellard
5  * Copyright (c) 2005 Filip Navara
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu-common.h"
27 #include "block_int.h"
28 #include "module.h"
29 #include "zlib.h"
30 
/* Magic numbers, expressed as the big-endian value of the first four bytes
 * of an image ("COWD" for VMDK3, "KDMV" for VMDK4). */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
/* Value of VMDK4Header.compressAlgorithm that marks an extent as compressed */
#define VMDK4_COMPRESSION_DEFLATE 1
/* Bits tested in VMDK4Header.flags */
#define VMDK4_FLAG_RGD (1 << 1)       /* rgd_offset is used as backup L1 table */
#define VMDK4_FLAG_COMPRESS (1 << 16) /* not referenced by the code in view */
#define VMDK4_FLAG_MARKER (1 << 17)   /* grains carry a VmdkGrainMarker header */
37 
/* VMDK3 header as read from the image right after the 4-byte magic.
 * Fields are stored little-endian and converted with le32_to_cpu() on use. */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* extent size in sectors */
    uint32_t granularity;       /* passed on as the cluster size in sectors */
    uint32_t l1dir_offset;      /* L1 table position in sectors (shifted by 9) */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
50 
/* VMDK4 header as read from the image right after the 4-byte magic.
 * The struct is packed; multi-byte fields are converted with the
 * le*_to_cpu() helpers where they are used. */
typedef struct {
    uint32_t version;
    uint32_t flags;             /* tested against the VMDK4_FLAG_* bits */
    int64_t capacity;           /* extent size in sectors */
    int64_t granularity;        /* cluster (grain) size in sectors */
    int64_t desc_offset;        /* descriptor position in sectors (shifted by 9) */
    int64_t desc_size;
    int32_t num_gtes_per_gte;   /* number of entries in one L2 table */
    int64_t gd_offset;          /* L1 table position in sectors */
    int64_t rgd_offset;         /* backup L1 table position in sectors */
    int64_t grain_offset;
    char filler[1];
    char check_bytes[4];
    uint16_t compressAlgorithm; /* compared with VMDK4_COMPRESSION_DEFLATE */
} QEMU_PACKED VMDK4Header;
66 
/* Number of L2 tables kept in each extent's cache */
#define L2_CACHE_SIZE 16

/* Run-time state of one extent (one backing file region) of a VMDK image. */
typedef struct VmdkExtent {
    BlockDriverState *file;         /* file holding this extent's data */
    bool flat;                      /* true: no L1/L2 lookup is done */
    bool compressed;                /* grains go through zlib */
    bool has_marker;                /* grains start with a VmdkGrainMarker */
    int64_t sectors;                /* size of this extent in sectors */
    int64_t end_sector;             /* previous extent's end_sector + sectors */
    int64_t flat_start_offset;      /* byte offset of data for flat extents */
    int64_t l1_table_offset;        /* byte offset of the L1 table */
    int64_t l1_backup_table_offset; /* byte offset of the backup L1, 0 if none */
    uint32_t *l1_table;             /* L1 table, converted to host order */
    uint32_t *l1_backup_table;      /* backup L1 table, converted to host order */
    unsigned int l1_size;           /* number of L1 entries */
    uint32_t l1_entry_sectors;      /* l2_size * cluster_sectors */

    unsigned int l2_size;           /* number of entries in one L2 table */
    uint32_t *l2_cache;             /* L2_CACHE_SIZE tables of l2_size entries */
    uint32_t l2_cache_offsets[L2_CACHE_SIZE]; /* L1 value of each cached table */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* hit counters used for eviction */

    unsigned int cluster_sectors;   /* cluster (grain) size in sectors */
} VmdkExtent;
91 
/* Per-image driver state stored in BlockDriverState.opaque. */
typedef struct BDRVVmdkState {
    CoMutex lock;
    int desc_offset;        /* byte offset of the text descriptor in bs->file */
    bool cid_updated;
    uint32_t parent_cid;    /* "parentCID" value read from the descriptor */
    int num_extents;
    /* Extent array with num_extents entries, ascend ordered by address */
    VmdkExtent *extents;
} BDRVVmdkState;
101 
/* Describes an L2 entry that was allocated by get_cluster_offset() and still
 * has to be written to disk by vmdk_L2update(). */
typedef struct VmdkMetaData {
    uint32_t offset;        /* new L2 entry value, already little-endian */
    unsigned int l1_index;  /* index into the L1 (and backup L1) table */
    unsigned int l2_index;  /* index of the entry inside its L2 table */
    unsigned int l2_offset; /* position of the L2 table in 512-byte sectors */
    int valid;              /* set to 1 only when the fields above are filled */
} VmdkMetaData;
109 
/* Header placed in front of a compressed grain.  The compressed bytes start
 * at data[]; lba is the sector number the grain belongs to and size is the
 * length of the compressed payload in bytes.
 * NOTE(review): the struct is not packed, so sizeof() may exceed
 * offsetof(data) because of tail padding - confirm against the on-disk
 * format before relying on sizeof(). */
typedef struct VmdkGrainMarker {
    uint64_t lba;
    uint32_t size;
    uint8_t  data[];    /* C99 flexible array member (was GNU data[0]) */
} VmdkGrainMarker;
115 
116 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
117 {
118     uint32_t magic;
119 
120     if (buf_size < 4) {
121         return 0;
122     }
123     magic = be32_to_cpu(*(uint32_t *)buf);
124     if (magic == VMDK3_MAGIC ||
125         magic == VMDK4_MAGIC) {
126         return 100;
127     } else {
128         const char *p = (const char *)buf;
129         const char *end = p + buf_size;
130         while (p < end) {
131             if (*p == '#') {
132                 /* skip comment line */
133                 while (p < end && *p != '\n') {
134                     p++;
135                 }
136                 p++;
137                 continue;
138             }
139             if (*p == ' ') {
140                 while (p < end && *p == ' ') {
141                     p++;
142                 }
143                 /* skip '\r' if windows line endings used. */
144                 if (p < end && *p == '\r') {
145                     p++;
146                 }
147                 /* only accept blank lines before 'version=' line */
148                 if (p == end || *p != '\n') {
149                     return 0;
150                 }
151                 p++;
152                 continue;
153             }
154             if (end - p >= strlen("version=X\n")) {
155                 if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
156                     strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
157                     return 100;
158                 }
159             }
160             if (end - p >= strlen("version=X\r\n")) {
161                 if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
162                     strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
163                     return 100;
164                 }
165             }
166             return 0;
167         }
168         return 0;
169     }
170 }
171 
/* When defined, vmdk_is_cid_valid() compares the stored parent CID with the
 * CID read from the backing image. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define BUF_SIZE 4096
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
178 
179 static void vmdk_free_extents(BlockDriverState *bs)
180 {
181     int i;
182     BDRVVmdkState *s = bs->opaque;
183     VmdkExtent *e;
184 
185     for (i = 0; i < s->num_extents; i++) {
186         e = &s->extents[i];
187         g_free(e->l1_table);
188         g_free(e->l2_cache);
189         g_free(e->l1_backup_table);
190         if (e->file != bs->file) {
191             bdrv_delete(e->file);
192         }
193     }
194     g_free(s->extents);
195 }
196 
197 static void vmdk_free_last_extent(BlockDriverState *bs)
198 {
199     BDRVVmdkState *s = bs->opaque;
200 
201     if (s->num_extents == 0) {
202         return;
203     }
204     s->num_extents--;
205     s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent));
206 }
207 
208 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
209 {
210     char desc[DESC_SIZE];
211     uint32_t cid;
212     const char *p_name, *cid_str;
213     size_t cid_str_size;
214     BDRVVmdkState *s = bs->opaque;
215 
216     if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
217         return 0;
218     }
219 
220     if (parent) {
221         cid_str = "parentCID";
222         cid_str_size = sizeof("parentCID");
223     } else {
224         cid_str = "CID";
225         cid_str_size = sizeof("CID");
226     }
227 
228     p_name = strstr(desc, cid_str);
229     if (p_name != NULL) {
230         p_name += cid_str_size;
231         sscanf(p_name, "%x", &cid);
232     }
233 
234     return cid;
235 }
236 
237 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
238 {
239     char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
240     char *p_name, *tmp_str;
241     BDRVVmdkState *s = bs->opaque;
242 
243     memset(desc, 0, sizeof(desc));
244     if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
245         return -EIO;
246     }
247 
248     tmp_str = strstr(desc, "parentCID");
249     pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
250     p_name = strstr(desc, "CID");
251     if (p_name != NULL) {
252         p_name += sizeof("CID");
253         snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
254         pstrcat(desc, sizeof(desc), tmp_desc);
255     }
256 
257     if (bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE) < 0) {
258         return -EIO;
259     }
260     return 0;
261 }
262 
263 static int vmdk_is_cid_valid(BlockDriverState *bs)
264 {
265 #ifdef CHECK_CID
266     BDRVVmdkState *s = bs->opaque;
267     BlockDriverState *p_bs = bs->backing_hd;
268     uint32_t cur_pcid;
269 
270     if (p_bs) {
271         cur_pcid = vmdk_read_cid(p_bs, 0);
272         if (s->parent_cid != cur_pcid) {
273             /* CID not valid */
274             return 0;
275         }
276     }
277 #endif
278     /* CID valid */
279     return 1;
280 }
281 
282 static int vmdk_parent_open(BlockDriverState *bs)
283 {
284     char *p_name;
285     char desc[DESC_SIZE + 1];
286     BDRVVmdkState *s = bs->opaque;
287     int ret;
288 
289     desc[DESC_SIZE] = '\0';
290     ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
291     if (ret < 0) {
292         return ret;
293     }
294 
295     p_name = strstr(desc, "parentFileNameHint");
296     if (p_name != NULL) {
297         char *end_name;
298 
299         p_name += sizeof("parentFileNameHint") + 1;
300         end_name = strchr(p_name, '\"');
301         if (end_name == NULL) {
302             return -EINVAL;
303         }
304         if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
305             return -EINVAL;
306         }
307 
308         pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
309     }
310 
311     return 0;
312 }
313 
314 /* Create and append extent to the extent array. Return the added VmdkExtent
315  * address. return NULL if allocation failed. */
316 static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
317                            BlockDriverState *file, bool flat, int64_t sectors,
318                            int64_t l1_offset, int64_t l1_backup_offset,
319                            uint32_t l1_size,
320                            int l2_size, unsigned int cluster_sectors)
321 {
322     VmdkExtent *extent;
323     BDRVVmdkState *s = bs->opaque;
324 
325     s->extents = g_realloc(s->extents,
326                               (s->num_extents + 1) * sizeof(VmdkExtent));
327     extent = &s->extents[s->num_extents];
328     s->num_extents++;
329 
330     memset(extent, 0, sizeof(VmdkExtent));
331     extent->file = file;
332     extent->flat = flat;
333     extent->sectors = sectors;
334     extent->l1_table_offset = l1_offset;
335     extent->l1_backup_table_offset = l1_backup_offset;
336     extent->l1_size = l1_size;
337     extent->l1_entry_sectors = l2_size * cluster_sectors;
338     extent->l2_size = l2_size;
339     extent->cluster_sectors = cluster_sectors;
340 
341     if (s->num_extents > 1) {
342         extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
343     } else {
344         extent->end_sector = extent->sectors;
345     }
346     bs->total_sectors = extent->end_sector;
347     return extent;
348 }
349 
350 static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent)
351 {
352     int ret;
353     int l1_size, i;
354 
355     /* read the L1 table */
356     l1_size = extent->l1_size * sizeof(uint32_t);
357     extent->l1_table = g_malloc(l1_size);
358     ret = bdrv_pread(extent->file,
359                     extent->l1_table_offset,
360                     extent->l1_table,
361                     l1_size);
362     if (ret < 0) {
363         goto fail_l1;
364     }
365     for (i = 0; i < extent->l1_size; i++) {
366         le32_to_cpus(&extent->l1_table[i]);
367     }
368 
369     if (extent->l1_backup_table_offset) {
370         extent->l1_backup_table = g_malloc(l1_size);
371         ret = bdrv_pread(extent->file,
372                         extent->l1_backup_table_offset,
373                         extent->l1_backup_table,
374                         l1_size);
375         if (ret < 0) {
376             goto fail_l1b;
377         }
378         for (i = 0; i < extent->l1_size; i++) {
379             le32_to_cpus(&extent->l1_backup_table[i]);
380         }
381     }
382 
383     extent->l2_cache =
384         g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
385     return 0;
386  fail_l1b:
387     g_free(extent->l1_backup_table);
388  fail_l1:
389     g_free(extent->l1_table);
390     return ret;
391 }
392 
393 static int vmdk_open_vmdk3(BlockDriverState *bs,
394                            BlockDriverState *file,
395                            int flags)
396 {
397     int ret;
398     uint32_t magic;
399     VMDK3Header header;
400     VmdkExtent *extent;
401 
402     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
403     if (ret < 0) {
404         return ret;
405     }
406     extent = vmdk_add_extent(bs,
407                              bs->file, false,
408                              le32_to_cpu(header.disk_sectors),
409                              le32_to_cpu(header.l1dir_offset) << 9,
410                              0, 1 << 6, 1 << 9,
411                              le32_to_cpu(header.granularity));
412     ret = vmdk_init_tables(bs, extent);
413     if (ret) {
414         /* free extent allocated by vmdk_add_extent */
415         vmdk_free_last_extent(bs);
416     }
417     return ret;
418 }
419 
420 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
421                                int64_t desc_offset);
422 
423 static int vmdk_open_vmdk4(BlockDriverState *bs,
424                            BlockDriverState *file,
425                            int flags)
426 {
427     int ret;
428     uint32_t magic;
429     uint32_t l1_size, l1_entry_sectors;
430     VMDK4Header header;
431     VmdkExtent *extent;
432     int64_t l1_backup_offset = 0;
433 
434     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
435     if (ret < 0) {
436         return ret;
437     }
438     if (header.capacity == 0 && header.desc_offset) {
439         return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
440     }
441     l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
442                         * le64_to_cpu(header.granularity);
443     if (l1_entry_sectors <= 0) {
444         return -EINVAL;
445     }
446     l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
447                 / l1_entry_sectors;
448     if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
449         l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
450     }
451     extent = vmdk_add_extent(bs, file, false,
452                           le64_to_cpu(header.capacity),
453                           le64_to_cpu(header.gd_offset) << 9,
454                           l1_backup_offset,
455                           l1_size,
456                           le32_to_cpu(header.num_gtes_per_gte),
457                           le64_to_cpu(header.granularity));
458     extent->compressed =
459         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
460     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
461     ret = vmdk_init_tables(bs, extent);
462     if (ret) {
463         /* free extent allocated by vmdk_add_extent */
464         vmdk_free_last_extent(bs);
465     }
466     return ret;
467 }
468 
/*
 * Find an option value in descriptor text.
 *
 * Looks for the first occurrence of opt_name in desc, skips the two
 * characters after it (expected to be ="), and copies everything up to the
 * next double quote into buf as a NUL-terminated string.
 *
 * Returns 0 on success.  Returns -1 when opt_name is not found, the text
 * ends before a value starts, the closing quote is missing, or the value
 * plus terminator does not fit in buf_size bytes.
 */
static int vmdk_parse_description(const char *desc, const char *opt_name,
        char *buf, int buf_size)
{
    const char *opt_pos, *opt_end;
    size_t name_len = strlen(opt_name);
    size_t value_len;

    opt_pos = strstr(desc, opt_name);
    if (!opt_pos) {
        return -1;
    }
    /* Skip "=\"" following opt_name, but make sure both characters exist
     * first so the pointer never moves past the terminating NUL. */
    if (opt_pos[name_len] == '\0' || opt_pos[name_len + 1] == '\0') {
        return -1;
    }
    opt_pos += name_len + 2;
    if (*opt_pos == '\0') {
        return -1;
    }
    opt_end = strchr(opt_pos, '"');
    if (opt_end == NULL) {
        return -1;
    }
    value_len = (size_t)(opt_end - opt_pos);
    if (buf_size <= 0 || (size_t)buf_size < value_len + 1) {
        return -1;
    }
    memcpy(buf, opt_pos, value_len);
    buf[value_len] = '\0';
    return 0;
}
495 
496 /* Open an extent file and append to bs array */
497 static int vmdk_open_sparse(BlockDriverState *bs,
498                             BlockDriverState *file,
499                             int flags)
500 {
501     uint32_t magic;
502 
503     if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) {
504         return -EIO;
505     }
506 
507     magic = be32_to_cpu(magic);
508     switch (magic) {
509         case VMDK3_MAGIC:
510             return vmdk_open_vmdk3(bs, file, flags);
511             break;
512         case VMDK4_MAGIC:
513             return vmdk_open_vmdk4(bs, file, flags);
514             break;
515         default:
516             return -EINVAL;
517             break;
518     }
519 }
520 
521 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
522         const char *desc_file_path)
523 {
524     int ret;
525     char access[11];
526     char type[11];
527     char fname[512];
528     const char *p = desc;
529     int64_t sectors = 0;
530     int64_t flat_offset;
531     char extent_path[PATH_MAX];
532     BlockDriverState *extent_file;
533 
534     while (*p) {
535         /* parse extent line:
536          * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
537          * or
538          * RW [size in sectors] SPARSE "file-name.vmdk"
539          */
540         flat_offset = -1;
541         ret = sscanf(p, "%10s %" SCNd64 " %10s %511s %" SCNd64,
542                 access, &sectors, type, fname, &flat_offset);
543         if (ret < 4 || strcmp(access, "RW")) {
544             goto next_line;
545         } else if (!strcmp(type, "FLAT")) {
546             if (ret != 5 || flat_offset < 0) {
547                 return -EINVAL;
548             }
549         } else if (ret != 4) {
550             return -EINVAL;
551         }
552 
553         /* trim the quotation marks around */
554         if (fname[0] == '"') {
555             memmove(fname, fname + 1, strlen(fname));
556             if (strlen(fname) <= 1 || fname[strlen(fname) - 1] != '"') {
557                 return -EINVAL;
558             }
559             fname[strlen(fname) - 1] = '\0';
560         }
561         if (sectors <= 0 ||
562             (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) ||
563             (strcmp(access, "RW"))) {
564             goto next_line;
565         }
566 
567         path_combine(extent_path, sizeof(extent_path),
568                 desc_file_path, fname);
569         ret = bdrv_file_open(&extent_file, extent_path, bs->open_flags);
570         if (ret) {
571             return ret;
572         }
573 
574         /* save to extents array */
575         if (!strcmp(type, "FLAT")) {
576             /* FLAT extent */
577             VmdkExtent *extent;
578 
579             extent = vmdk_add_extent(bs, extent_file, true, sectors,
580                             0, 0, 0, 0, sectors);
581             extent->flat_start_offset = flat_offset << 9;
582         } else if (!strcmp(type, "SPARSE")) {
583             /* SPARSE extent */
584             ret = vmdk_open_sparse(bs, extent_file, bs->open_flags);
585             if (ret) {
586                 bdrv_delete(extent_file);
587                 return ret;
588             }
589         } else {
590             fprintf(stderr,
591                 "VMDK: Not supported extent type \"%s\""".\n", type);
592             return -ENOTSUP;
593         }
594 next_line:
595         /* move to next line */
596         while (*p && *p != '\n') {
597             p++;
598         }
599         p++;
600     }
601     return 0;
602 }
603 
604 static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
605                                int64_t desc_offset)
606 {
607     int ret;
608     char buf[2048];
609     char ct[128];
610     BDRVVmdkState *s = bs->opaque;
611 
612     ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf));
613     if (ret < 0) {
614         return ret;
615     }
616     buf[2047] = '\0';
617     if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
618         return -EINVAL;
619     }
620     if (strcmp(ct, "monolithicFlat") &&
621         strcmp(ct, "twoGbMaxExtentSparse") &&
622         strcmp(ct, "twoGbMaxExtentFlat")) {
623         fprintf(stderr,
624                 "VMDK: Not supported image type \"%s\""".\n", ct);
625         return -ENOTSUP;
626     }
627     s->desc_offset = 0;
628     return vmdk_parse_extents(buf, bs, bs->file->filename);
629 }
630 
631 static int vmdk_open(BlockDriverState *bs, int flags)
632 {
633     int ret;
634     BDRVVmdkState *s = bs->opaque;
635 
636     if (vmdk_open_sparse(bs, bs->file, flags) == 0) {
637         s->desc_offset = 0x200;
638     } else {
639         ret = vmdk_open_desc_file(bs, flags, 0);
640         if (ret) {
641             goto fail;
642         }
643     }
644     /* try to open parent images, if exist */
645     ret = vmdk_parent_open(bs);
646     if (ret) {
647         goto fail;
648     }
649     s->parent_cid = vmdk_read_cid(bs, 1);
650     qemu_co_mutex_init(&s->lock);
651     return ret;
652 
653 fail:
654     vmdk_free_extents(bs);
655     return ret;
656 }
657 
658 static int get_whole_cluster(BlockDriverState *bs,
659                 VmdkExtent *extent,
660                 uint64_t cluster_offset,
661                 uint64_t offset,
662                 bool allocate)
663 {
664     /* 128 sectors * 512 bytes each = grain size 64KB */
665     uint8_t  whole_grain[extent->cluster_sectors * 512];
666 
667     /* we will be here if it's first write on non-exist grain(cluster).
668      * try to read from parent image, if exist */
669     if (bs->backing_hd) {
670         int ret;
671 
672         if (!vmdk_is_cid_valid(bs)) {
673             return -1;
674         }
675 
676         /* floor offset to cluster */
677         offset -= offset % (extent->cluster_sectors * 512);
678         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
679                 extent->cluster_sectors);
680         if (ret < 0) {
681             return -1;
682         }
683 
684         /* Write grain only into the active image */
685         ret = bdrv_write(extent->file, cluster_offset, whole_grain,
686                 extent->cluster_sectors);
687         if (ret < 0) {
688             return -1;
689         }
690     }
691     return 0;
692 }
693 
694 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
695 {
696     /* update L2 table */
697     if (bdrv_pwrite_sync(
698                 extent->file,
699                 ((int64_t)m_data->l2_offset * 512)
700                     + (m_data->l2_index * sizeof(m_data->offset)),
701                 &(m_data->offset),
702                 sizeof(m_data->offset)
703             ) < 0) {
704         return -1;
705     }
706     /* update backup L2 table */
707     if (extent->l1_backup_table_offset != 0) {
708         m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
709         if (bdrv_pwrite_sync(
710                     extent->file,
711                     ((int64_t)m_data->l2_offset * 512)
712                         + (m_data->l2_index * sizeof(m_data->offset)),
713                     &(m_data->offset), sizeof(m_data->offset)
714                 ) < 0) {
715             return -1;
716         }
717     }
718 
719     return 0;
720 }
721 
722 static int get_cluster_offset(BlockDriverState *bs,
723                                     VmdkExtent *extent,
724                                     VmdkMetaData *m_data,
725                                     uint64_t offset,
726                                     int allocate,
727                                     uint64_t *cluster_offset)
728 {
729     unsigned int l1_index, l2_offset, l2_index;
730     int min_index, i, j;
731     uint32_t min_count, *l2_table, tmp = 0;
732 
733     if (m_data) {
734         m_data->valid = 0;
735     }
736     if (extent->flat) {
737         *cluster_offset = extent->flat_start_offset;
738         return 0;
739     }
740 
741     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
742     l1_index = (offset >> 9) / extent->l1_entry_sectors;
743     if (l1_index >= extent->l1_size) {
744         return -1;
745     }
746     l2_offset = extent->l1_table[l1_index];
747     if (!l2_offset) {
748         return -1;
749     }
750     for (i = 0; i < L2_CACHE_SIZE; i++) {
751         if (l2_offset == extent->l2_cache_offsets[i]) {
752             /* increment the hit count */
753             if (++extent->l2_cache_counts[i] == 0xffffffff) {
754                 for (j = 0; j < L2_CACHE_SIZE; j++) {
755                     extent->l2_cache_counts[j] >>= 1;
756                 }
757             }
758             l2_table = extent->l2_cache + (i * extent->l2_size);
759             goto found;
760         }
761     }
762     /* not found: load a new entry in the least used one */
763     min_index = 0;
764     min_count = 0xffffffff;
765     for (i = 0; i < L2_CACHE_SIZE; i++) {
766         if (extent->l2_cache_counts[i] < min_count) {
767             min_count = extent->l2_cache_counts[i];
768             min_index = i;
769         }
770     }
771     l2_table = extent->l2_cache + (min_index * extent->l2_size);
772     if (bdrv_pread(
773                 extent->file,
774                 (int64_t)l2_offset * 512,
775                 l2_table,
776                 extent->l2_size * sizeof(uint32_t)
777             ) != extent->l2_size * sizeof(uint32_t)) {
778         return -1;
779     }
780 
781     extent->l2_cache_offsets[min_index] = l2_offset;
782     extent->l2_cache_counts[min_index] = 1;
783  found:
784     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
785     *cluster_offset = le32_to_cpu(l2_table[l2_index]);
786 
787     if (!*cluster_offset) {
788         if (!allocate) {
789             return -1;
790         }
791 
792         /* Avoid the L2 tables update for the images that have snapshots. */
793         *cluster_offset = bdrv_getlength(extent->file);
794         if (!extent->compressed) {
795             bdrv_truncate(
796                 extent->file,
797                 *cluster_offset + (extent->cluster_sectors << 9)
798             );
799         }
800 
801         *cluster_offset >>= 9;
802         tmp = cpu_to_le32(*cluster_offset);
803         l2_table[l2_index] = tmp;
804 
805         /* First of all we write grain itself, to avoid race condition
806          * that may to corrupt the image.
807          * This problem may occur because of insufficient space on host disk
808          * or inappropriate VM shutdown.
809          */
810         if (get_whole_cluster(
811                 bs, extent, *cluster_offset, offset, allocate) == -1) {
812             return -1;
813         }
814 
815         if (m_data) {
816             m_data->offset = tmp;
817             m_data->l1_index = l1_index;
818             m_data->l2_index = l2_index;
819             m_data->l2_offset = l2_offset;
820             m_data->valid = 1;
821         }
822     }
823     *cluster_offset <<= 9;
824     return 0;
825 }
826 
827 static VmdkExtent *find_extent(BDRVVmdkState *s,
828                                 int64_t sector_num, VmdkExtent *start_hint)
829 {
830     VmdkExtent *extent = start_hint;
831 
832     if (!extent) {
833         extent = &s->extents[0];
834     }
835     while (extent < &s->extents[s->num_extents]) {
836         if (sector_num < extent->end_sector) {
837             return extent;
838         }
839         extent++;
840     }
841     return NULL;
842 }
843 
844 static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
845                              int nb_sectors, int *pnum)
846 {
847     BDRVVmdkState *s = bs->opaque;
848     int64_t index_in_cluster, n, ret;
849     uint64_t offset;
850     VmdkExtent *extent;
851 
852     extent = find_extent(s, sector_num, NULL);
853     if (!extent) {
854         return 0;
855     }
856     ret = get_cluster_offset(bs, extent, NULL,
857                             sector_num * 512, 0, &offset);
858     /* get_cluster_offset returning 0 means success */
859     ret = !ret;
860 
861     index_in_cluster = sector_num % extent->cluster_sectors;
862     n = extent->cluster_sectors - index_in_cluster;
863     if (n > nb_sectors) {
864         n = nb_sectors;
865     }
866     *pnum = n;
867     return ret;
868 }
869 
870 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
871                             int64_t offset_in_cluster, const uint8_t *buf,
872                             int nb_sectors, int64_t sector_num)
873 {
874     int ret;
875     VmdkGrainMarker *data = NULL;
876     uLongf buf_len;
877     const uint8_t *write_buf = buf;
878     int write_len = nb_sectors * 512;
879 
880     if (extent->compressed) {
881         if (!extent->has_marker) {
882             ret = -EINVAL;
883             goto out;
884         }
885         buf_len = (extent->cluster_sectors << 9) * 2;
886         data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
887         if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
888                 buf_len == 0) {
889             ret = -EINVAL;
890             goto out;
891         }
892         data->lba = sector_num;
893         data->size = buf_len;
894         write_buf = (uint8_t *)data;
895         write_len = buf_len + sizeof(VmdkGrainMarker);
896     }
897     ret = bdrv_pwrite(extent->file,
898                         cluster_offset + offset_in_cluster,
899                         write_buf,
900                         write_len);
901     if (ret != write_len) {
902         ret = ret < 0 ? ret : -EIO;
903         goto out;
904     }
905     ret = 0;
906  out:
907     g_free(data);
908     return ret;
909 }
910 
911 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
912                             int64_t offset_in_cluster, uint8_t *buf,
913                             int nb_sectors)
914 {
915     int ret;
916     int cluster_bytes, buf_bytes;
917     uint8_t *cluster_buf, *compressed_data;
918     uint8_t *uncomp_buf;
919     uint32_t data_len;
920     VmdkGrainMarker *marker;
921     uLongf buf_len;
922 
923 
924     if (!extent->compressed) {
925         ret = bdrv_pread(extent->file,
926                           cluster_offset + offset_in_cluster,
927                           buf, nb_sectors * 512);
928         if (ret == nb_sectors * 512) {
929             return 0;
930         } else {
931             return -EIO;
932         }
933     }
934     cluster_bytes = extent->cluster_sectors * 512;
935     /* Read two clusters in case GrainMarker + compressed data > one cluster */
936     buf_bytes = cluster_bytes * 2;
937     cluster_buf = g_malloc(buf_bytes);
938     uncomp_buf = g_malloc(cluster_bytes);
939     ret = bdrv_pread(extent->file,
940                 cluster_offset,
941                 cluster_buf, buf_bytes);
942     if (ret < 0) {
943         goto out;
944     }
945     compressed_data = cluster_buf;
946     buf_len = cluster_bytes;
947     data_len = cluster_bytes;
948     if (extent->has_marker) {
949         marker = (VmdkGrainMarker *)cluster_buf;
950         compressed_data = marker->data;
951         data_len = le32_to_cpu(marker->size);
952     }
953     if (!data_len || data_len > buf_bytes) {
954         ret = -EINVAL;
955         goto out;
956     }
957     ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
958     if (ret != Z_OK) {
959         ret = -EINVAL;
960         goto out;
961 
962     }
963     if (offset_in_cluster < 0 ||
964             offset_in_cluster + nb_sectors * 512 > buf_len) {
965         ret = -EINVAL;
966         goto out;
967     }
968     memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
969     ret = 0;
970 
971  out:
972     g_free(uncomp_buf);
973     g_free(cluster_buf);
974     return ret;
975 }
976 
977 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
978                     uint8_t *buf, int nb_sectors)
979 {
980     BDRVVmdkState *s = bs->opaque;
981     int ret;
982     uint64_t n, index_in_cluster;
983     VmdkExtent *extent = NULL;
984     uint64_t cluster_offset;
985 
986     while (nb_sectors > 0) {
987         extent = find_extent(s, sector_num, extent);
988         if (!extent) {
989             return -EIO;
990         }
991         ret = get_cluster_offset(
992                             bs, extent, NULL,
993                             sector_num << 9, 0, &cluster_offset);
994         index_in_cluster = sector_num % extent->cluster_sectors;
995         n = extent->cluster_sectors - index_in_cluster;
996         if (n > nb_sectors) {
997             n = nb_sectors;
998         }
999         if (ret) {
1000             /* if not allocated, try to read from parent image, if exist */
1001             if (bs->backing_hd) {
1002                 if (!vmdk_is_cid_valid(bs)) {
1003                     return -EINVAL;
1004                 }
1005                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
1006                 if (ret < 0) {
1007                     return ret;
1008                 }
1009             } else {
1010                 memset(buf, 0, 512 * n);
1011             }
1012         } else {
1013             ret = vmdk_read_extent(extent,
1014                             cluster_offset, index_in_cluster * 512,
1015                             buf, n);
1016             if (ret) {
1017                 return ret;
1018             }
1019         }
1020         nb_sectors -= n;
1021         sector_num += n;
1022         buf += n * 512;
1023     }
1024     return 0;
1025 }
1026 
1027 static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
1028                                      uint8_t *buf, int nb_sectors)
1029 {
1030     int ret;
1031     BDRVVmdkState *s = bs->opaque;
1032     qemu_co_mutex_lock(&s->lock);
1033     ret = vmdk_read(bs, sector_num, buf, nb_sectors);
1034     qemu_co_mutex_unlock(&s->lock);
1035     return ret;
1036 }
1037 
1038 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
1039                      const uint8_t *buf, int nb_sectors)
1040 {
1041     BDRVVmdkState *s = bs->opaque;
1042     VmdkExtent *extent = NULL;
1043     int n, ret;
1044     int64_t index_in_cluster;
1045     uint64_t cluster_offset;
1046     VmdkMetaData m_data;
1047 
1048     if (sector_num > bs->total_sectors) {
1049         fprintf(stderr,
1050                 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
1051                 " total_sectors=0x%" PRIx64 "\n",
1052                 sector_num, bs->total_sectors);
1053         return -EIO;
1054     }
1055 
1056     while (nb_sectors > 0) {
1057         extent = find_extent(s, sector_num, extent);
1058         if (!extent) {
1059             return -EIO;
1060         }
1061         ret = get_cluster_offset(
1062                                 bs,
1063                                 extent,
1064                                 &m_data,
1065                                 sector_num << 9, !extent->compressed,
1066                                 &cluster_offset);
1067         if (extent->compressed) {
1068             if (ret == 0) {
1069                 /* Refuse write to allocated cluster for streamOptimized */
1070                 fprintf(stderr,
1071                         "VMDK: can't write to allocated cluster"
1072                         " for streamOptimized\n");
1073                 return -EIO;
1074             } else {
1075                 /* allocate */
1076                 ret = get_cluster_offset(
1077                                         bs,
1078                                         extent,
1079                                         &m_data,
1080                                         sector_num << 9, 1,
1081                                         &cluster_offset);
1082             }
1083         }
1084         if (ret) {
1085             return -EINVAL;
1086         }
1087         index_in_cluster = sector_num % extent->cluster_sectors;
1088         n = extent->cluster_sectors - index_in_cluster;
1089         if (n > nb_sectors) {
1090             n = nb_sectors;
1091         }
1092 
1093         ret = vmdk_write_extent(extent,
1094                         cluster_offset, index_in_cluster * 512,
1095                         buf, n, sector_num);
1096         if (ret) {
1097             return ret;
1098         }
1099         if (m_data.valid) {
1100             /* update L2 tables */
1101             if (vmdk_L2update(extent, &m_data) == -1) {
1102                 return -EIO;
1103             }
1104         }
1105         nb_sectors -= n;
1106         sector_num += n;
1107         buf += n * 512;
1108 
1109         /* update CID on the first write every time the virtual disk is
1110          * opened */
1111         if (!s->cid_updated) {
1112             vmdk_write_cid(bs, time(NULL));
1113             s->cid_updated = true;
1114         }
1115     }
1116     return 0;
1117 }
1118 
1119 static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
1120                                       const uint8_t *buf, int nb_sectors)
1121 {
1122     int ret;
1123     BDRVVmdkState *s = bs->opaque;
1124     qemu_co_mutex_lock(&s->lock);
1125     ret = vmdk_write(bs, sector_num, buf, nb_sectors);
1126     qemu_co_mutex_unlock(&s->lock);
1127     return ret;
1128 }
1129 
1130 
1131 static int vmdk_create_extent(const char *filename, int64_t filesize,
1132                               bool flat, bool compress)
1133 {
1134     int ret, i;
1135     int fd = 0;
1136     VMDK4Header header;
1137     uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
1138 
1139     fd = open(
1140         filename,
1141         O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1142         0644);
1143     if (fd < 0) {
1144         return -errno;
1145     }
1146     if (flat) {
1147         ret = ftruncate(fd, filesize);
1148         if (ret < 0) {
1149             ret = -errno;
1150         }
1151         goto exit;
1152     }
1153     magic = cpu_to_be32(VMDK4_MAGIC);
1154     memset(&header, 0, sizeof(header));
1155     header.version = 1;
1156     header.flags =
1157         3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0);
1158     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1159     header.capacity = filesize / 512;
1160     header.granularity = 128;
1161     header.num_gtes_per_gte = 512;
1162 
1163     grains = (filesize / 512 + header.granularity - 1) / header.granularity;
1164     gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
1165     gt_count =
1166         (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
1167     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
1168 
1169     header.desc_offset = 1;
1170     header.desc_size = 20;
1171     header.rgd_offset = header.desc_offset + header.desc_size;
1172     header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
1173     header.grain_offset =
1174        ((header.gd_offset + gd_size + (gt_size * gt_count) +
1175          header.granularity - 1) / header.granularity) *
1176         header.granularity;
1177     /* swap endianness for all header fields */
1178     header.version = cpu_to_le32(header.version);
1179     header.flags = cpu_to_le32(header.flags);
1180     header.capacity = cpu_to_le64(header.capacity);
1181     header.granularity = cpu_to_le64(header.granularity);
1182     header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
1183     header.desc_offset = cpu_to_le64(header.desc_offset);
1184     header.desc_size = cpu_to_le64(header.desc_size);
1185     header.rgd_offset = cpu_to_le64(header.rgd_offset);
1186     header.gd_offset = cpu_to_le64(header.gd_offset);
1187     header.grain_offset = cpu_to_le64(header.grain_offset);
1188     header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1189 
1190     header.check_bytes[0] = 0xa;
1191     header.check_bytes[1] = 0x20;
1192     header.check_bytes[2] = 0xd;
1193     header.check_bytes[3] = 0xa;
1194 
1195     /* write all the data */
1196     ret = qemu_write_full(fd, &magic, sizeof(magic));
1197     if (ret != sizeof(magic)) {
1198         ret = -errno;
1199         goto exit;
1200     }
1201     ret = qemu_write_full(fd, &header, sizeof(header));
1202     if (ret != sizeof(header)) {
1203         ret = -errno;
1204         goto exit;
1205     }
1206 
1207     ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
1208     if (ret < 0) {
1209         ret = -errno;
1210         goto exit;
1211     }
1212 
1213     /* write grain directory */
1214     lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
1215     for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
1216          i < gt_count; i++, tmp += gt_size) {
1217         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1218         if (ret != sizeof(tmp)) {
1219             ret = -errno;
1220             goto exit;
1221         }
1222     }
1223 
1224     /* write backup grain directory */
1225     lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
1226     for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
1227          i < gt_count; i++, tmp += gt_size) {
1228         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1229         if (ret != sizeof(tmp)) {
1230             ret = -errno;
1231             goto exit;
1232         }
1233     }
1234 
1235     ret = 0;
1236  exit:
1237     close(fd);
1238     return ret;
1239 }
1240 
/*
 * Split filename into directory part, base name and extension.
 *
 * filename: name to split; must be non-NULL and non-empty
 * path:     receives everything up to and including the last separator
 *           ('/', or failing that '\\', or failing that ':'); empty if
 *           there is no separator
 * prefix:   receives the base name up to (not including) its last '.'
 * postfix:  receives the extension including the leading '.', or an
 *           empty string if the base name contains no '.'
 * buf_len:  size in bytes of each of the three output buffers
 *
 * Returns 0 on success.  Returns -1 if no filename was given or if any
 * component does not fit into buf_len bytes including its terminating
 * NUL; components are never silently truncated.
 */
static int filename_decompose(const char *filename, char *path, char *prefix,
        char *postfix, size_t buf_len)
{
    const char *base, *dot;
    size_t path_len, prefix_len, postfix_len;

    if (filename == NULL || filename[0] == '\0') {
        fprintf(stderr, "Vmdk: no filename provided.\n");
        return -1;
    }

    /* Locate the last path separator, trying each kind in turn */
    base = strrchr(filename, '/');
    if (base == NULL) {
        base = strrchr(filename, '\\');
    }
    if (base == NULL) {
        base = strrchr(filename, ':');
    }
    if (base != NULL) {
        base++;
        path_len = base - filename;
    } else {
        base = filename;
        path_len = 0;
    }

    /* Split the base name at its last dot, if it has one */
    dot = strrchr(base, '.');
    if (dot != NULL) {
        prefix_len = dot - base;
        postfix_len = strlen(dot);
    } else {
        prefix_len = strlen(base);
        postfix_len = 0;
        dot = base + prefix_len;
    }

    /* Each component needs room for its terminating NUL as well */
    if (path_len >= buf_len || prefix_len >= buf_len ||
        postfix_len >= buf_len) {
        return -1;
    }

    memcpy(path, filename, path_len);
    path[path_len] = '\0';
    memcpy(prefix, base, prefix_len);
    prefix[prefix_len] = '\0';
    memcpy(postfix, dot, postfix_len);
    postfix[postfix_len] = '\0';
    return 0;
}
1280 
/*
 * Express target relative to the directory containing base.
 *
 * dest:      output buffer
 * dest_size: size of dest in bytes
 * base:      name of the file the result will be interpreted relative to
 * target:    name of the file to refer to
 *
 * If path_is_absolute() accepts target it is copied unchanged.  Otherwise
 * the leading directory components shared by base and target are dropped,
 * one ".." is emitted for every directory separator left in base, and the
 * rest of target is appended.  The result is truncated to fit dest.
 *
 * Returns 0 on success, -1 if an argument is NULL or dest_size is not
 * positive.
 */
static int relative_path(char *dest, int dest_size,
        const char *base, const char *target)
{
    int i = 0;
    int common = 0;
    int n = 0;
    const char *p, *q;
#ifdef _WIN32
    const char *sep = "\\";
#else
    const char *sep = "/";
#endif

    if (!(dest && base && target) || dest_size <= 0) {
        return -1;
    }
    if (path_is_absolute(target)) {
        pstrcpy(dest, dest_size, target);
        return 0;
    }
    /*
     * Find the shared leading directories.  Stop at the end of base so
     * that identical strings do not send the scan past the terminator,
     * and remember only the position just after the last shared
     * separator: a partially matching component ("ba" vs "base") is not
     * a shared directory.
     */
    while (base[i] && base[i] == target[i]) {
        if (base[i] == *sep) {
            common = i + 1;
        }
        i++;
    }
    p = &base[common];
    q = &target[common];
    /* every directory level left in base needs one ".." */
    while (*p) {
        if (*p == *sep) {
            n++;
        }
        p++;
    }
    dest[0] = '\0';
    for (; n; n--) {
        pstrcat(dest, dest_size, "..");
        pstrcat(dest, dest_size, sep);
    }
    pstrcat(dest, dest_size, q);
    return 0;
}
1320 
1321 static int vmdk_create(const char *filename, QEMUOptionParameter *options)
1322 {
1323     int fd, idx = 0;
1324     char desc[BUF_SIZE];
1325     int64_t total_size = 0, filesize;
1326     const char *backing_file = NULL;
1327     const char *fmt = NULL;
1328     int flags = 0;
1329     int ret = 0;
1330     bool flat, split, compress;
1331     char ext_desc_lines[BUF_SIZE] = "";
1332     char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX];
1333     const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
1334     const char *desc_extent_line;
1335     char parent_desc_line[BUF_SIZE] = "";
1336     uint32_t parent_cid = 0xffffffff;
1337     const char desc_template[] =
1338         "# Disk DescriptorFile\n"
1339         "version=1\n"
1340         "CID=%x\n"
1341         "parentCID=%x\n"
1342         "createType=\"%s\"\n"
1343         "%s"
1344         "\n"
1345         "# Extent description\n"
1346         "%s"
1347         "\n"
1348         "# The Disk Data Base\n"
1349         "#DDB\n"
1350         "\n"
1351         "ddb.virtualHWVersion = \"%d\"\n"
1352         "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
1353         "ddb.geometry.heads = \"16\"\n"
1354         "ddb.geometry.sectors = \"63\"\n"
1355         "ddb.adapterType = \"ide\"\n";
1356 
1357     if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) {
1358         return -EINVAL;
1359     }
1360     /* Read out options */
1361     while (options && options->name) {
1362         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1363             total_size = options->value.n;
1364         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1365             backing_file = options->value.s;
1366         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
1367             flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
1368         } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
1369             fmt = options->value.s;
1370         }
1371         options++;
1372     }
1373     if (!fmt) {
1374         /* Default format to monolithicSparse */
1375         fmt = "monolithicSparse";
1376     } else if (strcmp(fmt, "monolithicFlat") &&
1377                strcmp(fmt, "monolithicSparse") &&
1378                strcmp(fmt, "twoGbMaxExtentSparse") &&
1379                strcmp(fmt, "twoGbMaxExtentFlat") &&
1380                strcmp(fmt, "streamOptimized")) {
1381         fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt);
1382         return -EINVAL;
1383     }
1384     split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
1385               strcmp(fmt, "twoGbMaxExtentSparse"));
1386     flat = !(strcmp(fmt, "monolithicFlat") &&
1387              strcmp(fmt, "twoGbMaxExtentFlat"));
1388     compress = !strcmp(fmt, "streamOptimized");
1389     if (flat) {
1390         desc_extent_line = "RW %lld FLAT \"%s\" 0\n";
1391     } else {
1392         desc_extent_line = "RW %lld SPARSE \"%s\"\n";
1393     }
1394     if (flat && backing_file) {
1395         /* not supporting backing file for flat image */
1396         return -ENOTSUP;
1397     }
1398     if (backing_file) {
1399         char parent_filename[PATH_MAX];
1400         BlockDriverState *bs = bdrv_new("");
1401         ret = bdrv_open(bs, backing_file, 0, NULL);
1402         if (ret != 0) {
1403             bdrv_delete(bs);
1404             return ret;
1405         }
1406         if (strcmp(bs->drv->format_name, "vmdk")) {
1407             bdrv_delete(bs);
1408             return -EINVAL;
1409         }
1410         filesize = bdrv_getlength(bs);
1411         parent_cid = vmdk_read_cid(bs, 0);
1412         bdrv_delete(bs);
1413         relative_path(parent_filename, sizeof(parent_filename),
1414                       filename, backing_file);
1415         snprintf(parent_desc_line, sizeof(parent_desc_line),
1416                 "parentFileNameHint=\"%s\"", parent_filename);
1417     }
1418 
1419     /* Create extents */
1420     filesize = total_size;
1421     while (filesize > 0) {
1422         char desc_line[BUF_SIZE];
1423         char ext_filename[PATH_MAX];
1424         char desc_filename[PATH_MAX];
1425         int64_t size = filesize;
1426 
1427         if (split && size > split_size) {
1428             size = split_size;
1429         }
1430         if (split) {
1431             snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s",
1432                     prefix, flat ? 'f' : 's', ++idx, postfix);
1433         } else if (flat) {
1434             snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s",
1435                     prefix, postfix);
1436         } else {
1437             snprintf(desc_filename, sizeof(desc_filename), "%s%s",
1438                     prefix, postfix);
1439         }
1440         snprintf(ext_filename, sizeof(ext_filename), "%s%s",
1441                 path, desc_filename);
1442 
1443         if (vmdk_create_extent(ext_filename, size, flat, compress)) {
1444             return -EINVAL;
1445         }
1446         filesize -= size;
1447 
1448         /* Format description line */
1449         snprintf(desc_line, sizeof(desc_line),
1450                     desc_extent_line, size / 512, desc_filename);
1451         pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line);
1452     }
1453     /* generate descriptor file */
1454     snprintf(desc, sizeof(desc), desc_template,
1455             (unsigned int)time(NULL),
1456             parent_cid,
1457             fmt,
1458             parent_desc_line,
1459             ext_desc_lines,
1460             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
1461             total_size / (int64_t)(63 * 16 * 512));
1462     if (split || flat) {
1463         fd = open(
1464                 filename,
1465                 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1466                 0644);
1467     } else {
1468         fd = open(
1469                 filename,
1470                 O_WRONLY | O_BINARY | O_LARGEFILE,
1471                 0644);
1472     }
1473     if (fd < 0) {
1474         return -errno;
1475     }
1476     /* the descriptor offset = 0x200 */
1477     if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
1478         ret = -errno;
1479         goto exit;
1480     }
1481     ret = qemu_write_full(fd, desc, strlen(desc));
1482     if (ret != strlen(desc)) {
1483         ret = -errno;
1484         goto exit;
1485     }
1486     ret = 0;
1487 exit:
1488     close(fd);
1489     return ret;
1490 }
1491 
1492 static void vmdk_close(BlockDriverState *bs)
1493 {
1494     vmdk_free_extents(bs);
1495 }
1496 
1497 static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
1498 {
1499     int i, ret, err;
1500     BDRVVmdkState *s = bs->opaque;
1501 
1502     ret = bdrv_co_flush(bs->file);
1503     for (i = 0; i < s->num_extents; i++) {
1504         err = bdrv_co_flush(s->extents[i].file);
1505         if (err < 0) {
1506             ret = err;
1507         }
1508     }
1509     return ret;
1510 }
1511 
1512 static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
1513 {
1514     int i;
1515     int64_t ret = 0;
1516     int64_t r;
1517     BDRVVmdkState *s = bs->opaque;
1518 
1519     ret = bdrv_get_allocated_file_size(bs->file);
1520     if (ret < 0) {
1521         return ret;
1522     }
1523     for (i = 0; i < s->num_extents; i++) {
1524         if (s->extents[i].file == bs->file) {
1525             continue;
1526         }
1527         r = bdrv_get_allocated_file_size(s->extents[i].file);
1528         if (r < 0) {
1529             return r;
1530         }
1531         ret += r;
1532     }
1533     return ret;
1534 }
1535 
1536 static QEMUOptionParameter vmdk_create_options[] = {
1537     {
1538         .name = BLOCK_OPT_SIZE,
1539         .type = OPT_SIZE,
1540         .help = "Virtual disk size"
1541     },
1542     {
1543         .name = BLOCK_OPT_BACKING_FILE,
1544         .type = OPT_STRING,
1545         .help = "File name of a base image"
1546     },
1547     {
1548         .name = BLOCK_OPT_COMPAT6,
1549         .type = OPT_FLAG,
1550         .help = "VMDK version 6 image"
1551     },
1552     {
1553         .name = BLOCK_OPT_SUBFMT,
1554         .type = OPT_STRING,
1555         .help =
1556             "VMDK flat extent format, can be one of "
1557             "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
1558     },
1559     { NULL }
1560 };
1561 
1562 static BlockDriver bdrv_vmdk = {
1563     .format_name    = "vmdk",
1564     .instance_size  = sizeof(BDRVVmdkState),
1565     .bdrv_probe     = vmdk_probe,
1566     .bdrv_open      = vmdk_open,
1567     .bdrv_read      = vmdk_co_read,
1568     .bdrv_write     = vmdk_co_write,
1569     .bdrv_close     = vmdk_close,
1570     .bdrv_create    = vmdk_create,
1571     .bdrv_co_flush  = vmdk_co_flush,
1572     .bdrv_is_allocated  = vmdk_is_allocated,
1573     .bdrv_get_allocated_file_size  = vmdk_get_allocated_file_size,
1574 
1575     .create_options = vmdk_create_options,
1576 };
1577 
1578 static void bdrv_vmdk_init(void)
1579 {
1580     bdrv_register(&bdrv_vmdk);
1581 }
1582 
1583 block_init(bdrv_vmdk_init);
1584