xref: /openbmc/qemu/block/vmdk.c (revision 2345c77c)
1 /*
2  * Block driver for the VMDK format
3  *
4  * Copyright (c) 2004 Fabrice Bellard
5  * Copyright (c) 2005 Filip Navara
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu-common.h"
27 #include "block_int.h"
28 #include "module.h"
29 
/*
 * Image magics, as obtained by reading the first four bytes of the file as a
 * big-endian 32-bit integer: "COWD" for VMDK3 images, "KDMV" for VMDK4.
 */
#define VMDK3_MAGIC ('D' | ('W' << 8) | ('O' << 16) | ('C' << 24))
#define VMDK4_MAGIC ('V' | ('M' << 8) | ('D' << 16) | ('K' << 24))
32 
/*
 * On-disk VMDK3 header, read from the file right after the 4-byte magic.
 * The fields this driver reads are converted with le32_to_cpu().
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* virtual disk size, in sectors */
    uint32_t granularity;       /* sectors per cluster */
    uint32_t l1dir_offset;      /* position of the L1 table, in sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
45 
/*
 * On-disk VMDK4 header, read from the file right after the 4-byte magic.
 * The structure is packed so that it can be read and written as one blob;
 * multi-byte fields are little-endian on disk.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;           /* virtual disk size, in sectors */
    int64_t granularity;        /* sectors per grain (cluster) */
    int64_t desc_offset;        /* embedded descriptor position, in sectors */
    int64_t desc_size;          /* embedded descriptor size, in sectors */
    int32_t num_gtes_per_gte;   /* number of entries in one L2 table */
    int64_t rgd_offset;         /* L1 table position, in sectors */
    int64_t gd_offset;          /* backup L1 table position, in sectors */
    int64_t grain_offset;       /* size of the metadata area, in sectors */
    char filler[1];
    char check_bytes[4];        /* vmdk_create() stores 0x0a 0x20 0x0d 0x0a */
} __attribute__((packed)) VMDK4Header;
60 
/* Number of L2 tables kept in memory at the same time. */
#define L2_CACHE_SIZE 16

/* Per-image driver state (bs->opaque). */
typedef struct BDRVVmdkState {
    /* L1 table and its optional backup copy */
    int64_t l1_table_offset;            /* byte offset in the image file */
    int64_t l1_backup_table_offset;     /* byte offset, 0 when there is none */
    uint32_t *l1_table;                 /* entries converted to host order */
    uint32_t *l1_backup_table;          /* entries converted to host order */
    unsigned int l1_size;               /* number of L1 entries */
    uint32_t l1_entry_sectors;          /* sectors covered by one L1 entry */

    /* L2 tables and their cache */
    unsigned int l2_size;               /* number of entries in one L2 table */
    uint32_t *l2_cache;                 /* L2_CACHE_SIZE tables, kept in
                                           on-disk (little-endian) order */
    uint32_t l2_cache_offsets[L2_CACHE_SIZE]; /* L1 entry value of the table
                                                 cached in each slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* hit counter of each slot */

    unsigned int cluster_sectors;       /* sectors per cluster (grain) */
    uint32_t parent_cid;                /* parentCID read when opening */
} BDRVVmdkState;
79 
/*
 * Description of a pending L2 table update, filled in by
 * get_cluster_offset() when it allocates a cluster and consumed by
 * vmdk_L2update().
 */
typedef struct VmdkMetaData {
    uint32_t offset;            /* L2 entry to store, already little-endian */
    unsigned int l1_index;      /* index of the L1 entry that was used */
    unsigned int l2_index;      /* index of the entry inside the L2 table */
    unsigned int l2_offset;     /* position of the L2 table, in sectors */
    int valid;                  /* non-zero when an update is pending */
} VmdkMetaData;
87 
88 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
89 {
90     uint32_t magic;
91 
92     if (buf_size < 4)
93         return 0;
94     magic = be32_to_cpu(*(uint32_t *)buf);
95     if (magic == VMDK3_MAGIC ||
96         magic == VMDK4_MAGIC)
97         return 100;
98     else
99         return 0;
100 }
101 
/* When defined, vmdk_is_cid_valid() compares CIDs with the backing image. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/*
 * Size in bytes of the embedded descriptor: 20 sectors of 512 bytes each.
 * Parenthesised so that the macro can be used inside larger expressions.
 */
#define DESC_SIZE (20 * SECTOR_SIZE)
/* Size in bytes of the image header: the first sector of the file. */
#define HEADER_SIZE 512
107 
108 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
109 {
110     char desc[DESC_SIZE];
111     uint32_t cid;
112     const char *p_name, *cid_str;
113     size_t cid_str_size;
114 
115     /* the descriptor offset = 0x200 */
116     if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
117         return 0;
118 
119     if (parent) {
120         cid_str = "parentCID";
121         cid_str_size = sizeof("parentCID");
122     } else {
123         cid_str = "CID";
124         cid_str_size = sizeof("CID");
125     }
126 
127     if ((p_name = strstr(desc,cid_str)) != NULL) {
128         p_name += cid_str_size;
129         sscanf(p_name,"%x",&cid);
130     }
131 
132     return cid;
133 }
134 
135 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
136 {
137     char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
138     char *p_name, *tmp_str;
139 
140     /* the descriptor offset = 0x200 */
141     if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
142         return -1;
143 
144     tmp_str = strstr(desc,"parentCID");
145     pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
146     if ((p_name = strstr(desc,"CID")) != NULL) {
147         p_name += sizeof("CID");
148         snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
149         pstrcat(desc, sizeof(desc), tmp_desc);
150     }
151 
152     if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0)
153         return -1;
154     return 0;
155 }
156 
157 static int vmdk_is_cid_valid(BlockDriverState *bs)
158 {
159 #ifdef CHECK_CID
160     BDRVVmdkState *s = bs->opaque;
161     BlockDriverState *p_bs = bs->backing_hd;
162     uint32_t cur_pcid;
163 
164     if (p_bs) {
165         cur_pcid = vmdk_read_cid(p_bs,0);
166         if (s->parent_cid != cur_pcid)
167             // CID not valid
168             return 0;
169     }
170 #endif
171     // CID valid
172     return 1;
173 }
174 
175 static int vmdk_snapshot_create(const char *filename, const char *backing_file)
176 {
177     int snp_fd, p_fd;
178     int ret;
179     uint32_t p_cid;
180     char *p_name, *gd_buf, *rgd_buf;
181     const char *real_filename, *temp_str;
182     VMDK4Header header;
183     uint32_t gde_entries, gd_size;
184     int64_t gd_offset, rgd_offset, capacity, gt_size;
185     char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
186     static const char desc_template[] =
187     "# Disk DescriptorFile\n"
188     "version=1\n"
189     "CID=%x\n"
190     "parentCID=%x\n"
191     "createType=\"monolithicSparse\"\n"
192     "parentFileNameHint=\"%s\"\n"
193     "\n"
194     "# Extent description\n"
195     "RW %u SPARSE \"%s\"\n"
196     "\n"
197     "# The Disk Data Base \n"
198     "#DDB\n"
199     "\n";
200 
201     snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
202     if (snp_fd < 0)
203         return -errno;
204     p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
205     if (p_fd < 0) {
206         close(snp_fd);
207         return -errno;
208     }
209 
210     /* read the header */
211     if (lseek(p_fd, 0x0, SEEK_SET) == -1) {
212         ret = -errno;
213         goto fail;
214     }
215     if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE) {
216         ret = -errno;
217         goto fail;
218     }
219 
220     /* write the header */
221     if (lseek(snp_fd, 0x0, SEEK_SET) == -1) {
222         ret = -errno;
223         goto fail;
224     }
225     if (write(snp_fd, hdr, HEADER_SIZE) == -1) {
226         ret = -errno;
227         goto fail;
228     }
229 
230     memset(&header, 0, sizeof(header));
231     memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
232 
233     if (ftruncate(snp_fd, header.grain_offset << 9)) {
234         ret = -errno;
235         goto fail;
236     }
237     /* the descriptor offset = 0x200 */
238     if (lseek(p_fd, 0x200, SEEK_SET) == -1) {
239         ret = -errno;
240         goto fail;
241     }
242     if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE) {
243         ret = -errno;
244         goto fail;
245     }
246 
247     if ((p_name = strstr(p_desc,"CID")) != NULL) {
248         p_name += sizeof("CID");
249         sscanf(p_name,"%x",&p_cid);
250     }
251 
252     real_filename = filename;
253     if ((temp_str = strrchr(real_filename, '\\')) != NULL)
254         real_filename = temp_str + 1;
255     if ((temp_str = strrchr(real_filename, '/')) != NULL)
256         real_filename = temp_str + 1;
257     if ((temp_str = strrchr(real_filename, ':')) != NULL)
258         real_filename = temp_str + 1;
259 
260     snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
261              (uint32_t)header.capacity, real_filename);
262 
263     /* write the descriptor */
264     if (lseek(snp_fd, 0x200, SEEK_SET) == -1) {
265         ret = -errno;
266         goto fail;
267     }
268     if (write(snp_fd, s_desc, strlen(s_desc)) == -1) {
269         ret = -errno;
270         goto fail;
271     }
272 
273     gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
274     rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
275     capacity = header.capacity * SECTOR_SIZE;       // Extent size
276     /*
277      * Each GDE span 32M disk, means:
278      * 512 GTE per GT, each GTE points to grain
279      */
280     gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
281     if (!gt_size) {
282         ret = -EINVAL;
283         goto fail;
284     }
285     gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
286     gd_size = gde_entries * sizeof(uint32_t);
287 
288     /* write RGD */
289     rgd_buf = qemu_malloc(gd_size);
290     if (lseek(p_fd, rgd_offset, SEEK_SET) == -1) {
291         ret = -errno;
292         goto fail_rgd;
293     }
294     if (read(p_fd, rgd_buf, gd_size) != gd_size) {
295         ret = -errno;
296         goto fail_rgd;
297     }
298     if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1) {
299         ret = -errno;
300         goto fail_rgd;
301     }
302     if (write(snp_fd, rgd_buf, gd_size) == -1) {
303         ret = -errno;
304         goto fail_rgd;
305     }
306 
307     /* write GD */
308     gd_buf = qemu_malloc(gd_size);
309     if (lseek(p_fd, gd_offset, SEEK_SET) == -1) {
310         ret = -errno;
311         goto fail_gd;
312     }
313     if (read(p_fd, gd_buf, gd_size) != gd_size) {
314         ret = -errno;
315         goto fail_gd;
316     }
317     if (lseek(snp_fd, gd_offset, SEEK_SET) == -1) {
318         ret = -errno;
319         goto fail_gd;
320     }
321     if (write(snp_fd, gd_buf, gd_size) == -1) {
322         ret = -errno;
323         goto fail_gd;
324     }
325     ret = 0;
326 
327 fail_gd:
328     qemu_free(gd_buf);
329 fail_rgd:
330     qemu_free(rgd_buf);
331 fail:
332     close(p_fd);
333     close(snp_fd);
334     return ret;
335 }
336 
337 static int vmdk_parent_open(BlockDriverState *bs)
338 {
339     char *p_name;
340     char desc[DESC_SIZE];
341 
342     /* the descriptor offset = 0x200 */
343     if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
344         return -1;
345 
346     if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
347         char *end_name;
348 
349         p_name += sizeof("parentFileNameHint") + 1;
350         if ((end_name = strchr(p_name,'\"')) == NULL)
351             return -1;
352         if ((end_name - p_name) > sizeof (bs->backing_file) - 1)
353             return -1;
354 
355         pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
356     }
357 
358     return 0;
359 }
360 
361 static int vmdk_open(BlockDriverState *bs, int flags)
362 {
363     BDRVVmdkState *s = bs->opaque;
364     uint32_t magic;
365     int l1_size, i;
366 
367     if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
368         goto fail;
369 
370     magic = be32_to_cpu(magic);
371     if (magic == VMDK3_MAGIC) {
372         VMDK3Header header;
373 
374         if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) != sizeof(header))
375             goto fail;
376         s->cluster_sectors = le32_to_cpu(header.granularity);
377         s->l2_size = 1 << 9;
378         s->l1_size = 1 << 6;
379         bs->total_sectors = le32_to_cpu(header.disk_sectors);
380         s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
381         s->l1_backup_table_offset = 0;
382         s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
383     } else if (magic == VMDK4_MAGIC) {
384         VMDK4Header header;
385 
386         if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) != sizeof(header))
387             goto fail;
388         bs->total_sectors = le64_to_cpu(header.capacity);
389         s->cluster_sectors = le64_to_cpu(header.granularity);
390         s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
391         s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
392         if (s->l1_entry_sectors <= 0)
393             goto fail;
394         s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
395             / s->l1_entry_sectors;
396         s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
397         s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
398 
399         // try to open parent images, if exist
400         if (vmdk_parent_open(bs) != 0)
401             goto fail;
402         // write the CID once after the image creation
403         s->parent_cid = vmdk_read_cid(bs,1);
404     } else {
405         goto fail;
406     }
407 
408     /* read the L1 table */
409     l1_size = s->l1_size * sizeof(uint32_t);
410     s->l1_table = qemu_malloc(l1_size);
411     if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, l1_size) != l1_size)
412         goto fail;
413     for(i = 0; i < s->l1_size; i++) {
414         le32_to_cpus(&s->l1_table[i]);
415     }
416 
417     if (s->l1_backup_table_offset) {
418         s->l1_backup_table = qemu_malloc(l1_size);
419         if (bdrv_pread(bs->file, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size)
420             goto fail;
421         for(i = 0; i < s->l1_size; i++) {
422             le32_to_cpus(&s->l1_backup_table[i]);
423         }
424     }
425 
426     s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
427     return 0;
428  fail:
429     qemu_free(s->l1_backup_table);
430     qemu_free(s->l1_table);
431     qemu_free(s->l2_cache);
432     return -1;
433 }
434 
435 static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
436                                    uint64_t offset, int allocate);
437 
438 static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
439                              uint64_t offset, int allocate)
440 {
441     BDRVVmdkState *s = bs->opaque;
442     uint8_t  whole_grain[s->cluster_sectors*512];        // 128 sectors * 512 bytes each = grain size 64KB
443 
444     // we will be here if it's first write on non-exist grain(cluster).
445     // try to read from parent image, if exist
446     if (bs->backing_hd) {
447         int ret;
448 
449         if (!vmdk_is_cid_valid(bs))
450             return -1;
451 
452         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
453             s->cluster_sectors);
454         if (ret < 0) {
455             return -1;
456         }
457 
458         //Write grain only into the active image
459         ret = bdrv_write(bs->file, cluster_offset, whole_grain,
460             s->cluster_sectors);
461         if (ret < 0) {
462             return -1;
463         }
464     }
465     return 0;
466 }
467 
468 static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
469 {
470     BDRVVmdkState *s = bs->opaque;
471 
472     /* update L2 table */
473     if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
474                     &(m_data->offset), sizeof(m_data->offset)) < 0)
475         return -1;
476     /* update backup L2 table */
477     if (s->l1_backup_table_offset != 0) {
478         m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
479         if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
480                         &(m_data->offset), sizeof(m_data->offset)) < 0)
481             return -1;
482     }
483 
484     return 0;
485 }
486 
487 static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
488                                    uint64_t offset, int allocate)
489 {
490     BDRVVmdkState *s = bs->opaque;
491     unsigned int l1_index, l2_offset, l2_index;
492     int min_index, i, j;
493     uint32_t min_count, *l2_table, tmp = 0;
494     uint64_t cluster_offset;
495 
496     if (m_data)
497         m_data->valid = 0;
498 
499     l1_index = (offset >> 9) / s->l1_entry_sectors;
500     if (l1_index >= s->l1_size)
501         return 0;
502     l2_offset = s->l1_table[l1_index];
503     if (!l2_offset)
504         return 0;
505     for(i = 0; i < L2_CACHE_SIZE; i++) {
506         if (l2_offset == s->l2_cache_offsets[i]) {
507             /* increment the hit count */
508             if (++s->l2_cache_counts[i] == 0xffffffff) {
509                 for(j = 0; j < L2_CACHE_SIZE; j++) {
510                     s->l2_cache_counts[j] >>= 1;
511                 }
512             }
513             l2_table = s->l2_cache + (i * s->l2_size);
514             goto found;
515         }
516     }
517     /* not found: load a new entry in the least used one */
518     min_index = 0;
519     min_count = 0xffffffff;
520     for(i = 0; i < L2_CACHE_SIZE; i++) {
521         if (s->l2_cache_counts[i] < min_count) {
522             min_count = s->l2_cache_counts[i];
523             min_index = i;
524         }
525     }
526     l2_table = s->l2_cache + (min_index * s->l2_size);
527     if (bdrv_pread(bs->file, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) !=
528                                                                         s->l2_size * sizeof(uint32_t))
529         return 0;
530 
531     s->l2_cache_offsets[min_index] = l2_offset;
532     s->l2_cache_counts[min_index] = 1;
533  found:
534     l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
535     cluster_offset = le32_to_cpu(l2_table[l2_index]);
536 
537     if (!cluster_offset) {
538         if (!allocate)
539             return 0;
540 
541         // Avoid the L2 tables update for the images that have snapshots.
542         cluster_offset = bdrv_getlength(bs->file);
543         bdrv_truncate(bs->file, cluster_offset + (s->cluster_sectors << 9));
544 
545         cluster_offset >>= 9;
546         tmp = cpu_to_le32(cluster_offset);
547         l2_table[l2_index] = tmp;
548 
549         /* First of all we write grain itself, to avoid race condition
550          * that may to corrupt the image.
551          * This problem may occur because of insufficient space on host disk
552          * or inappropriate VM shutdown.
553          */
554         if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
555             return 0;
556 
557         if (m_data) {
558             m_data->offset = tmp;
559             m_data->l1_index = l1_index;
560             m_data->l2_index = l2_index;
561             m_data->l2_offset = l2_offset;
562             m_data->valid = 1;
563         }
564     }
565     cluster_offset <<= 9;
566     return cluster_offset;
567 }
568 
569 static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
570                              int nb_sectors, int *pnum)
571 {
572     BDRVVmdkState *s = bs->opaque;
573     int index_in_cluster, n;
574     uint64_t cluster_offset;
575 
576     cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
577     index_in_cluster = sector_num % s->cluster_sectors;
578     n = s->cluster_sectors - index_in_cluster;
579     if (n > nb_sectors)
580         n = nb_sectors;
581     *pnum = n;
582     return (cluster_offset != 0);
583 }
584 
585 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
586                     uint8_t *buf, int nb_sectors)
587 {
588     BDRVVmdkState *s = bs->opaque;
589     int index_in_cluster, n, ret;
590     uint64_t cluster_offset;
591 
592     while (nb_sectors > 0) {
593         cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
594         index_in_cluster = sector_num % s->cluster_sectors;
595         n = s->cluster_sectors - index_in_cluster;
596         if (n > nb_sectors)
597             n = nb_sectors;
598         if (!cluster_offset) {
599             // try to read from parent image, if exist
600             if (bs->backing_hd) {
601                 if (!vmdk_is_cid_valid(bs))
602                     return -1;
603                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
604                 if (ret < 0)
605                     return -1;
606             } else {
607                 memset(buf, 0, 512 * n);
608             }
609         } else {
610             if(bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
611                 return -1;
612         }
613         nb_sectors -= n;
614         sector_num += n;
615         buf += n * 512;
616     }
617     return 0;
618 }
619 
620 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
621                      const uint8_t *buf, int nb_sectors)
622 {
623     BDRVVmdkState *s = bs->opaque;
624     VmdkMetaData m_data;
625     int index_in_cluster, n;
626     uint64_t cluster_offset;
627     static int cid_update = 0;
628 
629     if (sector_num > bs->total_sectors) {
630         fprintf(stderr,
631                 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
632                 " total_sectors=0x%" PRIx64 "\n",
633                 sector_num, bs->total_sectors);
634         return -1;
635     }
636 
637     while (nb_sectors > 0) {
638         index_in_cluster = sector_num & (s->cluster_sectors - 1);
639         n = s->cluster_sectors - index_in_cluster;
640         if (n > nb_sectors)
641             n = nb_sectors;
642         cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
643         if (!cluster_offset)
644             return -1;
645 
646         if (bdrv_pwrite(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
647             return -1;
648         if (m_data.valid) {
649             /* update L2 tables */
650             if (vmdk_L2update(bs, &m_data) == -1)
651                 return -1;
652         }
653         nb_sectors -= n;
654         sector_num += n;
655         buf += n * 512;
656 
657         // update CID on the first write every time the virtual disk is opened
658         if (!cid_update) {
659             vmdk_write_cid(bs, time(NULL));
660             cid_update++;
661         }
662     }
663     return 0;
664 }
665 
666 static int vmdk_create(const char *filename, QEMUOptionParameter *options)
667 {
668     int fd, i;
669     VMDK4Header header;
670     uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
671     static const char desc_template[] =
672         "# Disk DescriptorFile\n"
673         "version=1\n"
674         "CID=%x\n"
675         "parentCID=ffffffff\n"
676         "createType=\"monolithicSparse\"\n"
677         "\n"
678         "# Extent description\n"
679         "RW %" PRId64 " SPARSE \"%s\"\n"
680         "\n"
681         "# The Disk Data Base \n"
682         "#DDB\n"
683         "\n"
684         "ddb.virtualHWVersion = \"%d\"\n"
685         "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
686         "ddb.geometry.heads = \"16\"\n"
687         "ddb.geometry.sectors = \"63\"\n"
688         "ddb.adapterType = \"ide\"\n";
689     char desc[1024];
690     const char *real_filename, *temp_str;
691     int64_t total_size = 0;
692     const char *backing_file = NULL;
693     int flags = 0;
694     int ret;
695 
696     // Read out options
697     while (options && options->name) {
698         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
699             total_size = options->value.n / 512;
700         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
701             backing_file = options->value.s;
702         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
703             flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
704         }
705         options++;
706     }
707 
708     /* XXX: add support for backing file */
709     if (backing_file) {
710         return vmdk_snapshot_create(filename, backing_file);
711     }
712 
713     fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
714               0644);
715     if (fd < 0)
716         return -errno;
717     magic = cpu_to_be32(VMDK4_MAGIC);
718     memset(&header, 0, sizeof(header));
719     header.version = 1;
720     header.flags = 3; /* ?? */
721     header.capacity = total_size;
722     header.granularity = 128;
723     header.num_gtes_per_gte = 512;
724 
725     grains = (total_size + header.granularity - 1) / header.granularity;
726     gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
727     gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
728     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
729 
730     header.desc_offset = 1;
731     header.desc_size = 20;
732     header.rgd_offset = header.desc_offset + header.desc_size;
733     header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
734     header.grain_offset =
735        ((header.gd_offset + gd_size + (gt_size * gt_count) +
736          header.granularity - 1) / header.granularity) *
737         header.granularity;
738 
739     /* swap endianness for all header fields */
740     header.version = cpu_to_le32(header.version);
741     header.flags = cpu_to_le32(header.flags);
742     header.capacity = cpu_to_le64(header.capacity);
743     header.granularity = cpu_to_le64(header.granularity);
744     header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
745     header.desc_offset = cpu_to_le64(header.desc_offset);
746     header.desc_size = cpu_to_le64(header.desc_size);
747     header.rgd_offset = cpu_to_le64(header.rgd_offset);
748     header.gd_offset = cpu_to_le64(header.gd_offset);
749     header.grain_offset = cpu_to_le64(header.grain_offset);
750 
751     header.check_bytes[0] = 0xa;
752     header.check_bytes[1] = 0x20;
753     header.check_bytes[2] = 0xd;
754     header.check_bytes[3] = 0xa;
755 
756     /* write all the data */
757     ret = qemu_write_full(fd, &magic, sizeof(magic));
758     if (ret != sizeof(magic)) {
759         ret = -errno;
760         goto exit;
761     }
762     ret = qemu_write_full(fd, &header, sizeof(header));
763     if (ret != sizeof(header)) {
764         ret = -errno;
765         goto exit;
766     }
767 
768     ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
769     if (ret < 0) {
770         ret = -errno;
771         goto exit;
772     }
773 
774     /* write grain directory */
775     lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
776     for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
777          i < gt_count; i++, tmp += gt_size) {
778         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
779         if (ret != sizeof(tmp)) {
780             ret = -errno;
781             goto exit;
782         }
783     }
784 
785     /* write backup grain directory */
786     lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
787     for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
788          i < gt_count; i++, tmp += gt_size) {
789         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
790         if (ret != sizeof(tmp)) {
791             ret = -errno;
792             goto exit;
793         }
794     }
795 
796     /* compose the descriptor */
797     real_filename = filename;
798     if ((temp_str = strrchr(real_filename, '\\')) != NULL)
799         real_filename = temp_str + 1;
800     if ((temp_str = strrchr(real_filename, '/')) != NULL)
801         real_filename = temp_str + 1;
802     if ((temp_str = strrchr(real_filename, ':')) != NULL)
803         real_filename = temp_str + 1;
804     snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
805              total_size, real_filename,
806              (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
807              total_size / (int64_t)(63 * 16));
808 
809     /* write the descriptor */
810     lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET);
811     ret = qemu_write_full(fd, desc, strlen(desc));
812     if (ret != strlen(desc)) {
813         ret = -errno;
814         goto exit;
815     }
816 
817     ret = 0;
818 exit:
819     close(fd);
820     return ret;
821 }
822 
823 static void vmdk_close(BlockDriverState *bs)
824 {
825     BDRVVmdkState *s = bs->opaque;
826 
827     qemu_free(s->l1_table);
828     qemu_free(s->l2_cache);
829 }
830 
831 static int vmdk_flush(BlockDriverState *bs)
832 {
833     return bdrv_flush(bs->file);
834 }
835 
836 
837 static QEMUOptionParameter vmdk_create_options[] = {
838     {
839         .name = BLOCK_OPT_SIZE,
840         .type = OPT_SIZE,
841         .help = "Virtual disk size"
842     },
843     {
844         .name = BLOCK_OPT_BACKING_FILE,
845         .type = OPT_STRING,
846         .help = "File name of a base image"
847     },
848     {
849         .name = BLOCK_OPT_COMPAT6,
850         .type = OPT_FLAG,
851         .help = "VMDK version 6 image"
852     },
853     { NULL }
854 };
855 
856 static BlockDriver bdrv_vmdk = {
857     .format_name	= "vmdk",
858     .instance_size	= sizeof(BDRVVmdkState),
859     .bdrv_probe		= vmdk_probe,
860     .bdrv_open      = vmdk_open,
861     .bdrv_read		= vmdk_read,
862     .bdrv_write		= vmdk_write,
863     .bdrv_close		= vmdk_close,
864     .bdrv_create	= vmdk_create,
865     .bdrv_flush		= vmdk_flush,
866     .bdrv_is_allocated	= vmdk_is_allocated,
867 
868     .create_options = vmdk_create_options,
869 };
870 
871 static void bdrv_vmdk_init(void)
872 {
873     bdrv_register(&bdrv_vmdk);
874 }
875 
876 block_init(bdrv_vmdk_init);
877