xref: /openbmc/qemu/block/vmdk.c (revision b781cce53dddcbf3d881f8d923d87e54834a173b)
1 /*
2  * Block driver for the VMDK format
3  *
4  * Copyright (c) 2004 Fabrice Bellard
5  * Copyright (c) 2005 Filip Navara
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu-common.h"
27 #include "block_int.h"
28 #include "module.h"
29 
/*
 * Image magic numbers.  vmdk_probe() and vmdk_open() interpret the first
 * four bytes of the file as a big-endian 32-bit value and compare it with
 * these constants, so on disk they read as the ASCII strings "COWD" (VMDK3)
 * and "KDMV" (VMDK4).
 */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
32 
/*
 * VMDK3 ("COWD") header as read by vmdk_open() from the bytes that directly
 * follow the 4-byte magic.  vmdk_open() converts the fields it uses with
 * le32_to_cpu(), i.e. they are stored little-endian on disk.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* becomes bs->total_sectors */
    uint32_t granularity;       /* sectors per grain; becomes cluster_sectors */
    uint32_t l1dir_offset;      /* L1 table position, in 512-byte sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
45 
/*
 * VMDK4 ("KDMV") header as read/written directly after the 4-byte magic.
 * The struct is packed because it is copied to and from disk verbatim;
 * vmdk_open() converts the fields with le32_to_cpu()/le64_to_cpu().
 * All offsets and sizes are in 512-byte sectors (callers shift by 9 to get
 * byte offsets).
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;           /* virtual disk size; becomes total_sectors */
    int64_t granularity;        /* sectors per grain; becomes cluster_sectors */
    int64_t desc_offset;        /* embedded descriptor position (vmdk_create: 1) */
    int64_t desc_size;          /* embedded descriptor length (vmdk_create: 20) */
    int32_t num_gtes_per_gte;   /* entries per grain table; becomes l2_size */
    int64_t rgd_offset;         /* vmdk_open() loads the primary L1 table from here */
    int64_t gd_offset;          /* vmdk_open() loads the backup L1 table from here */
    int64_t grain_offset;       /* vmdk_create() truncates the new file to this */
    char filler[1];
    char check_bytes[4];        /* vmdk_create() stores 0x0a 0x20 0x0d 0x0a */
} __attribute__((packed)) VMDK4Header;
60 
/* Number of L2 (grain) tables kept in memory per open image. */
#define L2_CACHE_SIZE 16

/* Per-image driver state, stored in BlockDriverState.opaque. */
typedef struct BDRVVmdkState {
    BlockDriverState *hd;               /* underlying file, from bdrv_file_open() */
    int64_t l1_table_offset;            /* byte offset of the L1 table */
    int64_t l1_backup_table_offset;     /* byte offset of the backup L1 table, 0 if none */
    uint32_t *l1_table;                 /* L1 table, converted to host endianness */
    uint32_t *l1_backup_table;          /* backup L1 table, host endianness */
    unsigned int l1_size;               /* number of L1 entries */
    uint32_t l1_entry_sectors;          /* sectors covered by one L1 entry
                                           (l2_size * cluster_sectors) */

    unsigned int l2_size;               /* entries per L2 table */
    uint32_t *l2_cache;                 /* L2_CACHE_SIZE tables of l2_size entries,
                                           kept in on-disk byte order */
    uint32_t l2_cache_offsets[L2_CACHE_SIZE];   /* L1 entry value cached in each slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];    /* hit counters used for eviction */

    unsigned int cluster_sectors;       /* sectors per grain */
    uint32_t parent_cid;                /* parentCID read from the descriptor at open */
    int is_parent;                      /* set when opened as a backing image */
} BDRVVmdkState;
81 
/*
 * Describes an L2 entry that get_cluster_offset() has prepared for a newly
 * allocated grain and that vmdk_L2update() must still write to disk.
 */
typedef struct VmdkMetaData {
    uint32_t offset;            /* new L2 entry value, already little-endian */
    unsigned int l1_index;      /* index into the L1 table */
    unsigned int l2_index;      /* index into the L2 table */
    unsigned int l2_offset;     /* sector of the L2 table (value of the L1 entry) */
    int valid;                  /* non-zero when the fields above are filled in */
} VmdkMetaData;
89 
/*
 * Records the image and grain most recently allocated by
 * get_cluster_offset(); get_whole_cluster() uses it as the destination when
 * copying a grain out of a backing image.
 */
typedef struct ActiveBDRVState{
    BlockDriverState *hd;            // active image handler
    uint64_t cluster_offset;         // current write offset, in 512-byte sectors
}ActiveBDRVState;

/* Single file-scope instance shared by every open VMDK image. */
static ActiveBDRVState activeBDRV;
96 
97 
98 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
99 {
100     uint32_t magic;
101 
102     if (buf_size < 4)
103         return 0;
104     magic = be32_to_cpu(*(uint32_t *)buf);
105     if (magic == VMDK3_MAGIC ||
106         magic == VMDK4_MAGIC)
107         return 100;
108     else
109         return 0;
110 }
111 
/* When defined, vmdk_is_cid_valid() compares the stored parentCID with the
 * CID currently found in the backing image. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/* Parenthesized so the macro stays a single operand inside larger
 * expressions (e.g. "x % DESC_SIZE" or "DESC_SIZE - 1"). */
#define DESC_SIZE (20 * SECTOR_SIZE)	// 20 sectors of 512 bytes each
#define HEADER_SIZE 512   			// first sector of 512 bytes
117 
118 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
119 {
120     BDRVVmdkState *s = bs->opaque;
121     char desc[DESC_SIZE];
122     uint32_t cid;
123     const char *p_name, *cid_str;
124     size_t cid_str_size;
125 
126     /* the descriptor offset = 0x200 */
127     if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
128         return 0;
129 
130     if (parent) {
131         cid_str = "parentCID";
132         cid_str_size = sizeof("parentCID");
133     } else {
134         cid_str = "CID";
135         cid_str_size = sizeof("CID");
136     }
137 
138     if ((p_name = strstr(desc,cid_str)) != NULL) {
139         p_name += cid_str_size;
140         sscanf(p_name,"%x",&cid);
141     }
142 
143     return cid;
144 }
145 
146 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
147 {
148     BDRVVmdkState *s = bs->opaque;
149     char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
150     char *p_name, *tmp_str;
151 
152     /* the descriptor offset = 0x200 */
153     if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
154         return -1;
155 
156     tmp_str = strstr(desc,"parentCID");
157     pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
158     if ((p_name = strstr(desc,"CID")) != NULL) {
159         p_name += sizeof("CID");
160         snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
161         pstrcat(desc, sizeof(desc), tmp_desc);
162     }
163 
164     if (bdrv_pwrite(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
165         return -1;
166     return 0;
167 }
168 
169 static int vmdk_is_cid_valid(BlockDriverState *bs)
170 {
171 #ifdef CHECK_CID
172     BDRVVmdkState *s = bs->opaque;
173     BlockDriverState *p_bs = bs->backing_hd;
174     uint32_t cur_pcid;
175 
176     if (p_bs) {
177         cur_pcid = vmdk_read_cid(p_bs,0);
178         if (s->parent_cid != cur_pcid)
179             // CID not valid
180             return 0;
181     }
182 #endif
183     // CID valid
184     return 1;
185 }
186 
187 static int vmdk_snapshot_create(const char *filename, const char *backing_file)
188 {
189     int snp_fd, p_fd;
190     uint32_t p_cid;
191     char *p_name, *gd_buf, *rgd_buf;
192     const char *real_filename, *temp_str;
193     VMDK4Header header;
194     uint32_t gde_entries, gd_size;
195     int64_t gd_offset, rgd_offset, capacity, gt_size;
196     char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
197     static const char desc_template[] =
198     "# Disk DescriptorFile\n"
199     "version=1\n"
200     "CID=%x\n"
201     "parentCID=%x\n"
202     "createType=\"monolithicSparse\"\n"
203     "parentFileNameHint=\"%s\"\n"
204     "\n"
205     "# Extent description\n"
206     "RW %u SPARSE \"%s\"\n"
207     "\n"
208     "# The Disk Data Base \n"
209     "#DDB\n"
210     "\n";
211 
212     snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
213     if (snp_fd < 0)
214         return -1;
215     p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
216     if (p_fd < 0) {
217         close(snp_fd);
218         return -1;
219     }
220 
221     /* read the header */
222     if (lseek(p_fd, 0x0, SEEK_SET) == -1)
223         goto fail;
224     if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE)
225         goto fail;
226 
227     /* write the header */
228     if (lseek(snp_fd, 0x0, SEEK_SET) == -1)
229         goto fail;
230     if (write(snp_fd, hdr, HEADER_SIZE) == -1)
231         goto fail;
232 
233     memset(&header, 0, sizeof(header));
234     memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
235 
236     if (ftruncate(snp_fd, header.grain_offset << 9))
237         goto fail;
238     /* the descriptor offset = 0x200 */
239     if (lseek(p_fd, 0x200, SEEK_SET) == -1)
240         goto fail;
241     if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE)
242         goto fail;
243 
244     if ((p_name = strstr(p_desc,"CID")) != NULL) {
245         p_name += sizeof("CID");
246         sscanf(p_name,"%x",&p_cid);
247     }
248 
249     real_filename = filename;
250     if ((temp_str = strrchr(real_filename, '\\')) != NULL)
251         real_filename = temp_str + 1;
252     if ((temp_str = strrchr(real_filename, '/')) != NULL)
253         real_filename = temp_str + 1;
254     if ((temp_str = strrchr(real_filename, ':')) != NULL)
255         real_filename = temp_str + 1;
256 
257     snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
258              (uint32_t)header.capacity, real_filename);
259 
260     /* write the descriptor */
261     if (lseek(snp_fd, 0x200, SEEK_SET) == -1)
262         goto fail;
263     if (write(snp_fd, s_desc, strlen(s_desc)) == -1)
264         goto fail;
265 
266     gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
267     rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
268     capacity = header.capacity * SECTOR_SIZE;       // Extent size
269     /*
270      * Each GDE span 32M disk, means:
271      * 512 GTE per GT, each GTE points to grain
272      */
273     gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
274     if (!gt_size)
275         goto fail;
276     gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
277     gd_size = gde_entries * sizeof(uint32_t);
278 
279     /* write RGD */
280     rgd_buf = qemu_malloc(gd_size);
281     if (lseek(p_fd, rgd_offset, SEEK_SET) == -1)
282         goto fail_rgd;
283     if (read(p_fd, rgd_buf, gd_size) != gd_size)
284         goto fail_rgd;
285     if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1)
286         goto fail_rgd;
287     if (write(snp_fd, rgd_buf, gd_size) == -1)
288         goto fail_rgd;
289     qemu_free(rgd_buf);
290 
291     /* write GD */
292     gd_buf = qemu_malloc(gd_size);
293     if (lseek(p_fd, gd_offset, SEEK_SET) == -1)
294         goto fail_gd;
295     if (read(p_fd, gd_buf, gd_size) != gd_size)
296         goto fail_gd;
297     if (lseek(snp_fd, gd_offset, SEEK_SET) == -1)
298         goto fail_gd;
299     if (write(snp_fd, gd_buf, gd_size) == -1)
300         goto fail_gd;
301     qemu_free(gd_buf);
302 
303     close(p_fd);
304     close(snp_fd);
305     return 0;
306 
307     fail_gd:
308     qemu_free(gd_buf);
309     fail_rgd:
310     qemu_free(rgd_buf);
311     fail:
312     close(p_fd);
313     close(snp_fd);
314     return -1;
315 }
316 
317 static void vmdk_parent_close(BlockDriverState *bs)
318 {
319     if (bs->backing_hd)
320         bdrv_close(bs->backing_hd);
321 }
322 
323 static int parent_open = 0;
324 static int vmdk_parent_open(BlockDriverState *bs, const char * filename)
325 {
326     BDRVVmdkState *s = bs->opaque;
327     char *p_name;
328     char desc[DESC_SIZE];
329     char parent_img_name[1024];
330 
331     /* the descriptor offset = 0x200 */
332     if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
333         return -1;
334 
335     if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
336         char *end_name;
337         struct stat file_buf;
338 
339         p_name += sizeof("parentFileNameHint") + 1;
340         if ((end_name = strchr(p_name,'\"')) == NULL)
341             return -1;
342         if ((end_name - p_name) > sizeof (bs->backing_file) - 1)
343             return -1;
344 
345         pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
346         if (stat(bs->backing_file, &file_buf) != 0) {
347             path_combine(parent_img_name, sizeof(parent_img_name),
348                          filename, bs->backing_file);
349         } else {
350             pstrcpy(parent_img_name, sizeof(parent_img_name),
351                     bs->backing_file);
352         }
353 
354         bs->backing_hd = bdrv_new("");
355         if (!bs->backing_hd) {
356             failure:
357             bdrv_close(s->hd);
358             return -1;
359         }
360         parent_open = 1;
361         if (bdrv_open(bs->backing_hd, parent_img_name, 0) < 0)
362             goto failure;
363         parent_open = 0;
364     }
365 
366     return 0;
367 }
368 
369 static int vmdk_open(BlockDriverState *bs, const char *filename, int flags)
370 {
371     BDRVVmdkState *s = bs->opaque;
372     uint32_t magic;
373     int l1_size, i, ret;
374 
375     if (parent_open) {
376         /* Parent must be opened as RO, no RDWR. */
377         flags = 0;
378     }
379 
380     ret = bdrv_file_open(&s->hd, filename, flags);
381     if (ret < 0)
382         return ret;
383     if (bdrv_pread(s->hd, 0, &magic, sizeof(magic)) != sizeof(magic))
384         goto fail;
385 
386     magic = be32_to_cpu(magic);
387     if (magic == VMDK3_MAGIC) {
388         VMDK3Header header;
389 
390         if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
391             goto fail;
392         s->cluster_sectors = le32_to_cpu(header.granularity);
393         s->l2_size = 1 << 9;
394         s->l1_size = 1 << 6;
395         bs->total_sectors = le32_to_cpu(header.disk_sectors);
396         s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
397         s->l1_backup_table_offset = 0;
398         s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
399     } else if (magic == VMDK4_MAGIC) {
400         VMDK4Header header;
401 
402         if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
403             goto fail;
404         bs->total_sectors = le64_to_cpu(header.capacity);
405         s->cluster_sectors = le64_to_cpu(header.granularity);
406         s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
407         s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
408         if (s->l1_entry_sectors <= 0)
409             goto fail;
410         s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
411             / s->l1_entry_sectors;
412         s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
413         s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
414 
415         if (parent_open)
416             s->is_parent = 1;
417         else
418             s->is_parent = 0;
419 
420         // try to open parent images, if exist
421         if (vmdk_parent_open(bs, filename) != 0)
422             goto fail;
423         // write the CID once after the image creation
424         s->parent_cid = vmdk_read_cid(bs,1);
425     } else {
426         goto fail;
427     }
428 
429     /* read the L1 table */
430     l1_size = s->l1_size * sizeof(uint32_t);
431     s->l1_table = qemu_malloc(l1_size);
432     if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, l1_size) != l1_size)
433         goto fail;
434     for(i = 0; i < s->l1_size; i++) {
435         le32_to_cpus(&s->l1_table[i]);
436     }
437 
438     if (s->l1_backup_table_offset) {
439         s->l1_backup_table = qemu_malloc(l1_size);
440         if (bdrv_pread(s->hd, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size)
441             goto fail;
442         for(i = 0; i < s->l1_size; i++) {
443             le32_to_cpus(&s->l1_backup_table[i]);
444         }
445     }
446 
447     s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
448     return 0;
449  fail:
450     qemu_free(s->l1_backup_table);
451     qemu_free(s->l1_table);
452     qemu_free(s->l2_cache);
453     bdrv_delete(s->hd);
454     return -1;
455 }
456 
457 static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
458                                    uint64_t offset, int allocate);
459 
460 static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
461                              uint64_t offset, int allocate)
462 {
463     uint64_t parent_cluster_offset;
464     BDRVVmdkState *s = bs->opaque;
465     uint8_t  whole_grain[s->cluster_sectors*512];        // 128 sectors * 512 bytes each = grain size 64KB
466 
467     // we will be here if it's first write on non-exist grain(cluster).
468     // try to read from parent image, if exist
469     if (bs->backing_hd) {
470         BDRVVmdkState *ps = bs->backing_hd->opaque;
471 
472         if (!vmdk_is_cid_valid(bs))
473             return -1;
474 
475         parent_cluster_offset = get_cluster_offset(bs->backing_hd, NULL,
476             offset, allocate);
477 
478         if (parent_cluster_offset) {
479             BDRVVmdkState *act_s = activeBDRV.hd->opaque;
480 
481             if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, ps->cluster_sectors*512) != ps->cluster_sectors*512)
482                 return -1;
483 
484             //Write grain only into the active image
485             if (bdrv_pwrite(act_s->hd, activeBDRV.cluster_offset << 9, whole_grain, sizeof(whole_grain)) != sizeof(whole_grain))
486                 return -1;
487         }
488     }
489     return 0;
490 }
491 
492 static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
493 {
494     BDRVVmdkState *s = bs->opaque;
495 
496     /* update L2 table */
497     if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
498                     &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset))
499         return -1;
500     /* update backup L2 table */
501     if (s->l1_backup_table_offset != 0) {
502         m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
503         if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
504                         &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset))
505             return -1;
506     }
507 
508     return 0;
509 }
510 
/*
 * Translate a byte offset in the virtual disk into the byte offset of the
 * grain that holds it inside the image file.
 *
 * @bs:        image to look in
 * @m_data:    optional; its valid flag is cleared on entry and, when a
 *             missing grain is handled with @allocate set, it is filled
 *             with the L2 entry the caller must write via vmdk_L2update()
 * @offset:    byte offset into the virtual disk
 * @allocate:  non-zero to allocate the grain when the L2 entry is empty
 *
 * Returns the grain's byte offset in the file, or 0 when the offset is out
 * of range, no L2 table exists for it, the L2 table cannot be read, the
 * grain is unallocated and @allocate is zero, or copying the grain from the
 * backing image fails.
 */
static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
                                   uint64_t offset, int allocate)
{
    BDRVVmdkState *s = bs->opaque;
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
    uint32_t min_count, *l2_table, tmp = 0;
    uint64_t cluster_offset;

    if (m_data)
        m_data->valid = 0;

    /* offset >> 9 is the sector number; each L1 entry covers
     * l1_entry_sectors sectors */
    l1_index = (offset >> 9) / s->l1_entry_sectors;
    if (l1_index >= s->l1_size)
        return 0;
    l2_offset = s->l1_table[l1_index];
    if (!l2_offset)
        return 0;
    /* look for the L2 table in the cache, keyed by its sector number */
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == s->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++s->l2_cache_counts[i] == 0xffffffff) {
                /* counter saturated: halve all counters so their relative
                 * order is kept while leaving room to count further */
                for(j = 0; j < L2_CACHE_SIZE; j++) {
                    s->l2_cache_counts[j] >>= 1;
                }
            }
            l2_table = s->l2_cache + (i * s->l2_size);
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < min_count) {
            min_count = s->l2_cache_counts[i];
            min_index = i;
        }
    }
    l2_table = s->l2_cache + (min_index * s->l2_size);
    /* the slot is overwritten before its bookkeeping is updated; on a read
     * failure the old offset/count remain attached to the clobbered slot */
    if (bdrv_pread(s->hd, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) !=
                                                                        s->l2_size * sizeof(uint32_t))
        return 0;

    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;
 found:
    /* cached tables stay in on-disk (little-endian) order, hence the
     * conversion on every read of an entry */
    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
    cluster_offset = le32_to_cpu(l2_table[l2_index]);

    if (!cluster_offset) {
        if (!allocate)
            return 0;
        // Avoid the L2 tables update for the images that have snapshots.
        if (!s->is_parent) {
            /* append a new grain at the current end of the file */
            /* NOTE(review): the results of bdrv_getlength() and
             * bdrv_truncate() are not checked for errors */
            cluster_offset = bdrv_getlength(s->hd);
            bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9));

            /* from here on cluster_offset is in sectors */
            cluster_offset >>= 9;
            tmp = cpu_to_le32(cluster_offset);
            l2_table[l2_index] = tmp;
            // Save the active image state
            activeBDRV.cluster_offset = cluster_offset;
            activeBDRV.hd = bs;
        }
        /* First of all we write the grain itself, to avoid a race condition
         * that may corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
            return 0;

        /* NOTE(review): when s->is_parent is set, tmp and cluster_offset are
         * still 0 here, so m_data is marked valid with a zero entry and the
         * function returns 0 — confirm this is intended */
        if (m_data) {
            m_data->offset = tmp;
            m_data->l1_index = l1_index;
            m_data->l2_index = l2_index;
            m_data->l2_offset = l2_offset;
            m_data->valid = 1;
        }
    }
    /* sectors -> bytes */
    cluster_offset <<= 9;
    return cluster_offset;
}
595 
596 static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
597                              int nb_sectors, int *pnum)
598 {
599     BDRVVmdkState *s = bs->opaque;
600     int index_in_cluster, n;
601     uint64_t cluster_offset;
602 
603     cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
604     index_in_cluster = sector_num % s->cluster_sectors;
605     n = s->cluster_sectors - index_in_cluster;
606     if (n > nb_sectors)
607         n = nb_sectors;
608     *pnum = n;
609     return (cluster_offset != 0);
610 }
611 
612 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
613                     uint8_t *buf, int nb_sectors)
614 {
615     BDRVVmdkState *s = bs->opaque;
616     int index_in_cluster, n, ret;
617     uint64_t cluster_offset;
618 
619     while (nb_sectors > 0) {
620         cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
621         index_in_cluster = sector_num % s->cluster_sectors;
622         n = s->cluster_sectors - index_in_cluster;
623         if (n > nb_sectors)
624             n = nb_sectors;
625         if (!cluster_offset) {
626             // try to read from parent image, if exist
627             if (bs->backing_hd) {
628                 if (!vmdk_is_cid_valid(bs))
629                     return -1;
630                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
631                 if (ret < 0)
632                     return -1;
633             } else {
634                 memset(buf, 0, 512 * n);
635             }
636         } else {
637             if(bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
638                 return -1;
639         }
640         nb_sectors -= n;
641         sector_num += n;
642         buf += n * 512;
643     }
644     return 0;
645 }
646 
647 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
648                      const uint8_t *buf, int nb_sectors)
649 {
650     BDRVVmdkState *s = bs->opaque;
651     VmdkMetaData m_data;
652     int index_in_cluster, n;
653     uint64_t cluster_offset;
654     static int cid_update = 0;
655 
656     if (sector_num > bs->total_sectors) {
657         fprintf(stderr,
658                 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
659                 " total_sectors=0x%" PRIx64 "\n",
660                 sector_num, bs->total_sectors);
661         return -1;
662     }
663 
664     while (nb_sectors > 0) {
665         index_in_cluster = sector_num & (s->cluster_sectors - 1);
666         n = s->cluster_sectors - index_in_cluster;
667         if (n > nb_sectors)
668             n = nb_sectors;
669         cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
670         if (!cluster_offset)
671             return -1;
672 
673         if (bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
674             return -1;
675         if (m_data.valid) {
676             /* update L2 tables */
677             if (vmdk_L2update(bs, &m_data) == -1)
678                 return -1;
679         }
680         nb_sectors -= n;
681         sector_num += n;
682         buf += n * 512;
683 
684         // update CID on the first write every time the virtual disk is opened
685         if (!cid_update) {
686             vmdk_write_cid(bs, time(NULL));
687             cid_update++;
688         }
689     }
690     return 0;
691 }
692 
693 static int vmdk_create(const char *filename, QEMUOptionParameter *options)
694 {
695     int fd, i;
696     VMDK4Header header;
697     uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
698     static const char desc_template[] =
699         "# Disk DescriptorFile\n"
700         "version=1\n"
701         "CID=%x\n"
702         "parentCID=ffffffff\n"
703         "createType=\"monolithicSparse\"\n"
704         "\n"
705         "# Extent description\n"
706         "RW %" PRId64 " SPARSE \"%s\"\n"
707         "\n"
708         "# The Disk Data Base \n"
709         "#DDB\n"
710         "\n"
711         "ddb.virtualHWVersion = \"%d\"\n"
712         "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
713         "ddb.geometry.heads = \"16\"\n"
714         "ddb.geometry.sectors = \"63\"\n"
715         "ddb.adapterType = \"ide\"\n";
716     char desc[1024];
717     const char *real_filename, *temp_str;
718     int64_t total_size = 0;
719     const char *backing_file = NULL;
720     int flags = 0;
721     int ret;
722 
723     // Read out options
724     while (options && options->name) {
725         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
726             total_size = options->value.n / 512;
727         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
728             backing_file = options->value.s;
729         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
730             flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
731         }
732         options++;
733     }
734 
735     /* XXX: add support for backing file */
736     if (backing_file) {
737         return vmdk_snapshot_create(filename, backing_file);
738     }
739 
740     fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
741               0644);
742     if (fd < 0)
743         return -errno;
744     magic = cpu_to_be32(VMDK4_MAGIC);
745     memset(&header, 0, sizeof(header));
746     header.version = cpu_to_le32(1);
747     header.flags = cpu_to_le32(3); /* ?? */
748     header.capacity = cpu_to_le64(total_size);
749     header.granularity = cpu_to_le64(128);
750     header.num_gtes_per_gte = cpu_to_le32(512);
751 
752     grains = (total_size + header.granularity - 1) / header.granularity;
753     gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
754     gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
755     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
756 
757     header.desc_offset = 1;
758     header.desc_size = 20;
759     header.rgd_offset = header.desc_offset + header.desc_size;
760     header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
761     header.grain_offset =
762        ((header.gd_offset + gd_size + (gt_size * gt_count) +
763          header.granularity - 1) / header.granularity) *
764         header.granularity;
765 
766     header.desc_offset = cpu_to_le64(header.desc_offset);
767     header.desc_size = cpu_to_le64(header.desc_size);
768     header.rgd_offset = cpu_to_le64(header.rgd_offset);
769     header.gd_offset = cpu_to_le64(header.gd_offset);
770     header.grain_offset = cpu_to_le64(header.grain_offset);
771 
772     header.check_bytes[0] = 0xa;
773     header.check_bytes[1] = 0x20;
774     header.check_bytes[2] = 0xd;
775     header.check_bytes[3] = 0xa;
776 
777     /* write all the data */
778     ret = qemu_write_full(fd, &magic, sizeof(magic));
779     if (ret != sizeof(magic)) {
780         ret = -errno;
781         goto exit;
782     }
783     ret = qemu_write_full(fd, &header, sizeof(header));
784     if (ret != sizeof(header)) {
785         ret = -errno;
786         goto exit;
787     }
788 
789     ret = ftruncate(fd, header.grain_offset << 9);
790     if (ret < 0) {
791         ret = -errno;
792         goto exit;
793     }
794 
795     /* write grain directory */
796     lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
797     for (i = 0, tmp = header.rgd_offset + gd_size;
798          i < gt_count; i++, tmp += gt_size) {
799         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
800         if (ret != sizeof(tmp)) {
801             ret = -errno;
802             goto exit;
803         }
804     }
805 
806     /* write backup grain directory */
807     lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
808     for (i = 0, tmp = header.gd_offset + gd_size;
809          i < gt_count; i++, tmp += gt_size) {
810         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
811         if (ret != sizeof(tmp)) {
812             ret = -errno;
813             goto exit;
814         }
815     }
816 
817     /* compose the descriptor */
818     real_filename = filename;
819     if ((temp_str = strrchr(real_filename, '\\')) != NULL)
820         real_filename = temp_str + 1;
821     if ((temp_str = strrchr(real_filename, '/')) != NULL)
822         real_filename = temp_str + 1;
823     if ((temp_str = strrchr(real_filename, ':')) != NULL)
824         real_filename = temp_str + 1;
825     snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
826              total_size, real_filename,
827              (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
828              total_size / (int64_t)(63 * 16));
829 
830     /* write the descriptor */
831     lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET);
832     ret = qemu_write_full(fd, desc, strlen(desc));
833     if (ret != strlen(desc)) {
834         ret = -errno;
835         goto exit;
836     }
837 
838     ret = 0;
839 exit:
840     close(fd);
841     return ret;
842 }
843 
844 static void vmdk_close(BlockDriverState *bs)
845 {
846     BDRVVmdkState *s = bs->opaque;
847 
848     qemu_free(s->l1_table);
849     qemu_free(s->l2_cache);
850     // try to close parent image, if exist
851     vmdk_parent_close(s->hd);
852     bdrv_delete(s->hd);
853 }
854 
855 static void vmdk_flush(BlockDriverState *bs)
856 {
857     BDRVVmdkState *s = bs->opaque;
858     bdrv_flush(s->hd);
859 }
860 
861 
/*
 * Options understood by vmdk_create().  The list is terminated by an entry
 * whose name is NULL, which is what vmdk_create()'s option loop stops on.
 */
static QEMUOptionParameter vmdk_create_options[] = {
    {
        /* size in bytes; vmdk_create() divides it by 512 to get sectors */
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    {
        /* when given, creation is delegated to vmdk_snapshot_create() */
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    },
    {
        /* selects ddb.virtualHWVersion 6 instead of 4 in the descriptor */
        .name = BLOCK_OPT_COMPAT6,
        .type = OPT_FLAG,
        .help = "VMDK version 6 image"
    },
    { NULL }
};
880 
/*
 * Driver description handed to bdrv_register(): ties the "vmdk" format name
 * to the callbacks implemented in this file and tells the block layer how
 * much per-image state (BDRVVmdkState) to allocate.
 */
static BlockDriver bdrv_vmdk = {
    .format_name	= "vmdk",
    .instance_size	= sizeof(BDRVVmdkState),
    .bdrv_probe		= vmdk_probe,
    .bdrv_open		= vmdk_open,
    .bdrv_read		= vmdk_read,
    .bdrv_write		= vmdk_write,
    .bdrv_close		= vmdk_close,
    .bdrv_create	= vmdk_create,
    .bdrv_flush		= vmdk_flush,
    .bdrv_is_allocated	= vmdk_is_allocated,

    .create_options = vmdk_create_options,
};
895 
/* Register the VMDK driver with the block layer. */
static void bdrv_vmdk_init(void)
{
    bdrv_register(&bdrv_vmdk);
}

/* Hook bdrv_vmdk_init() into the block-module initialization list. */
block_init(bdrv_vmdk_init);
902