xref: /openbmc/qemu/block/vmdk.c (revision b8852e87d9d113096342c3e0977266cda0fe9ee5)
1 /*
2  * Block driver for the VMDK format
3  *
4  * Copyright (c) 2004 Fabrice Bellard
5  * Copyright (c) 2005 Filip Navara
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu-common.h"
27 #include "block_int.h"
28 #include "module.h"
29 
/*
 * Image magic numbers.  They are compared against the first four bytes of
 * the image interpreted as a big-endian 32-bit value, so the byte sequence
 * on disk is "COWD" for VMDK3 and "KDMV" for VMDK4.
 */
#define VMDK3_MAGIC 0x434f5744 /* 'C' 'O' 'W' 'D' */
#define VMDK4_MAGIC 0x4b444d56 /* 'K' 'D' 'M' 'V' */
32 
/*
 * VMDK3 ("COWD") header as read by vmdk_open() from the bytes that follow
 * the 4-byte magic.  vmdk_open() converts the fields it uses from
 * little-endian.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* becomes bs->total_sectors */
    uint32_t granularity;       /* becomes the cluster size, in sectors */
    uint32_t l1dir_offset;      /* L1 table position, in 512-byte sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    /* Disk geometry; not referenced by the code in this file. */
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
45 
/*
 * VMDK4 ("KDMV") header as stored right after the 4-byte magic.  The struct
 * is packed because it is read from and written to the image verbatim;
 * multi-byte fields are little-endian on disk.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;           /* virtual disk size, in sectors */
    int64_t granularity;        /* sectors per grain (cluster) */
    int64_t desc_offset;        /* descriptor position, in sectors */
    int64_t desc_size;          /* descriptor size, in sectors */
    int32_t num_gtes_per_gte;   /* number of entries in one L2 table */
    int64_t rgd_offset;         /* sector offset used as the primary L1 table */
    int64_t gd_offset;          /* sector offset used as the backup L1 table */
    int64_t grain_offset;       /* sector offset where grain data starts */
    char filler[1];
    char check_bytes[4];        /* vmdk_create() stores 0x0a 0x20 0x0d 0x0a */
} __attribute__((packed)) VMDK4Header;
60 
61 #define L2_CACHE_SIZE 16
62 
63 typedef struct BDRVVmdkState {
64     BlockDriverState *hd;
65     int64_t l1_table_offset;
66     int64_t l1_backup_table_offset;
67     uint32_t *l1_table;
68     uint32_t *l1_backup_table;
69     unsigned int l1_size;
70     uint32_t l1_entry_sectors;
71 
72     unsigned int l2_size;
73     uint32_t *l2_cache;
74     uint32_t l2_cache_offsets[L2_CACHE_SIZE];
75     uint32_t l2_cache_counts[L2_CACHE_SIZE];
76 
77     unsigned int cluster_sectors;
78     uint32_t parent_cid;
79 } BDRVVmdkState;
80 
/*
 * Describes an L2 entry that get_cluster_offset() allocated in memory and
 * that vmdk_L2update() still has to write to the image.
 */
typedef struct VmdkMetaData {
    uint32_t offset;        /* new L2 entry value, already little-endian */
    unsigned int l1_index;  /* index of the owning L1 entry */
    unsigned int l2_index;  /* index of the entry inside its L2 table */
    unsigned int l2_offset; /* sector offset of the L2 table */
    int valid;              /* non-zero when the fields above are filled in */
} VmdkMetaData;
88 
89 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
90 {
91     uint32_t magic;
92 
93     if (buf_size < 4)
94         return 0;
95     magic = be32_to_cpu(*(uint32_t *)buf);
96     if (magic == VMDK3_MAGIC ||
97         magic == VMDK4_MAGIC)
98         return 100;
99     else
100         return 0;
101 }
102 
/* When defined, vmdk_is_cid_valid() compares the stored parent CID. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/* Parenthesized so the macro stays one value inside larger expressions. */
#define DESC_SIZE   (20 * SECTOR_SIZE)  /* 20 sectors of 512 bytes each */
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
108 
109 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
110 {
111     char desc[DESC_SIZE];
112     uint32_t cid;
113     const char *p_name, *cid_str;
114     size_t cid_str_size;
115 
116     /* the descriptor offset = 0x200 */
117     if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
118         return 0;
119 
120     if (parent) {
121         cid_str = "parentCID";
122         cid_str_size = sizeof("parentCID");
123     } else {
124         cid_str = "CID";
125         cid_str_size = sizeof("CID");
126     }
127 
128     if ((p_name = strstr(desc,cid_str)) != NULL) {
129         p_name += cid_str_size;
130         sscanf(p_name,"%x",&cid);
131     }
132 
133     return cid;
134 }
135 
136 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
137 {
138     char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
139     char *p_name, *tmp_str;
140 
141     /* the descriptor offset = 0x200 */
142     if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
143         return -1;
144 
145     tmp_str = strstr(desc,"parentCID");
146     pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
147     if ((p_name = strstr(desc,"CID")) != NULL) {
148         p_name += sizeof("CID");
149         snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
150         pstrcat(desc, sizeof(desc), tmp_desc);
151     }
152 
153     if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0)
154         return -1;
155     return 0;
156 }
157 
158 static int vmdk_is_cid_valid(BlockDriverState *bs)
159 {
160 #ifdef CHECK_CID
161     BDRVVmdkState *s = bs->opaque;
162     BlockDriverState *p_bs = bs->backing_hd;
163     uint32_t cur_pcid;
164 
165     if (p_bs) {
166         cur_pcid = vmdk_read_cid(p_bs,0);
167         if (s->parent_cid != cur_pcid)
168             // CID not valid
169             return 0;
170     }
171 #endif
172     // CID valid
173     return 1;
174 }
175 
176 static int vmdk_snapshot_create(const char *filename, const char *backing_file)
177 {
178     int snp_fd, p_fd;
179     int ret;
180     uint32_t p_cid;
181     char *p_name, *gd_buf, *rgd_buf;
182     const char *real_filename, *temp_str;
183     VMDK4Header header;
184     uint32_t gde_entries, gd_size;
185     int64_t gd_offset, rgd_offset, capacity, gt_size;
186     char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
187     static const char desc_template[] =
188     "# Disk DescriptorFile\n"
189     "version=1\n"
190     "CID=%x\n"
191     "parentCID=%x\n"
192     "createType=\"monolithicSparse\"\n"
193     "parentFileNameHint=\"%s\"\n"
194     "\n"
195     "# Extent description\n"
196     "RW %u SPARSE \"%s\"\n"
197     "\n"
198     "# The Disk Data Base \n"
199     "#DDB\n"
200     "\n";
201 
202     snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
203     if (snp_fd < 0)
204         return -errno;
205     p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
206     if (p_fd < 0) {
207         close(snp_fd);
208         return -errno;
209     }
210 
211     /* read the header */
212     if (lseek(p_fd, 0x0, SEEK_SET) == -1) {
213         ret = -errno;
214         goto fail;
215     }
216     if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE) {
217         ret = -errno;
218         goto fail;
219     }
220 
221     /* write the header */
222     if (lseek(snp_fd, 0x0, SEEK_SET) == -1) {
223         ret = -errno;
224         goto fail;
225     }
226     if (write(snp_fd, hdr, HEADER_SIZE) == -1) {
227         ret = -errno;
228         goto fail;
229     }
230 
231     memset(&header, 0, sizeof(header));
232     memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
233 
234     if (ftruncate(snp_fd, header.grain_offset << 9)) {
235         ret = -errno;
236         goto fail;
237     }
238     /* the descriptor offset = 0x200 */
239     if (lseek(p_fd, 0x200, SEEK_SET) == -1) {
240         ret = -errno;
241         goto fail;
242     }
243     if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE) {
244         ret = -errno;
245         goto fail;
246     }
247 
248     if ((p_name = strstr(p_desc,"CID")) != NULL) {
249         p_name += sizeof("CID");
250         sscanf(p_name,"%x",&p_cid);
251     }
252 
253     real_filename = filename;
254     if ((temp_str = strrchr(real_filename, '\\')) != NULL)
255         real_filename = temp_str + 1;
256     if ((temp_str = strrchr(real_filename, '/')) != NULL)
257         real_filename = temp_str + 1;
258     if ((temp_str = strrchr(real_filename, ':')) != NULL)
259         real_filename = temp_str + 1;
260 
261     snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
262              (uint32_t)header.capacity, real_filename);
263 
264     /* write the descriptor */
265     if (lseek(snp_fd, 0x200, SEEK_SET) == -1) {
266         ret = -errno;
267         goto fail;
268     }
269     if (write(snp_fd, s_desc, strlen(s_desc)) == -1) {
270         ret = -errno;
271         goto fail;
272     }
273 
274     gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
275     rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
276     capacity = header.capacity * SECTOR_SIZE;       // Extent size
277     /*
278      * Each GDE span 32M disk, means:
279      * 512 GTE per GT, each GTE points to grain
280      */
281     gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
282     if (!gt_size) {
283         ret = -EINVAL;
284         goto fail;
285     }
286     gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
287     gd_size = gde_entries * sizeof(uint32_t);
288 
289     /* write RGD */
290     rgd_buf = qemu_malloc(gd_size);
291     if (lseek(p_fd, rgd_offset, SEEK_SET) == -1) {
292         ret = -errno;
293         goto fail_rgd;
294     }
295     if (read(p_fd, rgd_buf, gd_size) != gd_size) {
296         ret = -errno;
297         goto fail_rgd;
298     }
299     if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1) {
300         ret = -errno;
301         goto fail_rgd;
302     }
303     if (write(snp_fd, rgd_buf, gd_size) == -1) {
304         ret = -errno;
305         goto fail_rgd;
306     }
307 
308     /* write GD */
309     gd_buf = qemu_malloc(gd_size);
310     if (lseek(p_fd, gd_offset, SEEK_SET) == -1) {
311         ret = -errno;
312         goto fail_gd;
313     }
314     if (read(p_fd, gd_buf, gd_size) != gd_size) {
315         ret = -errno;
316         goto fail_gd;
317     }
318     if (lseek(snp_fd, gd_offset, SEEK_SET) == -1) {
319         ret = -errno;
320         goto fail_gd;
321     }
322     if (write(snp_fd, gd_buf, gd_size) == -1) {
323         ret = -errno;
324         goto fail_gd;
325     }
326     ret = 0;
327 
328 fail_gd:
329     qemu_free(gd_buf);
330 fail_rgd:
331     qemu_free(rgd_buf);
332 fail:
333     close(p_fd);
334     close(snp_fd);
335     return ret;
336 }
337 
338 static int vmdk_parent_open(BlockDriverState *bs)
339 {
340     char *p_name;
341     char desc[DESC_SIZE];
342 
343     /* the descriptor offset = 0x200 */
344     if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
345         return -1;
346 
347     if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
348         char *end_name;
349 
350         p_name += sizeof("parentFileNameHint") + 1;
351         if ((end_name = strchr(p_name,'\"')) == NULL)
352             return -1;
353         if ((end_name - p_name) > sizeof (bs->backing_file) - 1)
354             return -1;
355 
356         pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
357     }
358 
359     return 0;
360 }
361 
362 static int vmdk_open(BlockDriverState *bs, int flags)
363 {
364     BDRVVmdkState *s = bs->opaque;
365     uint32_t magic;
366     int l1_size, i;
367 
368     if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
369         goto fail;
370 
371     magic = be32_to_cpu(magic);
372     if (magic == VMDK3_MAGIC) {
373         VMDK3Header header;
374 
375         if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) != sizeof(header))
376             goto fail;
377         s->cluster_sectors = le32_to_cpu(header.granularity);
378         s->l2_size = 1 << 9;
379         s->l1_size = 1 << 6;
380         bs->total_sectors = le32_to_cpu(header.disk_sectors);
381         s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
382         s->l1_backup_table_offset = 0;
383         s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
384     } else if (magic == VMDK4_MAGIC) {
385         VMDK4Header header;
386 
387         if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) != sizeof(header))
388             goto fail;
389         bs->total_sectors = le64_to_cpu(header.capacity);
390         s->cluster_sectors = le64_to_cpu(header.granularity);
391         s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
392         s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
393         if (s->l1_entry_sectors <= 0)
394             goto fail;
395         s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
396             / s->l1_entry_sectors;
397         s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
398         s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
399 
400         // try to open parent images, if exist
401         if (vmdk_parent_open(bs) != 0)
402             goto fail;
403         // write the CID once after the image creation
404         s->parent_cid = vmdk_read_cid(bs,1);
405     } else {
406         goto fail;
407     }
408 
409     /* read the L1 table */
410     l1_size = s->l1_size * sizeof(uint32_t);
411     s->l1_table = qemu_malloc(l1_size);
412     if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, l1_size) != l1_size)
413         goto fail;
414     for(i = 0; i < s->l1_size; i++) {
415         le32_to_cpus(&s->l1_table[i]);
416     }
417 
418     if (s->l1_backup_table_offset) {
419         s->l1_backup_table = qemu_malloc(l1_size);
420         if (bdrv_pread(bs->file, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size)
421             goto fail;
422         for(i = 0; i < s->l1_size; i++) {
423             le32_to_cpus(&s->l1_backup_table[i]);
424         }
425     }
426 
427     s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
428     return 0;
429  fail:
430     qemu_free(s->l1_backup_table);
431     qemu_free(s->l1_table);
432     qemu_free(s->l2_cache);
433     return -1;
434 }
435 
436 static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
437                                    uint64_t offset, int allocate);
438 
439 static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
440                              uint64_t offset, int allocate)
441 {
442     BDRVVmdkState *s = bs->opaque;
443     uint8_t  whole_grain[s->cluster_sectors*512];        // 128 sectors * 512 bytes each = grain size 64KB
444 
445     // we will be here if it's first write on non-exist grain(cluster).
446     // try to read from parent image, if exist
447     if (bs->backing_hd) {
448         int ret;
449 
450         if (!vmdk_is_cid_valid(bs))
451             return -1;
452 
453         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
454             s->cluster_sectors);
455         if (ret < 0) {
456             return -1;
457         }
458 
459         //Write grain only into the active image
460         ret = bdrv_write(bs->file, cluster_offset, whole_grain,
461             s->cluster_sectors);
462         if (ret < 0) {
463             return -1;
464         }
465     }
466     return 0;
467 }
468 
469 static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
470 {
471     BDRVVmdkState *s = bs->opaque;
472 
473     /* update L2 table */
474     if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
475                     &(m_data->offset), sizeof(m_data->offset)) < 0)
476         return -1;
477     /* update backup L2 table */
478     if (s->l1_backup_table_offset != 0) {
479         m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
480         if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
481                         &(m_data->offset), sizeof(m_data->offset)) < 0)
482             return -1;
483     }
484 
485     return 0;
486 }
487 
488 static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
489                                    uint64_t offset, int allocate)
490 {
491     BDRVVmdkState *s = bs->opaque;
492     unsigned int l1_index, l2_offset, l2_index;
493     int min_index, i, j;
494     uint32_t min_count, *l2_table, tmp = 0;
495     uint64_t cluster_offset;
496 
497     if (m_data)
498         m_data->valid = 0;
499 
500     l1_index = (offset >> 9) / s->l1_entry_sectors;
501     if (l1_index >= s->l1_size)
502         return 0;
503     l2_offset = s->l1_table[l1_index];
504     if (!l2_offset)
505         return 0;
506     for(i = 0; i < L2_CACHE_SIZE; i++) {
507         if (l2_offset == s->l2_cache_offsets[i]) {
508             /* increment the hit count */
509             if (++s->l2_cache_counts[i] == 0xffffffff) {
510                 for(j = 0; j < L2_CACHE_SIZE; j++) {
511                     s->l2_cache_counts[j] >>= 1;
512                 }
513             }
514             l2_table = s->l2_cache + (i * s->l2_size);
515             goto found;
516         }
517     }
518     /* not found: load a new entry in the least used one */
519     min_index = 0;
520     min_count = 0xffffffff;
521     for(i = 0; i < L2_CACHE_SIZE; i++) {
522         if (s->l2_cache_counts[i] < min_count) {
523             min_count = s->l2_cache_counts[i];
524             min_index = i;
525         }
526     }
527     l2_table = s->l2_cache + (min_index * s->l2_size);
528     if (bdrv_pread(bs->file, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) !=
529                                                                         s->l2_size * sizeof(uint32_t))
530         return 0;
531 
532     s->l2_cache_offsets[min_index] = l2_offset;
533     s->l2_cache_counts[min_index] = 1;
534  found:
535     l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
536     cluster_offset = le32_to_cpu(l2_table[l2_index]);
537 
538     if (!cluster_offset) {
539         if (!allocate)
540             return 0;
541 
542         // Avoid the L2 tables update for the images that have snapshots.
543         cluster_offset = bdrv_getlength(bs->file);
544         bdrv_truncate(bs->file, cluster_offset + (s->cluster_sectors << 9));
545 
546         cluster_offset >>= 9;
547         tmp = cpu_to_le32(cluster_offset);
548         l2_table[l2_index] = tmp;
549 
550         /* First of all we write grain itself, to avoid race condition
551          * that may to corrupt the image.
552          * This problem may occur because of insufficient space on host disk
553          * or inappropriate VM shutdown.
554          */
555         if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
556             return 0;
557 
558         if (m_data) {
559             m_data->offset = tmp;
560             m_data->l1_index = l1_index;
561             m_data->l2_index = l2_index;
562             m_data->l2_offset = l2_offset;
563             m_data->valid = 1;
564         }
565     }
566     cluster_offset <<= 9;
567     return cluster_offset;
568 }
569 
570 static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
571                              int nb_sectors, int *pnum)
572 {
573     BDRVVmdkState *s = bs->opaque;
574     int index_in_cluster, n;
575     uint64_t cluster_offset;
576 
577     cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
578     index_in_cluster = sector_num % s->cluster_sectors;
579     n = s->cluster_sectors - index_in_cluster;
580     if (n > nb_sectors)
581         n = nb_sectors;
582     *pnum = n;
583     return (cluster_offset != 0);
584 }
585 
586 static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
587                     uint8_t *buf, int nb_sectors)
588 {
589     BDRVVmdkState *s = bs->opaque;
590     int index_in_cluster, n, ret;
591     uint64_t cluster_offset;
592 
593     while (nb_sectors > 0) {
594         cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
595         index_in_cluster = sector_num % s->cluster_sectors;
596         n = s->cluster_sectors - index_in_cluster;
597         if (n > nb_sectors)
598             n = nb_sectors;
599         if (!cluster_offset) {
600             // try to read from parent image, if exist
601             if (bs->backing_hd) {
602                 if (!vmdk_is_cid_valid(bs))
603                     return -1;
604                 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
605                 if (ret < 0)
606                     return -1;
607             } else {
608                 memset(buf, 0, 512 * n);
609             }
610         } else {
611             if(bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
612                 return -1;
613         }
614         nb_sectors -= n;
615         sector_num += n;
616         buf += n * 512;
617     }
618     return 0;
619 }
620 
621 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
622                      const uint8_t *buf, int nb_sectors)
623 {
624     BDRVVmdkState *s = bs->opaque;
625     VmdkMetaData m_data;
626     int index_in_cluster, n;
627     uint64_t cluster_offset;
628     static int cid_update = 0;
629 
630     if (sector_num > bs->total_sectors) {
631         fprintf(stderr,
632                 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
633                 " total_sectors=0x%" PRIx64 "\n",
634                 sector_num, bs->total_sectors);
635         return -1;
636     }
637 
638     while (nb_sectors > 0) {
639         index_in_cluster = sector_num & (s->cluster_sectors - 1);
640         n = s->cluster_sectors - index_in_cluster;
641         if (n > nb_sectors)
642             n = nb_sectors;
643         cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
644         if (!cluster_offset)
645             return -1;
646 
647         if (bdrv_pwrite(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
648             return -1;
649         if (m_data.valid) {
650             /* update L2 tables */
651             if (vmdk_L2update(bs, &m_data) == -1)
652                 return -1;
653         }
654         nb_sectors -= n;
655         sector_num += n;
656         buf += n * 512;
657 
658         // update CID on the first write every time the virtual disk is opened
659         if (!cid_update) {
660             vmdk_write_cid(bs, time(NULL));
661             cid_update++;
662         }
663     }
664     return 0;
665 }
666 
667 static int vmdk_create(const char *filename, QEMUOptionParameter *options)
668 {
669     int fd, i;
670     VMDK4Header header;
671     uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
672     static const char desc_template[] =
673         "# Disk DescriptorFile\n"
674         "version=1\n"
675         "CID=%x\n"
676         "parentCID=ffffffff\n"
677         "createType=\"monolithicSparse\"\n"
678         "\n"
679         "# Extent description\n"
680         "RW %" PRId64 " SPARSE \"%s\"\n"
681         "\n"
682         "# The Disk Data Base \n"
683         "#DDB\n"
684         "\n"
685         "ddb.virtualHWVersion = \"%d\"\n"
686         "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
687         "ddb.geometry.heads = \"16\"\n"
688         "ddb.geometry.sectors = \"63\"\n"
689         "ddb.adapterType = \"ide\"\n";
690     char desc[1024];
691     const char *real_filename, *temp_str;
692     int64_t total_size = 0;
693     const char *backing_file = NULL;
694     int flags = 0;
695     int ret;
696 
697     // Read out options
698     while (options && options->name) {
699         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
700             total_size = options->value.n / 512;
701         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
702             backing_file = options->value.s;
703         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
704             flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
705         }
706         options++;
707     }
708 
709     /* XXX: add support for backing file */
710     if (backing_file) {
711         return vmdk_snapshot_create(filename, backing_file);
712     }
713 
714     fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
715               0644);
716     if (fd < 0)
717         return -errno;
718     magic = cpu_to_be32(VMDK4_MAGIC);
719     memset(&header, 0, sizeof(header));
720     header.version = cpu_to_le32(1);
721     header.flags = cpu_to_le32(3); /* ?? */
722     header.capacity = cpu_to_le64(total_size);
723     header.granularity = cpu_to_le64(128);
724     header.num_gtes_per_gte = cpu_to_le32(512);
725 
726     grains = (total_size + header.granularity - 1) / header.granularity;
727     gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
728     gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
729     gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
730 
731     header.desc_offset = 1;
732     header.desc_size = 20;
733     header.rgd_offset = header.desc_offset + header.desc_size;
734     header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
735     header.grain_offset =
736        ((header.gd_offset + gd_size + (gt_size * gt_count) +
737          header.granularity - 1) / header.granularity) *
738         header.granularity;
739 
740     header.desc_offset = cpu_to_le64(header.desc_offset);
741     header.desc_size = cpu_to_le64(header.desc_size);
742     header.rgd_offset = cpu_to_le64(header.rgd_offset);
743     header.gd_offset = cpu_to_le64(header.gd_offset);
744     header.grain_offset = cpu_to_le64(header.grain_offset);
745 
746     header.check_bytes[0] = 0xa;
747     header.check_bytes[1] = 0x20;
748     header.check_bytes[2] = 0xd;
749     header.check_bytes[3] = 0xa;
750 
751     /* write all the data */
752     ret = qemu_write_full(fd, &magic, sizeof(magic));
753     if (ret != sizeof(magic)) {
754         ret = -errno;
755         goto exit;
756     }
757     ret = qemu_write_full(fd, &header, sizeof(header));
758     if (ret != sizeof(header)) {
759         ret = -errno;
760         goto exit;
761     }
762 
763     ret = ftruncate(fd, header.grain_offset << 9);
764     if (ret < 0) {
765         ret = -errno;
766         goto exit;
767     }
768 
769     /* write grain directory */
770     lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
771     for (i = 0, tmp = header.rgd_offset + gd_size;
772          i < gt_count; i++, tmp += gt_size) {
773         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
774         if (ret != sizeof(tmp)) {
775             ret = -errno;
776             goto exit;
777         }
778     }
779 
780     /* write backup grain directory */
781     lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
782     for (i = 0, tmp = header.gd_offset + gd_size;
783          i < gt_count; i++, tmp += gt_size) {
784         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
785         if (ret != sizeof(tmp)) {
786             ret = -errno;
787             goto exit;
788         }
789     }
790 
791     /* compose the descriptor */
792     real_filename = filename;
793     if ((temp_str = strrchr(real_filename, '\\')) != NULL)
794         real_filename = temp_str + 1;
795     if ((temp_str = strrchr(real_filename, '/')) != NULL)
796         real_filename = temp_str + 1;
797     if ((temp_str = strrchr(real_filename, ':')) != NULL)
798         real_filename = temp_str + 1;
799     snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
800              total_size, real_filename,
801              (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
802              total_size / (int64_t)(63 * 16));
803 
804     /* write the descriptor */
805     lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET);
806     ret = qemu_write_full(fd, desc, strlen(desc));
807     if (ret != strlen(desc)) {
808         ret = -errno;
809         goto exit;
810     }
811 
812     ret = 0;
813 exit:
814     close(fd);
815     return ret;
816 }
817 
818 static void vmdk_close(BlockDriverState *bs)
819 {
820     BDRVVmdkState *s = bs->opaque;
821 
822     qemu_free(s->l1_table);
823     qemu_free(s->l2_cache);
824 }
825 
826 static void vmdk_flush(BlockDriverState *bs)
827 {
828     bdrv_flush(bs->file);
829 }
830 
831 
832 static QEMUOptionParameter vmdk_create_options[] = {
833     {
834         .name = BLOCK_OPT_SIZE,
835         .type = OPT_SIZE,
836         .help = "Virtual disk size"
837     },
838     {
839         .name = BLOCK_OPT_BACKING_FILE,
840         .type = OPT_STRING,
841         .help = "File name of a base image"
842     },
843     {
844         .name = BLOCK_OPT_COMPAT6,
845         .type = OPT_FLAG,
846         .help = "VMDK version 6 image"
847     },
848     { NULL }
849 };
850 
851 static BlockDriver bdrv_vmdk = {
852     .format_name	= "vmdk",
853     .instance_size	= sizeof(BDRVVmdkState),
854     .bdrv_probe		= vmdk_probe,
855     .bdrv_open      = vmdk_open,
856     .bdrv_read		= vmdk_read,
857     .bdrv_write		= vmdk_write,
858     .bdrv_close		= vmdk_close,
859     .bdrv_create	= vmdk_create,
860     .bdrv_flush		= vmdk_flush,
861     .bdrv_is_allocated	= vmdk_is_allocated,
862 
863     .create_options = vmdk_create_options,
864 };
865 
866 static void bdrv_vmdk_init(void)
867 {
868     bdrv_register(&bdrv_vmdk);
869 }
870 
871 block_init(bdrv_vmdk_init);
872