1 /* 2 * Block driver for the VMDK format 3 * 4 * Copyright (c) 2004 Fabrice Bellard 5 * Copyright (c) 2005 Filip Navara 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * of this software and associated documentation files (the "Software"), to deal 9 * in the Software without restriction, including without limitation the rights 10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 * copies of the Software, and to permit persons to whom the Software is 12 * furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 * THE SOFTWARE. 24 */ 25 26 #include "qemu-common.h" 27 #include "block/block_int.h" 28 #include "qemu/module.h" 29 #include "migration/migration.h" 30 #include <zlib.h> 31 32 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') 33 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') 34 #define VMDK4_COMPRESSION_DEFLATE 1 35 #define VMDK4_FLAG_NL_DETECT (1 << 0) 36 #define VMDK4_FLAG_RGD (1 << 1) 37 /* Zeroed-grain enable bit */ 38 #define VMDK4_FLAG_ZERO_GRAIN (1 << 2) 39 #define VMDK4_FLAG_COMPRESS (1 << 16) 40 #define VMDK4_FLAG_MARKER (1 << 17) 41 #define VMDK4_GD_AT_END 0xffffffffffffffffULL 42 43 #define VMDK_GTE_ZEROED 0x1 44 45 /* VMDK internal error codes */ 46 #define VMDK_OK 0 47 #define VMDK_ERROR (-1) 48 /* Cluster not allocated */ 49 #define VMDK_UNALLOC (-2) 50 #define VMDK_ZEROED (-3) 51 52 #define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain" 53 54 typedef struct { 55 uint32_t version; 56 uint32_t flags; 57 uint32_t disk_sectors; 58 uint32_t granularity; 59 uint32_t l1dir_offset; 60 uint32_t l1dir_size; 61 uint32_t file_sectors; 62 uint32_t cylinders; 63 uint32_t heads; 64 uint32_t sectors_per_track; 65 } QEMU_PACKED VMDK3Header; 66 67 typedef struct { 68 uint32_t version; 69 uint32_t flags; 70 uint64_t capacity; 71 uint64_t granularity; 72 uint64_t desc_offset; 73 uint64_t desc_size; 74 /* Number of GrainTableEntries per GrainTable */ 75 uint32_t num_gtes_per_gt; 76 uint64_t rgd_offset; 77 uint64_t gd_offset; 78 uint64_t grain_offset; 79 char filler[1]; 80 char check_bytes[4]; 81 uint16_t compressAlgorithm; 82 } QEMU_PACKED VMDK4Header; 83 84 #define L2_CACHE_SIZE 16 85 86 typedef struct VmdkExtent { 87 BlockDriverState *file; 88 bool flat; 89 bool compressed; 90 bool has_marker; 91 bool has_zero_grain; 92 int version; 93 int64_t sectors; 94 int64_t end_sector; 95 int64_t flat_start_offset; 96 int64_t l1_table_offset; 97 int64_t l1_backup_table_offset; 98 uint32_t *l1_table; 99 uint32_t *l1_backup_table; 100 unsigned int l1_size; 101 uint32_t l1_entry_sectors; 102 103 unsigned int l2_size; 104 uint32_t *l2_cache; 105 uint32_t l2_cache_offsets[L2_CACHE_SIZE]; 106 uint32_t l2_cache_counts[L2_CACHE_SIZE]; 107 108 int64_t cluster_sectors; 109 } VmdkExtent; 110 111 typedef struct BDRVVmdkState { 112 CoMutex lock; 113 uint64_t desc_offset; 114 bool cid_updated; 115 bool cid_checked; 116 uint32_t parent_cid; 117 int num_extents; 118 /* Extent array with num_extents entries, ascend ordered by address */ 119 VmdkExtent *extents; 120 Error *migration_blocker; 121 } BDRVVmdkState; 122 123 typedef struct VmdkMetaData { 124 uint32_t offset; 125 unsigned int l1_index; 126 unsigned int l2_index; 127 unsigned int l2_offset; 128 int valid; 129 uint32_t *l2_cache_entry; 130 } VmdkMetaData; 131 132 typedef struct VmdkGrainMarker { 133 uint64_t lba; 134 uint32_t size; 135 uint8_t data[0]; 136 } QEMU_PACKED VmdkGrainMarker; 137 138 enum { 139 MARKER_END_OF_STREAM = 0, 140 MARKER_GRAIN_TABLE = 1, 141 MARKER_GRAIN_DIRECTORY = 2, 142 MARKER_FOOTER = 3, 143 }; 144 145 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) 146 { 147 uint32_t magic; 148 149 if (buf_size < 4) { 150 return 0; 151 } 152 magic = be32_to_cpu(*(uint32_t *)buf); 153 if (magic == VMDK3_MAGIC || 154 magic == VMDK4_MAGIC) { 155 return 100; 156 } else { 157 const char *p = (const char *)buf; 158 const char *end = p + buf_size; 159 while (p < end) { 160 if (*p == '#') { 161 /* skip comment line */ 162 while (p < end && *p != '\n') { 163 p++; 164 } 165 p++; 166 continue; 167 } 168 if (*p == ' ') { 169 while (p < end && *p == ' ') { 170 p++; 171 } 172 /* skip '\r' if windows line endings used. */ 173 if (p < end && *p == '\r') { 174 p++; 175 } 176 /* only accept blank lines before 'version=' line */ 177 if (p == end || *p != '\n') { 178 return 0; 179 } 180 p++; 181 continue; 182 } 183 if (end - p >= strlen("version=X\n")) { 184 if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 || 185 strncmp("version=2\n", p, strlen("version=2\n")) == 0) { 186 return 100; 187 } 188 } 189 if (end - p >= strlen("version=X\r\n")) { 190 if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 || 191 strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) { 192 return 100; 193 } 194 } 195 return 0; 196 } 197 return 0; 198 } 199 } 200 201 #define SECTOR_SIZE 512 202 #define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ 203 #define BUF_SIZE 4096 204 #define HEADER_SIZE 512 /* first sector of 512 bytes */ 205 206 static void vmdk_free_extents(BlockDriverState *bs) 207 { 208 int i; 209 BDRVVmdkState *s = bs->opaque; 210 VmdkExtent *e; 211 212 for (i = 0; i < s->num_extents; i++) { 213 e = &s->extents[i]; 214 g_free(e->l1_table); 215 g_free(e->l2_cache); 216 g_free(e->l1_backup_table); 217 if (e->file != bs->file) { 218 bdrv_unref(e->file); 219 } 220 } 221 g_free(s->extents); 222 } 223 224 static void vmdk_free_last_extent(BlockDriverState *bs) 225 { 226 BDRVVmdkState *s = bs->opaque; 227 228 if (s->num_extents == 0) { 229 return; 230 } 231 s->num_extents--; 232 s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent)); 233 } 234 235 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) 236 { 237 char desc[DESC_SIZE]; 238 uint32_t cid = 0xffffffff; 239 const char *p_name, *cid_str; 240 size_t cid_str_size; 241 BDRVVmdkState *s = bs->opaque; 242 int ret; 243 244 ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); 245 if (ret < 0) { 246 return 0; 247 } 248 249 if (parent) { 250 cid_str = "parentCID"; 251 cid_str_size = sizeof("parentCID"); 252 } else { 253 cid_str = "CID"; 254 cid_str_size = sizeof("CID"); 255 } 256 257 desc[DESC_SIZE - 1] = '\0'; 258 p_name = strstr(desc, cid_str); 259 if (p_name != NULL) { 260 p_name += cid_str_size; 261 sscanf(p_name, "%x", &cid); 262 } 263 264 return cid; 265 } 266 267 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) 268 { 269 char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; 270 char *p_name, *tmp_str; 271 BDRVVmdkState *s = bs->opaque; 272 int ret; 273 274 ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); 275 if (ret < 0) { 276 return ret; 277 } 278 279 desc[DESC_SIZE - 1] = '\0'; 280 tmp_str = strstr(desc, "parentCID"); 281 if (tmp_str == NULL) { 282 return -EINVAL; 283 } 284 285 pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); 286 p_name = strstr(desc, "CID"); 287 if (p_name != NULL) { 288 p_name += sizeof("CID"); 289 snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid); 290 pstrcat(desc, sizeof(desc), tmp_desc); 291 } 292 293 ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE); 294 if (ret < 0) { 295 return ret; 296 } 297 298 return 0; 299 } 300 301 static int vmdk_is_cid_valid(BlockDriverState *bs) 302 { 303 BDRVVmdkState *s = bs->opaque; 304 BlockDriverState *p_bs = bs->backing_hd; 305 uint32_t cur_pcid; 306 307 if (!s->cid_checked && p_bs) { 308 cur_pcid = vmdk_read_cid(p_bs, 0); 309 if (s->parent_cid != cur_pcid) { 310 /* CID not valid */ 311 return 0; 312 } 313 } 314 s->cid_checked = true; 315 /* CID valid */ 316 return 1; 317 } 318 319 /* Queue extents, if any, for reopen() */ 320 static int vmdk_reopen_prepare(BDRVReopenState *state, 321 BlockReopenQueue *queue, Error **errp) 322 { 323 BDRVVmdkState *s; 324 int ret = -1; 325 int i; 326 VmdkExtent *e; 327 328 assert(state != NULL); 329 assert(state->bs != NULL); 330 331 if (queue == NULL) { 332 error_setg(errp, "No reopen queue for VMDK extents"); 333 goto exit; 334 } 335 336 s = state->bs->opaque; 337 338 assert(s != NULL); 339 340 for (i = 0; i < s->num_extents; i++) { 341 e = &s->extents[i]; 342 if (e->file != state->bs->file) { 343 bdrv_reopen_queue(queue, e->file, state->flags); 344 } 345 } 346 ret = 0; 347 348 exit: 349 return ret; 350 } 351 352 static int vmdk_parent_open(BlockDriverState *bs) 353 { 354 char *p_name; 355 char desc[DESC_SIZE + 1]; 356 BDRVVmdkState *s = bs->opaque; 357 int ret; 358 359 desc[DESC_SIZE] = '\0'; 360 ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); 361 if (ret < 0) { 362 return ret; 363 } 364 365 p_name = strstr(desc, "parentFileNameHint"); 366 if (p_name != NULL) { 367 char *end_name; 368 369 p_name += sizeof("parentFileNameHint") + 1; 370 end_name = strchr(p_name, '\"'); 371 if (end_name == NULL) { 372 return -EINVAL; 373 } 374 if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { 375 return -EINVAL; 376 } 377 378 pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); 379 } 380 381 return 0; 382 } 383 384 /* Create and append extent to the extent array. Return the added VmdkExtent 385 * address. return NULL if allocation failed. */ 386 static int vmdk_add_extent(BlockDriverState *bs, 387 BlockDriverState *file, bool flat, int64_t sectors, 388 int64_t l1_offset, int64_t l1_backup_offset, 389 uint32_t l1_size, 390 int l2_size, uint64_t cluster_sectors, 391 VmdkExtent **new_extent, 392 Error **errp) 393 { 394 VmdkExtent *extent; 395 BDRVVmdkState *s = bs->opaque; 396 397 if (cluster_sectors > 0x200000) { 398 /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ 399 error_setg(errp, "Invalid granularity, image may be corrupt"); 400 return -EFBIG; 401 } 402 if (l1_size > 512 * 1024 * 1024) { 403 /* Although with big capacity and small l1_entry_sectors, we can get a 404 * big l1_size, we don't want unbounded value to allocate the table. 405 * Limit it to 512M, which is 16PB for default cluster and L2 table 406 * size */ 407 error_setg(errp, "L1 size too big"); 408 return -EFBIG; 409 } 410 411 s->extents = g_realloc(s->extents, 412 (s->num_extents + 1) * sizeof(VmdkExtent)); 413 extent = &s->extents[s->num_extents]; 414 s->num_extents++; 415 416 memset(extent, 0, sizeof(VmdkExtent)); 417 extent->file = file; 418 extent->flat = flat; 419 extent->sectors = sectors; 420 extent->l1_table_offset = l1_offset; 421 extent->l1_backup_table_offset = l1_backup_offset; 422 extent->l1_size = l1_size; 423 extent->l1_entry_sectors = l2_size * cluster_sectors; 424 extent->l2_size = l2_size; 425 extent->cluster_sectors = flat ? sectors : cluster_sectors; 426 427 if (s->num_extents > 1) { 428 extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; 429 } else { 430 extent->end_sector = extent->sectors; 431 } 432 bs->total_sectors = extent->end_sector; 433 if (new_extent) { 434 *new_extent = extent; 435 } 436 return 0; 437 } 438 439 static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, 440 Error **errp) 441 { 442 int ret; 443 int l1_size, i; 444 445 /* read the L1 table */ 446 l1_size = extent->l1_size * sizeof(uint32_t); 447 extent->l1_table = g_malloc(l1_size); 448 ret = bdrv_pread(extent->file, 449 extent->l1_table_offset, 450 extent->l1_table, 451 l1_size); 452 if (ret < 0) { 453 error_setg_errno(errp, -ret, 454 "Could not read l1 table from extent '%s'", 455 extent->file->filename); 456 goto fail_l1; 457 } 458 for (i = 0; i < extent->l1_size; i++) { 459 le32_to_cpus(&extent->l1_table[i]); 460 } 461 462 if (extent->l1_backup_table_offset) { 463 extent->l1_backup_table = g_malloc(l1_size); 464 ret = bdrv_pread(extent->file, 465 extent->l1_backup_table_offset, 466 extent->l1_backup_table, 467 l1_size); 468 if (ret < 0) { 469 error_setg_errno(errp, -ret, 470 "Could not read l1 backup table from extent '%s'", 471 extent->file->filename); 472 goto fail_l1b; 473 } 474 for (i = 0; i < extent->l1_size; i++) { 475 le32_to_cpus(&extent->l1_backup_table[i]); 476 } 477 } 478 479 extent->l2_cache = 480 g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t)); 481 return 0; 482 fail_l1b: 483 g_free(extent->l1_backup_table); 484 fail_l1: 485 g_free(extent->l1_table); 486 return ret; 487 } 488 489 static int vmdk_open_vmfs_sparse(BlockDriverState *bs, 490 BlockDriverState *file, 491 int flags, Error **errp) 492 { 493 int ret; 494 uint32_t magic; 495 VMDK3Header header; 496 VmdkExtent *extent; 497 498 ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); 499 if (ret < 0) { 500 error_setg_errno(errp, -ret, 501 "Could not read header from file '%s'", 502 file->filename); 503 return ret; 504 } 505 ret = vmdk_add_extent(bs, file, false, 506 le32_to_cpu(header.disk_sectors), 507 le32_to_cpu(header.l1dir_offset) << 9, 508 0, 509 le32_to_cpu(header.l1dir_size), 510 4096, 511 le32_to_cpu(header.granularity), 512 &extent, 513 errp); 514 if (ret < 0) { 515 return ret; 516 } 517 ret = vmdk_init_tables(bs, extent, errp); 518 if (ret) { 519 /* free extent allocated by vmdk_add_extent */ 520 vmdk_free_last_extent(bs); 521 } 522 return ret; 523 } 524 525 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, 526 uint64_t desc_offset, Error **errp); 527 528 static int vmdk_open_vmdk4(BlockDriverState *bs, 529 BlockDriverState *file, 530 int flags, Error **errp) 531 { 532 int ret; 533 uint32_t magic; 534 uint32_t l1_size, l1_entry_sectors; 535 VMDK4Header header; 536 VmdkExtent *extent; 537 int64_t l1_backup_offset = 0; 538 539 ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); 540 if (ret < 0) { 541 error_setg_errno(errp, -ret, 542 "Could not read header from file '%s'", 543 file->filename); 544 } 545 if (header.capacity == 0) { 546 uint64_t desc_offset = le64_to_cpu(header.desc_offset); 547 if (desc_offset) { 548 return vmdk_open_desc_file(bs, flags, desc_offset << 9, errp); 549 } 550 } 551 552 if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { 553 /* 554 * The footer takes precedence over the header, so read it in. The 555 * footer starts at offset -1024 from the end: One sector for the 556 * footer, and another one for the end-of-stream marker. 557 */ 558 struct { 559 struct { 560 uint64_t val; 561 uint32_t size; 562 uint32_t type; 563 uint8_t pad[512 - 16]; 564 } QEMU_PACKED footer_marker; 565 566 uint32_t magic; 567 VMDK4Header header; 568 uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; 569 570 struct { 571 uint64_t val; 572 uint32_t size; 573 uint32_t type; 574 uint8_t pad[512 - 16]; 575 } QEMU_PACKED eos_marker; 576 } QEMU_PACKED footer; 577 578 ret = bdrv_pread(file, 579 bs->file->total_sectors * 512 - 1536, 580 &footer, sizeof(footer)); 581 if (ret < 0) { 582 return ret; 583 } 584 585 /* Some sanity checks for the footer */ 586 if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || 587 le32_to_cpu(footer.footer_marker.size) != 0 || 588 le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || 589 le64_to_cpu(footer.eos_marker.val) != 0 || 590 le32_to_cpu(footer.eos_marker.size) != 0 || 591 le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) 592 { 593 return -EINVAL; 594 } 595 596 header = footer.header; 597 } 598 599 if (le32_to_cpu(header.version) >= 3) { 600 char buf[64]; 601 snprintf(buf, sizeof(buf), "VMDK version %d", 602 le32_to_cpu(header.version)); 603 qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, 604 bs->device_name, "vmdk", buf); 605 return -ENOTSUP; 606 } 607 608 if (le32_to_cpu(header.num_gtes_per_gt) > 512) { 609 error_report("L2 table size too big"); 610 return -EINVAL; 611 } 612 613 l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt) 614 * le64_to_cpu(header.granularity); 615 if (l1_entry_sectors == 0) { 616 return -EINVAL; 617 } 618 l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) 619 / l1_entry_sectors; 620 if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { 621 l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; 622 } 623 ret = vmdk_add_extent(bs, file, false, 624 le64_to_cpu(header.capacity), 625 le64_to_cpu(header.gd_offset) << 9, 626 l1_backup_offset, 627 l1_size, 628 le32_to_cpu(header.num_gtes_per_gt), 629 le64_to_cpu(header.granularity), 630 &extent, 631 errp); 632 if (ret < 0) { 633 return ret; 634 } 635 extent->compressed = 636 le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; 637 extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; 638 extent->version = le32_to_cpu(header.version); 639 extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; 640 ret = vmdk_init_tables(bs, extent, errp); 641 if (ret) { 642 /* free extent allocated by vmdk_add_extent */ 643 vmdk_free_last_extent(bs); 644 } 645 return ret; 646 } 647 648 /* find an option value out of descriptor file */ 649 static int vmdk_parse_description(const char *desc, const char *opt_name, 650 char *buf, int buf_size) 651 { 652 char *opt_pos, *opt_end; 653 const char *end = desc + strlen(desc); 654 655 opt_pos = strstr(desc, opt_name); 656 if (!opt_pos) { 657 return VMDK_ERROR; 658 } 659 /* Skip "=\"" following opt_name */ 660 opt_pos += strlen(opt_name) + 2; 661 if (opt_pos >= end) { 662 return VMDK_ERROR; 663 } 664 opt_end = opt_pos; 665 while (opt_end < end && *opt_end != '"') { 666 opt_end++; 667 } 668 if (opt_end == end || buf_size < opt_end - opt_pos + 1) { 669 return VMDK_ERROR; 670 } 671 pstrcpy(buf, opt_end - opt_pos + 1, opt_pos); 672 return VMDK_OK; 673 } 674 675 /* Open an extent file and append to bs array */ 676 static int vmdk_open_sparse(BlockDriverState *bs, 677 BlockDriverState *file, 678 int flags, Error **errp) 679 { 680 uint32_t magic; 681 682 if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) { 683 return -EIO; 684 } 685 686 magic = be32_to_cpu(magic); 687 switch (magic) { 688 case VMDK3_MAGIC: 689 return vmdk_open_vmfs_sparse(bs, file, flags, errp); 690 break; 691 case VMDK4_MAGIC: 692 return vmdk_open_vmdk4(bs, file, flags, errp); 693 break; 694 default: 695 return -EMEDIUMTYPE; 696 break; 697 } 698 } 699 700 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, 701 const char *desc_file_path, Error **errp) 702 { 703 int ret; 704 char access[11]; 705 char type[11]; 706 char fname[512]; 707 const char *p = desc; 708 int64_t sectors = 0; 709 int64_t flat_offset; 710 char extent_path[PATH_MAX]; 711 BlockDriverState *extent_file; 712 713 while (*p) { 714 /* parse extent line: 715 * RW [size in sectors] FLAT "file-name.vmdk" OFFSET 716 * or 717 * RW [size in sectors] SPARSE "file-name.vmdk" 718 */ 719 flat_offset = -1; 720 ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64, 721 access, §ors, type, fname, &flat_offset); 722 if (ret < 4 || strcmp(access, "RW")) { 723 goto next_line; 724 } else if (!strcmp(type, "FLAT")) { 725 if (ret != 5 || flat_offset < 0) { 726 error_setg(errp, "Invalid extent lines: \n%s", p); 727 return -EINVAL; 728 } 729 } else if (!strcmp(type, "VMFS")) { 730 flat_offset = 0; 731 } else if (ret != 4) { 732 error_setg(errp, "Invalid extent lines: \n%s", p); 733 return -EINVAL; 734 } 735 736 if (sectors <= 0 || 737 (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && 738 strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || 739 (strcmp(access, "RW"))) { 740 goto next_line; 741 } 742 743 path_combine(extent_path, sizeof(extent_path), 744 desc_file_path, fname); 745 ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags, 746 errp); 747 if (ret) { 748 return ret; 749 } 750 751 /* save to extents array */ 752 if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { 753 /* FLAT extent */ 754 VmdkExtent *extent; 755 756 ret = vmdk_add_extent(bs, extent_file, true, sectors, 757 0, 0, 0, 0, 0, &extent, errp); 758 if (ret < 0) { 759 return ret; 760 } 761 extent->flat_start_offset = flat_offset << 9; 762 } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { 763 /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ 764 ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, errp); 765 if (ret) { 766 bdrv_unref(extent_file); 767 return ret; 768 } 769 } else { 770 error_setg(errp, "Unsupported extent type '%s'", type); 771 return -ENOTSUP; 772 } 773 next_line: 774 /* move to next line */ 775 while (*p) { 776 if (*p == '\n') { 777 p++; 778 break; 779 } 780 p++; 781 } 782 } 783 return 0; 784 } 785 786 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, 787 uint64_t desc_offset, Error **errp) 788 { 789 int ret; 790 char *buf = NULL; 791 char ct[128]; 792 BDRVVmdkState *s = bs->opaque; 793 int64_t size; 794 795 size = bdrv_getlength(bs->file); 796 if (size < 0) { 797 return -EINVAL; 798 } 799 800 size = MIN(size, 1 << 20); /* avoid unbounded allocation */ 801 buf = g_malloc0(size + 1); 802 803 ret = bdrv_pread(bs->file, desc_offset, buf, size); 804 if (ret < 0) { 805 goto exit; 806 } 807 if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { 808 ret = -EMEDIUMTYPE; 809 goto exit; 810 } 811 if (strcmp(ct, "monolithicFlat") && 812 strcmp(ct, "vmfs") && 813 strcmp(ct, "vmfsSparse") && 814 strcmp(ct, "twoGbMaxExtentSparse") && 815 strcmp(ct, "twoGbMaxExtentFlat")) { 816 error_setg(errp, "Unsupported image type '%s'", ct); 817 ret = -ENOTSUP; 818 goto exit; 819 } 820 s->desc_offset = 0; 821 ret = vmdk_parse_extents(buf, bs, bs->file->filename, errp); 822 exit: 823 g_free(buf); 824 return ret; 825 } 826 827 static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, 828 Error **errp) 829 { 830 int ret; 831 BDRVVmdkState *s = bs->opaque; 832 833 if (vmdk_open_sparse(bs, bs->file, flags, errp) == 0) { 834 s->desc_offset = 0x200; 835 } else { 836 ret = vmdk_open_desc_file(bs, flags, 0, errp); 837 if (ret) { 838 goto fail; 839 } 840 } 841 /* try to open parent images, if exist */ 842 ret = vmdk_parent_open(bs); 843 if (ret) { 844 goto fail; 845 } 846 s->parent_cid = vmdk_read_cid(bs, 1); 847 qemu_co_mutex_init(&s->lock); 848 849 /* Disable migration when VMDK images are used */ 850 error_set(&s->migration_blocker, 851 QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, 852 "vmdk", bs->device_name, "live migration"); 853 migrate_add_blocker(s->migration_blocker); 854 855 return 0; 856 857 fail: 858 vmdk_free_extents(bs); 859 return ret; 860 } 861 862 static int get_whole_cluster(BlockDriverState *bs, 863 VmdkExtent *extent, 864 uint64_t cluster_offset, 865 uint64_t offset, 866 bool allocate) 867 { 868 int ret = VMDK_OK; 869 uint8_t *whole_grain = NULL; 870 871 /* we will be here if it's first write on non-exist grain(cluster). 872 * try to read from parent image, if exist */ 873 if (bs->backing_hd) { 874 whole_grain = 875 qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS); 876 if (!vmdk_is_cid_valid(bs)) { 877 ret = VMDK_ERROR; 878 goto exit; 879 } 880 881 /* floor offset to cluster */ 882 offset -= offset % (extent->cluster_sectors * 512); 883 ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, 884 extent->cluster_sectors); 885 if (ret < 0) { 886 ret = VMDK_ERROR; 887 goto exit; 888 } 889 890 /* Write grain only into the active image */ 891 ret = bdrv_write(extent->file, cluster_offset, whole_grain, 892 extent->cluster_sectors); 893 if (ret < 0) { 894 ret = VMDK_ERROR; 895 goto exit; 896 } 897 } 898 exit: 899 qemu_vfree(whole_grain); 900 return ret; 901 } 902 903 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) 904 { 905 uint32_t offset; 906 QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset)); 907 offset = cpu_to_le32(m_data->offset); 908 /* update L2 table */ 909 if (bdrv_pwrite_sync( 910 extent->file, 911 ((int64_t)m_data->l2_offset * 512) 912 + (m_data->l2_index * sizeof(m_data->offset)), 913 &offset, sizeof(offset)) < 0) { 914 return VMDK_ERROR; 915 } 916 /* update backup L2 table */ 917 if (extent->l1_backup_table_offset != 0) { 918 m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; 919 if (bdrv_pwrite_sync( 920 extent->file, 921 ((int64_t)m_data->l2_offset * 512) 922 + (m_data->l2_index * sizeof(m_data->offset)), 923 &offset, sizeof(offset)) < 0) { 924 return VMDK_ERROR; 925 } 926 } 927 if (m_data->l2_cache_entry) { 928 *m_data->l2_cache_entry = offset; 929 } 930 931 return VMDK_OK; 932 } 933 934 static int get_cluster_offset(BlockDriverState *bs, 935 VmdkExtent *extent, 936 VmdkMetaData *m_data, 937 uint64_t offset, 938 int allocate, 939 uint64_t *cluster_offset) 940 { 941 unsigned int l1_index, l2_offset, l2_index; 942 int min_index, i, j; 943 uint32_t min_count, *l2_table; 944 bool zeroed = false; 945 946 if (m_data) { 947 m_data->valid = 0; 948 } 949 if (extent->flat) { 950 *cluster_offset = extent->flat_start_offset; 951 return VMDK_OK; 952 } 953 954 offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; 955 l1_index = (offset >> 9) / extent->l1_entry_sectors; 956 if (l1_index >= extent->l1_size) { 957 return VMDK_ERROR; 958 } 959 l2_offset = extent->l1_table[l1_index]; 960 if (!l2_offset) { 961 return VMDK_UNALLOC; 962 } 963 for (i = 0; i < L2_CACHE_SIZE; i++) { 964 if (l2_offset == extent->l2_cache_offsets[i]) { 965 /* increment the hit count */ 966 if (++extent->l2_cache_counts[i] == 0xffffffff) { 967 for (j = 0; j < L2_CACHE_SIZE; j++) { 968 extent->l2_cache_counts[j] >>= 1; 969 } 970 } 971 l2_table = extent->l2_cache + (i * extent->l2_size); 972 goto found; 973 } 974 } 975 /* not found: load a new entry in the least used one */ 976 min_index = 0; 977 min_count = 0xffffffff; 978 for (i = 0; i < L2_CACHE_SIZE; i++) { 979 if (extent->l2_cache_counts[i] < min_count) { 980 min_count = extent->l2_cache_counts[i]; 981 min_index = i; 982 } 983 } 984 l2_table = extent->l2_cache + (min_index * extent->l2_size); 985 if (bdrv_pread( 986 extent->file, 987 (int64_t)l2_offset * 512, 988 l2_table, 989 extent->l2_size * sizeof(uint32_t) 990 ) != extent->l2_size * sizeof(uint32_t)) { 991 return VMDK_ERROR; 992 } 993 994 extent->l2_cache_offsets[min_index] = l2_offset; 995 extent->l2_cache_counts[min_index] = 1; 996 found: 997 l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; 998 *cluster_offset = le32_to_cpu(l2_table[l2_index]); 999 1000 if (m_data) { 1001 m_data->valid = 1; 1002 m_data->l1_index = l1_index; 1003 m_data->l2_index = l2_index; 1004 m_data->offset = *cluster_offset; 1005 m_data->l2_offset = l2_offset; 1006 m_data->l2_cache_entry = &l2_table[l2_index]; 1007 } 1008 if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) { 1009 zeroed = true; 1010 } 1011 1012 if (!*cluster_offset || zeroed) { 1013 if (!allocate) { 1014 return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; 1015 } 1016 1017 /* Avoid the L2 tables update for the images that have snapshots. */ 1018 *cluster_offset = bdrv_getlength(extent->file); 1019 if (!extent->compressed) { 1020 bdrv_truncate( 1021 extent->file, 1022 *cluster_offset + (extent->cluster_sectors << 9) 1023 ); 1024 } 1025 1026 *cluster_offset >>= 9; 1027 l2_table[l2_index] = cpu_to_le32(*cluster_offset); 1028 1029 /* First of all we write grain itself, to avoid race condition 1030 * that may to corrupt the image. 1031 * This problem may occur because of insufficient space on host disk 1032 * or inappropriate VM shutdown. 1033 */ 1034 if (get_whole_cluster( 1035 bs, extent, *cluster_offset, offset, allocate) == -1) { 1036 return VMDK_ERROR; 1037 } 1038 1039 if (m_data) { 1040 m_data->offset = *cluster_offset; 1041 } 1042 } 1043 *cluster_offset <<= 9; 1044 return VMDK_OK; 1045 } 1046 1047 static VmdkExtent *find_extent(BDRVVmdkState *s, 1048 int64_t sector_num, VmdkExtent *start_hint) 1049 { 1050 VmdkExtent *extent = start_hint; 1051 1052 if (!extent) { 1053 extent = &s->extents[0]; 1054 } 1055 while (extent < &s->extents[s->num_extents]) { 1056 if (sector_num < extent->end_sector) { 1057 return extent; 1058 } 1059 extent++; 1060 } 1061 return NULL; 1062 } 1063 1064 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, 1065 int64_t sector_num, int nb_sectors, int *pnum) 1066 { 1067 BDRVVmdkState *s = bs->opaque; 1068 int64_t index_in_cluster, n, ret; 1069 uint64_t offset; 1070 VmdkExtent *extent; 1071 1072 extent = find_extent(s, sector_num, NULL); 1073 if (!extent) { 1074 return 0; 1075 } 1076 qemu_co_mutex_lock(&s->lock); 1077 ret = get_cluster_offset(bs, extent, NULL, 1078 sector_num * 512, 0, &offset); 1079 qemu_co_mutex_unlock(&s->lock); 1080 1081 switch (ret) { 1082 case VMDK_ERROR: 1083 ret = -EIO; 1084 break; 1085 case VMDK_UNALLOC: 1086 ret = 0; 1087 break; 1088 case VMDK_ZEROED: 1089 ret = BDRV_BLOCK_ZERO; 1090 break; 1091 case VMDK_OK: 1092 ret = BDRV_BLOCK_DATA; 1093 if (extent->file == bs->file) { 1094 ret |= BDRV_BLOCK_OFFSET_VALID | offset; 1095 } 1096 1097 break; 1098 } 1099 1100 index_in_cluster = sector_num % extent->cluster_sectors; 1101 n = extent->cluster_sectors - index_in_cluster; 1102 if (n > nb_sectors) { 1103 n = nb_sectors; 1104 } 1105 *pnum = n; 1106 return ret; 1107 } 1108 1109 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, 1110 int64_t offset_in_cluster, const uint8_t *buf, 1111 int nb_sectors, int64_t sector_num) 1112 { 1113 int ret; 1114 VmdkGrainMarker *data = NULL; 1115 uLongf buf_len; 1116 const uint8_t *write_buf = buf; 1117 int write_len = nb_sectors * 512; 1118 1119 if (extent->compressed) { 1120 if (!extent->has_marker) { 1121 ret = -EINVAL; 1122 goto out; 1123 } 1124 buf_len = (extent->cluster_sectors << 9) * 2; 1125 data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); 1126 if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || 1127 buf_len == 0) { 1128 ret = -EINVAL; 1129 goto out; 1130 } 1131 data->lba = sector_num; 1132 data->size = buf_len; 1133 write_buf = (uint8_t *)data; 1134 write_len = buf_len + sizeof(VmdkGrainMarker); 1135 } 1136 ret = bdrv_pwrite(extent->file, 1137 cluster_offset + offset_in_cluster, 1138 write_buf, 1139 write_len); 1140 if (ret != write_len) { 1141 ret = ret < 0 ? ret : -EIO; 1142 goto out; 1143 } 1144 ret = 0; 1145 out: 1146 g_free(data); 1147 return ret; 1148 } 1149 1150 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, 1151 int64_t offset_in_cluster, uint8_t *buf, 1152 int nb_sectors) 1153 { 1154 int ret; 1155 int cluster_bytes, buf_bytes; 1156 uint8_t *cluster_buf, *compressed_data; 1157 uint8_t *uncomp_buf; 1158 uint32_t data_len; 1159 VmdkGrainMarker *marker; 1160 uLongf buf_len; 1161 1162 1163 if (!extent->compressed) { 1164 ret = bdrv_pread(extent->file, 1165 cluster_offset + offset_in_cluster, 1166 buf, nb_sectors * 512); 1167 if (ret == nb_sectors * 512) { 1168 return 0; 1169 } else { 1170 return -EIO; 1171 } 1172 } 1173 cluster_bytes = extent->cluster_sectors * 512; 1174 /* Read two clusters in case GrainMarker + compressed data > one cluster */ 1175 buf_bytes = cluster_bytes * 2; 1176 cluster_buf = g_malloc(buf_bytes); 1177 uncomp_buf = g_malloc(cluster_bytes); 1178 ret = bdrv_pread(extent->file, 1179 cluster_offset, 1180 cluster_buf, buf_bytes); 1181 if (ret < 0) { 1182 goto out; 1183 } 1184 compressed_data = cluster_buf; 1185 buf_len = cluster_bytes; 1186 data_len = cluster_bytes; 1187 if (extent->has_marker) { 1188 marker = (VmdkGrainMarker *)cluster_buf; 1189 compressed_data = marker->data; 1190 data_len = le32_to_cpu(marker->size); 1191 } 1192 if (!data_len || data_len > buf_bytes) { 1193 ret = -EINVAL; 1194 goto out; 1195 } 1196 ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len); 1197 if (ret != Z_OK) { 1198 ret = -EINVAL; 1199 goto out; 1200 1201 } 1202 if (offset_in_cluster < 0 || 1203 offset_in_cluster + nb_sectors * 512 > buf_len) { 1204 ret = -EINVAL; 1205 goto out; 1206 } 1207 memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); 1208 ret = 0; 1209 1210 out: 1211 g_free(uncomp_buf); 1212 g_free(cluster_buf); 1213 return ret; 1214 } 1215 1216 static int vmdk_read(BlockDriverState *bs, int64_t sector_num, 1217 uint8_t *buf, int nb_sectors) 1218 { 1219 BDRVVmdkState *s = bs->opaque; 1220 int ret; 1221 uint64_t n, index_in_cluster; 1222 uint64_t extent_begin_sector, extent_relative_sector_num; 1223 VmdkExtent *extent = NULL; 1224 uint64_t cluster_offset; 1225 1226 while (nb_sectors > 0) { 1227 extent = find_extent(s, sector_num, extent); 1228 if (!extent) { 1229 return -EIO; 1230 } 1231 ret = get_cluster_offset( 1232 bs, extent, NULL, 1233 sector_num << 9, 0, &cluster_offset); 1234 extent_begin_sector = extent->end_sector - extent->sectors; 1235 extent_relative_sector_num = sector_num - extent_begin_sector; 1236 index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; 1237 n = extent->cluster_sectors - index_in_cluster; 1238 if (n > nb_sectors) { 1239 n = nb_sectors; 1240 } 1241 if (ret != VMDK_OK) { 1242 /* if not allocated, try to read from parent image, if exist */ 1243 if (bs->backing_hd && ret != VMDK_ZEROED) { 1244 if (!vmdk_is_cid_valid(bs)) { 1245 return -EINVAL; 1246 } 1247 ret = bdrv_read(bs->backing_hd, sector_num, buf, n); 1248 if (ret < 0) { 1249 return ret; 1250 } 1251 } else { 1252 memset(buf, 0, 512 * n); 1253 } 1254 } else { 1255 ret = vmdk_read_extent(extent, 1256 cluster_offset, index_in_cluster * 512, 1257 buf, n); 1258 if (ret) { 1259 return ret; 1260 } 1261 } 1262 nb_sectors -= n; 1263 sector_num += n; 1264 buf += n * 512; 1265 } 1266 return 0; 1267 } 1268 1269 static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, 1270 uint8_t *buf, int nb_sectors) 1271 { 1272 int ret; 1273 BDRVVmdkState *s = bs->opaque; 1274 qemu_co_mutex_lock(&s->lock); 1275 ret = vmdk_read(bs, sector_num, buf, nb_sectors); 1276 qemu_co_mutex_unlock(&s->lock); 1277 return ret; 1278 } 1279 1280 /** 1281 * vmdk_write: 1282 * @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature 1283 * if possible, otherwise return -ENOTSUP. 1284 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try 1285 * with each cluster. By dry run we can find if the zero write 1286 * is possible without modifying image data. 1287 * 1288 * Returns: error code with 0 for success. 1289 */ 1290 static int vmdk_write(BlockDriverState *bs, int64_t sector_num, 1291 const uint8_t *buf, int nb_sectors, 1292 bool zeroed, bool zero_dry_run) 1293 { 1294 BDRVVmdkState *s = bs->opaque; 1295 VmdkExtent *extent = NULL; 1296 int n, ret; 1297 int64_t index_in_cluster; 1298 uint64_t extent_begin_sector, extent_relative_sector_num; 1299 uint64_t cluster_offset; 1300 VmdkMetaData m_data; 1301 1302 if (sector_num > bs->total_sectors) { 1303 error_report("Wrong offset: sector_num=0x%" PRIx64 1304 " total_sectors=0x%" PRIx64 "\n", 1305 sector_num, bs->total_sectors); 1306 return -EIO; 1307 } 1308 1309 while (nb_sectors > 0) { 1310 extent = find_extent(s, sector_num, extent); 1311 if (!extent) { 1312 return -EIO; 1313 } 1314 ret = get_cluster_offset( 1315 bs, 1316 extent, 1317 &m_data, 1318 sector_num << 9, !extent->compressed, 1319 &cluster_offset); 1320 if (extent->compressed) { 1321 if (ret == VMDK_OK) { 1322 /* Refuse write to allocated cluster for streamOptimized */ 1323 error_report("Could not write to allocated cluster" 1324 " for streamOptimized"); 1325 return -EIO; 1326 } else { 1327 /* allocate */ 1328 ret = get_cluster_offset( 1329 bs, 1330 extent, 1331 &m_data, 1332 sector_num << 9, 1, 1333 &cluster_offset); 1334 } 1335 } 1336 if (ret == VMDK_ERROR) { 1337 return -EINVAL; 1338 } 1339 extent_begin_sector = extent->end_sector - extent->sectors; 1340 extent_relative_sector_num = sector_num - extent_begin_sector; 1341 index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; 1342 n = extent->cluster_sectors - index_in_cluster; 1343 if (n > nb_sectors) { 1344 n = nb_sectors; 1345 } 1346 if (zeroed) { 1347 /* Do zeroed write, buf is ignored */ 1348 if (extent->has_zero_grain && 1349 index_in_cluster == 0 && 1350 n >= extent->cluster_sectors) { 1351 n = extent->cluster_sectors; 1352 if (!zero_dry_run) { 1353 m_data.offset = VMDK_GTE_ZEROED; 1354 /* update L2 tables */ 1355 if (vmdk_L2update(extent, &m_data) != VMDK_OK) { 1356 return -EIO; 1357 } 1358 } 1359 } else { 1360 return -ENOTSUP; 1361 } 1362 } else { 1363 ret = vmdk_write_extent(extent, 1364 cluster_offset, index_in_cluster * 512, 1365 buf, n, sector_num); 1366 if (ret) { 1367 return ret; 1368 } 1369 if (m_data.valid) { 1370 /* update L2 tables */ 1371 if (vmdk_L2update(extent, &m_data) != VMDK_OK) { 1372 return -EIO; 1373 } 1374 } 1375 } 1376 nb_sectors -= n; 1377 sector_num += n; 1378 buf += n * 512; 1379 1380 /* update CID on the first write every time the virtual disk is 1381 * opened */ 1382 if (!s->cid_updated) { 1383 ret = vmdk_write_cid(bs, time(NULL)); 1384 if (ret < 0) { 1385 return ret; 1386 } 1387 s->cid_updated = true; 1388 } 1389 } 1390 return 0; 1391 } 1392 1393 static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, 1394 const uint8_t *buf, int nb_sectors) 1395 { 1396 int ret; 1397 BDRVVmdkState *s = bs->opaque; 1398 qemu_co_mutex_lock(&s->lock); 1399 ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); 1400 qemu_co_mutex_unlock(&s->lock); 1401 return ret; 1402 } 1403 1404 static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, 1405 int64_t sector_num, 1406 int nb_sectors) 1407 { 1408 int ret; 1409 BDRVVmdkState *s = bs->opaque; 1410 qemu_co_mutex_lock(&s->lock); 1411 /* write zeroes could fail if sectors not aligned to cluster, test it with 1412 * dry_run == true before really updating image */ 1413 ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); 1414 if (!ret) { 1415 ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); 1416 } 1417 qemu_co_mutex_unlock(&s->lock); 1418 return ret; 1419 } 1420 1421 static int vmdk_create_extent(const char *filename, int64_t filesize, 1422 bool flat, bool compress, bool zeroed_grain) 1423 { 1424 int ret, i; 1425 int fd = 0; 1426 VMDK4Header header; 1427 uint32_t tmp, magic, grains, gd_size, gt_size, gt_count; 1428 1429 fd = qemu_open(filename, 1430 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 1431 0644); 1432 if (fd < 0) { 1433 return -errno; 1434 } 1435 if (flat) { 1436 ret = ftruncate(fd, filesize); 1437 if (ret < 0) { 1438 ret = -errno; 1439 } 1440 goto exit; 1441 } 1442 magic = cpu_to_be32(VMDK4_MAGIC); 1443 memset(&header, 0, sizeof(header)); 1444 header.version = zeroed_grain ? 2 : 1; 1445 header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT 1446 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) 1447 | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); 1448 header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; 1449 header.capacity = filesize / 512; 1450 header.granularity = 128; 1451 header.num_gtes_per_gt = 512; 1452 1453 grains = (filesize / 512 + header.granularity - 1) / header.granularity; 1454 gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9; 1455 gt_count = 1456 (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt; 1457 gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; 1458 1459 header.desc_offset = 1; 1460 header.desc_size = 20; 1461 header.rgd_offset = header.desc_offset + header.desc_size; 1462 header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count); 1463 header.grain_offset = 1464 ((header.gd_offset + gd_size + (gt_size * gt_count) + 1465 header.granularity - 1) / header.granularity) * 1466 header.granularity; 1467 /* swap endianness for all header fields */ 1468 header.version = cpu_to_le32(header.version); 1469 header.flags = cpu_to_le32(header.flags); 1470 header.capacity = cpu_to_le64(header.capacity); 1471 header.granularity = cpu_to_le64(header.granularity); 1472 header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt); 1473 header.desc_offset = cpu_to_le64(header.desc_offset); 1474 header.desc_size = cpu_to_le64(header.desc_size); 1475 header.rgd_offset = cpu_to_le64(header.rgd_offset); 1476 header.gd_offset = cpu_to_le64(header.gd_offset); 1477 header.grain_offset = cpu_to_le64(header.grain_offset); 1478 header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm); 1479 1480 header.check_bytes[0] = 0xa; 1481 header.check_bytes[1] = 0x20; 1482 header.check_bytes[2] = 0xd; 1483 header.check_bytes[3] = 0xa; 1484 1485 /* write all the data */ 1486 ret = qemu_write_full(fd, &magic, sizeof(magic)); 1487 if (ret != sizeof(magic)) { 1488 ret = -errno; 1489 goto exit; 1490 } 1491 ret = qemu_write_full(fd, &header, sizeof(header)); 1492 if (ret != sizeof(header)) { 1493 ret = -errno; 1494 goto exit; 1495 } 1496 1497 ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9); 1498 if (ret < 0) { 1499 ret = -errno; 1500 goto exit; 1501 } 1502 1503 /* write grain directory */ 1504 lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET); 1505 for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size; 1506 i < gt_count; i++, tmp += gt_size) { 1507 ret = qemu_write_full(fd, &tmp, sizeof(tmp)); 1508 if (ret != sizeof(tmp)) { 1509 ret = -errno; 1510 goto exit; 1511 } 1512 } 1513 1514 /* write backup grain directory */ 1515 lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET); 1516 for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size; 1517 i < gt_count; i++, tmp += gt_size) { 1518 ret = qemu_write_full(fd, &tmp, sizeof(tmp)); 1519 if (ret != sizeof(tmp)) { 1520 ret = -errno; 1521 goto exit; 1522 } 1523 } 1524 1525 ret = 0; 1526 exit: 1527 qemu_close(fd); 1528 return ret; 1529 } 1530 1531 static int filename_decompose(const char *filename, char *path, char *prefix, 1532 char *postfix, size_t buf_len, Error **errp) 1533 { 1534 const char *p, *q; 1535 1536 if (filename == NULL || !strlen(filename)) { 1537 error_setg(errp, "No filename provided"); 1538 return VMDK_ERROR; 1539 } 1540 p = strrchr(filename, '/'); 1541 if (p == NULL) { 1542 p = strrchr(filename, '\\'); 1543 } 1544 if (p == NULL) { 1545 p = strrchr(filename, ':'); 1546 } 1547 if (p != NULL) { 1548 p++; 1549 if (p - filename >= buf_len) { 1550 return VMDK_ERROR; 1551 } 1552 pstrcpy(path, p - filename + 1, filename); 1553 } else { 1554 p = filename; 1555 path[0] = '\0'; 1556 } 1557 q = strrchr(p, '.'); 1558 if (q == NULL) { 1559 pstrcpy(prefix, buf_len, p); 1560 postfix[0] = '\0'; 1561 } else { 1562 if (q - p >= buf_len) { 1563 return VMDK_ERROR; 1564 } 1565 pstrcpy(prefix, q - p + 1, p); 1566 pstrcpy(postfix, buf_len, q); 1567 } 1568 return VMDK_OK; 1569 } 1570 1571 static int vmdk_create(const char *filename, QEMUOptionParameter *options, 1572 Error **errp) 1573 { 1574 int fd, idx = 0; 1575 char desc[BUF_SIZE]; 1576 int64_t total_size = 0, filesize; 1577 const char *adapter_type = NULL; 1578 const char *backing_file = NULL; 1579 const char *fmt = NULL; 1580 int flags = 0; 1581 int ret = 0; 1582 bool flat, split, compress; 1583 char ext_desc_lines[BUF_SIZE] = ""; 1584 char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX]; 1585 const int64_t split_size = 0x80000000; /* VMDK has constant split size */ 1586 const char *desc_extent_line; 1587 char parent_desc_line[BUF_SIZE] = ""; 1588 uint32_t parent_cid = 0xffffffff; 1589 uint32_t number_heads = 16; 1590 bool zeroed_grain = false; 1591 const char desc_template[] = 1592 "# Disk DescriptorFile\n" 1593 "version=1\n" 1594 "CID=%x\n" 1595 "parentCID=%x\n" 1596 "createType=\"%s\"\n" 1597 "%s" 1598 "\n" 1599 "# Extent description\n" 1600 "%s" 1601 "\n" 1602 "# The Disk Data Base\n" 1603 "#DDB\n" 1604 "\n" 1605 "ddb.virtualHWVersion = \"%d\"\n" 1606 "ddb.geometry.cylinders = \"%" PRId64 "\"\n" 1607 "ddb.geometry.heads = \"%d\"\n" 1608 "ddb.geometry.sectors = \"63\"\n" 1609 "ddb.adapterType = \"%s\"\n"; 1610 1611 if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { 1612 return -EINVAL; 1613 } 1614 /* Read out options */ 1615 while (options && options->name) { 1616 if (!strcmp(options->name, BLOCK_OPT_SIZE)) { 1617 total_size = options->value.n; 1618 } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) { 1619 adapter_type = options->value.s; 1620 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { 1621 backing_file = options->value.s; 1622 } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) { 1623 flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0; 1624 } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { 1625 fmt = options->value.s; 1626 } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) { 1627 zeroed_grain |= options->value.n; 1628 } 1629 options++; 1630 } 1631 if (!adapter_type) { 1632 adapter_type = "ide"; 1633 } else if (strcmp(adapter_type, "ide") && 1634 strcmp(adapter_type, "buslogic") && 1635 strcmp(adapter_type, "lsilogic") && 1636 strcmp(adapter_type, "legacyESX")) { 1637 error_setg(errp, "Unknown adapter type: '%s'", adapter_type); 1638 return -EINVAL; 1639 } 1640 if (strcmp(adapter_type, "ide") != 0) { 1641 /* that's the number of heads with which vmware operates when 1642 creating, exporting, etc. vmdk files with a non-ide adapter type */ 1643 number_heads = 255; 1644 } 1645 if (!fmt) { 1646 /* Default format to monolithicSparse */ 1647 fmt = "monolithicSparse"; 1648 } else if (strcmp(fmt, "monolithicFlat") && 1649 strcmp(fmt, "monolithicSparse") && 1650 strcmp(fmt, "twoGbMaxExtentSparse") && 1651 strcmp(fmt, "twoGbMaxExtentFlat") && 1652 strcmp(fmt, "streamOptimized")) { 1653 error_setg(errp, "Unknown subformat: '%s'", fmt); 1654 return -EINVAL; 1655 } 1656 split = !(strcmp(fmt, "twoGbMaxExtentFlat") && 1657 strcmp(fmt, "twoGbMaxExtentSparse")); 1658 flat = !(strcmp(fmt, "monolithicFlat") && 1659 strcmp(fmt, "twoGbMaxExtentFlat")); 1660 compress = !strcmp(fmt, "streamOptimized"); 1661 if (flat) { 1662 desc_extent_line = "RW %lld FLAT \"%s\" 0\n"; 1663 } else { 1664 desc_extent_line = "RW %lld SPARSE \"%s\"\n"; 1665 } 1666 if (flat && backing_file) { 1667 error_setg(errp, "Flat image can't have backing file"); 1668 return -ENOTSUP; 1669 } 1670 if (flat && zeroed_grain) { 1671 error_setg(errp, "Flat image can't enable zeroed grain"); 1672 return -ENOTSUP; 1673 } 1674 if (backing_file) { 1675 BlockDriverState *bs = bdrv_new(""); 1676 ret = bdrv_open(bs, backing_file, NULL, 0, NULL, errp); 1677 if (ret != 0) { 1678 bdrv_unref(bs); 1679 return ret; 1680 } 1681 if (strcmp(bs->drv->format_name, "vmdk")) { 1682 bdrv_unref(bs); 1683 return -EINVAL; 1684 } 1685 parent_cid = vmdk_read_cid(bs, 0); 1686 bdrv_unref(bs); 1687 snprintf(parent_desc_line, sizeof(parent_desc_line), 1688 "parentFileNameHint=\"%s\"", backing_file); 1689 } 1690 1691 /* Create extents */ 1692 filesize = total_size; 1693 while (filesize > 0) { 1694 char desc_line[BUF_SIZE]; 1695 char ext_filename[PATH_MAX]; 1696 char desc_filename[PATH_MAX]; 1697 int64_t size = filesize; 1698 1699 if (split && size > split_size) { 1700 size = split_size; 1701 } 1702 if (split) { 1703 snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s", 1704 prefix, flat ? 'f' : 's', ++idx, postfix); 1705 } else if (flat) { 1706 snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s", 1707 prefix, postfix); 1708 } else { 1709 snprintf(desc_filename, sizeof(desc_filename), "%s%s", 1710 prefix, postfix); 1711 } 1712 snprintf(ext_filename, sizeof(ext_filename), "%s%s", 1713 path, desc_filename); 1714 1715 if (vmdk_create_extent(ext_filename, size, 1716 flat, compress, zeroed_grain)) { 1717 return -EINVAL; 1718 } 1719 filesize -= size; 1720 1721 /* Format description line */ 1722 snprintf(desc_line, sizeof(desc_line), 1723 desc_extent_line, size / 512, desc_filename); 1724 pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line); 1725 } 1726 /* generate descriptor file */ 1727 snprintf(desc, sizeof(desc), desc_template, 1728 (unsigned int)time(NULL), 1729 parent_cid, 1730 fmt, 1731 parent_desc_line, 1732 ext_desc_lines, 1733 (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), 1734 total_size / (int64_t)(63 * number_heads * 512), number_heads, 1735 adapter_type); 1736 if (split || flat) { 1737 fd = qemu_open(filename, 1738 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 1739 0644); 1740 } else { 1741 fd = qemu_open(filename, 1742 O_WRONLY | O_BINARY | O_LARGEFILE, 1743 0644); 1744 } 1745 if (fd < 0) { 1746 return -errno; 1747 } 1748 /* the descriptor offset = 0x200 */ 1749 if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) { 1750 ret = -errno; 1751 goto exit; 1752 } 1753 ret = qemu_write_full(fd, desc, strlen(desc)); 1754 if (ret != strlen(desc)) { 1755 ret = -errno; 1756 goto exit; 1757 } 1758 ret = 0; 1759 exit: 1760 qemu_close(fd); 1761 return ret; 1762 } 1763 1764 static void vmdk_close(BlockDriverState *bs) 1765 { 1766 BDRVVmdkState *s = bs->opaque; 1767 1768 vmdk_free_extents(bs); 1769 1770 migrate_del_blocker(s->migration_blocker); 1771 error_free(s->migration_blocker); 1772 } 1773 1774 static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) 1775 { 1776 BDRVVmdkState *s = bs->opaque; 1777 int i, err; 1778 int ret = 0; 1779 1780 for (i = 0; i < s->num_extents; i++) { 1781 err = bdrv_co_flush(s->extents[i].file); 1782 if (err < 0) { 1783 ret = err; 1784 } 1785 } 1786 return ret; 1787 } 1788 1789 static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) 1790 { 1791 int i; 1792 int64_t ret = 0; 1793 int64_t r; 1794 BDRVVmdkState *s = bs->opaque; 1795 1796 ret = bdrv_get_allocated_file_size(bs->file); 1797 if (ret < 0) { 1798 return ret; 1799 } 1800 for (i = 0; i < s->num_extents; i++) { 1801 if (s->extents[i].file == bs->file) { 1802 continue; 1803 } 1804 r = bdrv_get_allocated_file_size(s->extents[i].file); 1805 if (r < 0) { 1806 return r; 1807 } 1808 ret += r; 1809 } 1810 return ret; 1811 } 1812 1813 static int vmdk_has_zero_init(BlockDriverState *bs) 1814 { 1815 int i; 1816 BDRVVmdkState *s = bs->opaque; 1817 1818 /* If has a flat extent and its underlying storage doesn't have zero init, 1819 * return 0. */ 1820 for (i = 0; i < s->num_extents; i++) { 1821 if (s->extents[i].flat) { 1822 if (!bdrv_has_zero_init(s->extents[i].file)) { 1823 return 0; 1824 } 1825 } 1826 } 1827 return 1; 1828 } 1829 1830 static QEMUOptionParameter vmdk_create_options[] = { 1831 { 1832 .name = BLOCK_OPT_SIZE, 1833 .type = OPT_SIZE, 1834 .help = "Virtual disk size" 1835 }, 1836 { 1837 .name = BLOCK_OPT_ADAPTER_TYPE, 1838 .type = OPT_STRING, 1839 .help = "Virtual adapter type, can be one of " 1840 "ide (default), lsilogic, buslogic or legacyESX" 1841 }, 1842 { 1843 .name = BLOCK_OPT_BACKING_FILE, 1844 .type = OPT_STRING, 1845 .help = "File name of a base image" 1846 }, 1847 { 1848 .name = BLOCK_OPT_COMPAT6, 1849 .type = OPT_FLAG, 1850 .help = "VMDK version 6 image" 1851 }, 1852 { 1853 .name = BLOCK_OPT_SUBFMT, 1854 .type = OPT_STRING, 1855 .help = 1856 "VMDK flat extent format, can be one of " 1857 "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " 1858 }, 1859 { 1860 .name = BLOCK_OPT_ZEROED_GRAIN, 1861 .type = OPT_FLAG, 1862 .help = "Enable efficient zero writes using the zeroed-grain GTE feature" 1863 }, 1864 { NULL } 1865 }; 1866 1867 static BlockDriver bdrv_vmdk = { 1868 .format_name = "vmdk", 1869 .instance_size = sizeof(BDRVVmdkState), 1870 .bdrv_probe = vmdk_probe, 1871 .bdrv_open = vmdk_open, 1872 .bdrv_reopen_prepare = vmdk_reopen_prepare, 1873 .bdrv_read = vmdk_co_read, 1874 .bdrv_write = vmdk_co_write, 1875 .bdrv_co_write_zeroes = vmdk_co_write_zeroes, 1876 .bdrv_close = vmdk_close, 1877 .bdrv_create = vmdk_create, 1878 .bdrv_co_flush_to_disk = vmdk_co_flush, 1879 .bdrv_co_get_block_status = vmdk_co_get_block_status, 1880 .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, 1881 .bdrv_has_zero_init = vmdk_has_zero_init, 1882 1883 .create_options = vmdk_create_options, 1884 }; 1885 1886 static void bdrv_vmdk_init(void) 1887 { 1888 bdrv_register(&bdrv_vmdk); 1889 } 1890 1891 block_init(bdrv_vmdk_init); 1892