1 /* 2 * Block driver for the VMDK format 3 * 4 * Copyright (c) 2004 Fabrice Bellard 5 * Copyright (c) 2005 Filip Navara 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * of this software and associated documentation files (the "Software"), to deal 9 * in the Software without restriction, including without limitation the rights 10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 * copies of the Software, and to permit persons to whom the Software is 12 * furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 * THE SOFTWARE. 
24 */ 25 26 #include "qemu/osdep.h" 27 #include "qapi/error.h" 28 #include "block/block_int.h" 29 #include "sysemu/block-backend.h" 30 #include "qapi/qmp/qerror.h" 31 #include "qemu/error-report.h" 32 #include "qemu/module.h" 33 #include "qemu/bswap.h" 34 #include "migration/migration.h" 35 #include "qemu/cutils.h" 36 #include <zlib.h> 37 #include <glib.h> 38 39 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') 40 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') 41 #define VMDK4_COMPRESSION_DEFLATE 1 42 #define VMDK4_FLAG_NL_DETECT (1 << 0) 43 #define VMDK4_FLAG_RGD (1 << 1) 44 /* Zeroed-grain enable bit */ 45 #define VMDK4_FLAG_ZERO_GRAIN (1 << 2) 46 #define VMDK4_FLAG_COMPRESS (1 << 16) 47 #define VMDK4_FLAG_MARKER (1 << 17) 48 #define VMDK4_GD_AT_END 0xffffffffffffffffULL 49 50 #define VMDK_GTE_ZEROED 0x1 51 52 /* VMDK internal error codes */ 53 #define VMDK_OK 0 54 #define VMDK_ERROR (-1) 55 /* Cluster not allocated */ 56 #define VMDK_UNALLOC (-2) 57 #define VMDK_ZEROED (-3) 58 59 #define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain" 60 61 typedef struct { 62 uint32_t version; 63 uint32_t flags; 64 uint32_t disk_sectors; 65 uint32_t granularity; 66 uint32_t l1dir_offset; 67 uint32_t l1dir_size; 68 uint32_t file_sectors; 69 uint32_t cylinders; 70 uint32_t heads; 71 uint32_t sectors_per_track; 72 } QEMU_PACKED VMDK3Header; 73 74 typedef struct { 75 uint32_t version; 76 uint32_t flags; 77 uint64_t capacity; 78 uint64_t granularity; 79 uint64_t desc_offset; 80 uint64_t desc_size; 81 /* Number of GrainTableEntries per GrainTable */ 82 uint32_t num_gtes_per_gt; 83 uint64_t rgd_offset; 84 uint64_t gd_offset; 85 uint64_t grain_offset; 86 char filler[1]; 87 char check_bytes[4]; 88 uint16_t compressAlgorithm; 89 } QEMU_PACKED VMDK4Header; 90 91 #define L2_CACHE_SIZE 16 92 93 typedef struct VmdkExtent { 94 BdrvChild *file; 95 bool flat; 96 bool compressed; 97 bool has_marker; 98 bool has_zero_grain; 99 int version; 100 int64_t sectors; 101 
int64_t end_sector; 102 int64_t flat_start_offset; 103 int64_t l1_table_offset; 104 int64_t l1_backup_table_offset; 105 uint32_t *l1_table; 106 uint32_t *l1_backup_table; 107 unsigned int l1_size; 108 uint32_t l1_entry_sectors; 109 110 unsigned int l2_size; 111 uint32_t *l2_cache; 112 uint32_t l2_cache_offsets[L2_CACHE_SIZE]; 113 uint32_t l2_cache_counts[L2_CACHE_SIZE]; 114 115 int64_t cluster_sectors; 116 int64_t next_cluster_sector; 117 char *type; 118 } VmdkExtent; 119 120 typedef struct BDRVVmdkState { 121 CoMutex lock; 122 uint64_t desc_offset; 123 bool cid_updated; 124 bool cid_checked; 125 uint32_t cid; 126 uint32_t parent_cid; 127 int num_extents; 128 /* Extent array with num_extents entries, ascend ordered by address */ 129 VmdkExtent *extents; 130 Error *migration_blocker; 131 char *create_type; 132 } BDRVVmdkState; 133 134 typedef struct VmdkMetaData { 135 unsigned int l1_index; 136 unsigned int l2_index; 137 unsigned int l2_offset; 138 int valid; 139 uint32_t *l2_cache_entry; 140 } VmdkMetaData; 141 142 typedef struct VmdkGrainMarker { 143 uint64_t lba; 144 uint32_t size; 145 uint8_t data[0]; 146 } QEMU_PACKED VmdkGrainMarker; 147 148 enum { 149 MARKER_END_OF_STREAM = 0, 150 MARKER_GRAIN_TABLE = 1, 151 MARKER_GRAIN_DIRECTORY = 2, 152 MARKER_FOOTER = 3, 153 }; 154 155 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) 156 { 157 uint32_t magic; 158 159 if (buf_size < 4) { 160 return 0; 161 } 162 magic = be32_to_cpu(*(uint32_t *)buf); 163 if (magic == VMDK3_MAGIC || 164 magic == VMDK4_MAGIC) { 165 return 100; 166 } else { 167 const char *p = (const char *)buf; 168 const char *end = p + buf_size; 169 while (p < end) { 170 if (*p == '#') { 171 /* skip comment line */ 172 while (p < end && *p != '\n') { 173 p++; 174 } 175 p++; 176 continue; 177 } 178 if (*p == ' ') { 179 while (p < end && *p == ' ') { 180 p++; 181 } 182 /* skip '\r' if windows line endings used. 
*/ 183 if (p < end && *p == '\r') { 184 p++; 185 } 186 /* only accept blank lines before 'version=' line */ 187 if (p == end || *p != '\n') { 188 return 0; 189 } 190 p++; 191 continue; 192 } 193 if (end - p >= strlen("version=X\n")) { 194 if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 || 195 strncmp("version=2\n", p, strlen("version=2\n")) == 0) { 196 return 100; 197 } 198 } 199 if (end - p >= strlen("version=X\r\n")) { 200 if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 || 201 strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) { 202 return 100; 203 } 204 } 205 return 0; 206 } 207 return 0; 208 } 209 } 210 211 #define SECTOR_SIZE 512 212 #define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ 213 #define BUF_SIZE 4096 214 #define HEADER_SIZE 512 /* first sector of 512 bytes */ 215 216 static void vmdk_free_extents(BlockDriverState *bs) 217 { 218 int i; 219 BDRVVmdkState *s = bs->opaque; 220 VmdkExtent *e; 221 222 for (i = 0; i < s->num_extents; i++) { 223 e = &s->extents[i]; 224 g_free(e->l1_table); 225 g_free(e->l2_cache); 226 g_free(e->l1_backup_table); 227 g_free(e->type); 228 if (e->file != bs->file) { 229 bdrv_unref_child(bs, e->file); 230 } 231 } 232 g_free(s->extents); 233 } 234 235 static void vmdk_free_last_extent(BlockDriverState *bs) 236 { 237 BDRVVmdkState *s = bs->opaque; 238 239 if (s->num_extents == 0) { 240 return; 241 } 242 s->num_extents--; 243 s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); 244 } 245 246 static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) 247 { 248 char *desc; 249 uint32_t cid = 0xffffffff; 250 const char *p_name, *cid_str; 251 size_t cid_str_size; 252 BDRVVmdkState *s = bs->opaque; 253 int ret; 254 255 desc = g_malloc0(DESC_SIZE); 256 ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); 257 if (ret < 0) { 258 g_free(desc); 259 return 0; 260 } 261 262 if (parent) { 263 cid_str = "parentCID"; 264 cid_str_size = sizeof("parentCID"); 265 } else { 
266 cid_str = "CID"; 267 cid_str_size = sizeof("CID"); 268 } 269 270 desc[DESC_SIZE - 1] = '\0'; 271 p_name = strstr(desc, cid_str); 272 if (p_name != NULL) { 273 p_name += cid_str_size; 274 sscanf(p_name, "%" SCNx32, &cid); 275 } 276 277 g_free(desc); 278 return cid; 279 } 280 281 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) 282 { 283 char *desc, *tmp_desc; 284 char *p_name, *tmp_str; 285 BDRVVmdkState *s = bs->opaque; 286 int ret = 0; 287 288 desc = g_malloc0(DESC_SIZE); 289 tmp_desc = g_malloc0(DESC_SIZE); 290 ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); 291 if (ret < 0) { 292 goto out; 293 } 294 295 desc[DESC_SIZE - 1] = '\0'; 296 tmp_str = strstr(desc, "parentCID"); 297 if (tmp_str == NULL) { 298 ret = -EINVAL; 299 goto out; 300 } 301 302 pstrcpy(tmp_desc, DESC_SIZE, tmp_str); 303 p_name = strstr(desc, "CID"); 304 if (p_name != NULL) { 305 p_name += sizeof("CID"); 306 snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid); 307 pstrcat(desc, DESC_SIZE, tmp_desc); 308 } 309 310 ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); 311 312 out: 313 g_free(desc); 314 g_free(tmp_desc); 315 return ret; 316 } 317 318 static int vmdk_is_cid_valid(BlockDriverState *bs) 319 { 320 BDRVVmdkState *s = bs->opaque; 321 uint32_t cur_pcid; 322 323 if (!s->cid_checked && bs->backing) { 324 BlockDriverState *p_bs = bs->backing->bs; 325 326 cur_pcid = vmdk_read_cid(p_bs, 0); 327 if (s->parent_cid != cur_pcid) { 328 /* CID not valid */ 329 return 0; 330 } 331 } 332 s->cid_checked = true; 333 /* CID valid */ 334 return 1; 335 } 336 337 /* We have nothing to do for VMDK reopen, stubs just return success */ 338 static int vmdk_reopen_prepare(BDRVReopenState *state, 339 BlockReopenQueue *queue, Error **errp) 340 { 341 assert(state != NULL); 342 assert(state->bs != NULL); 343 return 0; 344 } 345 346 static int vmdk_parent_open(BlockDriverState *bs) 347 { 348 char *p_name; 349 char *desc; 350 BDRVVmdkState *s = 
bs->opaque; 351 int ret; 352 353 desc = g_malloc0(DESC_SIZE + 1); 354 ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); 355 if (ret < 0) { 356 goto out; 357 } 358 ret = 0; 359 360 p_name = strstr(desc, "parentFileNameHint"); 361 if (p_name != NULL) { 362 char *end_name; 363 364 p_name += sizeof("parentFileNameHint") + 1; 365 end_name = strchr(p_name, '\"'); 366 if (end_name == NULL) { 367 ret = -EINVAL; 368 goto out; 369 } 370 if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { 371 ret = -EINVAL; 372 goto out; 373 } 374 375 pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); 376 } 377 378 out: 379 g_free(desc); 380 return ret; 381 } 382 383 /* Create and append extent to the extent array. Return the added VmdkExtent 384 * address. return NULL if allocation failed. */ 385 static int vmdk_add_extent(BlockDriverState *bs, 386 BdrvChild *file, bool flat, int64_t sectors, 387 int64_t l1_offset, int64_t l1_backup_offset, 388 uint32_t l1_size, 389 int l2_size, uint64_t cluster_sectors, 390 VmdkExtent **new_extent, 391 Error **errp) 392 { 393 VmdkExtent *extent; 394 BDRVVmdkState *s = bs->opaque; 395 int64_t nb_sectors; 396 397 if (cluster_sectors > 0x200000) { 398 /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ 399 error_setg(errp, "Invalid granularity, image may be corrupt"); 400 return -EFBIG; 401 } 402 if (l1_size > 512 * 1024 * 1024) { 403 /* Although with big capacity and small l1_entry_sectors, we can get a 404 * big l1_size, we don't want unbounded value to allocate the table. 
405 * Limit it to 512M, which is 16PB for default cluster and L2 table 406 * size */ 407 error_setg(errp, "L1 size too big"); 408 return -EFBIG; 409 } 410 411 nb_sectors = bdrv_nb_sectors(file->bs); 412 if (nb_sectors < 0) { 413 return nb_sectors; 414 } 415 416 s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1); 417 extent = &s->extents[s->num_extents]; 418 s->num_extents++; 419 420 memset(extent, 0, sizeof(VmdkExtent)); 421 extent->file = file; 422 extent->flat = flat; 423 extent->sectors = sectors; 424 extent->l1_table_offset = l1_offset; 425 extent->l1_backup_table_offset = l1_backup_offset; 426 extent->l1_size = l1_size; 427 extent->l1_entry_sectors = l2_size * cluster_sectors; 428 extent->l2_size = l2_size; 429 extent->cluster_sectors = flat ? sectors : cluster_sectors; 430 extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors); 431 432 if (s->num_extents > 1) { 433 extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; 434 } else { 435 extent->end_sector = extent->sectors; 436 } 437 bs->total_sectors = extent->end_sector; 438 if (new_extent) { 439 *new_extent = extent; 440 } 441 return 0; 442 } 443 444 static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, 445 Error **errp) 446 { 447 int ret; 448 size_t l1_size; 449 int i; 450 451 /* read the L1 table */ 452 l1_size = extent->l1_size * sizeof(uint32_t); 453 extent->l1_table = g_try_malloc(l1_size); 454 if (l1_size && extent->l1_table == NULL) { 455 return -ENOMEM; 456 } 457 458 ret = bdrv_pread(extent->file->bs, 459 extent->l1_table_offset, 460 extent->l1_table, 461 l1_size); 462 if (ret < 0) { 463 error_setg_errno(errp, -ret, 464 "Could not read l1 table from extent '%s'", 465 extent->file->bs->filename); 466 goto fail_l1; 467 } 468 for (i = 0; i < extent->l1_size; i++) { 469 le32_to_cpus(&extent->l1_table[i]); 470 } 471 472 if (extent->l1_backup_table_offset) { 473 extent->l1_backup_table = g_try_malloc(l1_size); 474 if (l1_size && 
extent->l1_backup_table == NULL) { 475 ret = -ENOMEM; 476 goto fail_l1; 477 } 478 ret = bdrv_pread(extent->file->bs, 479 extent->l1_backup_table_offset, 480 extent->l1_backup_table, 481 l1_size); 482 if (ret < 0) { 483 error_setg_errno(errp, -ret, 484 "Could not read l1 backup table from extent '%s'", 485 extent->file->bs->filename); 486 goto fail_l1b; 487 } 488 for (i = 0; i < extent->l1_size; i++) { 489 le32_to_cpus(&extent->l1_backup_table[i]); 490 } 491 } 492 493 extent->l2_cache = 494 g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE); 495 return 0; 496 fail_l1b: 497 g_free(extent->l1_backup_table); 498 fail_l1: 499 g_free(extent->l1_table); 500 return ret; 501 } 502 503 static int vmdk_open_vmfs_sparse(BlockDriverState *bs, 504 BdrvChild *file, 505 int flags, Error **errp) 506 { 507 int ret; 508 uint32_t magic; 509 VMDK3Header header; 510 VmdkExtent *extent; 511 512 ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); 513 if (ret < 0) { 514 error_setg_errno(errp, -ret, 515 "Could not read header from file '%s'", 516 file->bs->filename); 517 return ret; 518 } 519 ret = vmdk_add_extent(bs, file, false, 520 le32_to_cpu(header.disk_sectors), 521 (int64_t)le32_to_cpu(header.l1dir_offset) << 9, 522 0, 523 le32_to_cpu(header.l1dir_size), 524 4096, 525 le32_to_cpu(header.granularity), 526 &extent, 527 errp); 528 if (ret < 0) { 529 return ret; 530 } 531 ret = vmdk_init_tables(bs, extent, errp); 532 if (ret) { 533 /* free extent allocated by vmdk_add_extent */ 534 vmdk_free_last_extent(bs); 535 } 536 return ret; 537 } 538 539 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, 540 QDict *options, Error **errp); 541 542 static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, 543 Error **errp) 544 { 545 int64_t size; 546 char *buf; 547 int ret; 548 549 size = bdrv_getlength(file); 550 if (size < 0) { 551 error_setg_errno(errp, -size, "Could not access file"); 552 return NULL; 553 } 554 555 if (size < 4) { 556 /* Both 
descriptor file and sparse image must be much larger than 4 557 * bytes, also callers of vmdk_read_desc want to compare the first 4 558 * bytes with VMDK4_MAGIC, let's error out if less is read. */ 559 error_setg(errp, "File is too small, not a valid image"); 560 return NULL; 561 } 562 563 size = MIN(size, (1 << 20) - 1); /* avoid unbounded allocation */ 564 buf = g_malloc(size + 1); 565 566 ret = bdrv_pread(file, desc_offset, buf, size); 567 if (ret < 0) { 568 error_setg_errno(errp, -ret, "Could not read from file"); 569 g_free(buf); 570 return NULL; 571 } 572 buf[ret] = 0; 573 574 return buf; 575 } 576 577 static int vmdk_open_vmdk4(BlockDriverState *bs, 578 BdrvChild *file, 579 int flags, QDict *options, Error **errp) 580 { 581 int ret; 582 uint32_t magic; 583 uint32_t l1_size, l1_entry_sectors; 584 VMDK4Header header; 585 VmdkExtent *extent; 586 BDRVVmdkState *s = bs->opaque; 587 int64_t l1_backup_offset = 0; 588 bool compressed; 589 590 ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); 591 if (ret < 0) { 592 error_setg_errno(errp, -ret, 593 "Could not read header from file '%s'", 594 file->bs->filename); 595 return -EINVAL; 596 } 597 if (header.capacity == 0) { 598 uint64_t desc_offset = le64_to_cpu(header.desc_offset); 599 if (desc_offset) { 600 char *buf = vmdk_read_desc(file->bs, desc_offset << 9, errp); 601 if (!buf) { 602 return -EINVAL; 603 } 604 ret = vmdk_open_desc_file(bs, flags, buf, options, errp); 605 g_free(buf); 606 return ret; 607 } 608 } 609 610 if (!s->create_type) { 611 s->create_type = g_strdup("monolithicSparse"); 612 } 613 614 if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { 615 /* 616 * The footer takes precedence over the header, so read it in. The 617 * footer starts at offset -1024 from the end: One sector for the 618 * footer, and another one for the end-of-stream marker. 
619 */ 620 struct { 621 struct { 622 uint64_t val; 623 uint32_t size; 624 uint32_t type; 625 uint8_t pad[512 - 16]; 626 } QEMU_PACKED footer_marker; 627 628 uint32_t magic; 629 VMDK4Header header; 630 uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; 631 632 struct { 633 uint64_t val; 634 uint32_t size; 635 uint32_t type; 636 uint8_t pad[512 - 16]; 637 } QEMU_PACKED eos_marker; 638 } QEMU_PACKED footer; 639 640 ret = bdrv_pread(file->bs, 641 bs->file->bs->total_sectors * 512 - 1536, 642 &footer, sizeof(footer)); 643 if (ret < 0) { 644 error_setg_errno(errp, -ret, "Failed to read footer"); 645 return ret; 646 } 647 648 /* Some sanity checks for the footer */ 649 if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || 650 le32_to_cpu(footer.footer_marker.size) != 0 || 651 le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || 652 le64_to_cpu(footer.eos_marker.val) != 0 || 653 le32_to_cpu(footer.eos_marker.size) != 0 || 654 le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) 655 { 656 error_setg(errp, "Invalid footer"); 657 return -EINVAL; 658 } 659 660 header = footer.header; 661 } 662 663 compressed = 664 le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; 665 if (le32_to_cpu(header.version) > 3) { 666 error_setg(errp, "Unsupported VMDK version %" PRIu32, 667 le32_to_cpu(header.version)); 668 return -ENOTSUP; 669 } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) && 670 !compressed) { 671 /* VMware KB 2064959 explains that version 3 added support for 672 * persistent changed block tracking (CBT), and backup software can 673 * read it as version=1 if it doesn't care about the changed area 674 * information. So we are safe to enable read only. 
*/ 675 error_setg(errp, "VMDK version 3 must be read only"); 676 return -EINVAL; 677 } 678 679 if (le32_to_cpu(header.num_gtes_per_gt) > 512) { 680 error_setg(errp, "L2 table size too big"); 681 return -EINVAL; 682 } 683 684 l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt) 685 * le64_to_cpu(header.granularity); 686 if (l1_entry_sectors == 0) { 687 error_setg(errp, "L1 entry size is invalid"); 688 return -EINVAL; 689 } 690 l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) 691 / l1_entry_sectors; 692 if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { 693 l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; 694 } 695 if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) { 696 error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", 697 (int64_t)(le64_to_cpu(header.grain_offset) 698 * BDRV_SECTOR_SIZE)); 699 return -EINVAL; 700 } 701 702 ret = vmdk_add_extent(bs, file, false, 703 le64_to_cpu(header.capacity), 704 le64_to_cpu(header.gd_offset) << 9, 705 l1_backup_offset, 706 l1_size, 707 le32_to_cpu(header.num_gtes_per_gt), 708 le64_to_cpu(header.granularity), 709 &extent, 710 errp); 711 if (ret < 0) { 712 return ret; 713 } 714 extent->compressed = 715 le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; 716 if (extent->compressed) { 717 g_free(s->create_type); 718 s->create_type = g_strdup("streamOptimized"); 719 } 720 extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; 721 extent->version = le32_to_cpu(header.version); 722 extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; 723 ret = vmdk_init_tables(bs, extent, errp); 724 if (ret) { 725 /* free extent allocated by vmdk_add_extent */ 726 vmdk_free_last_extent(bs); 727 } 728 return ret; 729 } 730 731 /* find an option value out of descriptor file */ 732 static int vmdk_parse_description(const char *desc, const char *opt_name, 733 char *buf, int buf_size) 734 { 735 char *opt_pos, *opt_end; 736 const char *end = desc 
+ strlen(desc); 737 738 opt_pos = strstr(desc, opt_name); 739 if (!opt_pos) { 740 return VMDK_ERROR; 741 } 742 /* Skip "=\"" following opt_name */ 743 opt_pos += strlen(opt_name) + 2; 744 if (opt_pos >= end) { 745 return VMDK_ERROR; 746 } 747 opt_end = opt_pos; 748 while (opt_end < end && *opt_end != '"') { 749 opt_end++; 750 } 751 if (opt_end == end || buf_size < opt_end - opt_pos + 1) { 752 return VMDK_ERROR; 753 } 754 pstrcpy(buf, opt_end - opt_pos + 1, opt_pos); 755 return VMDK_OK; 756 } 757 758 /* Open an extent file and append to bs array */ 759 static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags, 760 char *buf, QDict *options, Error **errp) 761 { 762 uint32_t magic; 763 764 magic = ldl_be_p(buf); 765 switch (magic) { 766 case VMDK3_MAGIC: 767 return vmdk_open_vmfs_sparse(bs, file, flags, errp); 768 break; 769 case VMDK4_MAGIC: 770 return vmdk_open_vmdk4(bs, file, flags, options, errp); 771 break; 772 default: 773 error_setg(errp, "Image not in VMDK format"); 774 return -EINVAL; 775 break; 776 } 777 } 778 779 static const char *next_line(const char *s) 780 { 781 while (*s) { 782 if (*s == '\n') { 783 return s + 1; 784 } 785 s++; 786 } 787 return s; 788 } 789 790 static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, 791 const char *desc_file_path, QDict *options, 792 Error **errp) 793 { 794 int ret; 795 int matches; 796 char access[11]; 797 char type[11]; 798 char fname[512]; 799 const char *p, *np; 800 int64_t sectors = 0; 801 int64_t flat_offset; 802 char *extent_path; 803 BdrvChild *extent_file; 804 BDRVVmdkState *s = bs->opaque; 805 VmdkExtent *extent; 806 char extent_opt_prefix[32]; 807 Error *local_err = NULL; 808 809 for (p = desc; *p; p = next_line(p)) { 810 /* parse extent line in one of below formats: 811 * 812 * RW [size in sectors] FLAT "file-name.vmdk" OFFSET 813 * RW [size in sectors] SPARSE "file-name.vmdk" 814 * RW [size in sectors] VMFS "file-name.vmdk" 815 * RW [size in sectors] VMFSSPARSE 
"file-name.vmdk" 816 */ 817 flat_offset = -1; 818 matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64, 819 access, §ors, type, fname, &flat_offset); 820 if (matches < 4 || strcmp(access, "RW")) { 821 continue; 822 } else if (!strcmp(type, "FLAT")) { 823 if (matches != 5 || flat_offset < 0) { 824 goto invalid; 825 } 826 } else if (!strcmp(type, "VMFS")) { 827 if (matches == 4) { 828 flat_offset = 0; 829 } else { 830 goto invalid; 831 } 832 } else if (matches != 4) { 833 goto invalid; 834 } 835 836 if (sectors <= 0 || 837 (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && 838 strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || 839 (strcmp(access, "RW"))) { 840 continue; 841 } 842 843 if (!path_is_absolute(fname) && !path_has_protocol(fname) && 844 !desc_file_path[0]) 845 { 846 error_setg(errp, "Cannot use relative extent paths with VMDK " 847 "descriptor file '%s'", bs->file->bs->filename); 848 return -EINVAL; 849 } 850 851 extent_path = g_malloc0(PATH_MAX); 852 path_combine(extent_path, PATH_MAX, desc_file_path, fname); 853 854 ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents); 855 assert(ret < 32); 856 857 extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix, 858 bs, &child_file, false, &local_err); 859 g_free(extent_path); 860 if (local_err) { 861 error_propagate(errp, local_err); 862 return -EINVAL; 863 } 864 865 /* save to extents array */ 866 if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { 867 /* FLAT extent */ 868 869 ret = vmdk_add_extent(bs, extent_file, true, sectors, 870 0, 0, 0, 0, 0, &extent, errp); 871 if (ret < 0) { 872 bdrv_unref_child(bs, extent_file); 873 return ret; 874 } 875 extent->flat_start_offset = flat_offset << 9; 876 } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { 877 /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ 878 char *buf = vmdk_read_desc(extent_file->bs, 0, errp); 879 if (!buf) { 880 ret = -EINVAL; 881 } else { 882 ret = 
vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, 883 options, errp); 884 } 885 g_free(buf); 886 if (ret) { 887 bdrv_unref_child(bs, extent_file); 888 return ret; 889 } 890 extent = &s->extents[s->num_extents - 1]; 891 } else { 892 error_setg(errp, "Unsupported extent type '%s'", type); 893 bdrv_unref_child(bs, extent_file); 894 return -ENOTSUP; 895 } 896 extent->type = g_strdup(type); 897 } 898 return 0; 899 900 invalid: 901 np = next_line(p); 902 assert(np != p); 903 if (np[-1] == '\n') { 904 np--; 905 } 906 error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p); 907 return -EINVAL; 908 } 909 910 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, 911 QDict *options, Error **errp) 912 { 913 int ret; 914 char ct[128]; 915 BDRVVmdkState *s = bs->opaque; 916 917 if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { 918 error_setg(errp, "invalid VMDK image descriptor"); 919 ret = -EINVAL; 920 goto exit; 921 } 922 if (strcmp(ct, "monolithicFlat") && 923 strcmp(ct, "vmfs") && 924 strcmp(ct, "vmfsSparse") && 925 strcmp(ct, "twoGbMaxExtentSparse") && 926 strcmp(ct, "twoGbMaxExtentFlat")) { 927 error_setg(errp, "Unsupported image type '%s'", ct); 928 ret = -ENOTSUP; 929 goto exit; 930 } 931 s->create_type = g_strdup(ct); 932 s->desc_offset = 0; 933 ret = vmdk_parse_extents(buf, bs, bs->file->bs->exact_filename, options, 934 errp); 935 exit: 936 return ret; 937 } 938 939 static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, 940 Error **errp) 941 { 942 char *buf; 943 int ret; 944 BDRVVmdkState *s = bs->opaque; 945 uint32_t magic; 946 947 buf = vmdk_read_desc(bs->file->bs, 0, errp); 948 if (!buf) { 949 return -EINVAL; 950 } 951 952 magic = ldl_be_p(buf); 953 switch (magic) { 954 case VMDK3_MAGIC: 955 case VMDK4_MAGIC: 956 ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, 957 errp); 958 s->desc_offset = 0x200; 959 break; 960 default: 961 ret = vmdk_open_desc_file(bs, flags, buf, options, errp); 962 break; 
963 } 964 if (ret) { 965 goto fail; 966 } 967 968 /* try to open parent images, if exist */ 969 ret = vmdk_parent_open(bs); 970 if (ret) { 971 goto fail; 972 } 973 s->cid = vmdk_read_cid(bs, 0); 974 s->parent_cid = vmdk_read_cid(bs, 1); 975 qemu_co_mutex_init(&s->lock); 976 977 /* Disable migration when VMDK images are used */ 978 error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " 979 "does not support live migration", 980 bdrv_get_device_or_node_name(bs)); 981 migrate_add_blocker(s->migration_blocker); 982 g_free(buf); 983 return 0; 984 985 fail: 986 g_free(buf); 987 g_free(s->create_type); 988 s->create_type = NULL; 989 vmdk_free_extents(bs); 990 return ret; 991 } 992 993 994 static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) 995 { 996 BDRVVmdkState *s = bs->opaque; 997 int i; 998 999 for (i = 0; i < s->num_extents; i++) { 1000 if (!s->extents[i].flat) { 1001 bs->bl.write_zeroes_alignment = 1002 MAX(bs->bl.write_zeroes_alignment, 1003 s->extents[i].cluster_sectors); 1004 } 1005 } 1006 } 1007 1008 /** 1009 * get_whole_cluster 1010 * 1011 * Copy backing file's cluster that covers @sector_num, otherwise write zero, 1012 * to the cluster at @cluster_sector_num. 1013 * 1014 * If @skip_start_sector < @skip_end_sector, the relative range 1015 * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave 1016 * it for call to write user data in the request. 
1017 */ 1018 static int get_whole_cluster(BlockDriverState *bs, 1019 VmdkExtent *extent, 1020 uint64_t cluster_offset, 1021 uint64_t offset, 1022 uint64_t skip_start_bytes, 1023 uint64_t skip_end_bytes) 1024 { 1025 int ret = VMDK_OK; 1026 int64_t cluster_bytes; 1027 uint8_t *whole_grain; 1028 1029 /* For COW, align request sector_num to cluster start */ 1030 cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; 1031 offset = QEMU_ALIGN_DOWN(offset, cluster_bytes); 1032 whole_grain = qemu_blockalign(bs, cluster_bytes); 1033 1034 if (!bs->backing) { 1035 memset(whole_grain, 0, skip_start_bytes); 1036 memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes); 1037 } 1038 1039 assert(skip_end_bytes <= cluster_bytes); 1040 /* we will be here if it's first write on non-exist grain(cluster). 1041 * try to read from parent image, if exist */ 1042 if (bs->backing && !vmdk_is_cid_valid(bs)) { 1043 ret = VMDK_ERROR; 1044 goto exit; 1045 } 1046 1047 /* Read backing data before skip range */ 1048 if (skip_start_bytes > 0) { 1049 if (bs->backing) { 1050 ret = bdrv_pread(bs->backing->bs, offset, whole_grain, 1051 skip_start_bytes); 1052 if (ret < 0) { 1053 ret = VMDK_ERROR; 1054 goto exit; 1055 } 1056 } 1057 ret = bdrv_pwrite(extent->file->bs, cluster_offset, whole_grain, 1058 skip_start_bytes); 1059 if (ret < 0) { 1060 ret = VMDK_ERROR; 1061 goto exit; 1062 } 1063 } 1064 /* Read backing data after skip range */ 1065 if (skip_end_bytes < cluster_bytes) { 1066 if (bs->backing) { 1067 ret = bdrv_pread(bs->backing->bs, offset + skip_end_bytes, 1068 whole_grain + skip_end_bytes, 1069 cluster_bytes - skip_end_bytes); 1070 if (ret < 0) { 1071 ret = VMDK_ERROR; 1072 goto exit; 1073 } 1074 } 1075 ret = bdrv_pwrite(extent->file->bs, cluster_offset + skip_end_bytes, 1076 whole_grain + skip_end_bytes, 1077 cluster_bytes - skip_end_bytes); 1078 if (ret < 0) { 1079 ret = VMDK_ERROR; 1080 goto exit; 1081 } 1082 } 1083 1084 ret = VMDK_OK; 1085 exit: 1086 
qemu_vfree(whole_grain); 1087 return ret; 1088 } 1089 1090 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, 1091 uint32_t offset) 1092 { 1093 offset = cpu_to_le32(offset); 1094 /* update L2 table */ 1095 if (bdrv_pwrite_sync( 1096 extent->file->bs, 1097 ((int64_t)m_data->l2_offset * 512) 1098 + (m_data->l2_index * sizeof(offset)), 1099 &offset, sizeof(offset)) < 0) { 1100 return VMDK_ERROR; 1101 } 1102 /* update backup L2 table */ 1103 if (extent->l1_backup_table_offset != 0) { 1104 m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; 1105 if (bdrv_pwrite_sync( 1106 extent->file->bs, 1107 ((int64_t)m_data->l2_offset * 512) 1108 + (m_data->l2_index * sizeof(offset)), 1109 &offset, sizeof(offset)) < 0) { 1110 return VMDK_ERROR; 1111 } 1112 } 1113 if (m_data->l2_cache_entry) { 1114 *m_data->l2_cache_entry = offset; 1115 } 1116 1117 return VMDK_OK; 1118 } 1119 1120 /** 1121 * get_cluster_offset 1122 * 1123 * Look up cluster offset in extent file by sector number, and store in 1124 * @cluster_offset. 1125 * 1126 * For flat extents, the start offset as parsed from the description file is 1127 * returned. 1128 * 1129 * For sparse extents, look up in L1, L2 table. If allocate is true, return an 1130 * offset for a new cluster and update L2 cache. If there is a backing file, 1131 * COW is done before returning; otherwise, zeroes are written to the allocated 1132 * cluster. Both COW and zero writing skips the sector range 1133 * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller 1134 * has new data to write there. 1135 * 1136 * Returns: VMDK_OK if cluster exists and mapped in the image. 1137 * VMDK_UNALLOC if cluster is not mapped and @allocate is false. 1138 * VMDK_ERROR if failed. 
*/
static int get_cluster_offset(BlockDriverState *bs,
                              VmdkExtent *extent,
                              VmdkMetaData *m_data,
                              uint64_t offset,
                              bool allocate,
                              uint64_t *cluster_offset,
                              uint64_t skip_start_bytes,
                              uint64_t skip_end_bytes)
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
    uint32_t min_count, *l2_table;
    bool zeroed = false;
    int64_t ret;
    int64_t cluster_sector;

    /* Metadata stays invalid until the L2 entry has actually been located */
    if (m_data) {
        m_data->valid = 0;
    }
    if (extent->flat) {
        *cluster_offset = extent->flat_start_offset;
        return VMDK_OK;
    }

    /* Make @offset relative to the first byte covered by this extent */
    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
    l1_index = (offset >> 9) / extent->l1_entry_sectors;
    if (l1_index >= extent->l1_size) {
        return VMDK_ERROR;
    }
    l2_offset = extent->l1_table[l1_index];
    if (!l2_offset) {
        return VMDK_UNALLOC;
    }
    /* Look for the grain table in the L2 cache, keyed by its sector number */
    for (i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == extent->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++extent->l2_cache_counts[i] == 0xffffffff) {
                /* counter saturated: halve every counter */
                for (j = 0; j < L2_CACHE_SIZE; j++) {
                    extent->l2_cache_counts[j] >>= 1;
                }
            }
            l2_table = extent->l2_cache + (i * extent->l2_size);
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
    for (i = 0; i < L2_CACHE_SIZE; i++) {
        if (extent->l2_cache_counts[i] < min_count) {
            min_count = extent->l2_cache_counts[i];
            min_index = i;
        }
    }
    l2_table = extent->l2_cache + (min_index * extent->l2_size);
    /* A short read is treated as failure: the whole table must be loaded */
    if (bdrv_pread(
                extent->file->bs,
                (int64_t)l2_offset * 512,
                l2_table,
                extent->l2_size * sizeof(uint32_t)
            ) != extent->l2_size * sizeof(uint32_t)) {
        return VMDK_ERROR;
    }

    extent->l2_cache_offsets[min_index] = l2_offset;
    extent->l2_cache_counts[min_index] = 1;
 found:
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
    /* Cached grain table entries are kept little-endian */
    cluster_sector = le32_to_cpu(l2_table[l2_index]);

    if (m_data) {
        m_data->valid = 1;
        m_data->l1_index = l1_index;
        m_data->l2_index = l2_index;
        m_data->l2_offset = l2_offset;
        m_data->l2_cache_entry = &l2_table[l2_index];
    }
    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
        zeroed = true;
    }

    if (!cluster_sector || zeroed) {
        if (!allocate) {
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
        }

        /* Take the next free cluster of the extent file */
        cluster_sector = extent->next_cluster_sector;
        extent->next_cluster_sector += extent->cluster_sectors;

        /* First of all we write grain itself, to avoid race condition
         * that may corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
                                offset, skip_start_bytes, skip_end_bytes);
        if (ret) {
            return ret;
        }
    }
    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
    return VMDK_OK;
}

/*
 * Return the first extent, starting the scan at @start_hint (or at the first
 * extent when @start_hint is NULL), whose end_sector lies beyond @sector_num.
 * Returns NULL when no such extent exists.
 */
static VmdkExtent *find_extent(BDRVVmdkState *s,
                                int64_t sector_num, VmdkExtent *start_hint)
{
    VmdkExtent *extent = start_hint;

    if (!extent) {
        extent = &s->extents[0];
    }
    while (extent < &s->extents[s->num_extents]) {
        if (sector_num < extent->end_sector) {
            return extent;
        }
        extent++;
    }
    return NULL;
}

/*
 * Return the byte position of guest byte @offset inside its cluster,
 * with @offset first made relative to the start of @extent.
 */
static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
                                                   int64_t offset)
{
    uint64_t offset_in_cluster, extent_begin_offset, extent_relative_offset;
    uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;

    extent_begin_offset =
        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
    extent_relative_offset = offset - extent_begin_offset;
    offset_in_cluster = extent_relative_offset % cluster_size;
1271 1272 return offset_in_cluster; 1273 } 1274 1275 static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, 1276 int64_t sector_num) 1277 { 1278 uint64_t offset; 1279 offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE); 1280 return offset / BDRV_SECTOR_SIZE; 1281 } 1282 1283 static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, 1284 int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) 1285 { 1286 BDRVVmdkState *s = bs->opaque; 1287 int64_t index_in_cluster, n, ret; 1288 uint64_t offset; 1289 VmdkExtent *extent; 1290 1291 extent = find_extent(s, sector_num, NULL); 1292 if (!extent) { 1293 return 0; 1294 } 1295 qemu_co_mutex_lock(&s->lock); 1296 ret = get_cluster_offset(bs, extent, NULL, 1297 sector_num * 512, false, &offset, 1298 0, 0); 1299 qemu_co_mutex_unlock(&s->lock); 1300 1301 index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); 1302 switch (ret) { 1303 case VMDK_ERROR: 1304 ret = -EIO; 1305 break; 1306 case VMDK_UNALLOC: 1307 ret = 0; 1308 break; 1309 case VMDK_ZEROED: 1310 ret = BDRV_BLOCK_ZERO; 1311 break; 1312 case VMDK_OK: 1313 ret = BDRV_BLOCK_DATA; 1314 if (!extent->compressed) { 1315 ret |= BDRV_BLOCK_OFFSET_VALID; 1316 ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS)) 1317 & BDRV_BLOCK_OFFSET_MASK; 1318 } 1319 *file = extent->file->bs; 1320 break; 1321 } 1322 1323 n = extent->cluster_sectors - index_in_cluster; 1324 if (n > nb_sectors) { 1325 n = nb_sectors; 1326 } 1327 *pnum = n; 1328 return ret; 1329 } 1330 1331 static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, 1332 int64_t offset_in_cluster, QEMUIOVector *qiov, 1333 uint64_t qiov_offset, uint64_t n_bytes, 1334 uint64_t offset) 1335 { 1336 int ret; 1337 VmdkGrainMarker *data = NULL; 1338 uLongf buf_len; 1339 QEMUIOVector local_qiov; 1340 struct iovec iov; 1341 int64_t write_offset; 1342 int64_t write_end_sector; 1343 1344 if (extent->compressed) { 1345 void *compressed_data; 
1346 1347 if (!extent->has_marker) { 1348 ret = -EINVAL; 1349 goto out; 1350 } 1351 buf_len = (extent->cluster_sectors << 9) * 2; 1352 data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); 1353 1354 compressed_data = g_malloc(n_bytes); 1355 qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes); 1356 ret = compress(data->data, &buf_len, compressed_data, n_bytes); 1357 g_free(compressed_data); 1358 1359 if (ret != Z_OK || buf_len == 0) { 1360 ret = -EINVAL; 1361 goto out; 1362 } 1363 1364 data->lba = offset >> BDRV_SECTOR_BITS; 1365 data->size = buf_len; 1366 1367 n_bytes = buf_len + sizeof(VmdkGrainMarker); 1368 iov = (struct iovec) { 1369 .iov_base = data, 1370 .iov_len = n_bytes, 1371 }; 1372 qemu_iovec_init_external(&local_qiov, &iov, 1); 1373 } else { 1374 qemu_iovec_init(&local_qiov, qiov->niov); 1375 qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes); 1376 } 1377 1378 write_offset = cluster_offset + offset_in_cluster, 1379 ret = bdrv_co_pwritev(extent->file->bs, write_offset, n_bytes, 1380 &local_qiov, 0); 1381 1382 write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE); 1383 1384 if (extent->compressed) { 1385 extent->next_cluster_sector = write_end_sector; 1386 } else { 1387 extent->next_cluster_sector = MAX(extent->next_cluster_sector, 1388 write_end_sector); 1389 } 1390 1391 if (ret < 0) { 1392 goto out; 1393 } 1394 ret = 0; 1395 out: 1396 g_free(data); 1397 if (!extent->compressed) { 1398 qemu_iovec_destroy(&local_qiov); 1399 } 1400 return ret; 1401 } 1402 1403 static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, 1404 int64_t offset_in_cluster, QEMUIOVector *qiov, 1405 int bytes) 1406 { 1407 int ret; 1408 int cluster_bytes, buf_bytes; 1409 uint8_t *cluster_buf, *compressed_data; 1410 uint8_t *uncomp_buf; 1411 uint32_t data_len; 1412 VmdkGrainMarker *marker; 1413 uLongf buf_len; 1414 1415 1416 if (!extent->compressed) { 1417 ret = bdrv_co_preadv(extent->file->bs, 1418 cluster_offset + offset_in_cluster, 
bytes, 1419 qiov, 0); 1420 if (ret < 0) { 1421 return ret; 1422 } 1423 return 0; 1424 } 1425 cluster_bytes = extent->cluster_sectors * 512; 1426 /* Read two clusters in case GrainMarker + compressed data > one cluster */ 1427 buf_bytes = cluster_bytes * 2; 1428 cluster_buf = g_malloc(buf_bytes); 1429 uncomp_buf = g_malloc(cluster_bytes); 1430 ret = bdrv_pread(extent->file->bs, 1431 cluster_offset, 1432 cluster_buf, buf_bytes); 1433 if (ret < 0) { 1434 goto out; 1435 } 1436 compressed_data = cluster_buf; 1437 buf_len = cluster_bytes; 1438 data_len = cluster_bytes; 1439 if (extent->has_marker) { 1440 marker = (VmdkGrainMarker *)cluster_buf; 1441 compressed_data = marker->data; 1442 data_len = le32_to_cpu(marker->size); 1443 } 1444 if (!data_len || data_len > buf_bytes) { 1445 ret = -EINVAL; 1446 goto out; 1447 } 1448 ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len); 1449 if (ret != Z_OK) { 1450 ret = -EINVAL; 1451 goto out; 1452 1453 } 1454 if (offset_in_cluster < 0 || 1455 offset_in_cluster + bytes > buf_len) { 1456 ret = -EINVAL; 1457 goto out; 1458 } 1459 qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes); 1460 ret = 0; 1461 1462 out: 1463 g_free(uncomp_buf); 1464 g_free(cluster_buf); 1465 return ret; 1466 } 1467 1468 static int coroutine_fn 1469 vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, 1470 QEMUIOVector *qiov, int flags) 1471 { 1472 BDRVVmdkState *s = bs->opaque; 1473 int ret; 1474 uint64_t n_bytes, offset_in_cluster; 1475 VmdkExtent *extent = NULL; 1476 QEMUIOVector local_qiov; 1477 uint64_t cluster_offset; 1478 uint64_t bytes_done = 0; 1479 1480 qemu_iovec_init(&local_qiov, qiov->niov); 1481 qemu_co_mutex_lock(&s->lock); 1482 1483 while (bytes > 0) { 1484 extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent); 1485 if (!extent) { 1486 ret = -EIO; 1487 goto fail; 1488 } 1489 ret = get_cluster_offset(bs, extent, NULL, 1490 offset, false, &cluster_offset, 0, 0); 1491 offset_in_cluster = 
vmdk_find_offset_in_cluster(extent, offset); 1492 1493 n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE 1494 - offset_in_cluster); 1495 1496 if (ret != VMDK_OK) { 1497 /* if not allocated, try to read from parent image, if exist */ 1498 if (bs->backing && ret != VMDK_ZEROED) { 1499 if (!vmdk_is_cid_valid(bs)) { 1500 ret = -EINVAL; 1501 goto fail; 1502 } 1503 1504 qemu_iovec_reset(&local_qiov); 1505 qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); 1506 1507 ret = bdrv_co_preadv(bs->backing->bs, offset, n_bytes, 1508 &local_qiov, 0); 1509 if (ret < 0) { 1510 goto fail; 1511 } 1512 } else { 1513 qemu_iovec_memset(qiov, bytes_done, 0, n_bytes); 1514 } 1515 } else { 1516 qemu_iovec_reset(&local_qiov); 1517 qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); 1518 1519 ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster, 1520 &local_qiov, n_bytes); 1521 if (ret) { 1522 goto fail; 1523 } 1524 } 1525 bytes -= n_bytes; 1526 offset += n_bytes; 1527 bytes_done += n_bytes; 1528 } 1529 1530 ret = 0; 1531 fail: 1532 qemu_co_mutex_unlock(&s->lock); 1533 qemu_iovec_destroy(&local_qiov); 1534 1535 return ret; 1536 } 1537 1538 /** 1539 * vmdk_write: 1540 * @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature 1541 * if possible, otherwise return -ENOTSUP. 1542 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try 1543 * with each cluster. By dry run we can find if the zero write 1544 * is possible without modifying image data. 1545 * 1546 * Returns: error code with 0 for success. 
*/
static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
                       uint64_t bytes, QEMUIOVector *qiov,
                       bool zeroed, bool zero_dry_run)
{
    BDRVVmdkState *s = bs->opaque;
    VmdkExtent *extent = NULL;
    int ret;
    int64_t offset_in_cluster, n_bytes;
    uint64_t cluster_offset;
    uint64_t bytes_done = 0;
    VmdkMetaData m_data;

    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
        error_report("Wrong offset: offset=0x%" PRIx64
                     " total_sectors=0x%" PRIx64,
                     offset, bs->total_sectors);
        return -EIO;
    }

    /* Process the request one cluster at a time */
    while (bytes > 0) {
        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
        if (!extent) {
            return -EIO;
        }
        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                             - offset_in_cluster);

        /*
         * Allocate on demand, except for compressed extents (handled below)
         * and zero writes (which only touch the grain table entry).
         */
        ret = get_cluster_offset(bs, extent, &m_data, offset,
                                 !(extent->compressed || zeroed),
                                 &cluster_offset, offset_in_cluster,
                                 offset_in_cluster + n_bytes);
        if (extent->compressed) {
            if (ret == VMDK_OK) {
                /* Refuse write to allocated cluster for streamOptimized */
                error_report("Could not write to allocated cluster"
                              " for streamOptimized");
                return -EIO;
            } else {
                /* allocate */
                ret = get_cluster_offset(bs, extent, &m_data, offset,
                                         true, &cluster_offset, 0, 0);
            }
        }
        if (ret == VMDK_ERROR) {
            return -EINVAL;
        }
        if (zeroed) {
            /* Do zeroed write, qiov is ignored */
            if (extent->has_zero_grain &&
                    offset_in_cluster == 0 &&
                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
                if (!zero_dry_run) {
                    /* update L2 tables */
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
                            != VMDK_OK) {
                        return -EIO;
                    }
                }
            } else {
                /* only whole, cluster-aligned grains can be marked zeroed */
                return -ENOTSUP;
            }
        } else {
            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
                                    qiov, bytes_done, n_bytes, offset);
            if (ret) {
                return ret;
            }
            if (m_data.valid) {
                /* update L2 tables */
                if (vmdk_L2update(extent, &m_data,
                                  cluster_offset >> BDRV_SECTOR_BITS)
                        != VMDK_OK) {
                    return -EIO;
                }
            }
        }
        bytes -= n_bytes;
        offset += n_bytes;
        bytes_done += n_bytes;

        /* update CID on the first write every time the virtual disk is
         * opened */
        if (!s->cid_updated) {
            ret = vmdk_write_cid(bs, g_random_int());
            if (ret < 0) {
                return ret;
            }
            s->cid_updated = true;
        }
    }
    return 0;
}

/* Regular write entry point: vmdk_pwritev() run under the state lock. */
static int coroutine_fn
vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                QEMUIOVector *qiov, int flags)
{
    int ret;
    BDRVVmdkState *s = bs->opaque;
    qemu_co_mutex_lock(&s->lock);
    ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}

/* Arguments and result slot handed to vmdk_co_write_compressed(). */
typedef struct VmdkWriteCompressedCo {
    BlockDriverState *bs;
    int64_t sector_num;
    const uint8_t *buf;
    int nb_sectors;
    int ret;
} VmdkWriteCompressedCo;

/*
 * Coroutine entry point: wrap the caller's buffer in a single-element
 * QEMUIOVector and pass it to vmdk_pwritev(), storing the result in co->ret.
 */
static void vmdk_co_write_compressed(void *opaque)
{
    VmdkWriteCompressedCo *co = opaque;
    QEMUIOVector local_qiov;
    uint64_t offset = co->sector_num * BDRV_SECTOR_SIZE;
    uint64_t bytes = co->nb_sectors * BDRV_SECTOR_SIZE;

    struct iovec iov = (struct iovec) {
        .iov_base   = (uint8_t*) co->buf,
        .iov_len    = bytes,
    };
    qemu_iovec_init_external(&local_qiov, &iov, 1);

    co->ret = vmdk_pwritev(co->bs, offset, bytes, &local_qiov, false, false);
}

/*
 * Compressed write entry point.  Only supported for images consisting of a
 * single compressed extent; anything else yields -ENOTSUP.
 */
static int vmdk_write_compressed(BlockDriverState *bs,
                                 int64_t sector_num,
                                 const uint8_t *buf,
                                 int nb_sectors)
{
    BDRVVmdkState *s = bs->opaque;

    if (s->num_extents == 1 && s->extents[0].compressed) {
Coroutine *co; 1688 AioContext *aio_context = bdrv_get_aio_context(bs); 1689 VmdkWriteCompressedCo data = { 1690 .bs = bs, 1691 .sector_num = sector_num, 1692 .buf = buf, 1693 .nb_sectors = nb_sectors, 1694 .ret = -EINPROGRESS, 1695 }; 1696 co = qemu_coroutine_create(vmdk_co_write_compressed); 1697 qemu_coroutine_enter(co, &data); 1698 while (data.ret == -EINPROGRESS) { 1699 aio_poll(aio_context, true); 1700 } 1701 return data.ret; 1702 } else { 1703 return -ENOTSUP; 1704 } 1705 } 1706 1707 static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, 1708 int64_t sector_num, 1709 int nb_sectors, 1710 BdrvRequestFlags flags) 1711 { 1712 int ret; 1713 BDRVVmdkState *s = bs->opaque; 1714 uint64_t offset = sector_num * BDRV_SECTOR_SIZE; 1715 uint64_t bytes = nb_sectors * BDRV_SECTOR_SIZE; 1716 1717 qemu_co_mutex_lock(&s->lock); 1718 /* write zeroes could fail if sectors not aligned to cluster, test it with 1719 * dry_run == true before really updating image */ 1720 ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true); 1721 if (!ret) { 1722 ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false); 1723 } 1724 qemu_co_mutex_unlock(&s->lock); 1725 return ret; 1726 } 1727 1728 static int vmdk_create_extent(const char *filename, int64_t filesize, 1729 bool flat, bool compress, bool zeroed_grain, 1730 QemuOpts *opts, Error **errp) 1731 { 1732 int ret, i; 1733 BlockBackend *blk = NULL; 1734 VMDK4Header header; 1735 Error *local_err = NULL; 1736 uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; 1737 uint32_t *gd_buf = NULL; 1738 int gd_buf_size; 1739 1740 ret = bdrv_create_file(filename, opts, &local_err); 1741 if (ret < 0) { 1742 error_propagate(errp, local_err); 1743 goto exit; 1744 } 1745 1746 blk = blk_new_open(filename, NULL, NULL, 1747 BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); 1748 if (blk == NULL) { 1749 error_propagate(errp, local_err); 1750 ret = -EIO; 1751 goto exit; 1752 } 1753 1754 blk_set_allow_write_beyond_eof(blk, true); 1755 1756 if (flat) 
{ 1757 ret = blk_truncate(blk, filesize); 1758 if (ret < 0) { 1759 error_setg_errno(errp, -ret, "Could not truncate file"); 1760 } 1761 goto exit; 1762 } 1763 magic = cpu_to_be32(VMDK4_MAGIC); 1764 memset(&header, 0, sizeof(header)); 1765 if (compress) { 1766 header.version = 3; 1767 } else if (zeroed_grain) { 1768 header.version = 2; 1769 } else { 1770 header.version = 1; 1771 } 1772 header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT 1773 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) 1774 | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); 1775 header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; 1776 header.capacity = filesize / BDRV_SECTOR_SIZE; 1777 header.granularity = 128; 1778 header.num_gtes_per_gt = BDRV_SECTOR_SIZE; 1779 1780 grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); 1781 gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), 1782 BDRV_SECTOR_SIZE); 1783 gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); 1784 gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); 1785 1786 header.desc_offset = 1; 1787 header.desc_size = 20; 1788 header.rgd_offset = header.desc_offset + header.desc_size; 1789 header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); 1790 header.grain_offset = 1791 ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), 1792 header.granularity); 1793 /* swap endianness for all header fields */ 1794 header.version = cpu_to_le32(header.version); 1795 header.flags = cpu_to_le32(header.flags); 1796 header.capacity = cpu_to_le64(header.capacity); 1797 header.granularity = cpu_to_le64(header.granularity); 1798 header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt); 1799 header.desc_offset = cpu_to_le64(header.desc_offset); 1800 header.desc_size = cpu_to_le64(header.desc_size); 1801 header.rgd_offset = cpu_to_le64(header.rgd_offset); 1802 header.gd_offset = cpu_to_le64(header.gd_offset); 1803 header.grain_offset = 
cpu_to_le64(header.grain_offset); 1804 header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm); 1805 1806 header.check_bytes[0] = 0xa; 1807 header.check_bytes[1] = 0x20; 1808 header.check_bytes[2] = 0xd; 1809 header.check_bytes[3] = 0xa; 1810 1811 /* write all the data */ 1812 ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0); 1813 if (ret < 0) { 1814 error_setg(errp, QERR_IO_ERROR); 1815 goto exit; 1816 } 1817 ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0); 1818 if (ret < 0) { 1819 error_setg(errp, QERR_IO_ERROR); 1820 goto exit; 1821 } 1822 1823 ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9); 1824 if (ret < 0) { 1825 error_setg_errno(errp, -ret, "Could not truncate file"); 1826 goto exit; 1827 } 1828 1829 /* write grain directory */ 1830 gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; 1831 gd_buf = g_malloc0(gd_buf_size); 1832 for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; 1833 i < gt_count; i++, tmp += gt_size) { 1834 gd_buf[i] = cpu_to_le32(tmp); 1835 } 1836 ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, 1837 gd_buf, gd_buf_size, 0); 1838 if (ret < 0) { 1839 error_setg(errp, QERR_IO_ERROR); 1840 goto exit; 1841 } 1842 1843 /* write backup grain directory */ 1844 for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; 1845 i < gt_count; i++, tmp += gt_size) { 1846 gd_buf[i] = cpu_to_le32(tmp); 1847 } 1848 ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, 1849 gd_buf, gd_buf_size, 0); 1850 if (ret < 0) { 1851 error_setg(errp, QERR_IO_ERROR); 1852 goto exit; 1853 } 1854 1855 ret = 0; 1856 exit: 1857 if (blk) { 1858 blk_unref(blk); 1859 } 1860 g_free(gd_buf); 1861 return ret; 1862 } 1863 1864 static int filename_decompose(const char *filename, char *path, char *prefix, 1865 char *postfix, size_t buf_len, Error **errp) 1866 { 1867 const char *p, *q; 1868 1869 if (filename == NULL || !strlen(filename)) { 1870 error_setg(errp, "No filename provided"); 1871 
return VMDK_ERROR; 1872 } 1873 p = strrchr(filename, '/'); 1874 if (p == NULL) { 1875 p = strrchr(filename, '\\'); 1876 } 1877 if (p == NULL) { 1878 p = strrchr(filename, ':'); 1879 } 1880 if (p != NULL) { 1881 p++; 1882 if (p - filename >= buf_len) { 1883 return VMDK_ERROR; 1884 } 1885 pstrcpy(path, p - filename + 1, filename); 1886 } else { 1887 p = filename; 1888 path[0] = '\0'; 1889 } 1890 q = strrchr(p, '.'); 1891 if (q == NULL) { 1892 pstrcpy(prefix, buf_len, p); 1893 postfix[0] = '\0'; 1894 } else { 1895 if (q - p >= buf_len) { 1896 return VMDK_ERROR; 1897 } 1898 pstrcpy(prefix, q - p + 1, p); 1899 pstrcpy(postfix, buf_len, q); 1900 } 1901 return VMDK_OK; 1902 } 1903 1904 static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) 1905 { 1906 int idx = 0; 1907 BlockBackend *new_blk = NULL; 1908 Error *local_err = NULL; 1909 char *desc = NULL; 1910 int64_t total_size = 0, filesize; 1911 char *adapter_type = NULL; 1912 char *backing_file = NULL; 1913 char *hw_version = NULL; 1914 char *fmt = NULL; 1915 int ret = 0; 1916 bool flat, split, compress; 1917 GString *ext_desc_lines; 1918 char *path = g_malloc0(PATH_MAX); 1919 char *prefix = g_malloc0(PATH_MAX); 1920 char *postfix = g_malloc0(PATH_MAX); 1921 char *desc_line = g_malloc0(BUF_SIZE); 1922 char *ext_filename = g_malloc0(PATH_MAX); 1923 char *desc_filename = g_malloc0(PATH_MAX); 1924 const int64_t split_size = 0x80000000; /* VMDK has constant split size */ 1925 const char *desc_extent_line; 1926 char *parent_desc_line = g_malloc0(BUF_SIZE); 1927 uint32_t parent_cid = 0xffffffff; 1928 uint32_t number_heads = 16; 1929 bool zeroed_grain = false; 1930 uint32_t desc_offset = 0, desc_len; 1931 const char desc_template[] = 1932 "# Disk DescriptorFile\n" 1933 "version=1\n" 1934 "CID=%" PRIx32 "\n" 1935 "parentCID=%" PRIx32 "\n" 1936 "createType=\"%s\"\n" 1937 "%s" 1938 "\n" 1939 "# Extent description\n" 1940 "%s" 1941 "\n" 1942 "# The Disk Data Base\n" 1943 "#DDB\n" 1944 "\n" 1945 
"ddb.virtualHWVersion = \"%s\"\n" 1946 "ddb.geometry.cylinders = \"%" PRId64 "\"\n" 1947 "ddb.geometry.heads = \"%" PRIu32 "\"\n" 1948 "ddb.geometry.sectors = \"63\"\n" 1949 "ddb.adapterType = \"%s\"\n"; 1950 1951 ext_desc_lines = g_string_new(NULL); 1952 1953 if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { 1954 ret = -EINVAL; 1955 goto exit; 1956 } 1957 /* Read out options */ 1958 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 1959 BDRV_SECTOR_SIZE); 1960 adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); 1961 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); 1962 hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION); 1963 if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { 1964 if (strcmp(hw_version, "undefined")) { 1965 error_setg(errp, 1966 "compat6 cannot be enabled with hwversion set"); 1967 ret = -EINVAL; 1968 goto exit; 1969 } 1970 g_free(hw_version); 1971 hw_version = g_strdup("6"); 1972 } 1973 if (strcmp(hw_version, "undefined") == 0) { 1974 g_free(hw_version); 1975 hw_version = g_strdup("4"); 1976 } 1977 fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); 1978 if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { 1979 zeroed_grain = true; 1980 } 1981 1982 if (!adapter_type) { 1983 adapter_type = g_strdup("ide"); 1984 } else if (strcmp(adapter_type, "ide") && 1985 strcmp(adapter_type, "buslogic") && 1986 strcmp(adapter_type, "lsilogic") && 1987 strcmp(adapter_type, "legacyESX")) { 1988 error_setg(errp, "Unknown adapter type: '%s'", adapter_type); 1989 ret = -EINVAL; 1990 goto exit; 1991 } 1992 if (strcmp(adapter_type, "ide") != 0) { 1993 /* that's the number of heads with which vmware operates when 1994 creating, exporting, etc. 
vmdk files with a non-ide adapter type */ 1995 number_heads = 255; 1996 } 1997 if (!fmt) { 1998 /* Default format to monolithicSparse */ 1999 fmt = g_strdup("monolithicSparse"); 2000 } else if (strcmp(fmt, "monolithicFlat") && 2001 strcmp(fmt, "monolithicSparse") && 2002 strcmp(fmt, "twoGbMaxExtentSparse") && 2003 strcmp(fmt, "twoGbMaxExtentFlat") && 2004 strcmp(fmt, "streamOptimized")) { 2005 error_setg(errp, "Unknown subformat: '%s'", fmt); 2006 ret = -EINVAL; 2007 goto exit; 2008 } 2009 split = !(strcmp(fmt, "twoGbMaxExtentFlat") && 2010 strcmp(fmt, "twoGbMaxExtentSparse")); 2011 flat = !(strcmp(fmt, "monolithicFlat") && 2012 strcmp(fmt, "twoGbMaxExtentFlat")); 2013 compress = !strcmp(fmt, "streamOptimized"); 2014 if (flat) { 2015 desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n"; 2016 } else { 2017 desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n"; 2018 } 2019 if (flat && backing_file) { 2020 error_setg(errp, "Flat image can't have backing file"); 2021 ret = -ENOTSUP; 2022 goto exit; 2023 } 2024 if (flat && zeroed_grain) { 2025 error_setg(errp, "Flat image can't enable zeroed grain"); 2026 ret = -ENOTSUP; 2027 goto exit; 2028 } 2029 if (backing_file) { 2030 BlockBackend *blk; 2031 char *full_backing = g_new0(char, PATH_MAX); 2032 bdrv_get_full_backing_filename_from_filename(filename, backing_file, 2033 full_backing, PATH_MAX, 2034 &local_err); 2035 if (local_err) { 2036 g_free(full_backing); 2037 error_propagate(errp, local_err); 2038 ret = -ENOENT; 2039 goto exit; 2040 } 2041 2042 blk = blk_new_open(full_backing, NULL, NULL, 2043 BDRV_O_NO_BACKING, errp); 2044 g_free(full_backing); 2045 if (blk == NULL) { 2046 ret = -EIO; 2047 goto exit; 2048 } 2049 if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) { 2050 blk_unref(blk); 2051 ret = -EINVAL; 2052 goto exit; 2053 } 2054 parent_cid = vmdk_read_cid(blk_bs(blk), 0); 2055 blk_unref(blk); 2056 snprintf(parent_desc_line, BUF_SIZE, 2057 "parentFileNameHint=\"%s\"", backing_file); 2058 } 2059 2060 /* Create extents 
*/ 2061 filesize = total_size; 2062 while (filesize > 0) { 2063 int64_t size = filesize; 2064 2065 if (split && size > split_size) { 2066 size = split_size; 2067 } 2068 if (split) { 2069 snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s", 2070 prefix, flat ? 'f' : 's', ++idx, postfix); 2071 } else if (flat) { 2072 snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix); 2073 } else { 2074 snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix); 2075 } 2076 snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename); 2077 2078 if (vmdk_create_extent(ext_filename, size, 2079 flat, compress, zeroed_grain, opts, errp)) { 2080 ret = -EINVAL; 2081 goto exit; 2082 } 2083 filesize -= size; 2084 2085 /* Format description line */ 2086 snprintf(desc_line, BUF_SIZE, 2087 desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); 2088 g_string_append(ext_desc_lines, desc_line); 2089 } 2090 /* generate descriptor file */ 2091 desc = g_strdup_printf(desc_template, 2092 g_random_int(), 2093 parent_cid, 2094 fmt, 2095 parent_desc_line, 2096 ext_desc_lines->str, 2097 hw_version, 2098 total_size / 2099 (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), 2100 number_heads, 2101 adapter_type); 2102 desc_len = strlen(desc); 2103 /* the descriptor offset = 0x200 */ 2104 if (!split && !flat) { 2105 desc_offset = 0x200; 2106 } else { 2107 ret = bdrv_create_file(filename, opts, &local_err); 2108 if (ret < 0) { 2109 error_propagate(errp, local_err); 2110 goto exit; 2111 } 2112 } 2113 2114 new_blk = blk_new_open(filename, NULL, NULL, 2115 BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); 2116 if (new_blk == NULL) { 2117 error_propagate(errp, local_err); 2118 ret = -EIO; 2119 goto exit; 2120 } 2121 2122 blk_set_allow_write_beyond_eof(new_blk, true); 2123 2124 ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0); 2125 if (ret < 0) { 2126 error_setg_errno(errp, -ret, "Could not write description"); 2127 goto exit; 2128 } 2129 /* bdrv_pwrite write padding zeros to align to sector, we don't 
need that 2130 * for description file */ 2131 if (desc_offset == 0) { 2132 ret = blk_truncate(new_blk, desc_len); 2133 if (ret < 0) { 2134 error_setg_errno(errp, -ret, "Could not truncate file"); 2135 } 2136 } 2137 exit: 2138 if (new_blk) { 2139 blk_unref(new_blk); 2140 } 2141 g_free(adapter_type); 2142 g_free(backing_file); 2143 g_free(hw_version); 2144 g_free(fmt); 2145 g_free(desc); 2146 g_free(path); 2147 g_free(prefix); 2148 g_free(postfix); 2149 g_free(desc_line); 2150 g_free(ext_filename); 2151 g_free(desc_filename); 2152 g_free(parent_desc_line); 2153 g_string_free(ext_desc_lines, true); 2154 return ret; 2155 } 2156 2157 static void vmdk_close(BlockDriverState *bs) 2158 { 2159 BDRVVmdkState *s = bs->opaque; 2160 2161 vmdk_free_extents(bs); 2162 g_free(s->create_type); 2163 2164 migrate_del_blocker(s->migration_blocker); 2165 error_free(s->migration_blocker); 2166 } 2167 2168 static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) 2169 { 2170 BDRVVmdkState *s = bs->opaque; 2171 int i, err; 2172 int ret = 0; 2173 2174 for (i = 0; i < s->num_extents; i++) { 2175 err = bdrv_co_flush(s->extents[i].file->bs); 2176 if (err < 0) { 2177 ret = err; 2178 } 2179 } 2180 return ret; 2181 } 2182 2183 static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) 2184 { 2185 int i; 2186 int64_t ret = 0; 2187 int64_t r; 2188 BDRVVmdkState *s = bs->opaque; 2189 2190 ret = bdrv_get_allocated_file_size(bs->file->bs); 2191 if (ret < 0) { 2192 return ret; 2193 } 2194 for (i = 0; i < s->num_extents; i++) { 2195 if (s->extents[i].file == bs->file) { 2196 continue; 2197 } 2198 r = bdrv_get_allocated_file_size(s->extents[i].file->bs); 2199 if (r < 0) { 2200 return r; 2201 } 2202 ret += r; 2203 } 2204 return ret; 2205 } 2206 2207 static int vmdk_has_zero_init(BlockDriverState *bs) 2208 { 2209 int i; 2210 BDRVVmdkState *s = bs->opaque; 2211 2212 /* If has a flat extent and its underlying storage doesn't have zero init, 2213 * return 0. 
*/ 2214 for (i = 0; i < s->num_extents; i++) { 2215 if (s->extents[i].flat) { 2216 if (!bdrv_has_zero_init(s->extents[i].file->bs)) { 2217 return 0; 2218 } 2219 } 2220 } 2221 return 1; 2222 } 2223 2224 static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) 2225 { 2226 ImageInfo *info = g_new0(ImageInfo, 1); 2227 2228 *info = (ImageInfo){ 2229 .filename = g_strdup(extent->file->bs->filename), 2230 .format = g_strdup(extent->type), 2231 .virtual_size = extent->sectors * BDRV_SECTOR_SIZE, 2232 .compressed = extent->compressed, 2233 .has_compressed = extent->compressed, 2234 .cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE, 2235 .has_cluster_size = !extent->flat, 2236 }; 2237 2238 return info; 2239 } 2240 2241 static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, 2242 BdrvCheckMode fix) 2243 { 2244 BDRVVmdkState *s = bs->opaque; 2245 VmdkExtent *extent = NULL; 2246 int64_t sector_num = 0; 2247 int64_t total_sectors = bdrv_nb_sectors(bs); 2248 int ret; 2249 uint64_t cluster_offset; 2250 2251 if (fix) { 2252 return -ENOTSUP; 2253 } 2254 2255 for (;;) { 2256 if (sector_num >= total_sectors) { 2257 return 0; 2258 } 2259 extent = find_extent(s, sector_num, extent); 2260 if (!extent) { 2261 fprintf(stderr, 2262 "ERROR: could not find extent for sector %" PRId64 "\n", 2263 sector_num); 2264 break; 2265 } 2266 ret = get_cluster_offset(bs, extent, NULL, 2267 sector_num << BDRV_SECTOR_BITS, 2268 false, &cluster_offset, 0, 0); 2269 if (ret == VMDK_ERROR) { 2270 fprintf(stderr, 2271 "ERROR: could not get cluster_offset for sector %" 2272 PRId64 "\n", sector_num); 2273 break; 2274 } 2275 if (ret == VMDK_OK && 2276 cluster_offset >= bdrv_getlength(extent->file->bs)) 2277 { 2278 fprintf(stderr, 2279 "ERROR: cluster offset for sector %" 2280 PRId64 " points after EOF\n", sector_num); 2281 break; 2282 } 2283 sector_num += extent->cluster_sectors; 2284 } 2285 2286 result->corruptions++; 2287 return 0; 2288 } 2289 2290 static ImageInfoSpecific 
*vmdk_get_specific_info(BlockDriverState *bs) 2291 { 2292 int i; 2293 BDRVVmdkState *s = bs->opaque; 2294 ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1); 2295 ImageInfoList **next; 2296 2297 *spec_info = (ImageInfoSpecific){ 2298 .type = IMAGE_INFO_SPECIFIC_KIND_VMDK, 2299 .u = { 2300 .vmdk.data = g_new0(ImageInfoSpecificVmdk, 1), 2301 }, 2302 }; 2303 2304 *spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) { 2305 .create_type = g_strdup(s->create_type), 2306 .cid = s->cid, 2307 .parent_cid = s->parent_cid, 2308 }; 2309 2310 next = &spec_info->u.vmdk.data->extents; 2311 for (i = 0; i < s->num_extents; i++) { 2312 *next = g_new0(ImageInfoList, 1); 2313 (*next)->value = vmdk_get_extent_info(&s->extents[i]); 2314 (*next)->next = NULL; 2315 next = &(*next)->next; 2316 } 2317 2318 return spec_info; 2319 } 2320 2321 static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b) 2322 { 2323 return a->flat == b->flat && 2324 a->compressed == b->compressed && 2325 (a->flat || a->cluster_sectors == b->cluster_sectors); 2326 } 2327 2328 static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 2329 { 2330 int i; 2331 BDRVVmdkState *s = bs->opaque; 2332 assert(s->num_extents); 2333 2334 /* See if we have multiple extents but they have different cases */ 2335 for (i = 1; i < s->num_extents; i++) { 2336 if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) { 2337 return -ENOTSUP; 2338 } 2339 } 2340 bdi->needs_compressed_writes = s->extents[0].compressed; 2341 if (!s->extents[0].flat) { 2342 bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS; 2343 } 2344 return 0; 2345 } 2346 2347 static QemuOptsList vmdk_create_opts = { 2348 .name = "vmdk-create-opts", 2349 .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), 2350 .desc = { 2351 { 2352 .name = BLOCK_OPT_SIZE, 2353 .type = QEMU_OPT_SIZE, 2354 .help = "Virtual disk size" 2355 }, 2356 { 2357 .name = BLOCK_OPT_ADAPTER_TYPE, 2358 .type = QEMU_OPT_STRING, 2359 .help = 
"Virtual adapter type, can be one of " 2360 "ide (default), lsilogic, buslogic or legacyESX" 2361 }, 2362 { 2363 .name = BLOCK_OPT_BACKING_FILE, 2364 .type = QEMU_OPT_STRING, 2365 .help = "File name of a base image" 2366 }, 2367 { 2368 .name = BLOCK_OPT_COMPAT6, 2369 .type = QEMU_OPT_BOOL, 2370 .help = "VMDK version 6 image", 2371 .def_value_str = "off" 2372 }, 2373 { 2374 .name = BLOCK_OPT_HWVERSION, 2375 .type = QEMU_OPT_STRING, 2376 .help = "VMDK hardware version", 2377 .def_value_str = "undefined" 2378 }, 2379 { 2380 .name = BLOCK_OPT_SUBFMT, 2381 .type = QEMU_OPT_STRING, 2382 .help = 2383 "VMDK flat extent format, can be one of " 2384 "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " 2385 }, 2386 { 2387 .name = BLOCK_OPT_ZEROED_GRAIN, 2388 .type = QEMU_OPT_BOOL, 2389 .help = "Enable efficient zero writes " 2390 "using the zeroed-grain GTE feature" 2391 }, 2392 { /* end of list */ } 2393 } 2394 }; 2395 2396 static BlockDriver bdrv_vmdk = { 2397 .format_name = "vmdk", 2398 .instance_size = sizeof(BDRVVmdkState), 2399 .bdrv_probe = vmdk_probe, 2400 .bdrv_open = vmdk_open, 2401 .bdrv_check = vmdk_check, 2402 .bdrv_reopen_prepare = vmdk_reopen_prepare, 2403 .bdrv_co_preadv = vmdk_co_preadv, 2404 .bdrv_co_pwritev = vmdk_co_pwritev, 2405 .bdrv_write_compressed = vmdk_write_compressed, 2406 .bdrv_co_write_zeroes = vmdk_co_write_zeroes, 2407 .bdrv_close = vmdk_close, 2408 .bdrv_create = vmdk_create, 2409 .bdrv_co_flush_to_disk = vmdk_co_flush, 2410 .bdrv_co_get_block_status = vmdk_co_get_block_status, 2411 .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, 2412 .bdrv_has_zero_init = vmdk_has_zero_init, 2413 .bdrv_get_specific_info = vmdk_get_specific_info, 2414 .bdrv_refresh_limits = vmdk_refresh_limits, 2415 .bdrv_get_info = vmdk_get_info, 2416 2417 .supports_backing = true, 2418 .create_opts = &vmdk_create_opts, 2419 }; 2420 2421 static void bdrv_vmdk_init(void) 2422 { 2423 
bdrv_register(&bdrv_vmdk); 2424 } 2425 2426 block_init(bdrv_vmdk_init); 2427