/*
 * QEMU Enhanced Disk Format
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 *  Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu-timer.h"
#include "trace.h"
#include "qed.h"
#include "qerror.h"

static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
{
    QEDAIOCB *acb = (QEDAIOCB *)blockacb;
    bool finished = false;

    /* Wait for the request to finish */
    acb->finished = &finished;
    while (!finished) {
        qemu_aio_wait();
    }
}

static AIOPool qed_aio_pool = {
    .aiocb_size = sizeof(QEDAIOCB),
    .cancel = qed_aio_cancel,
};

static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
                          const char *filename)
{
    const QEDHeader *header = (const QEDHeader *)buf;

    if (buf_size < sizeof(*header)) {
        return 0;
    }
    if (le32_to_cpu(header->magic) != QED_MAGIC) {
        return 0;
    }
    return 100;
}

/**
 * Check whether an image format is raw
 *
 * @fmt: Backing file format, may be NULL
 */
static bool qed_fmt_is_raw(const char *fmt)
{
    return fmt && strcmp(fmt, "raw") == 0;
}

static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
{
    cpu->magic = le32_to_cpu(le->magic);
    cpu->cluster_size = le32_to_cpu(le->cluster_size);
    cpu->table_size = le32_to_cpu(le->table_size);
    cpu->header_size = le32_to_cpu(le->header_size);
    cpu->features = le64_to_cpu(le->features);
    cpu->compat_features = le64_to_cpu(le->compat_features);
    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
    cpu->image_size = le64_to_cpu(le->image_size);
    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
}

static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
{
    le->magic = cpu_to_le32(cpu->magic);
    le->cluster_size = cpu_to_le32(cpu->cluster_size);
    le->table_size = cpu_to_le32(cpu->table_size);
    le->header_size = cpu_to_le32(cpu->header_size);
    le->features = cpu_to_le64(cpu->features);
    le->compat_features = cpu_to_le64(cpu->compat_features);
    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
    le->image_size = cpu_to_le64(cpu->image_size);
    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
}

static int qed_write_header_sync(BDRVQEDState *s)
{
    QEDHeader le;
    int ret;

    qed_header_cpu_to_le(&s->header, &le);
    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
    if (ret != sizeof(le)) {
        return ret;
    }
    return 0;
}

typedef struct {
    GenericCB gencb;
    BDRVQEDState *s;
    struct iovec iov;
    QEMUIOVector qiov;
    int nsectors;
    uint8_t *buf;
} QEDWriteHeaderCB;

static void qed_write_header_cb(void *opaque, int ret)
{
    QEDWriteHeaderCB *write_header_cb = opaque;

    qemu_vfree(write_header_cb->buf);
    gencb_complete(write_header_cb, ret);
}

static void qed_write_header_read_cb(void *opaque, int ret)
{
    QEDWriteHeaderCB *write_header_cb = opaque;
    BDRVQEDState *s = write_header_cb->s;
    BlockDriverAIOCB *acb;

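    /* Read half of the read-modify-write done by qed_write_header(); on
     * success the sectors containing the header are now in
     * write_header_cb->buf.
     */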
    if (ret) {
        qed_write_header_cb(write_header_cb, ret);
        return;
    }

    /* Update header */
    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);

    acb = bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
                          write_header_cb->nsectors, qed_write_header_cb,
                          write_header_cb);
    if (!acb) {
        qed_write_header_cb(write_header_cb, -EIO);
    }
}

/**
 * Update header in-place (does not rewrite backing filename or other strings)
 *
 * This function only updates known header fields in-place and does not affect
 * extra data after the QED header.
 */
static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
                             void *opaque)
{
    /* We must write full sectors for O_DIRECT but cannot necessarily generate
     * the data following the header if an unrecognized compat feature is
     * active. Therefore, first read the sectors containing the header, update
     * them, and write back.
     */

    BlockDriverAIOCB *acb;
    int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
                   BDRV_SECTOR_SIZE;
    size_t len = nsectors * BDRV_SECTOR_SIZE;
    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
                                                    cb, opaque);

    write_header_cb->s = s;
    write_header_cb->nsectors = nsectors;
    write_header_cb->buf = qemu_blockalign(s->bs, len);
    write_header_cb->iov.iov_base = write_header_cb->buf;
    write_header_cb->iov.iov_len = len;
    qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);

    acb = bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
                         qed_write_header_read_cb, write_header_cb);
    if (!acb) {
        qed_write_header_cb(write_header_cb, -EIO);
    }
}

static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
{
    uint64_t table_entries;
    uint64_t l2_size;

    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
    l2_size = table_entries * cluster_size;

    return l2_size * table_entries;
}

static bool qed_is_cluster_size_valid(uint32_t cluster_size)
{
    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
        cluster_size > QED_MAX_CLUSTER_SIZE) {
        return false;
    }
    if (cluster_size & (cluster_size - 1)) {
        return false; /* not power of 2 */
    }
    return true;
}

static bool qed_is_table_size_valid(uint32_t table_size)
{
    if (table_size < QED_MIN_TABLE_SIZE ||
        table_size > QED_MAX_TABLE_SIZE) {
        return false;
    }
    if (table_size & (table_size - 1)) {
        return false; /* not power of 2 */
    }
    return true;
}

static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
                                    uint32_t table_size)
{
    if (image_size % BDRV_SECTOR_SIZE != 0) {
        return false; /* not multiple of sector size */
    }
    if (image_size > qed_max_image_size(cluster_size, table_size)) {
        return false; /* image is too large */
    }
    return true;
}

/**
 * Read a string of known length from the image file
 *
 * @file: Image file
 * @offset: File offset to start of string, in bytes
 * @n: String length in bytes
 * @buf: Destination buffer
 * @buflen: Destination buffer length in bytes
 * @ret: 0 on success, -errno on failure
 *
 * The string is NUL-terminated.
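 * @n must be smaller than @buflen so the terminating NUL fits; otherwise
 * -EINVAL is returned.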
 */
static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
                           char *buf, size_t buflen)
{
    int ret;
    if (n >= buflen) {
        return -EINVAL;
    }
    ret = bdrv_pread(file, offset, buf, n);
    if (ret < 0) {
        return ret;
    }
    buf[n] = '\0';
    return 0;
}

/**
 * Allocate new clusters
 *
 * @s: QED state
 * @n: Number of contiguous clusters to allocate
 * @ret: Offset of first allocated cluster
 *
 * This function only produces the offset where the new clusters should be
 * written. It updates BDRVQEDState but does not make any changes to the image
 * file.
 */
static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
{
    uint64_t offset = s->file_size;
    s->file_size += n * s->header.cluster_size;
    return offset;
}

QEDTable *qed_alloc_table(BDRVQEDState *s)
{
    /* Honor O_DIRECT memory alignment requirements */
    return qemu_blockalign(s->bs,
                           s->header.cluster_size * s->header.table_size);
}

/**
 * Allocate a new zeroed L2 table
 */
static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
{
    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);

    l2_table->table = qed_alloc_table(s);
    l2_table->offset = qed_alloc_clusters(s, s->header.table_size);

    memset(l2_table->table->offsets, 0,
           s->header.cluster_size * s->header.table_size);
    return l2_table;
}

static void qed_aio_next_io(void *opaque, int ret);

static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
{
    assert(!s->allocating_write_reqs_plugged);

    s->allocating_write_reqs_plugged = true;
}

static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
{
    QEDAIOCB *acb;

    assert(s->allocating_write_reqs_plugged);

    s->allocating_write_reqs_plugged = false;

    acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
    if (acb) {
        qed_aio_next_io(acb, 0);
    }
}

static void qed_finish_clear_need_check(void *opaque, int ret)
{
    /* Do nothing */
}

static void qed_flush_after_clear_need_check(void *opaque, int ret)
{
    BDRVQEDState *s = opaque;

    bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);

    /* No need to wait until flush completes */
    qed_unplug_allocating_write_reqs(s);
}

static void qed_clear_need_check(void *opaque, int ret)
{
    BDRVQEDState *s = opaque;

    if (ret) {
        qed_unplug_allocating_write_reqs(s);
        return;
    }

    s->header.features &= ~QED_F_NEED_CHECK;
    qed_write_header(s, qed_flush_after_clear_need_check, s);
}

static void qed_need_check_timer_cb(void *opaque)
{
    BDRVQEDState *s = opaque;

    /* The timer should only fire when allocating writes have drained */
    assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));

    trace_qed_need_check_timer_cb(s);

    qed_plug_allocating_write_reqs(s);

    /* Ensure writes are on disk before clearing flag */
    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
}

static void qed_start_need_check_timer(BDRVQEDState *s)
{
    trace_qed_start_need_check_timer(s);

    /* Use vm_clock so we don't alter the image file while suspended for
     * migration.
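     * The timer fires QED_NEED_CHECK_TIMEOUT seconds after the last
     * allocating write completes.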
     */
    qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) +
                   get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT);
}

/* It's okay to call this multiple times or when no timer is started */
static void qed_cancel_need_check_timer(BDRVQEDState *s)
{
    trace_qed_cancel_need_check_timer(s);
    qemu_del_timer(s->need_check_timer);
}

static int bdrv_qed_open(BlockDriverState *bs, int flags)
{
    BDRVQEDState *s = bs->opaque;
    QEDHeader le_header;
    int64_t file_size;
    int ret;

    s->bs = bs;
    QSIMPLEQ_INIT(&s->allocating_write_reqs);

    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
    if (ret < 0) {
        return ret;
    }
    ret = 0; /* ret should always be 0 or -errno */
    qed_header_le_to_cpu(&le_header, &s->header);

    if (s->header.magic != QED_MAGIC) {
        return -EINVAL;
    }
    if (s->header.features & ~QED_FEATURE_MASK) {
        /* image uses unsupported feature bits */
        char buf[64];
        snprintf(buf, sizeof(buf), "%" PRIx64,
                 s->header.features & ~QED_FEATURE_MASK);
        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                      bs->device_name, "QED", buf);
        return -ENOTSUP;
    }
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
        return -EINVAL;
    }

    /* Round down file size to the last cluster */
    file_size = bdrv_getlength(bs->file);
    if (file_size < 0) {
        return file_size;
    }
    s->file_size = qed_start_of_cluster(s, file_size);

    if (!qed_is_table_size_valid(s->header.table_size)) {
        return -EINVAL;
    }
    if (!qed_is_image_size_valid(s->header.image_size,
                                 s->header.cluster_size,
                                 s->header.table_size)) {
        return -EINVAL;
    }
    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
        return -EINVAL;
    }

    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
                      sizeof(uint64_t);
    s->l2_shift = ffs(s->header.cluster_size) - 1;
    s->l2_mask = s->table_nelems - 1;
    s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;

    if ((s->header.features & QED_F_BACKING_FILE)) {
        if ((uint64_t)s->header.backing_filename_offset +
            s->header.backing_filename_size >
            s->header.cluster_size * s->header.header_size) {
            return -EINVAL;
        }

        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
                              s->header.backing_filename_size, bs->backing_file,
                              sizeof(bs->backing_file));
        if (ret < 0) {
            return ret;
        }

        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
        }
    }

    /* Reset unknown autoclear feature bits. This is a backwards
     * compatibility mechanism that allows images to be opened by older
     * programs, which "knock out" unknown feature bits. When an image is
     * opened by a newer program again it can detect that the autoclear
     * feature is no longer valid.
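     * Only writable images are updated; a read-only open leaves unknown
     * bits untouched.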
     */
    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
        !bdrv_is_read_only(bs->file)) {
        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;

        ret = qed_write_header_sync(s);
        if (ret) {
            return ret;
        }

        /* From here on only known autoclear feature bits are valid */
        bdrv_flush(bs->file);
    }

    s->l1_table = qed_alloc_table(s);
    qed_init_l2_cache(&s->l2_cache);

    ret = qed_read_l1_table_sync(s);
    if (ret) {
        goto out;
    }

    /* If image was not closed cleanly, check consistency */
    if (s->header.features & QED_F_NEED_CHECK) {
        /* Read-only images cannot be fixed. There is no risk of corruption
         * since write operations are not possible. Therefore, allow
         * potentially inconsistent images to be opened read-only. This can
         * aid data recovery from an otherwise inconsistent image.
         */
        if (!bdrv_is_read_only(bs->file)) {
            BdrvCheckResult result = {0};

            ret = qed_check(s, &result, true);
            if (ret) {
                goto out;
            }
            if (!result.corruptions && !result.check_errors) {
                /* Ensure fixes reach storage before clearing check bit */
                bdrv_flush(s->bs);

                s->header.features &= ~QED_F_NEED_CHECK;
                qed_write_header_sync(s);
            }
        }
    }

    s->need_check_timer = qemu_new_timer_ns(vm_clock,
                                            qed_need_check_timer_cb, s);

out:
    if (ret) {
        qed_free_l2_cache(&s->l2_cache);
        qemu_vfree(s->l1_table);
    }
    return ret;
}

static void bdrv_qed_close(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    qed_cancel_need_check_timer(s);
    qemu_free_timer(s->need_check_timer);

    /* Ensure writes reach stable storage */
    bdrv_flush(bs->file);

    /* Clean shutdown, no check required on next open */
    if (s->header.features & QED_F_NEED_CHECK) {
        s->header.features &= ~QED_F_NEED_CHECK;
        qed_write_header_sync(s);
    }

    qed_free_l2_cache(&s->l2_cache);
    qemu_vfree(s->l1_table);
}

static int bdrv_qed_flush(BlockDriverState *bs)
{
    return bdrv_flush(bs->file);
}

static int qed_create(const char *filename, uint32_t cluster_size,
                      uint64_t image_size, uint32_t table_size,
                      const char *backing_file, const char *backing_fmt)
{
    QEDHeader header = {
        .magic = QED_MAGIC,
        .cluster_size = cluster_size,
        .table_size = table_size,
        .header_size = 1,
        .features = 0,
        .compat_features = 0,
        .l1_table_offset = cluster_size,
        .image_size = image_size,
    };
    QEDHeader le_header;
    uint8_t *l1_table = NULL;
    size_t l1_size = header.cluster_size * header.table_size;
    int ret = 0;
    BlockDriverState *bs = NULL;

    ret = bdrv_create_file(filename, NULL);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
    if (ret < 0) {
        return ret;
    }

    /* File must start empty and grow, check truncate is supported */
    ret = bdrv_truncate(bs, 0);
    if (ret < 0) {
        goto out;
    }

    if (backing_file) {
        header.features |= QED_F_BACKING_FILE;
        header.backing_filename_offset = sizeof(le_header);
        header.backing_filename_size = strlen(backing_file);

        if (qed_fmt_is_raw(backing_fmt)) {
            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
        }
    }

    qed_header_cpu_to_le(&header, &le_header);
    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
    if (ret < 0) {
        goto out;
    }
    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
                      header.backing_filename_size);
    if (ret < 0) {
        goto out;
    }

    l1_table = qemu_mallocz(l1_size);
    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
    if (ret < 0) {
        goto out;
    }

    ret = 0; /* success */
out:
    qemu_free(l1_table);
    bdrv_delete(bs);
    return ret;
}

static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
{
    uint64_t image_size = 0;
    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
    uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
    const char *backing_file = NULL;
    const char *backing_fmt = NULL;

    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            image_size = options->value.n;
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
            backing_fmt = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
            if (options->value.n) {
                cluster_size = options->value.n;
            }
        } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) {
            if (options->value.n) {
                table_size = options->value.n;
            }
        }
        options++;
    }

    if (!qed_is_cluster_size_valid(cluster_size)) {
        fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n",
                QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
        return -EINVAL;
    }
    if (!qed_is_table_size_valid(table_size)) {
        fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n",
                QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
        return -EINVAL;
    }
    if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
        fprintf(stderr, "QED image size must be a non-zero multiple of "
                        "cluster size and less than %" PRIu64 " bytes\n",
                qed_max_image_size(cluster_size, table_size));
        return -EINVAL;
    }

    return qed_create(filename, cluster_size, image_size, table_size,
                      backing_file, backing_fmt);
}

typedef struct {
    int is_allocated;
    int *pnum;
} QEDIsAllocatedCB;

static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
{
    QEDIsAllocatedCB *cb = opaque;
    *cb->pnum = len / BDRV_SECTOR_SIZE;
    cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
}

static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors, int *pnum)
{
    BDRVQEDState *s = bs->opaque;
    uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
    QEDIsAllocatedCB cb = {
        .is_allocated = -1,
        .pnum = pnum,
    };
    QEDRequest request = { .l2_table = NULL };

    async_context_push();

    qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);

    while (cb.is_allocated == -1) {
        qemu_aio_wait();
    }

    async_context_pop();

    qed_unref_l2_cache_entry(request.l2_table);

    return cb.is_allocated;
}

static int bdrv_qed_make_empty(BlockDriverState *bs)
{
    return -ENOTSUP;
}

static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
{
    return acb->common.bs->opaque;
}

/**
 * Read from the backing file or zero-fill if no backing file
 *
 * @s: QED state
 * @pos: Byte position in device
 * @qiov: Destination I/O vector
 * @cb: Completion function
 * @opaque: User data for completion function
 *
 * This function reads qiov->size bytes starting at pos from the backing file.
 * If there is no backing file then zeroes are read.
 */
static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
                                  QEMUIOVector *qiov,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *aiocb;
    uint64_t backing_length = 0;
    size_t size;

    /* If there is a backing file, get its length. Treat the absence of a
     * backing file like a zero length backing file.
     */
    if (s->bs->backing_hd) {
        int64_t l = bdrv_getlength(s->bs->backing_hd);
        if (l < 0) {
            cb(opaque, l);
            return;
        }
        backing_length = l;
    }

    /* Zero all sectors if reading beyond the end of the backing file */
    if (pos >= backing_length ||
        pos + qiov->size > backing_length) {
        qemu_iovec_memset(qiov, 0, qiov->size);
    }

    /* Complete now if there are no backing file sectors to read */
    if (pos >= backing_length) {
        cb(opaque, 0);
        return;
    }

    /* If the read straddles the end of the backing file, shorten it */
    size = MIN((uint64_t)backing_length - pos, qiov->size);

    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING);
    aiocb = bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
                           qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
    if (!aiocb) {
        cb(opaque, -EIO);
    }
}

typedef struct {
    GenericCB gencb;
    BDRVQEDState *s;
    QEMUIOVector qiov;
    struct iovec iov;
    uint64_t offset;
} CopyFromBackingFileCB;

static void qed_copy_from_backing_file_cb(void *opaque, int ret)
{
    CopyFromBackingFileCB *copy_cb = opaque;
    qemu_vfree(copy_cb->iov.iov_base);
    gencb_complete(&copy_cb->gencb, ret);
}

static void qed_copy_from_backing_file_write(void *opaque, int ret)
{
    CopyFromBackingFileCB *copy_cb = opaque;
    BDRVQEDState *s = copy_cb->s;
    BlockDriverAIOCB *aiocb;

    if (ret) {
        qed_copy_from_backing_file_cb(copy_cb, ret);
        return;
    }

    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
    aiocb = bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
                            &copy_cb->qiov,
                            copy_cb->qiov.size / BDRV_SECTOR_SIZE,
                            qed_copy_from_backing_file_cb, copy_cb);
    if (!aiocb) {
        qed_copy_from_backing_file_cb(copy_cb, -EIO);
    }
}

/**
 * Copy data from backing file into the image
 *
 * @s: QED state
 * @pos: Byte position in device
 * @len: Number of bytes
 * @offset: Byte offset in image file
 * @cb: Completion function
 * @opaque: User data for completion function
 */
static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
                                       uint64_t len, uint64_t offset,
                                       BlockDriverCompletionFunc *cb,
                                       void *opaque)
{
    CopyFromBackingFileCB *copy_cb;

    /* Skip copy entirely if there is no work to do */
    if (len == 0) {
        cb(opaque, 0);
        return;
    }

    copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
    copy_cb->s = s;
    copy_cb->offset = offset;
    copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
    copy_cb->iov.iov_len = len;
    qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);

    qed_read_backing_file(s, pos, &copy_cb->qiov,
                          qed_copy_from_backing_file_write, copy_cb);
}

/**
 * Link one or more contiguous clusters into a table
 *
 * @s: QED state
 * @table: L2 table
 * @index: First cluster index
 * @n: Number of contiguous clusters
 * @cluster: First cluster offset
 *
 * The cluster offset may be an allocated byte offset in the image file, the
 * zero cluster marker, or the unallocated cluster marker.
 */
static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
                                unsigned int n, uint64_t cluster)
{
    int i;
    for (i = index; i < index + n; i++) {
        table->offsets[i] = cluster;
        if (!qed_offset_is_unalloc_cluster(cluster) &&
            !qed_offset_is_zero_cluster(cluster)) {
            cluster += s->header.cluster_size;
        }
    }
}

static void qed_aio_complete_bh(void *opaque)
{
    QEDAIOCB *acb = opaque;
    BlockDriverCompletionFunc *cb = acb->common.cb;
    void *user_opaque = acb->common.opaque;
    int ret = acb->bh_ret;
    bool *finished = acb->finished;

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);

    /* Invoke callback */
    cb(user_opaque, ret);

    /* Signal cancel completion */
    if (finished) {
        *finished = true;
    }
}

static void qed_aio_complete(QEDAIOCB *acb, int ret)
{
    BDRVQEDState *s = acb_to_s(acb);

    trace_qed_aio_complete(s, acb, ret);

    /* Free resources */
    qemu_iovec_destroy(&acb->cur_qiov);
    qed_unref_l2_cache_entry(acb->request.l2_table);

    /* Arrange for a bh to invoke the completion function */
    acb->bh_ret = ret;
    acb->bh = qemu_bh_new(qed_aio_complete_bh, acb);
    qemu_bh_schedule(acb->bh);

    /* Start next allocating write request waiting behind this one. Note that
     * requests enqueue themselves when they first hit an unallocated cluster
     * but they wait until the entire request is finished before waking up the
     * next request in the queue. This ensures that we don't cycle through
     * requests multiple times but rather finish one at a time completely.
     */
    if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
        QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
        acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
        if (acb) {
            qed_aio_next_io(acb, 0);
        } else if (s->header.features & QED_F_NEED_CHECK) {
            qed_start_need_check_timer(s);
        }
    }
}

/**
 * Commit the current L2 table to the cache
 */
static void qed_commit_l2_update(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    CachedL2Table *l2_table = acb->request.l2_table;

    qed_commit_l2_cache_entry(&s->l2_cache, l2_table);

    /* This is guaranteed to succeed because we just committed the entry to the
     * cache.
     */
    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache,
                                                    l2_table->offset);
    assert(acb->request.l2_table != NULL);

    qed_aio_next_io(opaque, ret);
}

/**
 * Update L1 table with new L2 table offset and write it out
 */
static void qed_aio_write_l1_update(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    int index;

    if (ret) {
        qed_aio_complete(acb, ret);
        return;
    }

    index = qed_l1_index(s, acb->cur_pos);
    s->l1_table->offsets[index] = acb->request.l2_table->offset;

    qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
}

/**
 * Update L2 table with new cluster offsets and write them out
 */
static void qed_aio_write_l2_update(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
    int index;

    if (ret) {
        goto err;
    }

    if (need_alloc) {
        qed_unref_l2_cache_entry(acb->request.l2_table);
        acb->request.l2_table = qed_new_l2_table(s);
    }

    index = qed_l2_index(s, acb->cur_pos);
    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
                        acb->cur_cluster);

    if (need_alloc) {
        /* Write out the whole new L2 table */
        qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
                           qed_aio_write_l1_update, acb);
    } else {
        /* Write out only the updated part of the L2 table */
        qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
                           qed_aio_next_io, acb);
    }
    return;

err:
    qed_aio_complete(acb, ret);
}

/**
 * Flush new data clusters before updating the L2 table
 *
 * This flush is necessary when a backing file is in use. A crash during an
 * allocating write could result in empty clusters in the image. If the write
 * only touched a subregion of the cluster, then backing image sectors have
 * been lost in the untouched region. The solution is to flush after writing a
 * new data cluster and before updating the L2 table.
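 *
 * qed_aio_write_main() only routes requests through this function when a
 * backing file is attached; without one the L2 update runs directly.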
 */
static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);

    if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update, opaque)) {
        qed_aio_complete(acb, -EIO);
    }
}

/**
 * Write data to the image file
 */
static void qed_aio_write_main(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t offset = acb->cur_cluster +
                      qed_offset_into_cluster(s, acb->cur_pos);
    BlockDriverCompletionFunc *next_fn;
    BlockDriverAIOCB *file_acb;

    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);

    if (ret) {
        qed_aio_complete(acb, ret);
        return;
    }

    if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
        next_fn = qed_aio_next_io;
    } else {
        if (s->bs->backing_hd) {
            next_fn = qed_aio_write_flush_before_l2_update;
        } else {
            next_fn = qed_aio_write_l2_update;
        }
    }

    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
    file_acb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
                               &acb->cur_qiov,
                               acb->cur_qiov.size / BDRV_SECTOR_SIZE,
                               next_fn, acb);
    if (!file_acb) {
        qed_aio_complete(acb, -EIO);
    }
}

/**
 * Populate back untouched region of new data cluster
 */
static void qed_aio_write_postfill(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t start = acb->cur_pos + acb->cur_qiov.size;
    uint64_t len =
        qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
    uint64_t offset = acb->cur_cluster +
                      qed_offset_into_cluster(s, acb->cur_pos) +
                      acb->cur_qiov.size;

    if (ret) {
        qed_aio_complete(acb, ret);
        return;
    }

    trace_qed_aio_write_postfill(s, acb, start, len, offset);
    qed_copy_from_backing_file(s, start, len, offset,
                               qed_aio_write_main, acb);
}

/**
 * Populate front untouched region of new data cluster
 */
static void qed_aio_write_prefill(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
    uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);

    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
    qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
                               qed_aio_write_postfill, acb);
}

/**
 * Check if the QED_F_NEED_CHECK bit should be set during allocating write
 */
static bool qed_should_set_need_check(BDRVQEDState *s)
{
    /* The flush before L2 update path ensures consistency */
    if (s->bs->backing_hd) {
        return false;
    }

    return !(s->header.features & QED_F_NEED_CHECK);
}

/**
 * Write new data cluster
 *
 * @acb: Write request
 * @len: Length in bytes
 *
 * This path is taken when writing to previously unallocated clusters.
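 *
 * Allocating writes are serialized: requests queue on allocating_write_reqs
 * and only the head of the queue makes progress at a time.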
 */
static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
{
    BDRVQEDState *s = acb_to_s(acb);

    /* Cancel timer when the first allocating request comes in */
    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
        qed_cancel_need_check_timer(s);
    }

    /* Freeze this request if another allocating write is in progress */
    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
        QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
    }
    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
        s->allocating_write_reqs_plugged) {
        return; /* wait for existing request to finish */
    }

    acb->cur_nclusters = qed_bytes_to_clusters(s,
            qed_offset_into_cluster(s, acb->cur_pos) + len);
    acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
    qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);

    if (qed_should_set_need_check(s)) {
        s->header.features |= QED_F_NEED_CHECK;
        qed_write_header(s, qed_aio_write_prefill, acb);
    } else {
        qed_aio_write_prefill(acb, 0);
    }
}

/**
 * Write data cluster in place
 *
 * @acb: Write request
 * @offset: Cluster offset in bytes
 * @len: Length in bytes
 *
 * This path is taken when writing to already allocated clusters.
 */
static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
{
    /* Calculate the I/O vector */
    acb->cur_cluster = offset;
    qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);

    /* Do the actual write */
    qed_aio_write_main(acb, 0);
}

/**
 * Write data cluster
 *
 * @opaque: Write request
 * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
 *       or -errno
 * @offset: Cluster offset in bytes
 * @len: Length in bytes
 *
 * Callback from qed_find_cluster().
 */
static void qed_aio_write_data(void *opaque, int ret,
                               uint64_t offset, size_t len)
{
    QEDAIOCB *acb = opaque;

    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);

    acb->find_cluster_ret = ret;

    switch (ret) {
    case QED_CLUSTER_FOUND:
        qed_aio_write_inplace(acb, offset, len);
        break;

    case QED_CLUSTER_L2:
    case QED_CLUSTER_L1:
    case QED_CLUSTER_ZERO:
        qed_aio_write_alloc(acb, len);
        break;

    default:
        qed_aio_complete(acb, ret);
        break;
    }
}

/**
 * Read data cluster
 *
 * @opaque: Read request
 * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
 *       or -errno
 * @offset: Cluster offset in bytes
 * @len: Length in bytes
 *
 * Callback from qed_find_cluster().
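 *
 * Zero clusters are satisfied by zero-filling the I/O vector; clusters not
 * allocated in this image fall back to the backing file (or zeroes if there
 * is none).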
 */
static void qed_aio_read_data(void *opaque, int ret,
                              uint64_t offset, size_t len)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    BlockDriverState *bs = acb->common.bs;
    BlockDriverAIOCB *file_acb;

    /* Adjust offset into cluster */
    offset += qed_offset_into_cluster(s, acb->cur_pos);

    trace_qed_aio_read_data(s, acb, ret, offset, len);

    if (ret < 0) {
        goto err;
    }

    qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);

    /* Handle zero cluster and backing file reads */
    if (ret == QED_CLUSTER_ZERO) {
        qemu_iovec_memset(&acb->cur_qiov, 0, acb->cur_qiov.size);
        qed_aio_next_io(acb, 0);
        return;
    } else if (ret != QED_CLUSTER_FOUND) {
        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
                              qed_aio_next_io, acb);
        return;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
    file_acb = bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
                              &acb->cur_qiov,
                              acb->cur_qiov.size / BDRV_SECTOR_SIZE,
                              qed_aio_next_io, acb);
    if (!file_acb) {
        ret = -EIO;
        goto err;
    }
    return;

err:
    qed_aio_complete(acb, ret);
}

/**
 * Begin next I/O or complete the request
 */
static void qed_aio_next_io(void *opaque, int ret)
{
    QEDAIOCB *acb = opaque;
    BDRVQEDState *s = acb_to_s(acb);
    QEDFindClusterFunc *io_fn =
        acb->is_write ? qed_aio_write_data : qed_aio_read_data;

    trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);

    /* Handle I/O error */
    if (ret) {
        qed_aio_complete(acb, ret);
        return;
    }

    acb->qiov_offset += acb->cur_qiov.size;
    acb->cur_pos += acb->cur_qiov.size;
    qemu_iovec_reset(&acb->cur_qiov);

    /* Complete request */
    if (acb->cur_pos >= acb->end_pos) {
        qed_aio_complete(acb, 0);
        return;
    }

    /* Find next cluster and start I/O */
    qed_find_cluster(s, &acb->request,
                     acb->cur_pos, acb->end_pos - acb->cur_pos,
                     io_fn, acb);
}

static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
                                       int64_t sector_num,
                                       QEMUIOVector *qiov, int nb_sectors,
                                       BlockDriverCompletionFunc *cb,
                                       void *opaque, bool is_write)
{
    QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);

    trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
                        opaque, is_write);

    acb->is_write = is_write;
    acb->finished = NULL;
    acb->qiov = qiov;
    acb->qiov_offset = 0;
    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
    acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
    acb->request.l2_table = NULL;
    qemu_iovec_init(&acb->cur_qiov, qiov->niov);

    /* Start request */
    qed_aio_next_io(acb, 0);
    return &acb->common;
}

static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov, int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque)
{
    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, false);
}

static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
                                             int64_t sector_num,
                                             QEMUIOVector *qiov, int nb_sectors,
                                             BlockDriverCompletionFunc *cb,
                                             void *opaque)
{
    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, true);
}

static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque)
{
    return bdrv_aio_flush(bs->file, cb, opaque);
}

static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
{
    BDRVQEDState *s = bs->opaque;
    uint64_t old_image_size;
    int ret;

    if (!qed_is_image_size_valid(offset, s->header.cluster_size,
                                 s->header.table_size)) {
        return -EINVAL;
    }

    /* Shrinking is currently not supported */
    if ((uint64_t)offset < s->header.image_size) {
        return -ENOTSUP;
    }

    old_image_size = s->header.image_size;
    s->header.image_size = offset;
    ret = qed_write_header_sync(s);
    if (ret < 0) {
        s->header.image_size = old_image_size;
    }
    return ret;
}

static int64_t bdrv_qed_getlength(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;
    return s->header.image_size;
}

static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQEDState *s = bs->opaque;

    memset(bdi, 0, sizeof(*bdi));
    bdi->cluster_size = s->header.cluster_size;
    return 0;
}

static int bdrv_qed_change_backing_file(BlockDriverState *bs,
                                        const char *backing_file,
                                        const char *backing_fmt)
{
    BDRVQEDState *s = bs->opaque;
    QEDHeader new_header, le_header;
    void *buffer;
    size_t buffer_len, backing_file_len;
    int ret;

    /* Refuse to set backing filename if unknown compat feature bits are
     * active. If the image uses an unknown compat feature then we may not
     * know the layout of data following the header structure and cannot safely
     * add a new string.
     */
    if (backing_file && (s->header.compat_features &
                         ~QED_COMPAT_FEATURE_MASK)) {
        return -ENOTSUP;
    }

    memcpy(&new_header, &s->header, sizeof(new_header));

    new_header.features &= ~(QED_F_BACKING_FILE |
                             QED_F_BACKING_FORMAT_NO_PROBE);

    /* Adjust feature flags */
    if (backing_file) {
        new_header.features |= QED_F_BACKING_FILE;

        if (qed_fmt_is_raw(backing_fmt)) {
            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
        }
    }

    /* Calculate new header size */
    backing_file_len = 0;

    if (backing_file) {
        backing_file_len = strlen(backing_file);
    }

    buffer_len = sizeof(new_header);
    new_header.backing_filename_offset = buffer_len;
    new_header.backing_filename_size = backing_file_len;
    buffer_len += backing_file_len;

    /* Make sure we can rewrite header without failing */
    if (buffer_len > new_header.header_size * new_header.cluster_size) {
        return -ENOSPC;
    }

    /* Prepare new header */
    buffer = qemu_malloc(buffer_len);

    qed_header_cpu_to_le(&new_header, &le_header);
    memcpy(buffer, &le_header, sizeof(le_header));
    buffer_len = sizeof(le_header);

    memcpy(buffer + buffer_len, backing_file, backing_file_len);
    buffer_len += backing_file_len;

    /* Write new header */
    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
    qemu_free(buffer);
    if (ret == 0) {
        memcpy(&s->header, &new_header, sizeof(new_header));
    }
    return ret;
}

static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
{
    BDRVQEDState *s = bs->opaque;

    return qed_check(s, result, false);
}

static QEMUOptionParameter qed_create_options[] = {
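    /* These option names are matched by string in bdrv_qed_create() above */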
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size (in bytes)"
    }, {
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    }, {
        .name = BLOCK_OPT_BACKING_FMT,
        .type = OPT_STRING,
        .help = "Image format of the base image"
    }, {
        .name = BLOCK_OPT_CLUSTER_SIZE,
        .type = OPT_SIZE,
        .help = "Cluster size (in bytes)",
        .value = { .n = QED_DEFAULT_CLUSTER_SIZE },
    }, {
        .name = BLOCK_OPT_TABLE_SIZE,
        .type = OPT_SIZE,
        .help = "L1/L2 table size (in clusters)"
    },
    { /* end of list */ }
};

static BlockDriver bdrv_qed = {
    .format_name = "qed",
    .instance_size = sizeof(BDRVQEDState),
    .create_options = qed_create_options,

    .bdrv_probe = bdrv_qed_probe,
    .bdrv_open = bdrv_qed_open,
    .bdrv_close = bdrv_qed_close,
    .bdrv_create = bdrv_qed_create,
    .bdrv_flush = bdrv_qed_flush,
    .bdrv_is_allocated = bdrv_qed_is_allocated,
    .bdrv_make_empty = bdrv_qed_make_empty,
    .bdrv_aio_readv = bdrv_qed_aio_readv,
    .bdrv_aio_writev = bdrv_qed_aio_writev,
    .bdrv_aio_flush = bdrv_qed_aio_flush,
    .bdrv_truncate = bdrv_qed_truncate,
    .bdrv_getlength = bdrv_qed_getlength,
    .bdrv_get_info = bdrv_qed_get_info,
    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
    .bdrv_check = bdrv_qed_check,
};

static void bdrv_qed_init(void)
{
    bdrv_register(&bdrv_qed);
}

block_init(bdrv_qed_init);
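
/* Example usage (illustration only, not part of the driver): image creation
 * goes through bdrv_qed_create() and the option names defined in
 * qed_create_options above, e.g.:
 *
 *   qemu-img create -f qed disk.qed 16G
 *   qemu-img create -f qed -o backing_file=base.img,backing_fmt=raw disk.qed
 */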