/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#ifndef BLOCK_INT_H
#define BLOCK_INT_H

#include "block/accounting.h"
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/coroutine.h"
#include "qemu/stats64.h"
#include "qemu/timer.h"
#include "qemu/hbitmap.h"
#include "block/snapshot.h"
#include "qemu/main-loop.h"
#include "qemu/throttle.h"

#define BLOCK_FLAG_LAZY_REFCOUNTS 8

#define BLOCK_OPT_SIZE "size"
#define BLOCK_OPT_ENCRYPT "encryption"
#define BLOCK_OPT_ENCRYPT_FORMAT "encrypt.format"
#define BLOCK_OPT_COMPAT6 "compat6"
#define BLOCK_OPT_HWVERSION "hwversion"
#define BLOCK_OPT_BACKING_FILE "backing_file"
#define BLOCK_OPT_BACKING_FMT "backing_fmt"
#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
#define BLOCK_OPT_TABLE_SIZE "table_size"
#define BLOCK_OPT_PREALLOC "preallocation"
#define BLOCK_OPT_SUBFMT "subformat"
#define BLOCK_OPT_COMPAT_LEVEL "compat"
#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts"
#define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
#define BLOCK_OPT_REDUNDANCY "redundancy"
#define BLOCK_OPT_NOCOW "nocow"
#define BLOCK_OPT_OBJECT_SIZE "object_size"
#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits"

#define BLOCK_PROBE_BUF_SIZE 512

enum BdrvTrackedRequestType {
    BDRV_TRACKED_READ,
    BDRV_TRACKED_WRITE,
    BDRV_TRACKED_DISCARD,
};

typedef struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t offset;
    unsigned int bytes;
    enum BdrvTrackedRequestType type;

    bool serialising;
    int64_t overlap_offset;
    unsigned int overlap_bytes;

    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */

    struct BdrvTrackedRequest *waiting_for;
} BdrvTrackedRequest;

struct BlockDriver {
    const char *format_name;
    int instance_size;

    /* Set to true if the BlockDriver is a block filter. Block filters pass
     * certain callbacks that refer to data (see block.c) to their bs->file if
     * the driver doesn't implement them. Drivers that do not wish to forward
     * must implement them and return -ENOTSUP.
     */
    bool is_filter;
    /* For snapshots, block filters like Quorum can implement the
     * following recursive callback.
     * Its purpose is to recurse on the filter children while calling
     * bdrv_recurse_is_first_non_filter on them.
     * For a sample implementation, see the Quorum block filter.
     */
    bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
                                             BlockDriverState *candidate);

    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
    int (*bdrv_probe_device)(const char *filename);
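
    /*
     * Example (illustrative sketch, not part of this header): a format driver
     * that stores a 4-byte magic at the start of the image could implement
     * .bdrv_probe roughly as follows. The driver returning the highest score
     * wins format probing; 100 means a certain match. "EXMP" is a made-up
     * magic for a hypothetical driver.
     *
     *   static int example_probe(const uint8_t *buf, int buf_size,
     *                            const char *filename)
     *   {
     *       if (buf_size >= 4 && memcmp(buf, "EXMP", 4) == 0) {
     *           return 100;
     *       }
     *       return 0;
     *   }
     */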

    /* Any driver implementing this callback is expected to be able to handle
     * NULL file names in its .bdrv_open() implementation */
    void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
    /* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
     * this field set to true, except ones that are defined only by their
     * child's bs.
     * An example of the latter type is the quorum block driver.
     */
    bool bdrv_needs_filename;

    /* Set if a driver can support backing files */
    bool supports_backing;

    /* For handling image reopen for split or non-split files */
    int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp);
    void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
    void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
    void (*bdrv_join_options)(QDict *options, QDict *old_options);

    int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp);
    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp);
    void (*bdrv_close)(BlockDriverState *bs);
    int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp);
    int (*bdrv_make_empty)(BlockDriverState *bs);

    void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);

    /* aio */
    BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
    BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
    BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque);
    BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
        int64_t offset, int bytes,
        BlockCompletionFunc *cb, void *opaque);

    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);

    /**
     * @offset: position in bytes to read at
     * @bytes: number of bytes to read
     * @qiov: the buffers to fill with read data
     * @flags: currently unused, always 0
     *
     * @offset and @bytes will be a multiple of 'request_alignment',
     * but the length of individual @qiov elements does not have to
     * be a multiple.
     *
     * @bytes will always equal the total size of @qiov, and will be
     * no larger than 'max_transfer'.
     *
     * The buffer in @qiov may point directly to guest memory.
     */
    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
    /**
     * @offset: position in bytes to write at
     * @bytes: number of bytes to write
     * @qiov: the buffers containing data to write
     * @flags: zero or more bits allowed by 'supported_write_flags'
     *
     * @offset and @bytes will be a multiple of 'request_alignment',
     * but the length of individual @qiov elements does not have to
     * be a multiple.
     *
     * @bytes will always equal the total size of @qiov, and will be
     * no larger than 'max_transfer'.
     *
     * The buffer in @qiov may point directly to guest memory.
     */
    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
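
    /*
     * Example (illustrative sketch, not part of this header): a filter-like
     * driver whose data lives entirely in its bs->file child could implement
     * the byte-based callbacks by simply forwarding the request; the
     * wrapper bdrv_co_preadv() used here is declared later in this header
     * and takes the BdrvChild to read from.
     *
     *   static int coroutine_fn example_co_preadv(BlockDriverState *bs,
     *       uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
     *   {
     *       return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
     *   }
     */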

    /*
     * Efficiently zero a region of the disk image. Typically an image format
     * would use a compact metadata representation to implement this. This
     * function pointer may be NULL or return -ENOTSUP and .bdrv_co_writev()
     * will be called instead.
     */
    int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
        int64_t offset, int bytes, BdrvRequestFlags flags);
    int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
        int64_t offset, int bytes);

    /*
     * Building block for bdrv_block_status[_above] and
     * bdrv_is_allocated[_above]. The driver should answer only
     * according to the current layer, and should not set
     * BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW. See block.h
     * for the meaning of _DATA, _ZERO, and _OFFSET_VALID. The block
     * layer guarantees input aligned to request_alignment, as well as
     * non-NULL pnum and file.
     */
    int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum,
        BlockDriverState **file);

    /*
     * Invalidate any cached meta-data.
     */
    void (*bdrv_invalidate_cache)(BlockDriverState *bs, Error **errp);
    int (*bdrv_inactivate)(BlockDriverState *bs);

    /*
     * Flushes all data for all layers by calling bdrv_co_flush for underlying
     * layers, if needed. This function is needed for deterministic
     * synchronization of the flush finishing callback.
     */
    int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);

    /*
     * Flushes all data that was already written to the OS all the way down to
     * the disk (for example file-posix.c calls fsync()).
     */
    int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);

    /*
     * Flushes all internal caches to the OS. The data may still sit in a
     * writeback cache of the host OS, but it will survive a crash of the qemu
     * process.
     */
    int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);

    const char *protocol_name;
    int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset,
                         PreallocMode prealloc, Error **errp);

    int64_t (*bdrv_getlength)(BlockDriverState *bs);
    bool has_variable_length;
    int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
    BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs,
                                      Error **errp);

    int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov);

    int (*bdrv_snapshot_create)(BlockDriverState *bs,
                                QEMUSnapshotInfo *sn_info);
    int (*bdrv_snapshot_goto)(BlockDriverState *bs,
                              const char *snapshot_id);
    int (*bdrv_snapshot_delete)(BlockDriverState *bs,
                                const char *snapshot_id,
                                const char *name,
                                Error **errp);
    int (*bdrv_snapshot_list)(BlockDriverState *bs,
                              QEMUSnapshotInfo **psn_info);
    int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
                                  const char *snapshot_id,
                                  const char *name,
                                  Error **errp);
    int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
    ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs);

    int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
                                          QEMUIOVector *qiov,
                                          int64_t pos);
    int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
                                          QEMUIOVector *qiov,
                                          int64_t pos);

    int (*bdrv_change_backing_file)(BlockDriverState *bs,
        const char *backing_file, const char *backing_fmt);

    /* removable device specific */
    bool (*bdrv_is_inserted)(BlockDriverState *bs);
    void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
    void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);

    /* to control generic scsi devices */
    BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
                                  unsigned long int req, void *buf,
                                  BlockCompletionFunc *cb, void *opaque);
    int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs,
                                      unsigned long int req, void *buf);

    /* List of options for creating images, terminated by name == NULL */
    QemuOptsList *create_opts;
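
    /*
     * Example (illustrative sketch, not part of this header): a minimal
     * create_opts list offering only a size option might look like this;
     * the all-zero entry at the end is the name == NULL terminator.
     *
     *   static QemuOptsList example_create_opts = {
     *       .name = "example-create-opts",
     *       .head = QTAILQ_HEAD_INITIALIZER(example_create_opts.head),
     *       .desc = {
     *           {
     *               .name = BLOCK_OPT_SIZE,
     *               .type = QEMU_OPT_SIZE,
     *               .help = "Virtual disk size"
     *           },
     *           { }
     *       }
     *   };
     */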

    /*
     * Returns 0 for completed check, -errno for internal errors.
     * The check results are stored in result.
     */
    int (*bdrv_check)(BlockDriverState *bs, BdrvCheckResult *result,
                      BdrvCheckMode fix);

    int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts,
                              BlockDriverAmendStatusCB *status_cb,
                              void *cb_opaque);

    void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);

    /* TODO Better pass an option string/QDict/QemuOpts to add any rule? */
    int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
                                 const char *tag);
    int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
                                        const char *tag);
    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);

    void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);

    /*
     * Returns 1 if newly created images are guaranteed to contain only
     * zeros, 0 otherwise.
     */
    int (*bdrv_has_zero_init)(BlockDriverState *bs);

    /* Remove fd handlers, timers, and other event loop callbacks so the event
     * loop is no longer in use. Called with no in-flight requests and in
     * depth-first traversal order with parents before child nodes.
     */
    void (*bdrv_detach_aio_context)(BlockDriverState *bs);

    /* Add fd handlers, timers, and other event loop callbacks so I/O requests
     * can be processed again. Called with no in-flight requests and in
     * depth-first traversal order with child nodes before parent nodes.
     */
    void (*bdrv_attach_aio_context)(BlockDriverState *bs,
                                    AioContext *new_context);

    /* io queue for linux-aio */
    void (*bdrv_io_plug)(BlockDriverState *bs);
    void (*bdrv_io_unplug)(BlockDriverState *bs);

    /**
     * Try to get @bs's logical and physical block size.
     * On success, store them in @bsz and return zero.
     * On failure, return negative errno.
     */
    int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
    /**
     * Try to get @bs's geometry (cyls, heads, sectors).
     * On success, store them in @geo and return 0.
     * On failure return -errno.
     * Only drivers that want to override guest geometry implement this
     * callback; see hd_geometry_guess().
     */
    int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);

    /**
     * bdrv_co_drain_begin is called if implemented in the beginning of a
     * drain operation to drain and stop any internal sources of requests in
     * the driver.
     * bdrv_co_drain_end is called if implemented at the end of the drain.
     *
     * They should be used by the driver to e.g. manage scheduled I/O
     * requests, or toggle an internal state. After the end of the drain new
     * requests will continue normally.
     */
    void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs);
    void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
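
    /*
     * Example (illustrative sketch, not part of this header): a driver that
     * generates requests internally, say from a retry timer, could park that
     * source while drained. BDRVExampleState and its retry_timer are made up
     * for illustration.
     *
     *   static void coroutine_fn example_co_drain_begin(BlockDriverState *bs)
     *   {
     *       BDRVExampleState *s = bs->opaque;
     *       timer_del(s->retry_timer);
     *   }
     *
     *   static void coroutine_fn example_co_drain_end(BlockDriverState *bs)
     *   {
     *       BDRVExampleState *s = bs->opaque;
     *       timer_mod(s->retry_timer,
     *                 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 100);
     *   }
     */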

    void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
                           Error **errp);
    void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
                           Error **errp);

    /**
     * Informs the block driver that a permission change is intended. The
     * driver checks whether the change is permissible and may take other
     * preparations for the change (e.g. get file system locks). This operation
     * is always followed by a call to either .bdrv_set_perm or
     * .bdrv_abort_perm_update.
     *
     * Checks whether the requested set of cumulative permissions in @perm
     * can be granted for accessing @bs and whether no other users are using
     * permissions other than those given in @shared (both arguments take
     * BLK_PERM_* bitmasks).
     *
     * If both conditions are met, 0 is returned. Otherwise, -errno is returned
     * and errp is set to an error describing the conflict.
     */
    int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
                           uint64_t shared, Error **errp);

    /**
     * Called to inform the driver that the cumulative set of used permissions
     * for @bs has changed to @perm, and the set of sharable permissions to
     * @shared. The driver can use this to propagate changes to its children
     * (i.e. request permissions only if a parent actually needs them).
     *
     * This function is only invoked after bdrv_check_perm(), so block drivers
     * may rely on preparations made in their .bdrv_check_perm implementation.
     */
    void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared);

    /*
     * Called to inform the driver that after a previous bdrv_check_perm()
     * call, the permission update is not performed and any preparations made
     * for it (e.g. taken file locks) need to be undone.
     *
     * This function can be called even for nodes that never saw a
     * bdrv_check_perm() call. It is a no-op then.
     */
    void (*bdrv_abort_perm_update)(BlockDriverState *bs);

    /**
     * Returns in @nperm and @nshared the permissions that the driver for @bs
     * needs on its child @c, based on the cumulative permissions requested by
     * the parents in @parent_perm and @parent_shared.
     *
     * If @c is NULL, return the permissions for attaching a new child for the
     * given @role.
     *
     * If @reopen_queue is non-NULL, don't return the currently needed
     * permissions, but those that will be needed after applying the
     * @reopen_queue.
     */
    void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
                            const BdrvChildRole *role,
                            BlockReopenQueue *reopen_queue,
                            uint64_t parent_perm, uint64_t parent_shared,
                            uint64_t *nperm, uint64_t *nshared);
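
    /*
     * Example (illustrative sketch, not part of this header): a plain block
     * filter can often point this callback at the default helper declared
     * near the end of this header:
     *
     *   .bdrv_child_perm = bdrv_filter_default_perms,
     *
     * Format drivers with a file child and an optional backing child would
     * typically use bdrv_format_default_perms instead.
     */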

    /**
     * Bitmaps stored in the image should be marked as 'IN_USE' when the image
     * is reopened read-write; this handler should make that happen. On
     * success it should also clear the readonly field of the corresponding
     * BlockDirtyBitmaps.
     */
    int (*bdrv_reopen_bitmaps_rw)(BlockDriverState *bs, Error **errp);
    bool (*bdrv_can_store_new_dirty_bitmap)(BlockDriverState *bs,
                                            const char *name,
                                            uint32_t granularity,
                                            Error **errp);
    void (*bdrv_remove_persistent_dirty_bitmap)(BlockDriverState *bs,
                                                const char *name,
                                                Error **errp);

    /**
     * Register/unregister a buffer for I/O. For example, when the driver is
     * interested in knowing the memory areas that will later be used in iovs,
     * so that it can do IOMMU mapping with VFIO etc., in order to get better
     * performance. In the case of VFIO drivers, this callback is used to do
     * DMA mapping for hot buffers.
     */
    void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size);
    void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host);
    QLIST_ENTRY(BlockDriver) list;
};

typedef struct BlockLimits {
    /* Alignment requirement, in bytes, for offset/length of I/O
     * requests. Must be a power of 2 less than INT_MAX; defaults to
     * 1 for drivers with modern byte interfaces, and to 512
     * otherwise. */
    uint32_t request_alignment;

    /* Maximum number of bytes that can be discarded at once (since it
     * is signed, it must be < 2G, if set). Must be multiple of
     * pdiscard_alignment, but need not be power of 2. May be 0 if no
     * inherent 32-bit limit */
    int32_t max_pdiscard;

    /* Optimal alignment for discard requests in bytes. A power of 2
     * is best but not mandatory. Must be a multiple of
     * bl.request_alignment, and must be less than max_pdiscard if
     * that is set. May be 0 if bl.request_alignment is good enough */
    uint32_t pdiscard_alignment;

    /* Maximum number of bytes that can be zeroized at once (since it is
     * signed, it must be < 2G, if set). Must be multiple of
     * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */
    int32_t max_pwrite_zeroes;

    /* Optimal alignment for write zeroes requests in bytes. A power
     * of 2 is best but not mandatory. Must be a multiple of
     * bl.request_alignment, and must be less than max_pwrite_zeroes
     * if that is set. May be 0 if bl.request_alignment is good
     * enough */
    uint32_t pwrite_zeroes_alignment;

    /* Optimal transfer length in bytes. A power of 2 is best but not
     * mandatory. Must be a multiple of bl.request_alignment, or 0 if
     * no preferred size */
    uint32_t opt_transfer;

    /* Maximal transfer length in bytes. Need not be power of 2, but
     * must be multiple of opt_transfer and bl.request_alignment, or 0
     * for no 32-bit limit. For now, anything larger than INT_MAX is
     * clamped down. */
    uint32_t max_transfer;

    /* memory alignment, in bytes, so that no bounce buffer is needed */
    size_t min_mem_alignment;

    /* memory alignment, in bytes, for bounce buffer */
    size_t opt_mem_alignment;

    /* maximum number of iovec elements */
    int max_iov;
} BlockLimits;
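
/*
 * Example (illustrative sketch, not part of this header): a driver whose
 * backend only accepts 4 KiB aligned requests of at most 1 MiB could
 * advertise that by filling in bs->bl from its .bdrv_refresh_limits
 * callback:
 *
 *   static void example_refresh_limits(BlockDriverState *bs, Error **errp)
 *   {
 *       bs->bl.request_alignment = 4096;
 *       bs->bl.opt_mem_alignment = 4096;
 *       bs->bl.max_transfer = 1 * 1024 * 1024;
 *   }
 */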

typedef struct BdrvOpBlocker BdrvOpBlocker;

typedef struct BdrvAioNotifier {
    void (*attached_aio_context)(AioContext *new_context, void *opaque);
    void (*detach_aio_context)(void *opaque);

    void *opaque;
    bool deleted;

    QLIST_ENTRY(BdrvAioNotifier) list;
} BdrvAioNotifier;

struct BdrvChildRole {
    /* If true, bdrv_replace_node() doesn't change the node this BdrvChild
     * points to. */
    bool stay_at_node;

    void (*inherit_options)(int *child_flags, QDict *child_options,
                            int parent_flags, QDict *parent_options);

    void (*change_media)(BdrvChild *child, bool load);
    void (*resize)(BdrvChild *child);

    /* Returns a name that is supposedly more useful for human users than the
     * node name for identifying the node in question (in particular, a BB
     * name), or NULL if the parent can't provide a better name. */
    const char *(*get_name)(BdrvChild *child);

    /* Returns a malloced string that describes the parent of the child for a
     * human reader. This could be a node-name, BlockBackend name, qdev ID or
     * QOM path of the device owning the BlockBackend, job type and ID etc. The
     * caller is responsible for freeing the memory. */
    char *(*get_parent_desc)(BdrvChild *child);

    /*
     * If this pair of functions is implemented, the parent doesn't issue new
     * requests after returning from .drained_begin() until .drained_end() is
     * called.
     *
     * Note that this can be nested. If drained_begin() was called twice, new
     * I/O is allowed only after drained_end() was called twice, too.
     */
    void (*drained_begin)(BdrvChild *child);
    void (*drained_end)(BdrvChild *child);

    /* Notifies the parent that the child has been activated/inactivated (e.g.
     * when migration is completing) and it can start/stop requesting
     * permissions and doing I/O on it. */
    void (*activate)(BdrvChild *child, Error **errp);
    int (*inactivate)(BdrvChild *child);

    void (*attach)(BdrvChild *child);
    void (*detach)(BdrvChild *child);

    /* Notifies the parent that the filename of its child has changed (e.g.
     * because the direct child was removed from the backing chain), so that it
     * can update its reference. */
    int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
                           const char *filename, Error **errp);
};

extern const BdrvChildRole child_file;
extern const BdrvChildRole child_format;
extern const BdrvChildRole child_backing;

struct BdrvChild {
    BlockDriverState *bs;
    char *name;
    const BdrvChildRole *role;
    void *opaque;

    /**
     * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask)
     */
    uint64_t perm;

    /**
     * Permissions that can still be granted to other users of @bs while this
     * BdrvChild is still attached to it. (BLK_PERM_* bitmask)
     */
    uint64_t shared_perm;

    QLIST_ENTRY(BdrvChild) next;
    QLIST_ENTRY(BdrvChild) next_parent;
};

/*
 * Note: the function bdrv_append() copies and swaps contents of
 * BlockDriverStates, so if you add new fields to this struct, please
 * inspect bdrv_append() to determine if the new fields need to be
 * copied as well.
 */
struct BlockDriverState {
    /* Protected by big QEMU lock or read-only after opening. No special
     * locking needed during I/O...
     */
    int open_flags; /* flags used to open the file, re-used for re-open */
    bool read_only; /* if true, the media is read only */
    bool encrypted; /* if true, the media is encrypted */
    bool sg;        /* if true, the device is a /dev/sg* */
    bool probed;    /* if true, format was probed rather than specified */
    bool force_share; /* if true, always allow all shared permissions */
    bool implicit;  /* if true, this filter node was automatically inserted */

    BlockDriver *drv; /* NULL means no media */
    void *opaque;

    AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
    /* long-running tasks intended to always use the same AioContext as this
     * BDS may register themselves in this list to be notified of changes
     * regarding this BDS's context */
    QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
    bool walking_aio_notifiers; /* to make removal during iteration safe */

    char filename[PATH_MAX];
    char backing_file[PATH_MAX]; /* if non-zero, the image is a diff of
                                    this file image */
    char backing_format[16]; /* if non-zero and backing_file exists */

    QDict *full_open_options;
    char exact_filename[PATH_MAX];

    BdrvChild *backing;
    BdrvChild *file;

    /* I/O Limits */
    BlockLimits bl;

    /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
    unsigned int supported_write_flags;
    /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
     * BDRV_REQ_MAY_UNMAP) */
    unsigned int supported_zero_flags;

    /* the following member gives a name to every node on the bs graph. */
    char node_name[32];
    /* element of the list of named nodes building the graph */
    QTAILQ_ENTRY(BlockDriverState) node_list;
    /* element of the list of all BlockDriverStates (all_bdrv_states) */
    QTAILQ_ENTRY(BlockDriverState) bs_list;
    /* element of the list of monitor-owned BDS */
    QTAILQ_ENTRY(BlockDriverState) monitor_list;
    int refcnt;

    /* operation blockers */
    QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];

    /* long-running background operation */
    BlockJob *job;

    /* The node that this node inherited default options from (and a reopen on
     * which can affect this node by changing these defaults). This is always a
     * parent node of this node. */
    BlockDriverState *inherits_from;
    QLIST_HEAD(, BdrvChild) children;
    QLIST_HEAD(, BdrvChild) parents;

    QDict *options;
    QDict *explicit_options;
    BlockdevDetectZeroesOptions detect_zeroes;

    /* The error object in use for blocking operations on backing_hd */
    Error *backing_blocker;

    /* Protected by AioContext lock */

    /* If we are reading a disk image, give its size in sectors.
     * Generally read-only; it is written to by load_snapshot and
     * save_snapshot, but the block layer is quiescent during those.
     */
    int64_t total_sectors;

    /* Callback before write request is processed */
    NotifierWithReturnList before_write_notifiers;

    /* threshold limit for writes, in bytes. "High water mark". */
    uint64_t write_threshold_offset;
    NotifierWithReturn write_threshold_notifier;

    /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
     * Reading from the list can be done with either the BQL or the
     * dirty_bitmap_mutex. Modifying a bitmap only requires
     * dirty_bitmap_mutex. */
    QemuMutex dirty_bitmap_mutex;
    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;

    /* Offset after the highest byte written to */
    Stat64 wr_highest_offset;

    /* If true, copy read backing sectors into image. Can be >1 if more
     * than one client has requested copy-on-read. Accessed with atomic
     * ops.
     */
    int copy_on_read;

    /* number of in-flight requests; overall and serialising.
     * Accessed with atomic ops.
     */
    unsigned int in_flight;
    unsigned int serialising_in_flight;

    /* Internal to BDRV_POLL_WHILE and bdrv_wakeup. Accessed with atomic
     * ops.
     */
    bool wakeup;

    /* counter for nested bdrv_io_plug.
     * Accessed with atomic ops.
     */
    unsigned io_plugged;

    /* do we need to tell the guest if we have a volatile write cache? */
    int enable_write_cache;

    /* Accessed with atomic ops. */
    int quiesce_counter;
    int recursive_quiesce_counter;

    unsigned int write_gen; /* Current data generation */

    /* Protected by reqs_lock. */
    CoMutex reqs_lock;
    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
    CoQueue flush_queue; /* Serializing flush queue */
    bool active_flush_req; /* Flush request in flight? */

    /* Only read/written by whoever has set active_flush_req to true. */
    unsigned int flushed_gen; /* Flushed write generation */
};

struct BlockBackendRootState {
    int open_flags;
    bool read_only;
    BlockdevDetectZeroesOptions detect_zeroes;
};

typedef enum BlockMirrorBackingMode {
    /* Reuse the existing backing chain from the source for the target.
     * - sync=full: Set backing BDS to NULL.
     * - sync=top: Use source's backing BDS.
     * - sync=none: Use source as the backing BDS. */
    MIRROR_SOURCE_BACKING_CHAIN,

    /* Open the target's backing chain completely anew */
    MIRROR_OPEN_BACKING_CHAIN,

    /* Do not change the target's backing BDS after job completion */
    MIRROR_LEAVE_BACKING_CHAIN,
} BlockMirrorBackingMode;

static inline BlockDriverState *backing_bs(BlockDriverState *bs)
{
    return bs->backing ? bs->backing->bs : NULL;
}
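
/*
 * Example (illustrative sketch, not part of this header): backing_bs() is
 * handy for walking a backing chain, e.g. to count its length:
 *
 *   int chain_length = 0;
 *   BlockDriverState *p;
 *   for (p = bs; p; p = backing_bs(p)) {
 *       chain_length++;
 *   }
 */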

/* Essential block drivers which must always be statically linked into qemu, and
 * which therefore can be accessed without using bdrv_find_format() */
extern BlockDriver bdrv_file;
extern BlockDriver bdrv_raw;
extern BlockDriver bdrv_qcow2;

int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);

int get_tmp_filename(char *filename, int size);
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename);

void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
                                      QDict *options);


/**
 * bdrv_add_before_write_notifier:
 *
 * Register a callback that is invoked before write requests are processed but
 * after any throttling or waiting for overlapping requests.
 */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier);

/**
 * bdrv_detach_aio_context:
 *
 * May be called from .bdrv_detach_aio_context() to detach children from the
 * current #AioContext. This is only needed by block drivers that manage their
 * own children. Both ->file and ->backing are automatically handled and
 * block drivers should not call this function on them explicitly.
 */
void bdrv_detach_aio_context(BlockDriverState *bs);

/**
 * bdrv_attach_aio_context:
 *
 * May be called from .bdrv_attach_aio_context() to attach children to the new
 * #AioContext. This is only needed by block drivers that manage their own
 * children. Both ->file and ->backing are automatically handled and block
 * drivers should not call this function on them explicitly.
 */
void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context);

/**
 * bdrv_add_aio_context_notifier:
 *
 * If a long-running job intends to be always run in the same AioContext as a
 * certain BDS, it may use this function to be notified of changes regarding the
 * association of the BDS to an AioContext.
 *
 * attached_aio_context() is called after the target BDS has been attached to a
 * new AioContext; detach_aio_context() is called before the target BDS is being
 * detached from its old AioContext.
 */
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque);

/**
 * bdrv_remove_aio_context_notifier:
 *
 * Unsubscribe from change notifications regarding the BDS's AioContext. The
 * parameters given here have to be the same as those given to
 * bdrv_add_aio_context_notifier().
 */
void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*aio_context_attached)(AioContext *,
                                                                   void *),
                                      void (*aio_context_detached)(void *),
                                      void *opaque);
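
/*
 * Example (illustrative sketch, not part of this header): a long-running user
 * could keep its own AioContext pointer up to date like this; the callbacks
 * and the MyJob type are hypothetical.
 *
 *   static void my_attached_aio_context(AioContext *new_context, void *opaque)
 *   {
 *       MyJob *job = opaque;
 *       job->ctx = new_context;
 *   }
 *
 *   static void my_detach_aio_context(void *opaque)
 *   {
 *       MyJob *job = opaque;
 *       job->ctx = NULL;
 *   }
 *
 *   bdrv_add_aio_context_notifier(bs, my_attached_aio_context,
 *                                 my_detach_aio_context, job);
 */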

/**
 * bdrv_wakeup:
 * @bs: The BlockDriverState for which an I/O operation has been completed.
 *
 * Wake up the main thread if it is waiting on BDRV_POLL_WHILE. During
 * synchronous I/O on a BlockDriverState that is attached to another
 * I/O thread, the main thread lets the I/O thread's event loop run,
 * waiting for the I/O operation to complete. A bdrv_wakeup will wake
 * up the main thread if necessary.
 *
 * Manual calls to bdrv_wakeup are rarely necessary, because
 * bdrv_dec_in_flight already calls it.
 */
void bdrv_wakeup(BlockDriverState *bs);

#ifdef _WIN32
int is_windows_drive(const char *filename);
#endif

/**
 * stream_start:
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
 * @bs: Block device to operate on.
 * @base: Block device that will become the new base, or %NULL to
 * flatten the whole backing file chain onto @bs.
 * @backing_file_str: The file name that will be written to @bs as the
 * new backing file if the job completes. Ignored if @base is %NULL.
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 * @on_error: The action to take upon error.
 * @errp: Error object.
 *
 * Start a streaming operation on @bs. Clusters that are unallocated
 * in @bs, but allocated in any image between @base and @bs (both
 * exclusive) will be written to @bs. At the end of a successful
 * streaming job, the backing file of @bs will be changed to
 * @backing_file_str in the written image and to @base in the live
 * BlockDriverState.
 */
void stream_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, const char *backing_file_str,
                  int64_t speed, BlockdevOnError on_error, Error **errp);
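
/*
 * Example (illustrative sketch, not part of this header): flattening the
 * whole backing chain of bs into bs at an unlimited rate might look like:
 *
 *   Error *local_err = NULL;
 *   stream_start("stream0", bs, NULL, NULL, 0,
 *                BLOCKDEV_ON_ERROR_REPORT, &local_err);
 */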

/**
 * commit_start:
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
 * @bs: Active block device.
 * @top: Top block device to be committed.
 * @base: Block device that will be written into, and become the new top.
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 * @on_error: The action to take upon error.
 * @backing_file_str: String to use as the backing file in @top's overlay
 * @filter_node_name: The node name that should be assigned to the filter
 * driver that the commit job inserts into the graph above @top. NULL means
 * that a node name should be autogenerated.
 * @errp: Error object.
 *
 */
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  const char *filter_node_name, Error **errp);
/**
 * commit_active_start:
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
 * @bs: Active block device to be committed.
 * @base: Block device that will be written into, and become the new top.
 * @creation_flags: Flags that control the behavior of the Job lifetime.
 * See @BlockJobCreateFlags
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 * @on_error: The action to take upon error.
 * @filter_node_name: The node name that should be assigned to the filter
 * driver that the commit job inserts into the graph above @bs. NULL means that
 * a node name should be autogenerated.
 * @cb: Completion function for the job.
 * @opaque: Opaque pointer value passed to @cb.
 * @auto_complete: Auto complete the job.
 * @errp: Error object.
 *
 */
void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int creation_flags,
                         int64_t speed, BlockdevOnError on_error,
                         const char *filter_node_name,
                         BlockCompletionFunc *cb, void *opaque,
                         bool auto_complete, Error **errp);
/*
 * mirror_start:
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
 * @bs: Block device to operate on.
 * @target: Block device to write to.
 * @replaces: Block graph node name to replace once the mirror is done. Can
 *            only be used when full mirroring is selected.
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 * @granularity: The chosen granularity for the dirty bitmap.
 * @buf_size: The amount of data that can be in flight at one time.
 * @mode: Whether to collapse all images in the chain to the target.
 * @backing_mode: How to establish the target's backing chain after completion.
 * @on_source_error: The action to take upon error reading from the source.
 * @on_target_error: The action to take upon error writing to the target.
 * @unmap: Whether to unmap target where source sectors only contain zeroes.
 * @filter_node_name: The node name that should be assigned to the filter
 * driver that the mirror job inserts into the graph above @bs. NULL means that
 * a node name should be autogenerated.
 * @errp: Error object.
 *
 * Start a mirroring operation on @bs. Clusters that are allocated
 * in @bs will be written to @target until the job is cancelled or
 * manually completed. At the end of a successful mirroring job,
 * @bs will be switched to read from @target.
 */
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap, const char *filter_node_name, Error **errp);

/*
 * backup_job_create:
 * @job_id: The id of the newly-created job, or %NULL to use the
 * device name of @bs.
 * @bs: Block device to operate on.
 * @target: Block device to write to.
 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 * @sync_mode: What parts of the disk image should be copied to the destination.
 * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
 * @on_source_error: The action to take upon error reading from the source.
 * @on_target_error: The action to take upon error writing to the target.
 * @creation_flags: Flags that control the behavior of the Job lifetime.
 * See @BlockJobCreateFlags
 * @cb: Completion function for the job.
 * @opaque: Opaque pointer value passed to @cb.
 * @txn: Transaction that this job is part of (may be NULL).
 *
 * Create a backup operation on @bs. Clusters in @bs are written to @target
 * until the job is cancelled or manually completed.
 */
BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
                            BlockDriverState *target, int64_t speed,
                            MirrorSyncMode sync_mode,
                            BdrvDirtyBitmap *sync_bitmap,
                            bool compress,
                            BlockdevOnError on_source_error,
                            BlockdevOnError on_target_error,
                            int creation_flags,
                            BlockCompletionFunc *cb, void *opaque,
                            BlockJobTxn *txn, Error **errp);

void hmp_drive_add_node(Monitor *mon, const char *optstr);

BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                  const char *child_name,
                                  const BdrvChildRole *child_role,
                                  uint64_t perm, uint64_t shared_perm,
                                  void *opaque, Error **errp);
void bdrv_root_unref_child(BdrvChild *child);

int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
                            Error **errp);

/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
 * block filters: Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED and RESIZE to
 * all children */
void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
                               const BdrvChildRole *role,
                               BlockReopenQueue *reopen_queue,
                               uint64_t perm, uint64_t shared,
                               uint64_t *nperm, uint64_t *nshared);

/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
 * (non-raw) image formats: Like above for bs->backing, but for bs->file it
 * requires WRITE | RESIZE for read-write images, always requires
 * CONSISTENT_READ and doesn't share WRITE. */
void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
                               const BdrvChildRole *role,
                               BlockReopenQueue *reopen_queue,
                               uint64_t perm, uint64_t shared,
                               uint64_t *nperm, uint64_t *nshared);

/*
 * Default implementation for drivers to pass bdrv_co_get_block_status() to
 * their file.
 */
int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
                                                        int64_t sector_num,
                                                        int nb_sectors,
                                                        int *pnum,
                                                        BlockDriverState **file);
/*
 * Default implementation for drivers to pass bdrv_co_get_block_status() to
 * their backing file.
 */
int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
                                                           int64_t sector_num,
                                                           int nb_sectors,
                                                           int *pnum,
                                                           BlockDriverState **file);
const char *bdrv_get_parent_name(const BlockDriverState *bs);
void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
bool blk_dev_has_removable_media(BlockBackend *blk);
bool blk_dev_has_tray(BlockBackend *blk);
void blk_dev_eject_request(BlockBackend *blk, bool force);
bool blk_dev_is_tray_open(BlockBackend *blk);
bool blk_dev_is_medium_locked(BlockBackend *blk);

void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);

void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);

void bdrv_inc_in_flight(BlockDriverState *bs);
void bdrv_dec_in_flight(BlockDriverState *bs);

void blockdev_close_all_bdrv_states(void);

#endif /* BLOCK_INT_H */