1 /* 2 * QEMU System Emulator block driver 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #ifndef BLOCK_INT_H 25 #define BLOCK_INT_H 26 27 #include "block/accounting.h" 28 #include "block/block.h" 29 #include "qemu/option.h" 30 #include "qemu/queue.h" 31 #include "qemu/coroutine.h" 32 #include "qemu/timer.h" 33 #include "qapi-types.h" 34 #include "qemu/hbitmap.h" 35 #include "block/snapshot.h" 36 #include "qemu/main-loop.h" 37 #include "qemu/throttle.h" 38 39 #define BLOCK_FLAG_ENCRYPT 1 40 #define BLOCK_FLAG_LAZY_REFCOUNTS 8 41 42 #define BLOCK_OPT_SIZE "size" 43 #define BLOCK_OPT_ENCRYPT "encryption" 44 #define BLOCK_OPT_COMPAT6 "compat6" 45 #define BLOCK_OPT_HWVERSION "hwversion" 46 #define BLOCK_OPT_BACKING_FILE "backing_file" 47 #define BLOCK_OPT_BACKING_FMT "backing_fmt" 48 #define BLOCK_OPT_CLUSTER_SIZE "cluster_size" 49 #define BLOCK_OPT_TABLE_SIZE "table_size" 50 #define BLOCK_OPT_PREALLOC "preallocation" 51 #define BLOCK_OPT_SUBFMT "subformat" 52 #define BLOCK_OPT_COMPAT_LEVEL "compat" 53 #define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts" 54 #define BLOCK_OPT_ADAPTER_TYPE "adapter_type" 55 #define BLOCK_OPT_REDUNDANCY "redundancy" 56 #define BLOCK_OPT_NOCOW "nocow" 57 #define BLOCK_OPT_OBJECT_SIZE "object_size" 58 #define BLOCK_OPT_REFCOUNT_BITS "refcount_bits" 59 60 #define BLOCK_PROBE_BUF_SIZE 512 61 62 enum BdrvTrackedRequestType { 63 BDRV_TRACKED_READ, 64 BDRV_TRACKED_WRITE, 65 BDRV_TRACKED_DISCARD, 66 }; 67 68 typedef struct BdrvTrackedRequest { 69 BlockDriverState *bs; 70 int64_t offset; 71 unsigned int bytes; 72 enum BdrvTrackedRequestType type; 73 74 bool serialising; 75 int64_t overlap_offset; 76 unsigned int overlap_bytes; 77 78 QLIST_ENTRY(BdrvTrackedRequest) list; 79 Coroutine *co; /* owner, used for deadlock detection */ 80 CoQueue wait_queue; /* coroutines blocked on this request */ 81 82 struct BdrvTrackedRequest *waiting_for; 83 } BdrvTrackedRequest; 84 85 struct BlockDriver { 86 const char *format_name; 87 int instance_size; 88 89 /* set to true if the BlockDriver is a block filter */ 90 bool is_filter; 91 /* for snapshots block filter like Quorum can implement the 92 * following recursive callback. 93 * It's purpose is to recurse on the filter children while calling 94 * bdrv_recurse_is_first_non_filter on them. 95 * For a sample implementation look in the future Quorum block filter. 96 */ 97 bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs, 98 BlockDriverState *candidate); 99 100 int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename); 101 int (*bdrv_probe_device)(const char *filename); 102 103 /* Any driver implementing this callback is expected to be able to handle 104 * NULL file names in its .bdrv_open() implementation */ 105 void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp); 106 /* Drivers not implementing bdrv_parse_filename nor bdrv_open should have 107 * this field set to true, except ones that are defined only by their 108 * child's bs. 109 * An example of the last type will be the quorum block driver. 110 */ 111 bool bdrv_needs_filename; 112 113 /* Set if a driver can support backing files */ 114 bool supports_backing; 115 116 /* For handling image reopen for split or non-split files */ 117 int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state, 118 BlockReopenQueue *queue, Error **errp); 119 void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state); 120 void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state); 121 void (*bdrv_join_options)(QDict *options, QDict *old_options); 122 123 int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags, 124 Error **errp); 125 int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags, 126 Error **errp); 127 void (*bdrv_close)(BlockDriverState *bs); 128 int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp); 129 int (*bdrv_set_key)(BlockDriverState *bs, const char *key); 130 int (*bdrv_make_empty)(BlockDriverState *bs); 131 132 void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options); 133 134 /* aio */ 135 BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs, 136 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 137 BlockCompletionFunc *cb, void *opaque); 138 BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs, 139 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 140 BlockCompletionFunc *cb, void *opaque); 141 BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, 142 BlockCompletionFunc *cb, void *opaque); 143 BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs, 144 int64_t offset, int count, 145 BlockCompletionFunc *cb, void *opaque); 146 147 int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs, 148 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); 149 int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs, 150 uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); 151 int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, 152 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); 153 int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs, 154 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags); 155 int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs, 156 uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); 157 158 /* 159 * Efficiently zero a region of the disk image. Typically an image format 160 * would use a compact metadata representation to implement this. This 161 * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev() 162 * will be called instead. 163 */ 164 int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, 165 int64_t offset, int count, BdrvRequestFlags flags); 166 int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, 167 int64_t offset, int count); 168 169 /* 170 * Building block for bdrv_block_status[_above]. The driver should 171 * answer only according to the current layer, and should not 172 * set BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW. See block.h 173 * for the meaning of _DATA, _ZERO, and _OFFSET_VALID. 174 */ 175 int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs, 176 int64_t sector_num, int nb_sectors, int *pnum, 177 BlockDriverState **file); 178 179 /* 180 * Invalidate any cached meta-data. 181 */ 182 void (*bdrv_invalidate_cache)(BlockDriverState *bs, Error **errp); 183 int (*bdrv_inactivate)(BlockDriverState *bs); 184 185 /* 186 * Flushes all data for all layers by calling bdrv_co_flush for underlying 187 * layers, if needed. This function is needed for deterministic 188 * synchronization of the flush finishing callback. 189 */ 190 int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs); 191 192 /* 193 * Flushes all data that was already written to the OS all the way down to 194 * the disk (for example file-posix.c calls fsync()). 195 */ 196 int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); 197 198 /* 199 * Flushes all internal caches to the OS. The data may still sit in a 200 * writeback cache of the host OS, but it will survive a crash of the qemu 201 * process. 202 */ 203 int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs); 204 205 const char *protocol_name; 206 int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset, Error **errp); 207 208 int64_t (*bdrv_getlength)(BlockDriverState *bs); 209 bool has_variable_length; 210 int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs); 211 212 int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs, 213 uint64_t offset, uint64_t bytes, QEMUIOVector *qiov); 214 215 int (*bdrv_snapshot_create)(BlockDriverState *bs, 216 QEMUSnapshotInfo *sn_info); 217 int (*bdrv_snapshot_goto)(BlockDriverState *bs, 218 const char *snapshot_id); 219 int (*bdrv_snapshot_delete)(BlockDriverState *bs, 220 const char *snapshot_id, 221 const char *name, 222 Error **errp); 223 int (*bdrv_snapshot_list)(BlockDriverState *bs, 224 QEMUSnapshotInfo **psn_info); 225 int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs, 226 const char *snapshot_id, 227 const char *name, 228 Error **errp); 229 int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi); 230 ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs); 231 232 int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs, 233 QEMUIOVector *qiov, 234 int64_t pos); 235 int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs, 236 QEMUIOVector *qiov, 237 int64_t pos); 238 239 int (*bdrv_change_backing_file)(BlockDriverState *bs, 240 const char *backing_file, const char *backing_fmt); 241 242 /* removable device specific */ 243 bool (*bdrv_is_inserted)(BlockDriverState *bs); 244 int (*bdrv_media_changed)(BlockDriverState *bs); 245 void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); 246 void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked); 247 248 /* to control generic scsi devices */ 249 BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs, 250 unsigned long int req, void *buf, 251 BlockCompletionFunc *cb, void *opaque); 252 int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs, 253 unsigned long int req, void *buf); 254 255 /* List of options for creating images, terminated by name == NULL */ 256 QemuOptsList *create_opts; 257 258 /* 259 * Returns 0 for completed check, -errno for internal errors. 260 * The check results are stored in result. 261 */ 262 int (*bdrv_check)(BlockDriverState *bs, BdrvCheckResult *result, 263 BdrvCheckMode fix); 264 265 int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts, 266 BlockDriverAmendStatusCB *status_cb, 267 void *cb_opaque); 268 269 void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event); 270 271 /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */ 272 int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event, 273 const char *tag); 274 int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs, 275 const char *tag); 276 int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag); 277 bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag); 278 279 void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp); 280 281 /* 282 * Returns 1 if newly created images are guaranteed to contain only 283 * zeros, 0 otherwise. 284 */ 285 int (*bdrv_has_zero_init)(BlockDriverState *bs); 286 287 /* Remove fd handlers, timers, and other event loop callbacks so the event 288 * loop is no longer in use. Called with no in-flight requests and in 289 * depth-first traversal order with parents before child nodes. 290 */ 291 void (*bdrv_detach_aio_context)(BlockDriverState *bs); 292 293 /* Add fd handlers, timers, and other event loop callbacks so I/O requests 294 * can be processed again. Called with no in-flight requests and in 295 * depth-first traversal order with child nodes before parent nodes. 296 */ 297 void (*bdrv_attach_aio_context)(BlockDriverState *bs, 298 AioContext *new_context); 299 300 /* io queue for linux-aio */ 301 void (*bdrv_io_plug)(BlockDriverState *bs); 302 void (*bdrv_io_unplug)(BlockDriverState *bs); 303 304 /** 305 * Try to get @bs's logical and physical block size. 306 * On success, store them in @bsz and return zero. 307 * On failure, return negative errno. 308 */ 309 int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz); 310 /** 311 * Try to get @bs's geometry (cyls, heads, sectors) 312 * On success, store them in @geo and return 0. 313 * On failure return -errno. 314 * Only drivers that want to override guest geometry implement this 315 * callback; see hd_geometry_guess(). 316 */ 317 int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo); 318 319 /** 320 * Drain and stop any internal sources of requests in the driver, and 321 * remain so until next I/O callback (e.g. bdrv_co_writev) is called. 322 */ 323 void (*bdrv_drain)(BlockDriverState *bs); 324 325 void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child, 326 Error **errp); 327 void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child, 328 Error **errp); 329 330 /** 331 * Informs the block driver that a permission change is intended. The 332 * driver checks whether the change is permissible and may take other 333 * preparations for the change (e.g. get file system locks). This operation 334 * is always followed either by a call to either .bdrv_set_perm or 335 * .bdrv_abort_perm_update. 336 * 337 * Checks whether the requested set of cumulative permissions in @perm 338 * can be granted for accessing @bs and whether no other users are using 339 * permissions other than those given in @shared (both arguments take 340 * BLK_PERM_* bitmasks). 341 * 342 * If both conditions are met, 0 is returned. Otherwise, -errno is returned 343 * and errp is set to an error describing the conflict. 344 */ 345 int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm, 346 uint64_t shared, Error **errp); 347 348 /** 349 * Called to inform the driver that the set of cumulative set of used 350 * permissions for @bs has changed to @perm, and the set of sharable 351 * permission to @shared. The driver can use this to propagate changes to 352 * its children (i.e. request permissions only if a parent actually needs 353 * them). 354 * 355 * This function is only invoked after bdrv_check_perm(), so block drivers 356 * may rely on preparations made in their .bdrv_check_perm implementation. 357 */ 358 void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared); 359 360 /* 361 * Called to inform the driver that after a previous bdrv_check_perm() 362 * call, the permission update is not performed and any preparations made 363 * for it (e.g. taken file locks) need to be undone. 364 * 365 * This function can be called even for nodes that never saw a 366 * bdrv_check_perm() call. It is a no-op then. 367 */ 368 void (*bdrv_abort_perm_update)(BlockDriverState *bs); 369 370 /** 371 * Returns in @nperm and @nshared the permissions that the driver for @bs 372 * needs on its child @c, based on the cumulative permissions requested by 373 * the parents in @parent_perm and @parent_shared. 374 * 375 * If @c is NULL, return the permissions for attaching a new child for the 376 * given @role. 377 */ 378 void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c, 379 const BdrvChildRole *role, 380 uint64_t parent_perm, uint64_t parent_shared, 381 uint64_t *nperm, uint64_t *nshared); 382 383 QLIST_ENTRY(BlockDriver) list; 384 }; 385 386 typedef struct BlockLimits { 387 /* Alignment requirement, in bytes, for offset/length of I/O 388 * requests. Must be a power of 2 less than INT_MAX; defaults to 389 * 1 for drivers with modern byte interfaces, and to 512 390 * otherwise. */ 391 uint32_t request_alignment; 392 393 /* Maximum number of bytes that can be discarded at once (since it 394 * is signed, it must be < 2G, if set). Must be multiple of 395 * pdiscard_alignment, but need not be power of 2. May be 0 if no 396 * inherent 32-bit limit */ 397 int32_t max_pdiscard; 398 399 /* Optimal alignment for discard requests in bytes. A power of 2 400 * is best but not mandatory. Must be a multiple of 401 * bl.request_alignment, and must be less than max_pdiscard if 402 * that is set. May be 0 if bl.request_alignment is good enough */ 403 uint32_t pdiscard_alignment; 404 405 /* Maximum number of bytes that can zeroized at once (since it is 406 * signed, it must be < 2G, if set). Must be multiple of 407 * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */ 408 int32_t max_pwrite_zeroes; 409 410 /* Optimal alignment for write zeroes requests in bytes. A power 411 * of 2 is best but not mandatory. Must be a multiple of 412 * bl.request_alignment, and must be less than max_pwrite_zeroes 413 * if that is set. May be 0 if bl.request_alignment is good 414 * enough */ 415 uint32_t pwrite_zeroes_alignment; 416 417 /* Optimal transfer length in bytes. A power of 2 is best but not 418 * mandatory. Must be a multiple of bl.request_alignment, or 0 if 419 * no preferred size */ 420 uint32_t opt_transfer; 421 422 /* Maximal transfer length in bytes. Need not be power of 2, but 423 * must be multiple of opt_transfer and bl.request_alignment, or 0 424 * for no 32-bit limit. For now, anything larger than INT_MAX is 425 * clamped down. */ 426 uint32_t max_transfer; 427 428 /* memory alignment, in bytes so that no bounce buffer is needed */ 429 size_t min_mem_alignment; 430 431 /* memory alignment, in bytes, for bounce buffer */ 432 size_t opt_mem_alignment; 433 434 /* maximum number of iovec elements */ 435 int max_iov; 436 } BlockLimits; 437 438 typedef struct BdrvOpBlocker BdrvOpBlocker; 439 440 typedef struct BdrvAioNotifier { 441 void (*attached_aio_context)(AioContext *new_context, void *opaque); 442 void (*detach_aio_context)(void *opaque); 443 444 void *opaque; 445 bool deleted; 446 447 QLIST_ENTRY(BdrvAioNotifier) list; 448 } BdrvAioNotifier; 449 450 struct BdrvChildRole { 451 /* If true, bdrv_replace_node() doesn't change the node this BdrvChild 452 * points to. */ 453 bool stay_at_node; 454 455 void (*inherit_options)(int *child_flags, QDict *child_options, 456 int parent_flags, QDict *parent_options); 457 458 void (*change_media)(BdrvChild *child, bool load); 459 void (*resize)(BdrvChild *child); 460 461 /* Returns a name that is supposedly more useful for human users than the 462 * node name for identifying the node in question (in particular, a BB 463 * name), or NULL if the parent can't provide a better name. */ 464 const char *(*get_name)(BdrvChild *child); 465 466 /* Returns a malloced string that describes the parent of the child for a 467 * human reader. This could be a node-name, BlockBackend name, qdev ID or 468 * QOM path of the device owning the BlockBackend, job type and ID etc. The 469 * caller is responsible for freeing the memory. */ 470 char *(*get_parent_desc)(BdrvChild *child); 471 472 /* 473 * If this pair of functions is implemented, the parent doesn't issue new 474 * requests after returning from .drained_begin() until .drained_end() is 475 * called. 476 * 477 * Note that this can be nested. If drained_begin() was called twice, new 478 * I/O is allowed only after drained_end() was called twice, too. 479 */ 480 void (*drained_begin)(BdrvChild *child); 481 void (*drained_end)(BdrvChild *child); 482 483 /* Notifies the parent that the child has been activated/inactivated (e.g. 484 * when migration is completing) and it can start/stop requesting 485 * permissions and doing I/O on it. */ 486 void (*activate)(BdrvChild *child, Error **errp); 487 int (*inactivate)(BdrvChild *child); 488 489 void (*attach)(BdrvChild *child); 490 void (*detach)(BdrvChild *child); 491 }; 492 493 extern const BdrvChildRole child_file; 494 extern const BdrvChildRole child_format; 495 extern const BdrvChildRole child_backing; 496 497 struct BdrvChild { 498 BlockDriverState *bs; 499 char *name; 500 const BdrvChildRole *role; 501 void *opaque; 502 503 /** 504 * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask) 505 */ 506 uint64_t perm; 507 508 /** 509 * Permissions that can still be granted to other users of @bs while this 510 * BdrvChild is still attached to it. (BLK_PERM_* bitmask) 511 */ 512 uint64_t shared_perm; 513 514 QLIST_ENTRY(BdrvChild) next; 515 QLIST_ENTRY(BdrvChild) next_parent; 516 }; 517 518 /* 519 * Note: the function bdrv_append() copies and swaps contents of 520 * BlockDriverStates, so if you add new fields to this struct, please 521 * inspect bdrv_append() to determine if the new fields need to be 522 * copied as well. 523 */ 524 struct BlockDriverState { 525 /* Protected by big QEMU lock or read-only after opening. No special 526 * locking needed during I/O... 527 */ 528 int open_flags; /* flags used to open the file, re-used for re-open */ 529 bool read_only; /* if true, the media is read only */ 530 bool encrypted; /* if true, the media is encrypted */ 531 bool valid_key; /* if true, a valid encryption key has been set */ 532 bool sg; /* if true, the device is a /dev/sg* */ 533 bool probed; /* if true, format was probed rather than specified */ 534 bool force_share; /* if true, always allow all shared permissions */ 535 536 BlockDriver *drv; /* NULL means no media */ 537 void *opaque; 538 539 AioContext *aio_context; /* event loop used for fd handlers, timers, etc */ 540 /* long-running tasks intended to always use the same AioContext as this 541 * BDS may register themselves in this list to be notified of changes 542 * regarding this BDS's context */ 543 QLIST_HEAD(, BdrvAioNotifier) aio_notifiers; 544 bool walking_aio_notifiers; /* to make removal during iteration safe */ 545 546 char filename[PATH_MAX]; 547 char backing_file[PATH_MAX]; /* if non zero, the image is a diff of 548 this file image */ 549 char backing_format[16]; /* if non-zero and backing_file exists */ 550 551 QDict *full_open_options; 552 char exact_filename[PATH_MAX]; 553 554 BdrvChild *backing; 555 BdrvChild *file; 556 557 /* I/O Limits */ 558 BlockLimits bl; 559 560 /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */ 561 unsigned int supported_write_flags; 562 /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, 563 * BDRV_REQ_MAY_UNMAP) */ 564 unsigned int supported_zero_flags; 565 566 /* the following member gives a name to every node on the bs graph. */ 567 char node_name[32]; 568 /* element of the list of named nodes building the graph */ 569 QTAILQ_ENTRY(BlockDriverState) node_list; 570 /* element of the list of all BlockDriverStates (all_bdrv_states) */ 571 QTAILQ_ENTRY(BlockDriverState) bs_list; 572 /* element of the list of monitor-owned BDS */ 573 QTAILQ_ENTRY(BlockDriverState) monitor_list; 574 int refcnt; 575 576 /* operation blockers */ 577 QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX]; 578 579 /* long-running background operation */ 580 BlockJob *job; 581 582 /* The node that this node inherited default options from (and a reopen on 583 * which can affect this node by changing these defaults). This is always a 584 * parent node of this node. */ 585 BlockDriverState *inherits_from; 586 QLIST_HEAD(, BdrvChild) children; 587 QLIST_HEAD(, BdrvChild) parents; 588 589 QDict *options; 590 QDict *explicit_options; 591 BlockdevDetectZeroesOptions detect_zeroes; 592 593 /* The error object in use for blocking operations on backing_hd */ 594 Error *backing_blocker; 595 596 /* Protected by AioContext lock */ 597 598 /* If true, copy read backing sectors into image. Can be >1 if more 599 * than one client has requested copy-on-read. 600 */ 601 int copy_on_read; 602 603 /* If we are reading a disk image, give its size in sectors. 604 * Generally read-only; it is written to by load_vmstate and save_vmstate, 605 * but the block layer is quiescent during those. 606 */ 607 int64_t total_sectors; 608 609 /* Callback before write request is processed */ 610 NotifierWithReturnList before_write_notifiers; 611 612 /* number of in-flight requests; overall and serialising */ 613 unsigned int in_flight; 614 unsigned int serialising_in_flight; 615 616 bool wakeup; 617 618 /* Offset after the highest byte written to */ 619 uint64_t wr_highest_offset; 620 621 /* threshold limit for writes, in bytes. "High water mark". */ 622 uint64_t write_threshold_offset; 623 NotifierWithReturn write_threshold_notifier; 624 625 /* counter for nested bdrv_io_plug */ 626 unsigned io_plugged; 627 628 QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; 629 CoQueue flush_queue; /* Serializing flush queue */ 630 bool active_flush_req; /* Flush request in flight? */ 631 unsigned int write_gen; /* Current data generation */ 632 unsigned int flushed_gen; /* Flushed write generation */ 633 634 QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; 635 636 /* do we need to tell the quest if we have a volatile write cache? */ 637 int enable_write_cache; 638 639 int quiesce_counter; 640 }; 641 642 struct BlockBackendRootState { 643 int open_flags; 644 bool read_only; 645 BlockdevDetectZeroesOptions detect_zeroes; 646 }; 647 648 typedef enum BlockMirrorBackingMode { 649 /* Reuse the existing backing chain from the source for the target. 650 * - sync=full: Set backing BDS to NULL. 651 * - sync=top: Use source's backing BDS. 652 * - sync=none: Use source as the backing BDS. */ 653 MIRROR_SOURCE_BACKING_CHAIN, 654 655 /* Open the target's backing chain completely anew */ 656 MIRROR_OPEN_BACKING_CHAIN, 657 658 /* Do not change the target's backing BDS after job completion */ 659 MIRROR_LEAVE_BACKING_CHAIN, 660 } BlockMirrorBackingMode; 661 662 static inline BlockDriverState *backing_bs(BlockDriverState *bs) 663 { 664 return bs->backing ? bs->backing->bs : NULL; 665 } 666 667 668 /* Essential block drivers which must always be statically linked into qemu, and 669 * which therefore can be accessed without using bdrv_find_format() */ 670 extern BlockDriver bdrv_file; 671 extern BlockDriver bdrv_raw; 672 extern BlockDriver bdrv_qcow2; 673 674 int coroutine_fn bdrv_co_preadv(BdrvChild *child, 675 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 676 BdrvRequestFlags flags); 677 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 678 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 679 BdrvRequestFlags flags); 680 681 int get_tmp_filename(char *filename, int size); 682 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, 683 const char *filename); 684 685 686 /** 687 * bdrv_add_before_write_notifier: 688 * 689 * Register a callback that is invoked before write requests are processed but 690 * after any throttling or waiting for overlapping requests. 691 */ 692 void bdrv_add_before_write_notifier(BlockDriverState *bs, 693 NotifierWithReturn *notifier); 694 695 /** 696 * bdrv_detach_aio_context: 697 * 698 * May be called from .bdrv_detach_aio_context() to detach children from the 699 * current #AioContext. This is only needed by block drivers that manage their 700 * own children. Both ->file and ->backing are automatically handled and 701 * block drivers should not call this function on them explicitly. 702 */ 703 void bdrv_detach_aio_context(BlockDriverState *bs); 704 705 /** 706 * bdrv_attach_aio_context: 707 * 708 * May be called from .bdrv_attach_aio_context() to attach children to the new 709 * #AioContext. This is only needed by block drivers that manage their own 710 * children. Both ->file and ->backing are automatically handled and block 711 * drivers should not call this function on them explicitly. 712 */ 713 void bdrv_attach_aio_context(BlockDriverState *bs, 714 AioContext *new_context); 715 716 /** 717 * bdrv_add_aio_context_notifier: 718 * 719 * If a long-running job intends to be always run in the same AioContext as a 720 * certain BDS, it may use this function to be notified of changes regarding the 721 * association of the BDS to an AioContext. 722 * 723 * attached_aio_context() is called after the target BDS has been attached to a 724 * new AioContext; detach_aio_context() is called before the target BDS is being 725 * detached from its old AioContext. 726 */ 727 void bdrv_add_aio_context_notifier(BlockDriverState *bs, 728 void (*attached_aio_context)(AioContext *new_context, void *opaque), 729 void (*detach_aio_context)(void *opaque), void *opaque); 730 731 /** 732 * bdrv_remove_aio_context_notifier: 733 * 734 * Unsubscribe of change notifications regarding the BDS's AioContext. The 735 * parameters given here have to be the same as those given to 736 * bdrv_add_aio_context_notifier(). 737 */ 738 void bdrv_remove_aio_context_notifier(BlockDriverState *bs, 739 void (*aio_context_attached)(AioContext *, 740 void *), 741 void (*aio_context_detached)(void *), 742 void *opaque); 743 744 /** 745 * bdrv_wakeup: 746 * @bs: The BlockDriverState for which an I/O operation has been completed. 747 * 748 * Wake up the main thread if it is waiting on BDRV_POLL_WHILE. During 749 * synchronous I/O on a BlockDriverState that is attached to another 750 * I/O thread, the main thread lets the I/O thread's event loop run, 751 * waiting for the I/O operation to complete. A bdrv_wakeup will wake 752 * up the main thread if necessary. 753 * 754 * Manual calls to bdrv_wakeup are rarely necessary, because 755 * bdrv_dec_in_flight already calls it. 756 */ 757 void bdrv_wakeup(BlockDriverState *bs); 758 759 #ifdef _WIN32 760 int is_windows_drive(const char *filename); 761 #endif 762 763 /** 764 * stream_start: 765 * @job_id: The id of the newly-created job, or %NULL to use the 766 * device name of @bs. 767 * @bs: Block device to operate on. 768 * @base: Block device that will become the new base, or %NULL to 769 * flatten the whole backing file chain onto @bs. 770 * @backing_file_str: The file name that will be written to @bs as the 771 * the new backing file if the job completes. Ignored if @base is %NULL. 772 * @speed: The maximum speed, in bytes per second, or 0 for unlimited. 773 * @on_error: The action to take upon error. 774 * @errp: Error object. 775 * 776 * Start a streaming operation on @bs. Clusters that are unallocated 777 * in @bs, but allocated in any image between @base and @bs (both 778 * exclusive) will be written to @bs. At the end of a successful 779 * streaming job, the backing file of @bs will be changed to 780 * @backing_file_str in the written image and to @base in the live 781 * BlockDriverState. 782 */ 783 void stream_start(const char *job_id, BlockDriverState *bs, 784 BlockDriverState *base, const char *backing_file_str, 785 int64_t speed, BlockdevOnError on_error, Error **errp); 786 787 /** 788 * commit_start: 789 * @job_id: The id of the newly-created job, or %NULL to use the 790 * device name of @bs. 791 * @bs: Active block device. 792 * @top: Top block device to be committed. 793 * @base: Block device that will be written into, and become the new top. 794 * @speed: The maximum speed, in bytes per second, or 0 for unlimited. 795 * @on_error: The action to take upon error. 796 * @backing_file_str: String to use as the backing file in @top's overlay 797 * @filter_node_name: The node name that should be assigned to the filter 798 * driver that the commit job inserts into the graph above @top. NULL means 799 * that a node name should be autogenerated. 800 * @errp: Error object. 801 * 802 */ 803 void commit_start(const char *job_id, BlockDriverState *bs, 804 BlockDriverState *base, BlockDriverState *top, int64_t speed, 805 BlockdevOnError on_error, const char *backing_file_str, 806 const char *filter_node_name, Error **errp); 807 /** 808 * commit_active_start: 809 * @job_id: The id of the newly-created job, or %NULL to use the 810 * device name of @bs. 811 * @bs: Active block device to be committed. 812 * @base: Block device that will be written into, and become the new top. 813 * @creation_flags: Flags that control the behavior of the Job lifetime. 814 * See @BlockJobCreateFlags 815 * @speed: The maximum speed, in bytes per second, or 0 for unlimited. 816 * @on_error: The action to take upon error. 817 * @filter_node_name: The node name that should be assigned to the filter 818 * driver that the commit job inserts into the graph above @bs. NULL means that 819 * a node name should be autogenerated. 820 * @cb: Completion function for the job. 821 * @opaque: Opaque pointer value passed to @cb. 822 * @auto_complete: Auto complete the job. 823 * @errp: Error object. 824 * 825 */ 826 void commit_active_start(const char *job_id, BlockDriverState *bs, 827 BlockDriverState *base, int creation_flags, 828 int64_t speed, BlockdevOnError on_error, 829 const char *filter_node_name, 830 BlockCompletionFunc *cb, void *opaque, 831 bool auto_complete, Error **errp); 832 /* 833 * mirror_start: 834 * @job_id: The id of the newly-created job, or %NULL to use the 835 * device name of @bs. 836 * @bs: Block device to operate on. 837 * @target: Block device to write to. 838 * @replaces: Block graph node name to replace once the mirror is done. Can 839 * only be used when full mirroring is selected. 840 * @speed: The maximum speed, in bytes per second, or 0 for unlimited. 841 * @granularity: The chosen granularity for the dirty bitmap. 842 * @buf_size: The amount of data that can be in flight at one time. 843 * @mode: Whether to collapse all images in the chain to the target. 844 * @backing_mode: How to establish the target's backing chain after completion. 845 * @on_source_error: The action to take upon error reading from the source. 846 * @on_target_error: The action to take upon error writing to the target. 847 * @unmap: Whether to unmap target where source sectors only contain zeroes. 848 * @filter_node_name: The node name that should be assigned to the filter 849 * driver that the mirror job inserts into the graph above @bs. NULL means that 850 * a node name should be autogenerated. 851 * @errp: Error object. 852 * 853 * Start a mirroring operation on @bs. Clusters that are allocated 854 * in @bs will be written to @target until the job is cancelled or 855 * manually completed. At the end of a successful mirroring job, 856 * @bs will be switched to read from @target. 857 */ 858 void mirror_start(const char *job_id, BlockDriverState *bs, 859 BlockDriverState *target, const char *replaces, 860 int64_t speed, uint32_t granularity, int64_t buf_size, 861 MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, 862 BlockdevOnError on_source_error, 863 BlockdevOnError on_target_error, 864 bool unmap, const char *filter_node_name, Error **errp); 865 866 /* 867 * backup_job_create: 868 * @job_id: The id of the newly-created job, or %NULL to use the 869 * device name of @bs. 870 * @bs: Block device to operate on. 871 * @target: Block device to write to. 872 * @speed: The maximum speed, in bytes per second, or 0 for unlimited. 873 * @sync_mode: What parts of the disk image should be copied to the destination. 874 * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL. 875 * @on_source_error: The action to take upon error reading from the source. 876 * @on_target_error: The action to take upon error writing to the target. 877 * @creation_flags: Flags that control the behavior of the Job lifetime. 878 * See @BlockJobCreateFlags 879 * @cb: Completion function for the job. 880 * @opaque: Opaque pointer value passed to @cb. 881 * @txn: Transaction that this job is part of (may be NULL). 882 * 883 * Create a backup operation on @bs. Clusters in @bs are written to @target 884 * until the job is cancelled or manually completed. 885 */ 886 BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, 887 BlockDriverState *target, int64_t speed, 888 MirrorSyncMode sync_mode, 889 BdrvDirtyBitmap *sync_bitmap, 890 bool compress, 891 BlockdevOnError on_source_error, 892 BlockdevOnError on_target_error, 893 int creation_flags, 894 BlockCompletionFunc *cb, void *opaque, 895 BlockJobTxn *txn, Error **errp); 896 897 void hmp_drive_add_node(Monitor *mon, const char *optstr); 898 899 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, 900 const char *child_name, 901 const BdrvChildRole *child_role, 902 uint64_t perm, uint64_t shared_perm, 903 void *opaque, Error **errp); 904 void bdrv_root_unref_child(BdrvChild *child); 905 906 int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared, 907 Error **errp); 908 909 /* Default implementation for BlockDriver.bdrv_child_perm() that can be used by 910 * block filters: Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED and RESIZE to 911 * all children */ 912 void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c, 913 const BdrvChildRole *role, 914 uint64_t perm, uint64_t shared, 915 uint64_t *nperm, uint64_t *nshared); 916 917 /* Default implementation for BlockDriver.bdrv_child_perm() that can be used by 918 * (non-raw) image formats: Like above for bs->backing, but for bs->file it 919 * requires WRITE | RESIZE for read-write images, always requires 920 * CONSISTENT_READ and doesn't share WRITE. */ 921 void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c, 922 const BdrvChildRole *role, 923 uint64_t perm, uint64_t shared, 924 uint64_t *nperm, uint64_t *nshared); 925 926 const char *bdrv_get_parent_name(const BlockDriverState *bs); 927 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp); 928 bool blk_dev_has_removable_media(BlockBackend *blk); 929 bool blk_dev_has_tray(BlockBackend *blk); 930 void blk_dev_eject_request(BlockBackend *blk, bool force); 931 bool blk_dev_is_tray_open(BlockBackend *blk); 932 bool blk_dev_is_medium_locked(BlockBackend *blk); 933 934 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int64_t nr_sect); 935 bool bdrv_requests_pending(BlockDriverState *bs); 936 937 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out); 938 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in); 939 940 void bdrv_inc_in_flight(BlockDriverState *bs); 941 void bdrv_dec_in_flight(BlockDriverState *bs); 942 943 void blockdev_close_all_bdrv_states(void); 944 945 #endif /* BLOCK_INT_H */ 946