1 /* 2 * QEMU System Emulator block driver 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #ifndef BLOCK_COMMON_H 25 #define BLOCK_COMMON_H 26 27 #include "qapi/qapi-types-block-core.h" 28 #include "qemu/queue.h" 29 30 /* 31 * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py 32 * 33 * Function specifiers, which do nothing but mark functions to be 34 * generated by scripts/block-coroutine-wrapper.py 35 * 36 * Usage: read docs/devel/block-coroutine-wrapper.rst 37 * 38 * There are 4 kind of specifiers: 39 * - co_wrapper functions can be called by only non-coroutine context, because 40 * they always generate a new coroutine. 41 * - co_wrapper_mixed functions can be called by both coroutine and 42 * non-coroutine context. 43 * - co_wrapper_bdrv_rdlock are co_wrapper functions but automatically take and 44 * release the graph rdlock when creating a new coroutine 45 * - co_wrapper_mixed_bdrv_rdlock are co_wrapper_mixed functions but 46 * automatically take and release the graph rdlock when creating a new 47 * coroutine. 48 * 49 * These functions should not be called from a coroutine_fn; instead, 50 * call the wrapped function directly. 51 */ 52 #define co_wrapper no_coroutine_fn 53 #define co_wrapper_mixed no_coroutine_fn coroutine_mixed_fn 54 #define co_wrapper_bdrv_rdlock no_coroutine_fn 55 #define co_wrapper_mixed_bdrv_rdlock no_coroutine_fn coroutine_mixed_fn 56 57 /* 58 * no_co_wrapper: Function specifier used by block-coroutine-wrapper.py 59 * 60 * Function specifier which does nothing but mark functions to be generated by 61 * scripts/block-coroutine-wrapper.py. 62 * 63 * A no_co_wrapper function declaration creates a coroutine_fn wrapper around 64 * functions that must not be called in coroutine context. It achieves this by 65 * scheduling a BH in the bottom half that runs the respective non-coroutine 66 * function. The coroutine yields after scheduling the BH and is reentered when 67 * the wrapped function returns. 68 * 69 * A no_co_wrapper_bdrv_rdlock function is a no_co_wrapper function that 70 * automatically takes the graph rdlock when calling the wrapped function. In 71 * the same way, no_co_wrapper_bdrv_wrlock functions automatically take the 72 * graph wrlock. 73 * 74 * If the first parameter of the function is a BlockDriverState, BdrvChild or 75 * BlockBackend pointer, the AioContext lock for it is taken in the wrapper. 76 */ 77 #define no_co_wrapper 78 #define no_co_wrapper_bdrv_rdlock 79 #define no_co_wrapper_bdrv_wrlock 80 81 #include "block/blockjob.h" 82 83 /* block.c */ 84 typedef struct BlockDriver BlockDriver; 85 typedef struct BdrvChild BdrvChild; 86 typedef struct BdrvChildClass BdrvChildClass; 87 88 typedef enum BlockZoneOp { 89 BLK_ZO_OPEN, 90 BLK_ZO_CLOSE, 91 BLK_ZO_FINISH, 92 BLK_ZO_RESET, 93 } BlockZoneOp; 94 95 typedef enum BlockZoneModel { 96 BLK_Z_NONE = 0x0, /* Regular block device */ 97 BLK_Z_HM = 0x1, /* Host-managed zoned block device */ 98 BLK_Z_HA = 0x2, /* Host-aware zoned block device */ 99 } BlockZoneModel; 100 101 typedef enum BlockZoneState { 102 BLK_ZS_NOT_WP = 0x0, 103 BLK_ZS_EMPTY = 0x1, 104 BLK_ZS_IOPEN = 0x2, 105 BLK_ZS_EOPEN = 0x3, 106 BLK_ZS_CLOSED = 0x4, 107 BLK_ZS_RDONLY = 0xD, 108 BLK_ZS_FULL = 0xE, 109 BLK_ZS_OFFLINE = 0xF, 110 } BlockZoneState; 111 112 typedef enum BlockZoneType { 113 BLK_ZT_CONV = 0x1, /* Conventional random writes supported */ 114 BLK_ZT_SWR = 0x2, /* Sequential writes required */ 115 BLK_ZT_SWP = 0x3, /* Sequential writes preferred */ 116 } BlockZoneType; 117 118 /* 119 * Zone descriptor data structure. 120 * Provides information on a zone with all position and size values in bytes. 121 */ 122 typedef struct BlockZoneDescriptor { 123 uint64_t start; 124 uint64_t length; 125 uint64_t cap; 126 uint64_t wp; 127 BlockZoneType type; 128 BlockZoneState state; 129 } BlockZoneDescriptor; 130 131 /* 132 * Track write pointers of a zone in bytes. 133 */ 134 typedef struct BlockZoneWps { 135 CoMutex colock; 136 uint64_t wp[]; 137 } BlockZoneWps; 138 139 typedef struct BlockDriverInfo { 140 /* in bytes, 0 if irrelevant */ 141 int cluster_size; 142 /* 143 * A fraction of cluster_size, if supported (currently QCOW2 only); if 144 * disabled or unsupported, set equal to cluster_size. 145 */ 146 int subcluster_size; 147 /* offset at which the VM state can be saved (0 if not possible) */ 148 int64_t vm_state_offset; 149 bool is_dirty; 150 /* 151 * True if this block driver only supports compressed writes 152 */ 153 bool needs_compressed_writes; 154 } BlockDriverInfo; 155 156 typedef struct BlockFragInfo { 157 uint64_t allocated_clusters; 158 uint64_t total_clusters; 159 uint64_t fragmented_clusters; 160 uint64_t compressed_clusters; 161 } BlockFragInfo; 162 163 typedef enum { 164 BDRV_REQ_COPY_ON_READ = 0x1, 165 BDRV_REQ_ZERO_WRITE = 0x2, 166 167 /* 168 * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate 169 * that the block driver should unmap (discard) blocks if it is guaranteed 170 * that the result will read back as zeroes. The flag is only passed to the 171 * driver if the block device is opened with BDRV_O_UNMAP. 172 */ 173 BDRV_REQ_MAY_UNMAP = 0x4, 174 175 /* 176 * An optimization hint when all QEMUIOVector elements are within 177 * previously registered bdrv_register_buf() memory ranges. 178 * 179 * Code that replaces the user's QEMUIOVector elements with bounce buffers 180 * must take care to clear this flag. 181 */ 182 BDRV_REQ_REGISTERED_BUF = 0x8, 183 184 BDRV_REQ_FUA = 0x10, 185 BDRV_REQ_WRITE_COMPRESSED = 0x20, 186 187 /* 188 * Signifies that this write request will not change the visible disk 189 * content. 190 */ 191 BDRV_REQ_WRITE_UNCHANGED = 0x40, 192 193 /* 194 * Forces request serialisation. Use only with write requests. 195 */ 196 BDRV_REQ_SERIALISING = 0x80, 197 198 /* 199 * Execute the request only if the operation can be offloaded or otherwise 200 * be executed efficiently, but return an error instead of using a slow 201 * fallback. 202 */ 203 BDRV_REQ_NO_FALLBACK = 0x100, 204 205 /* 206 * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read 207 * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR 208 * filter is involved), in which case it signals that the COR operation 209 * need not read the data into memory (qiov) but only ensure they are 210 * copied to the top layer (i.e., that COR operation is done). 211 */ 212 BDRV_REQ_PREFETCH = 0x200, 213 214 /* 215 * If we need to wait for other requests, just fail immediately. Used 216 * only together with BDRV_REQ_SERIALISING. Used only with requests aligned 217 * to request_alignment (corresponding assertions are in block/io.c). 218 */ 219 BDRV_REQ_NO_WAIT = 0x400, 220 221 /* Mask of valid flags */ 222 BDRV_REQ_MASK = 0x7ff, 223 } BdrvRequestFlags; 224 225 #define BDRV_O_NO_SHARE 0x0001 /* don't share permissions */ 226 #define BDRV_O_RDWR 0x0002 227 #define BDRV_O_RESIZE 0x0004 /* request permission for resizing the node */ 228 #define BDRV_O_SNAPSHOT 0x0008 /* open the file read only and save 229 writes in a snapshot */ 230 #define BDRV_O_TEMPORARY 0x0010 /* delete the file after use */ 231 #define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */ 232 #define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the 233 thread pool */ 234 #define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */ 235 #define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */ 236 #define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */ 237 #define BDRV_O_INACTIVE 0x0800 /* consistency hint for migration handoff */ 238 #define BDRV_O_CHECK 0x1000 /* open solely for consistency check */ 239 #define BDRV_O_ALLOW_RDWR 0x2000 /* allow reopen to change from r/o to r/w */ 240 #define BDRV_O_UNMAP 0x4000 /* execute guest UNMAP/TRIM operations */ 241 #define BDRV_O_PROTOCOL 0x8000 /* if no block driver is explicitly given: 242 select an appropriate protocol driver, 243 ignoring the format layer */ 244 #define BDRV_O_NO_IO 0x10000 /* don't initialize for I/O */ 245 #define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening 246 read-write fails */ 247 #define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */ 248 249 #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) 250 251 252 /* Option names of options parsed by the block layer */ 253 254 #define BDRV_OPT_CACHE_WB "cache.writeback" 255 #define BDRV_OPT_CACHE_DIRECT "cache.direct" 256 #define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush" 257 #define BDRV_OPT_READ_ONLY "read-only" 258 #define BDRV_OPT_AUTO_READ_ONLY "auto-read-only" 259 #define BDRV_OPT_DISCARD "discard" 260 #define BDRV_OPT_FORCE_SHARE "force-share" 261 262 263 #define BDRV_SECTOR_BITS 9 264 #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) 265 266 /* 267 * Get the first most significant bit of wp. If it is zero, then 268 * the zone type is SWR. 269 */ 270 #define BDRV_ZT_IS_CONV(wp) (wp & (1ULL << 63)) 271 272 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ 273 INT_MAX >> BDRV_SECTOR_BITS) 274 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) 275 276 /* 277 * We want allow aligning requests and disk length up to any 32bit alignment 278 * and don't afraid of overflow. 279 * To achieve it, and in the same time use some pretty number as maximum disk 280 * size, let's define maximum "length" (a limit for any offset/bytes request and 281 * for disk size) to be the greatest power of 2 less than INT64_MAX. 282 */ 283 #define BDRV_MAX_ALIGNMENT (1L << 30) 284 #define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT)) 285 286 /* 287 * Allocation status flags for bdrv_block_status() and friends. 288 * 289 * Public flags: 290 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer 291 * BDRV_BLOCK_ZERO: offset reads as zero 292 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data 293 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this 294 * layer rather than any backing, set by block layer 295 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this 296 * layer, set by block layer 297 * BDRV_BLOCK_COMPRESSED: the underlying data is compressed; only valid for 298 * the formats supporting compression: qcow, qcow2 299 * 300 * Internal flags: 301 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request 302 * that the block layer recompute the answer from the returned 303 * BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID. 304 * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for 305 * zeroes in file child of current block node inside 306 * returned region. Only valid together with both 307 * BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not 308 * appear with BDRV_BLOCK_ZERO. 309 * 310 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the 311 * host offset within the returned BDS that is allocated for the 312 * corresponding raw guest data. However, whether that offset 313 * actually contains data also depends on BDRV_BLOCK_DATA, as follows: 314 * 315 * DATA ZERO OFFSET_VALID 316 * t t t sectors read as zero, returned file is zero at offset 317 * t f t sectors read as valid from file at offset 318 * f t t sectors preallocated, read as zero, returned file not 319 * necessarily zero at offset 320 * f f t sectors preallocated but read from backing_hd, 321 * returned file contains garbage at offset 322 * t t f sectors preallocated, read as zero, unknown offset 323 * t f f sectors read from unknown file or offset 324 * f t f not allocated or unknown offset, read as zero 325 * f f f not allocated or unknown offset, read from backing_hd 326 */ 327 #define BDRV_BLOCK_DATA 0x01 328 #define BDRV_BLOCK_ZERO 0x02 329 #define BDRV_BLOCK_OFFSET_VALID 0x04 330 #define BDRV_BLOCK_RAW 0x08 331 #define BDRV_BLOCK_ALLOCATED 0x10 332 #define BDRV_BLOCK_EOF 0x20 333 #define BDRV_BLOCK_RECURSE 0x40 334 #define BDRV_BLOCK_COMPRESSED 0x80 335 336 typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue; 337 338 typedef struct BDRVReopenState { 339 BlockDriverState *bs; 340 int flags; 341 BlockdevDetectZeroesOptions detect_zeroes; 342 bool backing_missing; 343 BlockDriverState *old_backing_bs; /* keep pointer for permissions update */ 344 BlockDriverState *old_file_bs; /* keep pointer for permissions update */ 345 QDict *options; 346 QDict *explicit_options; 347 void *opaque; 348 } BDRVReopenState; 349 350 /* 351 * Block operation types 352 */ 353 typedef enum BlockOpType { 354 BLOCK_OP_TYPE_BACKUP_SOURCE, 355 BLOCK_OP_TYPE_BACKUP_TARGET, 356 BLOCK_OP_TYPE_CHANGE, 357 BLOCK_OP_TYPE_COMMIT_SOURCE, 358 BLOCK_OP_TYPE_COMMIT_TARGET, 359 BLOCK_OP_TYPE_DATAPLANE, 360 BLOCK_OP_TYPE_DRIVE_DEL, 361 BLOCK_OP_TYPE_EJECT, 362 BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT, 363 BLOCK_OP_TYPE_INTERNAL_SNAPSHOT, 364 BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE, 365 BLOCK_OP_TYPE_MIRROR_SOURCE, 366 BLOCK_OP_TYPE_MIRROR_TARGET, 367 BLOCK_OP_TYPE_RESIZE, 368 BLOCK_OP_TYPE_STREAM, 369 BLOCK_OP_TYPE_REPLACE, 370 BLOCK_OP_TYPE_MAX, 371 } BlockOpType; 372 373 /* Block node permission constants */ 374 enum { 375 /** 376 * A user that has the "permission" of consistent reads is guaranteed that 377 * their view of the contents of the block device is complete and 378 * self-consistent, representing the contents of a disk at a specific 379 * point. 380 * 381 * For most block devices (including their backing files) this is true, but 382 * the property cannot be maintained in a few situations like for 383 * intermediate nodes of a commit block job. 384 */ 385 BLK_PERM_CONSISTENT_READ = 0x01, 386 387 /** This permission is required to change the visible disk contents. */ 388 BLK_PERM_WRITE = 0x02, 389 390 /** 391 * This permission (which is weaker than BLK_PERM_WRITE) is both enough and 392 * required for writes to the block node when the caller promises that 393 * the visible disk content doesn't change. 394 * 395 * As the BLK_PERM_WRITE permission is strictly stronger, either is 396 * sufficient to perform an unchanging write. 397 */ 398 BLK_PERM_WRITE_UNCHANGED = 0x04, 399 400 /** This permission is required to change the size of a block node. */ 401 BLK_PERM_RESIZE = 0x08, 402 403 /** 404 * There was a now-removed bit BLK_PERM_GRAPH_MOD, with value of 0x10. QEMU 405 * 6.1 and earlier may still lock the corresponding byte in block/file-posix 406 * locking. So, implementing some new permission should be very careful to 407 * not interfere with this old unused thing. 408 */ 409 410 BLK_PERM_ALL = 0x0f, 411 412 DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ 413 | BLK_PERM_WRITE 414 | BLK_PERM_WRITE_UNCHANGED 415 | BLK_PERM_RESIZE, 416 417 DEFAULT_PERM_UNCHANGED = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH, 418 }; 419 420 /* 421 * Flags that parent nodes assign to child nodes to specify what kind of 422 * role(s) they take. 423 * 424 * At least one of DATA, METADATA, FILTERED, or COW must be set for 425 * every child. 426 * 427 * 428 * = Connection with bs->children, bs->file and bs->backing fields = 429 * 430 * 1. Filters 431 * 432 * Filter drivers have drv->is_filter = true. 433 * 434 * Filter node has exactly one FILTERED|PRIMARY child, and may have other 435 * children which must not have these bits (one example is the 436 * copy-before-write filter, which also has its target DATA child). 437 * 438 * Filter nodes never have COW children. 439 * 440 * For most filters, the filtered child is linked in bs->file, bs->backing is 441 * NULL. For some filters (as an exception), it is the other way around; those 442 * drivers will have drv->filtered_child_is_backing set to true (see that 443 * field’s documentation for what drivers this concerns) 444 * 445 * 2. "raw" driver (block/raw-format.c) 446 * 447 * Formally it's not a filter (drv->is_filter = false) 448 * 449 * bs->backing is always NULL 450 * 451 * Only has one child, linked in bs->file. Its role is either FILTERED|PRIMARY 452 * (like filter) or DATA|PRIMARY depending on options. 453 * 454 * 3. Other drivers 455 * 456 * Don't have any FILTERED children. 457 * 458 * May have at most one COW child. In this case it's linked in bs->backing. 459 * Otherwise bs->backing is NULL. COW child is never PRIMARY. 460 * 461 * May have at most one PRIMARY child. In this case it's linked in bs->file. 462 * Otherwise bs->file is NULL. 463 * 464 * May also have some other children that don't have the PRIMARY or COW bit set. 465 */ 466 enum BdrvChildRoleBits { 467 /* 468 * This child stores data. 469 * Any node may have an arbitrary number of such children. 470 */ 471 BDRV_CHILD_DATA = (1 << 0), 472 473 /* 474 * This child stores metadata. 475 * Any node may have an arbitrary number of metadata-storing 476 * children. 477 */ 478 BDRV_CHILD_METADATA = (1 << 1), 479 480 /* 481 * A child that always presents exactly the same visible data as 482 * the parent, e.g. by virtue of the parent forwarding all reads 483 * and writes. 484 * This flag is mutually exclusive with DATA, METADATA, and COW. 485 * Any node may have at most one filtered child at a time. 486 */ 487 BDRV_CHILD_FILTERED = (1 << 2), 488 489 /* 490 * Child from which to read all data that isn't allocated in the 491 * parent (i.e., the backing child); such data is copied to the 492 * parent through COW (and optionally COR). 493 * This field is mutually exclusive with DATA, METADATA, and 494 * FILTERED. 495 * Any node may have at most one such backing child at a time. 496 */ 497 BDRV_CHILD_COW = (1 << 3), 498 499 /* 500 * The primary child. For most drivers, this is the child whose 501 * filename applies best to the parent node. 502 * Any node may have at most one primary child at a time. 503 */ 504 BDRV_CHILD_PRIMARY = (1 << 4), 505 506 /* Useful combination of flags */ 507 BDRV_CHILD_IMAGE = BDRV_CHILD_DATA 508 | BDRV_CHILD_METADATA 509 | BDRV_CHILD_PRIMARY, 510 }; 511 512 /* Mask of BdrvChildRoleBits values */ 513 typedef unsigned int BdrvChildRole; 514 515 typedef struct BdrvCheckResult { 516 int corruptions; 517 int leaks; 518 int check_errors; 519 int corruptions_fixed; 520 int leaks_fixed; 521 int64_t image_end_offset; 522 BlockFragInfo bfi; 523 } BdrvCheckResult; 524 525 typedef enum { 526 BDRV_FIX_LEAKS = 1, 527 BDRV_FIX_ERRORS = 2, 528 } BdrvCheckMode; 529 530 typedef struct BlockSizes { 531 uint32_t phys; 532 uint32_t log; 533 } BlockSizes; 534 535 typedef struct HDGeometry { 536 uint32_t heads; 537 uint32_t sectors; 538 uint32_t cylinders; 539 } HDGeometry; 540 541 /* 542 * Common functions that are neither I/O nor Global State. 543 * 544 * These functions must never call any function from other categories 545 * (I/O, "I/O or GS", Global State) except this one, but can be invoked by 546 * all of them. 547 */ 548 549 char *bdrv_perm_names(uint64_t perm); 550 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm); 551 552 void bdrv_init_with_whitelist(void); 553 bool bdrv_uses_whitelist(void); 554 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only); 555 556 int bdrv_parse_aio(const char *mode, int *flags); 557 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); 558 int bdrv_parse_discard_flags(const char *mode, int *flags); 559 560 int path_has_protocol(const char *path); 561 int path_is_absolute(const char *path); 562 char *path_combine(const char *base_path, const char *filename); 563 564 char *bdrv_get_full_backing_filename_from_filename(const char *backed, 565 const char *backing, 566 Error **errp); 567 568 #endif /* BLOCK_COMMON_H */ 569