/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#ifndef BLOCK_COMMON_H
#define BLOCK_COMMON_H

#include "block/aio.h"
#include "block/aio-wait.h"
#include "qemu/iov.h"
#include "qemu/coroutine.h"
#include "block/accounting.h"
#include "qemu/hbitmap.h"
#include "qemu/transactions.h"

/*
 * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py
 *
 * Function specifiers, which do nothing but mark functions to be
 * generated by scripts/block-coroutine-wrapper.py
 *
 * Usage: read docs/devel/block-coroutine-wrapper.rst
 *
 * There are 4 kinds of specifiers:
 * - co_wrapper functions can be called only from non-coroutine context,
 *   because they always generate a new coroutine.
 * - co_wrapper_mixed functions can be called from both coroutine and
 *   non-coroutine context.
 * - co_wrapper_bdrv_rdlock are co_wrapper functions but automatically take and
 *   release the graph rdlock when creating a new coroutine.
 * - co_wrapper_mixed_bdrv_rdlock are co_wrapper_mixed functions but
 *   automatically take and release the graph rdlock when creating a new
 *   coroutine.
 *
 * All four macros expand to nothing, so they have no effect on the compiled
 * code; they only serve as markers in declarations.
 */
#define co_wrapper
#define co_wrapper_mixed
#define co_wrapper_bdrv_rdlock
#define co_wrapper_mixed_bdrv_rdlock

#include "block/dirty-bitmap.h"
#include "block/blockjob.h"

/* block.c */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;

/*
 * Information record about an image; see the per-field comments.
 */
typedef struct BlockDriverInfo {
    /* in bytes, 0 if irrelevant */
    int cluster_size;
    /* offset at which the VM state can be saved (0 if not possible) */
    int64_t vm_state_offset;
    /*
     * NOTE(review): the exact meaning of "dirty" is not documented in this
     * header -- confirm against the drivers that set this field.
     */
    bool is_dirty;
    /*
     * True if this block driver only supports compressed writes
     */
    bool needs_compressed_writes;
} BlockDriverInfo;

/*
 * Cluster counters. Embedded in BdrvCheckResult as the 'bfi' member.
 * NOTE(review): how each counter is computed is decided by the code that
 * fills the structure in, which is not visible in this header.
 */
typedef struct BlockFragInfo {
    uint64_t allocated_clusters;
    uint64_t total_clusters;
    uint64_t fragmented_clusters;
    uint64_t compressed_clusters;
} BlockFragInfo;

/*
 * Per-request flags. Every flag occupies a distinct bit, so flags can be
 * combined with bitwise OR; BDRV_REQ_MASK is the union of all of them.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ       = 0x1,
    BDRV_REQ_ZERO_WRITE         = 0x2,

    /*
     * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate
     * that the block driver should unmap (discard) blocks if it is guaranteed
     * that the result will read back as zeroes. The flag is only passed to the
     * driver if the block device is opened with BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP          = 0x4,

    /*
     * An optimization hint when all QEMUIOVector elements are within
     * previously registered bdrv_register_buf() memory ranges.
     *
     * Code that replaces the user's QEMUIOVector elements with bounce buffers
     * must take care to clear this flag.
     */
    BDRV_REQ_REGISTERED_BUF     = 0x8,

    BDRV_REQ_FUA                = 0x10,
    BDRV_REQ_WRITE_COMPRESSED   = 0x20,

    /*
     * Signifies that this write request will not change the visible disk
     * content.
     */
    BDRV_REQ_WRITE_UNCHANGED    = 0x40,

    /*
     * Forces request serialisation. Use only with write requests.
     */
    BDRV_REQ_SERIALISING        = 0x80,

    /*
     * Execute the request only if the operation can be offloaded or otherwise
     * be executed efficiently, but return an error instead of using a slow
     * fallback.
     */
    BDRV_REQ_NO_FALLBACK        = 0x100,

    /*
     * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
     * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
     * filter is involved), in which case it signals that the COR operation
     * need not read the data into memory (qiov) but only ensure they are
     * copied to the top layer (i.e., that COR operation is done).
     */
    BDRV_REQ_PREFETCH  = 0x200,

    /*
     * If we need to wait for other requests, just fail immediately. Used
     * only together with BDRV_REQ_SERIALISING. Used only with requests aligned
     * to request_alignment (corresponding assertions are in block/io.c).
     */
    BDRV_REQ_NO_WAIT = 0x400,

    /*
     * Mask of valid flags: the bitwise OR of every flag above (0x1 .. 0x400).
     * Must be extended whenever a new flag is added.
     */
    BDRV_REQ_MASK               = 0x7ff,
} BdrvRequestFlags;

/*
 * Open flags. Each flag occupies a distinct bit.
 */
#define BDRV_O_NO_SHARE    0x0001 /* don't share permissions */
#define BDRV_O_RDWR        0x0002
#define BDRV_O_RESIZE      0x0004 /* request permission for resizing the node */
#define BDRV_O_SNAPSHOT    0x0008 /* open the file read only and save
                                     writes in a snapshot */
#define BDRV_O_TEMPORARY   0x0010 /* delete the file after use */
#define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
/* Bit 0x0040 is not assigned to any flag in this header. */
#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the
                                     thread pool */
#define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
#define BDRV_O_INACTIVE    0x0800  /* consistency hint for migration handoff */
#define BDRV_O_CHECK       0x1000  /* open solely for consistency check */
#define BDRV_O_ALLOW_RDWR  0x2000  /* allow reopen to change from r/o to r/w */
#define BDRV_O_UNMAP       0x4000  /* execute guest UNMAP/TRIM operations */
#define BDRV_O_PROTOCOL    0x8000  /* if no block driver is explicitly given:
                                      select an appropriate protocol driver,
                                      ignoring the format layer */
#define BDRV_O_NO_IO       0x10000 /* don't initialize for I/O */
#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening
                                      read-write fails */
#define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */

/* The subset of open flags that describe the cache mode */
#define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)


/* Option names of options parsed by the block layer */

#define BDRV_OPT_CACHE_WB       "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
#define BDRV_OPT_READ_ONLY      "read-only"
#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
#define BDRV_OPT_DISCARD        "discard"
#define BDRV_OPT_FORCE_SHARE    "force-share"


/* Sector size used by the block layer: 1 << 9 = 512 bytes */
#define BDRV_SECTOR_BITS   9
#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)

/*
 * Maximum request size, in sectors and in bytes. The sector count is limited
 * by both SIZE_MAX and INT_MAX shifted down by BDRV_SECTOR_BITS, so the
 * corresponding byte count does not exceed either of those limits.
 * (MIN_CONST is defined elsewhere; presumably a constant-expression minimum.)
 */
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                           INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)

/*
 * We want to allow aligning requests and the disk length up to
 * BDRV_MAX_ALIGNMENT (1 << 30) without having to worry about overflow.
 * To achieve that, the maximum "length" (a limit for any offset/bytes
 * request and for the disk size) is defined as INT64_MAX aligned down to
 * BDRV_MAX_ALIGNMENT.
 */
#define BDRV_MAX_ALIGNMENT (1L << 30)
#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))

/*
 * Allocation status flags for bdrv_block_status() and friends.
 *
 * Public flags:
 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
 * BDRV_BLOCK_ZERO: offset reads as zero
 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
 *                       layer rather than any backing, set by block layer
 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
 *                 layer, set by block layer
 *
 * Internal flags:
 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
 *                 that the block layer recompute the answer from the returned
 *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
 * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
 *                     zeroes in file child of current block node inside
 *                     returned region. Only valid together with both
 *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
 *                     appear with BDRV_BLOCK_ZERO.
 *
 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
 * host offset within the returned BDS that is allocated for the
 * corresponding raw guest data.  However, whether that offset
 * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
 *
 * DATA ZERO OFFSET_VALID
 *  t    t        t       sectors read as zero, returned file is zero at offset
 *  t    f        t       sectors read as valid from file at offset
 *  f    t        t       sectors preallocated, read as zero, returned file not
 *                        necessarily zero at offset
 *  f    f        t       sectors preallocated but read from backing_hd,
 *                        returned file contains garbage at offset
 *  t    t        f       sectors preallocated, read as zero, unknown offset
 *  t    f        f       sectors read from unknown file or offset
 *  f    t        f       not allocated or unknown offset, read as zero
 *  f    f        f       not allocated or unknown offset, read from backing_hd
 */
#define BDRV_BLOCK_DATA         0x01
#define BDRV_BLOCK_ZERO         0x02
#define BDRV_BLOCK_OFFSET_VALID 0x04
#define BDRV_BLOCK_RAW          0x08
#define BDRV_BLOCK_ALLOCATED    0x10
#define BDRV_BLOCK_EOF          0x20
#define BDRV_BLOCK_RECURSE      0x40

/* Tail-queue head type whose elements are BlockReopenQueueEntry structures */
typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;

/*
 * State carried for one node ('bs') while it is being reopened.
 * NOTE(review): 'opaque' is an untyped pointer, presumably for driver-private
 * data -- confirm against the reopen implementation.
 */
typedef struct BDRVReopenState {
    BlockDriverState *bs;
    int flags;
    BlockdevDetectZeroesOptions detect_zeroes;
    bool backing_missing;
    BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
    BlockDriverState *old_file_bs; /* keep pointer for permissions update */
    QDict *options;
    QDict *explicit_options;
    void *opaque;
} BDRVReopenState;

/*
 * Block operation types
 *
 * The enumerators have no explicit values, so BLOCK_OP_TYPE_MAX equals the
 * number of operation types as long as it stays the last entry.
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,
    BLOCK_OP_TYPE_MAX,
} BlockOpType;

/* Block node permission constants */
enum {
    /**
     * A user that has the "permission" of consistent reads is guaranteed that
     * their view of the contents of the block device is complete and
     * self-consistent, representing the contents of a disk at a specific
     * point.
     *
     * For most block devices (including their backing files) this is true, but
     * the property cannot be maintained in a few situations like for
     * intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ    = 0x01,

    /** This permission is required to change the visible disk contents. */
    BLK_PERM_WRITE              = 0x02,

    /**
     * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
     * required for writes to the block node when the caller promises that
     * the visible disk content doesn't change.
     *
     * As the BLK_PERM_WRITE permission is strictly stronger, either is
     * sufficient to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED    = 0x04,

    /** This permission is required to change the size of a block node. */
    BLK_PERM_RESIZE             = 0x08,

    /**
     * There was a now-removed bit BLK_PERM_GRAPH_MOD, with value of 0x10. QEMU
     * 6.1 and earlier may still lock the corresponding byte in block/file-posix
     * locking.  So, implementing some new permission should be very careful to
     * not interfere with this old unused thing.
     */

    /** Union of all permission bits defined above (0x01 | 0x02 | 0x04 | 0x08) */
    BLK_PERM_ALL                = 0x0f,

    DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ
                                 | BLK_PERM_WRITE
                                 | BLK_PERM_WRITE_UNCHANGED
                                 | BLK_PERM_RESIZE,

    /**
     * Every permission that is not part of DEFAULT_PERM_PASSTHROUGH. With the
     * permission bits currently defined, DEFAULT_PERM_PASSTHROUGH already
     * covers all of BLK_PERM_ALL, so this evaluates to 0.
     */
    DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};

/*
 * Flags that parent nodes assign to child nodes to specify what kind of
 * role(s) they take.
 *
 * At least one of DATA, METADATA, FILTERED, or COW must be set for
 * every child.
 *
 *
 * = Connection with bs->children, bs->file and bs->backing fields =
 *
 * 1. Filters
 *
 * Filter drivers have drv->is_filter = true.
 *
 * A filter node has exactly one FILTERED|PRIMARY child, and may have other
 * children which must not have these bits (one example is the
 * copy-before-write filter, which also has its target DATA child).
 *
 * Filter nodes never have COW children.
 *
 * For most filters, the filtered child is linked in bs->file, bs->backing is
 * NULL.  For some filters (as an exception), it is the other way around; those
 * drivers will have drv->filtered_child_is_backing set to true (see that
 * field's documentation for what drivers this concerns).
 *
 * 2. "raw" driver (block/raw-format.c)
 *
 * Formally it's not a filter (drv->is_filter = false)
 *
 * bs->backing is always NULL
 *
 * Only has one child, linked in bs->file. Its role is either FILTERED|PRIMARY
 * (like filter) or DATA|PRIMARY depending on options.
 *
 * 3. Other drivers
 *
 * Don't have any FILTERED children.
 *
 * May have at most one COW child. In this case it's linked in bs->backing.
 * Otherwise bs->backing is NULL. COW child is never PRIMARY.
 *
 * May have at most one PRIMARY child. In this case it's linked in bs->file.
 * Otherwise bs->file is NULL.
 *
 * May also have some other children that don't have the PRIMARY or COW bit set.
 */
enum BdrvChildRoleBits {
    /*
     * This child stores data.
     * Any node may have an arbitrary number of such children.
     */
    BDRV_CHILD_DATA         = (1 << 0),

    /*
     * This child stores metadata.
     * Any node may have an arbitrary number of metadata-storing
     * children.
     */
    BDRV_CHILD_METADATA     = (1 << 1),

    /*
     * A child that always presents exactly the same visible data as
     * the parent, e.g. by virtue of the parent forwarding all reads
     * and writes.
     * This flag is mutually exclusive with DATA, METADATA, and COW.
     * Any node may have at most one filtered child at a time.
     */
    BDRV_CHILD_FILTERED     = (1 << 2),

    /*
     * Child from which to read all data that isn't allocated in the
     * parent (i.e., the backing child); such data is copied to the
     * parent through COW (and optionally COR).
     * This flag is mutually exclusive with DATA, METADATA, and
     * FILTERED.
     * Any node may have at most one such backing child at a time.
     */
    BDRV_CHILD_COW          = (1 << 3),

    /*
     * The primary child.  For most drivers, this is the child whose
     * filename applies best to the parent node.
     * Any node may have at most one primary child at a time.
     */
    BDRV_CHILD_PRIMARY      = (1 << 4),

    /* Useful combination of flags */
    BDRV_CHILD_IMAGE        = BDRV_CHILD_DATA
                              | BDRV_CHILD_METADATA
                              | BDRV_CHILD_PRIMARY,
};

/* Mask of BdrvChildRoleBits values */
typedef unsigned int BdrvChildRole;

/*
 * Counters and related data describing the outcome of a check; 'bfi' embeds
 * the BlockFragInfo cluster counters.
 * NOTE(review): the precise meaning of each counter is defined by the code
 * that fills this structure in, which is not visible in this header.
 */
typedef struct BdrvCheckResult {
    int corruptions;
    int leaks;
    int check_errors;
    int corruptions_fixed;
    int leaks_fixed;
    int64_t image_end_offset;
    BlockFragInfo bfi;
} BdrvCheckResult;

/*
 * The two values occupy distinct bits (1 and 2).
 * NOTE(review): presumably they may be OR-ed together -- confirm against
 * the callers.
 */
typedef enum {
    BDRV_FIX_LEAKS    = 1,
    BDRV_FIX_ERRORS   = 2,
} BdrvCheckMode;

/*
 * NOTE(review): 'phys' and 'log' presumably hold the physical and logical
 * block sizes; the unit is not stated here -- confirm against the users.
 */
typedef struct BlockSizes {
    uint32_t phys;
    uint32_t log;
} BlockSizes;

/* Disk geometry given as counts of heads, sectors and cylinders */
typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;

/*
 * Common functions that are neither I/O nor Global State.
 *
 * These functions must never call any function from other categories
 * (I/O, "I/O or GS", Global State) except this one, but can be invoked by
 * all of them.
 */

char *bdrv_perm_names(uint64_t perm);
uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);

void bdrv_init_with_whitelist(void);
bool bdrv_uses_whitelist(void);
int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);

int bdrv_parse_aio(const char *mode, int *flags);
int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
int bdrv_parse_discard_flags(const char *mode, int *flags);

int path_has_protocol(const char *path);
int path_is_absolute(const char *path);
char *path_combine(const char *base_path, const char *filename);

char *bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                   const char *backing,
                                                   Error **errp);

#endif /* BLOCK_COMMON_H */