xref: /openbmc/qemu/include/block/block-common.h (revision c1774bdb)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #ifndef BLOCK_COMMON_H
25 #define BLOCK_COMMON_H
26 
27 #include "qapi/qapi-types-block-core.h"
28 #include "qemu/queue.h"
29 
30 /*
31  * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py
32  *
33  * Function specifiers, which do nothing but mark functions to be
34  * generated by scripts/block-coroutine-wrapper.py
35  *
36  * Usage: read docs/devel/block-coroutine-wrapper.rst
37  *
 * There are 4 kinds of specifiers:
 * - co_wrapper functions can be called only from non-coroutine context,
 *   because they always generate a new coroutine.
 * - co_wrapper_mixed functions can be called from both coroutine and
 *   non-coroutine context.
 * - co_wrapper_bdrv_rdlock are co_wrapper functions but automatically take and
 *   release the graph rdlock when creating a new coroutine.
 * - co_wrapper_mixed_bdrv_rdlock are co_wrapper_mixed functions but
 *   automatically take and release the graph rdlock when creating a new
 *   coroutine.
48  *
49  * These functions should not be called from a coroutine_fn; instead,
50  * call the wrapped function directly.
51  */
/*
 * Each specifier expands only to coroutine annotation macros.  The
 * *_bdrv_rdlock variants expand to exactly the same annotations as their
 * plain counterparts; the different names matter only to
 * scripts/block-coroutine-wrapper.py.
 */
#define co_wrapper                     no_coroutine_fn
#define co_wrapper_mixed               no_coroutine_fn coroutine_mixed_fn
#define co_wrapper_bdrv_rdlock         no_coroutine_fn
#define co_wrapper_mixed_bdrv_rdlock   no_coroutine_fn coroutine_mixed_fn
56 
57 /*
58  * no_co_wrapper: Function specifier used by block-coroutine-wrapper.py
59  *
60  * Function specifier which does nothing but mark functions to be generated by
61  * scripts/block-coroutine-wrapper.py.
62  *
 * A no_co_wrapper function declaration creates a coroutine_fn wrapper around
 * functions that must not be called in coroutine context. It achieves this by
 * scheduling a bottom half (BH) that runs the respective non-coroutine
 * function. The coroutine yields after scheduling the BH and is reentered when
 * the wrapped function returns.
68  *
69  * A no_co_wrapper_bdrv_wrlock function is a no_co_wrapper function that
70  * automatically takes the graph wrlock when calling the wrapped function.
71  *
72  * If the first parameter of the function is a BlockDriverState, BdrvChild or
73  * BlockBackend pointer, the AioContext lock for it is taken in the wrapper.
74  */
/*
 * Both markers expand to nothing for the C compiler; they are pure
 * annotations consumed by the wrapper generator script.
 */
#define no_co_wrapper
#define no_co_wrapper_bdrv_wrlock
77 
78 #include "block/blockjob.h"
79 
80 /* block.c */
/*
 * Forward declarations: only the type names are introduced here, the
 * structure definitions are not part of this header.
 */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
84 
/*
 * Zone management operations.  The enumerators are numbered consecutively
 * starting from zero.
 */
typedef enum BlockZoneOp {
    BLK_ZO_OPEN   = 0,
    BLK_ZO_CLOSE  = 1,
    BLK_ZO_FINISH = 2,
    BLK_ZO_RESET  = 3,
} BlockZoneOp;
91 
/*
 * Zone model of a block device.
 */
typedef enum BlockZoneModel {
    /* Regular block device */
    BLK_Z_NONE = 0,
    /* Host-managed zoned block device */
    BLK_Z_HM   = 1,
    /* Host-aware zoned block device */
    BLK_Z_HA   = 2,
} BlockZoneModel;
97 
/*
 * State of a single zone.
 *
 * The numeric values are not contiguous (0x05..0x0c are unused).
 * NOTE(review): the values presumably mirror an external zone condition
 * encoding -- confirm before renumbering.
 */
typedef enum BlockZoneState {
    BLK_ZS_NOT_WP  = 0x00,
    BLK_ZS_EMPTY   = 0x01,
    BLK_ZS_IOPEN   = 0x02,
    BLK_ZS_EOPEN   = 0x03,
    BLK_ZS_CLOSED  = 0x04,
    BLK_ZS_RDONLY  = 0x0d,
    BLK_ZS_FULL    = 0x0e,
    BLK_ZS_OFFLINE = 0x0f,
} BlockZoneState;
108 
/*
 * Type of a single zone.  Note that numbering starts at 1; no enumerator
 * has the value 0.
 */
typedef enum BlockZoneType {
    /* Conventional random writes supported */
    BLK_ZT_CONV = 1,
    /* Sequential writes required */
    BLK_ZT_SWR  = 2,
    /* Sequential writes preferred */
    BLK_ZT_SWP  = 3,
} BlockZoneType;
114 
115 /*
116  * Zone descriptor data structure.
117  * Provides information on a zone with all position and size values in bytes.
118  */
typedef struct BlockZoneDescriptor {
    uint64_t start;         /* zone start position, in bytes */
    uint64_t length;        /* zone size, in bytes */
    uint64_t cap;           /* presumably usable capacity in bytes -- confirm */
    uint64_t wp;            /* presumably write pointer in bytes -- confirm */
    BlockZoneType type;     /* one of the BLK_ZT_* values */
    BlockZoneState state;   /* one of the BLK_ZS_* values */
} BlockZoneDescriptor;
127 
128 /*
129  * Track write pointers of a zone in bytes.
130  */
typedef struct BlockZoneWps {
    /* NOTE(review): presumably serialises access to wp[] -- confirm */
    CoMutex colock;
    /*
     * Flexible array member holding the write pointers, in bytes.  The number
     * of elements is fixed by whoever allocates the structure.
     */
    uint64_t wp[];
} BlockZoneWps;
135 
/*
 * Driver-level information about an image.
 */
typedef struct BlockDriverInfo {
    /* in bytes, 0 if irrelevant */
    int cluster_size;
    /*
     * A fraction of cluster_size, if supported (currently QCOW2 only); if
     * disabled or unsupported, set equal to cluster_size.
     */
    int subcluster_size;
    /* offset at which the VM state can be saved (0 if not possible) */
    int64_t vm_state_offset;
    /* NOTE(review): exact meaning of "dirty" is driver-defined -- confirm */
    bool is_dirty;
    /*
     * True if this block driver only supports compressed writes
     */
    bool needs_compressed_writes;
} BlockDriverInfo;
152 
/*
 * Cluster counters describing allocation and fragmentation of an image.
 * All fields are counts of clusters, not bytes.
 */
typedef struct BlockFragInfo {
    uint64_t allocated_clusters;
    uint64_t total_clusters;
    uint64_t fragmented_clusters;
    uint64_t compressed_clusters;
} BlockFragInfo;
159 
/*
 * Per-request flags.  Every flag occupies exactly one bit, so flags can be
 * combined with bitwise OR; BDRV_REQ_MASK covers all defined bits.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ       = 1 << 0,
    BDRV_REQ_ZERO_WRITE         = 1 << 1,

    /*
     * Used in write_zeroes requests: the block driver should unmap (discard)
     * the blocks if it is guaranteed that they will read back as zeroes
     * afterwards.  The flag only reaches the driver if the block device was
     * opened with BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP          = 1 << 2,

    /*
     * Optimization hint: all QEMUIOVector elements lie within memory ranges
     * previously registered with bdrv_register_buf().
     *
     * Code that swaps the user's QEMUIOVector elements for bounce buffers
     * must take care to clear this flag.
     */
    BDRV_REQ_REGISTERED_BUF     = 1 << 3,

    BDRV_REQ_FUA                = 1 << 4,
    BDRV_REQ_WRITE_COMPRESSED   = 1 << 5,

    /*
     * This write request will not change the visible disk content.
     */
    BDRV_REQ_WRITE_UNCHANGED    = 1 << 6,

    /*
     * Forces request serialisation.  Use only with write requests.
     */
    BDRV_REQ_SERIALISING        = 1 << 7,

    /*
     * Execute the request only if it can be offloaded or otherwise executed
     * efficiently; return an error instead of using a slow fallback.
     */
    BDRV_REQ_NO_FALLBACK        = 1 << 8,

    /*
     * Only meaningful in the context of copy-on-read (i.e., together with
     * BDRV_REQ_COPY_ON_READ or when a COR filter is involved): the COR
     * operation need not read the data into memory (qiov), it only has to
     * ensure they are copied to the top layer (i.e., that COR is done).
     */
    BDRV_REQ_PREFETCH           = 1 << 9,

    /*
     * Fail immediately instead of waiting for other requests.  Used only
     * together with BDRV_REQ_SERIALISING, and only with requests aligned to
     * request_alignment (corresponding assertions are in block/io.c).
     */
    BDRV_REQ_NO_WAIT            = 1 << 10,

    /* Mask of valid flags: bits 0..10 */
    BDRV_REQ_MASK               = (1 << 11) - 1,
} BdrvRequestFlags;
221 
/*
 * Open flags.  Each flag is a distinct bit so that flags can be OR-ed
 * together.  Note that bit 0x0040 is not assigned to any flag in this list.
 */
#define BDRV_O_NO_SHARE    0x0001 /* don't share permissions */
#define BDRV_O_RDWR        0x0002
#define BDRV_O_RESIZE      0x0004 /* request permission for resizing the node */
#define BDRV_O_SNAPSHOT    0x0008 /* open the file read only and save
                                     writes in a snapshot */
#define BDRV_O_TEMPORARY   0x0010 /* delete the file after use */
#define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the
                                     thread pool */
#define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
#define BDRV_O_INACTIVE    0x0800  /* consistency hint for migration handoff */
#define BDRV_O_CHECK       0x1000  /* open solely for consistency check */
#define BDRV_O_ALLOW_RDWR  0x2000  /* allow reopen to change from r/o to r/w */
#define BDRV_O_UNMAP       0x4000  /* execute guest UNMAP/TRIM operations */
#define BDRV_O_PROTOCOL    0x8000  /* if no block driver is explicitly given:
                                      select an appropriate protocol driver,
                                      ignoring the format layer */
#define BDRV_O_NO_IO       0x10000 /* don't initialize for I/O */
#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening
                                      read-write fails */
#define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */

/* The subset of open flags that describes the cache mode */
#define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
247 
248 
/*
 * Option names of options parsed by the block layer.
 * These are the literal key strings; dotted names such as "cache.direct"
 * are spelled out in full here.
 */

#define BDRV_OPT_CACHE_WB       "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
#define BDRV_OPT_READ_ONLY      "read-only"
#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
#define BDRV_OPT_DISCARD        "discard"
#define BDRV_OPT_FORCE_SHARE    "force-share"
258 
259 
/*
 * Sector size used by the block layer: 1 << 9 = 512 bytes.
 * BDRV_SECTOR_SIZE has type unsigned long long, so arithmetic involving it
 * is carried out in (at least) 64-bit unsigned.
 */
#define BDRV_SECTOR_BITS   9
#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
262 
/*
 * Test the most significant bit (bit 63) of the write pointer value wp.
 * The result is non-zero if the bit is set, which marks a conventional
 * zone; if it is zero, then the zone type is SWR.
 *
 * The result is the masked 64-bit value rather than 0/1, so it must not be
 * stored in an integer type narrower than 64 bits (bool is fine).
 *
 * The argument is parenthesized so that arguments built from operators with
 * lower precedence than '&' (e.g. "a | b" or "c ? x : y") are evaluated as a
 * whole before the mask is applied.
 */
#define BDRV_ZT_IS_CONV(wp)    ((wp) & (1ULL << 63))
268 
/*
 * Upper bound for a single request: the smaller of SIZE_MAX and INT_MAX,
 * expressed in sectors, so that the byte count fits both size_t and int.
 * BDRV_REQUEST_MAX_BYTES is that sector count converted back to bytes and
 * is therefore a multiple of the sector size.
 */
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                           INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
272 
/*
 * We want to allow aligning requests and disk length up to any 32bit
 * alignment without being afraid of overflow.
 * To achieve this, and at the same time use a reasonably round number as the
 * maximum disk size, the maximum "length" (a limit for any offset/bytes
 * request and for disk size) is defined as INT64_MAX aligned down to
 * BDRV_MAX_ALIGNMENT.
 *
 * NOTE(review): this comment used to describe the limit as "the greatest
 * power of 2 less than INT64_MAX"; assuming QEMU_ALIGN_DOWN rounds down to a
 * multiple of its second argument, the result is 2^63 - 2^30, which is not a
 * power of two -- confirm.
 */
#define BDRV_MAX_ALIGNMENT (1L << 30)
#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
282 
283 /*
284  * Allocation status flags for bdrv_block_status() and friends.
285  *
286  * Public flags:
287  * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
288  * BDRV_BLOCK_ZERO: offset reads as zero
289  * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
290  * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
291  *                       layer rather than any backing, set by block layer
292  * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
293  *                 layer, set by block layer
294  * BDRV_BLOCK_COMPRESSED: the underlying data is compressed; only valid for
295  *                        the formats supporting compression: qcow, qcow2
296  *
297  * Internal flags:
298  * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
299  *                 that the block layer recompute the answer from the returned
300  *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
301  * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
302  *                     zeroes in file child of current block node inside
303  *                     returned region. Only valid together with both
304  *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
305  *                     appear with BDRV_BLOCK_ZERO.
306  *
307  * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
308  * host offset within the returned BDS that is allocated for the
309  * corresponding raw guest data.  However, whether that offset
310  * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
311  *
312  * DATA ZERO OFFSET_VALID
313  *  t    t        t       sectors read as zero, returned file is zero at offset
314  *  t    f        t       sectors read as valid from file at offset
315  *  f    t        t       sectors preallocated, read as zero, returned file not
316  *                        necessarily zero at offset
317  *  f    f        t       sectors preallocated but read from backing_hd,
318  *                        returned file contains garbage at offset
319  *  t    t        f       sectors preallocated, read as zero, unknown offset
320  *  t    f        f       sectors read from unknown file or offset
321  *  f    t        f       not allocated or unknown offset, read as zero
322  *  f    f        f       not allocated or unknown offset, read from backing_hd
323  */
/* One bit per flag (0x01..0x80); the flags can be OR-ed together. */
#define BDRV_BLOCK_DATA         0x01
#define BDRV_BLOCK_ZERO         0x02
#define BDRV_BLOCK_OFFSET_VALID 0x04
#define BDRV_BLOCK_RAW          0x08
#define BDRV_BLOCK_ALLOCATED    0x10
#define BDRV_BLOCK_EOF          0x20
#define BDRV_BLOCK_RECURSE      0x40
#define BDRV_BLOCK_COMPRESSED   0x80
332 
/* Tail queue head type whose elements are BlockReopenQueueEntry structures. */
typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
334 
/*
 * State carried for one node while it is being reopened.
 * NOTE(review): ownership of the pointer members is not visible from this
 * header -- check the reopen code before freeing or retaining them.
 */
typedef struct BDRVReopenState {
    BlockDriverState *bs;   /* the node this state refers to */
    int flags;              /* presumably BDRV_O_* open flags -- confirm */
    BlockdevDetectZeroesOptions detect_zeroes;
    bool backing_missing;
    BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
    BlockDriverState *old_file_bs; /* keep pointer for permissions update */
    QDict *options;
    QDict *explicit_options;
    void *opaque;           /* untyped pointer; meaning is set by the user */
} BDRVReopenState;
346 
/*
 * Block operation types.
 *
 * Enumerators are numbered consecutively from zero.  BLOCK_OP_TYPE_MAX is
 * not an operation; it is the number of operation types and must stay last.
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE = 0,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,

    /* Number of operation types above */
    BLOCK_OP_TYPE_MAX,
} BlockOpType;
369 
370 /* Block node permission constants */
enum {
    /**
     * Consistent reads: a user holding this "permission" is guaranteed that
     * their view of the contents of the block device is complete and
     * self-consistent, representing the contents of a disk at a specific
     * point.
     *
     * This holds for most block devices (including their backing files),
     * but the property cannot be maintained in a few situations, e.g. for
     * intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ    = 1 << 0,

    /** Required to change the visible disk contents. */
    BLK_PERM_WRITE              = 1 << 1,

    /**
     * Weaker than BLK_PERM_WRITE: both enough and required for writes to the
     * block node when the caller promises that the visible disk content
     * doesn't change.
     *
     * Because BLK_PERM_WRITE is strictly stronger, either permission is
     * sufficient to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED    = 1 << 2,

    /** Required to change the size of a block node. */
    BLK_PERM_RESIZE             = 1 << 3,

    /**
     * Bit 4 (0x10) is deliberately left unused.  It belonged to the
     * now-removed BLK_PERM_GRAPH_MOD; QEMU 6.1 and earlier may still lock the
     * corresponding byte in block/file-posix locking.  Any new permission
     * must be added very carefully so as not to interfere with this old
     * unused bit.
     */

    /** All permission bits defined above (bits 0..3). */
    BLK_PERM_ALL                = (1 << 4) - 1,

    DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ |
                                  BLK_PERM_WRITE |
                                  BLK_PERM_WRITE_UNCHANGED |
                                  BLK_PERM_RESIZE,

    /**
     * Everything in BLK_PERM_ALL that is not passed through.  With the
     * current values DEFAULT_PERM_PASSTHROUGH equals BLK_PERM_ALL, so this
     * evaluates to 0.
     */
    DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};
416 
417 /*
418  * Flags that parent nodes assign to child nodes to specify what kind of
419  * role(s) they take.
420  *
421  * At least one of DATA, METADATA, FILTERED, or COW must be set for
422  * every child.
423  *
424  *
425  * = Connection with bs->children, bs->file and bs->backing fields =
426  *
427  * 1. Filters
428  *
429  * Filter drivers have drv->is_filter = true.
430  *
431  * Filter node has exactly one FILTERED|PRIMARY child, and may have other
432  * children which must not have these bits (one example is the
433  * copy-before-write filter, which also has its target DATA child).
434  *
435  * Filter nodes never have COW children.
436  *
 * For most filters, the filtered child is linked in bs->file, bs->backing is
 * NULL.  For some filters (as an exception), it is the other way around; those
 * drivers will have drv->filtered_child_is_backing set to true (see that
 * field's documentation for what drivers this concerns).
441  *
442  * 2. "raw" driver (block/raw-format.c)
443  *
444  * Formally it's not a filter (drv->is_filter = false)
445  *
446  * bs->backing is always NULL
447  *
448  * Only has one child, linked in bs->file. Its role is either FILTERED|PRIMARY
449  * (like filter) or DATA|PRIMARY depending on options.
450  *
451  * 3. Other drivers
452  *
453  * Don't have any FILTERED children.
454  *
455  * May have at most one COW child. In this case it's linked in bs->backing.
456  * Otherwise bs->backing is NULL. COW child is never PRIMARY.
457  *
458  * May have at most one PRIMARY child. In this case it's linked in bs->file.
459  * Otherwise bs->file is NULL.
460  *
461  * May also have some other children that don't have the PRIMARY or COW bit set.
462  */
enum BdrvChildRoleBits {
    /*
     * The child stores data.
     * A node may have any number of such children.
     */
    BDRV_CHILD_DATA         = 0x01,

    /*
     * The child stores metadata.
     * A node may have any number of metadata-storing children.
     */
    BDRV_CHILD_METADATA     = 0x02,

    /*
     * The child always presents exactly the same visible data as the
     * parent, e.g. because the parent forwards all reads and writes.
     * Mutually exclusive with DATA, METADATA, and COW.
     * A node may have at most one filtered child at a time.
     */
    BDRV_CHILD_FILTERED     = 0x04,

    /*
     * The backing child: all data that isn't allocated in the parent is
     * read from it; such data is copied to the parent through COW (and
     * optionally COR).
     * This flag is mutually exclusive with DATA, METADATA, and FILTERED.
     * A node may have at most one such backing child at a time.
     */
    BDRV_CHILD_COW          = 0x08,

    /*
     * The primary child.  For most drivers, this is the child whose
     * filename applies best to the parent node.
     * A node may have at most one primary child at a time.
     */
    BDRV_CHILD_PRIMARY      = 0x10,

    /* Useful combination of flags: DATA, METADATA and PRIMARY together */
    BDRV_CHILD_IMAGE        = BDRV_CHILD_DATA |
                              BDRV_CHILD_METADATA |
                              BDRV_CHILD_PRIMARY,
};
508 
/*
 * Mask of BdrvChildRoleBits values.
 * A plain unsigned integer is used instead of the enum type because a role
 * is a bitwise OR of several enumerators.
 */
typedef unsigned int BdrvChildRole;
511 
/*
 * Counters reported by an image check.
 * NOTE(review): exact semantics of each counter are defined by the code
 * that fills the structure in -- confirm there.
 */
typedef struct BdrvCheckResult {
    int corruptions;
    int leaks;
    int check_errors;
    int corruptions_fixed;
    int leaks_fixed;
    int64_t image_end_offset;
    BlockFragInfo bfi;      /* cluster allocation/fragmentation counters */
} BdrvCheckResult;
521 
/*
 * Repair modes for an image check.  The two values occupy distinct bits and
 * can therefore be combined with bitwise OR.
 */
typedef enum {
    BDRV_FIX_LEAKS    = 1 << 0,
    BDRV_FIX_ERRORS   = 1 << 1,
} BdrvCheckMode;
526 
/* Pair of block sizes for a device. */
typedef struct BlockSizes {
    uint32_t phys;  /* presumably the physical block size -- confirm unit */
    uint32_t log;   /* presumably the logical block size -- confirm unit */
} BlockSizes;
531 
/* Disk geometry expressed as head, sector and cylinder counts. */
typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;
537 
538 /*
539  * Common functions that are neither I/O nor Global State.
540  *
541  * These functions must never call any function from other categories
542  * (I/O, "I/O or GS", Global State) except this one, but can be invoked by
543  * all of them.
544  */
545 
/*
 * Permission helpers.
 * NOTE(review): bdrv_perm_names() returns a non-const char *, which suggests
 * the caller owns the string -- confirm against the definition.
 */
char *bdrv_perm_names(uint64_t perm);
/* Maps a QAPI BlockPermission value to a permission bit mask. */
uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);
548 
/*
 * Driver whitelist handling.
 * NOTE(review): bdrv_is_whitelisted() returns int rather than bool; check
 * the definition for the meaning of its return values.
 */
void bdrv_init_with_whitelist(void);
bool bdrv_uses_whitelist(void);
int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);
552 
/*
 * Parsers for textual mode strings.  Each takes the mode string and a
 * non-const flags pointer (plus writethrough for the cache mode).
 * NOTE(review): presumably the results are stored through the pointers as
 * BDRV_O_* bits and the int return signals success or failure -- confirm.
 */
int bdrv_parse_aio(const char *mode, int *flags);
int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
int bdrv_parse_discard_flags(const char *mode, int *flags);
556 
/*
 * Path helpers.
 * NOTE(review): path_combine() returns a non-const char *, which suggests a
 * newly allocated string owned by the caller -- confirm.
 */
int path_has_protocol(const char *path);
int path_is_absolute(const char *path);
char *path_combine(const char *base_path, const char *filename);
560 
/*
 * Computes a backing file name from the name of the backed image and the
 * backing file name; errors are reported through errp.
 * NOTE(review): the non-const return type suggests the caller owns the
 * returned string -- confirm, along with the return value on error.
 */
char *bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                   const char *backing,
                                                   Error **errp);
564 
565 #endif /* BLOCK_COMMON_H */
566