xref: /openbmc/qemu/include/block/block-common.h (revision 7a5951f6)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #ifndef BLOCK_COMMON_H
25 #define BLOCK_COMMON_H
26 
27 #include "block/aio.h"
28 #include "block/aio-wait.h"
29 #include "qemu/iov.h"
30 #include "qemu/coroutine.h"
31 #include "block/accounting.h"
32 #include "qemu/hbitmap.h"
33 #include "qemu/transactions.h"
34 
/*
 * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py
 *
 * These specifiers expand to nothing; they only mark the functions whose
 * implementation is to be generated by scripts/block-coroutine-wrapper.py.
 *
 * Usage: read docs/devel/block-coroutine-wrapper.rst
 *
 * There are 4 kinds of specifiers:
 * - co_wrapper functions can be called only from non-coroutine context,
 *   because they always generate a new coroutine.
 * - co_wrapper_mixed functions can be called from both coroutine and
 *   non-coroutine context.
 * - co_wrapper_bdrv_rdlock functions are co_wrapper functions that also
 *   automatically take and release the graph rdlock when creating a new
 *   coroutine.
 * - co_wrapper_mixed_bdrv_rdlock functions are co_wrapper_mixed functions
 *   that also automatically take and release the graph rdlock when creating
 *   a new coroutine.
 */
#define co_wrapper
#define co_wrapper_mixed
#define co_wrapper_bdrv_rdlock
#define co_wrapper_mixed_bdrv_rdlock
58 
59 #include "block/dirty-bitmap.h"
60 #include "block/blockjob.h"
61 
/*
 * block.c
 *
 * Forward declarations: only the type names are introduced here, the
 * struct bodies are not part of this header.
 */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
66 
/*
 * BlockDriverInfo: record grouping the cluster size, the VM state offset and
 * two boolean properties.
 */
typedef struct BlockDriverInfo {
    /* Cluster size in bytes, 0 if irrelevant */
    int cluster_size;
    /* Offset at which the VM state can be saved (0 if not possible) */
    int64_t vm_state_offset;
    /* NOTE(review): not described in this header; meaning to be confirmed */
    bool is_dirty;
    /*
     * True if this block driver only supports compressed writes
     */
    bool needs_compressed_writes;
} BlockDriverInfo;
78 
/*
 * BlockFragInfo: four 64-bit cluster counters. Embedded in BdrvCheckResult
 * as the 'bfi' member.
 *
 * NOTE(review): the exact definition of each counter is not stated in this
 * header; the names suggest allocated / total / fragmented / compressed
 * cluster counts. Confirm against the code that fills them in.
 */
typedef struct BlockFragInfo {
    uint64_t allocated_clusters;
    uint64_t total_clusters;
    uint64_t fragmented_clusters;
    uint64_t compressed_clusters;
} BlockFragInfo;
85 
/*
 * Request flags. Every flag occupies exactly one bit, so flags can be OR-ed
 * together; BDRV_REQ_MASK is the union of all of them.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ     = 1 << 0,
    BDRV_REQ_ZERO_WRITE       = 1 << 1,

    /*
     * Set on write_zeroes requests to tell the block driver that it should
     * unmap (discard) the blocks, provided the result is guaranteed to read
     * back as zeroes. The driver only ever sees this flag when the block
     * device was opened with BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP        = 1 << 2,

    /*
     * Optimization hint: every QEMUIOVector element lies within a memory
     * range previously registered with bdrv_register_buf().
     *
     * Code that swaps the user's QEMUIOVector elements for bounce buffers
     * must take care to clear this flag.
     */
    BDRV_REQ_REGISTERED_BUF   = 1 << 3,

    BDRV_REQ_FUA              = 1 << 4,
    BDRV_REQ_WRITE_COMPRESSED = 1 << 5,

    /*
     * This write request will not change the visible disk content.
     */
    BDRV_REQ_WRITE_UNCHANGED  = 1 << 6,

    /*
     * Forces request serialisation. Use only with write requests.
     */
    BDRV_REQ_SERIALISING      = 1 << 7,

    /*
     * Execute the request only if it can be offloaded or otherwise carried
     * out efficiently; return an error instead of using a slow fallback.
     */
    BDRV_REQ_NO_FALLBACK      = 1 << 8,

    /*
     * Only meaningful in the context of copy-on-read (i.e. together with
     * BDRV_REQ_COPY_ON_READ, or when a COR filter is involved). It signals
     * that the COR operation need not read the data into memory (qiov); it
     * only has to ensure the data is copied to the top layer (i.e. that the
     * COR operation is done).
     */
    BDRV_REQ_PREFETCH         = 1 << 9,

    /*
     * Fail immediately instead of waiting for other requests. Used only
     * together with BDRV_REQ_SERIALISING, and only with requests aligned to
     * request_alignment (corresponding assertions are in block/io.c).
     */
    BDRV_REQ_NO_WAIT          = 1 << 10,

    /* Mask of valid flags (all eleven bits above, i.e. 0x7ff) */
    BDRV_REQ_MASK             = BDRV_REQ_COPY_ON_READ
                              | BDRV_REQ_ZERO_WRITE
                              | BDRV_REQ_MAY_UNMAP
                              | BDRV_REQ_REGISTERED_BUF
                              | BDRV_REQ_FUA
                              | BDRV_REQ_WRITE_COMPRESSED
                              | BDRV_REQ_WRITE_UNCHANGED
                              | BDRV_REQ_SERIALISING
                              | BDRV_REQ_NO_FALLBACK
                              | BDRV_REQ_PREFETCH
                              | BDRV_REQ_NO_WAIT,
} BdrvRequestFlags;
147 
/*
 * BDRV_O_* flags, one bit each. Bit 6 (0x0040) is not assigned to any
 * flag in this list.
 */

/* don't share permissions */
#define BDRV_O_NO_SHARE     (1 << 0)
#define BDRV_O_RDWR         (1 << 1)
/* request permission for resizing the node */
#define BDRV_O_RESIZE       (1 << 2)
/* open the file read only and save writes in a snapshot */
#define BDRV_O_SNAPSHOT     (1 << 3)
/* delete the file after use */
#define BDRV_O_TEMPORARY    (1 << 4)
/* do not use the host page cache */
#define BDRV_O_NOCACHE      (1 << 5)
/* use native AIO instead of the thread pool */
#define BDRV_O_NATIVE_AIO   (1 << 7)
/* don't open the backing file */
#define BDRV_O_NO_BACKING   (1 << 8)
/* disable flushing on this disk */
#define BDRV_O_NO_FLUSH     (1 << 9)
/* copy read backing sectors into image */
#define BDRV_O_COPY_ON_READ (1 << 10)
/* consistency hint for migration handoff */
#define BDRV_O_INACTIVE     (1 << 11)
/* open solely for consistency check */
#define BDRV_O_CHECK        (1 << 12)
/* allow reopen to change from r/o to r/w */
#define BDRV_O_ALLOW_RDWR   (1 << 13)
/* execute guest UNMAP/TRIM operations */
#define BDRV_O_UNMAP        (1 << 14)
/*
 * if no block driver is explicitly given: select an appropriate protocol
 * driver, ignoring the format layer
 */
#define BDRV_O_PROTOCOL     (1 << 15)
/* don't initialize for I/O */
#define BDRV_O_NO_IO        (1 << 16)
/* degrade to read-only if opening read-write fails */
#define BDRV_O_AUTO_RDONLY  (1 << 17)
/* use io_uring instead of the thread pool */
#define BDRV_O_IO_URING     (1 << 18)

/* The two cache-related flags combined */
#define BDRV_O_CACHE_MASK   (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
173 
174 
/*
 * Option names of options parsed by the block layer.
 *
 * Each macro expands to the option's key as a string literal.
 */

#define BDRV_OPT_CACHE_WB       "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
#define BDRV_OPT_READ_ONLY      "read-only"
#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
#define BDRV_OPT_DISCARD        "discard"
#define BDRV_OPT_FORCE_SHARE    "force-share"
184 
185 
/* A sector is 1 << BDRV_SECTOR_BITS = 512 units (unsigned long long value) */
#define BDRV_SECTOR_BITS   9
#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)

/*
 * Request size limits.
 *
 * The sector limit combines SIZE_MAX and INT_MAX, each shifted right by
 * BDRV_SECTOR_BITS, through MIN_CONST (defined outside this header;
 * presumably a compile-time minimum — TODO confirm). The byte limit is
 * that sector count shifted back left by BDRV_SECTOR_BITS.
 */
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                           INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
192 
/*
 * We want to allow aligning requests and disk lengths up to any 32-bit
 * alignment without having to be afraid of overflow.
 * To achieve this, and at the same time use a reasonably "pretty" number as
 * the maximum disk size, the maximum "length" (a limit for any offset/bytes
 * request and for the disk size) is defined as INT64_MAX aligned down to
 * BDRV_MAX_ALIGNMENT (1 << 30).
 *
 * NOTE(review): this comment used to describe the limit as "the greatest
 * power of 2 less than INT64_MAX"; the definition below is written as an
 * align-down of INT64_MAX instead, which (assuming QEMU_ALIGN_DOWN rounds
 * down to a multiple of its second argument) is not a power of two.
 */
#define BDRV_MAX_ALIGNMENT (1L << 30)
#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
202 
203 /*
204  * Allocation status flags for bdrv_block_status() and friends.
205  *
206  * Public flags:
207  * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
208  * BDRV_BLOCK_ZERO: offset reads as zero
209  * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
210  * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
211  *                       layer rather than any backing, set by block layer
212  * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
213  *                 layer, set by block layer
214  *
215  * Internal flags:
216  * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
217  *                 that the block layer recompute the answer from the returned
218  *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
219  * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
220  *                     zeroes in file child of current block node inside
221  *                     returned region. Only valid together with both
222  *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
223  *                     appear with BDRV_BLOCK_ZERO.
224  *
225  * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
226  * host offset within the returned BDS that is allocated for the
227  * corresponding raw guest data.  However, whether that offset
228  * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
229  *
230  * DATA ZERO OFFSET_VALID
231  *  t    t        t       sectors read as zero, returned file is zero at offset
232  *  t    f        t       sectors read as valid from file at offset
233  *  f    t        t       sectors preallocated, read as zero, returned file not
234  *                        necessarily zero at offset
235  *  f    f        t       sectors preallocated but read from backing_hd,
236  *                        returned file contains garbage at offset
237  *  t    t        f       sectors preallocated, read as zero, unknown offset
238  *  t    f        f       sectors read from unknown file or offset
239  *  f    t        f       not allocated or unknown offset, read as zero
240  *  f    f        f       not allocated or unknown offset, read from backing_hd
241  */
/* Public flags */
#define BDRV_BLOCK_DATA         (1 << 0)
#define BDRV_BLOCK_ZERO         (1 << 1)
#define BDRV_BLOCK_OFFSET_VALID (1 << 2)
/* Internal flag */
#define BDRV_BLOCK_RAW          (1 << 3)
/* Public flags, set by the block layer */
#define BDRV_BLOCK_ALLOCATED    (1 << 4)
#define BDRV_BLOCK_EOF          (1 << 5)
/* Internal flag */
#define BDRV_BLOCK_RECURSE      (1 << 6)
249 
/*
 * BlockReopenQueue: type produced by QTAILQ_HEAD for BlockReopenQueueEntry.
 * NOTE(review): QTAILQ_HEAD and BlockReopenQueueEntry are defined outside
 * this header; presumably this is the head of a tail queue of entries.
 */
typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
251 
/*
 * BDRVReopenState
 *
 * NOTE(review): apart from the two "old_*" pointers, the fields carry no
 * description in this header. The hedged notes below are inferred from
 * names and types only; confirm against the users of this struct.
 */
typedef struct BDRVReopenState {
    BlockDriverState *bs;             /* presumably the node being reopened */
    int flags;                        /* presumably BDRV_O_* flags - TODO confirm */
    BlockdevDetectZeroesOptions detect_zeroes;
    bool backing_missing;
    BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
    BlockDriverState *old_file_bs; /* keep pointer for permissions update */
    QDict *options;
    QDict *explicit_options;
    void *opaque;                     /* untyped pointer; owner/meaning not stated here */
} BDRVReopenState;
263 
/*
 * Block operation types.
 *
 * Values are consecutive starting at 0. BLOCK_OP_TYPE_MAX comes last and
 * therefore equals the number of operation types listed before it.
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE = 0,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,

    /* Not an operation: count of the entries above */
    BLOCK_OP_TYPE_MAX,
} BlockOpType;
286 
/* Block node permission constants (one bit per permission) */
enum {
    /**
     * A user holding the "consistent read" permission is guaranteed that
     * their view of the block device contents is complete and
     * self-consistent, representing the contents of a disk at a specific
     * point.
     *
     * This holds for most block devices (including their backing files),
     * but the property cannot be maintained in a few situations, such as
     * for intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ = 1 << 0,

    /** Required to change the visible disk contents. */
    BLK_PERM_WRITE           = 1 << 1,

    /**
     * Weaker than BLK_PERM_WRITE: both enough and required for writes to
     * the block node when the caller promises that the visible disk content
     * doesn't change.
     *
     * Because BLK_PERM_WRITE is strictly stronger, either permission is
     * sufficient to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED = 1 << 2,

    /** Required to change the size of a block node. */
    BLK_PERM_RESIZE          = 1 << 3,

    /**
     * Bit 4 (0x10) is deliberately left unused: it belonged to the
     * now-removed BLK_PERM_GRAPH_MOD. QEMU 6.1 and earlier may still lock
     * the corresponding byte in block/file-posix locking, so any new
     * permission must be careful not to interfere with this old unused
     * thing.
     */

    /* All four permission bits, i.e. 0x0f */
    BLK_PERM_ALL             = BLK_PERM_CONSISTENT_READ
                             | BLK_PERM_WRITE
                             | BLK_PERM_WRITE_UNCHANGED
                             | BLK_PERM_RESIZE,

    DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ
                             | BLK_PERM_WRITE
                             | BLK_PERM_WRITE_UNCHANGED
                             | BLK_PERM_RESIZE,

    /* Whatever BLK_PERM_ALL contains beyond the passthrough set */
    DEFAULT_PERM_UNCHANGED   = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};
333 
334 /*
335  * Flags that parent nodes assign to child nodes to specify what kind of
336  * role(s) they take.
337  *
338  * At least one of DATA, METADATA, FILTERED, or COW must be set for
339  * every child.
340  *
341  *
342  * = Connection with bs->children, bs->file and bs->backing fields =
343  *
344  * 1. Filters
345  *
346  * Filter drivers have drv->is_filter = true.
347  *
348  * Filter node has exactly one FILTERED|PRIMARY child, and may have other
349  * children which must not have these bits (one example is the
350  * copy-before-write filter, which also has its target DATA child).
351  *
352  * Filter nodes never have COW children.
353  *
354  * For most filters, the filtered child is linked in bs->file, bs->backing is
355  * NULL.  For some filters (as an exception), it is the other way around; those
356  * drivers will have drv->filtered_child_is_backing set to true (see that
357  * field’s documentation for what drivers this concerns)
358  *
359  * 2. "raw" driver (block/raw-format.c)
360  *
361  * Formally it's not a filter (drv->is_filter = false)
362  *
363  * bs->backing is always NULL
364  *
365  * Only has one child, linked in bs->file. Its role is either FILTERED|PRIMARY
366  * (like filter) or DATA|PRIMARY depending on options.
367  *
368  * 3. Other drivers
369  *
370  * Don't have any FILTERED children.
371  *
372  * May have at most one COW child. In this case it's linked in bs->backing.
373  * Otherwise bs->backing is NULL. COW child is never PRIMARY.
374  *
375  * May have at most one PRIMARY child. In this case it's linked in bs->file.
376  * Otherwise bs->file is NULL.
377  *
378  * May also have some other children that don't have the PRIMARY or COW bit set.
379  */
enum BdrvChildRoleBits {
    /*
     * The child stores data.
     * A node may have any number of such children.
     */
    BDRV_CHILD_DATA     = 0x01,

    /*
     * The child stores metadata.
     * A node may have any number of metadata-storing children.
     */
    BDRV_CHILD_METADATA = 0x02,

    /*
     * The child always presents exactly the same visible data as the
     * parent, e.g. because the parent forwards all reads and writes.
     * Mutually exclusive with DATA, METADATA, and COW.
     * A node may have at most one filtered child at a time.
     */
    BDRV_CHILD_FILTERED = 0x04,

    /*
     * The backing child: all data that isn't allocated in the parent is
     * read from it; such data is copied to the parent through COW (and
     * optionally COR).
     * Mutually exclusive with DATA, METADATA, and FILTERED.
     * A node may have at most one such backing child at a time.
     */
    BDRV_CHILD_COW      = 0x08,

    /*
     * The primary child. For most drivers, this is the child whose
     * filename applies best to the parent node.
     * A node may have at most one primary child at a time.
     */
    BDRV_CHILD_PRIMARY  = 0x10,

    /* Useful combination of flags: DATA + METADATA + PRIMARY */
    BDRV_CHILD_IMAGE    = BDRV_CHILD_DATA | BDRV_CHILD_METADATA
                        | BDRV_CHILD_PRIMARY,
};
425 
/*
 * Mask of BdrvChildRoleBits values: an unsigned integer holding an OR-ed
 * combination of the bits from that enum.
 */
typedef unsigned int BdrvChildRole;
428 
/*
 * BdrvCheckResult: a set of integer counters, an end offset and a
 * BlockFragInfo record.
 *
 * NOTE(review): the fields carry no description in this header; the names
 * suggest the outcome of an image check (problems found / fixed). Confirm
 * against the code that fills this struct in.
 */
typedef struct BdrvCheckResult {
    int corruptions;
    int leaks;
    int check_errors;
    int corruptions_fixed;
    int leaks_fixed;
    int64_t image_end_offset;
    BlockFragInfo bfi;          /* cluster counters, see BlockFragInfo */
} BdrvCheckResult;
438 
/* Two independent bits, so both modes can be OR-ed together. */
typedef enum {
    BDRV_FIX_LEAKS  = 1 << 0,
    BDRV_FIX_ERRORS = 1 << 1,
} BdrvCheckMode;
443 
/*
 * BlockSizes: a pair of 32-bit sizes.
 * NOTE(review): presumably 'phys' is the physical and 'log' the logical
 * block size; units are not stated in this header. TODO confirm.
 */
typedef struct BlockSizes {
    uint32_t phys;
    uint32_t log;
} BlockSizes;
448 
/*
 * HDGeometry: three 32-bit values named heads, sectors and cylinders.
 * NOTE(review): presumably a disk geometry (CHS) description; whether
 * 'sectors' means sectors per track is not stated here. TODO confirm.
 */
typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;
454 
/*
 * Common functions that are neither I/O nor Global State.
 *
 * These functions must never call any function from other categories
 * (I/O, "I/O or GS", Global State) except this one, but can be invoked by
 * all of them.
 *
 * NOTE(review): only the prototypes are visible in this header. The short
 * notes on each group below are inferred from names and signatures and
 * must be confirmed against the implementations, including who owns the
 * returned char * strings.
 */

/* Permission helpers: take a permission mask / a QAPI BlockPermission. */
char *bdrv_perm_names(uint64_t perm);
uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);

/* Driver whitelist helpers (presumably restrict which drivers are usable). */
void bdrv_init_with_whitelist(void);
bool bdrv_uses_whitelist(void);
int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);

/*
 * Parsers for a textual @mode; results are written through @flags (and
 * @writethrough). Presumably the int return signals success or failure.
 */
int bdrv_parse_aio(const char *mode, int *flags);
int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
int bdrv_parse_discard_flags(const char *mode, int *flags);

/* Path helpers; the int results are presumably used as booleans. */
int path_has_protocol(const char *path);
int path_is_absolute(const char *path);
char *path_combine(const char *base_path, const char *filename);

/* Takes two file names and an Error ** for error reporting. */
char *bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                   const char *backing,
                                                   Error **errp);
481 
482 #endif /* BLOCK_COMMON_H */
483