xref: /openbmc/qemu/include/block/block-common.h (revision 5242ef88)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #ifndef BLOCK_COMMON_H
25 #define BLOCK_COMMON_H
26 
27 #include "block/aio.h"
28 #include "block/aio-wait.h"
29 #include "qemu/iov.h"
30 #include "qemu/coroutine.h"
31 #include "block/accounting.h"
32 #include "block/dirty-bitmap.h"
33 #include "block/blockjob.h"
34 #include "qemu/hbitmap.h"
35 #include "qemu/transactions.h"
36 
/*
 * generated_co_wrapper
 *
 * Function specifier that expands to nothing; its only purpose is to mark
 * the functions to be generated by scripts/block-coroutine-wrapper.py
 *
 * Read more in docs/devel/block-coroutine-wrapper.rst
 */
#define generated_co_wrapper
46 
/*
 * block.c
 *
 * Forward declarations only: the struct bodies are not defined in this
 * header, so here these types can only be used through pointers.
 */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
51 
/* Information reported about a block driver instance. */
typedef struct BlockDriverInfo {
    /* in bytes, 0 if irrelevant */
    int cluster_size;
    /* offset at which the VM state can be saved (0 if not possible) */
    int64_t vm_state_offset;
    bool is_dirty;
    /* True if this block driver only supports compressed writes */
    bool needs_compressed_writes;
} BlockDriverInfo;
63 
/* Cluster counters; all four are unsigned 64-bit cluster counts. */
typedef struct BlockFragInfo {
    uint64_t allocated_clusters;
    uint64_t total_clusters;
    uint64_t fragmented_clusters;
    uint64_t compressed_clusters;
} BlockFragInfo;
70 
/*
 * Per-request flags. Each flag is a distinct bit so that flags can be OR-ed
 * together; BDRV_REQ_MASK covers all of them.
 *
 * Note that 0x8 is not assigned to any flag below, although it is included
 * in BDRV_REQ_MASK.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ       = 0x1,
    BDRV_REQ_ZERO_WRITE         = 0x2,

    /*
     * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate
     * that the block driver should unmap (discard) blocks if it is guaranteed
     * that the result will read back as zeroes. The flag is only passed to the
     * driver if the block device is opened with BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP          = 0x4,

    BDRV_REQ_FUA                = 0x10,
    BDRV_REQ_WRITE_COMPRESSED   = 0x20,

    /*
     * Signifies that this write request will not change the visible disk
     * content.
     */
    BDRV_REQ_WRITE_UNCHANGED    = 0x40,

    /*
     * Forces request serialisation. Use only with write requests.
     */
    BDRV_REQ_SERIALISING        = 0x80,

    /*
     * Execute the request only if the operation can be offloaded or otherwise
     * be executed efficiently, but return an error instead of using a slow
     * fallback.
     */
    BDRV_REQ_NO_FALLBACK        = 0x100,

    /*
     * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
     * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
     * filter is involved), in which case it signals that the COR operation
     * need not read the data into memory (qiov) but only ensure they are
     * copied to the top layer (i.e., that COR operation is done).
     */
    BDRV_REQ_PREFETCH           = 0x200,

    /*
     * If we need to wait for other requests, just fail immediately. Used
     * only together with BDRV_REQ_SERIALISING. Used only with requests aligned
     * to request_alignment (corresponding assertions are in block/io.c).
     */
    BDRV_REQ_NO_WAIT            = 0x400,

    /* Mask of valid flags */
    BDRV_REQ_MASK               = 0x7ff,
} BdrvRequestFlags;
123 
/*
 * BDRV_O_* open flags. Each flag is a distinct bit; 0x0040 is not assigned
 * to any flag in this list.
 */
#define BDRV_O_NO_SHARE     0x0001  /* don't share permissions */
#define BDRV_O_RDWR         0x0002
#define BDRV_O_RESIZE       0x0004  /* request permission for resizing the
                                       node */
#define BDRV_O_SNAPSHOT     0x0008  /* open the file read only and save
                                       writes in a snapshot */
#define BDRV_O_TEMPORARY    0x0010  /* delete the file after use */
#define BDRV_O_NOCACHE      0x0020  /* do not use the host page cache */
#define BDRV_O_NATIVE_AIO   0x0080  /* use native AIO instead of the
                                       thread pool */
#define BDRV_O_NO_BACKING   0x0100  /* don't open the backing file */
#define BDRV_O_NO_FLUSH     0x0200  /* disable flushing on this disk */
#define BDRV_O_COPY_ON_READ 0x0400  /* copy read backing sectors into image */
#define BDRV_O_INACTIVE     0x0800  /* consistency hint for migration
                                       handoff */
#define BDRV_O_CHECK        0x1000  /* open solely for consistency check */
#define BDRV_O_ALLOW_RDWR   0x2000  /* allow reopen to change from r/o to
                                       r/w */
#define BDRV_O_UNMAP        0x4000  /* execute guest UNMAP/TRIM operations */
#define BDRV_O_PROTOCOL     0x8000  /* if no block driver is explicitly given:
                                       select an appropriate protocol driver,
                                       ignoring the format layer */
#define BDRV_O_NO_IO        0x10000 /* don't initialize for I/O */
#define BDRV_O_AUTO_RDONLY  0x20000 /* degrade to read-only if opening
                                       read-write fails */
#define BDRV_O_IO_URING     0x40000 /* use io_uring instead of the thread
                                       pool */

/* The open flags that describe the cache mode */
#define BDRV_O_CACHE_MASK   (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
149 
150 
/* Names of the options that are parsed by the block layer itself */

#define BDRV_OPT_CACHE_WB       "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
#define BDRV_OPT_READ_ONLY      "read-only"
#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
#define BDRV_OPT_DISCARD        "discard"
#define BDRV_OPT_FORCE_SHARE    "force-share"
160 
161 
/* A sector is 1 << BDRV_SECTOR_BITS bytes, i.e. 512 bytes */
#define BDRV_SECTOR_BITS   9
#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)

/*
 * Largest request size, in sectors and in bytes: the smaller of SIZE_MAX and
 * INT_MAX, rounded down to a whole number of sectors.
 */
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                           INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
168 
/*
 * We want to allow aligning requests and the disk length up to any 32-bit
 * alignment without having to be afraid of overflow.
 * To achieve that, and at the same time use a reasonably round number as the
 * maximum disk size, the maximum "length" (a limit for any offset/bytes
 * request and for the disk size) is defined as INT64_MAX aligned down to
 * BDRV_MAX_ALIGNMENT.
 */
#define BDRV_MAX_ALIGNMENT (1L << 30)
#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
178 
/*
 * Allocation status flags for bdrv_block_status() and friends.
 *
 * Public flags:
 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
 * BDRV_BLOCK_ZERO: offset reads as zero
 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
 *                       layer rather than any backing, set by block layer
 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
 *                 layer, set by block layer
 *
 * Internal flags:
 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
 *                 that the block layer recompute the answer from the returned
 *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
 * BDRV_BLOCK_RECURSE: request that the block layer recursively search for
 *                     zeroes in the file child of the current block node
 *                     inside the returned region. Only valid together with
 *                     both BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID.
 *                     Should not appear with BDRV_BLOCK_ZERO.
 *
 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
 * host offset within the returned BDS that is allocated for the
 * corresponding raw guest data.  However, whether that offset
 * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
 *
 * DATA ZERO OFFSET_VALID
 *  t    t        t       sectors read as zero, returned file is zero at offset
 *  t    f        t       sectors read as valid from file at offset
 *  f    t        t       sectors preallocated, read as zero, returned file not
 *                        necessarily zero at offset
 *  f    f        t       sectors preallocated but read from backing_hd,
 *                        returned file contains garbage at offset
 *  t    t        f       sectors preallocated, read as zero, unknown offset
 *  t    f        f       sectors read from unknown file or offset
 *  f    t        f       not allocated or unknown offset, read as zero
 *  f    f        f       not allocated or unknown offset, read from backing_hd
 */
#define BDRV_BLOCK_DATA         0x01
#define BDRV_BLOCK_ZERO         0x02
#define BDRV_BLOCK_OFFSET_VALID 0x04
#define BDRV_BLOCK_RAW          0x08
#define BDRV_BLOCK_ALLOCATED    0x10
#define BDRV_BLOCK_EOF          0x20
#define BDRV_BLOCK_RECURSE      0x40
225 
226 typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
227 
228 typedef struct BDRVReopenState {
229     BlockDriverState *bs;
230     int flags;
231     BlockdevDetectZeroesOptions detect_zeroes;
232     bool backing_missing;
233     BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
234     BlockDriverState *old_file_bs; /* keep pointer for permissions update */
235     QDict *options;
236     QDict *explicit_options;
237     void *opaque;
238 } BDRVReopenState;
239 
/*
 * Block operation types
 *
 * The enumerators are numbered consecutively from 0. BLOCK_OP_TYPE_MAX is
 * the last one, so its value equals the number of operation types above it.
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,
    BLOCK_OP_TYPE_MAX,
} BlockOpType;
262 
/* Block node permission constants */
enum {
    /**
     * A user that has the "permission" of consistent reads is guaranteed that
     * their view of the contents of the block device is complete and
     * self-consistent, representing the contents of a disk at a specific
     * point.
     *
     * For most block devices (including their backing files) this is true, but
     * the property cannot be maintained in a few situations like for
     * intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ    = 0x01,

    /** This permission is required to change the visible disk contents. */
    BLK_PERM_WRITE              = 0x02,

    /**
     * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
     * required for writes to the block node when the caller promises that
     * the visible disk content doesn't change.
     *
     * As the BLK_PERM_WRITE permission is strictly stronger, either is
     * sufficient to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED    = 0x04,

    /** This permission is required to change the size of a block node. */
    BLK_PERM_RESIZE             = 0x08,

    /**
     * There used to be a bit BLK_PERM_GRAPH_MOD, with the value 0x10, which
     * has since been removed. QEMU 6.1 and earlier may still lock the
     * corresponding byte in block/file-posix locking. Any new permission must
     * therefore be introduced carefully so that it does not interfere with
     * this old, unused bit.
     */

    /** All permission bits defined above. */
    BLK_PERM_ALL                = 0x0f,

    DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ
                                 | BLK_PERM_WRITE
                                 | BLK_PERM_WRITE_UNCHANGED
                                 | BLK_PERM_RESIZE,

    /** Every permission in BLK_PERM_ALL that is not passed through. */
    DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};
309 
/*
 * Flags that parent nodes assign to child nodes to specify what kind of
 * role(s) they take.
 *
 * At least one of DATA, METADATA, FILTERED, or COW must be set for
 * every child.
 */
enum BdrvChildRoleBits {
    /*
     * This child stores data.
     * Any node may have an arbitrary number of such children.
     */
    BDRV_CHILD_DATA         = (1 << 0),

    /*
     * This child stores metadata.
     * Any node may have an arbitrary number of metadata-storing
     * children.
     */
    BDRV_CHILD_METADATA     = (1 << 1),

    /*
     * A child that always presents exactly the same visible data as
     * the parent, e.g. by virtue of the parent forwarding all reads
     * and writes.
     * This flag is mutually exclusive with DATA, METADATA, and COW.
     * Any node may have at most one filtered child at a time.
     */
    BDRV_CHILD_FILTERED     = (1 << 2),

    /*
     * Child from which to read all data that isn't allocated in the
     * parent (i.e., the backing child); such data is copied to the
     * parent through COW (and optionally COR).
     * This flag is mutually exclusive with DATA, METADATA, and
     * FILTERED.
     * Any node may have at most one such backing child at a time.
     */
    BDRV_CHILD_COW          = (1 << 3),

    /*
     * The primary child.  For most drivers, this is the child whose
     * filename applies best to the parent node.
     * Any node may have at most one primary child at a time.
     */
    BDRV_CHILD_PRIMARY      = (1 << 4),

    /* Useful combination of flags: DATA, METADATA and PRIMARY together */
    BDRV_CHILD_IMAGE        = BDRV_CHILD_DATA
                              | BDRV_CHILD_METADATA
                              | BDRV_CHILD_PRIMARY,
};
362 
/* Mask of BdrvChildRoleBits values, i.e. any OR-ed combination of them */
typedef unsigned int BdrvChildRole;
365 
366 typedef struct BdrvCheckResult {
367     int corruptions;
368     int leaks;
369     int check_errors;
370     int corruptions_fixed;
371     int leaks_fixed;
372     int64_t image_end_offset;
373     BlockFragInfo bfi;
374 } BdrvCheckResult;
375 
/*
 * Repair selection for a check. The two values are distinct bits (1 and 2).
 *
 * NOTE(review): presumably meant to be OR-ed together to request both kinds
 * of repair -- confirm against the callers.
 */
typedef enum {
    BDRV_FIX_LEAKS    = 1,
    BDRV_FIX_ERRORS   = 2,
} BdrvCheckMode;
380 
/*
 * A pair of 32-bit block sizes.
 *
 * NOTE(review): "phys" and "log" presumably stand for the physical and the
 * logical block size -- confirm against the users of this struct.
 */
typedef struct BlockSizes {
    uint32_t phys;
    uint32_t log;
} BlockSizes;
385 
/* Hard disk geometry: number of heads, sectors and cylinders */
typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;
391 
392 /*
393  * Common functions that are neither I/O nor Global State.
394  *
395  * These functions must never call any function from other categories
396  * (I/O, "I/O or GS", Global State) except this one, but can be invoked by
397  * all of them.
398  */
399 
400 char *bdrv_perm_names(uint64_t perm);
401 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);
402 
403 void bdrv_init_with_whitelist(void);
404 bool bdrv_uses_whitelist(void);
405 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);
406 
407 int bdrv_parse_aio(const char *mode, int *flags);
408 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
409 int bdrv_parse_discard_flags(const char *mode, int *flags);
410 
411 int path_has_protocol(const char *path);
412 int path_is_absolute(const char *path);
413 char *path_combine(const char *base_path, const char *filename);
414 
415 char *bdrv_get_full_backing_filename_from_filename(const char *backed,
416                                                    const char *backing,
417                                                    Error **errp);
418 
419 #endif /* BLOCK_COMMON_H */
420