xref: /openbmc/linux/fs/btrfs/extent_io.h (revision 367e5927)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 
3 #ifndef BTRFS_EXTENT_IO_H
4 #define BTRFS_EXTENT_IO_H
5 
6 #include <linux/rbtree.h>
7 #include <linux/refcount.h>
8 #include "ulist.h"
9 
/*
 * Bits for the extent state.  These are the values passed as the "bits"
 * arguments of the set/clear/test helpers declared in this header
 * (presumably stored in extent_state::state -- confirm in extent_io.c).
 *
 * Bits 7 and 8 are not assigned in this list.
 */
#define EXTENT_DIRTY		(1U << 0)
#define EXTENT_WRITEBACK	(1U << 1)
#define EXTENT_UPTODATE		(1U << 2)
#define EXTENT_LOCKED		(1U << 3)
#define EXTENT_NEW		(1U << 4)
#define EXTENT_DELALLOC		(1U << 5)
#define EXTENT_DEFRAG		(1U << 6)
#define EXTENT_BOUNDARY		(1U << 9)
#define EXTENT_NODATASUM	(1U << 10)
#define EXTENT_CLEAR_META_RESV	(1U << 11)
#define EXTENT_NEED_WAIT	(1U << 12)
#define EXTENT_DAMAGED		(1U << 13)
#define EXTENT_NORESERVE	(1U << 14)
#define EXTENT_QGROUP_RESERVED	(1U << 15)
#define EXTENT_CLEAR_DATA_RESV	(1U << 16)
#define EXTENT_DELALLOC_NEW	(1U << 17)
/* EXTENT_LOCKED and EXTENT_WRITEBACK combined */
#define EXTENT_IOBITS		(EXTENT_LOCKED | EXTENT_WRITEBACK)
/* EXTENT_CLEAR_META_RESV and EXTENT_CLEAR_DATA_RESV combined */
#define EXTENT_DO_ACCOUNTING    (EXTENT_CLEAR_META_RESV | \
				 EXTENT_CLEAR_DATA_RESV)
/* Currently identical to EXTENT_DO_ACCOUNTING */
#define EXTENT_CTLBITS		(EXTENT_DO_ACCOUNTING)
31 
/*
 * Flags for bio submission.  The bits from EXTENT_BIO_FLAG_SHIFT upwards
 * carry the compression type for this bio (written by
 * extent_set_compress_type() and read back by extent_compress_type());
 * the bits below the shift hold plain flags such as EXTENT_BIO_COMPRESSED.
 */
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16
38 
/*
 * Bit numbers (not masks) for extent buffer flags.  extent_buffer_uptodate()
 * tests EXTENT_BUFFER_UPTODATE in extent_buffer::bflags with test_bit(); the
 * remaining values are presumably kept in the same word -- confirm against
 * the users in extent_io.c.
 *
 * The values are positional, so new entries must not be inserted in a way
 * that renumbers existing ones unless every user is rebuilt.
 */
enum {
	EXTENT_BUFFER_UPTODATE,
	EXTENT_BUFFER_DIRTY,
	EXTENT_BUFFER_CORRUPT,
	/* this got triggered by readahead */
	EXTENT_BUFFER_READAHEAD,
	EXTENT_BUFFER_TREE_REF,
	EXTENT_BUFFER_STALE,
	EXTENT_BUFFER_WRITEBACK,
	/* read IO error */
	EXTENT_BUFFER_READ_ERR,
	EXTENT_BUFFER_UNMAPPED,
	EXTENT_BUFFER_IN_TREE,
	/* write IO error */
	EXTENT_BUFFER_WRITE_ERR,
};
55 
/*
 * These are flags for __process_pages_contig.  They are bit masks and may be
 * OR-ed together (presumably also the values accepted by the page_ops
 * argument of extent_clear_unlock_delalloc() -- confirm with the caller).
 */
#define PAGE_UNLOCK		(1 << 0)
#define PAGE_CLEAR_DIRTY	(1 << 1)
#define PAGE_SET_WRITEBACK	(1 << 2)
#define PAGE_END_WRITEBACK	(1 << 3)
#define PAGE_SET_PRIVATE2	(1 << 4)
#define PAGE_SET_ERROR		(1 << 5)
#define PAGE_LOCK		(1 << 6)

/*
 * page->private values.  Every page that is controlled by the extent
 * map has page->private set to one.
 */
#define EXTENT_PAGE_PRIVATE 1
70 
/*
 * The extent buffer bitmap operations are done with byte granularity instead of
 * word granularity for two reasons:
 * 1. The bitmaps must be little-endian on disk.
 * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a
 *    single word in a bitmap may straddle two pages in the extent buffer.
 *
 * BIT_BYTE(nr):                 index of the byte that holds bit number nr.
 * BYTE_MASK:                    all bits of one byte set.
 * BITMAP_FIRST_BYTE_MASK(start): bits from (start % BITS_PER_BYTE) up to the
 *                               top bit of a byte.
 * BITMAP_LAST_BYTE_MASK(nbits): the low (nbits % BITS_PER_BYTE) bits of a
 *                               byte, or the whole byte when nbits is a
 *                               multiple of BITS_PER_BYTE.
 *
 * Examples, assuming BITS_PER_BYTE is 8 (it is defined outside this header):
 *   BITMAP_FIRST_BYTE_MASK(3) == 0xf8, BITMAP_LAST_BYTE_MASK(3) == 0x07,
 *   BITMAP_LAST_BYTE_MASK(8)  == 0xff.
 */
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
	((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
	(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
84 
/* Forward declarations: only pointers to these types are needed below */
struct extent_state;
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct io_failure_record;

/*
 * Signature of the bio submission hook stored in
 * extent_io_ops::submit_bio_hook.  Returns a blk_status_t.
 */
typedef	blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
				       int mirror_num, unsigned long bio_flags,
				       u64 bio_offset);

/*
 * Second bio callback signature.  It is not referenced anywhere else in this
 * header; its users live in other files.
 */
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
		struct bio *bio, u64 bio_offset);
97 
/*
 * Table of callbacks attached to an extent_io_tree through
 * extent_io_tree::ops.
 */
struct extent_io_ops {
	/*
	 * The following callbacks must be always defined, the function
	 * pointer will be called unconditionally.
	 */
	/* bio submission hook, see extent_submit_bio_hook_t */
	extent_submit_bio_hook_t *submit_bio_hook;
	/*
	 * NOTE(review): judging by the name this runs when a page read
	 * finishes; the return value convention is not visible here.
	 */
	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
				    struct page *page, u64 start, u64 end,
				    int mirror);
};
108 
/*
 * A tree of extent states.  Set up with extent_io_tree_init() and operated
 * on by the set/clear/test/lock helpers declared in this header.
 */
struct extent_io_tree {
	/* rb-tree root (presumably of extent_state nodes via their rb_node) */
	struct rb_root state;
	/* opaque owner pointer; extent_io_tree_init() takes the same name */
	void *private_data;
	u64 dirty_bytes;
	int track_uptodate;
	/* NOTE(review): presumably guards the tree -- confirm in extent_io.c */
	spinlock_t lock;
	/* callback table, see struct extent_io_ops */
	const struct extent_io_ops *ops;
};
117 
/*
 * State record for one range.  Reference counted through @refs; released
 * with free_extent_state().
 */
struct extent_state {
	u64 start;
	u64 end; /* inclusive */
	struct rb_node rb_node;

	/* ADD NEW ELEMENTS AFTER THIS */
	wait_queue_head_t wq;
	refcount_t refs;
	/* presumably the EXTENT_* bits set on this range -- confirm */
	unsigned state;

	struct io_failure_record *failrec;

#ifdef CONFIG_BTRFS_DEBUG
	/* only present in debug builds */
	struct list_head leak_list;
#endif
};
134 
/* Number of page pointers embedded directly in struct extent_buffer */
#define INLINE_EXTENT_BUFFER_PAGES 16
/* Largest buffer, in bytes, that the inline page array can back */
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
	/* byte offset and byte length; num_extent_pages() derives pages from them */
	u64 start;
	unsigned long len;
	/* flag word tested with the EXTENT_BUFFER_* bit numbers */
	unsigned long bflags;
	struct btrfs_fs_info *fs_info;
	spinlock_t refs_lock;
	/* reference count; extent_buffer_get() increments it */
	atomic_t refs;
	atomic_t io_pages;
	int read_mirror;
	struct rcu_head rcu_head;
	pid_t lock_owner;

	/*
	 * Lock accounting counters for the extent buffer.
	 * NOTE(review): the exact meaning of each counter is defined by the
	 * locking code, which is not part of this header.
	 */
	atomic_t write_locks;
	atomic_t read_locks;
	atomic_t blocking_writers;
	atomic_t blocking_readers;
	atomic_t spinning_readers;
	atomic_t spinning_writers;
	short lock_nested;
	/* >= 0 if eb belongs to a log tree, -1 otherwise */
	short log_index;

	/* protects write locks */
	rwlock_t lock;

	/* readers use write_lock_wq while they wait for the write
	 * lock holders to unlock
	 */
	wait_queue_head_t write_lock_wq;

	/* writers use read_lock_wq while they wait for readers
	 * to unlock
	 */
	wait_queue_head_t read_lock_wq;
	/* page pointers backing the buffer, at most INLINE_EXTENT_BUFFER_PAGES */
	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
	/* only present in debug builds */
	struct list_head leak_list;
#endif
};
177 
/*
 * Structure to record how many bytes and which ranges are set/cleared.
 *
 * Initialise with extent_changeset_init() or obtain a heap copy from
 * extent_changeset_alloc(); tear down with extent_changeset_release() or
 * extent_changeset_free() respectively.
 */
struct extent_changeset {
	/*
	 * How many bytes are set/cleared in this operation.
	 * NOTE(review): this is a 32-bit counter while ranges are expressed
	 * in u64 -- confirm callers never accumulate 4 GiB or more.
	 */
	unsigned int bytes_changed;

	/* Changed ranges */
	struct ulist range_changed;
};
188 
189 static inline void extent_changeset_init(struct extent_changeset *changeset)
190 {
191 	changeset->bytes_changed = 0;
192 	ulist_init(&changeset->range_changed);
193 }
194 
195 static inline struct extent_changeset *extent_changeset_alloc(void)
196 {
197 	struct extent_changeset *ret;
198 
199 	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
200 	if (!ret)
201 		return NULL;
202 
203 	extent_changeset_init(ret);
204 	return ret;
205 }
206 
207 static inline void extent_changeset_release(struct extent_changeset *changeset)
208 {
209 	if (!changeset)
210 		return;
211 	changeset->bytes_changed = 0;
212 	ulist_release(&changeset->range_changed);
213 }
214 
/*
 * Release the contents of a changeset and then kfree() the structure
 * itself.  A NULL @changeset is accepted and ignored.
 */
static inline void extent_changeset_free(struct extent_changeset *changeset)
{
	if (changeset) {
		extent_changeset_release(changeset);
		kfree(changeset);
	}
}
222 
223 static inline void extent_set_compress_type(unsigned long *bio_flags,
224 					    int compress_type)
225 {
226 	*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
227 }
228 
229 static inline int extent_compress_type(unsigned long bio_flags)
230 {
231 	return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
232 }
233 
struct extent_map_tree;

/*
 * Callback type returning a struct extent_map pointer for a range of an
 * inode; extent_read_full_page() takes a pointer to such a function.
 * NOTE(review): the meaning of the individual parameters is only implied
 * by their names -- confirm against an implementation.
 */
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
					  struct page *page,
					  size_t pg_offset,
					  u64 start, u64 len,
					  int create);

/* Tree setup, page release and range locking; defined outside this header */
void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
/* lock_extent() below is the variant without a cached state */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached);
247 
248 static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
249 {
250 	return lock_extent_bits(tree, start, end, NULL);
251 }
252 
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
			  get_extent_t *get_extent, int mirror_num);
/* Subsystem init/exit entry points (annotated __init / __cold) */
int __init extent_io_init(void);
void __cold extent_io_exit(void);

u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end,
		     u64 max_bytes, unsigned bits, int contig);

void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, int filled,
		   struct extent_state *cached_state);
/*
 * Bit clearing entry points.  The inline helpers in this header call
 * clear_extent_bit() or, when they need a specific gfp mask,
 * __clear_extent_bit(), which additionally accepts a changeset.
 */
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		unsigned bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned bits, int wake, int delete,
		     struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned bits, int wake, int delete,
		     struct extent_state **cached, gfp_t mask,
		     struct extent_changeset *changeset);
276 
277 static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
278 {
279 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
280 }
281 
282 static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
283 		u64 end, struct extent_state **cached)
284 {
285 	return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
286 				GFP_NOFS, NULL);
287 }
288 
289 static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
290 		u64 start, u64 end, struct extent_state **cached)
291 {
292 	return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
293 				GFP_ATOMIC, NULL);
294 }
295 
296 static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
297 		u64 end, unsigned bits)
298 {
299 	int wake = 0;
300 
301 	if (bits & EXTENT_LOCKED)
302 		wake = 1;
303 
304 	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
305 }
306 
/*
 * Bit setting entry points, defined outside this header.  The set_extent_*()
 * inline helpers in this header are thin wrappers around set_extent_bit().
 */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask);
312 
313 static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
314 		u64 end, unsigned bits)
315 {
316 	return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
317 }
318 
319 static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
320 		u64 end, struct extent_state **cached_state)
321 {
322 	return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
323 				cached_state, GFP_NOFS, NULL);
324 }
325 
326 static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
327 		u64 end, gfp_t mask)
328 {
329 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
330 			      NULL, mask);
331 }
332 
333 static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
334 				     u64 end, struct extent_state **cached)
335 {
336 	return clear_extent_bit(tree, start, end,
337 				EXTENT_DIRTY | EXTENT_DELALLOC |
338 				EXTENT_DO_ACCOUNTING, 0, 0, cached);
339 }
340 
/*
 * Defined outside this header.  Takes a set of bits and a separate set of
 * clear_bits for the same range (NOTE(review): presumably sets the former
 * and clears the latter in one pass -- confirm in extent_io.c).
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, unsigned clear_bits,
		       struct extent_state **cached_state);
344 
345 static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
346 				      u64 end, unsigned int extra_bits,
347 				      struct extent_state **cached_state)
348 {
349 	return set_extent_bit(tree, start, end,
350 			      EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
351 			      NULL, cached_state, GFP_NOFS);
352 }
353 
354 static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
355 		u64 end, struct extent_state **cached_state)
356 {
357 	return set_extent_bit(tree, start, end,
358 			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
359 			      NULL, cached_state, GFP_NOFS);
360 }
361 
362 static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
363 		u64 end)
364 {
365 	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
366 			GFP_NOFS);
367 }
368 
369 static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
370 		u64 end, struct extent_state **cached_state, gfp_t mask)
371 {
372 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
373 			      cached_state, mask);
374 }
375 
/* Range search, page and writeback entry points; defined outside this header */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned bits,
			  struct extent_state **cached_state);
int extent_invalidatepage(struct extent_io_tree *tree,
			  struct page *page, unsigned long offset);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
			      int mode);
int extent_writepages(struct address_space *mapping,
		      struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
			    struct writeback_control *wbc);
int extent_readpages(struct address_space *mapping, struct list_head *pages,
		     unsigned nr_pages);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len);
void set_page_extent_mapped(struct page *page);

/* Extent buffer allocation, lookup and release */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start);
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						  u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
					 u64 start);
void free_extent_buffer(struct extent_buffer *eb);
void free_extent_buffer_stale(struct extent_buffer *eb);
/*
 * NOTE(review): presumably the accepted values of the "wait" argument of
 * read_extent_buffer_pages() -- confirm in extent_io.c.
 */
#define WAIT_NONE	0
#define WAIT_COMPLETE	1
#define WAIT_PAGE_LOCK	2
int read_extent_buffer_pages(struct extent_io_tree *tree,
			     struct extent_buffer *eb, int wait,
			     int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
412 
413 static inline int num_extent_pages(const struct extent_buffer *eb)
414 {
415 	return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) -
416 	       (eb->start >> PAGE_SHIFT);
417 }
418 
419 static inline void extent_buffer_get(struct extent_buffer *eb)
420 {
421 	atomic_inc(&eb->refs);
422 }
423 
424 static inline int extent_buffer_uptodate(struct extent_buffer *eb)
425 {
426 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
427 }
428 
/*
 * Accessors for the contents of an extent buffer; all defined outside this
 * header.  NOTE(review): start/offset/len arguments are presumably byte
 * offsets relative to the beginning of the buffer -- confirm in extent_io.c.
 */
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
			 unsigned long start, unsigned long len);
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
			unsigned long start,
			unsigned long len);
int read_extent_buffer_to_user(const struct extent_buffer *eb,
			       void __user *dst, unsigned long start,
			       unsigned long len);
void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
		const void *src);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
			 unsigned long start, unsigned long len);
void copy_extent_buffer_full(struct extent_buffer *dst,
			     struct extent_buffer *src);
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len);
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len);
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len);
void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
			   unsigned long len);
/* Bitmap helpers operating inside an extent buffer */
int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
			   unsigned long pos);
void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
			      unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
				unsigned long pos, unsigned long len);
/* Extent buffer state changes */
void clear_extent_buffer_dirty(struct extent_buffer *eb);
bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(const struct extent_buffer *eb,
			      unsigned long offset, unsigned long min_len,
			      char **map, unsigned long *map_start,
			      unsigned long *map_len);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
				 u64 delalloc_end, struct page *locked_page,
				 unsigned bits_to_clear,
				 unsigned long page_ops);
/* bio allocation and cloning helpers */
struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);

/* Forward declarations (struct btrfs_inode is also declared earlier) */
struct btrfs_fs_info;
struct btrfs_inode;

/* IO failure repair entry points */
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
		      u64 length, u64 logical, struct page *page,
		      unsigned int pg_offset, int mirror_num);
int clean_io_failure(struct btrfs_fs_info *fs_info,
		     struct extent_io_tree *failure_tree,
		     struct extent_io_tree *io_tree, u64 start,
		     struct page *page, u64 ino, unsigned int pg_offset);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int mirror_num);
492 
/*
 * When IO fails, either with EIO or csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 *
 * A pointer to such a record is kept in extent_state::failrec.
 */
struct io_failure_record {
	struct page *page;
	u64 start;
	u64 len;
	u64 logical;
	/* same flag word layout as described at EXTENT_BIO_FLAG_SHIFT (presumably) */
	unsigned long bio_flags;
	int this_mirror;
	int failed_mirror;
	int in_validation;
};
511 
512 
/*
 * Management of io_failure_record objects and repair bios; all defined
 * outside this header.
 */
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
		u64 end);
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
				struct io_failure_record **failrec_ret);
bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
			    struct io_failure_record *failrec, int fail_mirror);
struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
				    struct io_failure_record *failrec,
				    struct page *page, int pg_offset, int icsum,
				    bio_end_io_t *endio_func, void *data);
int free_io_failure(struct extent_io_tree *failure_tree,
		    struct extent_io_tree *io_tree,
		    struct io_failure_record *rec);
/* Only declared when the btrfs sanity tests are built */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree,
			     struct page *locked_page, u64 *start,
			     u64 *end);
#endif
/* Note: declared unconditionally, outside the sanity-test #ifdef above */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					       u64 start);
533 
534 #endif
535