// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only 64K page size is supported for now
 *   This is to make metadata handling easier, as a 64K page ensures that
 *   every nodesize fits inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write is supported for now
 *   The data read-write part is still in development.
 *
 * - Metadata can't cross a 64K page boundary
 *   btrfs-progs and the kernel have enforced this for a while, thus only
 *   ancient filesystems could have such a problem.  For such a case, do a
 *   graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   This means reading one tree block will only trigger the read for the
 *   needed range; other unrelated ranges in the same page will not be touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *   0         16K         32K         48K        64K
 *   |/////////|           |///////////|
 *        \- Tree block A        \- Tree block B
 *
 *   Even if we only want to write back tree block A, we will also write back
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we can have multiple tree blocks inside one page, we can't rely on
 *   page locking anymore, or we would have greatly reduced concurrency or
 *   even deadlocks (holding one tree lock while trying to lock another tree
 *   block in the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */

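/*
 * Illustration (assuming a 4K sectorsize on a 64K page): the page covers 16
 * sectors, so each u16 bitmap in struct btrfs_subpage has one bit per sector:
 *
 *   Page offset   0    4K   8K   ...   60K   64K
 *   Bitmap bit    0    1    2    ...   15
 *
 * Bit n of e.g. uptodate_bitmap describes the sector starting at page offset
 * n * sectorsize; the matching full page flags are derived from these bitmaps
 * by the set/clear helpers below.
 */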
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage = NULL;
	int ret;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));
	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	ret = btrfs_alloc_subpage(fs_info, &subpage, type);
	if (ret < 0)
		return ret;
	attach_page_private(page, subpage);
	return 0;
}

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

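/*
 * Allocate and initialize a btrfs_subpage structure.
 *
 * For sectorsize == PAGE_SIZE this is a no-op.  Metadata pages only need the
 * eb_refs counter, while data pages track readers and writers instead.
 */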
int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
			struct btrfs_subpage **ret,
			enum btrfs_subpage_type type)
{
	if (fs_info->sectorsize == PAGE_SIZE)
		return 0;

	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!*ret)
		return -ENOMEM;
	spin_lock_init(&(*ret)->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&(*ret)->eb_refs, 0);
	} else {
		atomic_set(&(*ret)->readers, 0);
		atomic_set(&(*ret)->writers, 0);
	}
	return 0;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent a race with the freeing of
 * the last eb in the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're still
 * allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

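/*
 * Decrease the eb_refs of the current subpage.
 *
 * Like btrfs_page_inc_eb_refs(), this must be called with
 * page->mapping->private_lock held.
 */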
void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages, as we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

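/*
 * The readers counter tracks how many sectors of the page are still under
 * read.  btrfs_subpage_end_reader() drops the counter and, for data pages,
 * unlocks the page once the last reader is gone.
 */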
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * Please don't replace @last with an atomic_sub_and_test() call inside
	 * the if () condition, as we want the atomic_sub_and_test() to always
	 * be executed.
	 */
	if (is_data && last)
		unlock_page(page);
}

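/*
 * Clamp [@start, @start + @len) to the range covered by @page, for callers
 * whose range may extend beyond the page boundaries.
 */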
static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}

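/*
 * The writers counter tracks how many sectors are locked for delalloc
 * writeback.  No readers may be active when the writer lock is taken, and
 * btrfs_subpage_end_and_test_writer() returns true once the last locked
 * sector is released.
 */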
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and writer counter updated.
 *
 * Even with 0 returned, the page still needs an extra check to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}

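/*
 * Unlock a page locked by btrfs_page_start_writer_lock().
 *
 * For subpage cases the page is only unlocked once the last writer range in
 * it is finished.
 */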
void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond the u16 range. We make the
	 * first left shift be calculated in unsigned long (at least u32),
	 * then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}

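/*
 * Note the asymmetry of the uptodate helpers: the page flag is only set when
 * every sector in the page is uptodate, but it is cleared as soon as any
 * sector stops being uptodate.
 */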
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for subpage dirty bitmap.
 *
 * Clear the given range in the dirty_bitmap, and return true if the bitmap
 * is all zero afterwards (i.e. we cleared the last dirty bits).
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag for the true case,
 * as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

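/*
 * Writeback helpers: set_page_writeback() is called whenever a range starts
 * writeback, while end_page_writeback() is only called once the last sector
 * under writeback in the page finishes.
 */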
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0)
		end_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear, whose handling depends on each page flag, the test
 * helpers check all bits in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
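/*
 * For example, IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate) expands to
 * btrfs_subpage_test_uptodate(), which returns true only if every sector in
 * the [start, start + len) range has its uptodate bit set.
 */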
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);

/*
 * Note that in selftests (extent-io-tests), we can have a NULL fs_info passed
 * in.  We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
 * back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
			       test_page_func)				\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
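/*
 * Each instantiation below generates six helpers:
 * btrfs_page_{set,clear,test}_##name() and the corresponding _clamp_
 * variants, which use either the full page flag or the subpage bitmap
 * depending on sectorsize (and handle a NULL fs_info from selftests).
 */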
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);