// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only 64K page size is supported for now
 *   This is to make metadata handling easier, as a 64K page ensures
 *   every nodesize fits inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundaries
 *   btrfs-progs and the kernel have ensured this for a while, thus only
 *   ancient filesystems could have such a problem.  For such a case, do a
 *   graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning that reading one tree block will only trigger the read for the
 *   needed range, and other unrelated ranges in the same page will not be
 *   touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *   0         16K         32K         48K        64K
 *   |/////////|           |///////////|
 *        \- Tree block A        \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (holding one tree lock while trying to lock another tree lock
 *   in the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */
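
/*
 * A worked illustration of the bitmap scheme described above (example
 * values only, assuming a 4K sectorsize on a 64K page): there are 16
 * sectors per page, so each per-page status (uptodate, error, dirty,
 * writeback, ordered) fits in one u16 bitmap, where bit i covers the
 * range [i * 4K, (i + 1) * 4K) inside the page.  E.g. marking 8K at page
 * offset 16K sets bits 4 and 5, i.e. the bitmap value 0x0030.
 */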

int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage = NULL;
	int ret;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));
	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	ret = btrfs_alloc_subpage(fs_info, &subpage, type);
	if (ret < 0)
		return ret;
	attach_page_private(page, subpage);
	return 0;
}
87 
88 void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
89 			  struct page *page)
90 {
91 	struct btrfs_subpage *subpage;
92 
93 	/* Either not subpage, or already detached */
94 	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
95 		return;
96 
97 	subpage = (struct btrfs_subpage *)detach_page_private(page);
98 	ASSERT(subpage);
99 	btrfs_free_subpage(subpage);
100 }
101 
102 int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
103 			struct btrfs_subpage **ret,
104 			enum btrfs_subpage_type type)
105 {
106 	if (fs_info->sectorsize == PAGE_SIZE)
107 		return 0;
108 
109 	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
110 	if (!*ret)
111 		return -ENOMEM;
112 	spin_lock_init(&(*ret)->lock);
113 	if (type == BTRFS_SUBPAGE_METADATA) {
114 		atomic_set(&(*ret)->eb_refs, 0);
115 	} else {
116 		atomic_set(&(*ret)->readers, 0);
117 		atomic_set(&(*ret)->writers, 0);
118 	}
119 	return 0;
120 }
121 
122 void btrfs_free_subpage(struct btrfs_subpage *subpage)
123 {
124 	kfree(subpage);
125 }
126 
/*
 * Increase the eb_refs of the current subpage.
 *
 * This is important for eb allocation, to prevent a race with the freeing
 * of the last eb in the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're
 * still allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}
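
/*
 * A hypothetical caller sketch (illustration only, not code from this
 * file): both helpers above expect mapping->private_lock held, so the
 * expected pattern when inserting a new eb looks roughly like:
 *
 *	spin_lock(&page->mapping->private_lock);
 *	btrfs_page_inc_eb_refs(fs_info, page);
 *	spin_unlock(&page->mapping->private_lock);
 *
 * followed by the radix tree insertion, with a matching
 * btrfs_page_dec_eb_refs() call on the release side.
 */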

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages; we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

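/*
 * The readers counter below tracks in-flight subpage reads, one count per
 * sector.  btrfs_subpage_end_reader() unlocks the page once the last
 * reader is gone, which replaces the "unlock the whole page on read
 * completion" behavior of the regular sectorsize case.
 */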
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ret = atomic_add_return(nbits, &subpage->readers);
	ASSERT(ret == nbits);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	if (atomic_sub_and_test(nbits, &subpage->readers))
		unlock_page(page);
}

static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}
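
/*
 * Worked example of the clamping above (example values only): for a 64K
 * page at file offset 64K, a range of start = 60K, len = 8K is clamped to
 * start = 64K, len = 4K, keeping only the part of the range that actually
 * falls inside this page.
 */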

void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and the writer counter updated.
 *
 * Even with 0 returned, the page still needs an extra check to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}
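
/*
 * A hypothetical caller sketch for the above (illustration only, not code
 * from this file):
 *
 *	ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
 *	if (ret == -EAGAIN) {
 *		... the page was invalidated, look it up again ...
 *	}
 *
 * Even on success the caller still has to re-check page->mapping, as
 * find_get_pages_contig() can race with page invalidation.
 */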

void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond the u16 range. We make
	 * the first left shift to be calculated in unsigned long (at least
	 * u32), then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}
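
/*
 * Worked edge case for the shift above (example values only): a range
 * covering the whole 64K page with a 4K sectorsize gives bit_start = 0
 * and nbits = 16, so ((1UL << 16) - 1) << 0 = 0xffff, i.e. all sectors
 * selected.
 */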

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for the subpage dirty bitmap.
 *
 * Return true if we cleared the last bits in the dirty_bitmap.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag for the true
 * case, as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0)
		end_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
		struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear, which is dependent on each page status, for test all
 * bits are tested in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
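
/*
 * For reference, IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty) above expands to
 * roughly:
 *
 *	bool btrfs_subpage_test_dirty(...)
 *	{
 *		...
 *		ret = ((subpage->dirty_bitmap & tmp) == tmp);
 *		...
 *	}
 *
 * i.e. the test returns true only if every sector in the range has its
 * bit set.
 */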

/*
 * Note that, in selftests (extent-io-tests), we can have a NULL fs_info
 * passed in.  We only test sectorsize == PAGE_SIZE cases so far, thus we
 * can fall back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
			       test_page_func)				\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);