// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only 64K page size is supported for now
 *   This is to make metadata handling easier, as a 64K page ensures
 *   every nodesize fits inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and the kernel have been enforcing this for a while,
 *   thus only ancient filesystems could have such a problem. Such
 *   cases are rejected gracefully.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   This means reading one tree block only triggers the read for the
 *   needed range; other unrelated ranges in the same page are not touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *	Page offset
 *	0         16K         32K         48K         64K
 *	|/////////|           |///////////|
 *	 \- Tree block A       \- Tree block B
 *
 *   even if we just want to write back tree block A, we will also write
 *   back tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page. This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (holding one tree lock while trying to lock another tree block
 *   in the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */

void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
{
	unsigned int cur = 0;
	unsigned int nr_bits;

	ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));

	nr_bits = PAGE_SIZE / sectorsize;
	subpage_info->bitmap_nr_bits = nr_bits;

	subpage_info->uptodate_offset = cur;
	cur += nr_bits;

	subpage_info->error_offset = cur;
	cur += nr_bits;

	subpage_info->dirty_offset = cur;
	cur += nr_bits;

	subpage_info->writeback_offset = cur;
	cur += nr_bits;

	subpage_info->ordered_offset = cur;
	cur += nr_bits;

	subpage_info->total_nr_bits = cur;
}
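/*
 * Worked example of the offsets calculated above, assuming a 4K sectorsize
 * and a 64K page (nr_bits = 64K / 4K = 16):
 *
 *	uptodate_offset  = 0
 *	error_offset     = 16
 *	dirty_offset     = 32
 *	writeback_offset = 48
 *	ordered_offset   = 64
 *	total_nr_bits    = 80
 *
 * I.e. the five per-sector bitmaps are packed back-to-back, each occupying
 * @bitmap_nr_bits bits of one larger bitmap.
 */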
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));

	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	subpage = btrfs_alloc_subpage(fs_info, type);
	if (IS_ERR(subpage))
		return PTR_ERR(subpage);

	attach_page_private(page, subpage);
	return 0;
}

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
					  enum btrfs_subpage_type type)
{
	struct btrfs_subpage *ret;

	ASSERT(fs_info->sectorsize < PAGE_SIZE);

	ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!ret)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&ret->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&ret->eb_refs, 0);
	} else {
		atomic_set(&ret->readers, 0);
		atomic_set(&ret->writers, 0);
	}
	return ret;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent a race with the freeing of
 * the last eb in the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're still
 * allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}
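/*
 * A rough sketch of the expected pairing (the exact caller code lives in
 * extent_io.c; the flow below is illustrative, not a verbatim copy):
 *
 *	On allocation (e.g. alloc_extent_buffer()):
 *		spin_lock(&page->mapping->private_lock);
 *		btrfs_page_inc_eb_refs(fs_info, page);
 *		spin_unlock(&page->mapping->private_lock);
 *		... insert the eb into the radix tree ...
 *
 *	On teardown (detach_extent_buffer_page()), under the same
 *	private_lock, btrfs_page_dec_eb_refs() is called before deciding
 *	whether the page private can be detached.
 */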
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages, we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * Please don't replace @last with an atomic_sub_and_test() call inside
	 * the if () condition, as we always want atomic_sub_and_test() to be
	 * executed.
	 */
	if (is_data && last)
		unlock_page(page);
}

static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}
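/*
 * Worked example for the clamping above: with page_offset(page) == 64K and
 * an input range of [60K, 68K) (start == 60K, len == 8K), the part below the
 * page start is trimmed off and we end up with start == 64K, len == 4K.
 * This is what lets callers like delalloc writeback pass in ranges which
 * cross page boundaries.
 */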
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
				       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and the writer counter updated.
 *
 * Even with 0 returned, the page still needs extra checks to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}

void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example, with a 4K sectorsize: if start == page_offset() + 16K and
 * len == 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond the u16 range. We make the
	 * first left shift be calculated in unsigned long (at least u32),
	 * then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}
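/*
 * To illustrate the truncation note in btrfs_subpage_calc_bitmap(), assuming
 * a 4K sectorsize: a full 64K page gives nbits == 16 and bit_start == 0. The
 * intermediate value 1UL << 16 == 0x10000 does not fit in 16 bits, but
 * (0x10000 - 1) == 0xffff does, so doing the shift in unsigned long and
 * truncating only the final result yields the correct full-page bitmap.
 */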
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for the subpage dirty bitmap.
 *
 * Return true if we cleared the last dirty bits in dirty_bitmap.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag in the true case,
 * as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0) {
		ASSERT(PageWriteback(page));
		end_page_writeback(page);
	}
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}
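/*
 * A note on locking in the helpers above: subpage->lock is taken with
 * spin_lock_irqsave() rather than a plain spin_lock(), since some of these
 * helpers (e.g. the writeback clearing path) can be reached from bio endio
 * context, where interrupts may already be disabled.
 */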
/*
 * Unlike set/clear, which depend on each page's status, all the test
 * operations work in the same way for every bit.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
			       struct page *page, u64 start, u32 len)	\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
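/*
 * For reference, IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty) above expands to:
 *
 *	bool btrfs_subpage_test_dirty(const struct btrfs_fs_info *fs_info,
 *				      struct page *page, u64 start, u32 len)
 *	{
 *		...
 *		ret = ((subpage->dirty_bitmap & tmp) == tmp);
 *		...
 *	}
 *
 * I.e. the range tests positive only if *every* sector in it has its bit
 * set, not just one of them.
 */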
/*
 * Note that in selftests (extent-io-tests), we can have a NULL fs_info passed
 * in. The selftests only cover sectorsize == PAGE_SIZE cases so far, thus we
 * can fall back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
			   struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
			     struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
			    struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
				 struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
				   struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
				  struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);

/*
 * Make sure that not only the page dirty bit is cleared, but the subpage
 * dirty bit is cleared as well.
 */
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
				 struct page *page)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ASSERT(!PageDirty(page));
	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	ASSERT(subpage->dirty_bitmap == 0);
}