// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as 64K page would ensure
 *   all nodesize would fit inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and kernel have done that for a while, thus only ancient
 *   filesystems could have such a problem.  For such case, do a graceful
 *   rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning when reading one tree block will only trigger the read for the
 *   needed range, other unrelated range in the same page will not be touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *	Page offset
 *	0	  16K	      32K	48K	  64K
 *	|/////////|		|///////////|
 *	    \- Tree block A		\- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (hold one tree lock while trying to lock another tree lock in
 *   the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */

/*
 * Attach a btrfs_subpage structure to @page as its page private.
 *
 * No-op for regular sectorsize (== PAGE_SIZE) or if private is already
 * attached.  Returns 0 on success or a negative errno from allocation.
 */
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));

	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	subpage = btrfs_alloc_subpage(fs_info, type);
	if (IS_ERR(subpage))
		return PTR_ERR(subpage);

	attach_page_private(page, subpage);
	return 0;
}

/* Detach and free the btrfs_subpage structure attached to @page, if any. */
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

/*
 * Allocate a new btrfs_subpage structure.
 *
 * Metadata pages track extent buffer refs (eb_refs); data pages track
 * per-sector reader/writer counts instead.  Only valid for subpage
 * sectorsize (< PAGE_SIZE).
 */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
					  enum btrfs_subpage_type type)
{
	struct btrfs_subpage *ret;

	ASSERT(fs_info->sectorsize < PAGE_SIZE);

	ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!ret)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&ret->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&ret->eb_refs, 0);
	} else {
		atomic_set(&ret->readers, 0);
		atomic_set(&ret->writers, 0);
	}
	return ret;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent race with last eb freeing
 * of the same page.
 * With the eb_refs increased before the eb inserted into radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're still
 * allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

/* Counterpart of btrfs_page_inc_eb_refs(), called when an eb is released. */
void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}

/* Sanity-check that [@start, @start + @len) is sector aligned and in @page. */
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped page, we can still have
	 * unmapped page like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

/* Account the start of a read on the sectors covered by [@start, @len). */
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

/*
 * Account the end of a read; for data pages, unlock the page once the last
 * outstanding reader of the page has finished.
 */
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * And please don't replace @last with an atomic_sub_and_test() call
	 * inside the if () condition.
	 * We want the atomic_sub_and_test() to always be executed.
	 */
	if (is_data && last)
		unlock_page(page);
}

/* Clamp [@start, @start + @len) so it does not extend beyond @page. */
static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}

/*
 * Account the start of a write on the sectors covered by [@start, @len).
 * There must be no outstanding readers on the page.
 */
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

/*
 * Account the end of a write; return true if this was the last outstanding
 * writer on the page.
 */
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
				       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and writer counter updated.
 *
 * Even with 0 returned, the page still needs an extra check to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidating.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}

/*
 * Counterpart of btrfs_page_start_writer_lock(); only unlocks the page when
 * the last writer on it has finished.
 */
void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond u16 range.  We make the
	 * first left shift to be calculated in unsigned long (at least u32),
	 * then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}

/*
 * Mark the range uptodate; also set the whole-page Uptodate flag once every
 * sector in the page is uptodate.
 */
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear the uptodate status of the range; any sector going non-uptodate
 * means the whole page is no longer uptodate, so clear the page flag
 * unconditionally.
 */
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/* Mark the range errored; any error marks the whole page errored too. */
void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear the error status of the range; only clear the page Error flag once
 * no sector in the page has an error recorded.
 */
void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/* Mark the range dirty and dirty the page (outside the subpage lock). */
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for subpage dirty bitmap.
 *
 * Return true if we're the last bits in the dirty_bitmap and clear the
 * dirty_bitmap.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear page dirty for true case, as we have
 * extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

/*
 * Clear the dirty status of the range; also clear the page dirty flag for
 * writeback once no sector in the page remains dirty.
 */
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

/* Mark the range under writeback; also marks the whole page writeback. */
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear writeback for the range; end the page writeback once no sector in
 * the page is still under writeback.
 */
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0) {
		ASSERT(PageWriteback(page));
		end_page_writeback(page);
	}
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/* Mark the range ordered; also sets the whole-page Ordered flag. */
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear ordered for the range; clear the page Ordered flag once no sector
 * in the page remains ordered.
 */
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}
/*
 * Unlike set/clear which is dependent on each page status, for test all bits
 * are tested in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
			       struct page *page, u64 start, u32 len)	\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);

/*
 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
 * in.  We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
 * back to regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
			   struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
			     struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
			    struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
				 struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
				   struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
				  struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);

/*
 * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
 * is cleared.
 */
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
				 struct page *page)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ASSERT(!PageDirty(page));
	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	ASSERT(subpage->dirty_bitmap == 0);
}