// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as 64K page would ensure
 *   all nodesize would fit inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and kernel have done that for a while, thus only ancient
 *   filesystems could have such problem.  For such case, do a graceful
 *   rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning when reading one tree block will only trigger the read for the
 *   needed range, other unrelated range in the same page will not be touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *	Page offset
 *	0	  16K	      32K	  48K	    64K
 *	|/////////|	      |///////////|
 *	 \- Tree block A	\- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (hold one tree lock while trying to lock another tree lock in
 *   the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */

/*
 * Attach a btrfs_subpage structure to @page as its page private.
 *
 * A no-op (returning 0) for regular sectorsize filesystems or when the page
 * already has private data attached.  Returns -ENOMEM on allocation failure.
 */
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage = NULL;
	int ret;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));
	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	ret = btrfs_alloc_subpage(fs_info, &subpage, type);
	if (ret < 0)
		return ret;
	attach_page_private(page, subpage);
	return 0;
}

/*
 * Detach and free the btrfs_subpage structure attached to @page, if any.
 */
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

/*
 * Allocate a zeroed btrfs_subpage structure and initialize the counters
 * relevant for @type (eb_refs for metadata, readers/writers otherwise).
 *
 * *@ret is only valid when 0 is returned.
 */
int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
			struct btrfs_subpage **ret,
			enum btrfs_subpage_type type)
{
	/* Callers must be on a subpage (sectorsize < PAGE_SIZE) filesystem */
	ASSERT(fs_info->sectorsize < PAGE_SIZE);

	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!*ret)
		return -ENOMEM;
	spin_lock_init(&(*ret)->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&(*ret)->eb_refs, 0);
	} else {
		atomic_set(&(*ret)->readers, 0);
		atomic_set(&(*ret)->writers, 0);
	}
	return 0;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent race with last eb freeing
 * of the same page.
 * With the eb_refs increased before the eb inserted into radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're still
 * allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	/* A zero count here means an inc/dec imbalance somewhere */
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}

/*
 * Sanity checks shared by all subpage range helpers: valid page private,
 * sector-aligned @start and @len, and (for mapped pages) range inside page.
 */
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped page, we can still have
	 * unmapped page like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

/* Account the start of a read on @len bytes of the page's subpage range. */
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

/*
 * Account the end of a read; for data pages, unlock the page when the last
 * reader of the page finishes.
 */
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * And please don't replace @last with atomic_sub_and_test() call
	 * inside if () condition.
	 * As we want the atomic_sub_and_test() to be always executed.
	 */
	if (is_data && last)
		unlock_page(page);
}

/*
 * Clamp [*start, *start + *len) to the boundaries of @page.
 *
 * NOTE(review): this assumes the range overlaps the page at all, otherwise
 * *len would underflow -- callers appear to guarantee the overlap; verify.
 */
static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}

/* Account the start of a write on the subpage range; readers must be gone. */
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	/* No readers may be active, and no other writer has claimed bits yet */
	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

/*
 * Account the end of a write; return true when this was the last writer of
 * the whole page.
 */
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
				       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and writer counter updated.
 *
 * Even with 0 returned, the page still need extra check to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidating.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* NULL fs_info only happens in selftests; fall back to page lock */
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	/* The page may have been released/invalidated before we locked it */
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}

/*
 * Counterpart of btrfs_page_start_writer_lock(): drop the writer count for
 * the range and unlock the page when the last writer finishes.
 */
void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond u16 range. We make the
	 * first left shift to be calculated in unsigned long (at least u32),
	 * then truncate the result to u16.
308 */ 309 return (u16)(((1UL << nbits) - 1) << bit_start); 310 } 311 312 void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, 313 struct page *page, u64 start, u32 len) 314 { 315 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 316 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 317 unsigned long flags; 318 319 spin_lock_irqsave(&subpage->lock, flags); 320 subpage->uptodate_bitmap |= tmp; 321 if (subpage->uptodate_bitmap == U16_MAX) 322 SetPageUptodate(page); 323 spin_unlock_irqrestore(&subpage->lock, flags); 324 } 325 326 void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, 327 struct page *page, u64 start, u32 len) 328 { 329 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 330 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 331 unsigned long flags; 332 333 spin_lock_irqsave(&subpage->lock, flags); 334 subpage->uptodate_bitmap &= ~tmp; 335 ClearPageUptodate(page); 336 spin_unlock_irqrestore(&subpage->lock, flags); 337 } 338 339 void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, 340 struct page *page, u64 start, u32 len) 341 { 342 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 343 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 344 unsigned long flags; 345 346 spin_lock_irqsave(&subpage->lock, flags); 347 subpage->error_bitmap |= tmp; 348 SetPageError(page); 349 spin_unlock_irqrestore(&subpage->lock, flags); 350 } 351 352 void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, 353 struct page *page, u64 start, u32 len) 354 { 355 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 356 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 357 unsigned long flags; 358 359 spin_lock_irqsave(&subpage->lock, flags); 360 subpage->error_bitmap &= ~tmp; 361 if (subpage->error_bitmap == 0) 362 ClearPageError(page); 363 
spin_unlock_irqrestore(&subpage->lock, flags); 364 } 365 366 void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, 367 struct page *page, u64 start, u32 len) 368 { 369 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 370 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 371 unsigned long flags; 372 373 spin_lock_irqsave(&subpage->lock, flags); 374 subpage->dirty_bitmap |= tmp; 375 spin_unlock_irqrestore(&subpage->lock, flags); 376 set_page_dirty(page); 377 } 378 379 /* 380 * Extra clear_and_test function for subpage dirty bitmap. 381 * 382 * Return true if we're the last bits in the dirty_bitmap and clear the 383 * dirty_bitmap. 384 * Return false otherwise. 385 * 386 * NOTE: Callers should manually clear page dirty for true case, as we have 387 * extra handling for tree blocks. 388 */ 389 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, 390 struct page *page, u64 start, u32 len) 391 { 392 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 393 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 394 unsigned long flags; 395 bool last = false; 396 397 spin_lock_irqsave(&subpage->lock, flags); 398 subpage->dirty_bitmap &= ~tmp; 399 if (subpage->dirty_bitmap == 0) 400 last = true; 401 spin_unlock_irqrestore(&subpage->lock, flags); 402 return last; 403 } 404 405 void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, 406 struct page *page, u64 start, u32 len) 407 { 408 bool last; 409 410 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); 411 if (last) 412 clear_page_dirty_for_io(page); 413 } 414 415 void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, 416 struct page *page, u64 start, u32 len) 417 { 418 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 419 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 420 unsigned long flags; 421 422 spin_lock_irqsave(&subpage->lock, flags); 423 
subpage->writeback_bitmap |= tmp; 424 set_page_writeback(page); 425 spin_unlock_irqrestore(&subpage->lock, flags); 426 } 427 428 void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, 429 struct page *page, u64 start, u32 len) 430 { 431 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 432 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 433 unsigned long flags; 434 435 spin_lock_irqsave(&subpage->lock, flags); 436 subpage->writeback_bitmap &= ~tmp; 437 if (subpage->writeback_bitmap == 0) { 438 ASSERT(PageWriteback(page)); 439 end_page_writeback(page); 440 } 441 spin_unlock_irqrestore(&subpage->lock, flags); 442 } 443 444 void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, 445 struct page *page, u64 start, u32 len) 446 { 447 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 448 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 449 unsigned long flags; 450 451 spin_lock_irqsave(&subpage->lock, flags); 452 subpage->ordered_bitmap |= tmp; 453 SetPageOrdered(page); 454 spin_unlock_irqrestore(&subpage->lock, flags); 455 } 456 457 void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, 458 struct page *page, u64 start, u32 len) 459 { 460 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 461 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 462 unsigned long flags; 463 464 spin_lock_irqsave(&subpage->lock, flags); 465 subpage->ordered_bitmap &= ~tmp; 466 if (subpage->ordered_bitmap == 0) 467 ClearPageOrdered(page); 468 spin_unlock_irqrestore(&subpage->lock, flags); 469 } 470 /* 471 * Unlike set/clear which is dependent on each page status, for test all bits 472 * are tested in the same way. 
473 */ 474 #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ 475 bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ 476 struct page *page, u64 start, u32 len) \ 477 { \ 478 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ 479 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ 480 unsigned long flags; \ 481 bool ret; \ 482 \ 483 spin_lock_irqsave(&subpage->lock, flags); \ 484 ret = ((subpage->name##_bitmap & tmp) == tmp); \ 485 spin_unlock_irqrestore(&subpage->lock, flags); \ 486 return ret; \ 487 } 488 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); 489 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); 490 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); 491 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); 492 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); 493 494 /* 495 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed 496 * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall 497 * back to regular sectorsize branch. 
 */
/*
 * Page-level helpers: pick between the full-page flag functions and the
 * subpage bitmap helpers depending on sectorsize.  The clamp_ variants first
 * clamp [start, start + len) to the page boundary.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
			       test_page_func)				\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);

/*
 * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
 * is cleared.
 */
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
				 struct page *page)
{
	/*
	 * NOTE: page->private is only read (not dereferenced) here; the
	 * pointer is dereferenced only after the PagePrivate() check below.
	 */
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;

	/* This helper is debug-only; it compiles away without assertions */
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ASSERT(!PageDirty(page));
	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	ASSERT(subpage->dirty_bitmap == 0);
}