// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as a 64K page ensures that
 *   any nodesize fits inside one page, thus we don't need to handle cases
 *   where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and the kernel have enforced this for a while, thus only
 *   ancient filesystems could have such a problem.  For such cases, do a
 *   graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   That is, reading one tree block will only trigger the read for the
 *   needed range; other unrelated ranges in the same page will not be
 *   touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *     0         16K         32K         48K        64K
 *     |/////////|           |///////////|
 *          \- Tree block A        \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the
 *   extra granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on
 *   page locking anymore, or we would have greatly reduced concurrency or
 *   even deadlocks (holding one tree lock while trying to lock another
 *   tree block in the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking
 *   only.  This means a slightly higher tree locking latency.
 */
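/*
 * Illustrative sketch of the layout described above (not used by the code
 * below; the function name is hypothetical): with a 4K sectorsize on a 64K
 * page, each page tracks 16 sectors, so one bit per sector fits in a u16
 * bitmap.  This is the same math btrfs_subpage_calc_bitmap() does later in
 * this file.
 */
#if 0
static void subpage_layout_example(const struct btrfs_fs_info *fs_info)
{
	/* 64K / 4K == 16 sectors per page */
	const int sectors_per_page = PAGE_SIZE >> fs_info->sectorsize_bits;

	/* One bit per sector, so a u16 bitmap covers the full page */
	ASSERT(sectors_per_page <= BITS_PER_TYPE(u16));
}
#endif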
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage = NULL;
	int ret;

	/*
	 * We have cases like a dummy extent buffer page, which is not
	 * mapped and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));
	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	ret = btrfs_alloc_subpage(fs_info, &subpage, type);
	if (ret < 0)
		return ret;
	attach_page_private(page, subpage);
	return 0;
}

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
			struct btrfs_subpage **ret,
			enum btrfs_subpage_type type)
{
	if (fs_info->sectorsize == PAGE_SIZE)
		return 0;

	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!*ret)
		return -ENOMEM;
	spin_lock_init(&(*ret)->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&(*ret)->eb_refs, 0);
	} else {
		atomic_set(&(*ret)->readers, 0);
		atomic_set(&(*ret)->writers, 0);
	}
	return 0;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of the current subpage.
 *
 * This is important for eb allocation, to prevent a race with the last eb
 * freeing of the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're
 * still allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}
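/*
 * Illustrative sketch of the eb_refs pairing described above.  The function
 * name is hypothetical; the real callers live in the extent buffer code.
 * Both helpers assert that mapping->private_lock is held.
 */
#if 0
static void eb_refs_usage_example(const struct btrfs_fs_info *fs_info,
				  struct page *page)
{
	spin_lock(&page->mapping->private_lock);
	/* Pin the subpage before the eb becomes visible to lookups */
	btrfs_page_inc_eb_refs(fs_info, page);
	spin_unlock(&page->mapping->private_lock);

	/* ... insert the eb into the buffer radix tree here ... */

	/* On eb release, drop the reference again */
	spin_lock(&page->mapping->private_lock);
	btrfs_page_dec_eb_refs(fs_info, page);
	spin_unlock(&page->mapping->private_lock);
}
#endif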
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages; we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * Please don't replace @last with an atomic_sub_and_test() call
	 * inside the if () condition, as we want the atomic_sub_and_test()
	 * to always be executed.
	 */
	if (is_data && last)
		unlock_page(page);
}

static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}

void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
				       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}
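/*
 * Illustrative sketch of the reader accounting above, loosely modelled on
 * the data read path (the function name is hypothetical): bump the counter
 * per sector before the read bio is submitted, drop it from the read endio.
 */
#if 0
static void reader_accounting_example(const struct btrfs_fs_info *fs_info,
				      struct page *page, u64 start, u32 len)
{
	/* Before submitting the read bio for [start, start + len) */
	btrfs_subpage_start_reader(fs_info, page, start, len);

	/*
	 * In the read endio: for a data page, the last reader unlocks the
	 * page, mirroring the sectorsize == PAGE_SIZE behavior.
	 */
	btrfs_subpage_end_reader(fs_info, page, start, len);
}
#endif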
/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked and the writer counter updated.
 *
 * Even when 0 is returned, the page still needs extra checks to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}

void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap.
 *
 * For example: if start == page_offset(page) + 16K, len == 16K, we get
 * 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus the shifted value can go beyond the
	 * u16 range.  We do the first left shift in unsigned long (at
	 * least u32), then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}
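/*
 * Worked example for btrfs_subpage_calc_bitmap() above, assuming a 4K
 * sectorsize (sectorsize_bits == 12) on a 64K page:
 *
 *   start     = page_offset(page) + 16K, len = 16K
 *   bit_start = (16K >> 12) = 4
 *   nbits     = (16K >> 12) = 4
 *   bitmap    = ((1UL << 4) - 1) << 4 = 0xf << 4 = 0x00f0
 *
 * i.e. bits 4-7 are set, matching sectors 4-7 of the page.
 */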
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for the subpage dirty bitmap.
 *
 * Return true if we cleared the last set bits in the dirty_bitmap.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag in the true
 * case, as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0)
		end_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}
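/*
 * Illustrative sketch of why btrfs_subpage_clear_and_test_dirty() leaves
 * the page flag to the caller: metadata writeback wants to do its own
 * bookkeeping between the two steps.  The function name is hypothetical;
 * btrfs_subpage_clear_dirty() above is the trivial caller.
 */
#if 0
static void clear_dirty_example(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len)) {
		/* ... extra tree block handling would go here ... */
		clear_page_dirty_for_io(page);
	}
}
#endif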
/*
 * Unlike set/clear, which depend on each page's status, for test all bits
 * are tested the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
			       struct page *page, u64 start, u32 len)	\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
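/*
 * For reference, IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty) above expands to
 * roughly the following (illustrative, reformatted):
 */
#if 0
bool btrfs_subpage_test_dirty(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool ret;

	spin_lock_irqsave(&subpage->lock, flags);
	/* The range is dirty only if every bit in it is set */
	ret = ((subpage->dirty_bitmap & tmp) == tmp);
	spin_unlock_irqrestore(&subpage->lock, flags);
	return ret;
}
#endif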
/*
 * Note that in selftests (extent-io-tests), we can have a NULL fs_info
 * passed in.  We only test sectorsize == PAGE_SIZE cases so far, thus we
 * can fall back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
			   struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
			     struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
			    struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
				 struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
				   struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
				  struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);
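/*
 * Illustrative sketch of the clamp variants generated above (the caller
 * name is hypothetical): a caller holding a range that may cross page
 * boundaries, e.g. an ordered extent spanning several pages, can pass the
 * full range and let the helper clamp it to this page.
 */
#if 0
static void clamp_usage_example(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 file_start,
				u32 file_len)
{
	/* @file_start/@file_len may extend past this page; that's fine */
	btrfs_page_clamp_set_ordered(fs_info, page, file_start, file_len);
}
#endif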