// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as 64K page would ensure
 *   all nodesize would fit inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and kernel have done that for a while, thus only ancient
 *   filesystems could have such problem.  For such case, do a graceful
 *   rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning when reading one tree block will only trigger the read for the
 *   needed range, other unrelated range in the same page will not be touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *	Page offset
 *	0	  16K	      32K	  48K	      64K
 *	|/////////|	      |///////////|
 *	 \- Tree block A	\- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
54 * 55 * - Metadata 56 * Since we have multiple tree blocks inside one page, we can't rely on page 57 * locking anymore, or we will have greatly reduced concurrency or even 58 * deadlocks (hold one tree lock while trying to lock another tree lock in 59 * the same page). 60 * 61 * Thus for metadata locking, subpage support relies on io_tree locking only. 62 * This means a slightly higher tree locking latency. 63 */ 64 65 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, 66 struct page *page, enum btrfs_subpage_type type) 67 { 68 struct btrfs_subpage *subpage = NULL; 69 int ret; 70 71 /* 72 * We have cases like a dummy extent buffer page, which is not mappped 73 * and doesn't need to be locked. 74 */ 75 if (page->mapping) 76 ASSERT(PageLocked(page)); 77 /* Either not subpage, or the page already has private attached */ 78 if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) 79 return 0; 80 81 ret = btrfs_alloc_subpage(fs_info, &subpage, type); 82 if (ret < 0) 83 return ret; 84 attach_page_private(page, subpage); 85 return 0; 86 } 87 88 void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, 89 struct page *page) 90 { 91 struct btrfs_subpage *subpage; 92 93 /* Either not subpage, or already detached */ 94 if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) 95 return; 96 97 subpage = (struct btrfs_subpage *)detach_page_private(page); 98 ASSERT(subpage); 99 btrfs_free_subpage(subpage); 100 } 101 102 int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, 103 struct btrfs_subpage **ret, 104 enum btrfs_subpage_type type) 105 { 106 if (fs_info->sectorsize == PAGE_SIZE) 107 return 0; 108 109 *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); 110 if (!*ret) 111 return -ENOMEM; 112 spin_lock_init(&(*ret)->lock); 113 if (type == BTRFS_SUBPAGE_METADATA) { 114 atomic_set(&(*ret)->eb_refs, 0); 115 } else { 116 atomic_set(&(*ret)->readers, 0); 117 atomic_set(&(*ret)->writers, 0); 118 } 119 return 0; 120 } 121 122 void 
btrfs_free_subpage(struct btrfs_subpage *subpage) 123 { 124 kfree(subpage); 125 } 126 127 /* 128 * Increase the eb_refs of current subpage. 129 * 130 * This is important for eb allocation, to prevent race with last eb freeing 131 * of the same page. 132 * With the eb_refs increased before the eb inserted into radix tree, 133 * detach_extent_buffer_page() won't detach the page private while we're still 134 * allocating the extent buffer. 135 */ 136 void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, 137 struct page *page) 138 { 139 struct btrfs_subpage *subpage; 140 141 if (fs_info->sectorsize == PAGE_SIZE) 142 return; 143 144 ASSERT(PagePrivate(page) && page->mapping); 145 lockdep_assert_held(&page->mapping->private_lock); 146 147 subpage = (struct btrfs_subpage *)page->private; 148 atomic_inc(&subpage->eb_refs); 149 } 150 151 void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, 152 struct page *page) 153 { 154 struct btrfs_subpage *subpage; 155 156 if (fs_info->sectorsize == PAGE_SIZE) 157 return; 158 159 ASSERT(PagePrivate(page) && page->mapping); 160 lockdep_assert_held(&page->mapping->private_lock); 161 162 subpage = (struct btrfs_subpage *)page->private; 163 ASSERT(atomic_read(&subpage->eb_refs)); 164 atomic_dec(&subpage->eb_refs); 165 } 166 167 static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, 168 struct page *page, u64 start, u32 len) 169 { 170 /* Basic checks */ 171 ASSERT(PagePrivate(page) && page->private); 172 ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && 173 IS_ALIGNED(len, fs_info->sectorsize)); 174 /* 175 * The range check only works for mapped page, we can still have 176 * unmapped page like dummy extent buffer pages. 
177 */ 178 if (page->mapping) 179 ASSERT(page_offset(page) <= start && 180 start + len <= page_offset(page) + PAGE_SIZE); 181 } 182 183 void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, 184 struct page *page, u64 start, u32 len) 185 { 186 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 187 const int nbits = len >> fs_info->sectorsize_bits; 188 int ret; 189 190 btrfs_subpage_assert(fs_info, page, start, len); 191 192 ret = atomic_add_return(nbits, &subpage->readers); 193 ASSERT(ret == nbits); 194 } 195 196 void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, 197 struct page *page, u64 start, u32 len) 198 { 199 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 200 const int nbits = len >> fs_info->sectorsize_bits; 201 202 btrfs_subpage_assert(fs_info, page, start, len); 203 ASSERT(atomic_read(&subpage->readers) >= nbits); 204 if (atomic_sub_and_test(nbits, &subpage->readers)) 205 unlock_page(page); 206 } 207 208 static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) 209 { 210 u64 orig_start = *start; 211 u32 orig_len = *len; 212 213 *start = max_t(u64, page_offset(page), orig_start); 214 *len = min_t(u64, page_offset(page) + PAGE_SIZE, 215 orig_start + orig_len) - *start; 216 } 217 218 void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, 219 struct page *page, u64 start, u32 len) 220 { 221 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 222 const int nbits = (len >> fs_info->sectorsize_bits); 223 int ret; 224 225 btrfs_subpage_assert(fs_info, page, start, len); 226 227 ASSERT(atomic_read(&subpage->readers) == 0); 228 ret = atomic_add_return(nbits, &subpage->writers); 229 ASSERT(ret == nbits); 230 } 231 232 bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, 233 struct page *page, u64 start, u32 len) 234 { 235 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 236 const int nbits = (len 
>> fs_info->sectorsize_bits); 237 238 btrfs_subpage_assert(fs_info, page, start, len); 239 240 ASSERT(atomic_read(&subpage->writers) >= nbits); 241 return atomic_sub_and_test(nbits, &subpage->writers); 242 } 243 244 /* 245 * Lock a page for delalloc page writeback. 246 * 247 * Return -EAGAIN if the page is not properly initialized. 248 * Return 0 with the page locked, and writer counter updated. 249 * 250 * Even with 0 returned, the page still need extra check to make sure 251 * it's really the correct page, as the caller is using 252 * find_get_pages_contig(), which can race with page invalidating. 253 */ 254 int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, 255 struct page *page, u64 start, u32 len) 256 { 257 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { 258 lock_page(page); 259 return 0; 260 } 261 lock_page(page); 262 if (!PagePrivate(page) || !page->private) { 263 unlock_page(page); 264 return -EAGAIN; 265 } 266 btrfs_subpage_clamp_range(page, &start, &len); 267 btrfs_subpage_start_writer(fs_info, page, start, len); 268 return 0; 269 } 270 271 void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, 272 struct page *page, u64 start, u32 len) 273 { 274 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) 275 return unlock_page(page); 276 btrfs_subpage_clamp_range(page, &start, &len); 277 if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) 278 unlock_page(page); 279 } 280 281 /* 282 * Convert the [start, start + len) range into a u16 bitmap 283 * 284 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. 
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond u16 range. We make the
	 * first left shift to be calculated in unsigned long (at least u32),
	 * then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}

/*
 * Set the uptodate bits for the range.  Only when every sector of the page
 * is uptodate does the full page flag get set as well.
 */
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear the uptodate bits for the range.  Any sector losing its uptodate
 * status means the page as a whole is no longer fully uptodate, thus the
 * page flag is cleared unconditionally.
 */
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Set the error bits for the range.  A single errored sector is enough to
 * mark the whole page with PageError.
 */
void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear the error bits for the range.  The page flag is only cleared once
 * no sector has an error left.
 */
void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Set the dirty bits for the range, then dirty the page itself.
 *
 * NOTE(review): set_page_dirty() is called after dropping the irq-safe
 * spinlock — presumably because it can take other (sleeping or mapping)
 * locks; confirm before reordering.
 */
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for subpage dirty bitmap.
 *
 * Return true if we're the last bits in the dirty_bitmap and clear the
 * dirty_bitmap.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear page dirty for true case, as we have
 * extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

/*
 * Clear the dirty bits for the range; when the last dirty sector in the page
 * goes away, also clear the page-level dirty flag for writeback.
 */
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

/*
 * Set the writeback bits for the range; any sector under writeback marks the
 * whole page with PageWriteback.
 */
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear the writeback bits for the range; only when no sector is under
 * writeback anymore is page-level writeback ended.
 */
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0)
		end_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Set the ordered bits for the range; any ordered sector marks the whole
 * page with PageOrdered.
 */
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Clear the ordered bits for the range; the page flag is only cleared once
 * no ordered sector is left.
 */
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear which is dependent on each page status, for test all bits
 * are tested in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
			       struct page *page, u64 start, u32 len)	\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);

/*
 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
 * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
 * back to regular sectorsize branch.
 */
/*
 * Generate the btrfs_page_{set,clear,test}_*() and the clamped
 * btrfs_page_clamp_{set,clear,test}_*() wrappers for one bitmap @name.
 * The wrappers fall back to the whole-page flag helpers for regular
 * sectorsize (or a NULL fs_info, see the selftest note above), and
 * otherwise dispatch to the subpage bitmap helpers.  The clamp variants
 * additionally clamp the range to the page boundaries first.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
			   struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
			     struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
			    struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
				 struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info,	\
				   struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
				  struct page *page, u64 start, u32 len) \
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);