// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as 64K page would ensure
 *   all nodesize would fit inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and kernel have done that for a while, thus only ancient
 *   filesystems could have such a problem. For such a case, do a graceful
 *   rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning reading one tree block will only trigger the read for the
 *   needed range; other unrelated ranges in the same page will not be
 *   touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *   0         16K         32K         48K         64K
 *   |/////////|           |///////////|
 *       \- Tree block A       \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page. This provides the extra
 *   granularity needed.
54 * 55 * - Metadata 56 * Since we have multiple tree blocks inside one page, we can't rely on page 57 * locking anymore, or we will have greatly reduced concurrency or even 58 * deadlocks (hold one tree lock while trying to lock another tree lock in 59 * the same page). 60 * 61 * Thus for metadata locking, subpage support relies on io_tree locking only. 62 * This means a slightly higher tree locking latency. 63 */ 64 65 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, 66 struct page *page, enum btrfs_subpage_type type) 67 { 68 struct btrfs_subpage *subpage = NULL; 69 int ret; 70 71 /* 72 * We have cases like a dummy extent buffer page, which is not mappped 73 * and doesn't need to be locked. 74 */ 75 if (page->mapping) 76 ASSERT(PageLocked(page)); 77 /* Either not subpage, or the page already has private attached */ 78 if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) 79 return 0; 80 81 ret = btrfs_alloc_subpage(fs_info, &subpage, type); 82 if (ret < 0) 83 return ret; 84 attach_page_private(page, subpage); 85 return 0; 86 } 87 88 void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, 89 struct page *page) 90 { 91 struct btrfs_subpage *subpage; 92 93 /* Either not subpage, or already detached */ 94 if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) 95 return; 96 97 subpage = (struct btrfs_subpage *)detach_page_private(page); 98 ASSERT(subpage); 99 btrfs_free_subpage(subpage); 100 } 101 102 int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, 103 struct btrfs_subpage **ret, 104 enum btrfs_subpage_type type) 105 { 106 if (fs_info->sectorsize == PAGE_SIZE) 107 return 0; 108 109 *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); 110 if (!*ret) 111 return -ENOMEM; 112 spin_lock_init(&(*ret)->lock); 113 if (type == BTRFS_SUBPAGE_METADATA) 114 atomic_set(&(*ret)->eb_refs, 0); 115 else 116 atomic_set(&(*ret)->readers, 0); 117 return 0; 118 } 119 120 void btrfs_free_subpage(struct btrfs_subpage *subpage) 121 { 122 
kfree(subpage); 123 } 124 125 /* 126 * Increase the eb_refs of current subpage. 127 * 128 * This is important for eb allocation, to prevent race with last eb freeing 129 * of the same page. 130 * With the eb_refs increased before the eb inserted into radix tree, 131 * detach_extent_buffer_page() won't detach the page private while we're still 132 * allocating the extent buffer. 133 */ 134 void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, 135 struct page *page) 136 { 137 struct btrfs_subpage *subpage; 138 139 if (fs_info->sectorsize == PAGE_SIZE) 140 return; 141 142 ASSERT(PagePrivate(page) && page->mapping); 143 lockdep_assert_held(&page->mapping->private_lock); 144 145 subpage = (struct btrfs_subpage *)page->private; 146 atomic_inc(&subpage->eb_refs); 147 } 148 149 void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, 150 struct page *page) 151 { 152 struct btrfs_subpage *subpage; 153 154 if (fs_info->sectorsize == PAGE_SIZE) 155 return; 156 157 ASSERT(PagePrivate(page) && page->mapping); 158 lockdep_assert_held(&page->mapping->private_lock); 159 160 subpage = (struct btrfs_subpage *)page->private; 161 ASSERT(atomic_read(&subpage->eb_refs)); 162 atomic_dec(&subpage->eb_refs); 163 } 164 165 static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, 166 struct page *page, u64 start, u32 len) 167 { 168 /* Basic checks */ 169 ASSERT(PagePrivate(page) && page->private); 170 ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && 171 IS_ALIGNED(len, fs_info->sectorsize)); 172 /* 173 * The range check only works for mapped page, we can still have 174 * unmapped page like dummy extent buffer pages. 
175 */ 176 if (page->mapping) 177 ASSERT(page_offset(page) <= start && 178 start + len <= page_offset(page) + PAGE_SIZE); 179 } 180 181 void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, 182 struct page *page, u64 start, u32 len) 183 { 184 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 185 const int nbits = len >> fs_info->sectorsize_bits; 186 int ret; 187 188 btrfs_subpage_assert(fs_info, page, start, len); 189 190 ret = atomic_add_return(nbits, &subpage->readers); 191 ASSERT(ret == nbits); 192 } 193 194 void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, 195 struct page *page, u64 start, u32 len) 196 { 197 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 198 const int nbits = len >> fs_info->sectorsize_bits; 199 200 btrfs_subpage_assert(fs_info, page, start, len); 201 ASSERT(atomic_read(&subpage->readers) >= nbits); 202 if (atomic_sub_and_test(nbits, &subpage->readers)) 203 unlock_page(page); 204 } 205 206 /* 207 * Convert the [start, start + len) range into a u16 bitmap 208 * 209 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. 210 */ 211 static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, 212 struct page *page, u64 start, u32 len) 213 { 214 const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; 215 const int nbits = len >> fs_info->sectorsize_bits; 216 217 btrfs_subpage_assert(fs_info, page, start, len); 218 219 /* 220 * Here nbits can be 16, thus can go beyond u16 range. We make the 221 * first left shift to be calculate in unsigned long (at least u32), 222 * then truncate the result to u16. 
223 */ 224 return (u16)(((1UL << nbits) - 1) << bit_start); 225 } 226 227 void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, 228 struct page *page, u64 start, u32 len) 229 { 230 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 231 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 232 unsigned long flags; 233 234 spin_lock_irqsave(&subpage->lock, flags); 235 subpage->uptodate_bitmap |= tmp; 236 if (subpage->uptodate_bitmap == U16_MAX) 237 SetPageUptodate(page); 238 spin_unlock_irqrestore(&subpage->lock, flags); 239 } 240 241 void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, 242 struct page *page, u64 start, u32 len) 243 { 244 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 245 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 246 unsigned long flags; 247 248 spin_lock_irqsave(&subpage->lock, flags); 249 subpage->uptodate_bitmap &= ~tmp; 250 ClearPageUptodate(page); 251 spin_unlock_irqrestore(&subpage->lock, flags); 252 } 253 254 void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, 255 struct page *page, u64 start, u32 len) 256 { 257 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 258 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 259 unsigned long flags; 260 261 spin_lock_irqsave(&subpage->lock, flags); 262 subpage->error_bitmap |= tmp; 263 SetPageError(page); 264 spin_unlock_irqrestore(&subpage->lock, flags); 265 } 266 267 void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, 268 struct page *page, u64 start, u32 len) 269 { 270 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 271 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 272 unsigned long flags; 273 274 spin_lock_irqsave(&subpage->lock, flags); 275 subpage->error_bitmap &= ~tmp; 276 if (subpage->error_bitmap == 0) 277 ClearPageError(page); 278 
spin_unlock_irqrestore(&subpage->lock, flags); 279 } 280 281 void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, 282 struct page *page, u64 start, u32 len) 283 { 284 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 285 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 286 unsigned long flags; 287 288 spin_lock_irqsave(&subpage->lock, flags); 289 subpage->dirty_bitmap |= tmp; 290 spin_unlock_irqrestore(&subpage->lock, flags); 291 set_page_dirty(page); 292 } 293 294 /* 295 * Extra clear_and_test function for subpage dirty bitmap. 296 * 297 * Return true if we're the last bits in the dirty_bitmap and clear the 298 * dirty_bitmap. 299 * Return false otherwise. 300 * 301 * NOTE: Callers should manually clear page dirty for true case, as we have 302 * extra handling for tree blocks. 303 */ 304 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, 305 struct page *page, u64 start, u32 len) 306 { 307 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 308 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 309 unsigned long flags; 310 bool last = false; 311 312 spin_lock_irqsave(&subpage->lock, flags); 313 subpage->dirty_bitmap &= ~tmp; 314 if (subpage->dirty_bitmap == 0) 315 last = true; 316 spin_unlock_irqrestore(&subpage->lock, flags); 317 return last; 318 } 319 320 void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, 321 struct page *page, u64 start, u32 len) 322 { 323 bool last; 324 325 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); 326 if (last) 327 clear_page_dirty_for_io(page); 328 } 329 330 void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, 331 struct page *page, u64 start, u32 len) 332 { 333 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 334 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 335 unsigned long flags; 336 337 spin_lock_irqsave(&subpage->lock, flags); 338 
subpage->writeback_bitmap |= tmp; 339 set_page_writeback(page); 340 spin_unlock_irqrestore(&subpage->lock, flags); 341 } 342 343 void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, 344 struct page *page, u64 start, u32 len) 345 { 346 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 347 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 348 unsigned long flags; 349 350 spin_lock_irqsave(&subpage->lock, flags); 351 subpage->writeback_bitmap &= ~tmp; 352 if (subpage->writeback_bitmap == 0) 353 end_page_writeback(page); 354 spin_unlock_irqrestore(&subpage->lock, flags); 355 } 356 357 static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) 358 { 359 u64 orig_start = *start; 360 u32 orig_len = *len; 361 362 *start = max_t(u64, page_offset(page), orig_start); 363 *len = min_t(u64, page_offset(page) + PAGE_SIZE, 364 orig_start + orig_len) - *start; 365 } 366 367 /* 368 * Unlike set/clear which is dependent on each page status, for test all bits 369 * are tested in the same way. 370 */ 371 #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ 372 bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ 373 struct page *page, u64 start, u32 len) \ 374 { \ 375 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ 376 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ 377 unsigned long flags; \ 378 bool ret; \ 379 \ 380 spin_lock_irqsave(&subpage->lock, flags); \ 381 ret = ((subpage->name##_bitmap & tmp) == tmp); \ 382 spin_unlock_irqrestore(&subpage->lock, flags); \ 383 return ret; \ 384 } 385 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); 386 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); 387 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); 388 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); 389 390 /* 391 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed 392 * in. 
We only test sectorsize == PAGE_SIZE cases so far, thus we can fall 393 * back to regular sectorsize branch. 394 */ 395 #define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ 396 test_page_func) \ 397 void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ 398 struct page *page, u64 start, u32 len) \ 399 { \ 400 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 401 set_page_func(page); \ 402 return; \ 403 } \ 404 btrfs_subpage_set_##name(fs_info, page, start, len); \ 405 } \ 406 void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ 407 struct page *page, u64 start, u32 len) \ 408 { \ 409 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 410 clear_page_func(page); \ 411 return; \ 412 } \ 413 btrfs_subpage_clear_##name(fs_info, page, start, len); \ 414 } \ 415 bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ 416 struct page *page, u64 start, u32 len) \ 417 { \ 418 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ 419 return test_page_func(page); \ 420 return btrfs_subpage_test_##name(fs_info, page, start, len); \ 421 } \ 422 void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ 423 struct page *page, u64 start, u32 len) \ 424 { \ 425 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 426 set_page_func(page); \ 427 return; \ 428 } \ 429 btrfs_subpage_clamp_range(page, &start, &len); \ 430 btrfs_subpage_set_##name(fs_info, page, start, len); \ 431 } \ 432 void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ 433 struct page *page, u64 start, u32 len) \ 434 { \ 435 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 436 clear_page_func(page); \ 437 return; \ 438 } \ 439 btrfs_subpage_clamp_range(page, &start, &len); \ 440 btrfs_subpage_clear_##name(fs_info, page, start, len); \ 441 } \ 442 bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ 443 struct page *page, u64 start, u32 len) \ 444 
{ \ 445 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ 446 return test_page_func(page); \ 447 btrfs_subpage_clamp_range(page, &start, &len); \ 448 return btrfs_subpage_test_##name(fs_info, page, start, len); \ 449 } 450 IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, 451 PageUptodate); 452 IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); 453 IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, 454 PageDirty); 455 IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, 456 PageWriteback); 457