/*
 * fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/rbtree.h>
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"
#include "extents_status.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussion at the Ext4 Developer Workshop, we
 * will introduce a new structure called the io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. reservation space warning), and to provide extent-level locking.
 * The delay extent tree is the first step towards this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called the
 * delay extent tree, whose only goal was to track delayed extents in
 * memory to simplify the implementation of fiemap and bigalloc, and to
 * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it was still
 * called the delay extent tree in the first commit.  To better reflect
 * what it does, it has since been renamed to the extent status tree.
 *
 * Step1:
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  It maintains the delayed extent when a delayed
 * allocation is issued, and the delayed extent is written out or
 * invalidated.  Therefore the implementations of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 *
 * Step2:
 * In this step all extent status is tracked by the extent status tree.
 * Thus, we can first try to look up a block mapping in this tree before
 * searching the extent tree.  Hence, the single extent cache can be
 * removed because the extent status tree can do a better job.  Extents
 * in the status tree are loaded on demand.  Therefore, the extent status
 * tree may not contain all of the extents in a file.  Meanwhile we define
 * a shrinker to reclaim memory from the extent status tree because a
 * fragmented extent tree will make the status tree cost too much memory.
 * Written/unwritten/hole extents in the tree will be reclaimed by this
 * shrinker when we are under high memory pressure.  Delayed extents will
 * not be reclaimed because fiemap, bigalloc, and seek_data/hole need them.
 */

/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status tree tracks all extent status.
 *
 * 1. Why do we need to implement the extent status tree?
 *
 *	Without the extent status tree, ext4 identifies a delayed extent
 *	by looking up the page cache, which leads to complicated, buggy,
 *	and inefficient code.
 *
 *	FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if
 *	a block or a range of blocks belongs to a delayed extent.
 *
 *	Let us have a look at how they work without the extent status tree.
 *	--	FIEMAP
 *		FIEMAP looks up the page cache to identify delayed
 *		allocations from holes.
 *
 *	--	SEEK_HOLE/DATA
 *		SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *	--	bigalloc
 *		bigalloc looks up the page cache to figure out if a block
 *		is already under delayed allocation or not, to determine
 *		whether quota reservation is needed for the cluster.
 *
 *	--	writeout
 *		Writeout looks up the whole page cache to see if a buffer
 *		is mapped.  If there are not very many delayed buffers,
 *		then it is time consuming.
 *
 *	With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 *	bigalloc and writeout can figure out if a block or a range of
 *	blocks is under delayed allocation (i.e. belongs to a delayed
 *	extent) or not by searching the extent status tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 *	-- extent
 *	An extent is a range of blocks which are logically and physically
 *	contiguous.  Unlike an extent in the extent tree, this extent is
 *	an in-memory struct; there is no corresponding on-disk data.
 *	There is no limit on the length of an extent, so it can contain
 *	as many blocks as are logically and physically contiguous.
 *
 *	-- extent status tree
 *	Every inode has an extent status tree and all allocated blocks
 *	are added to the tree with a status.  The extents in the tree
 *	are ordered by logical block number.
 *
 *	-- operations on an extent status tree
 *	There are three important operations on an extent status tree:
 *	finding the next extent, adding an extent (a range of blocks),
 *	and removing an extent.
 *
 *	-- races on an extent status tree
 *	The extent status tree is protected by inode->i_es_lock.
 *
 *	-- memory consumption
 *	A fragmented extent tree will make the extent status tree cost
 *	too much memory.  Hence, we reclaim written/unwritten/hole extents
 *	from the tree under heavy memory pressure.
 *
 *
 * ==========================================================================
 * 3. Performance analysis
 *
 *	-- overhead
 *	1. There is a cached extent for write access, so if writes are
 *	not very random, adding space operations are in O(1) time.
 *
 *	-- gain
 *	2. Code is much simpler, more readable, more maintainable and
 *	more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *
 *	-- Refactor delayed space reservation
 *
 *	-- Extent-level locking
 */
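
/*
 * Informal usage sketch (illustrative only, not a call sequence taken
 * from this file).  A caller that just mapped some blocks records the
 * result and can later query or drop it roughly like this; the status
 * flag names are assumed to come from extents_status.h:
 *
 *	ext4_es_insert_extent(inode, lblk, len, pblk, EXTENT_STATUS_WRITTEN);
 *	if (ext4_es_lookup_extent(inode, lblk, &es))
 *		... use es.es_pblk and es.es_len ...
 *	ext4_es_remove_extent(inode, lblk, len);
 */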

static struct kmem_cache *ext4_es_cachep;

static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
				       int nr_to_scan);
static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
			    struct ext4_inode_info *locked_ei);

int __init ext4_init_es(void)
{
	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
					   sizeof(struct extent_status),
					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	if (ext4_es_cachep)
		kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;
		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u) %llu %x",
		       es->es_lblk, es->es_len,
		       ext4_es_pblock(es), ext4_es_status(es));
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
	return es->es_lblk + es->es_len - 1;
}

/*
 * Search through the tree for a delayed extent with a given offset.  If
 * it can't be found, try to find the next extent.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t lblk)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es))
			node = node->rb_right;
		else
			return es;
	}
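
	/*
	 * The loop above fell off the tree without an exact match; "es"
	 * still points at the last node compared.  If that node starts
	 * beyond "lblk" it is already the next extent, otherwise return
	 * its in-order successor (or NULL if none exists).
	 */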
	if (es && lblk < es->es_lblk)
		return es;

	if (es && lblk > ext4_es_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}

/*
 * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
 * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
 *
 * @inode: the inode which owns delayed extents
 * @lblk: the offset where we start to search
 * @end: the offset where we stop to search
 * @es: delayed extent that we found
 */
void ext4_es_find_delayed_extent_range(struct inode *inode,
				       ext4_lblk_t lblk, ext4_lblk_t end,
				       struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;

	BUG_ON(es == NULL);
	BUG_ON(end < lblk);
	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* find extent in cache first */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u) %llu %x\n",
				 lblk, es1->es_lblk, es1->es_len,
				 ext4_es_pblock(es1), ext4_es_status(es1));
			goto out;
		}
	}

	es1 = __es_tree_search(&tree->root, lblk);

out:
	if (es1 && !ext4_es_is_delayed(es1)) {
		while ((node = rb_next(&es1->rb_node)) != NULL) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			if (es1->es_lblk > end) {
				es1 = NULL;
				break;
			}
			if (ext4_es_is_delayed(es1))
				break;
		}
	}

	if (es1 && ext4_es_is_delayed(es1)) {
		tree->cache_es = es1;
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}

static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
		     ext4_fsblk_t pblk)
{
	struct extent_status *es;
	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->es_lblk = lblk;
	es->es_len = len;
	es->es_pblk = pblk;

	/*
	 * We don't count delayed extents because we never try to reclaim them
	 */
	if (!ext4_es_is_delayed(es)) {
		EXT4_I(inode)->i_es_lru_nr++;
		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
				   s_es_stats.es_stats_lru_cnt);
	}

	EXT4_I(inode)->i_es_all_nr++;
	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	return es;
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
	EXT4_I(inode)->i_es_all_nr--;
	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	/* Decrease the lru counter when this es is not delayed */
	if (!ext4_es_is_delayed(es)) {
		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
		EXT4_I(inode)->i_es_lru_nr--;
		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
				   s_es_stats.es_stats_lru_cnt);
	}

	kmem_cache_free(ext4_es_cachep, es);
}

/*
 * Check whether or not two extents can be merged.
 * Conditions:
 *  - logical block numbers are contiguous
 *  - physical block numbers are contiguous
 *  - status is equal
 */
static int ext4_es_can_be_merged(struct extent_status *es1,
				 struct extent_status *es2)
{
	if (ext4_es_status(es1) != ext4_es_status(es2))
		return 0;

	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
		pr_warn("ES assertion failed when merging extents. "
			"The sum of lengths of es1 (%d) and es2 (%d) "
			"is bigger than allowed file size (%d)\n",
			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
		WARN_ON(1);
		return 0;
	}

	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
		return 0;

	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
		return 1;

	if (ext4_es_is_hole(es1))
		return 1;

	/* delayed extents can be merged only if they are not also unwritten */
	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
		return 1;

	return 0;
}

static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es1, es)) {
		es1->es_len += es->es_len;
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es, es1)) {
		es->es_len += es1->es_len;
		rb_erase(node, &tree->root);
		ext4_es_free_extent(inode, es1);
	}

	return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */

static void ext4_es_insert_extent_ext_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t ee_block;
	ext4_fsblk_t ee_start;
	unsigned short ee_len;
	int depth, ee_status, es_status;

	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;

	if (ex) {

		ee_block = le32_to_cpu(ex->ee_block);
		ee_start = ext4_ext_pblock(ex);
		ee_len = ext4_ext_get_actual_len(ex);

		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
		es_status = ext4_es_is_unwritten(es) ? 1 : 0;

		/*
		 * Make sure ex and es do not overlap when we try to insert
		 * a delayed/hole extent.
		 */
		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
			if (in_range(es->es_lblk, ee_block, ee_len)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu we can find an extent "
					"at block [%d/%d/%llu/%c], but we "
					"want to add a delayed/hole extent "
					"[%d/%d/%llu/%x]\n",
					inode->i_ino, ee_block, ee_len,
					ee_start, ee_status ? 'u' : 'w',
					es->es_lblk, es->es_len,
					ext4_es_pblock(es), ext4_es_status(es));
			}
			goto out;
		}

		/*
		 * We don't check ee_block == es->es_lblk, etc. because es
		 * might be part of a whole extent, or vice versa.
		 */
		if (es->es_lblk < ee_block ||
		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
			goto out;
		}

		if (ee_status ^ es_status) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
		}
	} else {
		/*
		 * We can't find an extent on disk, so we need to make sure
		 * that we are not trying to add a written/unwritten extent.
		 */
		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"can't find an extent at block %d but we want "
				"to add a written/unwritten extent "
				"[%d/%d/%llu/%x]\n", inode->i_ino,
				es->es_lblk, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
		}
	}
out:
	ext4_ext_drop_refs(path);
	kfree(path);
}

static void ext4_es_insert_extent_ind_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_map_blocks map;
	int retval;

	/*
	 * Here we call ext4_ind_map_blocks to look up a block mapping
	 * because the 'Indirect' structure is defined in indirect.c, so
	 * we can't access the direct/indirect tree from outside it, and
	 * it would be too ugly to define this function in indirect.c.
	 */

	map.m_lblk = es->es_lblk;
	map.m_len = es->es_len;

	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
	if (retval > 0) {
		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
			/*
			 * We want to add a delayed/hole extent but this
			 * block has been allocated.
			 */
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can find blocks but we want to add a "
				"delayed/hole extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		} else if (ext4_es_is_written(es)) {
			if (retval != es->es_len) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu retval %d != es_len %d\n",
					inode->i_ino, retval, es->es_len);
				return;
			}
			if (map.m_pblk != ext4_es_pblock(es)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu m_pblk %llu != "
					"es_pblk %llu\n",
					inode->i_ino, map.m_pblk,
					ext4_es_pblock(es));
				return;
			}
		} else {
			/*
			 * We don't need to check unwritten extents because
			 * indirect-based files don't have them.
			 */
			BUG_ON(1);
		}
	} else if (retval == 0) {
		if (ext4_es_is_written(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can't find the block but we want to add "
				"a written extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		}
	}
}

static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
	/*
	 * We don't need to worry about races because the caller holds
	 * i_data_sem.
	 */
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_es_insert_extent_ext_check(inode, es);
	else
		ext4_es_insert_extent_ind_check(inode, es);
}
#else
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
}
#endif

static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;

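	/*
	 * Walk down the tree looking for an insertion point.  If "newes"
	 * is logically adjacent to a node we pass and can be merged with
	 * it (see ext4_es_can_be_merged()), extend that node in place and
	 * then try to coalesce it with its neighbour instead of inserting
	 * a new node.
	 */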
	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (newes->es_lblk < es->es_lblk) {
			if (ext4_es_can_be_merged(newes, es)) {
				/*
				 * Here we can modify es_lblk directly
				 * because the ranges do not overlap.
				 */
				es->es_lblk = newes->es_lblk;
				es->es_len += newes->es_len;
				if (ext4_es_is_written(es) ||
				    ext4_es_is_unwritten(es))
					ext4_es_store_pblock(es,
							     newes->es_pblk);
				es = ext4_es_try_to_merge_left(inode, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (newes->es_lblk > ext4_es_end(es)) {
			if (ext4_es_can_be_merged(es, newes)) {
				es->es_len += newes->es_len;
				es = ext4_es_try_to_merge_right(inode, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			BUG_ON(1);
			return -EINVAL;
		}
	}

	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
				  newes->es_pblk);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}

/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err = 0;

	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
		 lblk, len, pblk, status, inode->i_ino);

	if (!len)
		return 0;

	BUG_ON(end < lblk);

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_insert_extent(inode, &newes);

	ext4_es_insert_extent_check(inode, &newes);

	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	if (err != 0)
		goto error;
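
	/*
	 * If the insert runs out of memory, try to make room by reclaiming
	 * non-delayed extents from this inode's own tree and retry.  A
	 * failed insert of a non-delayed extent is not fatal: the same
	 * information can be re-read from the on-disk extent tree later,
	 * so ENOMEM is swallowed in that case.
	 */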
retry:
	err = __es_insert_extent(inode, &newes);
	if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
					       EXT4_I(inode)))
		goto retry;
	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
		err = 0;

error:
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}

/*
 * ext4_es_cache_extent() inserts information into the extent status
 * tree if and only if there isn't information about the range in
 * question already.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status *es;
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_cache_extent(inode, &newes);

	if (!len)
		return;

	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);

	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
	if (!es || es->es_lblk > end)
		__es_insert_extent(inode, &newes);
	write_unlock(&EXT4_I(inode)->i_es_lock);
}

/*
 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 *
 * Return: 1 if found, 0 if not.
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
			  struct extent_status *es)
{
	struct ext4_es_tree *tree;
	struct ext4_es_stats *stats;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	int found = 0;

	trace_ext4_es_lookup_extent_enter(inode, lblk);
	es_debug("lookup extent in block %u\n", lblk);

	tree = &EXT4_I(inode)->i_es_tree;
	read_lock(&EXT4_I(inode)->i_es_lock);

	/* find extent in cache first */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u)\n",
				 lblk, es1->es_lblk, es1->es_len);
			found = 1;
			goto out;
		}
	}

	node = tree->root.rb_node;
	while (node) {
		es1 = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es1->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es1))
			node = node->rb_right;
		else {
			found = 1;
			break;
		}
	}

out:
	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
	if (found) {
		BUG_ON(!es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
		stats->es_stats_cache_hits++;
	} else {
		stats->es_stats_cache_misses++;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_lookup_extent_exit(inode, es, found);
	return found;
}

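/*
 * __es_remove_extent() removes the status extents covering [lblk, end]
 * from the tree.  The first overlapping extent may have to be split:
 * "len1" is the surviving head before "lblk" and "len2" the surviving
 * tail after "end".  When both survive, a new extent is inserted for
 * the tail; when only one survives, the existing node is trimmed in
 * place.  Fully covered extents are then erased one by one, and a
 * final partially covered extent is trimmed from the left.
 *
 * The caller must hold i_es_lock for writing.
 */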
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2;
	ext4_fsblk_t block;
	int err;

retry:
	err = 0;
	es = __es_tree_search(&tree->root, lblk);
	if (!es)
		goto out;
	if (es->es_lblk > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

	orig_es.es_lblk = es->es_lblk;
	orig_es.es_len = es->es_len;
	orig_es.es_pblk = es->es_pblk;

	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
	if (len1 > 0)
		es->es_len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			struct extent_status newes;

			newes.es_lblk = end + 1;
			newes.es_len = len2;
			block = 0x7FDEADBEEFULL;
			if (ext4_es_is_written(&orig_es) ||
			    ext4_es_is_unwritten(&orig_es))
				block = ext4_es_pblock(&orig_es) +
					orig_es.es_len - len2;
			ext4_es_store_pblock_status(&newes, block,
						    ext4_es_status(&orig_es));
			err = __es_insert_extent(inode, &newes);
			if (err) {
				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				if ((err == -ENOMEM) &&
				    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
						     EXT4_I(inode)))
					goto retry;
				goto out;
			}
		} else {
			es->es_lblk = end + 1;
			es->es_len = len2;
			if (ext4_es_is_written(es) ||
			    ext4_es_is_unwritten(es)) {
				block = orig_es.es_pblk + orig_es.es_len - len2;
				ext4_es_store_pblock(es, block);
			}
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && ext4_es_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->es_lblk < end + 1) {
		ext4_lblk_t orig_len = es->es_len;

		len1 = ext4_es_end(es) - end;
		es->es_lblk = end + 1;
		es->es_len = len1;
		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
			block = es->es_pblk + orig_len - len1;
			ext4_es_store_pblock(es, block);
		}
	}

out:
	return err;
}

/*
 * ext4_es_remove_extent() removes a range of blocks from the extent
 * status tree.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len)
{
	ext4_lblk_t end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, lblk, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);

	if (!len)
		return err;

	end = lblk + len - 1;
	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}

static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
				     struct list_head *b)
{
	struct ext4_inode_info *eia, *eib;
	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
	eib = list_entry(b, struct ext4_inode_info, i_es_lru);

	if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
	    !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
		return 1;
	if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
	    ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
		return -1;
	if (eia->i_touch_when == eib->i_touch_when)
		return 0;
	if (time_after(eia->i_touch_when, eib->i_touch_when))
		return 1;
	else
		return -1;
}
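
/*
 * Core of the extent status shrinker: walk the per-superblock inode LRU
 * and reclaim non-delayed extents until nr_to_scan is exhausted.  Inodes
 * touched since the last sort and (at first) precached inodes are
 * skipped and moved to the tail.  If a pass makes no progress, the list
 * is sorted by i_touch_when and scanned again, allowing precached
 * inodes as a last resort.  If even that fails and a locked inode was
 * passed in, reclaim from that inode directly as a final fallback.
 */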
static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
			    struct ext4_inode_info *locked_ei)
{
	struct ext4_inode_info *ei;
	struct ext4_es_stats *es_stats;
	struct list_head *cur, *tmp;
	LIST_HEAD(skipped);
	ktime_t start_time;
	u64 scan_time;
	int nr_shrunk = 0;
	int retried = 0, skip_precached = 1, nr_skipped = 0;

	es_stats = &sbi->s_es_stats;
	start_time = ktime_get();
	spin_lock(&sbi->s_es_lru_lock);

retry:
	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
		int shrunk;

		/*
		 * If we have already reclaimed all extents from extent
		 * status tree, just stop the loop immediately.
		 */
		if (percpu_counter_read_positive(
				&es_stats->es_stats_lru_cnt) == 0)
			break;

		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);

		/*
		 * Skip the inode that is newer than the last_sorted
		 * time.  Normally we try hard to avoid shrinking
		 * precached inodes, but we will as a last resort.
		 */
		if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
						EXT4_STATE_EXT_PRECACHED))) {
			nr_skipped++;
			list_move_tail(cur, &skipped);
			continue;
		}

		if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
		    !write_trylock(&ei->i_es_lock))
			continue;

		shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
		if (ei->i_es_lru_nr == 0)
			list_del_init(&ei->i_es_lru);
		write_unlock(&ei->i_es_lock);

		nr_shrunk += shrunk;
		nr_to_scan -= shrunk;
		if (nr_to_scan == 0)
			break;
	}

	/* Move the newer inodes into the tail of the LRU list. */
	list_splice_tail(&skipped, &sbi->s_es_lru);
	INIT_LIST_HEAD(&skipped);

	/*
	 * If we skipped any inodes, and we weren't able to make any
	 * forward progress, sort the list and try again.
	 */
	if ((nr_shrunk == 0) && nr_skipped && !retried) {
		retried++;
		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
		es_stats->es_stats_last_sorted = jiffies;
		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
				      i_es_lru);
		/*
		 * If there are no non-precached inodes left on the
		 * list, start releasing precached extents.
		 */
		if (ext4_test_inode_state(&ei->vfs_inode,
					  EXT4_STATE_EXT_PRECACHED))
			skip_precached = 0;
		goto retry;
	}

	spin_unlock(&sbi->s_es_lru_lock);

	if (locked_ei && nr_shrunk == 0)
		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);

	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	if (likely(es_stats->es_stats_scan_time))
		es_stats->es_stats_scan_time = (scan_time +
				es_stats->es_stats_scan_time*3) / 4;
	else
		es_stats->es_stats_scan_time = scan_time;
	if (scan_time > es_stats->es_stats_max_scan_time)
		es_stats->es_stats_max_scan_time = scan_time;
	if (likely(es_stats->es_stats_shrunk))
		es_stats->es_stats_shrunk = (nr_shrunk +
				es_stats->es_stats_shrunk*3) / 4;
	else
		es_stats->es_stats_shrunk = nr_shrunk;

	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
			     nr_skipped, retried);
	return nr_shrunk;
}
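
/*
 * Shrinker callbacks: ext4_es_count() reports the number of reclaimable
 * (i.e. non-delayed) objects from the lru counter, and ext4_es_scan()
 * asks __ext4_es_shrink() to reclaim up to sc->nr_to_scan of them.
 */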
static unsigned long ext4_es_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long nr;
	struct ext4_sb_info *sbi;

	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
	return nr;
}

static unsigned long ext4_es_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct ext4_sb_info *sbi = container_of(shrink,
					struct ext4_sb_info, s_es_shrinker);
	int nr_to_scan = sc->nr_to_scan;
	int ret, nr_shrunk;

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	if (!nr_to_scan)
		return ret;

	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);

	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
}

static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;
}

static void *
ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return NULL;
}

static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = seq->private;
	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
	struct ext4_inode_info *ei, *max = NULL;
	unsigned int inode_cnt = 0;

	if (v != SEQ_START_TOKEN)
		return 0;

	/* here we just find an inode that has the max nr. of objects */
	spin_lock(&sbi->s_es_lru_lock);
	list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
		inode_cnt++;
		if (max && max->i_es_all_nr < ei->i_es_all_nr)
			max = ei;
		else if (!max)
			max = ei;
	}
	spin_unlock(&sbi->s_es_lru_lock);

	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
		   percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
		   es_stats->es_stats_cache_hits,
		   es_stats->es_stats_cache_misses);
	if (es_stats->es_stats_last_sorted != 0)
		seq_printf(seq, "  %u ms last sorted interval\n",
			   jiffies_to_msecs(jiffies -
					    es_stats->es_stats_last_sorted));
	if (inode_cnt)
		seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);

	seq_printf(seq, "average:\n  %llu us scan time\n",
		   div_u64(es_stats->es_stats_scan_time, 1000));
	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
	if (inode_cnt)
		seq_printf(seq,
			"maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
			"  %llu us max scan time\n",
			max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
			div_u64(es_stats->es_stats_max_scan_time, 1000));

	return 0;
}

static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
{
}

static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
	.start = ext4_es_seq_shrinker_info_start,
	.next = ext4_es_seq_shrinker_info_next,
	.stop = ext4_es_seq_shrinker_info_stop,
	.show = ext4_es_seq_shrinker_info_show,
};

static int
ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
{
	int ret;

	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
	if (!ret) {
		struct seq_file *m = file->private_data;
		m->private = PDE_DATA(inode);
	}

	return ret;
}

static int
ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
{
	return seq_release(inode, file);
}

static const struct file_operations ext4_es_seq_shrinker_info_fops = {
	.owner = THIS_MODULE,
	.open = ext4_es_seq_shrinker_info_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = ext4_es_seq_shrinker_info_release,
};

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
	int err;

	INIT_LIST_HEAD(&sbi->s_es_lru);
	spin_lock_init(&sbi->s_es_lru_lock);
	sbi->s_es_stats.es_stats_last_sorted = 0;
	sbi->s_es_stats.es_stats_shrunk = 0;
	sbi->s_es_stats.es_stats_cache_hits = 0;
	sbi->s_es_stats.es_stats_cache_misses = 0;
	sbi->s_es_stats.es_stats_scan_time = 0;
	sbi->s_es_stats.es_stats_max_scan_time = 0;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
	if (err)
		return err;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL);
	if (err)
		goto err1;

	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
	sbi->s_es_shrinker.count_objects = ext4_es_count;
	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
	err = register_shrinker(&sbi->s_es_shrinker);
	if (err)
		goto err2;

	if (sbi->s_proc)
		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
				 &ext4_es_seq_shrinker_info_fops, sbi);

	return 0;

err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
err1:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	return err;
}

void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
	if (sbi->s_proc)
		remove_proc_entry("es_shrinker_info", sbi->s_proc);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
	unregister_shrinker(&sbi->s_es_shrinker);
}

void ext4_es_lru_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	ei->i_touch_when = jiffies;

	if (!list_empty(&ei->i_es_lru))
		return;

	spin_lock(&sbi->s_es_lru_lock);
	if (list_empty(&ei->i_es_lru))
		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
	spin_unlock(&sbi->s_es_lru_lock);
}

void ext4_es_lru_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lru_lock);
	if (!list_empty(&ei->i_es_lru))
		list_del_init(&ei->i_es_lru);
	spin_unlock(&sbi->s_es_lru_lock);
}
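
/*
 * Reclaim up to nr_to_scan non-delayed extents from one inode's tree in
 * logical-block order; delayed extents are left alone (see the comment
 * inside the loop).  The caller is expected to hold i_es_lock for
 * writing; __ext4_es_shrink() takes it via write_trylock() before
 * calling in here.
 */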
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
				       int nr_to_scan)
{
	struct inode *inode = &ei->vfs_inode;
	struct ext4_es_tree *tree = &ei->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	unsigned long nr_shrunk = 0;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (ei->i_es_lru_nr == 0)
		return 0;

	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
	    __ratelimit(&_rs))
		ext4_warning(inode->i_sb, "forced shrink of precached extents");

	node = rb_first(&tree->root);
	while (node != NULL) {
		es = rb_entry(node, struct extent_status, rb_node);
		node = rb_next(&es->rb_node);
		/*
		 * We can't reclaim delayed extents from the status tree
		 * because fiemap, bigalloc, and seek_data/hole need to
		 * use them.
		 */
		if (!ext4_es_is_delayed(es)) {
			rb_erase(&es->rb_node, &tree->root);
			ext4_es_free_extent(inode, es);
			nr_shrunk++;
			if (--nr_to_scan == 0)
				break;
		}
	}
	tree->cache_es = NULL;
	return nr_shrunk;
}