1 /* 2 * fs/f2fs/gc.c 3 * 4 * Copyright (c) 2012 Samsung Electronics Co., Ltd. 5 * http://www.samsung.com/ 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License version 2 as 9 * published by the Free Software Foundation. 10 */ 11 #include <linux/fs.h> 12 #include <linux/module.h> 13 #include <linux/backing-dev.h> 14 #include <linux/proc_fs.h> 15 #include <linux/init.h> 16 #include <linux/f2fs_fs.h> 17 #include <linux/kthread.h> 18 #include <linux/delay.h> 19 #include <linux/freezer.h> 20 #include <linux/blkdev.h> 21 22 #include "f2fs.h" 23 #include "node.h" 24 #include "segment.h" 25 #include "gc.h" 26 27 static struct kmem_cache *winode_slab; 28 29 static int gc_thread_func(void *data) 30 { 31 struct f2fs_sb_info *sbi = data; 32 wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; 33 long wait_ms; 34 35 wait_ms = GC_THREAD_MIN_SLEEP_TIME; 36 37 do { 38 if (try_to_freeze()) 39 continue; 40 else 41 wait_event_interruptible_timeout(*wq, 42 kthread_should_stop(), 43 msecs_to_jiffies(wait_ms)); 44 if (kthread_should_stop()) 45 break; 46 47 f2fs_balance_fs(sbi); 48 49 if (!test_opt(sbi, BG_GC)) 50 continue; 51 52 /* 53 * [GC triggering condition] 54 * 0. GC is not conducted currently. 55 * 1. There are enough dirty segments. 56 * 2. IO subsystem is idle by checking the # of writeback pages. 57 * 3. IO subsystem is idle by checking the # of requests in 58 * bdev's request list. 59 * 60 * Note) We have to avoid triggering GCs too much frequently. 61 * Because it is possible that some segments can be 62 * invalidated soon after by user update or deletion. 63 * So, I'd like to wait some time to collect dirty segments. 64 */ 65 if (!mutex_trylock(&sbi->gc_mutex)) 66 continue; 67 68 if (!is_idle(sbi)) { 69 wait_ms = increase_sleep_time(wait_ms); 70 mutex_unlock(&sbi->gc_mutex); 71 continue; 72 } 73 74 if (has_enough_invalid_blocks(sbi)) 75 wait_ms = decrease_sleep_time(wait_ms); 76 else 77 wait_ms = increase_sleep_time(wait_ms); 78 79 sbi->bg_gc++; 80 81 if (f2fs_gc(sbi) == GC_NONE) 82 wait_ms = GC_THREAD_NOGC_SLEEP_TIME; 83 else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) 84 wait_ms = GC_THREAD_MAX_SLEEP_TIME; 85 86 } while (!kthread_should_stop()); 87 return 0; 88 } 89 90 int start_gc_thread(struct f2fs_sb_info *sbi) 91 { 92 struct f2fs_gc_kthread *gc_th; 93 94 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); 95 if (!gc_th) 96 return -ENOMEM; 97 98 sbi->gc_thread = gc_th; 99 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); 100 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, 101 GC_THREAD_NAME); 102 if (IS_ERR(gc_th->f2fs_gc_task)) { 103 kfree(gc_th); 104 return -ENOMEM; 105 } 106 return 0; 107 } 108 109 void stop_gc_thread(struct f2fs_sb_info *sbi) 110 { 111 struct f2fs_gc_kthread *gc_th = sbi->gc_thread; 112 if (!gc_th) 113 return; 114 kthread_stop(gc_th->f2fs_gc_task); 115 kfree(gc_th); 116 sbi->gc_thread = NULL; 117 } 118 119 static int select_gc_type(int gc_type) 120 { 121 return (gc_type == BG_GC) ? GC_CB : GC_GREEDY; 122 } 123 124 static void select_policy(struct f2fs_sb_info *sbi, int gc_type, 125 int type, struct victim_sel_policy *p) 126 { 127 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 128 129 if (p->alloc_mode) { 130 p->gc_mode = GC_GREEDY; 131 p->dirty_segmap = dirty_i->dirty_segmap[type]; 132 p->ofs_unit = 1; 133 } else { 134 p->gc_mode = select_gc_type(gc_type); 135 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; 136 p->ofs_unit = sbi->segs_per_sec; 137 } 138 p->offset = sbi->last_victim[p->gc_mode]; 139 } 140 141 static unsigned int get_max_cost(struct f2fs_sb_info *sbi, 142 struct victim_sel_policy *p) 143 { 144 if (p->gc_mode == GC_GREEDY) 145 return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; 146 else if (p->gc_mode == GC_CB) 147 return UINT_MAX; 148 else /* No other gc_mode */ 149 return 0; 150 } 151 152 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) 153 { 154 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 155 unsigned int segno; 156 157 /* 158 * If the gc_type is FG_GC, we can select victim segments 159 * selected by background GC before. 160 * Those segments guarantee they have small valid blocks. 161 */ 162 segno = find_next_bit(dirty_i->victim_segmap[BG_GC], 163 TOTAL_SEGS(sbi), 0); 164 if (segno < TOTAL_SEGS(sbi)) { 165 clear_bit(segno, dirty_i->victim_segmap[BG_GC]); 166 return segno; 167 } 168 return NULL_SEGNO; 169 } 170 171 static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) 172 { 173 struct sit_info *sit_i = SIT_I(sbi); 174 unsigned int secno = GET_SECNO(sbi, segno); 175 unsigned int start = secno * sbi->segs_per_sec; 176 unsigned long long mtime = 0; 177 unsigned int vblocks; 178 unsigned char age = 0; 179 unsigned char u; 180 unsigned int i; 181 182 for (i = 0; i < sbi->segs_per_sec; i++) 183 mtime += get_seg_entry(sbi, start + i)->mtime; 184 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); 185 186 mtime = div_u64(mtime, sbi->segs_per_sec); 187 vblocks = div_u64(vblocks, sbi->segs_per_sec); 188 189 u = (vblocks * 100) >> sbi->log_blocks_per_seg; 190 191 /* Handle if the system time is changed by user */ 192 if (mtime < sit_i->min_mtime) 193 sit_i->min_mtime = mtime; 194 if (mtime > sit_i->max_mtime) 195 sit_i->max_mtime = mtime; 196 if (sit_i->max_mtime != sit_i->min_mtime) 197 age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), 198 sit_i->max_mtime - sit_i->min_mtime); 199 200 return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); 201 } 202 203 static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, 204 struct victim_sel_policy *p) 205 { 206 if (p->alloc_mode == SSR) 207 return get_seg_entry(sbi, segno)->ckpt_valid_blocks; 208 209 /* alloc_mode == LFS */ 210 if (p->gc_mode == GC_GREEDY) 211 return get_valid_blocks(sbi, segno, sbi->segs_per_sec); 212 else 213 return get_cb_cost(sbi, segno); 214 } 215 216 /* 217 * This function is called from two pathes. 218 * One is garbage collection and the other is SSR segment selection. 219 * When it is called during GC, it just gets a victim segment 220 * and it does not remove it from dirty seglist. 221 * When it is called from SSR segment selection, it finds a segment 222 * which has minimum valid blocks and removes it from dirty seglist. 223 */ 224 static int get_victim_by_default(struct f2fs_sb_info *sbi, 225 unsigned int *result, int gc_type, int type, char alloc_mode) 226 { 227 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 228 struct victim_sel_policy p; 229 unsigned int segno; 230 int nsearched = 0; 231 232 p.alloc_mode = alloc_mode; 233 select_policy(sbi, gc_type, type, &p); 234 235 p.min_segno = NULL_SEGNO; 236 p.min_cost = get_max_cost(sbi, &p); 237 238 mutex_lock(&dirty_i->seglist_lock); 239 240 if (p.alloc_mode == LFS && gc_type == FG_GC) { 241 p.min_segno = check_bg_victims(sbi); 242 if (p.min_segno != NULL_SEGNO) 243 goto got_it; 244 } 245 246 while (1) { 247 unsigned long cost; 248 249 segno = find_next_bit(p.dirty_segmap, 250 TOTAL_SEGS(sbi), p.offset); 251 if (segno >= TOTAL_SEGS(sbi)) { 252 if (sbi->last_victim[p.gc_mode]) { 253 sbi->last_victim[p.gc_mode] = 0; 254 p.offset = 0; 255 continue; 256 } 257 break; 258 } 259 p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; 260 261 if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) 262 continue; 263 if (gc_type == BG_GC && 264 test_bit(segno, dirty_i->victim_segmap[BG_GC])) 265 continue; 266 if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) 267 continue; 268 269 cost = get_gc_cost(sbi, segno, &p); 270 271 if (p.min_cost > cost) { 272 p.min_segno = segno; 273 p.min_cost = cost; 274 } 275 276 if (cost == get_max_cost(sbi, &p)) 277 continue; 278 279 if (nsearched++ >= MAX_VICTIM_SEARCH) { 280 sbi->last_victim[p.gc_mode] = segno; 281 break; 282 } 283 } 284 got_it: 285 if (p.min_segno != NULL_SEGNO) { 286 *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; 287 if (p.alloc_mode == LFS) { 288 int i; 289 for (i = 0; i < p.ofs_unit; i++) 290 set_bit(*result + i, 291 dirty_i->victim_segmap[gc_type]); 292 } 293 } 294 mutex_unlock(&dirty_i->seglist_lock); 295 296 return (p.min_segno == NULL_SEGNO) ? 0 : 1; 297 } 298 299 static const struct victim_selection default_v_ops = { 300 .get_victim = get_victim_by_default, 301 }; 302 303 static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) 304 { 305 struct list_head *this; 306 struct inode_entry *ie; 307 308 list_for_each(this, ilist) { 309 ie = list_entry(this, struct inode_entry, list); 310 if (ie->inode->i_ino == ino) 311 return ie->inode; 312 } 313 return NULL; 314 } 315 316 static void add_gc_inode(struct inode *inode, struct list_head *ilist) 317 { 318 struct list_head *this; 319 struct inode_entry *new_ie, *ie; 320 321 list_for_each(this, ilist) { 322 ie = list_entry(this, struct inode_entry, list); 323 if (ie->inode == inode) { 324 iput(inode); 325 return; 326 } 327 } 328 repeat: 329 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); 330 if (!new_ie) { 331 cond_resched(); 332 goto repeat; 333 } 334 new_ie->inode = inode; 335 list_add_tail(&new_ie->list, ilist); 336 } 337 338 static void put_gc_inode(struct list_head *ilist) 339 { 340 struct inode_entry *ie, *next_ie; 341 list_for_each_entry_safe(ie, next_ie, ilist, list) { 342 iput(ie->inode); 343 list_del(&ie->list); 344 kmem_cache_free(winode_slab, ie); 345 } 346 } 347 348 static int check_valid_map(struct f2fs_sb_info *sbi, 349 unsigned int segno, int offset) 350 { 351 struct sit_info *sit_i = SIT_I(sbi); 352 struct seg_entry *sentry; 353 int ret; 354 355 mutex_lock(&sit_i->sentry_lock); 356 sentry = get_seg_entry(sbi, segno); 357 ret = f2fs_test_bit(offset, sentry->cur_valid_map); 358 mutex_unlock(&sit_i->sentry_lock); 359 return ret ? GC_OK : GC_NEXT; 360 } 361 362 /* 363 * This function compares node address got in summary with that in NAT. 364 * On validity, copy that node with cold status, otherwise (invalid node) 365 * ignore that. 366 */ 367 static int gc_node_segment(struct f2fs_sb_info *sbi, 368 struct f2fs_summary *sum, unsigned int segno, int gc_type) 369 { 370 bool initial = true; 371 struct f2fs_summary *entry; 372 int off; 373 374 next_step: 375 entry = sum; 376 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { 377 nid_t nid = le32_to_cpu(entry->nid); 378 struct page *node_page; 379 int err; 380 381 /* 382 * It makes sure that free segments are able to write 383 * all the dirty node pages before CP after this CP. 384 * So let's check the space of dirty node pages. 385 */ 386 if (should_do_checkpoint(sbi)) { 387 mutex_lock(&sbi->cp_mutex); 388 block_operations(sbi); 389 return GC_BLOCKED; 390 } 391 392 err = check_valid_map(sbi, segno, off); 393 if (err == GC_NEXT) 394 continue; 395 396 if (initial) { 397 ra_node_page(sbi, nid); 398 continue; 399 } 400 node_page = get_node_page(sbi, nid); 401 if (IS_ERR(node_page)) 402 continue; 403 404 /* set page dirty and write it */ 405 if (!PageWriteback(node_page)) 406 set_page_dirty(node_page); 407 f2fs_put_page(node_page, 1); 408 stat_inc_node_blk_count(sbi, 1); 409 } 410 if (initial) { 411 initial = false; 412 goto next_step; 413 } 414 415 if (gc_type == FG_GC) { 416 struct writeback_control wbc = { 417 .sync_mode = WB_SYNC_ALL, 418 .nr_to_write = LONG_MAX, 419 .for_reclaim = 0, 420 }; 421 sync_node_pages(sbi, 0, &wbc); 422 } 423 return GC_DONE; 424 } 425 426 /* 427 * Calculate start block index indicating the given node offset. 428 * Be careful, caller should give this node offset only indicating direct node 429 * blocks. If any node offsets, which point the other types of node blocks such 430 * as indirect or double indirect node blocks, are given, it must be a caller's 431 * bug. 432 */ 433 block_t start_bidx_of_node(unsigned int node_ofs) 434 { 435 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; 436 unsigned int bidx; 437 438 if (node_ofs == 0) 439 return 0; 440 441 if (node_ofs <= 2) { 442 bidx = node_ofs - 1; 443 } else if (node_ofs <= indirect_blks) { 444 int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); 445 bidx = node_ofs - 2 - dec; 446 } else { 447 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); 448 bidx = node_ofs - 5 - dec; 449 } 450 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; 451 } 452 453 static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 454 struct node_info *dni, block_t blkaddr, unsigned int *nofs) 455 { 456 struct page *node_page; 457 nid_t nid; 458 unsigned int ofs_in_node; 459 block_t source_blkaddr; 460 461 nid = le32_to_cpu(sum->nid); 462 ofs_in_node = le16_to_cpu(sum->ofs_in_node); 463 464 node_page = get_node_page(sbi, nid); 465 if (IS_ERR(node_page)) 466 return GC_NEXT; 467 468 get_node_info(sbi, nid, dni); 469 470 if (sum->version != dni->version) { 471 f2fs_put_page(node_page, 1); 472 return GC_NEXT; 473 } 474 475 *nofs = ofs_of_node(node_page); 476 source_blkaddr = datablock_addr(node_page, ofs_in_node); 477 f2fs_put_page(node_page, 1); 478 479 if (source_blkaddr != blkaddr) 480 return GC_NEXT; 481 return GC_OK; 482 } 483 484 static void move_data_page(struct inode *inode, struct page *page, int gc_type) 485 { 486 if (page->mapping != inode->i_mapping) 487 goto out; 488 489 if (inode != page->mapping->host) 490 goto out; 491 492 if (PageWriteback(page)) 493 goto out; 494 495 if (gc_type == BG_GC) { 496 set_page_dirty(page); 497 set_cold_data(page); 498 } else { 499 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 500 mutex_lock_op(sbi, DATA_WRITE); 501 if (clear_page_dirty_for_io(page) && 502 S_ISDIR(inode->i_mode)) { 503 dec_page_count(sbi, F2FS_DIRTY_DENTS); 504 inode_dec_dirty_dents(inode); 505 } 506 set_cold_data(page); 507 do_write_data_page(page); 508 mutex_unlock_op(sbi, DATA_WRITE); 509 clear_cold_data(page); 510 } 511 out: 512 f2fs_put_page(page, 1); 513 } 514 515 /* 516 * This function tries to get parent node of victim data block, and identifies 517 * data block validity. If the block is valid, copy that with cold status and 518 * modify parent node. 519 * If the parent node is not valid or the data block address is different, 520 * the victim data block is ignored. 521 */ 522 static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 523 struct list_head *ilist, unsigned int segno, int gc_type) 524 { 525 struct super_block *sb = sbi->sb; 526 struct f2fs_summary *entry; 527 block_t start_addr; 528 int err, off; 529 int phase = 0; 530 531 start_addr = START_BLOCK(sbi, segno); 532 533 next_step: 534 entry = sum; 535 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { 536 struct page *data_page; 537 struct inode *inode; 538 struct node_info dni; /* dnode info for the data */ 539 unsigned int ofs_in_node, nofs; 540 block_t start_bidx; 541 542 /* 543 * It makes sure that free segments are able to write 544 * all the dirty node pages before CP after this CP. 545 * So let's check the space of dirty node pages. 546 */ 547 if (should_do_checkpoint(sbi)) { 548 mutex_lock(&sbi->cp_mutex); 549 block_operations(sbi); 550 err = GC_BLOCKED; 551 goto stop; 552 } 553 554 err = check_valid_map(sbi, segno, off); 555 if (err == GC_NEXT) 556 continue; 557 558 if (phase == 0) { 559 ra_node_page(sbi, le32_to_cpu(entry->nid)); 560 continue; 561 } 562 563 /* Get an inode by ino with checking validity */ 564 err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); 565 if (err == GC_NEXT) 566 continue; 567 568 if (phase == 1) { 569 ra_node_page(sbi, dni.ino); 570 continue; 571 } 572 573 start_bidx = start_bidx_of_node(nofs); 574 ofs_in_node = le16_to_cpu(entry->ofs_in_node); 575 576 if (phase == 2) { 577 inode = f2fs_iget_nowait(sb, dni.ino); 578 if (IS_ERR(inode)) 579 continue; 580 581 data_page = find_data_page(inode, 582 start_bidx + ofs_in_node); 583 if (IS_ERR(data_page)) 584 goto next_iput; 585 586 f2fs_put_page(data_page, 0); 587 add_gc_inode(inode, ilist); 588 } else { 589 inode = find_gc_inode(dni.ino, ilist); 590 if (inode) { 591 data_page = get_lock_data_page(inode, 592 start_bidx + ofs_in_node); 593 if (IS_ERR(data_page)) 594 continue; 595 move_data_page(inode, data_page, gc_type); 596 stat_inc_data_blk_count(sbi, 1); 597 } 598 } 599 continue; 600 next_iput: 601 iput(inode); 602 } 603 if (++phase < 4) 604 goto next_step; 605 err = GC_DONE; 606 stop: 607 if (gc_type == FG_GC) 608 f2fs_submit_bio(sbi, DATA, true); 609 return err; 610 } 611 612 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, 613 int gc_type, int type) 614 { 615 struct sit_info *sit_i = SIT_I(sbi); 616 int ret; 617 mutex_lock(&sit_i->sentry_lock); 618 ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS); 619 mutex_unlock(&sit_i->sentry_lock); 620 return ret; 621 } 622 623 static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, 624 struct list_head *ilist, int gc_type) 625 { 626 struct page *sum_page; 627 struct f2fs_summary_block *sum; 628 int ret = GC_DONE; 629 630 /* read segment summary of victim */ 631 sum_page = get_sum_page(sbi, segno); 632 if (IS_ERR(sum_page)) 633 return GC_ERROR; 634 635 /* 636 * CP needs to lock sum_page. In this time, we don't need 637 * to lock this page, because this summary page is not gone anywhere. 638 * Also, this page is not gonna be updated before GC is done. 639 */ 640 unlock_page(sum_page); 641 sum = page_address(sum_page); 642 643 switch (GET_SUM_TYPE((&sum->footer))) { 644 case SUM_TYPE_NODE: 645 ret = gc_node_segment(sbi, sum->entries, segno, gc_type); 646 break; 647 case SUM_TYPE_DATA: 648 ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); 649 break; 650 } 651 stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); 652 stat_inc_call_count(sbi->stat_info); 653 654 f2fs_put_page(sum_page, 0); 655 return ret; 656 } 657 658 int f2fs_gc(struct f2fs_sb_info *sbi) 659 { 660 struct list_head ilist; 661 unsigned int segno, i; 662 int gc_type = BG_GC; 663 int gc_status = GC_NONE; 664 665 INIT_LIST_HEAD(&ilist); 666 gc_more: 667 if (!(sbi->sb->s_flags & MS_ACTIVE)) 668 goto stop; 669 670 if (has_not_enough_free_secs(sbi)) 671 gc_type = FG_GC; 672 673 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) 674 goto stop; 675 676 for (i = 0; i < sbi->segs_per_sec; i++) { 677 /* 678 * do_garbage_collect will give us three gc_status: 679 * GC_ERROR, GC_DONE, and GC_BLOCKED. 680 * If GC is finished uncleanly, we have to return 681 * the victim to dirty segment list. 682 */ 683 gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); 684 if (gc_status != GC_DONE) 685 break; 686 } 687 if (has_not_enough_free_secs(sbi)) { 688 write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); 689 if (has_not_enough_free_secs(sbi)) 690 goto gc_more; 691 } 692 stop: 693 mutex_unlock(&sbi->gc_mutex); 694 695 put_gc_inode(&ilist); 696 return gc_status; 697 } 698 699 void build_gc_manager(struct f2fs_sb_info *sbi) 700 { 701 DIRTY_I(sbi)->v_ops = &default_v_ops; 702 } 703 704 int __init create_gc_caches(void) 705 { 706 winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", 707 sizeof(struct inode_entry), NULL); 708 if (!winode_slab) 709 return -ENOMEM; 710 return 0; 711 } 712 713 void destroy_gc_caches(void) 714 { 715 kmem_cache_destroy(winode_slab); 716 } 717