1 /* 2 * fs/f2fs/gc.c 3 * 4 * Copyright (c) 2012 Samsung Electronics Co., Ltd. 5 * http://www.samsung.com/ 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License version 2 as 9 * published by the Free Software Foundation. 10 */ 11 #include <linux/fs.h> 12 #include <linux/module.h> 13 #include <linux/backing-dev.h> 14 #include <linux/proc_fs.h> 15 #include <linux/init.h> 16 #include <linux/f2fs_fs.h> 17 #include <linux/kthread.h> 18 #include <linux/delay.h> 19 #include <linux/freezer.h> 20 #include <linux/blkdev.h> 21 22 #include "f2fs.h" 23 #include "node.h" 24 #include "segment.h" 25 #include "gc.h" 26 27 static struct kmem_cache *winode_slab; 28 29 static int gc_thread_func(void *data) 30 { 31 struct f2fs_sb_info *sbi = data; 32 wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; 33 long wait_ms; 34 35 wait_ms = GC_THREAD_MIN_SLEEP_TIME; 36 37 do { 38 if (try_to_freeze()) 39 continue; 40 else 41 wait_event_interruptible_timeout(*wq, 42 kthread_should_stop(), 43 msecs_to_jiffies(wait_ms)); 44 if (kthread_should_stop()) 45 break; 46 47 f2fs_balance_fs(sbi); 48 49 if (!test_opt(sbi, BG_GC)) 50 continue; 51 52 /* 53 * [GC triggering condition] 54 * 0. GC is not conducted currently. 55 * 1. There are enough dirty segments. 56 * 2. IO subsystem is idle by checking the # of writeback pages. 57 * 3. IO subsystem is idle by checking the # of requests in 58 * bdev's request list. 59 * 60 * Note) We have to avoid triggering GCs too much frequently. 61 * Because it is possible that some segments can be 62 * invalidated soon after by user update or deletion. 63 * So, I'd like to wait some time to collect dirty segments. 64 */ 65 if (!mutex_trylock(&sbi->gc_mutex)) 66 continue; 67 68 if (!is_idle(sbi)) { 69 wait_ms = increase_sleep_time(wait_ms); 70 mutex_unlock(&sbi->gc_mutex); 71 continue; 72 } 73 74 if (has_enough_invalid_blocks(sbi)) 75 wait_ms = decrease_sleep_time(wait_ms); 76 else 77 wait_ms = increase_sleep_time(wait_ms); 78 79 sbi->bg_gc++; 80 81 if (f2fs_gc(sbi) == GC_NONE) 82 wait_ms = GC_THREAD_NOGC_SLEEP_TIME; 83 else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) 84 wait_ms = GC_THREAD_MAX_SLEEP_TIME; 85 86 } while (!kthread_should_stop()); 87 return 0; 88 } 89 90 int start_gc_thread(struct f2fs_sb_info *sbi) 91 { 92 struct f2fs_gc_kthread *gc_th; 93 94 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); 95 if (!gc_th) 96 return -ENOMEM; 97 98 sbi->gc_thread = gc_th; 99 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); 100 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, 101 GC_THREAD_NAME); 102 if (IS_ERR(gc_th->f2fs_gc_task)) { 103 kfree(gc_th); 104 return -ENOMEM; 105 } 106 return 0; 107 } 108 109 void stop_gc_thread(struct f2fs_sb_info *sbi) 110 { 111 struct f2fs_gc_kthread *gc_th = sbi->gc_thread; 112 if (!gc_th) 113 return; 114 kthread_stop(gc_th->f2fs_gc_task); 115 kfree(gc_th); 116 sbi->gc_thread = NULL; 117 } 118 119 static int select_gc_type(int gc_type) 120 { 121 return (gc_type == BG_GC) ? GC_CB : GC_GREEDY; 122 } 123 124 static void select_policy(struct f2fs_sb_info *sbi, int gc_type, 125 int type, struct victim_sel_policy *p) 126 { 127 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 128 129 if (p->alloc_mode) { 130 p->gc_mode = GC_GREEDY; 131 p->dirty_segmap = dirty_i->dirty_segmap[type]; 132 p->ofs_unit = 1; 133 } else { 134 p->gc_mode = select_gc_type(gc_type); 135 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; 136 p->ofs_unit = sbi->segs_per_sec; 137 } 138 p->offset = sbi->last_victim[p->gc_mode]; 139 } 140 141 static unsigned int get_max_cost(struct f2fs_sb_info *sbi, 142 struct victim_sel_policy *p) 143 { 144 if (p->gc_mode == GC_GREEDY) 145 return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; 146 else if (p->gc_mode == GC_CB) 147 return UINT_MAX; 148 else /* No other gc_mode */ 149 return 0; 150 } 151 152 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) 153 { 154 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 155 unsigned int segno; 156 157 /* 158 * If the gc_type is FG_GC, we can select victim segments 159 * selected by background GC before. 160 * Those segments guarantee they have small valid blocks. 161 */ 162 segno = find_next_bit(dirty_i->victim_segmap[BG_GC], 163 TOTAL_SEGS(sbi), 0); 164 if (segno < TOTAL_SEGS(sbi)) { 165 clear_bit(segno, dirty_i->victim_segmap[BG_GC]); 166 return segno; 167 } 168 return NULL_SEGNO; 169 } 170 171 static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) 172 { 173 struct sit_info *sit_i = SIT_I(sbi); 174 unsigned int secno = GET_SECNO(sbi, segno); 175 unsigned int start = secno * sbi->segs_per_sec; 176 unsigned long long mtime = 0; 177 unsigned int vblocks; 178 unsigned char age = 0; 179 unsigned char u; 180 unsigned int i; 181 182 for (i = 0; i < sbi->segs_per_sec; i++) 183 mtime += get_seg_entry(sbi, start + i)->mtime; 184 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); 185 186 mtime = div_u64(mtime, sbi->segs_per_sec); 187 vblocks = div_u64(vblocks, sbi->segs_per_sec); 188 189 u = (vblocks * 100) >> sbi->log_blocks_per_seg; 190 191 /* Handle if the system time is changed by user */ 192 if (mtime < sit_i->min_mtime) 193 sit_i->min_mtime = mtime; 194 if (mtime > sit_i->max_mtime) 195 sit_i->max_mtime = mtime; 196 if (sit_i->max_mtime != sit_i->min_mtime) 197 age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), 198 sit_i->max_mtime - sit_i->min_mtime); 199 200 return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); 201 } 202 203 static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, 204 struct victim_sel_policy *p) 205 { 206 if (p->alloc_mode == SSR) 207 return get_seg_entry(sbi, segno)->ckpt_valid_blocks; 208 209 /* alloc_mode == LFS */ 210 if (p->gc_mode == GC_GREEDY) 211 return get_valid_blocks(sbi, segno, sbi->segs_per_sec); 212 else 213 return get_cb_cost(sbi, segno); 214 } 215 216 /* 217 * This function is called from two pathes. 218 * One is garbage collection and the other is SSR segment selection. 219 * When it is called during GC, it just gets a victim segment 220 * and it does not remove it from dirty seglist. 221 * When it is called from SSR segment selection, it finds a segment 222 * which has minimum valid blocks and removes it from dirty seglist. 223 */ 224 static int get_victim_by_default(struct f2fs_sb_info *sbi, 225 unsigned int *result, int gc_type, int type, char alloc_mode) 226 { 227 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 228 struct victim_sel_policy p; 229 unsigned int segno; 230 int nsearched = 0; 231 232 p.alloc_mode = alloc_mode; 233 select_policy(sbi, gc_type, type, &p); 234 235 p.min_segno = NULL_SEGNO; 236 p.min_cost = get_max_cost(sbi, &p); 237 238 mutex_lock(&dirty_i->seglist_lock); 239 240 if (p.alloc_mode == LFS && gc_type == FG_GC) { 241 p.min_segno = check_bg_victims(sbi); 242 if (p.min_segno != NULL_SEGNO) 243 goto got_it; 244 } 245 246 while (1) { 247 unsigned long cost; 248 249 segno = find_next_bit(p.dirty_segmap, 250 TOTAL_SEGS(sbi), p.offset); 251 if (segno >= TOTAL_SEGS(sbi)) { 252 if (sbi->last_victim[p.gc_mode]) { 253 sbi->last_victim[p.gc_mode] = 0; 254 p.offset = 0; 255 continue; 256 } 257 break; 258 } 259 p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; 260 261 if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) 262 continue; 263 if (gc_type == BG_GC && 264 test_bit(segno, dirty_i->victim_segmap[BG_GC])) 265 continue; 266 if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) 267 continue; 268 269 cost = get_gc_cost(sbi, segno, &p); 270 271 if (p.min_cost > cost) { 272 p.min_segno = segno; 273 p.min_cost = cost; 274 } 275 276 if (cost == get_max_cost(sbi, &p)) 277 continue; 278 279 if (nsearched++ >= MAX_VICTIM_SEARCH) { 280 sbi->last_victim[p.gc_mode] = segno; 281 break; 282 } 283 } 284 got_it: 285 if (p.min_segno != NULL_SEGNO) { 286 *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; 287 if (p.alloc_mode == LFS) { 288 int i; 289 for (i = 0; i < p.ofs_unit; i++) 290 set_bit(*result + i, 291 dirty_i->victim_segmap[gc_type]); 292 } 293 } 294 mutex_unlock(&dirty_i->seglist_lock); 295 296 return (p.min_segno == NULL_SEGNO) ? 0 : 1; 297 } 298 299 static const struct victim_selection default_v_ops = { 300 .get_victim = get_victim_by_default, 301 }; 302 303 static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) 304 { 305 struct list_head *this; 306 struct inode_entry *ie; 307 308 list_for_each(this, ilist) { 309 ie = list_entry(this, struct inode_entry, list); 310 if (ie->inode->i_ino == ino) 311 return ie->inode; 312 } 313 return NULL; 314 } 315 316 static void add_gc_inode(struct inode *inode, struct list_head *ilist) 317 { 318 struct list_head *this; 319 struct inode_entry *new_ie, *ie; 320 321 list_for_each(this, ilist) { 322 ie = list_entry(this, struct inode_entry, list); 323 if (ie->inode == inode) { 324 iput(inode); 325 return; 326 } 327 } 328 repeat: 329 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); 330 if (!new_ie) { 331 cond_resched(); 332 goto repeat; 333 } 334 new_ie->inode = inode; 335 list_add_tail(&new_ie->list, ilist); 336 } 337 338 static void put_gc_inode(struct list_head *ilist) 339 { 340 struct inode_entry *ie, *next_ie; 341 list_for_each_entry_safe(ie, next_ie, ilist, list) { 342 iput(ie->inode); 343 list_del(&ie->list); 344 kmem_cache_free(winode_slab, ie); 345 } 346 } 347 348 static int check_valid_map(struct f2fs_sb_info *sbi, 349 unsigned int segno, int offset) 350 { 351 struct sit_info *sit_i = SIT_I(sbi); 352 struct seg_entry *sentry; 353 int ret; 354 355 mutex_lock(&sit_i->sentry_lock); 356 sentry = get_seg_entry(sbi, segno); 357 ret = f2fs_test_bit(offset, sentry->cur_valid_map); 358 mutex_unlock(&sit_i->sentry_lock); 359 return ret ? GC_OK : GC_NEXT; 360 } 361 362 /* 363 * This function compares node address got in summary with that in NAT. 364 * On validity, copy that node with cold status, otherwise (invalid node) 365 * ignore that. 366 */ 367 static int gc_node_segment(struct f2fs_sb_info *sbi, 368 struct f2fs_summary *sum, unsigned int segno, int gc_type) 369 { 370 bool initial = true; 371 struct f2fs_summary *entry; 372 int off; 373 374 next_step: 375 entry = sum; 376 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { 377 nid_t nid = le32_to_cpu(entry->nid); 378 struct page *node_page; 379 int err; 380 381 /* 382 * It makes sure that free segments are able to write 383 * all the dirty node pages before CP after this CP. 384 * So let's check the space of dirty node pages. 385 */ 386 if (should_do_checkpoint(sbi)) { 387 mutex_lock(&sbi->cp_mutex); 388 block_operations(sbi); 389 return GC_BLOCKED; 390 } 391 392 err = check_valid_map(sbi, segno, off); 393 if (err == GC_NEXT) 394 continue; 395 396 if (initial) { 397 ra_node_page(sbi, nid); 398 continue; 399 } 400 node_page = get_node_page(sbi, nid); 401 if (IS_ERR(node_page)) 402 continue; 403 404 /* set page dirty and write it */ 405 if (!PageWriteback(node_page)) 406 set_page_dirty(node_page); 407 f2fs_put_page(node_page, 1); 408 stat_inc_node_blk_count(sbi, 1); 409 } 410 if (initial) { 411 initial = false; 412 goto next_step; 413 } 414 415 if (gc_type == FG_GC) { 416 struct writeback_control wbc = { 417 .sync_mode = WB_SYNC_ALL, 418 .nr_to_write = LONG_MAX, 419 .for_reclaim = 0, 420 }; 421 sync_node_pages(sbi, 0, &wbc); 422 } 423 return GC_DONE; 424 } 425 426 /* 427 * Calculate start block index that this node page contains 428 */ 429 block_t start_bidx_of_node(unsigned int node_ofs) 430 { 431 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; 432 unsigned int bidx; 433 434 if (node_ofs == 0) 435 return 0; 436 437 if (node_ofs <= 2) { 438 bidx = node_ofs - 1; 439 } else if (node_ofs <= indirect_blks) { 440 int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); 441 bidx = node_ofs - 2 - dec; 442 } else { 443 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); 444 bidx = node_ofs - 5 - dec; 445 } 446 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; 447 } 448 449 static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 450 struct node_info *dni, block_t blkaddr, unsigned int *nofs) 451 { 452 struct page *node_page; 453 nid_t nid; 454 unsigned int ofs_in_node; 455 block_t source_blkaddr; 456 457 nid = le32_to_cpu(sum->nid); 458 ofs_in_node = le16_to_cpu(sum->ofs_in_node); 459 460 node_page = get_node_page(sbi, nid); 461 if (IS_ERR(node_page)) 462 return GC_NEXT; 463 464 get_node_info(sbi, nid, dni); 465 466 if (sum->version != dni->version) { 467 f2fs_put_page(node_page, 1); 468 return GC_NEXT; 469 } 470 471 *nofs = ofs_of_node(node_page); 472 source_blkaddr = datablock_addr(node_page, ofs_in_node); 473 f2fs_put_page(node_page, 1); 474 475 if (source_blkaddr != blkaddr) 476 return GC_NEXT; 477 return GC_OK; 478 } 479 480 static void move_data_page(struct inode *inode, struct page *page, int gc_type) 481 { 482 if (page->mapping != inode->i_mapping) 483 goto out; 484 485 if (inode != page->mapping->host) 486 goto out; 487 488 if (PageWriteback(page)) 489 goto out; 490 491 if (gc_type == BG_GC) { 492 set_page_dirty(page); 493 set_cold_data(page); 494 } else { 495 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 496 mutex_lock_op(sbi, DATA_WRITE); 497 if (clear_page_dirty_for_io(page) && 498 S_ISDIR(inode->i_mode)) { 499 dec_page_count(sbi, F2FS_DIRTY_DENTS); 500 inode_dec_dirty_dents(inode); 501 } 502 set_cold_data(page); 503 do_write_data_page(page); 504 mutex_unlock_op(sbi, DATA_WRITE); 505 clear_cold_data(page); 506 } 507 out: 508 f2fs_put_page(page, 1); 509 } 510 511 /* 512 * This function tries to get parent node of victim data block, and identifies 513 * data block validity. If the block is valid, copy that with cold status and 514 * modify parent node. 515 * If the parent node is not valid or the data block address is different, 516 * the victim data block is ignored. 517 */ 518 static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 519 struct list_head *ilist, unsigned int segno, int gc_type) 520 { 521 struct super_block *sb = sbi->sb; 522 struct f2fs_summary *entry; 523 block_t start_addr; 524 int err, off; 525 int phase = 0; 526 527 start_addr = START_BLOCK(sbi, segno); 528 529 next_step: 530 entry = sum; 531 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { 532 struct page *data_page; 533 struct inode *inode; 534 struct node_info dni; /* dnode info for the data */ 535 unsigned int ofs_in_node, nofs; 536 block_t start_bidx; 537 538 /* 539 * It makes sure that free segments are able to write 540 * all the dirty node pages before CP after this CP. 541 * So let's check the space of dirty node pages. 542 */ 543 if (should_do_checkpoint(sbi)) { 544 mutex_lock(&sbi->cp_mutex); 545 block_operations(sbi); 546 err = GC_BLOCKED; 547 goto stop; 548 } 549 550 err = check_valid_map(sbi, segno, off); 551 if (err == GC_NEXT) 552 continue; 553 554 if (phase == 0) { 555 ra_node_page(sbi, le32_to_cpu(entry->nid)); 556 continue; 557 } 558 559 /* Get an inode by ino with checking validity */ 560 err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); 561 if (err == GC_NEXT) 562 continue; 563 564 if (phase == 1) { 565 ra_node_page(sbi, dni.ino); 566 continue; 567 } 568 569 start_bidx = start_bidx_of_node(nofs); 570 ofs_in_node = le16_to_cpu(entry->ofs_in_node); 571 572 if (phase == 2) { 573 inode = f2fs_iget_nowait(sb, dni.ino); 574 if (IS_ERR(inode)) 575 continue; 576 577 data_page = find_data_page(inode, 578 start_bidx + ofs_in_node); 579 if (IS_ERR(data_page)) 580 goto next_iput; 581 582 f2fs_put_page(data_page, 0); 583 add_gc_inode(inode, ilist); 584 } else { 585 inode = find_gc_inode(dni.ino, ilist); 586 if (inode) { 587 data_page = get_lock_data_page(inode, 588 start_bidx + ofs_in_node); 589 if (IS_ERR(data_page)) 590 continue; 591 move_data_page(inode, data_page, gc_type); 592 stat_inc_data_blk_count(sbi, 1); 593 } 594 } 595 continue; 596 next_iput: 597 iput(inode); 598 } 599 if (++phase < 4) 600 goto next_step; 601 err = GC_DONE; 602 stop: 603 if (gc_type == FG_GC) 604 f2fs_submit_bio(sbi, DATA, true); 605 return err; 606 } 607 608 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, 609 int gc_type, int type) 610 { 611 struct sit_info *sit_i = SIT_I(sbi); 612 int ret; 613 mutex_lock(&sit_i->sentry_lock); 614 ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS); 615 mutex_unlock(&sit_i->sentry_lock); 616 return ret; 617 } 618 619 static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, 620 struct list_head *ilist, int gc_type) 621 { 622 struct page *sum_page; 623 struct f2fs_summary_block *sum; 624 int ret = GC_DONE; 625 626 /* read segment summary of victim */ 627 sum_page = get_sum_page(sbi, segno); 628 if (IS_ERR(sum_page)) 629 return GC_ERROR; 630 631 /* 632 * CP needs to lock sum_page. In this time, we don't need 633 * to lock this page, because this summary page is not gone anywhere. 634 * Also, this page is not gonna be updated before GC is done. 635 */ 636 unlock_page(sum_page); 637 sum = page_address(sum_page); 638 639 switch (GET_SUM_TYPE((&sum->footer))) { 640 case SUM_TYPE_NODE: 641 ret = gc_node_segment(sbi, sum->entries, segno, gc_type); 642 break; 643 case SUM_TYPE_DATA: 644 ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); 645 break; 646 } 647 stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); 648 stat_inc_call_count(sbi->stat_info); 649 650 f2fs_put_page(sum_page, 0); 651 return ret; 652 } 653 654 int f2fs_gc(struct f2fs_sb_info *sbi) 655 { 656 struct list_head ilist; 657 unsigned int segno, i; 658 int gc_type = BG_GC; 659 int gc_status = GC_NONE; 660 661 INIT_LIST_HEAD(&ilist); 662 gc_more: 663 if (!(sbi->sb->s_flags & MS_ACTIVE)) 664 goto stop; 665 666 if (has_not_enough_free_secs(sbi)) 667 gc_type = FG_GC; 668 669 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) 670 goto stop; 671 672 for (i = 0; i < sbi->segs_per_sec; i++) { 673 /* 674 * do_garbage_collect will give us three gc_status: 675 * GC_ERROR, GC_DONE, and GC_BLOCKED. 676 * If GC is finished uncleanly, we have to return 677 * the victim to dirty segment list. 678 */ 679 gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); 680 if (gc_status != GC_DONE) 681 break; 682 } 683 if (has_not_enough_free_secs(sbi)) { 684 write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); 685 if (has_not_enough_free_secs(sbi)) 686 goto gc_more; 687 } 688 stop: 689 mutex_unlock(&sbi->gc_mutex); 690 691 put_gc_inode(&ilist); 692 return gc_status; 693 } 694 695 void build_gc_manager(struct f2fs_sb_info *sbi) 696 { 697 DIRTY_I(sbi)->v_ops = &default_v_ops; 698 } 699 700 int __init create_gc_caches(void) 701 { 702 winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", 703 sizeof(struct inode_entry), NULL); 704 if (!winode_slab) 705 return -ENOMEM; 706 return 0; 707 } 708 709 void destroy_gc_caches(void) 710 { 711 kmem_cache_destroy(winode_slab); 712 } 713