/*
 * fs/f2fs/gc.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 * http://www.samsung.com/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include <trace/events/f2fs.h>

static int gc_thread_func(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
	unsigned int wait_ms;

	wait_ms = gc_th->min_sleep_time;

	set_freezable();
	do {
		wait_event_interruptible_timeout(*wq,
				kthread_should_stop() || freezing(current) ||
				gc_th->gc_wake,
				msecs_to_jiffies(wait_ms));

		/* give it a try one time */
		if (gc_th->gc_wake)
			gc_th->gc_wake = 0;

		if (try_to_freeze())
			continue;
		if (kthread_should_stop())
			break;

		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
			increase_sleep_time(gc_th, &wait_ms);
			continue;
		}

#ifdef CONFIG_F2FS_FAULT_INJECTION
		if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
			f2fs_show_injection_info(FAULT_CHECKPOINT);
			f2fs_stop_checkpoint(sbi, false);
		}
#endif

		if (!sb_start_write_trylock(sbi->sb))
			continue;

		/*
		 * [GC triggering condition]
		 * 0. GC is not conducted currently.
		 * 1. There are enough dirty segments.
		 * 2. IO subsystem is idle by checking the # of writeback pages.
		 * 3. IO subsystem is idle by checking the # of requests in
		 *    bdev's request list.
		 *
		 * Note: we have to avoid triggering GCs too frequently,
		 * because some segments may be invalidated soon afterwards
		 * by user updates or deletions, so we wait a while to let
		 * dirty segments accumulate.
		 */
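		/*
		 * When gc_urgent is set (a tunable configured outside this
		 * file), skip the idle and dirty-segment heuristics below
		 * and run background GC at the shorter urgent_sleep_time
		 * interval.
		 */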
		if (gc_th->gc_urgent) {
			wait_ms = gc_th->urgent_sleep_time;
			mutex_lock(&sbi->gc_mutex);
			goto do_gc;
		}

		if (!mutex_trylock(&sbi->gc_mutex))
			goto next;

		if (!is_idle(sbi)) {
			increase_sleep_time(gc_th, &wait_ms);
			mutex_unlock(&sbi->gc_mutex);
			goto next;
		}

		if (has_enough_invalid_blocks(sbi))
			decrease_sleep_time(gc_th, &wait_ms);
		else
			increase_sleep_time(gc_th, &wait_ms);
do_gc:
		stat_inc_bggc_count(sbi);

		/* if return value is not zero, no victim was selected */
		if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
			wait_ms = gc_th->no_gc_sleep_time;

		trace_f2fs_background_gc(sbi->sb, wait_ms,
				prefree_segments(sbi), free_segments(sbi));

		/* balancing f2fs's metadata periodically */
		f2fs_balance_fs_bg(sbi);
next:
		sb_end_write(sbi->sb);

	} while (!kthread_should_stop());
	return 0;
}

int start_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th;
	dev_t dev = sbi->sb->s_bdev->bd_dev;
	int err = 0;

	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
	if (!gc_th) {
		err = -ENOMEM;
		goto out;
	}

	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
	gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
	gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
	gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;

	gc_th->gc_idle = 0;
	gc_th->gc_urgent = 0;
	gc_th->gc_wake = 0;

	sbi->gc_thread = gc_th;
	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(gc_th->f2fs_gc_task)) {
		err = PTR_ERR(gc_th->f2fs_gc_task);
		kfree(gc_th);
		sbi->gc_thread = NULL;
	}
out:
	return err;
}

void stop_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;

	if (!gc_th)
		return;
	kthread_stop(gc_th->f2fs_gc_task);
	kfree(gc_th);
	sbi->gc_thread = NULL;
}
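
/*
 * Map the background-GC tuning knobs to a victim selection mode:
 * gc_idle == 1 forces cost-benefit (GC_CB), gc_idle == 2 forces greedy
 * (GC_GREEDY), and gc_urgent overrides both with GC_GREEDY.  Otherwise
 * background GC uses cost-benefit and foreground GC uses greedy.
 */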
static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
{
	int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;

	if (!gc_th)
		return gc_mode;

	if (gc_th->gc_idle) {
		if (gc_th->gc_idle == 1)
			gc_mode = GC_CB;
		else if (gc_th->gc_idle == 2)
			gc_mode = GC_GREEDY;
	}
	if (gc_th->gc_urgent)
		gc_mode = GC_GREEDY;
	return gc_mode;
}

static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
			int type, struct victim_sel_policy *p)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);

	if (p->alloc_mode == SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_segmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
		p->ofs_unit = 1;
	} else {
		p->gc_mode = select_gc_type(sbi->gc_thread, gc_type);
		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
		p->max_search = dirty_i->nr_dirty[DIRTY];
		p->ofs_unit = sbi->segs_per_sec;
	}

	/* we need to check every dirty segment in the FG_GC case */
	if (gc_type != FG_GC &&
			(sbi->gc_thread && !sbi->gc_thread->gc_urgent) &&
			p->max_search > sbi->max_victim_search)
		p->max_search = sbi->max_victim_search;

	/* let's select the beginning hot/small space first in no_heap mode */
	if (test_opt(sbi, NOHEAP) &&
		(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
		p->offset = 0;
	else
		p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}

static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p)
{
	/* SSR allocates in a segment unit */
	if (p->alloc_mode == SSR)
		return sbi->blocks_per_seg;
	if (p->gc_mode == GC_GREEDY)
		return 2 * sbi->blocks_per_seg * p->ofs_unit;
	else if (p->gc_mode == GC_CB)
		return UINT_MAX;
	else /* No other gc_mode */
		return 0;
}
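
/*
 * Note: for GC_GREEDY the initial min_cost of 2 * blocks_per_seg * ofs_unit
 * is an unreachable upper bound, since a section can hold at most
 * blocks_per_seg * ofs_unit valid blocks; for GC_CB the computed cost is
 * expressed relative to UINT_MAX, so the bound is UINT_MAX itself.
 */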

static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	unsigned int secno;

	/*
	 * If the gc_type is FG_GC, we can reuse victim sections already
	 * selected by background GC, since those are guaranteed to have
	 * few valid blocks.
	 */
	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
		if (sec_usage_check(sbi, secno))
			continue;

		if (no_fggc_candidate(sbi, secno))
			continue;

		clear_bit(secno, dirty_i->victim_secmap);
		return GET_SEG_FROM_SEC(sbi, secno);
	}
	return NULL_SEGNO;
}

static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
	unsigned long long mtime = 0;
	unsigned int vblocks;
	unsigned char age = 0;
	unsigned char u;
	unsigned int i;

	for (i = 0; i < sbi->segs_per_sec; i++)
		mtime += get_seg_entry(sbi, start + i)->mtime;
	vblocks = get_valid_blocks(sbi, segno, true);

	mtime = div_u64(mtime, sbi->segs_per_sec);
	vblocks = div_u64(vblocks, sbi->segs_per_sec);

	u = (vblocks * 100) >> sbi->log_blocks_per_seg;

	/* Handle the case where the system time has been changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (sit_i->max_mtime != sit_i->min_mtime)
		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
				sit_i->max_mtime - sit_i->min_mtime);

	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
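
/*
 * Illustrative (hypothetical) numbers for the cost-benefit formula above:
 * a section that is 25% valid (u = 25) with a relative age of 80 costs
 * UINT_MAX - (100 * 75 * 80) / 125 = UINT_MAX - 4800, while a 75%-valid
 * section of the same age costs UINT_MAX - (100 * 25 * 80) / 175, about
 * UINT_MAX - 1142.  Lower cost wins, so the emptier, older section is
 * preferred as a victim.
 */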

static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
			unsigned int segno, struct victim_sel_policy *p)
{
	if (p->alloc_mode == SSR)
		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;

	/* alloc_mode == LFS */
	if (p->gc_mode == GC_GREEDY)
		return get_valid_blocks(sbi, segno, true);
	else
		return get_cb_cost(sbi, segno);
}

static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (test_bit(offset++, addr))
			++sum;
	}
	return sum;
}

/*
 * This function is called from two paths.
 * One is garbage collection and the other is SSR segment selection.
 * When it is called during GC, it just gets a victim segment
 * and does not remove it from the dirty seglist.
 * When it is called from SSR segment selection, it finds a segment
 * which has minimum valid blocks and removes it from the dirty seglist.
 */
static int get_victim_by_default(struct f2fs_sb_info *sbi,
		unsigned int *result, int gc_type, int type, char alloc_mode)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct sit_info *sm = SIT_I(sbi);
	struct victim_sel_policy p;
	unsigned int secno, last_victim;
	unsigned int last_segment = MAIN_SEGS(sbi);
	unsigned int nsearched = 0;

	mutex_lock(&dirty_i->seglist_lock);

	p.alloc_mode = alloc_mode;
	select_policy(sbi, gc_type, type, &p);

	p.min_segno = NULL_SEGNO;
	p.min_cost = get_max_cost(sbi, &p);

	if (*result != NULL_SEGNO) {
		if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
			get_valid_blocks(sbi, *result, false) &&
			!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
			p.min_segno = *result;
		goto out;
	}

	if (p.max_search == 0)
		goto out;

	last_victim = sm->last_victim[p.gc_mode];
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)
			goto got_it;
	}

	while (1) {
		unsigned long cost;
		unsigned int segno;

		segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
		if (segno >= last_segment) {
			if (sm->last_victim[p.gc_mode]) {
				last_segment =
					sm->last_victim[p.gc_mode];
				sm->last_victim[p.gc_mode] = 0;
				p.offset = 0;
				continue;
			}
			break;
		}

		p.offset = segno + p.ofs_unit;
		if (p.ofs_unit > 1) {
			p.offset -= segno % p.ofs_unit;
			nsearched += count_bits(p.dirty_segmap,
						p.offset - p.ofs_unit,
						p.ofs_unit);
		} else {
			nsearched++;
		}

		secno = GET_SEC_FROM_SEG(sbi, segno);

		if (sec_usage_check(sbi, secno))
			goto next;
		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
			goto next;
		if (gc_type == FG_GC && p.alloc_mode == LFS &&
					no_fggc_candidate(sbi, secno))
			goto next;

		cost = get_gc_cost(sbi, segno, &p);

		if (p.min_cost > cost) {
			p.min_segno = segno;
			p.min_cost = cost;
		}
next:
		if (nsearched >= p.max_search) {
			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
				sm->last_victim[p.gc_mode] = last_victim + 1;
			else
				sm->last_victim[p.gc_mode] = segno + 1;
			sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
			break;
		}
	}
	if (p.min_segno != NULL_SEGNO) {
got_it:
		if (p.alloc_mode == LFS) {
			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);
		}
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;

		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				sbi->cur_victim_sec,
				prefree_segments(sbi), free_segments(sbi));
	}
out:
	mutex_unlock(&dirty_i->seglist_lock);

	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
}
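
/*
 * The default victim selection policy below is installed as
 * DIRTY_I(sbi)->v_ops by build_gc_manager().
 */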

static const struct victim_selection default_v_ops = {
	.get_victim = get_victim_by_default,
};

static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
	struct inode_entry *ie;

	ie = radix_tree_lookup(&gc_list->iroot, ino);
	if (ie)
		return ie->inode;
	return NULL;
}

static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
	struct inode_entry *new_ie;

	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
		iput(inode);
		return;
	}
	new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
	new_ie->inode = inode;

	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
	list_add_tail(&new_ie->list, &gc_list->ilist);
}

static void put_gc_inode(struct gc_inode_list *gc_list)
{
	struct inode_entry *ie, *next_ie;

	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
		iput(ie->inode);
		list_del(&ie->list);
		kmem_cache_free(inode_entry_slab, ie);
	}
}

static int check_valid_map(struct f2fs_sb_info *sbi,
				unsigned int segno, int offset)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct seg_entry *sentry;
	int ret;

	down_read(&sit_i->sentry_lock);
	sentry = get_seg_entry(sbi, segno);
	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
	up_read(&sit_i->sentry_lock);
	return ret;
}

/*
 * This function compares the node address found in the summary with the one
 * in the NAT.  If it is valid, the node is moved with cold status; otherwise
 * (an invalid node) it is ignored.
 */
static void gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		nid_t nid = le32_to_cpu(entry->nid);
		struct page *node_page;
		struct node_info ni;

		/* stop BG_GC if there are not enough free sections. */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			ra_node_page(sbi, nid);
			continue;
		}

		/* phase == 2 */
		node_page = get_node_page(sbi, nid);
		if (IS_ERR(node_page))
			continue;

		/* block may become invalid during get_node_page */
		if (check_valid_map(sbi, segno, off) == 0) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		get_node_info(sbi, nid, &ni);
		if (ni.blk_addr != start_addr + off) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		move_node_page(node_page, gc_type);
		stat_inc_node_blk_count(sbi, 1, gc_type);
	}

	if (++phase < 3)
		goto next_step;
}

/*
 * Calculate the start block index that the given node offset indicates.
 * Be careful: the caller should pass only node offsets that indicate direct
 * node blocks.  If any node offsets pointing to other types of node blocks,
 * such as indirect or double indirect node blocks, are given, it must be a
 * caller's bug.
 */
block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
	unsigned int bidx;

	if (node_ofs == 0)
		return 0;

	if (node_ofs <= 2) {
		bidx = node_ofs - 1;
	} else if (node_ofs <= indirect_blks) {
		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 2 - dec;
	} else {
		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 5 - dec;
	}
	return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
}
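
/*
 * Example (assuming the usual f2fs node layout): node_ofs 1 and 2 are the two
 * direct node blocks that immediately follow the in-inode addresses, so
 * node_ofs == 1 maps to block index ADDRS_PER_INODE(inode) and node_ofs == 2
 * to ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode).  node_ofs == 3 is an indirect
 * node (never passed here); node_ofs == 4, the first direct node under it,
 * yields bidx = 4 - 2 - 0 = 2, i.e. 2 * ADDRS_PER_BLOCK + ADDRS_PER_INODE.
 */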

static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
	struct page *node_page;
	nid_t nid;
	unsigned int ofs_in_node;
	block_t source_blkaddr;

	nid = le32_to_cpu(sum->nid);
	ofs_in_node = le16_to_cpu(sum->ofs_in_node);

	node_page = get_node_page(sbi, nid);
	if (IS_ERR(node_page))
		return false;

	get_node_info(sbi, nid, dni);

	if (sum->version != dni->version) {
		f2fs_msg(sbi->sb, KERN_WARNING,
				"%s: valid data with mismatched node version.",
				__func__);
		set_sbi_flag(sbi, SBI_NEED_FSCK);
	}

	*nofs = ofs_of_node(node_page);
	source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
	f2fs_put_page(node_page, 1);

	if (source_blkaddr != blkaddr)
		return false;
	return true;
}

/*
 * Move a data block via META_MAPPING while keeping the data page locked.
 * This can be used to move blocks, aka LBAs, directly on disk.
 */
static void move_data_block(struct inode *inode, block_t bidx,
					unsigned int segno, int off)
{
	struct f2fs_io_info fio = {
		.sbi = F2FS_I_SB(inode),
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = false,
	};
	struct dnode_of_data dn;
	struct f2fs_summary sum;
	struct node_info ni;
	struct page *page;
	block_t newaddr;
	int err;

	/* do not read out */
	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
	if (!page)
		return;

	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
		goto out;

	if (f2fs_is_atomic_file(inode))
		goto out;

	if (f2fs_is_pinned_file(inode)) {
		f2fs_pin_file_control(inode, true);
		goto out;
	}

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
	if (err)
		goto out;

	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
		ClearPageUptodate(page);
		goto put_out;
	}

	/*
	 * Don't cache encrypted data into the meta inode until the previous
	 * dirty data has been written back, to avoid racing between GC and
	 * flush.
	 */
	f2fs_wait_on_page_writeback(page, DATA, true);

	get_node_info(fio.sbi, dn.nid, &ni);
	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);

	/* read page */
	fio.page = page;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
					&sum, CURSEG_COLD_DATA, NULL, false);

	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
					newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (!fio.encrypted_page) {
		err = -ENOMEM;
		goto recover_block;
	}

	err = f2fs_submit_page_bio(&fio);
	if (err)
		goto put_page_out;

	/* write page */
	lock_page(fio.encrypted_page);

	if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
		err = -EIO;
		goto put_page_out;
	}
	if (unlikely(!PageUptodate(fio.encrypted_page))) {
		err = -EIO;
		goto put_page_out;
	}

	set_page_dirty(fio.encrypted_page);
	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
	if (clear_page_dirty_for_io(fio.encrypted_page))
		dec_page_count(fio.sbi, F2FS_DIRTY_META);

	set_page_writeback(fio.encrypted_page);

	/* allocate block address */
	f2fs_wait_on_page_writeback(dn.node_page, NODE, true);

	fio.op = REQ_OP_WRITE;
	fio.op_flags = REQ_SYNC;
	fio.new_blkaddr = newaddr;
	err = f2fs_submit_page_write(&fio);
	if (err) {
		if (PageWriteback(fio.encrypted_page))
			end_page_writeback(fio.encrypted_page);
		goto put_page_out;
	}

	f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);

	f2fs_update_data_blkaddr(&dn, newaddr);
	set_inode_flag(inode, FI_APPEND_WRITE);
	if (page->index == 0)
		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
	f2fs_put_page(fio.encrypted_page, 1);
recover_block:
	if (err)
		__f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
								true, true);
put_out:
	f2fs_put_dnode(&dn);
out:
	f2fs_put_page(page, 1);
}
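
/*
 * move_data_page() below handles blocks of unencrypted files: for BG_GC the
 * page is simply redirtied and marked cold so that later writeback moves it;
 * for FG_GC it is written out synchronously via do_write_data_page().
 */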

static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
						unsigned int segno, int off)
{
	struct page *page;

	page = get_lock_data_page(inode, bidx, true);
	if (IS_ERR(page))
		return;

	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
		goto out;

	if (f2fs_is_atomic_file(inode))
		goto out;
	if (f2fs_is_pinned_file(inode)) {
		if (gc_type == FG_GC)
			f2fs_pin_file_control(inode, true);
		goto out;
	}

	if (gc_type == BG_GC) {
		if (PageWriteback(page))
			goto out;
		set_page_dirty(page);
		set_cold_data(page);
	} else {
		struct f2fs_io_info fio = {
			.sbi = F2FS_I_SB(inode),
			.ino = inode->i_ino,
			.type = DATA,
			.temp = COLD,
			.op = REQ_OP_WRITE,
			.op_flags = REQ_SYNC,
			.old_blkaddr = NULL_ADDR,
			.page = page,
			.encrypted_page = NULL,
			.need_lock = LOCK_REQ,
			.io_type = FS_GC_DATA_IO,
		};
		bool is_dirty = PageDirty(page);
		int err;

retry:
		set_page_dirty(page);
		f2fs_wait_on_page_writeback(page, DATA, true);
		if (clear_page_dirty_for_io(page)) {
			inode_dec_dirty_pages(inode);
			remove_dirty_inode(inode);
		}

		set_cold_data(page);

		err = do_write_data_page(&fio);
		if (err == -ENOMEM && is_dirty) {
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}
	}
out:
	f2fs_put_page(page, 1);
}

/*
 * This function tries to get the parent node of a victim data block, and
 * checks the data block's validity.  If the block is valid, it is copied
 * with cold status and the parent node is modified.
 * If the parent node is not valid or the data block address is different,
 * the victim data block is ignored.
 */
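/*
 * The segment is walked in five phases: 0) readahead NAT blocks, 1) readahead
 * the summarized node pages, 2) readahead the owning inodes' node pages,
 * 3) iget() the inodes, readahead their data pages and queue them on gc_list,
 * 4) actually move the data blocks.
 */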
static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
	struct super_block *sb = sbi->sb;
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		struct page *data_page;
		struct inode *inode;
		struct node_info dni; /* dnode info for the data */
		unsigned int ofs_in_node, nofs;
		block_t start_bidx;
		nid_t nid = le32_to_cpu(entry->nid);

		/* stop BG_GC if there are not enough free sections. */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			ra_node_page(sbi, nid);
			continue;
		}

		/* Get an inode by ino with checking validity */
		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
			continue;

		if (phase == 2) {
			ra_node_page(sbi, dni.ino);
			continue;
		}

		ofs_in_node = le16_to_cpu(entry->ofs_in_node);

		if (phase == 3) {
			inode = f2fs_iget(sb, dni.ino);
			if (IS_ERR(inode) || is_bad_inode(inode))
				continue;

			/* if encrypted inode, let's go to phase 4 */
			if (f2fs_encrypted_file(inode)) {
				add_gc_inode(gc_list, inode);
				continue;
			}

			if (!down_write_trylock(
				&F2FS_I(inode)->dio_rwsem[WRITE])) {
				iput(inode);
				continue;
			}

			start_bidx = start_bidx_of_node(nofs, inode);
			data_page = get_read_data_page(inode,
					start_bidx + ofs_in_node, REQ_RAHEAD,
					true);
			up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
			if (IS_ERR(data_page)) {
				iput(inode);
				continue;
			}

			f2fs_put_page(data_page, 0);
			add_gc_inode(gc_list, inode);
			continue;
		}

		/* phase 4 */
		inode = find_gc_inode(gc_list, dni.ino);
		if (inode) {
			struct f2fs_inode_info *fi = F2FS_I(inode);
			bool locked = false;

			if (S_ISREG(inode->i_mode)) {
				if (!down_write_trylock(&fi->dio_rwsem[READ]))
					continue;
				if (!down_write_trylock(
						&fi->dio_rwsem[WRITE])) {
					up_write(&fi->dio_rwsem[READ]);
					continue;
				}
				locked = true;

				/* wait for all inflight aio data */
				inode_dio_wait(inode);
			}

			start_bidx = start_bidx_of_node(nofs, inode)
								+ ofs_in_node;
			if (f2fs_encrypted_file(inode))
				move_data_block(inode, start_bidx, segno, off);
			else
				move_data_page(inode, start_bidx, gc_type,
								segno, off);

			if (locked) {
				up_write(&fi->dio_rwsem[WRITE]);
				up_write(&fi->dio_rwsem[READ]);
			}

			stat_inc_data_blk_count(sbi, 1, gc_type);
		}
	}

	if (++phase < 5)
		goto next_step;
}

static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
			int gc_type)
{
	struct sit_info *sit_i = SIT_I(sbi);
	int ret;

	down_write(&sit_i->sentry_lock);
	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
					      NO_CHECK_TYPE, LFS);
	up_write(&sit_i->sentry_lock);
	return ret;
}
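
/*
 * Garbage-collect every segment in the section that contains @start_segno.
 * The return value is the number of segments whose valid block count dropped
 * to zero, which is only tallied for FG_GC.
 */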
static int do_garbage_collect(struct f2fs_sb_info *sbi,
				unsigned int start_segno,
				struct gc_inode_list *gc_list, int gc_type)
{
	struct page *sum_page;
	struct f2fs_summary_block *sum;
	struct blk_plug plug;
	unsigned int segno = start_segno;
	unsigned int end_segno = start_segno + sbi->segs_per_sec;
	int seg_freed = 0;
	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
						SUM_TYPE_DATA : SUM_TYPE_NODE;

	/* readahead multiple SSA blocks that have contiguous addresses */
	if (sbi->segs_per_sec > 1)
		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
					sbi->segs_per_sec, META_SSA, true);

	/* reference all summary pages */
	while (segno < end_segno) {
		sum_page = get_sum_page(sbi, segno++);
		unlock_page(sum_page);
	}

	blk_start_plug(&plug);

	for (segno = start_segno; segno < end_segno; segno++) {

		/* find segment summary of victim */
		sum_page = find_get_page(META_MAPPING(sbi),
					GET_SUM_BLOCK(sbi, segno));
		f2fs_put_page(sum_page, 0);

		if (get_valid_blocks(sbi, segno, false) == 0 ||
				!PageUptodate(sum_page) ||
				unlikely(f2fs_cp_error(sbi)))
			goto next;

		sum = page_address(sum_page);
		f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));

		/*
		 * this is to avoid deadlock:
		 * - lock_page(sum_page)         - f2fs_replace_block
		 *  - check_valid_map()            - down_write(sentry_lock)
		 *   - down_read(sentry_lock)     - change_curseg()
		 *                                  - lock_page(sum_page)
		 */
		if (type == SUM_TYPE_NODE)
			gc_node_segment(sbi, sum->entries, segno, gc_type);
		else
			gc_data_segment(sbi, sum->entries, gc_list, segno,
								gc_type);

		stat_inc_seg_count(sbi, type, gc_type);

		if (gc_type == FG_GC &&
				get_valid_blocks(sbi, segno, false) == 0)
			seg_freed++;
next:
		f2fs_put_page(sum_page, 0);
	}

	if (gc_type == FG_GC)
		f2fs_submit_merged_write(sbi,
				(type == SUM_TYPE_NODE) ? NODE : DATA);

	blk_finish_plug(&plug);

	stat_inc_call_count(sbi->stat_info);

	return seg_freed;
}
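
/*
 * Main GC entry point.  The caller must hold sbi->gc_mutex; it is released
 * here before returning.  @sync forces foreground GC and makes the return
 * value -EAGAIN when no section could be freed.
 */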
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
			bool background, unsigned int segno)
{
	int gc_type = sync ? FG_GC : BG_GC;
	int sec_freed = 0, seg_freed = 0, total_freed = 0;
	int ret = 0;
	struct cp_control cpc;
	unsigned int init_segno = segno;
	struct gc_inode_list gc_list = {
		.ilist = LIST_HEAD_INIT(gc_list.ilist),
		.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
	};

	trace_f2fs_gc_begin(sbi->sb, sync, background,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	cpc.reason = __get_cp_reason(sbi);
gc_more:
	if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
		ret = -EINVAL;
		goto stop;
	}
	if (unlikely(f2fs_cp_error(sbi))) {
		ret = -EIO;
		goto stop;
	}

	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
		/*
		 * For example, if there are many prefree_segments below the
		 * given threshold, we can make them free by writing a
		 * checkpoint.  Then, we secure free segments which don't
		 * need FG_GC any more.
		 */
		if (prefree_segments(sbi)) {
			ret = write_checkpoint(sbi, &cpc);
			if (ret)
				goto stop;
		}
		if (has_not_enough_free_secs(sbi, 0, 0))
			gc_type = FG_GC;
	}

	/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
	if (gc_type == BG_GC && !background) {
		ret = -EINVAL;
		goto stop;
	}
	if (!__get_victim(sbi, &segno, gc_type)) {
		ret = -ENODATA;
		goto stop;
	}

	seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
	if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
		sec_freed++;
	total_freed += seg_freed;

	if (gc_type == FG_GC)
		sbi->cur_victim_sec = NULL_SEGNO;

	if (!sync) {
		if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
			segno = NULL_SEGNO;
			goto gc_more;
		}

		if (gc_type == FG_GC)
			ret = write_checkpoint(sbi, &cpc);
	}
stop:
	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
	SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;

	trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	mutex_unlock(&sbi->gc_mutex);

	put_gc_inode(&gc_list);

	if (sync)
		ret = sec_freed ? 0 : -EAGAIN;
	return ret;
}

void build_gc_manager(struct f2fs_sb_info *sbi)
{
	u64 main_count, resv_count, ovp_count;

	DIRTY_I(sbi)->v_ops = &default_v_ops;

	/* threshold of # of valid blocks in a section for victims of FG_GC */
	main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg;
	resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg;
	ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;

	sbi->fggc_threshold = div64_u64((main_count - ovp_count) *
				BLKS_PER_SEC(sbi), (main_count - resv_count));
	sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;

	/* give warm/cold data area from slower device */
	if (sbi->s_ndevs && sbi->segs_per_sec == 1)
		SIT_I(sbi)->last_victim[ALLOC_NEXT] =
				GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}