// SPDX-License-Identifier: GPL-2.0
/*
 * fs/f2fs/gc.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 * http://www.samsung.com/
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include <trace/events/f2fs.h>

static int gc_thread_func(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
	unsigned int wait_ms;

	wait_ms = gc_th->min_sleep_time;

	set_freezable();
	do {
		bool sync_mode;

		wait_event_interruptible_timeout(*wq,
				kthread_should_stop() || freezing(current) ||
				gc_th->gc_wake,
				msecs_to_jiffies(wait_ms));

		/* give it a try one time */
		if (gc_th->gc_wake)
			gc_th->gc_wake = 0;

		if (try_to_freeze()) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}
		if (kthread_should_stop())
			break;

		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
			increase_sleep_time(gc_th, &wait_ms);
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
			f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
			f2fs_stop_checkpoint(sbi, false);
		}

		if (!sb_start_write_trylock(sbi->sb)) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		/*
		 * [GC triggering condition]
		 * 0. GC is not conducted currently.
		 * 1. There are enough dirty segments.
		 * 2. IO subsystem is idle by checking the # of writeback pages.
		 * 3. IO subsystem is idle by checking the # of requests in
		 *    bdev's request list.
		 *
		 * Note) We have to avoid triggering GCs too frequently,
		 * because some segments can be invalidated soon after by user
		 * updates or deletions. So we wait some time to collect more
		 * dirty segments.
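		 *
		 * The sleep interval below adapts roughly between
		 * gc_th->min_sleep_time and gc_th->max_sleep_time: it is
		 * shortened once enough invalid blocks have accumulated and
		 * lengthened while the device is busy or there is little
		 * garbage worth collecting.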
		 */
		if (sbi->gc_mode == GC_URGENT) {
			wait_ms = gc_th->urgent_sleep_time;
			down_write(&sbi->gc_lock);
			goto do_gc;
		}

		if (!down_write_trylock(&sbi->gc_lock)) {
			stat_other_skip_bggc_count(sbi);
			goto next;
		}

		if (!is_idle(sbi, GC_TIME)) {
			increase_sleep_time(gc_th, &wait_ms);
			up_write(&sbi->gc_lock);
			stat_io_skip_bggc_count(sbi);
			goto next;
		}

		if (has_enough_invalid_blocks(sbi))
			decrease_sleep_time(gc_th, &wait_ms);
		else
			increase_sleep_time(gc_th, &wait_ms);
do_gc:
		stat_inc_bggc_count(sbi->stat_info);

		sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;

		/* if return value is not zero, no victim was selected */
		if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
			wait_ms = gc_th->no_gc_sleep_time;

		trace_f2fs_background_gc(sbi->sb, wait_ms,
				prefree_segments(sbi), free_segments(sbi));

		/* balancing f2fs's metadata periodically */
		f2fs_balance_fs_bg(sbi, true);
next:
		sb_end_write(sbi->sb);

	} while (!kthread_should_stop());
	return 0;
}

int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th;
	dev_t dev = sbi->sb->s_bdev->bd_dev;
	int err = 0;

	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
	if (!gc_th) {
		err = -ENOMEM;
		goto out;
	}

	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
	gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
	gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
	gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;

	gc_th->gc_wake = 0;

	sbi->gc_thread = gc_th;
	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(gc_th->f2fs_gc_task)) {
		err = PTR_ERR(gc_th->f2fs_gc_task);
		kvfree(gc_th);
		sbi->gc_thread = NULL;
	}
out:
	return err;
}

void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;

	if (!gc_th)
		return;
	kthread_stop(gc_th->f2fs_gc_task);
	kvfree(gc_th);
	sbi->gc_thread = NULL;
}

static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
{
	int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;

	switch (sbi->gc_mode) {
	case GC_IDLE_CB:
		gc_mode = GC_CB;
		break;
	case GC_IDLE_GREEDY:
	case GC_URGENT:
		gc_mode = GC_GREEDY;
		break;
	}
	return gc_mode;
}

static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
			int type, struct victim_sel_policy *p)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);

	if (p->alloc_mode == SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_segmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
		p->ofs_unit = 1;
	} else {
		p->gc_mode = select_gc_type(sbi, gc_type);
		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
		p->max_search = dirty_i->nr_dirty[DIRTY];
		p->ofs_unit = sbi->segs_per_sec;
	}

	/*
	 * Adjust the candidate range: all dirty segments should be selected
	 * for the foreground GC and urgent GC cases.
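	 * Background GC, by contrast, is capped at sbi->max_victim_search
	 * candidates per pass below, which keeps victim selection cheap.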
	 */
	if (gc_type != FG_GC &&
			(sbi->gc_mode != GC_URGENT) &&
			p->max_search > sbi->max_victim_search)
		p->max_search = sbi->max_victim_search;

	/* let's select beginning hot/small space first in no_heap mode */
	if (test_opt(sbi, NOHEAP) &&
		(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
		p->offset = 0;
	else
		p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}

static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p)
{
	/* SSR allocates in a segment unit */
	if (p->alloc_mode == SSR)
		return sbi->blocks_per_seg;
	if (p->gc_mode == GC_GREEDY)
		return 2 * sbi->blocks_per_seg * p->ofs_unit;
	else if (p->gc_mode == GC_CB)
		return UINT_MAX;
	else /* No other gc_mode */
		return 0;
}

static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	unsigned int secno;

	/*
	 * If the gc_type is FG_GC, we can select the victim segments
	 * that were selected by background GC before.
	 * Those segments are guaranteed to have few valid blocks.
	 */
	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
		if (sec_usage_check(sbi, secno))
			continue;
		clear_bit(secno, dirty_i->victim_secmap);
		return GET_SEG_FROM_SEC(sbi, secno);
	}
	return NULL_SEGNO;
}

static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
	unsigned long long mtime = 0;
	unsigned int vblocks;
	unsigned char age = 0;
	unsigned char u;
	unsigned int i;

	for (i = 0; i < sbi->segs_per_sec; i++)
		mtime += get_seg_entry(sbi, start + i)->mtime;
	vblocks = get_valid_blocks(sbi, segno, true);

	mtime = div_u64(mtime, sbi->segs_per_sec);
	vblocks = div_u64(vblocks, sbi->segs_per_sec);

	u = (vblocks * 100) >> sbi->log_blocks_per_seg;

	/* Handle the case where the system time was changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (sit_i->max_mtime != sit_i->min_mtime)
		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
				sit_i->max_mtime - sit_i->min_mtime);

	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}

static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
			unsigned int segno, struct victim_sel_policy *p)
{
	if (p->alloc_mode == SSR)
		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;

	/* alloc_mode == LFS */
	if (p->gc_mode == GC_GREEDY)
		return get_valid_blocks(sbi, segno, true);
	else
		return get_cb_cost(sbi, segno);
}

static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (test_bit(offset++, addr))
			++sum;
	}
	return sum;
}

/*
 * This function is called from two paths.
 * One is garbage collection and the other is SSR segment selection.
 * When it is called during GC, it just gets a victim segment
 * and does not remove it from the dirty seglist.
 * When it is called from SSR segment selection, it finds a segment
 * which has minimum valid blocks and removes it from the dirty seglist.
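 *
 * The victim is the candidate with the lowest cost reported by get_gc_cost():
 * for greedy selection that is simply the number of valid blocks, while the
 * cost-benefit policy also weighs the section's age. The function returns 1
 * and stores the chosen segment number in *result when a victim is found,
 * and 0 otherwise.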
 */
static int get_victim_by_default(struct f2fs_sb_info *sbi,
		unsigned int *result, int gc_type, int type, char alloc_mode)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct sit_info *sm = SIT_I(sbi);
	struct victim_sel_policy p;
	unsigned int secno, last_victim;
	unsigned int last_segment;
	unsigned int nsearched = 0;

	mutex_lock(&dirty_i->seglist_lock);
	last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;

	p.alloc_mode = alloc_mode;
	select_policy(sbi, gc_type, type, &p);

	p.min_segno = NULL_SEGNO;
	p.min_cost = get_max_cost(sbi, &p);

	if (*result != NULL_SEGNO) {
		if (get_valid_blocks(sbi, *result, false) &&
			!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
			p.min_segno = *result;
		goto out;
	}

	if (p.max_search == 0)
		goto out;

	if (__is_large_section(sbi) && p.alloc_mode == LFS) {
		if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[BG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
			goto got_result;
		}
		if (gc_type == FG_GC &&
				sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[FG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
			goto got_result;
		}
	}

	last_victim = sm->last_victim[p.gc_mode];
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)
			goto got_it;
	}

	while (1) {
		unsigned long cost;
		unsigned int segno;

		segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
		if (segno >= last_segment) {
			if (sm->last_victim[p.gc_mode]) {
				last_segment =
					sm->last_victim[p.gc_mode];
				sm->last_victim[p.gc_mode] = 0;
				p.offset = 0;
				continue;
			}
			break;
		}

		p.offset = segno + p.ofs_unit;
		if (p.ofs_unit > 1) {
			p.offset -= segno % p.ofs_unit;
			nsearched += count_bits(p.dirty_segmap,
						p.offset - p.ofs_unit,
						p.ofs_unit);
		} else {
			nsearched++;
		}

#ifdef CONFIG_F2FS_CHECK_FS
		/*
		 * Skip selecting an invalid segno (one that failed the block
		 * validity check during GC) to avoid an endless GC loop in
		 * such cases.
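		 * (Such segments are flagged in the SIT's invalid_segmap by
		 * is_alive() when a summary entry's block address no longer
		 * matches the address recorded in the node page.)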
		 */
		if (test_bit(segno, sm->invalid_segmap))
			goto next;
#endif

		secno = GET_SEC_FROM_SEG(sbi, segno);

		if (sec_usage_check(sbi, secno))
			goto next;
		/* Don't touch checkpointed data */
		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
					get_ckpt_valid_blocks(sbi, segno) &&
					p.alloc_mode != SSR))
			goto next;
		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
			goto next;

		cost = get_gc_cost(sbi, segno, &p);

		if (p.min_cost > cost) {
			p.min_segno = segno;
			p.min_cost = cost;
		}
next:
		if (nsearched >= p.max_search) {
			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
				sm->last_victim[p.gc_mode] = last_victim + 1;
			else
				sm->last_victim[p.gc_mode] = segno + 1;
			sm->last_victim[p.gc_mode] %=
				(MAIN_SECS(sbi) * sbi->segs_per_sec);
			break;
		}
	}
	if (p.min_segno != NULL_SEGNO) {
got_it:
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
got_result:
		if (p.alloc_mode == LFS) {
			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);
		}

	}
out:
	if (p.min_segno != NULL_SEGNO)
		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				sbi->cur_victim_sec,
				prefree_segments(sbi), free_segments(sbi));
	mutex_unlock(&dirty_i->seglist_lock);

	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
}

static const struct victim_selection default_v_ops = {
	.get_victim = get_victim_by_default,
};

static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
	struct inode_entry *ie;

	ie = radix_tree_lookup(&gc_list->iroot, ino);
	if (ie)
		return ie->inode;
	return NULL;
}

static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
	struct inode_entry *new_ie;

	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
		iput(inode);
		return;
	}
	new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
	new_ie->inode = inode;

	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
	list_add_tail(&new_ie->list, &gc_list->ilist);
}

static void put_gc_inode(struct gc_inode_list *gc_list)
{
	struct inode_entry *ie, *next_ie;

	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
		iput(ie->inode);
		list_del(&ie->list);
		kmem_cache_free(f2fs_inode_entry_slab, ie);
	}
}

static int check_valid_map(struct f2fs_sb_info *sbi,
				unsigned int segno, int offset)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct seg_entry *sentry;
	int ret;

	down_read(&sit_i->sentry_lock);
	sentry = get_seg_entry(sbi, segno);
	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
	up_read(&sit_i->sentry_lock);
	return ret;
}

/*
 * This function compares the node address stored in the summary with the one
 * in the NAT. If the node is valid, it is moved with cold status; otherwise
 * (an invalid node) it is ignored.
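 *
 * The segment is walked in three phases: phase 0 reads ahead the NAT blocks
 * of the recorded nids, phase 1 reads ahead the node pages themselves, and
 * phase 2 moves the node pages that are still valid.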
 */
static int gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;
	bool fggc = (gc_type == FG_GC);
	int submitted = 0;

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	if (fggc && phase == 2)
		atomic_inc(&sbi->wb_sync_req[NODE]);

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		nid_t nid = le32_to_cpu(entry->nid);
		struct page *node_page;
		struct node_info ni;
		int err;

		/* stop BG_GC if there are not enough free sections */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return submitted;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			f2fs_ra_node_page(sbi, nid);
			continue;
		}

		/* phase == 2 */
		node_page = f2fs_get_node_page(sbi, nid);
		if (IS_ERR(node_page))
			continue;

		/* block may become invalid during f2fs_get_node_page */
		if (check_valid_map(sbi, segno, off) == 0) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		if (f2fs_get_node_info(sbi, nid, &ni)) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		if (ni.blk_addr != start_addr + off) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		err = f2fs_move_node_page(node_page, gc_type);
		if (!err && gc_type == FG_GC)
			submitted++;
		stat_inc_node_blk_count(sbi, 1, gc_type);
	}

	if (++phase < 3)
		goto next_step;

	if (fggc)
		atomic_dec(&sbi->wb_sync_req[NODE]);
	return submitted;
}

/*
 * Calculate the start block index that the given node offset indicates.
 * Be careful: the caller should pass only node offsets that indicate direct
 * node blocks. Passing an offset that points to any other type of node block,
 * such as an indirect or double indirect node block, is a caller bug.
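 *
 * For example, node_ofs == 1 (the first direct node) gives bidx == 0, so the
 * function returns ADDRS_PER_INODE(inode): that node's data range starts
 * right after the block addresses stored in the inode itself.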
 */
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
	unsigned int bidx;

	if (node_ofs == 0)
		return 0;

	if (node_ofs <= 2) {
		bidx = node_ofs - 1;
	} else if (node_ofs <= indirect_blks) {
		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 2 - dec;
	} else {
		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 5 - dec;
	}
	return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
}

static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
	struct page *node_page;
	nid_t nid;
	unsigned int ofs_in_node;
	block_t source_blkaddr;

	nid = le32_to_cpu(sum->nid);
	ofs_in_node = le16_to_cpu(sum->ofs_in_node);

	node_page = f2fs_get_node_page(sbi, nid);
	if (IS_ERR(node_page))
		return false;

	if (f2fs_get_node_info(sbi, nid, dni)) {
		f2fs_put_page(node_page, 1);
		return false;
	}

	if (sum->version != dni->version) {
		f2fs_warn(sbi, "%s: valid data with mismatched node version.",
			  __func__);
		set_sbi_flag(sbi, SBI_NEED_FSCK);
	}

	*nofs = ofs_of_node(node_page);
	source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
	f2fs_put_page(node_page, 1);

	if (source_blkaddr != blkaddr) {
#ifdef CONFIG_F2FS_CHECK_FS
		unsigned int segno = GET_SEGNO(sbi, blkaddr);
		unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);

		if (unlikely(check_valid_map(sbi, segno, offset))) {
			if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
				f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n",
						blkaddr, source_blkaddr, segno);
				f2fs_bug_on(sbi, 1);
			}
		}
#endif
		return false;
	}
	return true;
}

static int ra_data_block(struct inode *inode, pgoff_t index)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	struct address_space *mapping = inode->i_mapping;
	struct dnode_of_data dn;
	struct page *page;
	struct extent_info ei = {0, 0, 0};
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = false,
		.retry = false,
	};
	int err;

	page = f2fs_grab_cache_page(mapping, index, true);
	if (!page)
		return -ENOMEM;

	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
		dn.data_blkaddr = ei.blk + index - ei.fofs;
		if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
						DATA_GENERIC_ENHANCE_READ))) {
			err = -EFSCORRUPTED;
			goto put_page;
		}
		goto got_it;
	}

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
	if (err)
		goto put_page;
	f2fs_put_dnode(&dn);

	if (!__is_valid_data_blkaddr(dn.data_blkaddr)) {
		err = -ENOENT;
		goto put_page;
	}
	if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
						DATA_GENERIC_ENHANCE))) {
		err = -EFSCORRUPTED;
		goto put_page;
	}
got_it:
	/* read page */
	fio.page = page;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	/*
	 * Don't cache encrypted data in the meta inode until the previous
	 * dirty data has been written back, to avoid racing between GC and
	 * flush.
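	 * (The two writeback waits right below enforce this, first on the
	 * data page and then on the old block address.)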
	 */
	f2fs_wait_on_page_writeback(page, DATA, true, true);

	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);

	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
					dn.data_blkaddr,
					FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (!fio.encrypted_page) {
		err = -ENOMEM;
		goto put_page;
	}

	err = f2fs_submit_page_bio(&fio);
	if (err)
		goto put_encrypted_page;
	f2fs_put_page(fio.encrypted_page, 0);
	f2fs_put_page(page, 1);

	f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
	f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);

	return 0;
put_encrypted_page:
	f2fs_put_page(fio.encrypted_page, 1);
put_page:
	f2fs_put_page(page, 1);
	return err;
}

/*
 * Move a data block via META_MAPPING while keeping the data page locked.
 * This can be used to move blocks, aka LBAs, directly on disk.
 */
static int move_data_block(struct inode *inode, block_t bidx,
				int gc_type, unsigned int segno, int off)
{
	struct f2fs_io_info fio = {
		.sbi = F2FS_I_SB(inode),
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = false,
		.retry = false,
	};
	struct dnode_of_data dn;
	struct f2fs_summary sum;
	struct node_info ni;
	struct page *page, *mpage;
	block_t newaddr;
	int err = 0;
	bool lfs_mode = f2fs_lfs_mode(fio.sbi);

	/* do not read out */
	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
	if (!page)
		return -ENOMEM;

	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
		err = -ENOENT;
		goto out;
	}

	if (f2fs_is_atomic_file(inode)) {
		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
		err = -EAGAIN;
		goto out;
	}

	if (f2fs_is_pinned_file(inode)) {
		f2fs_pin_file_control(inode, true);
		err = -EAGAIN;
		goto out;
	}

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
	if (err)
		goto out;

	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
		ClearPageUptodate(page);
		err = -ENOENT;
		goto put_out;
	}

	/*
	 * Don't cache encrypted data in the meta inode until the previous
	 * dirty data has been written back, to avoid racing between GC and
	 * flush.
	 */
	f2fs_wait_on_page_writeback(page, DATA, true, true);

	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);

	err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
	if (err)
		goto put_out;

	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);

	/* read page */
	fio.page = page;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	if (lfs_mode)
		down_write(&fio.sbi->io_order_lock);

	mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
					fio.old_blkaddr, false);
	if (!mpage)
		goto up_out;

	fio.encrypted_page = mpage;

	/* read source block in mpage */
	if (!PageUptodate(mpage)) {
		err = f2fs_submit_page_bio(&fio);
		if (err) {
			f2fs_put_page(mpage, 1);
			goto up_out;
		}

		f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
		f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);

		lock_page(mpage);
		if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
						!PageUptodate(mpage))) {
			err = -EIO;
			f2fs_put_page(mpage, 1);
			goto up_out;
		}
	}

	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
					&sum, CURSEG_COLD_DATA, NULL, false);

	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (!fio.encrypted_page) {
		err = -ENOMEM;
		f2fs_put_page(mpage, 1);
		goto recover_block;
	}

	/* write target block */
	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
	memcpy(page_address(fio.encrypted_page),
				page_address(mpage), PAGE_SIZE);
	f2fs_put_page(mpage, 1);
	invalidate_mapping_pages(META_MAPPING(fio.sbi),
				fio.old_blkaddr, fio.old_blkaddr);

	set_page_dirty(fio.encrypted_page);
	if (clear_page_dirty_for_io(fio.encrypted_page))
		dec_page_count(fio.sbi, F2FS_DIRTY_META);

	set_page_writeback(fio.encrypted_page);
	ClearPageError(page);

	/* allocate block address */
	f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);

	fio.op = REQ_OP_WRITE;
	fio.op_flags = REQ_SYNC;
	fio.new_blkaddr = newaddr;
	f2fs_submit_page_write(&fio);
	if (fio.retry) {
		err = -EAGAIN;
		if (PageWriteback(fio.encrypted_page))
			end_page_writeback(fio.encrypted_page);
		goto put_page_out;
	}

	f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);

	f2fs_update_data_blkaddr(&dn, newaddr);
	set_inode_flag(inode, FI_APPEND_WRITE);
	if (page->index == 0)
		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
	f2fs_put_page(fio.encrypted_page, 1);
recover_block:
	if (err)
		f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
								true, true);
up_out:
	if (lfs_mode)
		up_write(&fio.sbi->io_order_lock);
put_out:
	f2fs_put_dnode(&dn);
out:
	f2fs_put_page(page, 1);
	return err;
}

static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
						unsigned int segno, int off)
{
	struct page *page;
	int err = 0;

	page = f2fs_get_lock_data_page(inode, bidx, true);
	if (IS_ERR(page))
		return PTR_ERR(page);

	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
		err = -ENOENT;
		goto out;
	}

	if (f2fs_is_atomic_file(inode)) {
		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
		err = -EAGAIN;
		goto out;
	}
	if (f2fs_is_pinned_file(inode)) {
		if (gc_type == FG_GC)
			f2fs_pin_file_control(inode, true);
		err = -EAGAIN;
		goto out;
	}

	if (gc_type == BG_GC) {
		if (PageWriteback(page)) {
			err = -EAGAIN;
			goto out;
		}
		set_page_dirty(page);
		set_cold_data(page);
	} else {
		struct f2fs_io_info fio = {
			.sbi = F2FS_I_SB(inode),
			.ino = inode->i_ino,
			.type = DATA,
			.temp = COLD,
			.op = REQ_OP_WRITE,
			.op_flags = REQ_SYNC,
			.old_blkaddr = NULL_ADDR,
			.page = page,
			.encrypted_page = NULL,
			.need_lock = LOCK_REQ,
			.io_type = FS_GC_DATA_IO,
		};
		bool is_dirty = PageDirty(page);

retry:
		f2fs_wait_on_page_writeback(page, DATA, true, true);

		set_page_dirty(page);
		if (clear_page_dirty_for_io(page)) {
			inode_dec_dirty_pages(inode);
			f2fs_remove_dirty_inode(inode);
		}

		set_cold_data(page);

		err = f2fs_do_write_data_page(&fio);
		if (err) {
			clear_cold_data(page);
			if (err == -ENOMEM) {
				congestion_wait(BLK_RW_ASYNC,
						DEFAULT_IO_TIMEOUT);
				goto retry;
			}
			if (is_dirty)
				set_page_dirty(page);
		}
	}
out:
	f2fs_put_page(page, 1);
	return err;
}

/*
 * This function tries to get the parent node of a victim data block and
 * checks the block's validity. If the block is valid, it is moved with cold
 * status and the parent node is updated.
 * If the parent node is not valid or the data block address differs, the
 * victim data block is ignored.
 */
static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
	struct super_block *sb = sbi->sb;
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;
	int submitted = 0;

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		struct page *data_page;
		struct inode *inode;
		struct node_info dni; /* dnode info for the data */
		unsigned int ofs_in_node, nofs;
		block_t start_bidx;
		nid_t nid = le32_to_cpu(entry->nid);

		/*
		 * Stop BG_GC if there are not enough free sections.
		 * Also stop GC if the segment became fully valid due to a
		 * race with SSR block allocation.
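		 * (A fully valid section yields no free space, so continuing
		 * the scan would only add I/O without reclaiming anything.)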
		 */
		if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
				get_valid_blocks(sbi, segno, true) ==
							BLKS_PER_SEC(sbi))
			return submitted;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			f2fs_ra_node_page(sbi, nid);
			continue;
		}

		/* Get an inode by ino with checking validity */
		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
			continue;

		if (phase == 2) {
			f2fs_ra_node_page(sbi, dni.ino);
			continue;
		}

		ofs_in_node = le16_to_cpu(entry->ofs_in_node);

		if (phase == 3) {
			inode = f2fs_iget(sb, dni.ino);
			if (IS_ERR(inode) || is_bad_inode(inode)) {
				set_sbi_flag(sbi, SBI_NEED_FSCK);
				continue;
			}

			if (!down_write_trylock(
				&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
				iput(inode);
				sbi->skipped_gc_rwsem++;
				continue;
			}

			start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
								ofs_in_node;

			if (f2fs_post_read_required(inode)) {
				int err = ra_data_block(inode, start_bidx);

				up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
				if (err) {
					iput(inode);
					continue;
				}
				add_gc_inode(gc_list, inode);
				continue;
			}

			data_page = f2fs_get_read_data_page(inode,
						start_bidx, REQ_RAHEAD, true);
			up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
			if (IS_ERR(data_page)) {
				iput(inode);
				continue;
			}

			f2fs_put_page(data_page, 0);
			add_gc_inode(gc_list, inode);
			continue;
		}

		/* phase 4 */
		inode = find_gc_inode(gc_list, dni.ino);
		if (inode) {
			struct f2fs_inode_info *fi = F2FS_I(inode);
			bool locked = false;
			int err;

			if (S_ISREG(inode->i_mode)) {
				if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
					continue;
				if (!down_write_trylock(
						&fi->i_gc_rwsem[WRITE])) {
					sbi->skipped_gc_rwsem++;
					up_write(&fi->i_gc_rwsem[READ]);
					continue;
				}
				locked = true;

				/* wait for all inflight aio data */
				inode_dio_wait(inode);
			}

			start_bidx = f2fs_start_bidx_of_node(nofs, inode)
								+ ofs_in_node;
			if (f2fs_post_read_required(inode))
				err = move_data_block(inode, start_bidx,
							gc_type, segno, off);
			else
				err = move_data_page(inode, start_bidx, gc_type,
								segno, off);

			if (!err && (gc_type == FG_GC ||
					f2fs_post_read_required(inode)))
				submitted++;

			if (locked) {
				up_write(&fi->i_gc_rwsem[WRITE]);
				up_write(&fi->i_gc_rwsem[READ]);
			}

			stat_inc_data_blk_count(sbi, 1, gc_type);
		}
	}

	if (++phase < 5)
		goto next_step;

	return submitted;
}

static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
			int gc_type)
{
	struct sit_info *sit_i = SIT_I(sbi);
	int ret;

	down_write(&sit_i->sentry_lock);
	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
					      NO_CHECK_TYPE, LFS);
	up_write(&sit_i->sentry_lock);
	return ret;
}

static int do_garbage_collect(struct f2fs_sb_info *sbi,
				unsigned int start_segno,
				struct gc_inode_list *gc_list, int gc_type)
{
	struct page *sum_page;
	struct f2fs_summary_block *sum;
	struct blk_plug plug;
	unsigned int segno = start_segno;
	unsigned int end_segno = start_segno + sbi->segs_per_sec;
	int seg_freed = 0, migrated = 0;
	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
						SUM_TYPE_DATA : SUM_TYPE_NODE;
	int submitted = 0;

	if (__is_large_section(sbi))
		end_segno = rounddown(end_segno, sbi->segs_per_sec);

	/* readahead multiple SSA blocks that have contiguous addresses */
	if (__is_large_section(sbi))
		f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
					end_segno - segno, META_SSA, true);

	/* reference all summary pages */
	while (segno < end_segno) {
		sum_page = f2fs_get_sum_page(sbi, segno++);
		if (IS_ERR(sum_page)) {
			int err = PTR_ERR(sum_page);

			end_segno = segno - 1;
			for (segno = start_segno; segno < end_segno; segno++) {
				sum_page = find_get_page(META_MAPPING(sbi),
						GET_SUM_BLOCK(sbi, segno));
				f2fs_put_page(sum_page, 0);
				f2fs_put_page(sum_page, 0);
			}
			return err;
		}
		unlock_page(sum_page);
	}

	blk_start_plug(&plug);

	for (segno = start_segno; segno < end_segno; segno++) {

		/* find segment summary of victim */
		sum_page = find_get_page(META_MAPPING(sbi),
					GET_SUM_BLOCK(sbi, segno));
		f2fs_put_page(sum_page, 0);

		if (get_valid_blocks(sbi, segno, false) == 0)
			goto freed;
		if (gc_type == BG_GC && __is_large_section(sbi) &&
				migrated >= sbi->migration_granularity)
			goto skip;
		if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
			goto skip;

		sum = page_address(sum_page);
		if (type != GET_SUM_TYPE((&sum->footer))) {
			f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
				 segno, type, GET_SUM_TYPE((&sum->footer)));
			set_sbi_flag(sbi, SBI_NEED_FSCK);
			f2fs_stop_checkpoint(sbi, false);
			goto skip;
		}

		/*
		 * this is to avoid deadlock:
		 * - lock_page(sum_page)         - f2fs_replace_block
		 *  - check_valid_map()            - down_write(sentry_lock)
		 *   - down_read(sentry_lock)     - change_curseg()
		 *                                  - lock_page(sum_page)
		 */
		if (type == SUM_TYPE_NODE)
			submitted += gc_node_segment(sbi, sum->entries, segno,
								gc_type);
		else
			submitted += gc_data_segment(sbi, sum->entries, gc_list,
							segno, gc_type);

		stat_inc_seg_count(sbi, type, gc_type);
		migrated++;

freed:
		if (gc_type == FG_GC &&
				get_valid_blocks(sbi, segno, false) == 0)
			seg_freed++;

		if (__is_large_section(sbi) && segno + 1 < end_segno)
			sbi->next_victim_seg[gc_type] = segno + 1;
skip:
		f2fs_put_page(sum_page, 0);
	}

	if (submitted)
		f2fs_submit_merged_write(sbi,
				(type == SUM_TYPE_NODE) ? NODE : DATA);

	blk_finish_plug(&plug);

	stat_inc_call_count(sbi->stat_info);

	return seg_freed;
}

int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
			bool background, unsigned int segno)
{
	int gc_type = sync ? FG_GC : BG_GC;
	int sec_freed = 0, seg_freed = 0, total_freed = 0;
	int ret = 0;
	struct cp_control cpc;
	unsigned int init_segno = segno;
	struct gc_inode_list gc_list = {
		.ilist = LIST_HEAD_INIT(gc_list.ilist),
		.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
	};
	unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
	unsigned long long first_skipped;
	unsigned int skipped_round = 0, round = 0;

	trace_f2fs_gc_begin(sbi->sb, sync, background,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	cpc.reason = __get_cp_reason(sbi);
	sbi->skipped_gc_rwsem = 0;
	first_skipped = last_skipped;
gc_more:
	if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
		ret = -EINVAL;
		goto stop;
	}
	if (unlikely(f2fs_cp_error(sbi))) {
		ret = -EIO;
		goto stop;
	}

	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
		/*
		 * For example, if there are many prefree_segments below the
		 * given threshold, we can free them by writing a checkpoint.
		 * Then we secure free segments, which doesn't need FG_GC
		 * any more.
		 */
		if (prefree_segments(sbi) &&
				!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
			ret = f2fs_write_checkpoint(sbi, &cpc);
			if (ret)
				goto stop;
		}
		if (has_not_enough_free_secs(sbi, 0, 0))
			gc_type = FG_GC;
	}

	/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
	if (gc_type == BG_GC && !background) {
		ret = -EINVAL;
		goto stop;
	}
	if (!__get_victim(sbi, &segno, gc_type)) {
		ret = -ENODATA;
		goto stop;
	}

	seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
	if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
		sec_freed++;
	total_freed += seg_freed;

	if (gc_type == FG_GC) {
		if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
						sbi->skipped_gc_rwsem)
			skipped_round++;
		last_skipped = sbi->skipped_atomic_files[FG_GC];
		round++;
	}

	if (gc_type == FG_GC && seg_freed)
		sbi->cur_victim_sec = NULL_SEGNO;

	if (sync)
		goto stop;

	if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
		if (skipped_round <= MAX_SKIP_GC_COUNT ||
					skipped_round * 2 < round) {
			segno = NULL_SEGNO;
			goto gc_more;
		}

		if (first_skipped < last_skipped &&
				(last_skipped - first_skipped) >
					sbi->skipped_gc_rwsem) {
			f2fs_drop_inmem_pages_all(sbi, true);
			segno = NULL_SEGNO;
			goto gc_more;
		}
		if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
			ret = f2fs_write_checkpoint(sbi, &cpc);
	}
stop:
	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
	SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;

	trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	up_write(&sbi->gc_lock);

	put_gc_inode(&gc_list);

	if (sync && !ret)
		ret = sec_freed ? 0 : -EAGAIN;
	return ret;
}

void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
{
	DIRTY_I(sbi)->v_ops = &default_v_ops;

	sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;

	/* give warm/cold data area from slower device */
	if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi))
		SIT_I(sbi)->last_victim[ALLOC_NEXT] =
				GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}

static int free_segment_range(struct f2fs_sb_info *sbi,
				unsigned int secs, bool gc_only)
{
	unsigned int segno, next_inuse, start, end;
	struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
	int gc_mode, gc_type;
	int err = 0;
	int type;

	/* Force block allocation for GC */
	MAIN_SECS(sbi) -= secs;
	start = MAIN_SECS(sbi) * sbi->segs_per_sec;
	end = MAIN_SEGS(sbi) - 1;

	mutex_lock(&DIRTY_I(sbi)->seglist_lock);
	for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
		if (SIT_I(sbi)->last_victim[gc_mode] >= start)
			SIT_I(sbi)->last_victim[gc_mode] = 0;

	for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
		if (sbi->next_victim_seg[gc_type] >= start)
			sbi->next_victim_seg[gc_type] = NULL_SEGNO;
	mutex_unlock(&DIRTY_I(sbi)->seglist_lock);

	/* Move out cursegs from the target range */
	for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++)
		allocate_segment_for_resize(sbi, type, start, end);

	/* do GC to move out valid blocks in the range */
	for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
		struct gc_inode_list gc_list = {
			.ilist = LIST_HEAD_INIT(gc_list.ilist),
			.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
		};

		do_garbage_collect(sbi, segno, &gc_list, FG_GC);
		put_gc_inode(&gc_list);

		if (!gc_only && get_valid_blocks(sbi, segno, true)) {
			err = -EAGAIN;
			goto out;
		}
		if (fatal_signal_pending(current)) {
			err = -ERESTARTSYS;
			goto out;
		}
	}
	if (gc_only)
		goto out;

	err = f2fs_write_checkpoint(sbi, &cpc);
	if (err)
		goto out;

	next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start);
	if (next_inuse <= end) {
		f2fs_err(sbi, "segno %u should be free but still inuse!",
			 next_inuse);
		f2fs_bug_on(sbi, 1);
	}
out:
	MAIN_SECS(sbi) += secs;
	return err;
}

static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
{
	struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
	int section_count;
	int segment_count;
	int segment_count_main;
	long long block_count;
	int segs = secs * sbi->segs_per_sec;

	down_write(&sbi->sb_lock);

	section_count = le32_to_cpu(raw_sb->section_count);
	segment_count = le32_to_cpu(raw_sb->segment_count);
	segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
	block_count = le64_to_cpu(raw_sb->block_count);

	raw_sb->section_count = cpu_to_le32(section_count + secs);
	raw_sb->segment_count = cpu_to_le32(segment_count + segs);
	raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
	raw_sb->block_count = cpu_to_le64(block_count +
					(long long)segs * sbi->blocks_per_seg);
	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;
		int dev_segs =
			le32_to_cpu(raw_sb->devs[last_dev].total_segments);

		raw_sb->devs[last_dev].total_segments =
						cpu_to_le32(dev_segs + segs);
	}

	up_write(&sbi->sb_lock);
}

static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
{
	int segs = secs * sbi->segs_per_sec;
	long long blks = (long long)segs * sbi->blocks_per_seg;
	long long user_block_count =
			le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);

	SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
	MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
	MAIN_SECS(sbi) += secs;
	FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
	FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
	F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);

	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;

		FDEV(last_dev).total_segments =
				(int)FDEV(last_dev).total_segments + segs;
		FDEV(last_dev).end_blk =
				(long long)FDEV(last_dev).end_blk + blks;
#ifdef CONFIG_BLK_DEV_ZONED
		FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz +
					(int)(blks >> sbi->log_blocks_per_blkz);
#endif
	}
}

int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
{
	__u64 old_block_count, shrunk_blocks;
	struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
	unsigned int secs;
	int err = 0;
	__u32 rem;

	old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count);
	if (block_count > old_block_count)
		return -EINVAL;

	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;
		__u64 last_segs = FDEV(last_dev).total_segments;

		if (block_count + last_segs * sbi->blocks_per_seg <=
								old_block_count)
			return -EINVAL;
	}

	/* new fs size should align to section size */
	div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
	if (rem)
		return -EINVAL;

	if (block_count == old_block_count)
		return 0;

	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
		f2fs_err(sbi, "Should run fsck to repair first.");
		return -EFSCORRUPTED;
	}

	if (test_opt(sbi, DISABLE_CHECKPOINT)) {
		f2fs_err(sbi, "Checkpoint should be enabled.");
		return -EINVAL;
	}

	shrunk_blocks = old_block_count - block_count;
	secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));

	/* stop other GC */
	if (!down_write_trylock(&sbi->gc_lock))
		return -EAGAIN;

	/* stop CP to protect MAIN_SEC in free_segment_range */
	f2fs_lock_op(sbi);
	err = free_segment_range(sbi, secs, true);
	f2fs_unlock_op(sbi);
	up_write(&sbi->gc_lock);
	if (err)
		return err;

	set_sbi_flag(sbi, SBI_IS_RESIZEFS);

	freeze_super(sbi->sb);
	down_write(&sbi->gc_lock);
	mutex_lock(&sbi->cp_mutex);

	spin_lock(&sbi->stat_lock);
	if (shrunk_blocks + valid_user_blocks(sbi) +
		sbi->current_reserved_blocks + sbi->unusable_block_count +
		F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
		err = -ENOSPC;
	else
		sbi->user_block_count -= shrunk_blocks;
	spin_unlock(&sbi->stat_lock);
	if (err)
		goto out_err;

	err = free_segment_range(sbi, secs, false);
	if (err)
		goto recover_out;

	update_sb_metadata(sbi, -secs);

	err = f2fs_commit_super(sbi, false);
	if (err) {
		update_sb_metadata(sbi, secs);
		goto recover_out;
	}

	update_fs_metadata(sbi, -secs);
	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
	set_sbi_flag(sbi, SBI_IS_DIRTY);

	err = f2fs_write_checkpoint(sbi, &cpc);
	if (err) {
		update_fs_metadata(sbi, secs);
		update_sb_metadata(sbi, secs);
		f2fs_commit_super(sbi, false);
	}
recover_out:
	if (err) {
		set_sbi_flag(sbi, SBI_NEED_FSCK);
		f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");

		spin_lock(&sbi->stat_lock);
		sbi->user_block_count += shrunk_blocks;
		spin_unlock(&sbi->stat_lock);
	}
out_err:
	mutex_unlock(&sbi->cp_mutex);
	up_write(&sbi->gc_lock);
	thaw_super(sbi->sb);
	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
	return err;
}