xref: /openbmc/linux/fs/f2fs/gc.c (revision e1e38ea1)
1 /*
2  * fs/f2fs/gc.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/module.h>
13 #include <linux/backing-dev.h>
14 #include <linux/init.h>
15 #include <linux/f2fs_fs.h>
16 #include <linux/kthread.h>
17 #include <linux/delay.h>
18 #include <linux/freezer.h>
19 
20 #include "f2fs.h"
21 #include "node.h"
22 #include "segment.h"
23 #include "gc.h"
24 #include <trace/events/f2fs.h>
25 
26 static int gc_thread_func(void *data)
27 {
28 	struct f2fs_sb_info *sbi = data;
29 	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
30 	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
31 	unsigned int wait_ms;
32 
33 	wait_ms = gc_th->min_sleep_time;
34 
35 	set_freezable();
36 	do {
37 		wait_event_interruptible_timeout(*wq,
38 				kthread_should_stop() || freezing(current) ||
39 				gc_th->gc_wake,
40 				msecs_to_jiffies(wait_ms));
41 
42 		/* an explicit wake request gets a single immediate GC try */
43 		if (gc_th->gc_wake)
44 			gc_th->gc_wake = 0;
45 
46 		if (try_to_freeze())
47 			continue;
48 		if (kthread_should_stop())
49 			break;
50 
51 		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
52 			increase_sleep_time(gc_th, &wait_ms);
53 			continue;
54 		}
55 
56 		if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
57 			f2fs_show_injection_info(FAULT_CHECKPOINT);
58 			f2fs_stop_checkpoint(sbi, false);
59 		}
60 
61 		if (!sb_start_write_trylock(sbi->sb))
62 			continue;
63 
64 		/*
65 		 * [GC triggering condition]
66 		 * 0. GC is not conducted currently.
67 		 * 1. There are enough dirty segments.
68 		 * 2. IO subsystem is idle by checking the # of writeback pages.
69 		 * 3. IO subsystem is idle by checking the # of requests in
70 		 *    bdev's request list.
71 		 *
72 		 * Note) We have to avoid triggering GC too frequently: some
73 		 * segments may be invalidated again soon afterwards by user
74 		 * updates or deletions, so we wait a while to let dirty
75 		 * segments accumulate (see the sleep-time sketch below).
76 		 */
77 		if (sbi->gc_mode == GC_URGENT) {
78 			wait_ms = gc_th->urgent_sleep_time;
79 			mutex_lock(&sbi->gc_mutex);
80 			goto do_gc;
81 		}
82 
83 		if (!mutex_trylock(&sbi->gc_mutex))
84 			goto next;
85 
86 		if (!is_idle(sbi)) {
87 			increase_sleep_time(gc_th, &wait_ms);
88 			mutex_unlock(&sbi->gc_mutex);
89 			goto next;
90 		}
91 
92 		if (has_enough_invalid_blocks(sbi))
93 			decrease_sleep_time(gc_th, &wait_ms);
94 		else
95 			increase_sleep_time(gc_th, &wait_ms);
96 do_gc:
97 		stat_inc_bggc_count(sbi);
98 
99 		/* if return value is not zero, no victim was selected */
100 		if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
101 			wait_ms = gc_th->no_gc_sleep_time;
102 
103 		trace_f2fs_background_gc(sbi->sb, wait_ms,
104 				prefree_segments(sbi), free_segments(sbi));
105 
106 		/* balancing f2fs's metadata periodically */
107 		f2fs_balance_fs_bg(sbi);
108 next:
109 		sb_end_write(sbi->sb);
110 
111 	} while (!kthread_should_stop());
112 	return 0;
113 }
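
/*
 * Illustrative sketch (not part of this file): a plain C model of the
 * adaptive sleep used by gc_thread_func() above. The real helpers,
 * increase_sleep_time()/decrease_sleep_time(), and the default times live
 * in gc.h; the step size and the millisecond values below are assumptions
 * made only for this example.
 */
#include <stdio.h>

#define EX_MIN_SLEEP_MS		30000	/* assumed default min sleep */
#define EX_MAX_SLEEP_MS		60000	/* assumed default max sleep */
#define EX_NOGC_SLEEP_MS	300000	/* assumed sleep when no victim exists */

/* back off: sleep longer while I/O is busy or little garbage has built up */
static unsigned int ex_increase_sleep(unsigned int wait_ms)
{
	wait_ms += EX_MIN_SLEEP_MS;
	return wait_ms > EX_MAX_SLEEP_MS ? EX_MAX_SLEEP_MS : wait_ms;
}

/* speed up: sleep less while many invalid blocks are waiting for GC */
static unsigned int ex_decrease_sleep(unsigned int wait_ms)
{
	return wait_ms > 2 * EX_MIN_SLEEP_MS ? wait_ms - EX_MIN_SLEEP_MS
					     : EX_MIN_SLEEP_MS;
}

int main(void)
{
	unsigned int wait_ms = EX_MIN_SLEEP_MS;

	wait_ms = ex_increase_sleep(wait_ms);	/* I/O busy     -> 60000 ms */
	wait_ms = ex_decrease_sleep(wait_ms);	/* much garbage -> 30000 ms */
	printf("next wakeup in %u ms (a no-victim round would use %u ms)\n",
	       wait_ms, EX_NOGC_SLEEP_MS);
	return 0;
}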
114 
115 int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
116 {
117 	struct f2fs_gc_kthread *gc_th;
118 	dev_t dev = sbi->sb->s_bdev->bd_dev;
119 	int err = 0;
120 
121 	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
122 	if (!gc_th) {
123 		err = -ENOMEM;
124 		goto out;
125 	}
126 
127 	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
128 	gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
129 	gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
130 	gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
131 
132 	gc_th->gc_wake = 0;
133 
134 	sbi->gc_thread = gc_th;
135 	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
136 	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
137 			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
138 	if (IS_ERR(gc_th->f2fs_gc_task)) {
139 		err = PTR_ERR(gc_th->f2fs_gc_task);
140 		kfree(gc_th);
141 		sbi->gc_thread = NULL;
142 	}
143 out:
144 	return err;
145 }
146 
147 void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
148 {
149 	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
150 	if (!gc_th)
151 		return;
152 	kthread_stop(gc_th->f2fs_gc_task);
153 	kfree(gc_th);
154 	sbi->gc_thread = NULL;
155 }
156 
157 static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
158 {
159 	int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
160 
161 	switch (sbi->gc_mode) {
162 	case GC_IDLE_CB:
163 		gc_mode = GC_CB;
164 		break;
165 	case GC_IDLE_GREEDY:
166 	case GC_URGENT:
167 		gc_mode = GC_GREEDY;
168 		break;
169 	}
170 	return gc_mode;
171 }
172 
173 static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
174 			int type, struct victim_sel_policy *p)
175 {
176 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
177 
178 	if (p->alloc_mode == SSR) {
179 		p->gc_mode = GC_GREEDY;
180 		p->dirty_segmap = dirty_i->dirty_segmap[type];
181 		p->max_search = dirty_i->nr_dirty[type];
182 		p->ofs_unit = 1;
183 	} else {
184 		p->gc_mode = select_gc_type(sbi, gc_type);
185 		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
186 		p->max_search = dirty_i->nr_dirty[DIRTY];
187 		p->ofs_unit = sbi->segs_per_sec;
188 	}
189 
190 	/* we need to check every dirty segment in the FG_GC case */
191 	if (gc_type != FG_GC &&
192 			(sbi->gc_mode != GC_URGENT) &&
193 			p->max_search > sbi->max_victim_search)
194 		p->max_search = sbi->max_victim_search;
195 
196 	/* let's select the beginning hot/small space first in no_heap mode */
197 	if (test_opt(sbi, NOHEAP) &&
198 		(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
199 		p->offset = 0;
200 	else
201 		p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
202 }
203 
204 static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
205 				struct victim_sel_policy *p)
206 {
207 	/* SSR allocates in a segment unit */
208 	if (p->alloc_mode == SSR)
209 		return sbi->blocks_per_seg;
210 	if (p->gc_mode == GC_GREEDY)
211 		return 2 * sbi->blocks_per_seg * p->ofs_unit;
212 	else if (p->gc_mode == GC_CB)
213 		return UINT_MAX;
214 	else /* No other gc_mode */
215 		return 0;
216 }
217 
218 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
219 {
220 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
221 	unsigned int secno;
222 
223 	/*
224 	 * If the gc_type is FG_GC, we can reuse victim sections that were
225 	 * already selected by background GC, since those sections are
226 	 * guaranteed to contain only a few valid blocks.
227 	 */
228 	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
229 		if (sec_usage_check(sbi, secno))
230 			continue;
231 		clear_bit(secno, dirty_i->victim_secmap);
232 		return GET_SEG_FROM_SEC(sbi, secno);
233 	}
234 	return NULL_SEGNO;
235 }
236 
237 static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
238 {
239 	struct sit_info *sit_i = SIT_I(sbi);
240 	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
241 	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
242 	unsigned long long mtime = 0;
243 	unsigned int vblocks;
244 	unsigned char age = 0;
245 	unsigned char u;
246 	unsigned int i;
247 
248 	for (i = 0; i < sbi->segs_per_sec; i++)
249 		mtime += get_seg_entry(sbi, start + i)->mtime;
250 	vblocks = get_valid_blocks(sbi, segno, true);
251 
252 	mtime = div_u64(mtime, sbi->segs_per_sec);
253 	vblocks = div_u64(vblocks, sbi->segs_per_sec);
254 
255 	u = (vblocks * 100) >> sbi->log_blocks_per_seg;
256 
257 	/* Handle the case where the system time has been changed by the user */
258 	if (mtime < sit_i->min_mtime)
259 		sit_i->min_mtime = mtime;
260 	if (mtime > sit_i->max_mtime)
261 		sit_i->max_mtime = mtime;
262 	if (sit_i->max_mtime != sit_i->min_mtime)
263 		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
264 				sit_i->max_mtime - sit_i->min_mtime);
265 
266 	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
267 }
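
/*
 * Illustrative sketch (not part of this file): the cost-benefit score from
 * get_cb_cost() above as plain C arithmetic. log_blocks_per_seg is assumed
 * to be 9 (512-block segments, one segment per section) and the mtimes are
 * made up; only the formula itself is taken from the function above.
 */
#include <limits.h>
#include <stdio.h>

static unsigned int ex_cb_cost(unsigned int vblocks,	/* valid blocks per segment */
			       unsigned long long mtime,
			       unsigned long long min_mtime,
			       unsigned long long max_mtime)
{
	unsigned int log_blocks_per_seg = 9;	/* assumed: 512 blocks/segment */
	unsigned char u = (vblocks * 100) >> log_blocks_per_seg; /* utilization % */
	unsigned char age = 100;

	if (max_mtime != min_mtime)
		age = 100 - (unsigned char)(100 * (mtime - min_mtime) /
					    (max_mtime - min_mtime));

	/* more reclaimable space and a higher age => a smaller (better) cost */
	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}

int main(void)
{
	/* a cold, mostly empty segment beats a hot, nearly full one */
	printf("cold+empty: UINT_MAX - %u\n",
	       UINT_MAX - ex_cb_cost(64, 1000, 1000, 2000));  /* u = 12, age = 100 */
	printf("hot+full:   UINT_MAX - %u\n",
	       UINT_MAX - ex_cb_cost(480, 1990, 1000, 2000)); /* u = 93, age = 1 */
	return 0;
}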
268 
269 static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
270 			unsigned int segno, struct victim_sel_policy *p)
271 {
272 	if (p->alloc_mode == SSR)
273 		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
274 
275 	/* alloc_mode == LFS */
276 	if (p->gc_mode == GC_GREEDY)
277 		return get_valid_blocks(sbi, segno, true);
278 	else
279 		return get_cb_cost(sbi, segno);
280 }
281 
282 static unsigned int count_bits(const unsigned long *addr,
283 				unsigned int offset, unsigned int len)
284 {
285 	unsigned int end = offset + len, sum = 0;
286 
287 	while (offset < end) {
288 		if (test_bit(offset++, addr))
289 			++sum;
290 	}
291 	return sum;
292 }
293 
294 /*
295  * This function is called from two paths:
296  * garbage collection and SSR segment selection.
297  * When it is called during GC, it just picks a victim segment
298  * and does not remove it from the dirty seglist.
299  * When it is called for SSR segment selection, it finds the segment
300  * with the fewest valid blocks and removes it from the dirty seglist.
301  */
302 static int get_victim_by_default(struct f2fs_sb_info *sbi,
303 		unsigned int *result, int gc_type, int type, char alloc_mode)
304 {
305 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
306 	struct sit_info *sm = SIT_I(sbi);
307 	struct victim_sel_policy p;
308 	unsigned int secno, last_victim;
309 	unsigned int last_segment = MAIN_SEGS(sbi);
310 	unsigned int nsearched = 0;
311 
312 	mutex_lock(&dirty_i->seglist_lock);
313 
314 	p.alloc_mode = alloc_mode;
315 	select_policy(sbi, gc_type, type, &p);
316 
317 	p.min_segno = NULL_SEGNO;
318 	p.min_cost = get_max_cost(sbi, &p);
319 
320 	if (*result != NULL_SEGNO) {
321 		if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
322 			get_valid_blocks(sbi, *result, false) &&
323 			!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
324 			p.min_segno = *result;
325 		goto out;
326 	}
327 
328 	if (p.max_search == 0)
329 		goto out;
330 
331 	last_victim = sm->last_victim[p.gc_mode];
332 	if (p.alloc_mode == LFS && gc_type == FG_GC) {
333 		p.min_segno = check_bg_victims(sbi);
334 		if (p.min_segno != NULL_SEGNO)
335 			goto got_it;
336 	}
337 
338 	while (1) {
339 		unsigned long cost;
340 		unsigned int segno;
341 
342 		segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
343 		if (segno >= last_segment) {
344 			if (sm->last_victim[p.gc_mode]) {
345 				last_segment =
346 					sm->last_victim[p.gc_mode];
347 				sm->last_victim[p.gc_mode] = 0;
348 				p.offset = 0;
349 				continue;
350 			}
351 			break;
352 		}
353 
354 		p.offset = segno + p.ofs_unit;
355 		if (p.ofs_unit > 1) {
356 			p.offset -= segno % p.ofs_unit;
357 			nsearched += count_bits(p.dirty_segmap,
358 						p.offset - p.ofs_unit,
359 						p.ofs_unit);
360 		} else {
361 			nsearched++;
362 		}
363 
364 		secno = GET_SEC_FROM_SEG(sbi, segno);
365 
366 		if (sec_usage_check(sbi, secno))
367 			goto next;
368 		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
369 			goto next;
370 
371 		cost = get_gc_cost(sbi, segno, &p);
372 
373 		if (p.min_cost > cost) {
374 			p.min_segno = segno;
375 			p.min_cost = cost;
376 		}
377 next:
378 		if (nsearched >= p.max_search) {
379 			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
380 				sm->last_victim[p.gc_mode] = last_victim + 1;
381 			else
382 				sm->last_victim[p.gc_mode] = segno + 1;
383 			sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
384 			break;
385 		}
386 	}
387 	if (p.min_segno != NULL_SEGNO) {
388 got_it:
389 		if (p.alloc_mode == LFS) {
390 			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
391 			if (gc_type == FG_GC)
392 				sbi->cur_victim_sec = secno;
393 			else
394 				set_bit(secno, dirty_i->victim_secmap);
395 		}
396 		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
397 
398 		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
399 				sbi->cur_victim_sec,
400 				prefree_segments(sbi), free_segments(sbi));
401 	}
402 out:
403 	mutex_unlock(&dirty_i->seglist_lock);
404 
405 	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
406 }
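
/*
 * Illustrative sketch (not part of this file): a userspace model of the
 * cursor-based circular scan in get_victim_by_default() above. The bitmap,
 * the cost table and all sizes are made up; what it demonstrates is the
 * wrap-around through a saved "last victim" cursor and the max_search
 * cut-off that bounds the work done per call.
 */
#include <stdbool.h>
#include <stdio.h>

#define EX_NR_SEGS	16
#define EX_MAX_SEARCH	4

static unsigned int ex_last_victim;	/* scan resumes here on the next call */

static int ex_pick_victim(const bool *dirty, const unsigned int *cost)
{
	unsigned int last = EX_NR_SEGS, offset = ex_last_victim;
	unsigned int nsearched = 0, best = (unsigned int)-1;
	int victim = -1;

	while (1) {
		unsigned int segno = offset;

		while (segno < last && !dirty[segno])	/* find_next_bit() */
			segno++;
		if (segno >= last) {
			if (ex_last_victim) {		/* wrap once to cover */
				last = ex_last_victim;	/* the skipped prefix */
				ex_last_victim = 0;
				offset = 0;
				continue;
			}
			break;
		}
		offset = segno + 1;
		nsearched++;

		if (cost[segno] < best) {
			best = cost[segno];
			victim = (int)segno;
		}
		if (nsearched >= EX_MAX_SEARCH) {
			ex_last_victim = (segno + 1) % EX_NR_SEGS;
			break;
		}
	}
	return victim;
}

int main(void)
{
	bool dirty[EX_NR_SEGS] = { [1] = true, [3] = true, [7] = true,
				   [9] = true, [12] = true };
	unsigned int cost[EX_NR_SEGS] = { [1] = 40, [3] = 10, [7] = 25,
					  [9] = 5, [12] = 30 };

	/* first call scans 1,3,7,9 -> picks 9, cursor saved at 10 */
	printf("victim %d\n", ex_pick_victim(dirty, cost));
	/* second call resumes at 12, wraps to the start -> picks 3 */
	printf("victim %d\n", ex_pick_victim(dirty, cost));
	return 0;
}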
407 
408 static const struct victim_selection default_v_ops = {
409 	.get_victim = get_victim_by_default,
410 };
411 
412 static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
413 {
414 	struct inode_entry *ie;
415 
416 	ie = radix_tree_lookup(&gc_list->iroot, ino);
417 	if (ie)
418 		return ie->inode;
419 	return NULL;
420 }
421 
422 static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
423 {
424 	struct inode_entry *new_ie;
425 
426 	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
427 		iput(inode);
428 		return;
429 	}
430 	new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
431 	new_ie->inode = inode;
432 
433 	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
434 	list_add_tail(&new_ie->list, &gc_list->ilist);
435 }
436 
437 static void put_gc_inode(struct gc_inode_list *gc_list)
438 {
439 	struct inode_entry *ie, *next_ie;
440 	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
441 		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
442 		iput(ie->inode);
443 		list_del(&ie->list);
444 		kmem_cache_free(f2fs_inode_entry_slab, ie);
445 	}
446 }
447 
448 static int check_valid_map(struct f2fs_sb_info *sbi,
449 				unsigned int segno, int offset)
450 {
451 	struct sit_info *sit_i = SIT_I(sbi);
452 	struct seg_entry *sentry;
453 	int ret;
454 
455 	down_read(&sit_i->sentry_lock);
456 	sentry = get_seg_entry(sbi, segno);
457 	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
458 	up_read(&sit_i->sentry_lock);
459 	return ret;
460 }
461 
462 /*
463  * This function compares the node address recorded in the summary with the
464  * one in the NAT. If they match, the node is valid and is migrated with
465  * cold status; otherwise the invalid node is ignored.
466  */
467 static void gc_node_segment(struct f2fs_sb_info *sbi,
468 		struct f2fs_summary *sum, unsigned int segno, int gc_type)
469 {
470 	struct f2fs_summary *entry;
471 	block_t start_addr;
472 	int off;
473 	int phase = 0;
474 	bool fggc = (gc_type == FG_GC);
475 
476 	start_addr = START_BLOCK(sbi, segno);
477 
478 next_step:
479 	entry = sum;
480 
481 	if (fggc && phase == 2)
482 		atomic_inc(&sbi->wb_sync_req[NODE]);
483 
484 	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
485 		nid_t nid = le32_to_cpu(entry->nid);
486 		struct page *node_page;
487 		struct node_info ni;
488 
489 		/* stop BG_GC if there are not enough free sections. */
490 		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
491 			return;
492 
493 		if (check_valid_map(sbi, segno, off) == 0)
494 			continue;
495 
496 		if (phase == 0) {
497 			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
498 							META_NAT, true);
499 			continue;
500 		}
501 
502 		if (phase == 1) {
503 			f2fs_ra_node_page(sbi, nid);
504 			continue;
505 		}
506 
507 		/* phase == 2 */
508 		node_page = f2fs_get_node_page(sbi, nid);
509 		if (IS_ERR(node_page))
510 			continue;
511 
512 		/* block may become invalid during f2fs_get_node_page */
513 		if (check_valid_map(sbi, segno, off) == 0) {
514 			f2fs_put_page(node_page, 1);
515 			continue;
516 		}
517 
518 		if (f2fs_get_node_info(sbi, nid, &ni)) {
519 			f2fs_put_page(node_page, 1);
520 			continue;
521 		}
522 
523 		if (ni.blk_addr != start_addr + off) {
524 			f2fs_put_page(node_page, 1);
525 			continue;
526 		}
527 
528 		f2fs_move_node_page(node_page, gc_type);
529 		stat_inc_node_blk_count(sbi, 1, gc_type);
530 	}
531 
532 	if (++phase < 3)
533 		goto next_step;
534 
535 	if (fggc)
536 		atomic_dec(&sbi->wb_sync_req[NODE]);
537 }
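
/*
 * Illustrative sketch (not part of this file): why gc_node_segment() above
 * (and gc_data_segment() further below) walk the same summary entries in
 * several phases. The stubs and arrays are made up; the shape to notice is
 * "issue all readahead first, then do the synchronous work against a warm
 * cache" rather than alternating the two for every entry.
 */
#include <stdio.h>

#define EX_ENTRIES	4

static void ex_prefetch(int nid)	/* stands in for f2fs_ra_node_page() */
{
	printf("readahead nid %d\n", nid);
}

static void ex_migrate(int nid)		/* stands in for f2fs_move_node_page() */
{
	printf("migrate   nid %d\n", nid);
}

static void ex_gc_segment(const int *nids, const int *valid)
{
	int phase, off;

	for (phase = 0; phase < 2; phase++)
		for (off = 0; off < EX_ENTRIES; off++) {
			if (!valid[off])	/* stands in for check_valid_map() */
				continue;
			if (phase == 0)
				ex_prefetch(nids[off]);
			else
				ex_migrate(nids[off]);
		}
}

int main(void)
{
	int nids[EX_ENTRIES]  = { 11, 12, 13, 14 };
	int valid[EX_ENTRIES] = { 1, 0, 1, 1 };

	ex_gc_segment(nids, valid);	/* all readahead, then all migrations */
	return 0;
}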
538 
539 /*
540  * Calculate the start block index that the given node offset refers to.
541  * Be careful: the caller must pass node offsets of direct node blocks
542  * only. Passing an offset that points to any other type of node block,
543  * such as an indirect or double indirect node block, is a caller's
544  * bug.
545  */
546 block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
547 {
548 	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
549 	unsigned int bidx;
550 
551 	if (node_ofs == 0)
552 		return 0;
553 
554 	if (node_ofs <= 2) {
555 		bidx = node_ofs - 1;
556 	} else if (node_ofs <= indirect_blks) {
557 		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
558 		bidx = node_ofs - 2 - dec;
559 	} else {
560 		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
561 		bidx = node_ofs - 5 - dec;
562 	}
563 	return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
564 }
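
/*
 * Illustrative sketch (not part of this file): the same arithmetic as
 * f2fs_start_bidx_of_node() above with the on-disk constants filled in, so
 * the node-offset to block-index mapping can be checked by hand. The values
 * NIDS_PER_BLOCK = ADDRS_PER_BLOCK = 1018 and ADDRS_PER_INODE = 923 are
 * assumed typical defaults (inline xattr/data adjustments are not modelled).
 */
#include <stdio.h>

#define EX_NIDS_PER_BLOCK	1018
#define EX_ADDRS_PER_BLOCK	1018
#define EX_ADDRS_PER_INODE	923

static unsigned int ex_start_bidx_of_node(unsigned int node_ofs)
{
	unsigned int indirect_blks = 2 * EX_NIDS_PER_BLOCK + 4;
	unsigned int bidx;

	if (node_ofs == 0)			/* the inode itself */
		return 0;
	if (node_ofs <= 2)			/* the two direct node blocks */
		bidx = node_ofs - 1;
	else if (node_ofs <= indirect_blks)	/* under an indirect block */
		bidx = node_ofs - 2 - (node_ofs - 4) / (EX_NIDS_PER_BLOCK + 1);
	else					/* under the double indirect block */
		bidx = node_ofs - 5 -
			(node_ofs - indirect_blks - 3) / (EX_NIDS_PER_BLOCK + 1);

	return bidx * EX_ADDRS_PER_BLOCK + EX_ADDRS_PER_INODE;
}

int main(void)
{
	printf("%u\n", ex_start_bidx_of_node(1)); /* 923:  first direct node */
	printf("%u\n", ex_start_bidx_of_node(2)); /* 1941: second direct node */
	printf("%u\n", ex_start_bidx_of_node(4)); /* 2959: first node under indirect1 */
	return 0;
}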
565 
566 static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
567 		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
568 {
569 	struct page *node_page;
570 	nid_t nid;
571 	unsigned int ofs_in_node;
572 	block_t source_blkaddr;
573 
574 	nid = le32_to_cpu(sum->nid);
575 	ofs_in_node = le16_to_cpu(sum->ofs_in_node);
576 
577 	node_page = f2fs_get_node_page(sbi, nid);
578 	if (IS_ERR(node_page))
579 		return false;
580 
581 	if (f2fs_get_node_info(sbi, nid, dni)) {
582 		f2fs_put_page(node_page, 1);
583 		return false;
584 	}
585 
586 	if (sum->version != dni->version) {
587 		f2fs_msg(sbi->sb, KERN_WARNING,
588 				"%s: valid data with mismatched node version.",
589 				__func__);
590 		set_sbi_flag(sbi, SBI_NEED_FSCK);
591 	}
592 
593 	*nofs = ofs_of_node(node_page);
594 	source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
595 	f2fs_put_page(node_page, 1);
596 
597 	if (source_blkaddr != blkaddr)
598 		return false;
599 	return true;
600 }
601 
602 static int ra_data_block(struct inode *inode, pgoff_t index)
603 {
604 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
605 	struct address_space *mapping = inode->i_mapping;
606 	struct dnode_of_data dn;
607 	struct page *page;
608 	struct extent_info ei = {0, 0, 0};
609 	struct f2fs_io_info fio = {
610 		.sbi = sbi,
611 		.ino = inode->i_ino,
612 		.type = DATA,
613 		.temp = COLD,
614 		.op = REQ_OP_READ,
615 		.op_flags = 0,
616 		.encrypted_page = NULL,
617 		.in_list = false,
618 		.retry = false,
619 	};
620 	int err;
621 
622 	page = f2fs_grab_cache_page(mapping, index, true);
623 	if (!page)
624 		return -ENOMEM;
625 
626 	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
627 		dn.data_blkaddr = ei.blk + index - ei.fofs;
628 		goto got_it;
629 	}
630 
631 	set_new_dnode(&dn, inode, NULL, NULL, 0);
632 	err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
633 	if (err)
634 		goto put_page;
635 	f2fs_put_dnode(&dn);
636 
637 	if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
638 						DATA_GENERIC))) {
639 		err = -EFAULT;
640 		goto put_page;
641 	}
642 got_it:
643 	/* read page */
644 	fio.page = page;
645 	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
646 
647 	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
648 					dn.data_blkaddr,
649 					FGP_LOCK | FGP_CREAT, GFP_NOFS);
650 	if (!fio.encrypted_page) {
651 		err = -ENOMEM;
652 		goto put_page;
653 	}
654 
655 	err = f2fs_submit_page_bio(&fio);
656 	if (err)
657 		goto put_encrypted_page;
658 	f2fs_put_page(fio.encrypted_page, 0);
659 	f2fs_put_page(page, 1);
660 	return 0;
661 put_encrypted_page:
662 	f2fs_put_page(fio.encrypted_page, 1);
663 put_page:
664 	f2fs_put_page(page, 1);
665 	return err;
666 }
667 
668 /*
669  * Move data block via META_MAPPING while keeping locked data page.
670  * This can be used to move blocks, aka LBAs, directly on disk.
671  */
672 static void move_data_block(struct inode *inode, block_t bidx,
673 				int gc_type, unsigned int segno, int off)
674 {
675 	struct f2fs_io_info fio = {
676 		.sbi = F2FS_I_SB(inode),
677 		.ino = inode->i_ino,
678 		.type = DATA,
679 		.temp = COLD,
680 		.op = REQ_OP_READ,
681 		.op_flags = 0,
682 		.encrypted_page = NULL,
683 		.in_list = false,
684 		.retry = false,
685 	};
686 	struct dnode_of_data dn;
687 	struct f2fs_summary sum;
688 	struct node_info ni;
689 	struct page *page, *mpage;
690 	block_t newaddr;
691 	int err;
692 	bool lfs_mode = test_opt(fio.sbi, LFS);
693 
694 	/* do not read out */
695 	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
696 	if (!page)
697 		return;
698 
699 	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
700 		goto out;
701 
702 	if (f2fs_is_atomic_file(inode)) {
703 		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
704 		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
705 		goto out;
706 	}
707 
708 	if (f2fs_is_pinned_file(inode)) {
709 		f2fs_pin_file_control(inode, true);
710 		goto out;
711 	}
712 
713 	set_new_dnode(&dn, inode, NULL, NULL, 0);
714 	err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
715 	if (err)
716 		goto out;
717 
718 	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
719 		ClearPageUptodate(page);
720 		goto put_out;
721 	}
722 
723 	/*
724 	 * don't cache encrypted data in the meta inode until the previous
725 	 * dirty data has been written back, to avoid racing between GC and flush.
726 	 */
727 	f2fs_wait_on_page_writeback(page, DATA, true);
728 
729 	err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
730 	if (err)
731 		goto put_out;
732 
733 	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
734 
735 	/* read page */
736 	fio.page = page;
737 	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
738 
739 	if (lfs_mode)
740 		down_write(&fio.sbi->io_order_lock);
741 
742 	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
743 					&sum, CURSEG_COLD_DATA, NULL, false);
744 
745 	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
746 				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
747 	if (!fio.encrypted_page) {
748 		err = -ENOMEM;
749 		goto recover_block;
750 	}
751 
752 	mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
753 					fio.old_blkaddr, FGP_LOCK, GFP_NOFS);
754 	if (mpage) {
755 		bool updated = false;
756 
757 		if (PageUptodate(mpage)) {
758 			memcpy(page_address(fio.encrypted_page),
759 					page_address(mpage), PAGE_SIZE);
760 			updated = true;
761 		}
762 		f2fs_put_page(mpage, 1);
763 		invalidate_mapping_pages(META_MAPPING(fio.sbi),
764 					fio.old_blkaddr, fio.old_blkaddr);
765 		if (updated)
766 			goto write_page;
767 	}
768 
769 	err = f2fs_submit_page_bio(&fio);
770 	if (err)
771 		goto put_page_out;
772 
773 	/* write page */
774 	lock_page(fio.encrypted_page);
775 
776 	if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
777 		err = -EIO;
778 		goto put_page_out;
779 	}
780 	if (unlikely(!PageUptodate(fio.encrypted_page))) {
781 		err = -EIO;
782 		goto put_page_out;
783 	}
784 
785 write_page:
786 	set_page_dirty(fio.encrypted_page);
787 	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
788 	if (clear_page_dirty_for_io(fio.encrypted_page))
789 		dec_page_count(fio.sbi, F2FS_DIRTY_META);
790 
791 	set_page_writeback(fio.encrypted_page);
792 	ClearPageError(page);
793 
794 	/* allocate block address */
795 	f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
796 
797 	fio.op = REQ_OP_WRITE;
798 	fio.op_flags = REQ_SYNC;
799 	fio.new_blkaddr = newaddr;
800 	f2fs_submit_page_write(&fio);
801 	if (fio.retry) {
802 		if (PageWriteback(fio.encrypted_page))
803 			end_page_writeback(fio.encrypted_page);
804 		goto put_page_out;
805 	}
806 
807 	f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
808 
809 	f2fs_update_data_blkaddr(&dn, newaddr);
810 	set_inode_flag(inode, FI_APPEND_WRITE);
811 	if (page->index == 0)
812 		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
813 put_page_out:
814 	f2fs_put_page(fio.encrypted_page, 1);
815 recover_block:
816 	if (lfs_mode)
817 		up_write(&fio.sbi->io_order_lock);
818 	if (err)
819 		f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
820 								true, true);
821 put_out:
822 	f2fs_put_dnode(&dn);
823 out:
824 	f2fs_put_page(page, 1);
825 }
826 
827 static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
828 							unsigned int segno, int off)
829 {
830 	struct page *page;
831 
832 	page = f2fs_get_lock_data_page(inode, bidx, true);
833 	if (IS_ERR(page))
834 		return;
835 
836 	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
837 		goto out;
838 
839 	if (f2fs_is_atomic_file(inode)) {
840 		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
841 		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
842 		goto out;
843 	}
844 	if (f2fs_is_pinned_file(inode)) {
845 		if (gc_type == FG_GC)
846 			f2fs_pin_file_control(inode, true);
847 		goto out;
848 	}
849 
850 	if (gc_type == BG_GC) {
851 		if (PageWriteback(page))
852 			goto out;
853 		set_page_dirty(page);
854 		set_cold_data(page);
855 	} else {
856 		struct f2fs_io_info fio = {
857 			.sbi = F2FS_I_SB(inode),
858 			.ino = inode->i_ino,
859 			.type = DATA,
860 			.temp = COLD,
861 			.op = REQ_OP_WRITE,
862 			.op_flags = REQ_SYNC,
863 			.old_blkaddr = NULL_ADDR,
864 			.page = page,
865 			.encrypted_page = NULL,
866 			.need_lock = LOCK_REQ,
867 			.io_type = FS_GC_DATA_IO,
868 		};
869 		bool is_dirty = PageDirty(page);
870 		int err;
871 
872 retry:
873 		set_page_dirty(page);
874 		f2fs_wait_on_page_writeback(page, DATA, true);
875 		if (clear_page_dirty_for_io(page)) {
876 			inode_dec_dirty_pages(inode);
877 			f2fs_remove_dirty_inode(inode);
878 		}
879 
880 		set_cold_data(page);
881 
882 		err = f2fs_do_write_data_page(&fio);
883 		if (err) {
884 			clear_cold_data(page);
885 			if (err == -ENOMEM) {
886 				congestion_wait(BLK_RW_ASYNC, HZ/50);
887 				goto retry;
888 			}
889 			if (is_dirty)
890 				set_page_dirty(page);
891 		}
892 	}
893 out:
894 	f2fs_put_page(page, 1);
895 }
896 
897 /*
898  * This function tries to get the parent node of a victim data block and
899  * checks the data block's validity. If the block is valid, it is copied
900  * with cold status and the parent node is updated.
901  * If the parent node is not valid or the data block address differs,
902  * the victim data block is ignored.
903  */
904 static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
905 		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
906 {
907 	struct super_block *sb = sbi->sb;
908 	struct f2fs_summary *entry;
909 	block_t start_addr;
910 	int off;
911 	int phase = 0;
912 
913 	start_addr = START_BLOCK(sbi, segno);
914 
915 next_step:
916 	entry = sum;
917 
918 	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
919 		struct page *data_page;
920 		struct inode *inode;
921 		struct node_info dni; /* dnode info for the data */
922 		unsigned int ofs_in_node, nofs;
923 		block_t start_bidx;
924 		nid_t nid = le32_to_cpu(entry->nid);
925 
926 		/* stop BG_GC if there are not enough free sections. */
927 		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
928 			return;
929 
930 		if (check_valid_map(sbi, segno, off) == 0)
931 			continue;
932 
933 		if (phase == 0) {
934 			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
935 							META_NAT, true);
936 			continue;
937 		}
938 
939 		if (phase == 1) {
940 			f2fs_ra_node_page(sbi, nid);
941 			continue;
942 		}
943 
944 		/* Get an inode by ino, checking data block validity first */
945 		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
946 			continue;
947 
948 		if (phase == 2) {
949 			f2fs_ra_node_page(sbi, dni.ino);
950 			continue;
951 		}
952 
953 		ofs_in_node = le16_to_cpu(entry->ofs_in_node);
954 
955 		if (phase == 3) {
956 			inode = f2fs_iget(sb, dni.ino);
957 			if (IS_ERR(inode) || is_bad_inode(inode))
958 				continue;
959 
960 			if (!down_write_trylock(
961 				&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
962 				iput(inode);
963 				sbi->skipped_gc_rwsem++;
964 				continue;
965 			}
966 
967 			start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
968 								ofs_in_node;
969 
970 			if (f2fs_post_read_required(inode)) {
971 				int err = ra_data_block(inode, start_bidx);
972 
973 				up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
974 				if (err) {
975 					iput(inode);
976 					continue;
977 				}
978 				add_gc_inode(gc_list, inode);
979 				continue;
980 			}
981 
982 			data_page = f2fs_get_read_data_page(inode,
983 						start_bidx, REQ_RAHEAD, true);
984 			up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
985 			if (IS_ERR(data_page)) {
986 				iput(inode);
987 				continue;
988 			}
989 
990 			f2fs_put_page(data_page, 0);
991 			add_gc_inode(gc_list, inode);
992 			continue;
993 		}
994 
995 		/* phase 4 */
996 		inode = find_gc_inode(gc_list, dni.ino);
997 		if (inode) {
998 			struct f2fs_inode_info *fi = F2FS_I(inode);
999 			bool locked = false;
1000 
1001 			if (S_ISREG(inode->i_mode)) {
1002 				if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
1003 					continue;
1004 				if (!down_write_trylock(
1005 						&fi->i_gc_rwsem[WRITE])) {
1006 					sbi->skipped_gc_rwsem++;
1007 					up_write(&fi->i_gc_rwsem[READ]);
1008 					continue;
1009 				}
1010 				locked = true;
1011 
1012 				/* wait for all inflight aio data */
1013 				inode_dio_wait(inode);
1014 			}
1015 
1016 			start_bidx = f2fs_start_bidx_of_node(nofs, inode)
1017 								+ ofs_in_node;
1018 			if (f2fs_post_read_required(inode))
1019 				move_data_block(inode, start_bidx, gc_type,
1020 								segno, off);
1021 			else
1022 				move_data_page(inode, start_bidx, gc_type,
1023 								segno, off);
1024 
1025 			if (locked) {
1026 				up_write(&fi->i_gc_rwsem[WRITE]);
1027 				up_write(&fi->i_gc_rwsem[READ]);
1028 			}
1029 
1030 			stat_inc_data_blk_count(sbi, 1, gc_type);
1031 		}
1032 	}
1033 
1034 	if (++phase < 5)
1035 		goto next_step;
1036 }
1037 
1038 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
1039 			int gc_type)
1040 {
1041 	struct sit_info *sit_i = SIT_I(sbi);
1042 	int ret;
1043 
1044 	down_write(&sit_i->sentry_lock);
1045 	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
1046 					      NO_CHECK_TYPE, LFS);
1047 	up_write(&sit_i->sentry_lock);
1048 	return ret;
1049 }
1050 
1051 static int do_garbage_collect(struct f2fs_sb_info *sbi,
1052 				unsigned int start_segno,
1053 				struct gc_inode_list *gc_list, int gc_type)
1054 {
1055 	struct page *sum_page;
1056 	struct f2fs_summary_block *sum;
1057 	struct blk_plug plug;
1058 	unsigned int segno = start_segno;
1059 	unsigned int end_segno = start_segno + sbi->segs_per_sec;
1060 	int seg_freed = 0;
1061 	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
1062 						SUM_TYPE_DATA : SUM_TYPE_NODE;
1063 
1064 	/* readahead multiple SSA blocks that have contiguous addresses */
1065 	if (sbi->segs_per_sec > 1)
1066 		f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
1067 					sbi->segs_per_sec, META_SSA, true);
1068 
1069 	/* reference all summary pages */
1070 	while (segno < end_segno) {
1071 		sum_page = f2fs_get_sum_page(sbi, segno++);
1072 		unlock_page(sum_page);
1073 	}
1074 
1075 	blk_start_plug(&plug);
1076 
1077 	for (segno = start_segno; segno < end_segno; segno++) {
1078 
1079 		/* find segment summary of victim */
1080 		sum_page = find_get_page(META_MAPPING(sbi),
1081 					GET_SUM_BLOCK(sbi, segno));
1082 		f2fs_put_page(sum_page, 0);
1083 
1084 		if (get_valid_blocks(sbi, segno, false) == 0 ||
1085 				!PageUptodate(sum_page) ||
1086 				unlikely(f2fs_cp_error(sbi)))
1087 			goto next;
1088 
1089 		sum = page_address(sum_page);
1090 		if (type != GET_SUM_TYPE((&sum->footer))) {
1091 			f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) "
1092 				"type [%d, %d] in SSA and SIT",
1093 				segno, type, GET_SUM_TYPE((&sum->footer)));
1094 			set_sbi_flag(sbi, SBI_NEED_FSCK);
1095 			goto next;
1096 		}
1097 
1098 		/*
1099 		 * this is to avoid deadlock:
1100 		 * - lock_page(sum_page)         - f2fs_replace_block
1101 		 *  - check_valid_map()            - down_write(sentry_lock)
1102 		 *   - down_read(sentry_lock)     - change_curseg()
1103 		 *                                  - lock_page(sum_page)
1104 		 */
1105 		if (type == SUM_TYPE_NODE)
1106 			gc_node_segment(sbi, sum->entries, segno, gc_type);
1107 		else
1108 			gc_data_segment(sbi, sum->entries, gc_list, segno,
1109 								gc_type);
1110 
1111 		stat_inc_seg_count(sbi, type, gc_type);
1112 
1113 		if (gc_type == FG_GC &&
1114 				get_valid_blocks(sbi, segno, false) == 0)
1115 			seg_freed++;
1116 next:
1117 		f2fs_put_page(sum_page, 0);
1118 	}
1119 
1120 	if (gc_type == FG_GC)
1121 		f2fs_submit_merged_write(sbi,
1122 				(type == SUM_TYPE_NODE) ? NODE : DATA);
1123 
1124 	blk_finish_plug(&plug);
1125 
1126 	stat_inc_call_count(sbi->stat_info);
1127 
1128 	return seg_freed;
1129 }
1130 
1131 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
1132 			bool background, unsigned int segno)
1133 {
1134 	int gc_type = sync ? FG_GC : BG_GC;
1135 	int sec_freed = 0, seg_freed = 0, total_freed = 0;
1136 	int ret = 0;
1137 	struct cp_control cpc;
1138 	unsigned int init_segno = segno;
1139 	struct gc_inode_list gc_list = {
1140 		.ilist = LIST_HEAD_INIT(gc_list.ilist),
1141 		.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
1142 	};
1143 	unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
1144 	unsigned long long first_skipped;
1145 	unsigned int skipped_round = 0, round = 0;
1146 
1147 	trace_f2fs_gc_begin(sbi->sb, sync, background,
1148 				get_pages(sbi, F2FS_DIRTY_NODES),
1149 				get_pages(sbi, F2FS_DIRTY_DENTS),
1150 				get_pages(sbi, F2FS_DIRTY_IMETA),
1151 				free_sections(sbi),
1152 				free_segments(sbi),
1153 				reserved_segments(sbi),
1154 				prefree_segments(sbi));
1155 
1156 	cpc.reason = __get_cp_reason(sbi);
1157 	sbi->skipped_gc_rwsem = 0;
1158 	first_skipped = last_skipped;
1159 gc_more:
1160 	if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
1161 		ret = -EINVAL;
1162 		goto stop;
1163 	}
1164 	if (unlikely(f2fs_cp_error(sbi))) {
1165 		ret = -EIO;
1166 		goto stop;
1167 	}
1168 
1169 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
1170 		/*
1171 		 * For example, if there are many prefree_segments below the
1172 		 * given threshold, we can make them free by checkpoint. Then,
1173 		 * we secure free segments which don't need FG_GC any more.
1174 		 */
1175 		if (prefree_segments(sbi)) {
1176 			ret = f2fs_write_checkpoint(sbi, &cpc);
1177 			if (ret)
1178 				goto stop;
1179 		}
1180 		if (has_not_enough_free_secs(sbi, 0, 0))
1181 			gc_type = FG_GC;
1182 	}
1183 
1184 	/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
1185 	if (gc_type == BG_GC && !background) {
1186 		ret = -EINVAL;
1187 		goto stop;
1188 	}
1189 	if (!__get_victim(sbi, &segno, gc_type)) {
1190 		ret = -ENODATA;
1191 		goto stop;
1192 	}
1193 
1194 	seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
1195 	if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
1196 		sec_freed++;
1197 	total_freed += seg_freed;
1198 
1199 	if (gc_type == FG_GC) {
1200 		if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
1201 						sbi->skipped_gc_rwsem)
1202 			skipped_round++;
1203 		last_skipped = sbi->skipped_atomic_files[FG_GC];
1204 		round++;
1205 	}
1206 
1207 	if (gc_type == FG_GC)
1208 		sbi->cur_victim_sec = NULL_SEGNO;
1209 
1210 	if (sync)
1211 		goto stop;
1212 
1213 	if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
1214 		if (skipped_round <= MAX_SKIP_GC_COUNT ||
1215 					skipped_round * 2 < round) {
1216 			segno = NULL_SEGNO;
1217 			goto gc_more;
1218 		}
1219 
1220 		if (first_skipped < last_skipped &&
1221 				(last_skipped - first_skipped) >
1222 						sbi->skipped_gc_rwsem) {
1223 			f2fs_drop_inmem_pages_all(sbi, true);
1224 			segno = NULL_SEGNO;
1225 			goto gc_more;
1226 		}
1227 		if (gc_type == FG_GC)
1228 			ret = f2fs_write_checkpoint(sbi, &cpc);
1229 	}
1230 stop:
1231 	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
1232 	SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
1233 
1234 	trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
1235 				get_pages(sbi, F2FS_DIRTY_NODES),
1236 				get_pages(sbi, F2FS_DIRTY_DENTS),
1237 				get_pages(sbi, F2FS_DIRTY_IMETA),
1238 				free_sections(sbi),
1239 				free_segments(sbi),
1240 				reserved_segments(sbi),
1241 				prefree_segments(sbi));
1242 
1243 	mutex_unlock(&sbi->gc_mutex);
1244 
1245 	put_gc_inode(&gc_list);
1246 
1247 	if (sync)
1248 		ret = sec_freed ? 0 : -EAGAIN;
1249 	return ret;
1250 }
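
/*
 * Illustrative sketch (not part of this file): the retry policy at the end
 * of f2fs_gc() above, pulled out into a small decision helper. The value of
 * MAX_SKIP_GC_COUNT is an assumption made for this example (the real
 * definition lives in f2fs.h); the branch structure mirrors the code above.
 */
#include <stdio.h>

#define EX_MAX_SKIP_GC_COUNT	16	/* assumed */

enum ex_next_action { EX_RETRY, EX_DROP_INMEM_AND_RETRY, EX_CHECKPOINT };

static enum ex_next_action ex_fggc_next(unsigned int skipped_round,
					unsigned int round,
					unsigned long long first_skipped,
					unsigned long long last_skipped,
					unsigned long long skipped_rwsem)
{
	/* not blocked too often yet: simply pick another victim */
	if (skipped_round <= EX_MAX_SKIP_GC_COUNT || skipped_round * 2 < round)
		return EX_RETRY;

	/* mostly blocked by atomic files: drop their in-memory pages first */
	if (first_skipped < last_skipped &&
	    (last_skipped - first_skipped) > skipped_rwsem)
		return EX_DROP_INMEM_AND_RETRY;

	/* otherwise fall back to a checkpoint to reclaim prefree segments */
	return EX_CHECKPOINT;
}

int main(void)
{
	printf("%d\n", ex_fggc_next(2, 10, 0, 0, 0));	/* 0: retry GC */
	printf("%d\n", ex_fggc_next(20, 24, 5, 40, 3));	/* 1: drop inmem pages, retry */
	printf("%d\n", ex_fggc_next(20, 24, 5, 10, 9));	/* 2: write a checkpoint */
	return 0;
}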
1251 
1252 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
1253 {
1254 	DIRTY_I(sbi)->v_ops = &default_v_ops;
1255 
1256 	sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
1257 
1258 	/* serve the warm/cold data area from the slower device */
1259 	if (sbi->s_ndevs && sbi->segs_per_sec == 1)
1260 		SIT_I(sbi)->last_victim[ALLOC_NEXT] =
1261 				GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
1262 }
1263