xref: /openbmc/linux/fs/f2fs/node.c (revision 54f9c4d0)
1 /*
2  * fs/f2fs/node.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/f2fs_fs.h>
13 #include <linux/mpage.h>
14 #include <linux/backing-dev.h>
15 #include <linux/blkdev.h>
16 #include <linux/pagevec.h>
17 #include <linux/swap.h>
18 
19 #include "f2fs.h"
20 #include "node.h"
21 #include "segment.h"
22 #include "trace.h"
23 #include <trace/events/f2fs.h>
24 
25 #define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
26 
27 static struct kmem_cache *nat_entry_slab;
28 static struct kmem_cache *free_nid_slab;
29 static struct kmem_cache *nat_entry_set_slab;
30 
31 bool available_free_memory(struct f2fs_sb_info *sbi, int type)
32 {
33 	struct f2fs_nm_info *nm_i = NM_I(sbi);
34 	struct sysinfo val;
35 	unsigned long avail_ram;
36 	unsigned long mem_size = 0;
37 	bool res = false;
38 
39 	si_meminfo(&val);
40 
41 	/* only uses low memory */
42 	avail_ram = val.totalram - val.totalhigh;
43 
44 	/*
45 	 * give 25%, 25%, 50%, 50%, 50% of memory to each component, respectively
46 	 */
47 	if (type == FREE_NIDS) {
48 		mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
49 							PAGE_SHIFT;
50 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
51 	} else if (type == NAT_ENTRIES) {
52 		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
53 							PAGE_SHIFT;
54 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
55 		if (excess_cached_nats(sbi))
56 			res = false;
57 		if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD)
58 			res = false;
59 	} else if (type == DIRTY_DENTS) {
60 		if (sbi->sb->s_bdi->wb.dirty_exceeded)
61 			return false;
62 		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
63 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
64 	} else if (type == INO_ENTRIES) {
65 		int i;
66 
67 		for (i = 0; i <= UPDATE_INO; i++)
68 			mem_size += (sbi->im[i].ino_num *
69 				sizeof(struct ino_entry)) >> PAGE_SHIFT;
70 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
71 	} else if (type == EXTENT_CACHE) {
72 		mem_size = (atomic_read(&sbi->total_ext_tree) *
73 				sizeof(struct extent_tree) +
74 				atomic_read(&sbi->total_ext_node) *
75 				sizeof(struct extent_node)) >> PAGE_SHIFT;
76 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
77 	} else {
78 		if (!sbi->sb->s_bdi->wb.dirty_exceeded)
79 			return true;
80 	}
81 	return res;
82 }
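/*
 * Worked example (illustrative numbers, not taken from this file): with
 * avail_ram = 1,000,000 low-memory pages and ram_thresh = 10, the
 * FREE_NIDS and NAT_ENTRIES caches above are each allowed roughly
 * (1,000,000 * 10 / 100) >> 2 = 25,000 pages worth of entries, while
 * DIRTY_DENTS, INO_ENTRIES and EXTENT_CACHE each get twice that budget,
 * (1,000,000 * 10 / 100) >> 1 = 50,000 pages.
 */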
83 
84 static void clear_node_page_dirty(struct page *page)
85 {
86 	struct address_space *mapping = page->mapping;
87 	unsigned long flags;
88 
89 	if (PageDirty(page)) {
90 		spin_lock_irqsave(&mapping->tree_lock, flags);
91 		radix_tree_tag_clear(&mapping->page_tree,
92 				page_index(page),
93 				PAGECACHE_TAG_DIRTY);
94 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
95 
96 		clear_page_dirty_for_io(page);
97 		dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
98 	}
99 	ClearPageUptodate(page);
100 }
101 
102 static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
103 {
104 	pgoff_t index = current_nat_addr(sbi, nid);
105 	return get_meta_page(sbi, index);
106 }
107 
108 static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
109 {
110 	struct page *src_page;
111 	struct page *dst_page;
112 	pgoff_t src_off;
113 	pgoff_t dst_off;
114 	void *src_addr;
115 	void *dst_addr;
116 	struct f2fs_nm_info *nm_i = NM_I(sbi);
117 
118 	src_off = current_nat_addr(sbi, nid);
119 	dst_off = next_nat_addr(sbi, src_off);
120 
121 	/* get current nat block page with lock */
122 	src_page = get_meta_page(sbi, src_off);
123 	dst_page = grab_meta_page(sbi, dst_off);
124 	f2fs_bug_on(sbi, PageDirty(src_page));
125 
126 	src_addr = page_address(src_page);
127 	dst_addr = page_address(dst_page);
128 	memcpy(dst_addr, src_addr, PAGE_SIZE);
129 	set_page_dirty(dst_page);
130 	f2fs_put_page(src_page, 1);
131 
132 	set_to_next_nat(nm_i, nid);
133 
134 	return dst_page;
135 }
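/*
 * Note: f2fs keeps two on-disk copies of each NAT block and the
 * nat_bitmap selects which copy is current.  get_next_nat_page() above
 * copies the current copy into the alternate location and flips the
 * bitmap bit via set_to_next_nat(), so the upcoming checkpoint writes
 * NAT updates to the inactive copy instead of overwriting the valid one.
 */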
136 
137 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
138 {
139 	return radix_tree_lookup(&nm_i->nat_root, n);
140 }
141 
142 static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
143 		nid_t start, unsigned int nr, struct nat_entry **ep)
144 {
145 	return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
146 }
147 
148 static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
149 {
150 	list_del(&e->list);
151 	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
152 	nm_i->nat_cnt--;
153 	kmem_cache_free(nat_entry_slab, e);
154 }
155 
156 static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
157 						struct nat_entry *ne)
158 {
159 	nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
160 	struct nat_entry_set *head;
161 
162 	if (get_nat_flag(ne, IS_DIRTY))
163 		return;
164 
165 	head = radix_tree_lookup(&nm_i->nat_set_root, set);
166 	if (!head) {
167 		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
168 
169 		INIT_LIST_HEAD(&head->entry_list);
170 		INIT_LIST_HEAD(&head->set_list);
171 		head->set = set;
172 		head->entry_cnt = 0;
173 		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
174 	}
175 	list_move_tail(&ne->list, &head->entry_list);
176 	nm_i->dirty_nat_cnt++;
177 	head->entry_cnt++;
178 	set_nat_flag(ne, IS_DIRTY, true);
179 }
180 
181 static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
182 						struct nat_entry *ne)
183 {
184 	nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
185 	struct nat_entry_set *head;
186 
187 	head = radix_tree_lookup(&nm_i->nat_set_root, set);
188 	if (head) {
189 		list_move_tail(&ne->list, &nm_i->nat_entries);
190 		set_nat_flag(ne, IS_DIRTY, false);
191 		head->entry_cnt--;
192 		nm_i->dirty_nat_cnt--;
193 	}
194 }
195 
196 static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
197 		nid_t start, unsigned int nr, struct nat_entry_set **ep)
198 {
199 	return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
200 							start, nr);
201 }
202 
203 int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
204 {
205 	struct f2fs_nm_info *nm_i = NM_I(sbi);
206 	struct nat_entry *e;
207 	bool need = false;
208 
209 	down_read(&nm_i->nat_tree_lock);
210 	e = __lookup_nat_cache(nm_i, nid);
211 	if (e) {
212 		if (!get_nat_flag(e, IS_CHECKPOINTED) &&
213 				!get_nat_flag(e, HAS_FSYNCED_INODE))
214 			need = true;
215 	}
216 	up_read(&nm_i->nat_tree_lock);
217 	return need;
218 }
219 
220 bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
221 {
222 	struct f2fs_nm_info *nm_i = NM_I(sbi);
223 	struct nat_entry *e;
224 	bool is_cp = true;
225 
226 	down_read(&nm_i->nat_tree_lock);
227 	e = __lookup_nat_cache(nm_i, nid);
228 	if (e && !get_nat_flag(e, IS_CHECKPOINTED))
229 		is_cp = false;
230 	up_read(&nm_i->nat_tree_lock);
231 	return is_cp;
232 }
233 
234 bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
235 {
236 	struct f2fs_nm_info *nm_i = NM_I(sbi);
237 	struct nat_entry *e;
238 	bool need_update = true;
239 
240 	down_read(&nm_i->nat_tree_lock);
241 	e = __lookup_nat_cache(nm_i, ino);
242 	if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
243 			(get_nat_flag(e, IS_CHECKPOINTED) ||
244 			 get_nat_flag(e, HAS_FSYNCED_INODE)))
245 		need_update = false;
246 	up_read(&nm_i->nat_tree_lock);
247 	return need_update;
248 }
249 
250 static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
251 {
252 	struct nat_entry *new;
253 
254 	new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
255 	f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
256 	memset(new, 0, sizeof(struct nat_entry));
257 	nat_set_nid(new, nid);
258 	nat_reset_flag(new);
259 	list_add_tail(&new->list, &nm_i->nat_entries);
260 	nm_i->nat_cnt++;
261 	return new;
262 }
263 
264 static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
265 						struct f2fs_nat_entry *ne)
266 {
267 	struct f2fs_nm_info *nm_i = NM_I(sbi);
268 	struct nat_entry *e;
269 
270 	e = __lookup_nat_cache(nm_i, nid);
271 	if (!e) {
272 		e = grab_nat_entry(nm_i, nid);
273 		node_info_from_raw_nat(&e->ni, ne);
274 	} else {
275 		f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
276 				nat_get_blkaddr(e) != ne->block_addr ||
277 				nat_get_version(e) != ne->version);
278 	}
279 }
280 
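/*
 * set_node_addr() updates the cached NAT entry for @ni->nid to point at
 * @new_blkaddr and marks it dirty so that the next checkpoint writes it
 * back.  @fsync_done is propagated to the owning inode's NAT entry via
 * the HAS_FSYNCED_INODE / HAS_LAST_FSYNC flags.
 */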
281 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
282 			block_t new_blkaddr, bool fsync_done)
283 {
284 	struct f2fs_nm_info *nm_i = NM_I(sbi);
285 	struct nat_entry *e;
286 
287 	down_write(&nm_i->nat_tree_lock);
288 	e = __lookup_nat_cache(nm_i, ni->nid);
289 	if (!e) {
290 		e = grab_nat_entry(nm_i, ni->nid);
291 		copy_node_info(&e->ni, ni);
292 		f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
293 	} else if (new_blkaddr == NEW_ADDR) {
294 		/*
295 		 * when a nid is reallocated, the previous nat entry
296 		 * can remain in the nat cache. So, reinitialize it
297 		 * with the new information.
298 		 */
299 		copy_node_info(&e->ni, ni);
300 		f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
301 	}
302 
303 	/* sanity check */
304 	f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
305 	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
306 			new_blkaddr == NULL_ADDR);
307 	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
308 			new_blkaddr == NEW_ADDR);
309 	f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
310 			nat_get_blkaddr(e) != NULL_ADDR &&
311 			new_blkaddr == NEW_ADDR);
312 
313 	/* increment version no as node is removed */
314 	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
315 		unsigned char version = nat_get_version(e);
316 		nat_set_version(e, inc_node_version(version));
317 
318 		/* in order to reuse the nid */
319 		if (nm_i->next_scan_nid > ni->nid)
320 			nm_i->next_scan_nid = ni->nid;
321 	}
322 
323 	/* change address */
324 	nat_set_blkaddr(e, new_blkaddr);
325 	if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
326 		set_nat_flag(e, IS_CHECKPOINTED, false);
327 	__set_nat_cache_dirty(nm_i, e);
328 
329 	/* update fsync_mark if its inode nat entry is still alive */
330 	if (ni->nid != ni->ino)
331 		e = __lookup_nat_cache(nm_i, ni->ino);
332 	if (e) {
333 		if (fsync_done && ni->nid == ni->ino)
334 			set_nat_flag(e, HAS_FSYNCED_INODE, true);
335 		set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
336 	}
337 	up_write(&nm_i->nat_tree_lock);
338 }
339 
340 int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
341 {
342 	struct f2fs_nm_info *nm_i = NM_I(sbi);
343 	int nr = nr_shrink;
344 
345 	if (!down_write_trylock(&nm_i->nat_tree_lock))
346 		return 0;
347 
348 	while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
349 		struct nat_entry *ne;
350 		ne = list_first_entry(&nm_i->nat_entries,
351 					struct nat_entry, list);
352 		__del_from_nat_cache(nm_i, ne);
353 		nr_shrink--;
354 	}
355 	up_write(&nm_i->nat_tree_lock);
356 	return nr - nr_shrink;
357 }
358 
359 /*
360  * This function always returns success
361  */
362 void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
363 {
364 	struct f2fs_nm_info *nm_i = NM_I(sbi);
365 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
366 	struct f2fs_journal *journal = curseg->journal;
367 	nid_t start_nid = START_NID(nid);
368 	struct f2fs_nat_block *nat_blk;
369 	struct page *page = NULL;
370 	struct f2fs_nat_entry ne;
371 	struct nat_entry *e;
372 	int i;
373 
374 	ni->nid = nid;
375 
376 	/* Check nat cache */
377 	down_read(&nm_i->nat_tree_lock);
378 	e = __lookup_nat_cache(nm_i, nid);
379 	if (e) {
380 		ni->ino = nat_get_ino(e);
381 		ni->blk_addr = nat_get_blkaddr(e);
382 		ni->version = nat_get_version(e);
383 		up_read(&nm_i->nat_tree_lock);
384 		return;
385 	}
386 
387 	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
388 
389 	/* Check current segment summary */
390 	down_read(&curseg->journal_rwsem);
391 	i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
392 	if (i >= 0) {
393 		ne = nat_in_journal(journal, i);
394 		node_info_from_raw_nat(ni, &ne);
395 	}
396 	up_read(&curseg->journal_rwsem);
397 	if (i >= 0)
398 		goto cache;
399 
400 	/* Fill node_info from nat page */
401 	page = get_current_nat_page(sbi, start_nid);
402 	nat_blk = (struct f2fs_nat_block *)page_address(page);
403 	ne = nat_blk->entries[nid - start_nid];
404 	node_info_from_raw_nat(ni, &ne);
405 	f2fs_put_page(page, 1);
406 cache:
407 	up_read(&nm_i->nat_tree_lock);
408 	/* cache nat entry */
409 	down_write(&nm_i->nat_tree_lock);
410 	cache_nat_entry(sbi, nid, &ne);
411 	up_write(&nm_i->nat_tree_lock);
412 }
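/*
 * Lookup order used above: the in-memory NAT cache first, then the NAT
 * journal kept in the hot data curseg summary, and finally the on-disk
 * NAT block; whatever is found is inserted back into the NAT cache.
 */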
413 
414 /*
415  * readahead up to @n node pages whose nids are stored in @parent,
416  * starting at offset @start.
416  */
417 static void ra_node_pages(struct page *parent, int start, int n)
418 {
419 	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
420 	struct blk_plug plug;
421 	int i, end;
422 	nid_t nid;
423 
424 	blk_start_plug(&plug);
425 
426 	/* try readahead for siblings of the desired node */
427 	end = start + n;
428 	end = min(end, NIDS_PER_BLOCK);
429 	for (i = start; i < end; i++) {
430 		nid = get_nid(parent, i, false);
431 		ra_node_page(sbi, nid);
432 	}
433 
434 	blk_finish_plug(&plug);
435 }
436 
437 pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
438 {
439 	const long direct_index = ADDRS_PER_INODE(dn->inode);
440 	const long direct_blks = ADDRS_PER_BLOCK;
441 	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
442 	unsigned int skipped_unit = ADDRS_PER_BLOCK;
443 	int cur_level = dn->cur_level;
444 	int max_level = dn->max_level;
445 	pgoff_t base = 0;
446 
447 	if (!dn->max_level)
448 		return pgofs + 1;
449 
450 	while (max_level-- > cur_level)
451 		skipped_unit *= NIDS_PER_BLOCK;
452 
453 	switch (dn->max_level) {
454 	case 3:
455 		base += 2 * indirect_blks;
456 	case 2:
457 		base += 2 * direct_blks;
458 	case 1:
459 		base += direct_index;
460 		break;
461 	default:
462 		f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
463 	}
464 
465 	return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
466 }
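/*
 * Example (illustrative): when get_dnode_of_data() fails with -ENOENT on
 * a missing single-indirect node (cur_level == 1, max_level == 2),
 * skipped_unit becomes ADDRS_PER_BLOCK * NIDS_PER_BLOCK and base covers
 * the inode's direct addresses plus the two direct-node blocks, so the
 * returned offset is pgofs rounded up to the start of the next indirect
 * subtree.
 */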
467 
468 /*
469  * The maximum depth is four.
470  * Offset[0] will have raw inode offset.
471  */
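/*
 * Worked example (illustrative, assuming the common layout where
 * ADDRS_PER_INODE(inode) == 923 and ADDRS_PER_BLOCK == NIDS_PER_BLOCK
 * == 1018): block 100 lives in the inode itself (level 0), block 1000
 * falls into NODE_DIR1_BLOCK at offset 1000 - 923 = 77 (level 1), and
 * block 3000 falls past both direct nodes (923 + 2 * 1018 = 2959) into
 * NODE_IND1_BLOCK, i.e. direct node 0 of the first indirect tree at
 * offset 3000 - 2959 = 41 (level 2).
 */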
472 static int get_node_path(struct inode *inode, long block,
473 				int offset[4], unsigned int noffset[4])
474 {
475 	const long direct_index = ADDRS_PER_INODE(inode);
476 	const long direct_blks = ADDRS_PER_BLOCK;
477 	const long dptrs_per_blk = NIDS_PER_BLOCK;
478 	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
479 	const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
480 	int n = 0;
481 	int level = 0;
482 
483 	noffset[0] = 0;
484 
485 	if (block < direct_index) {
486 		offset[n] = block;
487 		goto got;
488 	}
489 	block -= direct_index;
490 	if (block < direct_blks) {
491 		offset[n++] = NODE_DIR1_BLOCK;
492 		noffset[n] = 1;
493 		offset[n] = block;
494 		level = 1;
495 		goto got;
496 	}
497 	block -= direct_blks;
498 	if (block < direct_blks) {
499 		offset[n++] = NODE_DIR2_BLOCK;
500 		noffset[n] = 2;
501 		offset[n] = block;
502 		level = 1;
503 		goto got;
504 	}
505 	block -= direct_blks;
506 	if (block < indirect_blks) {
507 		offset[n++] = NODE_IND1_BLOCK;
508 		noffset[n] = 3;
509 		offset[n++] = block / direct_blks;
510 		noffset[n] = 4 + offset[n - 1];
511 		offset[n] = block % direct_blks;
512 		level = 2;
513 		goto got;
514 	}
515 	block -= indirect_blks;
516 	if (block < indirect_blks) {
517 		offset[n++] = NODE_IND2_BLOCK;
518 		noffset[n] = 4 + dptrs_per_blk;
519 		offset[n++] = block / direct_blks;
520 		noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
521 		offset[n] = block % direct_blks;
522 		level = 2;
523 		goto got;
524 	}
525 	block -= indirect_blks;
526 	if (block < dindirect_blks) {
527 		offset[n++] = NODE_DIND_BLOCK;
528 		noffset[n] = 5 + (dptrs_per_blk * 2);
529 		offset[n++] = block / indirect_blks;
530 		noffset[n] = 6 + (dptrs_per_blk * 2) +
531 			      offset[n - 1] * (dptrs_per_blk + 1);
532 		offset[n++] = (block / direct_blks) % dptrs_per_blk;
533 		noffset[n] = 7 + (dptrs_per_blk * 2) +
534 			      offset[n - 2] * (dptrs_per_blk + 1) +
535 			      offset[n - 1];
536 		offset[n] = block % direct_blks;
537 		level = 3;
538 		goto got;
539 	} else {
540 		BUG();
541 	}
542 got:
543 	return level;
544 }
545 
546 /*
547  * Caller should call f2fs_put_dnode(dn).
548  * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
549  * f2fs_unlock_op() only if the mode is not set to RDONLY_NODE.
550  * In the case of RDONLY_NODE, we don't need to care about the mutex.
551  */
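/*
 * Typical usage (illustrative sketch, error handling trimmed):
 *
 *	struct dnode_of_data dn;
 *
 *	set_new_dnode(&dn, inode, NULL, NULL, 0);
 *	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
 *	if (err)
 *		return err;
 *	blkaddr = dn.data_blkaddr;
 *	f2fs_put_dnode(&dn);
 */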
552 int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
553 {
554 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
555 	struct page *npage[4];
556 	struct page *parent = NULL;
557 	int offset[4];
558 	unsigned int noffset[4];
559 	nid_t nids[4];
560 	int level, i = 0;
561 	int err = 0;
562 
563 	level = get_node_path(dn->inode, index, offset, noffset);
564 
565 	nids[0] = dn->inode->i_ino;
566 	npage[0] = dn->inode_page;
567 
568 	if (!npage[0]) {
569 		npage[0] = get_node_page(sbi, nids[0]);
570 		if (IS_ERR(npage[0]))
571 			return PTR_ERR(npage[0]);
572 	}
573 
574 	/* if inline_data is set, should not report any block indices */
575 	if (f2fs_has_inline_data(dn->inode) && index) {
576 		err = -ENOENT;
577 		f2fs_put_page(npage[0], 1);
578 		goto release_out;
579 	}
580 
581 	parent = npage[0];
582 	if (level != 0)
583 		nids[1] = get_nid(parent, offset[0], true);
584 	dn->inode_page = npage[0];
585 	dn->inode_page_locked = true;
586 
587 	/* get indirect or direct nodes */
588 	for (i = 1; i <= level; i++) {
589 		bool done = false;
590 
591 		if (!nids[i] && mode == ALLOC_NODE) {
592 			/* alloc new node */
593 			if (!alloc_nid(sbi, &(nids[i]))) {
594 				err = -ENOSPC;
595 				goto release_pages;
596 			}
597 
598 			dn->nid = nids[i];
599 			npage[i] = new_node_page(dn, noffset[i], NULL);
600 			if (IS_ERR(npage[i])) {
601 				alloc_nid_failed(sbi, nids[i]);
602 				err = PTR_ERR(npage[i]);
603 				goto release_pages;
604 			}
605 
606 			set_nid(parent, offset[i - 1], nids[i], i == 1);
607 			alloc_nid_done(sbi, nids[i]);
608 			done = true;
609 		} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
610 			npage[i] = get_node_page_ra(parent, offset[i - 1]);
611 			if (IS_ERR(npage[i])) {
612 				err = PTR_ERR(npage[i]);
613 				goto release_pages;
614 			}
615 			done = true;
616 		}
617 		if (i == 1) {
618 			dn->inode_page_locked = false;
619 			unlock_page(parent);
620 		} else {
621 			f2fs_put_page(parent, 1);
622 		}
623 
624 		if (!done) {
625 			npage[i] = get_node_page(sbi, nids[i]);
626 			if (IS_ERR(npage[i])) {
627 				err = PTR_ERR(npage[i]);
628 				f2fs_put_page(npage[0], 0);
629 				goto release_out;
630 			}
631 		}
632 		if (i < level) {
633 			parent = npage[i];
634 			nids[i + 1] = get_nid(parent, offset[i], false);
635 		}
636 	}
637 	dn->nid = nids[level];
638 	dn->ofs_in_node = offset[level];
639 	dn->node_page = npage[level];
640 	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
641 	return 0;
642 
643 release_pages:
644 	f2fs_put_page(parent, 1);
645 	if (i > 1)
646 		f2fs_put_page(npage[0], 0);
647 release_out:
648 	dn->inode_page = NULL;
649 	dn->node_page = NULL;
650 	if (err == -ENOENT) {
651 		dn->cur_level = i;
652 		dn->max_level = level;
653 		dn->ofs_in_node = offset[level];
654 	}
655 	return err;
656 }
657 
658 static void truncate_node(struct dnode_of_data *dn)
659 {
660 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
661 	struct node_info ni;
662 
663 	get_node_info(sbi, dn->nid, &ni);
664 	if (dn->inode->i_blocks == 0) {
665 		f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
666 		goto invalidate;
667 	}
668 	f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
669 
670 	/* Deallocate node address */
671 	invalidate_blocks(sbi, ni.blk_addr);
672 	dec_valid_node_count(sbi, dn->inode);
673 	set_node_addr(sbi, &ni, NULL_ADDR, false);
674 
675 	if (dn->nid == dn->inode->i_ino) {
676 		remove_orphan_inode(sbi, dn->nid);
677 		dec_valid_inode_count(sbi);
678 		f2fs_inode_synced(dn->inode);
679 	}
680 invalidate:
681 	clear_node_page_dirty(dn->node_page);
682 	set_sbi_flag(sbi, SBI_IS_DIRTY);
683 
684 	f2fs_put_page(dn->node_page, 1);
685 
686 	invalidate_mapping_pages(NODE_MAPPING(sbi),
687 			dn->node_page->index, dn->node_page->index);
688 
689 	dn->node_page = NULL;
690 	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
691 }
692 
693 static int truncate_dnode(struct dnode_of_data *dn)
694 {
695 	struct page *page;
696 
697 	if (dn->nid == 0)
698 		return 1;
699 
700 	/* get direct node */
701 	page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
702 	if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
703 		return 1;
704 	else if (IS_ERR(page))
705 		return PTR_ERR(page);
706 
707 	/* Make dnode_of_data for parameter */
708 	dn->node_page = page;
709 	dn->ofs_in_node = 0;
710 	truncate_data_blocks(dn);
711 	truncate_node(dn);
712 	return 1;
713 }
714 
715 static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
716 						int ofs, int depth)
717 {
718 	struct dnode_of_data rdn = *dn;
719 	struct page *page;
720 	struct f2fs_node *rn;
721 	nid_t child_nid;
722 	unsigned int child_nofs;
723 	int freed = 0;
724 	int i, ret;
725 
726 	if (dn->nid == 0)
727 		return NIDS_PER_BLOCK + 1;
728 
729 	trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
730 
731 	page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
732 	if (IS_ERR(page)) {
733 		trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
734 		return PTR_ERR(page);
735 	}
736 
737 	ra_node_pages(page, ofs, NIDS_PER_BLOCK);
738 
739 	rn = F2FS_NODE(page);
740 	if (depth < 3) {
741 		for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
742 			child_nid = le32_to_cpu(rn->in.nid[i]);
743 			if (child_nid == 0)
744 				continue;
745 			rdn.nid = child_nid;
746 			ret = truncate_dnode(&rdn);
747 			if (ret < 0)
748 				goto out_err;
749 			if (set_nid(page, i, 0, false))
750 				dn->node_changed = true;
751 		}
752 	} else {
753 		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
754 		for (i = ofs; i < NIDS_PER_BLOCK; i++) {
755 			child_nid = le32_to_cpu(rn->in.nid[i]);
756 			if (child_nid == 0) {
757 				child_nofs += NIDS_PER_BLOCK + 1;
758 				continue;
759 			}
760 			rdn.nid = child_nid;
761 			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
762 			if (ret == (NIDS_PER_BLOCK + 1)) {
763 				if (set_nid(page, i, 0, false))
764 					dn->node_changed = true;
765 				child_nofs += ret;
766 			} else if (ret < 0 && ret != -ENOENT) {
767 				goto out_err;
768 			}
769 		}
770 		freed = child_nofs;
771 	}
772 
773 	if (!ofs) {
774 		/* remove current indirect node */
775 		dn->node_page = page;
776 		truncate_node(dn);
777 		freed++;
778 	} else {
779 		f2fs_put_page(page, 1);
780 	}
781 	trace_f2fs_truncate_nodes_exit(dn->inode, freed);
782 	return freed;
783 
784 out_err:
785 	f2fs_put_page(page, 1);
786 	trace_f2fs_truncate_nodes_exit(dn->inode, ret);
787 	return ret;
788 }
789 
790 static int truncate_partial_nodes(struct dnode_of_data *dn,
791 			struct f2fs_inode *ri, int *offset, int depth)
792 {
793 	struct page *pages[2];
794 	nid_t nid[3];
795 	nid_t child_nid;
796 	int err = 0;
797 	int i;
798 	int idx = depth - 2;
799 
800 	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
801 	if (!nid[0])
802 		return 0;
803 
804 	/* get indirect nodes in the path */
805 	for (i = 0; i < idx + 1; i++) {
806 		/* the reference count will be increased */
807 		pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
808 		if (IS_ERR(pages[i])) {
809 			err = PTR_ERR(pages[i]);
810 			idx = i - 1;
811 			goto fail;
812 		}
813 		nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
814 	}
815 
816 	ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
817 
818 	/* free direct nodes linked to a partial indirect node */
819 	for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
820 		child_nid = get_nid(pages[idx], i, false);
821 		if (!child_nid)
822 			continue;
823 		dn->nid = child_nid;
824 		err = truncate_dnode(dn);
825 		if (err < 0)
826 			goto fail;
827 		if (set_nid(pages[idx], i, 0, false))
828 			dn->node_changed = true;
829 	}
830 
831 	if (offset[idx + 1] == 0) {
832 		dn->node_page = pages[idx];
833 		dn->nid = nid[idx];
834 		truncate_node(dn);
835 	} else {
836 		f2fs_put_page(pages[idx], 1);
837 	}
838 	offset[idx]++;
839 	offset[idx + 1] = 0;
840 	idx--;
841 fail:
842 	for (i = idx; i >= 0; i--)
843 		f2fs_put_page(pages[i], 1);
844 
845 	trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
846 
847 	return err;
848 }
849 
850 /*
851  * All the block addresses of data and nodes should be nullified.
852  */
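/*
 * The flow below: partially-filled indirect nodes on the cut boundary
 * are handled first by truncate_partial_nodes(), then whole subtrees
 * hanging off i_nid[] are freed via truncate_dnode()/truncate_nodes()
 * as offset[0] walks from NODE_DIR1_BLOCK to NODE_DIND_BLOCK.
 */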
853 int truncate_inode_blocks(struct inode *inode, pgoff_t from)
854 {
855 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
856 	int err = 0, cont = 1;
857 	int level, offset[4], noffset[4];
858 	unsigned int nofs = 0;
859 	struct f2fs_inode *ri;
860 	struct dnode_of_data dn;
861 	struct page *page;
862 
863 	trace_f2fs_truncate_inode_blocks_enter(inode, from);
864 
865 	level = get_node_path(inode, from, offset, noffset);
866 
867 	page = get_node_page(sbi, inode->i_ino);
868 	if (IS_ERR(page)) {
869 		trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
870 		return PTR_ERR(page);
871 	}
872 
873 	set_new_dnode(&dn, inode, page, NULL, 0);
874 	unlock_page(page);
875 
876 	ri = F2FS_INODE(page);
877 	switch (level) {
878 	case 0:
879 	case 1:
880 		nofs = noffset[1];
881 		break;
882 	case 2:
883 		nofs = noffset[1];
884 		if (!offset[level - 1])
885 			goto skip_partial;
886 		err = truncate_partial_nodes(&dn, ri, offset, level);
887 		if (err < 0 && err != -ENOENT)
888 			goto fail;
889 		nofs += 1 + NIDS_PER_BLOCK;
890 		break;
891 	case 3:
892 		nofs = 5 + 2 * NIDS_PER_BLOCK;
893 		if (!offset[level - 1])
894 			goto skip_partial;
895 		err = truncate_partial_nodes(&dn, ri, offset, level);
896 		if (err < 0 && err != -ENOENT)
897 			goto fail;
898 		break;
899 	default:
900 		BUG();
901 	}
902 
903 skip_partial:
904 	while (cont) {
905 		dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
906 		switch (offset[0]) {
907 		case NODE_DIR1_BLOCK:
908 		case NODE_DIR2_BLOCK:
909 			err = truncate_dnode(&dn);
910 			break;
911 
912 		case NODE_IND1_BLOCK:
913 		case NODE_IND2_BLOCK:
914 			err = truncate_nodes(&dn, nofs, offset[1], 2);
915 			break;
916 
917 		case NODE_DIND_BLOCK:
918 			err = truncate_nodes(&dn, nofs, offset[1], 3);
919 			cont = 0;
920 			break;
921 
922 		default:
923 			BUG();
924 		}
925 		if (err < 0 && err != -ENOENT)
926 			goto fail;
927 		if (offset[1] == 0 &&
928 				ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
929 			lock_page(page);
930 			BUG_ON(page->mapping != NODE_MAPPING(sbi));
931 			f2fs_wait_on_page_writeback(page, NODE, true);
932 			ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
933 			set_page_dirty(page);
934 			unlock_page(page);
935 		}
936 		offset[1] = 0;
937 		offset[0]++;
938 		nofs += err;
939 	}
940 fail:
941 	f2fs_put_page(page, 0);
942 	trace_f2fs_truncate_inode_blocks_exit(inode, err);
943 	return err > 0 ? 0 : err;
944 }
945 
946 int truncate_xattr_node(struct inode *inode, struct page *page)
947 {
948 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
949 	nid_t nid = F2FS_I(inode)->i_xattr_nid;
950 	struct dnode_of_data dn;
951 	struct page *npage;
952 
953 	if (!nid)
954 		return 0;
955 
956 	npage = get_node_page(sbi, nid);
957 	if (IS_ERR(npage))
958 		return PTR_ERR(npage);
959 
960 	f2fs_i_xnid_write(inode, 0);
961 
962 	/* need to do checkpoint during fsync */
963 	F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
964 
965 	set_new_dnode(&dn, inode, page, npage, nid);
966 
967 	if (page)
968 		dn.inode_page_locked = true;
969 	truncate_node(&dn);
970 	return 0;
971 }
972 
973 /*
974  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
975  * f2fs_unlock_op().
976  */
977 int remove_inode_page(struct inode *inode)
978 {
979 	struct dnode_of_data dn;
980 	int err;
981 
982 	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
983 	err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
984 	if (err)
985 		return err;
986 
987 	err = truncate_xattr_node(inode, dn.inode_page);
988 	if (err) {
989 		f2fs_put_dnode(&dn);
990 		return err;
991 	}
992 
993 	/* remove potential inline_data blocks */
994 	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
995 				S_ISLNK(inode->i_mode))
996 		truncate_data_blocks_range(&dn, 1);
997 
998 	/* 0 is possible, after f2fs_new_inode() has failed */
999 	f2fs_bug_on(F2FS_I_SB(inode),
1000 			inode->i_blocks != 0 && inode->i_blocks != 1);
1001 
1002 	/* will put inode & node pages */
1003 	truncate_node(&dn);
1004 	return 0;
1005 }
1006 
1007 struct page *new_inode_page(struct inode *inode)
1008 {
1009 	struct dnode_of_data dn;
1010 
1011 	/* allocate inode page for new inode */
1012 	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1013 
1014 	/* caller should f2fs_put_page(page, 1); */
1015 	return new_node_page(&dn, 0, NULL);
1016 }
1017 
1018 struct page *new_node_page(struct dnode_of_data *dn,
1019 				unsigned int ofs, struct page *ipage)
1020 {
1021 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1022 	struct node_info old_ni, new_ni;
1023 	struct page *page;
1024 	int err;
1025 
1026 	if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
1027 		return ERR_PTR(-EPERM);
1028 
1029 	page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
1030 	if (!page)
1031 		return ERR_PTR(-ENOMEM);
1032 
1033 	if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
1034 		err = -ENOSPC;
1035 		goto fail;
1036 	}
1037 
1038 	get_node_info(sbi, dn->nid, &old_ni);
1039 
1040 	/* Reinitialize old_ni with new node page */
1041 	f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
1042 	new_ni = old_ni;
1043 	new_ni.ino = dn->inode->i_ino;
1044 	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1045 
1046 	f2fs_wait_on_page_writeback(page, NODE, true);
1047 	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
1048 	set_cold_node(dn->inode, page);
1049 	if (!PageUptodate(page))
1050 		SetPageUptodate(page);
1051 	if (set_page_dirty(page))
1052 		dn->node_changed = true;
1053 
1054 	if (f2fs_has_xattr_block(ofs))
1055 		f2fs_i_xnid_write(dn->inode, dn->nid);
1056 
1057 	if (ofs == 0)
1058 		inc_valid_inode_count(sbi);
1059 	return page;
1060 
1061 fail:
1062 	clear_node_page_dirty(page);
1063 	f2fs_put_page(page, 1);
1064 	return ERR_PTR(err);
1065 }
1066 
1067 /*
1068  * Caller should act as follows depending on the return value:
1069  * 0: f2fs_put_page(page, 0)
1070  * LOCKED_PAGE or error: f2fs_put_page(page, 1)
1071  */
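/*
 * Typical caller pattern (illustrative sketch, mirrors __get_node_page()):
 *
 *	err = read_node_page(page, READ_SYNC);
 *	if (err < 0) {
 *		f2fs_put_page(page, 1);
 *		return ERR_PTR(err);
 *	} else if (err == LOCKED_PAGE) {
 *		goto page_hit;
 *	}
 *	lock_page(page);
 */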
1072 static int read_node_page(struct page *page, int op_flags)
1073 {
1074 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1075 	struct node_info ni;
1076 	struct f2fs_io_info fio = {
1077 		.sbi = sbi,
1078 		.type = NODE,
1079 		.op = REQ_OP_READ,
1080 		.op_flags = op_flags,
1081 		.page = page,
1082 		.encrypted_page = NULL,
1083 	};
1084 
1085 	if (PageUptodate(page))
1086 		return LOCKED_PAGE;
1087 
1088 	get_node_info(sbi, page->index, &ni);
1089 
1090 	if (unlikely(ni.blk_addr == NULL_ADDR)) {
1091 		ClearPageUptodate(page);
1092 		return -ENOENT;
1093 	}
1094 
1095 	fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
1096 	return f2fs_submit_page_bio(&fio);
1097 }
1098 
1099 /*
1100  * Readahead a node page
1101  */
1102 void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1103 {
1104 	struct page *apage;
1105 	int err;
1106 
1107 	if (!nid)
1108 		return;
1109 	f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1110 
1111 	rcu_read_lock();
1112 	apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
1113 	rcu_read_unlock();
1114 	if (apage)
1115 		return;
1116 
1117 	apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1118 	if (!apage)
1119 		return;
1120 
1121 	err = read_node_page(apage, REQ_RAHEAD);
1122 	f2fs_put_page(apage, err ? 1 : 0);
1123 }
1124 
1125 static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
1126 					struct page *parent, int start)
1127 {
1128 	struct page *page;
1129 	int err;
1130 
1131 	if (!nid)
1132 		return ERR_PTR(-ENOENT);
1133 	f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1134 repeat:
1135 	page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1136 	if (!page)
1137 		return ERR_PTR(-ENOMEM);
1138 
1139 	err = read_node_page(page, READ_SYNC);
1140 	if (err < 0) {
1141 		f2fs_put_page(page, 1);
1142 		return ERR_PTR(err);
1143 	} else if (err == LOCKED_PAGE) {
1144 		goto page_hit;
1145 	}
1146 
1147 	if (parent)
1148 		ra_node_pages(parent, start + 1, MAX_RA_NODE);
1149 
1150 	lock_page(page);
1151 
1152 	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1153 		f2fs_put_page(page, 1);
1154 		goto repeat;
1155 	}
1156 
1157 	if (unlikely(!PageUptodate(page)))
1158 		goto out_err;
1159 page_hit:
1160 	if (unlikely(nid != nid_of_node(page))) {
1161 		f2fs_bug_on(sbi, 1);
1162 		ClearPageUptodate(page);
1163 out_err:
1164 		f2fs_put_page(page, 1);
1165 		return ERR_PTR(-EIO);
1166 	}
1167 	return page;
1168 }
1169 
1170 struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
1171 {
1172 	return __get_node_page(sbi, nid, NULL, 0);
1173 }
1174 
1175 struct page *get_node_page_ra(struct page *parent, int start)
1176 {
1177 	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
1178 	nid_t nid = get_nid(parent, start, false);
1179 
1180 	return __get_node_page(sbi, nid, parent, start);
1181 }
1182 
1183 static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
1184 {
1185 	struct inode *inode;
1186 	struct page *page;
1187 	int ret;
1188 
1189 	/* should flush inline_data before evict_inode */
1190 	inode = ilookup(sbi->sb, ino);
1191 	if (!inode)
1192 		return;
1193 
1194 	page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
1195 	if (!page)
1196 		goto iput_out;
1197 
1198 	if (!PageUptodate(page))
1199 		goto page_out;
1200 
1201 	if (!PageDirty(page))
1202 		goto page_out;
1203 
1204 	if (!clear_page_dirty_for_io(page))
1205 		goto page_out;
1206 
1207 	ret = f2fs_write_inline_data(inode, page);
1208 	inode_dec_dirty_pages(inode);
1209 	if (ret)
1210 		set_page_dirty(page);
1211 page_out:
1212 	f2fs_put_page(page, 1);
1213 iput_out:
1214 	iput(inode);
1215 }
1216 
1217 void move_node_page(struct page *node_page, int gc_type)
1218 {
1219 	if (gc_type == FG_GC) {
1220 		struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
1221 		struct writeback_control wbc = {
1222 			.sync_mode = WB_SYNC_ALL,
1223 			.nr_to_write = 1,
1224 			.for_reclaim = 0,
1225 		};
1226 
1227 		set_page_dirty(node_page);
1228 		f2fs_wait_on_page_writeback(node_page, NODE, true);
1229 
1230 		f2fs_bug_on(sbi, PageWriteback(node_page));
1231 		if (!clear_page_dirty_for_io(node_page))
1232 			goto out_page;
1233 
1234 		if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
1235 			unlock_page(node_page);
1236 		goto release_page;
1237 	} else {
1238 		/* set page dirty and write it */
1239 		if (!PageWriteback(node_page))
1240 			set_page_dirty(node_page);
1241 	}
1242 out_page:
1243 	unlock_page(node_page);
1244 release_page:
1245 	f2fs_put_page(node_page, 0);
1246 }
1247 
1248 static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
1249 {
1250 	pgoff_t index, end;
1251 	struct pagevec pvec;
1252 	struct page *last_page = NULL;
1253 
1254 	pagevec_init(&pvec, 0);
1255 	index = 0;
1256 	end = ULONG_MAX;
1257 
1258 	while (index <= end) {
1259 		int i, nr_pages;
1260 		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1261 				PAGECACHE_TAG_DIRTY,
1262 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1263 		if (nr_pages == 0)
1264 			break;
1265 
1266 		for (i = 0; i < nr_pages; i++) {
1267 			struct page *page = pvec.pages[i];
1268 
1269 			if (unlikely(f2fs_cp_error(sbi))) {
1270 				f2fs_put_page(last_page, 0);
1271 				pagevec_release(&pvec);
1272 				return ERR_PTR(-EIO);
1273 			}
1274 
1275 			if (!IS_DNODE(page) || !is_cold_node(page))
1276 				continue;
1277 			if (ino_of_node(page) != ino)
1278 				continue;
1279 
1280 			lock_page(page);
1281 
1282 			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1283 continue_unlock:
1284 				unlock_page(page);
1285 				continue;
1286 			}
1287 			if (ino_of_node(page) != ino)
1288 				goto continue_unlock;
1289 
1290 			if (!PageDirty(page)) {
1291 				/* someone wrote it for us */
1292 				goto continue_unlock;
1293 			}
1294 
1295 			if (last_page)
1296 				f2fs_put_page(last_page, 0);
1297 
1298 			get_page(page);
1299 			last_page = page;
1300 			unlock_page(page);
1301 		}
1302 		pagevec_release(&pvec);
1303 		cond_resched();
1304 	}
1305 	return last_page;
1306 }
1307 
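/*
 * Write back the dirty dnode pages that belong to @inode for fsync.
 * When @atomic is true, only the last dnode (found by last_fsync_dnode())
 * carries the fsync mark; otherwise every dnode written here is marked.
 */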
1308 int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
1309 			struct writeback_control *wbc, bool atomic)
1310 {
1311 	pgoff_t index, end;
1312 	struct pagevec pvec;
1313 	int ret = 0;
1314 	struct page *last_page = NULL;
1315 	bool marked = false;
1316 	nid_t ino = inode->i_ino;
1317 
1318 	if (atomic) {
1319 		last_page = last_fsync_dnode(sbi, ino);
1320 		if (IS_ERR_OR_NULL(last_page))
1321 			return PTR_ERR_OR_ZERO(last_page);
1322 	}
1323 retry:
1324 	pagevec_init(&pvec, 0);
1325 	index = 0;
1326 	end = ULONG_MAX;
1327 
1328 	while (index <= end) {
1329 		int i, nr_pages;
1330 		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1331 				PAGECACHE_TAG_DIRTY,
1332 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1333 		if (nr_pages == 0)
1334 			break;
1335 
1336 		for (i = 0; i < nr_pages; i++) {
1337 			struct page *page = pvec.pages[i];
1338 
1339 			if (unlikely(f2fs_cp_error(sbi))) {
1340 				f2fs_put_page(last_page, 0);
1341 				pagevec_release(&pvec);
1342 				return -EIO;
1343 			}
1344 
1345 			if (!IS_DNODE(page) || !is_cold_node(page))
1346 				continue;
1347 			if (ino_of_node(page) != ino)
1348 				continue;
1349 
1350 			lock_page(page);
1351 
1352 			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1353 continue_unlock:
1354 				unlock_page(page);
1355 				continue;
1356 			}
1357 			if (ino_of_node(page) != ino)
1358 				goto continue_unlock;
1359 
1360 			if (!PageDirty(page) && page != last_page) {
1361 				/* someone wrote it for us */
1362 				goto continue_unlock;
1363 			}
1364 
1365 			f2fs_wait_on_page_writeback(page, NODE, true);
1366 			BUG_ON(PageWriteback(page));
1367 
1368 			if (!atomic || page == last_page) {
1369 				set_fsync_mark(page, 1);
1370 				if (IS_INODE(page)) {
1371 					if (is_inode_flag_set(inode,
1372 								FI_DIRTY_INODE))
1373 						update_inode(inode, page);
1374 					set_dentry_mark(page,
1375 						need_dentry_mark(sbi, ino));
1376 				}
1377 				/* may be written by another thread */
1378 				if (!PageDirty(page))
1379 					set_page_dirty(page);
1380 			}
1381 
1382 			if (!clear_page_dirty_for_io(page))
1383 				goto continue_unlock;
1384 
1385 			ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
1386 			if (ret) {
1387 				unlock_page(page);
1388 				f2fs_put_page(last_page, 0);
1389 				break;
1390 			}
1391 			if (page == last_page) {
1392 				f2fs_put_page(page, 0);
1393 				marked = true;
1394 				break;
1395 			}
1396 		}
1397 		pagevec_release(&pvec);
1398 		cond_resched();
1399 
1400 		if (ret || marked)
1401 			break;
1402 	}
1403 	if (!ret && atomic && !marked) {
1404 		f2fs_msg(sbi->sb, KERN_DEBUG,
1405 			"Retry to write fsync mark: ino=%u, idx=%lx",
1406 					ino, last_page->index);
1407 		lock_page(last_page);
1408 		set_page_dirty(last_page);
1409 		unlock_page(last_page);
1410 		goto retry;
1411 	}
1412 	return ret ? -EIO : 0;
1413 }
1414 
1415 int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
1416 {
1417 	pgoff_t index, end;
1418 	struct pagevec pvec;
1419 	int step = 0;
1420 	int nwritten = 0;
1421 
1422 	pagevec_init(&pvec, 0);
1423 
1424 next_step:
1425 	index = 0;
1426 	end = ULONG_MAX;
1427 
1428 	while (index <= end) {
1429 		int i, nr_pages;
1430 		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1431 				PAGECACHE_TAG_DIRTY,
1432 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1433 		if (nr_pages == 0)
1434 			break;
1435 
1436 		for (i = 0; i < nr_pages; i++) {
1437 			struct page *page = pvec.pages[i];
1438 
1439 			if (unlikely(f2fs_cp_error(sbi))) {
1440 				pagevec_release(&pvec);
1441 				return -EIO;
1442 			}
1443 
1444 			/*
1445 			 * flushing sequence with step:
1446 			 * 0. indirect nodes
1447 			 * 1. dentry dnodes
1448 			 * 2. file dnodes
1449 			 */
1450 			if (step == 0 && IS_DNODE(page))
1451 				continue;
1452 			if (step == 1 && (!IS_DNODE(page) ||
1453 						is_cold_node(page)))
1454 				continue;
1455 			if (step == 2 && (!IS_DNODE(page) ||
1456 						!is_cold_node(page)))
1457 				continue;
1458 lock_node:
1459 			if (!trylock_page(page))
1460 				continue;
1461 
1462 			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1463 continue_unlock:
1464 				unlock_page(page);
1465 				continue;
1466 			}
1467 
1468 			if (!PageDirty(page)) {
1469 				/* someone wrote it for us */
1470 				goto continue_unlock;
1471 			}
1472 
1473 			/* flush inline_data */
1474 			if (is_inline_node(page)) {
1475 				clear_inline_node(page);
1476 				unlock_page(page);
1477 				flush_inline_data(sbi, ino_of_node(page));
1478 				goto lock_node;
1479 			}
1480 
1481 			f2fs_wait_on_page_writeback(page, NODE, true);
1482 
1483 			BUG_ON(PageWriteback(page));
1484 			if (!clear_page_dirty_for_io(page))
1485 				goto continue_unlock;
1486 
1487 			set_fsync_mark(page, 0);
1488 			set_dentry_mark(page, 0);
1489 
1490 			if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
1491 				unlock_page(page);
1492 
1493 			if (--wbc->nr_to_write == 0)
1494 				break;
1495 		}
1496 		pagevec_release(&pvec);
1497 		cond_resched();
1498 
1499 		if (wbc->nr_to_write == 0) {
1500 			step = 2;
1501 			break;
1502 		}
1503 	}
1504 
1505 	if (step < 2) {
1506 		step++;
1507 		goto next_step;
1508 	}
1509 	return nwritten;
1510 }
1511 
1512 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1513 {
1514 	pgoff_t index = 0, end = ULONG_MAX;
1515 	struct pagevec pvec;
1516 	int ret2 = 0, ret = 0;
1517 
1518 	pagevec_init(&pvec, 0);
1519 
1520 	while (index <= end) {
1521 		int i, nr_pages;
1522 		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1523 				PAGECACHE_TAG_WRITEBACK,
1524 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1525 		if (nr_pages == 0)
1526 			break;
1527 
1528 		for (i = 0; i < nr_pages; i++) {
1529 			struct page *page = pvec.pages[i];
1530 
1531 			/* until radix tree lookup accepts end_index */
1532 			if (unlikely(page->index > end))
1533 				continue;
1534 
1535 			if (ino && ino_of_node(page) == ino) {
1536 				f2fs_wait_on_page_writeback(page, NODE, true);
1537 				if (TestClearPageError(page))
1538 					ret = -EIO;
1539 			}
1540 		}
1541 		pagevec_release(&pvec);
1542 		cond_resched();
1543 	}
1544 
1545 	if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
1546 		ret2 = -ENOSPC;
1547 	if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
1548 		ret2 = -EIO;
1549 	if (!ret)
1550 		ret = ret2;
1551 	return ret;
1552 }
1553 
1554 static int f2fs_write_node_page(struct page *page,
1555 				struct writeback_control *wbc)
1556 {
1557 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1558 	nid_t nid;
1559 	struct node_info ni;
1560 	struct f2fs_io_info fio = {
1561 		.sbi = sbi,
1562 		.type = NODE,
1563 		.op = REQ_OP_WRITE,
1564 		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
1565 		.page = page,
1566 		.encrypted_page = NULL,
1567 	};
1568 
1569 	trace_f2fs_writepage(page, NODE);
1570 
1571 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1572 		goto redirty_out;
1573 	if (unlikely(f2fs_cp_error(sbi)))
1574 		goto redirty_out;
1575 
1576 	/* get old block addr of this node page */
1577 	nid = nid_of_node(page);
1578 	f2fs_bug_on(sbi, page->index != nid);
1579 
1580 	if (wbc->for_reclaim) {
1581 		if (!down_read_trylock(&sbi->node_write))
1582 			goto redirty_out;
1583 	} else {
1584 		down_read(&sbi->node_write);
1585 	}
1586 
1587 	get_node_info(sbi, nid, &ni);
1588 
1589 	/* This page is already truncated */
1590 	if (unlikely(ni.blk_addr == NULL_ADDR)) {
1591 		ClearPageUptodate(page);
1592 		dec_page_count(sbi, F2FS_DIRTY_NODES);
1593 		up_read(&sbi->node_write);
1594 		unlock_page(page);
1595 		return 0;
1596 	}
1597 
1598 	set_page_writeback(page);
1599 	fio.old_blkaddr = ni.blk_addr;
1600 	write_node_page(nid, &fio);
1601 	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
1602 	dec_page_count(sbi, F2FS_DIRTY_NODES);
1603 	up_read(&sbi->node_write);
1604 
1605 	if (wbc->for_reclaim)
1606 		f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE);
1607 
1608 	unlock_page(page);
1609 
1610 	if (unlikely(f2fs_cp_error(sbi)))
1611 		f2fs_submit_merged_bio(sbi, NODE, WRITE);
1612 
1613 	return 0;
1614 
1615 redirty_out:
1616 	redirty_page_for_writepage(wbc, page);
1617 	return AOP_WRITEPAGE_ACTIVATE;
1618 }
1619 
1620 static int f2fs_write_node_pages(struct address_space *mapping,
1621 			    struct writeback_control *wbc)
1622 {
1623 	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1624 	struct blk_plug plug;
1625 	long diff;
1626 
1627 	/* balancing f2fs's metadata in background */
1628 	f2fs_balance_fs_bg(sbi);
1629 
1630 	/* collect a number of dirty node pages and write together */
1631 	if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1632 		goto skip_write;
1633 
1634 	trace_f2fs_writepages(mapping->host, wbc, NODE);
1635 
1636 	diff = nr_pages_to_write(sbi, NODE, wbc);
1637 	wbc->sync_mode = WB_SYNC_NONE;
1638 	blk_start_plug(&plug);
1639 	sync_node_pages(sbi, wbc);
1640 	blk_finish_plug(&plug);
1641 	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
1642 	return 0;
1643 
1644 skip_write:
1645 	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1646 	trace_f2fs_writepages(mapping->host, wbc, NODE);
1647 	return 0;
1648 }
1649 
1650 static int f2fs_set_node_page_dirty(struct page *page)
1651 {
1652 	trace_f2fs_set_page_dirty(page, NODE);
1653 
1654 	if (!PageUptodate(page))
1655 		SetPageUptodate(page);
1656 	if (!PageDirty(page)) {
1657 		f2fs_set_page_dirty_nobuffers(page);
1658 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1659 		SetPagePrivate(page);
1660 		f2fs_trace_pid(page);
1661 		return 1;
1662 	}
1663 	return 0;
1664 }
1665 
1666 /*
1667  * Structure of the f2fs node operations
1668  */
1669 const struct address_space_operations f2fs_node_aops = {
1670 	.writepage	= f2fs_write_node_page,
1671 	.writepages	= f2fs_write_node_pages,
1672 	.set_page_dirty	= f2fs_set_node_page_dirty,
1673 	.invalidatepage	= f2fs_invalidate_page,
1674 	.releasepage	= f2fs_release_page,
1675 };
1676 
1677 static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
1678 						nid_t n)
1679 {
1680 	return radix_tree_lookup(&nm_i->free_nid_root, n);
1681 }
1682 
1683 static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
1684 						struct free_nid *i)
1685 {
1686 	list_del(&i->list);
1687 	radix_tree_delete(&nm_i->free_nid_root, i->nid);
1688 }
1689 
1690 static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
1691 {
1692 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1693 	struct free_nid *i;
1694 	struct nat_entry *ne;
1695 
1696 	if (!available_free_memory(sbi, FREE_NIDS))
1697 		return -1;
1698 
1699 	/* 0 nid should not be used */
1700 	if (unlikely(nid == 0))
1701 		return 0;
1702 
1703 	if (build) {
1704 		/* do not add allocated nids */
1705 		ne = __lookup_nat_cache(nm_i, nid);
1706 		if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1707 				nat_get_blkaddr(ne) != NULL_ADDR))
1708 			return 0;
1709 	}
1710 
1711 	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
1712 	i->nid = nid;
1713 	i->state = NID_NEW;
1714 
1715 	if (radix_tree_preload(GFP_NOFS)) {
1716 		kmem_cache_free(free_nid_slab, i);
1717 		return 0;
1718 	}
1719 
1720 	spin_lock(&nm_i->free_nid_list_lock);
1721 	if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
1722 		spin_unlock(&nm_i->free_nid_list_lock);
1723 		radix_tree_preload_end();
1724 		kmem_cache_free(free_nid_slab, i);
1725 		return 0;
1726 	}
1727 	list_add_tail(&i->list, &nm_i->free_nid_list);
1728 	nm_i->fcnt++;
1729 	spin_unlock(&nm_i->free_nid_list_lock);
1730 	radix_tree_preload_end();
1731 	return 1;
1732 }
1733 
1734 static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1735 {
1736 	struct free_nid *i;
1737 	bool need_free = false;
1738 
1739 	spin_lock(&nm_i->free_nid_list_lock);
1740 	i = __lookup_free_nid_list(nm_i, nid);
1741 	if (i && i->state == NID_NEW) {
1742 		__del_from_free_nid_list(nm_i, i);
1743 		nm_i->fcnt--;
1744 		need_free = true;
1745 	}
1746 	spin_unlock(&nm_i->free_nid_list_lock);
1747 
1748 	if (need_free)
1749 		kmem_cache_free(free_nid_slab, i);
1750 }
1751 
1752 static void scan_nat_page(struct f2fs_sb_info *sbi,
1753 			struct page *nat_page, nid_t start_nid)
1754 {
1755 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1756 	struct f2fs_nat_block *nat_blk = page_address(nat_page);
1757 	block_t blk_addr;
1758 	int i;
1759 
1760 	i = start_nid % NAT_ENTRY_PER_BLOCK;
1761 
1762 	for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1763 
1764 		if (unlikely(start_nid >= nm_i->max_nid))
1765 			break;
1766 
1767 		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1768 		f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1769 		if (blk_addr == NULL_ADDR) {
1770 			if (add_free_nid(sbi, start_nid, true) < 0)
1771 				break;
1772 		}
1773 	}
1774 }
1775 
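/*
 * Scan up to FREE_NID_PAGES NAT blocks starting from next_scan_nid, plus
 * the NAT entries cached in the hot data journal, and add every nid whose
 * block address is NULL_ADDR to the free nid list.
 */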
1776 void build_free_nids(struct f2fs_sb_info *sbi)
1777 {
1778 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1779 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1780 	struct f2fs_journal *journal = curseg->journal;
1781 	int i = 0;
1782 	nid_t nid = nm_i->next_scan_nid;
1783 
1784 	/* Enough entries */
1785 	if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK)
1786 		return;
1787 
1788 	/* readahead nat pages to be scanned */
1789 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
1790 							META_NAT, true);
1791 
1792 	down_read(&nm_i->nat_tree_lock);
1793 
1794 	while (1) {
1795 		struct page *page = get_current_nat_page(sbi, nid);
1796 
1797 		scan_nat_page(sbi, page, nid);
1798 		f2fs_put_page(page, 1);
1799 
1800 		nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1801 		if (unlikely(nid >= nm_i->max_nid))
1802 			nid = 0;
1803 
1804 		if (++i >= FREE_NID_PAGES)
1805 			break;
1806 	}
1807 
1808 	/* go to the next free nat pages to find free nids abundantly */
1809 	nm_i->next_scan_nid = nid;
1810 
1811 	/* find free nids from current sum_pages */
1812 	down_read(&curseg->journal_rwsem);
1813 	for (i = 0; i < nats_in_cursum(journal); i++) {
1814 		block_t addr;
1815 
1816 		addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
1817 		nid = le32_to_cpu(nid_in_journal(journal, i));
1818 		if (addr == NULL_ADDR)
1819 			add_free_nid(sbi, nid, true);
1820 		else
1821 			remove_free_nid(nm_i, nid);
1822 	}
1823 	up_read(&curseg->journal_rwsem);
1824 	up_read(&nm_i->nat_tree_lock);
1825 
1826 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
1827 					nm_i->ra_nid_pages, META_NAT, false);
1828 }
1829 
1830 /*
1831  * If this function returns success, the caller can obtain a new nid
1832  * from the second parameter of this function.  The returned nid can be
1833  * used as an ino as well as a nid when an inode is created.
1834  */
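/*
 * Typical calling pattern (illustrative sketch, as used by
 * get_dnode_of_data() above):
 *
 *	if (!alloc_nid(sbi, &nid))
 *		return -ENOSPC;
 *	dn->nid = nid;
 *	page = new_node_page(dn, ofs, NULL);
 *	if (IS_ERR(page)) {
 *		alloc_nid_failed(sbi, nid);
 *		return PTR_ERR(page);
 *	}
 *	alloc_nid_done(sbi, nid);
 */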
1835 bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1836 {
1837 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1838 	struct free_nid *i = NULL;
1839 retry:
1840 #ifdef CONFIG_F2FS_FAULT_INJECTION
1841 	if (time_to_inject(FAULT_ALLOC_NID))
1842 		return false;
1843 #endif
1844 	if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
1845 		return false;
1846 
1847 	spin_lock(&nm_i->free_nid_list_lock);
1848 
1849 	/* We should not use stale free nids created by build_free_nids */
1850 	if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
1851 		f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
1852 		list_for_each_entry(i, &nm_i->free_nid_list, list)
1853 			if (i->state == NID_NEW)
1854 				break;
1855 
1856 		f2fs_bug_on(sbi, i->state != NID_NEW);
1857 		*nid = i->nid;
1858 		i->state = NID_ALLOC;
1859 		nm_i->fcnt--;
1860 		spin_unlock(&nm_i->free_nid_list_lock);
1861 		return true;
1862 	}
1863 	spin_unlock(&nm_i->free_nid_list_lock);
1864 
1865 	/* Let's scan nat pages and its caches to get free nids */
1866 	mutex_lock(&nm_i->build_lock);
1867 	build_free_nids(sbi);
1868 	mutex_unlock(&nm_i->build_lock);
1869 	goto retry;
1870 }
1871 
1872 /*
1873  * alloc_nid() should be called prior to this function.
1874  */
1875 void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1876 {
1877 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1878 	struct free_nid *i;
1879 
1880 	spin_lock(&nm_i->free_nid_list_lock);
1881 	i = __lookup_free_nid_list(nm_i, nid);
1882 	f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1883 	__del_from_free_nid_list(nm_i, i);
1884 	spin_unlock(&nm_i->free_nid_list_lock);
1885 
1886 	kmem_cache_free(free_nid_slab, i);
1887 }
1888 
1889 /*
1890  * alloc_nid() should be called prior to this function.
1891  */
1892 void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1893 {
1894 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1895 	struct free_nid *i;
1896 	bool need_free = false;
1897 
1898 	if (!nid)
1899 		return;
1900 
1901 	spin_lock(&nm_i->free_nid_list_lock);
1902 	i = __lookup_free_nid_list(nm_i, nid);
1903 	f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1904 	if (!available_free_memory(sbi, FREE_NIDS)) {
1905 		__del_from_free_nid_list(nm_i, i);
1906 		need_free = true;
1907 	} else {
1908 		i->state = NID_NEW;
1909 		nm_i->fcnt++;
1910 	}
1911 	spin_unlock(&nm_i->free_nid_list_lock);
1912 
1913 	if (need_free)
1914 		kmem_cache_free(free_nid_slab, i);
1915 }
1916 
1917 int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
1918 {
1919 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1920 	struct free_nid *i, *next;
1921 	int nr = nr_shrink;
1922 
1923 	if (nm_i->fcnt <= MAX_FREE_NIDS)
1924 		return 0;
1925 
1926 	if (!mutex_trylock(&nm_i->build_lock))
1927 		return 0;
1928 
1929 	spin_lock(&nm_i->free_nid_list_lock);
1930 	list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
1931 		if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS)
1932 			break;
1933 		if (i->state == NID_ALLOC)
1934 			continue;
1935 		__del_from_free_nid_list(nm_i, i);
1936 		kmem_cache_free(free_nid_slab, i);
1937 		nm_i->fcnt--;
1938 		nr_shrink--;
1939 	}
1940 	spin_unlock(&nm_i->free_nid_list_lock);
1941 	mutex_unlock(&nm_i->build_lock);
1942 
1943 	return nr - nr_shrink;
1944 }
1945 
1946 void recover_inline_xattr(struct inode *inode, struct page *page)
1947 {
1948 	void *src_addr, *dst_addr;
1949 	size_t inline_size;
1950 	struct page *ipage;
1951 	struct f2fs_inode *ri;
1952 
1953 	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
1954 	f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
1955 
1956 	ri = F2FS_INODE(page);
1957 	if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
1958 		clear_inode_flag(inode, FI_INLINE_XATTR);
1959 		goto update_inode;
1960 	}
1961 
1962 	dst_addr = inline_xattr_addr(ipage);
1963 	src_addr = inline_xattr_addr(page);
1964 	inline_size = inline_xattr_size(inode);
1965 
1966 	f2fs_wait_on_page_writeback(ipage, NODE, true);
1967 	memcpy(dst_addr, src_addr, inline_size);
1968 update_inode:
1969 	update_inode(inode, ipage);
1970 	f2fs_put_page(ipage, 1);
1971 }
1972 
1973 void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1974 {
1975 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1976 	nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
1977 	nid_t new_xnid = nid_of_node(page);
1978 	struct node_info ni;
1979 
1980 	/* 1: invalidate the previous xattr nid */
1981 	if (!prev_xnid)
1982 		goto recover_xnid;
1983 
1984 	/* Deallocate node address */
1985 	get_node_info(sbi, prev_xnid, &ni);
1986 	f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
1987 	invalidate_blocks(sbi, ni.blk_addr);
1988 	dec_valid_node_count(sbi, inode);
1989 	set_node_addr(sbi, &ni, NULL_ADDR, false);
1990 
1991 recover_xnid:
1992 	/* 2: allocate new xattr nid */
1993 	if (unlikely(!inc_valid_node_count(sbi, inode)))
1994 		f2fs_bug_on(sbi, 1);
1995 
1996 	remove_free_nid(NM_I(sbi), new_xnid);
1997 	get_node_info(sbi, new_xnid, &ni);
1998 	ni.ino = inode->i_ino;
1999 	set_node_addr(sbi, &ni, NEW_ADDR, false);
2000 	f2fs_i_xnid_write(inode, new_xnid);
2001 
2002 	/* 3: update xattr blkaddr */
2003 	refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
2004 	set_node_addr(sbi, &ni, blkaddr, false);
2005 }
2006 
2007 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
2008 {
2009 	struct f2fs_inode *src, *dst;
2010 	nid_t ino = ino_of_node(page);
2011 	struct node_info old_ni, new_ni;
2012 	struct page *ipage;
2013 
2014 	get_node_info(sbi, ino, &old_ni);
2015 
2016 	if (unlikely(old_ni.blk_addr != NULL_ADDR))
2017 		return -EINVAL;
2018 
2019 	ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
2020 	if (!ipage)
2021 		return -ENOMEM;
2022 
2023 	/* this ino must not be handed out again from the free nid list */
2024 	remove_free_nid(NM_I(sbi), ino);
2025 
2026 	if (!PageUptodate(ipage))
2027 		SetPageUptodate(ipage);
2028 	fill_node_footer(ipage, ino, ino, 0, true);
2029 
2030 	src = F2FS_INODE(page);
2031 	dst = F2FS_INODE(ipage);
2032 
2033 	memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
2034 	dst->i_size = 0;
2035 	dst->i_blocks = cpu_to_le64(1);
2036 	dst->i_links = cpu_to_le32(1);
2037 	dst->i_xattr_nid = 0;
2038 	dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
2039 
2040 	new_ni = old_ni;
2041 	new_ni.ino = ino;
2042 
2043 	if (unlikely(!inc_valid_node_count(sbi, NULL)))
2044 		WARN_ON(1);
2045 	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
2046 	inc_valid_inode_count(sbi);
2047 	set_page_dirty(ipage);
2048 	f2fs_put_page(ipage, 1);
2049 	return 0;
2050 }
2051 
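/*
 * Rebuild the node summary block for @segno by reading every node block in
 * the segment (with readahead in MAX_BIO_BLOCKS-sized batches) and copying
 * each block's nid from its footer into the summary entries.
 */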
2052 int restore_node_summary(struct f2fs_sb_info *sbi,
2053 			unsigned int segno, struct f2fs_summary_block *sum)
2054 {
2055 	struct f2fs_node *rn;
2056 	struct f2fs_summary *sum_entry;
2057 	block_t addr;
2058 	int bio_blocks = MAX_BIO_BLOCKS(sbi);
2059 	int i, idx, last_offset, nrpages;
2060 
2061 	/* scan the node segment */
2062 	last_offset = sbi->blocks_per_seg;
2063 	addr = START_BLOCK(sbi, segno);
2064 	sum_entry = &sum->entries[0];
2065 
2066 	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
2067 		nrpages = min(last_offset - i, bio_blocks);
2068 
2069 		/* readahead node pages */
2070 		ra_meta_pages(sbi, addr, nrpages, META_POR, true);
2071 
2072 		for (idx = addr; idx < addr + nrpages; idx++) {
2073 			struct page *page = get_tmp_page(sbi, idx);
2074 
2075 			rn = F2FS_NODE(page);
2076 			sum_entry->nid = rn->footer.nid;
2077 			sum_entry->version = 0;
2078 			sum_entry->ofs_in_node = 0;
2079 			sum_entry++;
2080 			f2fs_put_page(page, 1);
2081 		}
2082 
2083 		invalidate_mapping_pages(META_MAPPING(sbi), addr,
2084 							addr + nrpages);
2085 	}
2086 	return 0;
2087 }
2088 
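/*
 * Drain the NAT journal kept in the hot data curseg: every journalled entry
 * is (re)inserted into the NAT cache and marked dirty, so it ends up in a
 * dirty NAT entry set, and the journal itself is emptied.
 */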
2089 static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
2090 {
2091 	struct f2fs_nm_info *nm_i = NM_I(sbi);
2092 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2093 	struct f2fs_journal *journal = curseg->journal;
2094 	int i;
2095 
2096 	down_write(&curseg->journal_rwsem);
2097 	for (i = 0; i < nats_in_cursum(journal); i++) {
2098 		struct nat_entry *ne;
2099 		struct f2fs_nat_entry raw_ne;
2100 		nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
2101 
2102 		raw_ne = nat_in_journal(journal, i);
2103 
2104 		ne = __lookup_nat_cache(nm_i, nid);
2105 		if (!ne) {
2106 			ne = grab_nat_entry(nm_i, nid);
2107 			node_info_from_raw_nat(&ne->ni, &raw_ne);
2108 		}
2109 		__set_nat_cache_dirty(nm_i, ne);
2110 	}
2111 	update_nats_in_cursum(journal, -i);
2112 	up_write(&curseg->journal_rwsem);
2113 }
2114 
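/*
 * Insert @nes into @head keeping the list sorted by ascending entry_cnt, so
 * that small sets (which may still fit into the journal) are flushed first.
 * Sets with at least @max entries go straight to the tail.
 */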
2115 static void __adjust_nat_entry_set(struct nat_entry_set *nes,
2116 						struct list_head *head, int max)
2117 {
2118 	struct nat_entry_set *cur;
2119 
2120 	if (nes->entry_cnt >= max)
2121 		goto add_out;
2122 
2123 	list_for_each_entry(cur, head, set_list) {
2124 		if (cur->entry_cnt >= nes->entry_cnt) {
2125 			list_add(&nes->set_list, cur->set_list.prev);
2126 			return;
2127 		}
2128 	}
2129 add_out:
2130 	list_add_tail(&nes->set_list, head);
2131 }
2132 
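/*
 * Write one dirty NAT entry set back: into the curseg journal when the
 * whole set still fits there, otherwise into the on-disk NAT block.
 * Entries that ended up at NULL_ADDR are returned to the free nid cache,
 * and the emptied set is removed from the radix tree and freed.
 */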
2133 static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
2134 					struct nat_entry_set *set)
2135 {
2136 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2137 	struct f2fs_journal *journal = curseg->journal;
2138 	nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
2139 	bool to_journal = true;
2140 	struct f2fs_nat_block *nat_blk;
2141 	struct nat_entry *ne, *cur;
2142 	struct page *page = NULL;
2143 
2144 	/*
2145 	 * there are two ways to flush nat entries:
2146 	 * #1, flush them into the journal in the current hot data summary block.
2147 	 * #2, flush them into the on-disk nat page.
2148 	 */
2149 	if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
2150 		to_journal = false;
2151 
2152 	if (to_journal) {
2153 		down_write(&curseg->journal_rwsem);
2154 	} else {
2155 		page = get_next_nat_page(sbi, start_nid);
2156 		nat_blk = page_address(page);
2157 		f2fs_bug_on(sbi, !nat_blk);
2158 	}
2159 
2160 	/* flush dirty nats in nat entry set */
2161 	list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
2162 		struct f2fs_nat_entry *raw_ne;
2163 		nid_t nid = nat_get_nid(ne);
2164 		int offset;
2165 
2166 		if (nat_get_blkaddr(ne) == NEW_ADDR)
2167 			continue;
2168 
2169 		if (to_journal) {
2170 			offset = lookup_journal_in_cursum(journal,
2171 							NAT_JOURNAL, nid, 1);
2172 			f2fs_bug_on(sbi, offset < 0);
2173 			raw_ne = &nat_in_journal(journal, offset);
2174 			nid_in_journal(journal, offset) = cpu_to_le32(nid);
2175 		} else {
2176 			raw_ne = &nat_blk->entries[nid - start_nid];
2177 		}
2178 		raw_nat_from_node_info(raw_ne, &ne->ni);
2179 		nat_reset_flag(ne);
2180 		__clear_nat_cache_dirty(NM_I(sbi), ne);
2181 		if (nat_get_blkaddr(ne) == NULL_ADDR)
2182 			add_free_nid(sbi, nid, false);
2183 	}
2184 
2185 	if (to_journal)
2186 		up_write(&curseg->journal_rwsem);
2187 	else
2188 		f2fs_put_page(page, 1);
2189 
2190 	f2fs_bug_on(sbi, set->entry_cnt);
2191 
2192 	radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
2193 	kmem_cache_free(nat_entry_set_slab, set);
2194 }
2195 
2196 /*
2197  * Called during the checkpointing process to write back all dirty nat entries.
2198  */
2199 void flush_nat_entries(struct f2fs_sb_info *sbi)
2200 {
2201 	struct f2fs_nm_info *nm_i = NM_I(sbi);
2202 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2203 	struct f2fs_journal *journal = curseg->journal;
2204 	struct nat_entry_set *setvec[SETVEC_SIZE];
2205 	struct nat_entry_set *set, *tmp;
2206 	unsigned int found;
2207 	nid_t set_idx = 0;
2208 	LIST_HEAD(sets);
2209 
2210 	if (!nm_i->dirty_nat_cnt)
2211 		return;
2212 
2213 	down_write(&nm_i->nat_tree_lock);
2214 
2215 	/*
2216 	 * if there is not enough space in the journal to store dirty nat
2217 	 * entries, remove all entries from the journal and merge them
2218 	 * into nat entry sets.
2219 	 */
2220 	if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
2221 		remove_nats_in_journal(sbi);
2222 
2223 	while ((found = __gang_lookup_nat_set(nm_i,
2224 					set_idx, SETVEC_SIZE, setvec))) {
2225 		unsigned idx;
2226 		set_idx = setvec[found - 1]->set + 1;
2227 		for (idx = 0; idx < found; idx++)
2228 			__adjust_nat_entry_set(setvec[idx], &sets,
2229 						MAX_NAT_JENTRIES(journal));
2230 	}
2231 
2232 	/* flush dirty nats in nat entry set */
2233 	list_for_each_entry_safe(set, tmp, &sets, set_list)
2234 		__flush_nat_entry_set(sbi, set);
2235 
2236 	up_write(&nm_i->nat_tree_lock);
2237 
2238 	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
2239 }
2240 
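/*
 * Read the NAT geometry from the raw superblock, derive max_nid and
 * available_nids from it, initialise the free nid and NAT caches together
 * with their locks, and take a private copy of the NAT version bitmap from
 * the checkpoint area.
 */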
2241 static int init_node_manager(struct f2fs_sb_info *sbi)
2242 {
2243 	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
2244 	struct f2fs_nm_info *nm_i = NM_I(sbi);
2245 	unsigned char *version_bitmap;
2246 	unsigned int nat_segs, nat_blocks;
2247 
2248 	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
2249 
2250 	/* segment_count_nat includes pair segments, so divide by 2. */
2251 	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
2252 	nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
2253 
2254 	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
2255 
2256 	/* nids that are never used: 0, node, meta (root is counted as a valid node) */
2257 	nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
2258 	nm_i->fcnt = 0;
2259 	nm_i->nat_cnt = 0;
2260 	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
2261 	nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
2262 	nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
2263 
2264 	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
2265 	INIT_LIST_HEAD(&nm_i->free_nid_list);
2266 	INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
2267 	INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
2268 	INIT_LIST_HEAD(&nm_i->nat_entries);
2269 
2270 	mutex_init(&nm_i->build_lock);
2271 	spin_lock_init(&nm_i->free_nid_list_lock);
2272 	init_rwsem(&nm_i->nat_tree_lock);
2273 
2274 	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
2275 	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
2276 	version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
2277 	if (!version_bitmap)
2278 		return -EFAULT;
2279 
2280 	nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
2281 					GFP_KERNEL);
2282 	if (!nm_i->nat_bitmap)
2283 		return -ENOMEM;
2284 	return 0;
2285 }
2286 
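/*
 * Allocate and initialise the per-sb node manager, then prime the free nid
 * cache so the first nid allocations do not have to scan NAT pages.
 */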
2287 int build_node_manager(struct f2fs_sb_info *sbi)
2288 {
2289 	int err;
2290 
2291 	sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
2292 	if (!sbi->nm_info)
2293 		return -ENOMEM;
2294 
2295 	err = init_node_manager(sbi);
2296 	if (err)
2297 		return err;
2298 
2299 	build_free_nids(sbi);
2300 	return 0;
2301 }
2302 
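/*
 * Tear down the node manager at umount: release every cached free nid,
 * every NAT cache entry and every (empty) NAT entry set, then free the NAT
 * bitmap copy and the nm_info structure itself.
 */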
2303 void destroy_node_manager(struct f2fs_sb_info *sbi)
2304 {
2305 	struct f2fs_nm_info *nm_i = NM_I(sbi);
2306 	struct free_nid *i, *next_i;
2307 	struct nat_entry *natvec[NATVEC_SIZE];
2308 	struct nat_entry_set *setvec[SETVEC_SIZE];
2309 	nid_t nid = 0;
2310 	unsigned int found;
2311 
2312 	if (!nm_i)
2313 		return;
2314 
2315 	/* destroy free nid list */
2316 	spin_lock(&nm_i->free_nid_list_lock);
2317 	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
2318 		f2fs_bug_on(sbi, i->state == NID_ALLOC);
2319 		__del_from_free_nid_list(nm_i, i);
2320 		nm_i->fcnt--;
2321 		spin_unlock(&nm_i->free_nid_list_lock);
2322 		kmem_cache_free(free_nid_slab, i);
2323 		spin_lock(&nm_i->free_nid_list_lock);
2324 	}
2325 	f2fs_bug_on(sbi, nm_i->fcnt);
2326 	spin_unlock(&nm_i->free_nid_list_lock);
2327 
2328 	/* destroy nat cache */
2329 	down_write(&nm_i->nat_tree_lock);
2330 	while ((found = __gang_lookup_nat_cache(nm_i,
2331 					nid, NATVEC_SIZE, natvec))) {
2332 		unsigned idx;
2333 
2334 		nid = nat_get_nid(natvec[found - 1]) + 1;
2335 		for (idx = 0; idx < found; idx++)
2336 			__del_from_nat_cache(nm_i, natvec[idx]);
2337 	}
2338 	f2fs_bug_on(sbi, nm_i->nat_cnt);
2339 
2340 	/* destroy nat set cache */
2341 	nid = 0;
2342 	while ((found = __gang_lookup_nat_set(nm_i,
2343 					nid, SETVEC_SIZE, setvec))) {
2344 		unsigned idx;
2345 
2346 		nid = setvec[found - 1]->set + 1;
2347 		for (idx = 0; idx < found; idx++) {
2348 			/* entry_cnt is not zero when a cp_error has occurred */
2349 			f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
2350 			radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
2351 			kmem_cache_free(nat_entry_set_slab, setvec[idx]);
2352 		}
2353 	}
2354 	up_write(&nm_i->nat_tree_lock);
2355 
2356 	kfree(nm_i->nat_bitmap);
2357 	sbi->nm_info = NULL;
2358 	kfree(nm_i);
2359 }
2360 
2361 int __init create_node_manager_caches(void)
2362 {
2363 	nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
2364 			sizeof(struct nat_entry));
2365 	if (!nat_entry_slab)
2366 		goto fail;
2367 
2368 	free_nid_slab = f2fs_kmem_cache_create("free_nid",
2369 			sizeof(struct free_nid));
2370 	if (!free_nid_slab)
2371 		goto destroy_nat_entry;
2372 
2373 	nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
2374 			sizeof(struct nat_entry_set));
2375 	if (!nat_entry_set_slab)
2376 		goto destroy_free_nid;
2377 	return 0;
2378 
2379 destroy_free_nid:
2380 	kmem_cache_destroy(free_nid_slab);
2381 destroy_nat_entry:
2382 	kmem_cache_destroy(nat_entry_slab);
2383 fail:
2384 	return -ENOMEM;
2385 }
2386 
2387 void destroy_node_manager_caches(void)
2388 {
2389 	kmem_cache_destroy(nat_entry_set_slab);
2390 	kmem_cache_destroy(free_nid_slab);
2391 	kmem_cache_destroy(nat_entry_slab);
2392 }
2393