xref: /openbmc/linux/fs/f2fs/node.c (revision 8a10bc9d)
1 /*
2  * fs/f2fs/node.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/f2fs_fs.h>
13 #include <linux/mpage.h>
14 #include <linux/backing-dev.h>
15 #include <linux/blkdev.h>
16 #include <linux/pagevec.h>
17 #include <linux/swap.h>
18 
19 #include "f2fs.h"
20 #include "node.h"
21 #include "segment.h"
22 #include <trace/events/f2fs.h>
23 
24 static struct kmem_cache *nat_entry_slab;
25 static struct kmem_cache *free_nid_slab;
26 
27 static void clear_node_page_dirty(struct page *page)
28 {
29 	struct address_space *mapping = page->mapping;
30 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
31 	unsigned long flags;
32 
33 	if (PageDirty(page)) {
34 		spin_lock_irqsave(&mapping->tree_lock, flags);
35 		radix_tree_tag_clear(&mapping->page_tree,
36 				page_index(page),
37 				PAGECACHE_TAG_DIRTY);
38 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
39 
40 		clear_page_dirty_for_io(page);
41 		dec_page_count(sbi, F2FS_DIRTY_NODES);
42 	}
43 	ClearPageUptodate(page);
44 }
45 
46 static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
47 {
48 	pgoff_t index = current_nat_addr(sbi, nid);
49 	return get_meta_page(sbi, index);
50 }
51 
52 static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
53 {
54 	struct page *src_page;
55 	struct page *dst_page;
56 	pgoff_t src_off;
57 	pgoff_t dst_off;
58 	void *src_addr;
59 	void *dst_addr;
60 	struct f2fs_nm_info *nm_i = NM_I(sbi);
61 
62 	src_off = current_nat_addr(sbi, nid);
63 	dst_off = next_nat_addr(sbi, src_off);
64 
65 	/* get current nat block page with lock */
66 	src_page = get_meta_page(sbi, src_off);
67 
68 	/* Dirty src_page means that it is already the new target NAT page. */
69 	if (PageDirty(src_page))
70 		return src_page;
71 
72 	dst_page = grab_meta_page(sbi, dst_off);
73 
74 	src_addr = page_address(src_page);
75 	dst_addr = page_address(dst_page);
76 	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
77 	set_page_dirty(dst_page);
78 	f2fs_put_page(src_page, 1);
79 
80 	set_to_next_nat(nm_i, nid);
81 
82 	return dst_page;
83 }
84 
85 /*
86  * Readahead NAT pages
87  */
88 static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
89 {
90 	struct address_space *mapping = META_MAPPING(sbi);
91 	struct f2fs_nm_info *nm_i = NM_I(sbi);
92 	struct page *page;
93 	pgoff_t index;
94 	int i;
95 	struct f2fs_io_info fio = {
96 		.type = META,
97 		.rw = READ_SYNC | REQ_META | REQ_PRIO
98 	};
99 
100 
101 	for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
102 		if (unlikely(nid >= nm_i->max_nid))
103 			nid = 0;
104 		index = current_nat_addr(sbi, nid);
105 
106 		page = grab_cache_page(mapping, index);
107 		if (!page)
108 			continue;
109 		if (PageUptodate(page)) {
110 			mark_page_accessed(page);
111 			f2fs_put_page(page, 1);
112 			continue;
113 		}
114 		f2fs_submit_page_mbio(sbi, page, index, &fio);
115 		mark_page_accessed(page);
116 		f2fs_put_page(page, 0);
117 	}
118 	f2fs_submit_merged_bio(sbi, META, READ);
119 }
120 
121 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
122 {
123 	return radix_tree_lookup(&nm_i->nat_root, n);
124 }
125 
126 static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
127 		nid_t start, unsigned int nr, struct nat_entry **ep)
128 {
129 	return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
130 }
131 
132 static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
133 {
134 	list_del(&e->list);
135 	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
136 	nm_i->nat_cnt--;
137 	kmem_cache_free(nat_entry_slab, e);
138 }
139 
140 int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
141 {
142 	struct f2fs_nm_info *nm_i = NM_I(sbi);
143 	struct nat_entry *e;
144 	int is_cp = 1;
145 
146 	read_lock(&nm_i->nat_tree_lock);
147 	e = __lookup_nat_cache(nm_i, nid);
148 	if (e && !e->checkpointed)
149 		is_cp = 0;
150 	read_unlock(&nm_i->nat_tree_lock);
151 	return is_cp;
152 }
153 
154 static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
155 {
156 	struct nat_entry *new;
157 
158 	new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
159 	if (!new)
160 		return NULL;
161 	if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
162 		kmem_cache_free(nat_entry_slab, new);
163 		return NULL;
164 	}
165 	memset(new, 0, sizeof(struct nat_entry));
166 	nat_set_nid(new, nid);
167 	list_add_tail(&new->list, &nm_i->nat_entries);
168 	nm_i->nat_cnt++;
169 	return new;
170 }
171 
172 static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
173 						struct f2fs_nat_entry *ne)
174 {
175 	struct nat_entry *e;
176 retry:
177 	write_lock(&nm_i->nat_tree_lock);
178 	e = __lookup_nat_cache(nm_i, nid);
179 	if (!e) {
180 		e = grab_nat_entry(nm_i, nid);
181 		if (!e) {
182 			write_unlock(&nm_i->nat_tree_lock);
183 			goto retry;
184 		}
185 		nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
186 		nat_set_ino(e, le32_to_cpu(ne->ino));
187 		nat_set_version(e, ne->version);
188 		e->checkpointed = true;
189 	}
190 	write_unlock(&nm_i->nat_tree_lock);
191 }
192 
193 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
194 			block_t new_blkaddr)
195 {
196 	struct f2fs_nm_info *nm_i = NM_I(sbi);
197 	struct nat_entry *e;
198 retry:
199 	write_lock(&nm_i->nat_tree_lock);
200 	e = __lookup_nat_cache(nm_i, ni->nid);
201 	if (!e) {
202 		e = grab_nat_entry(nm_i, ni->nid);
203 		if (!e) {
204 			write_unlock(&nm_i->nat_tree_lock);
205 			goto retry;
206 		}
207 		e->ni = *ni;
208 		e->checkpointed = true;
209 		f2fs_bug_on(ni->blk_addr == NEW_ADDR);
210 	} else if (new_blkaddr == NEW_ADDR) {
211 		/*
212 		 * when the nid is reallocated, the previous nat entry
213 		 * may remain in the nat cache, so reinitialize it
214 		 * with the new information.
215 		 */
216 		e->ni = *ni;
217 		f2fs_bug_on(ni->blk_addr != NULL_ADDR);
218 	}
219 
220 	if (new_blkaddr == NEW_ADDR)
221 		e->checkpointed = false;
222 
223 	/* sanity check */
224 	f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
225 	f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
226 			new_blkaddr == NULL_ADDR);
227 	f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
228 			new_blkaddr == NEW_ADDR);
229 	f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
230 			nat_get_blkaddr(e) != NULL_ADDR &&
231 			new_blkaddr == NEW_ADDR);
232 
233 	/* increment the version number as the node is removed */
234 	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
235 		unsigned char version = nat_get_version(e);
236 		nat_set_version(e, inc_node_version(version));
237 	}
238 
239 	/* change address */
240 	nat_set_blkaddr(e, new_blkaddr);
241 	__set_nat_cache_dirty(nm_i, e);
242 	write_unlock(&nm_i->nat_tree_lock);
243 }
244 
245 int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
246 {
247 	struct f2fs_nm_info *nm_i = NM_I(sbi);
248 
249 	if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD)
250 		return 0;
251 
252 	write_lock(&nm_i->nat_tree_lock);
253 	while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
254 		struct nat_entry *ne;
255 		ne = list_first_entry(&nm_i->nat_entries,
256 					struct nat_entry, list);
257 		__del_from_nat_cache(nm_i, ne);
258 		nr_shrink--;
259 	}
260 	write_unlock(&nm_i->nat_tree_lock);
261 	return nr_shrink;
262 }
263 
264 /*
265  * This function always succeeds.
266  */
267 void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
268 {
269 	struct f2fs_nm_info *nm_i = NM_I(sbi);
270 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
271 	struct f2fs_summary_block *sum = curseg->sum_blk;
272 	nid_t start_nid = START_NID(nid);
273 	struct f2fs_nat_block *nat_blk;
274 	struct page *page = NULL;
275 	struct f2fs_nat_entry ne;
276 	struct nat_entry *e;
277 	int i;
278 
279 	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
280 	ni->nid = nid;
281 
282 	/* Check nat cache */
283 	read_lock(&nm_i->nat_tree_lock);
284 	e = __lookup_nat_cache(nm_i, nid);
285 	if (e) {
286 		ni->ino = nat_get_ino(e);
287 		ni->blk_addr = nat_get_blkaddr(e);
288 		ni->version = nat_get_version(e);
289 	}
290 	read_unlock(&nm_i->nat_tree_lock);
291 	if (e)
292 		return;
293 
294 	/* Check current segment summary */
295 	mutex_lock(&curseg->curseg_mutex);
296 	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
297 	if (i >= 0) {
298 		ne = nat_in_journal(sum, i);
299 		node_info_from_raw_nat(ni, &ne);
300 	}
301 	mutex_unlock(&curseg->curseg_mutex);
302 	if (i >= 0)
303 		goto cache;
304 
305 	/* Fill node_info from nat page */
306 	page = get_current_nat_page(sbi, start_nid);
307 	nat_blk = (struct f2fs_nat_block *)page_address(page);
308 	ne = nat_blk->entries[nid - start_nid];
309 	node_info_from_raw_nat(ni, &ne);
310 	f2fs_put_page(page, 1);
311 cache:
312 	/* cache nat entry */
313 	cache_nat_entry(NM_I(sbi), nid, &ne);
314 }
315 
316 /*
317  * The maximum depth is four.
318  * offset[0] holds the offset within the raw inode.
319  */
320 static int get_node_path(struct f2fs_inode_info *fi, long block,
321 				int offset[4], unsigned int noffset[4])
322 {
323 	const long direct_index = ADDRS_PER_INODE(fi);
324 	const long direct_blks = ADDRS_PER_BLOCK;
325 	const long dptrs_per_blk = NIDS_PER_BLOCK;
326 	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
327 	const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
328 	int n = 0;
329 	int level = 0;
330 
331 	noffset[0] = 0;
332 
333 	if (block < direct_index) {
334 		offset[n] = block;
335 		goto got;
336 	}
337 	block -= direct_index;
338 	if (block < direct_blks) {
339 		offset[n++] = NODE_DIR1_BLOCK;
340 		noffset[n] = 1;
341 		offset[n] = block;
342 		level = 1;
343 		goto got;
344 	}
345 	block -= direct_blks;
346 	if (block < direct_blks) {
347 		offset[n++] = NODE_DIR2_BLOCK;
348 		noffset[n] = 2;
349 		offset[n] = block;
350 		level = 1;
351 		goto got;
352 	}
353 	block -= direct_blks;
354 	if (block < indirect_blks) {
355 		offset[n++] = NODE_IND1_BLOCK;
356 		noffset[n] = 3;
357 		offset[n++] = block / direct_blks;
358 		noffset[n] = 4 + offset[n - 1];
359 		offset[n] = block % direct_blks;
360 		level = 2;
361 		goto got;
362 	}
363 	block -= indirect_blks;
364 	if (block < indirect_blks) {
365 		offset[n++] = NODE_IND2_BLOCK;
366 		noffset[n] = 4 + dptrs_per_blk;
367 		offset[n++] = block / direct_blks;
368 		noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
369 		offset[n] = block % direct_blks;
370 		level = 2;
371 		goto got;
372 	}
373 	block -= indirect_blks;
374 	if (block < dindirect_blks) {
375 		offset[n++] = NODE_DIND_BLOCK;
376 		noffset[n] = 5 + (dptrs_per_blk * 2);
377 		offset[n++] = block / indirect_blks;
378 		noffset[n] = 6 + (dptrs_per_blk * 2) +
379 			      offset[n - 1] * (dptrs_per_blk + 1);
380 		offset[n++] = (block / direct_blks) % dptrs_per_blk;
381 		noffset[n] = 7 + (dptrs_per_blk * 2) +
382 			      offset[n - 2] * (dptrs_per_blk + 1) +
383 			      offset[n - 1];
384 		offset[n] = block % direct_blks;
385 		level = 3;
386 		goto got;
387 	} else {
388 		BUG();
389 	}
390 got:
391 	return level;
392 }
393 
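/*
 * Illustrative sketch, not part of the original file: a worked example of
 * get_node_path(), assuming the common 4KB-block layout where
 * ADDRS_PER_INODE(fi) is 923 and ADDRS_PER_BLOCK == NIDS_PER_BLOCK == 1018.
 * For logical block 2000: 2000 - 923 = 1077 lies past the first direct node
 * (1077 - 1018 = 59), so the path ends in the second direct node at
 * level 1 with offset[0] = NODE_DIR2_BLOCK, offset[1] = 59, noffset[1] = 2.
 */
#if 0	/* example only, never compiled; example_node_path() is hypothetical */
static void example_node_path(struct f2fs_inode_info *fi)
{
	int offset[4];
	unsigned int noffset[4];
	int level;

	/* map a file-relative block index to its node path */
	level = get_node_path(fi, 2000, offset, noffset);
	/* with the assumptions above: level == 1, offset[0] == NODE_DIR2_BLOCK */
}
#endif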
394 /*
395  * Caller should call f2fs_put_dnode(dn).
396  * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
397  * f2fs_unlock_op() unless ro is set to RDONLY_NODE.
398  * In the case of RDONLY_NODE, we don't need to care about the mutex.
399  */
400 int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
401 {
402 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
403 	struct page *npage[4];
404 	struct page *parent;
405 	int offset[4];
406 	unsigned int noffset[4];
407 	nid_t nids[4];
408 	int level, i;
409 	int err = 0;
410 
411 	level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);
412 
413 	nids[0] = dn->inode->i_ino;
414 	npage[0] = dn->inode_page;
415 
416 	if (!npage[0]) {
417 		npage[0] = get_node_page(sbi, nids[0]);
418 		if (IS_ERR(npage[0]))
419 			return PTR_ERR(npage[0]);
420 	}
421 	parent = npage[0];
422 	if (level != 0)
423 		nids[1] = get_nid(parent, offset[0], true);
424 	dn->inode_page = npage[0];
425 	dn->inode_page_locked = true;
426 
427 	/* get indirect or direct nodes */
428 	for (i = 1; i <= level; i++) {
429 		bool done = false;
430 
431 		if (!nids[i] && mode == ALLOC_NODE) {
432 			/* alloc new node */
433 			if (!alloc_nid(sbi, &(nids[i]))) {
434 				err = -ENOSPC;
435 				goto release_pages;
436 			}
437 
438 			dn->nid = nids[i];
439 			npage[i] = new_node_page(dn, noffset[i], NULL);
440 			if (IS_ERR(npage[i])) {
441 				alloc_nid_failed(sbi, nids[i]);
442 				err = PTR_ERR(npage[i]);
443 				goto release_pages;
444 			}
445 
446 			set_nid(parent, offset[i - 1], nids[i], i == 1);
447 			alloc_nid_done(sbi, nids[i]);
448 			done = true;
449 		} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
450 			npage[i] = get_node_page_ra(parent, offset[i - 1]);
451 			if (IS_ERR(npage[i])) {
452 				err = PTR_ERR(npage[i]);
453 				goto release_pages;
454 			}
455 			done = true;
456 		}
457 		if (i == 1) {
458 			dn->inode_page_locked = false;
459 			unlock_page(parent);
460 		} else {
461 			f2fs_put_page(parent, 1);
462 		}
463 
464 		if (!done) {
465 			npage[i] = get_node_page(sbi, nids[i]);
466 			if (IS_ERR(npage[i])) {
467 				err = PTR_ERR(npage[i]);
468 				f2fs_put_page(npage[0], 0);
469 				goto release_out;
470 			}
471 		}
472 		if (i < level) {
473 			parent = npage[i];
474 			nids[i + 1] = get_nid(parent, offset[i], false);
475 		}
476 	}
477 	dn->nid = nids[level];
478 	dn->ofs_in_node = offset[level];
479 	dn->node_page = npage[level];
480 	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
481 	return 0;
482 
483 release_pages:
484 	f2fs_put_page(parent, 1);
485 	if (i > 1)
486 		f2fs_put_page(npage[0], 0);
487 release_out:
488 	dn->inode_page = NULL;
489 	dn->node_page = NULL;
490 	return err;
491 }
492 
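/*
 * Illustrative sketch, not part of the original file: the caller pattern
 * implied by the comment above, assuming the usual f2fs.h helpers
 * (f2fs_lock_op(), f2fs_unlock_op(), set_new_dnode(), f2fs_put_dnode()).
 * The rwsem is only needed when node pages may be allocated (ALLOC_NODE);
 * a plain LOOKUP_NODE can skip it.  example_map_block() is hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_map_block(struct inode *inode, pgoff_t index)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	struct dnode_of_data dn;
	int err;

	f2fs_lock_op(sbi);
	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = get_dnode_of_data(&dn, index, ALLOC_NODE);
	if (err) {
		f2fs_unlock_op(sbi);
		return err;
	}

	/* dn.data_blkaddr now holds the block address for @index */
	f2fs_put_dnode(&dn);
	f2fs_unlock_op(sbi);
	return 0;
}
#endif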
493 static void truncate_node(struct dnode_of_data *dn)
494 {
495 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
496 	struct node_info ni;
497 
498 	get_node_info(sbi, dn->nid, &ni);
499 	if (dn->inode->i_blocks == 0) {
500 		f2fs_bug_on(ni.blk_addr != NULL_ADDR);
501 		goto invalidate;
502 	}
503 	f2fs_bug_on(ni.blk_addr == NULL_ADDR);
504 
505 	/* Deallocate node address */
506 	invalidate_blocks(sbi, ni.blk_addr);
507 	dec_valid_node_count(sbi, dn->inode);
508 	set_node_addr(sbi, &ni, NULL_ADDR);
509 
510 	if (dn->nid == dn->inode->i_ino) {
511 		remove_orphan_inode(sbi, dn->nid);
512 		dec_valid_inode_count(sbi);
513 	} else {
514 		sync_inode_page(dn);
515 	}
516 invalidate:
517 	clear_node_page_dirty(dn->node_page);
518 	F2FS_SET_SB_DIRT(sbi);
519 
520 	f2fs_put_page(dn->node_page, 1);
521 
522 	invalidate_mapping_pages(NODE_MAPPING(sbi),
523 			dn->node_page->index, dn->node_page->index);
524 
525 	dn->node_page = NULL;
526 	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
527 }
528 
529 static int truncate_dnode(struct dnode_of_data *dn)
530 {
531 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
532 	struct page *page;
533 
534 	if (dn->nid == 0)
535 		return 1;
536 
537 	/* get direct node */
538 	page = get_node_page(sbi, dn->nid);
539 	if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
540 		return 1;
541 	else if (IS_ERR(page))
542 		return PTR_ERR(page);
543 
544 	/* Make dnode_of_data for parameter */
545 	dn->node_page = page;
546 	dn->ofs_in_node = 0;
547 	truncate_data_blocks(dn);
548 	truncate_node(dn);
549 	return 1;
550 }
551 
552 static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
553 						int ofs, int depth)
554 {
555 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
556 	struct dnode_of_data rdn = *dn;
557 	struct page *page;
558 	struct f2fs_node *rn;
559 	nid_t child_nid;
560 	unsigned int child_nofs;
561 	int freed = 0;
562 	int i, ret;
563 
564 	if (dn->nid == 0)
565 		return NIDS_PER_BLOCK + 1;
566 
567 	trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
568 
569 	page = get_node_page(sbi, dn->nid);
570 	if (IS_ERR(page)) {
571 		trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
572 		return PTR_ERR(page);
573 	}
574 
575 	rn = F2FS_NODE(page);
576 	if (depth < 3) {
577 		for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
578 			child_nid = le32_to_cpu(rn->in.nid[i]);
579 			if (child_nid == 0)
580 				continue;
581 			rdn.nid = child_nid;
582 			ret = truncate_dnode(&rdn);
583 			if (ret < 0)
584 				goto out_err;
585 			set_nid(page, i, 0, false);
586 		}
587 	} else {
588 		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
589 		for (i = ofs; i < NIDS_PER_BLOCK; i++) {
590 			child_nid = le32_to_cpu(rn->in.nid[i]);
591 			if (child_nid == 0) {
592 				child_nofs += NIDS_PER_BLOCK + 1;
593 				continue;
594 			}
595 			rdn.nid = child_nid;
596 			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
597 			if (ret == (NIDS_PER_BLOCK + 1)) {
598 				set_nid(page, i, 0, false);
599 				child_nofs += ret;
600 			} else if (ret < 0 && ret != -ENOENT) {
601 				goto out_err;
602 			}
603 		}
604 		freed = child_nofs;
605 	}
606 
607 	if (!ofs) {
608 		/* remove current indirect node */
609 		dn->node_page = page;
610 		truncate_node(dn);
611 		freed++;
612 	} else {
613 		f2fs_put_page(page, 1);
614 	}
615 	trace_f2fs_truncate_nodes_exit(dn->inode, freed);
616 	return freed;
617 
618 out_err:
619 	f2fs_put_page(page, 1);
620 	trace_f2fs_truncate_nodes_exit(dn->inode, ret);
621 	return ret;
622 }
623 
624 static int truncate_partial_nodes(struct dnode_of_data *dn,
625 			struct f2fs_inode *ri, int *offset, int depth)
626 {
627 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
628 	struct page *pages[2];
629 	nid_t nid[3];
630 	nid_t child_nid;
631 	int err = 0;
632 	int i;
633 	int idx = depth - 2;
634 
635 	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
636 	if (!nid[0])
637 		return 0;
638 
639 	/* get indirect nodes in the path */
640 	for (i = 0; i < idx + 1; i++) {
641 		/* the reference count will be increased */
642 		pages[i] = get_node_page(sbi, nid[i]);
643 		if (IS_ERR(pages[i])) {
644 			err = PTR_ERR(pages[i]);
645 			idx = i - 1;
646 			goto fail;
647 		}
648 		nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
649 	}
650 
651 	/* free direct nodes linked to a partial indirect node */
652 	for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
653 		child_nid = get_nid(pages[idx], i, false);
654 		if (!child_nid)
655 			continue;
656 		dn->nid = child_nid;
657 		err = truncate_dnode(dn);
658 		if (err < 0)
659 			goto fail;
660 		set_nid(pages[idx], i, 0, false);
661 	}
662 
663 	if (offset[idx + 1] == 0) {
664 		dn->node_page = pages[idx];
665 		dn->nid = nid[idx];
666 		truncate_node(dn);
667 	} else {
668 		f2fs_put_page(pages[idx], 1);
669 	}
670 	offset[idx]++;
671 	offset[idx + 1] = 0;
672 	idx--;
673 fail:
674 	for (i = idx; i >= 0; i--)
675 		f2fs_put_page(pages[i], 1);
676 
677 	trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
678 
679 	return err;
680 }
681 
682 /*
683  * All the block addresses of data and nodes should be nullified.
684  */
685 int truncate_inode_blocks(struct inode *inode, pgoff_t from)
686 {
687 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
688 	int err = 0, cont = 1;
689 	int level, offset[4], noffset[4];
690 	unsigned int nofs = 0;
691 	struct f2fs_inode *ri;
692 	struct dnode_of_data dn;
693 	struct page *page;
694 
695 	trace_f2fs_truncate_inode_blocks_enter(inode, from);
696 
697 	level = get_node_path(F2FS_I(inode), from, offset, noffset);
698 restart:
699 	page = get_node_page(sbi, inode->i_ino);
700 	if (IS_ERR(page)) {
701 		trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
702 		return PTR_ERR(page);
703 	}
704 
705 	set_new_dnode(&dn, inode, page, NULL, 0);
706 	unlock_page(page);
707 
708 	ri = F2FS_INODE(page);
709 	switch (level) {
710 	case 0:
711 	case 1:
712 		nofs = noffset[1];
713 		break;
714 	case 2:
715 		nofs = noffset[1];
716 		if (!offset[level - 1])
717 			goto skip_partial;
718 		err = truncate_partial_nodes(&dn, ri, offset, level);
719 		if (err < 0 && err != -ENOENT)
720 			goto fail;
721 		nofs += 1 + NIDS_PER_BLOCK;
722 		break;
723 	case 3:
724 		nofs = 5 + 2 * NIDS_PER_BLOCK;
725 		if (!offset[level - 1])
726 			goto skip_partial;
727 		err = truncate_partial_nodes(&dn, ri, offset, level);
728 		if (err < 0 && err != -ENOENT)
729 			goto fail;
730 		break;
731 	default:
732 		BUG();
733 	}
734 
735 skip_partial:
736 	while (cont) {
737 		dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
738 		switch (offset[0]) {
739 		case NODE_DIR1_BLOCK:
740 		case NODE_DIR2_BLOCK:
741 			err = truncate_dnode(&dn);
742 			break;
743 
744 		case NODE_IND1_BLOCK:
745 		case NODE_IND2_BLOCK:
746 			err = truncate_nodes(&dn, nofs, offset[1], 2);
747 			break;
748 
749 		case NODE_DIND_BLOCK:
750 			err = truncate_nodes(&dn, nofs, offset[1], 3);
751 			cont = 0;
752 			break;
753 
754 		default:
755 			BUG();
756 		}
757 		if (err < 0 && err != -ENOENT)
758 			goto fail;
759 		if (offset[1] == 0 &&
760 				ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
761 			lock_page(page);
762 			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
763 				f2fs_put_page(page, 1);
764 				goto restart;
765 			}
766 			wait_on_page_writeback(page);
767 			ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
768 			set_page_dirty(page);
769 			unlock_page(page);
770 		}
771 		offset[1] = 0;
772 		offset[0]++;
773 		nofs += err;
774 	}
775 fail:
776 	f2fs_put_page(page, 0);
777 	trace_f2fs_truncate_inode_blocks_exit(inode, err);
778 	return err > 0 ? 0 : err;
779 }
780 
781 int truncate_xattr_node(struct inode *inode, struct page *page)
782 {
783 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
784 	nid_t nid = F2FS_I(inode)->i_xattr_nid;
785 	struct dnode_of_data dn;
786 	struct page *npage;
787 
788 	if (!nid)
789 		return 0;
790 
791 	npage = get_node_page(sbi, nid);
792 	if (IS_ERR(npage))
793 		return PTR_ERR(npage);
794 
795 	F2FS_I(inode)->i_xattr_nid = 0;
796 
797 	/* need to do checkpoint during fsync */
798 	F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
799 
800 	set_new_dnode(&dn, inode, page, npage, nid);
801 
802 	if (page)
803 		dn.inode_page_locked = true;
804 	truncate_node(&dn);
805 	return 0;
806 }
807 
808 /*
809  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
810  * f2fs_unlock_op().
811  */
812 void remove_inode_page(struct inode *inode)
813 {
814 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
815 	struct page *page;
816 	nid_t ino = inode->i_ino;
817 	struct dnode_of_data dn;
818 
819 	page = get_node_page(sbi, ino);
820 	if (IS_ERR(page))
821 		return;
822 
823 	if (truncate_xattr_node(inode, page)) {
824 		f2fs_put_page(page, 1);
825 		return;
826 	}
827 	/* 0 is possible after f2fs_new_inode() fails */
828 	f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
829 	set_new_dnode(&dn, inode, page, page, ino);
830 	truncate_node(&dn);
831 }
832 
833 struct page *new_inode_page(struct inode *inode, const struct qstr *name)
834 {
835 	struct dnode_of_data dn;
836 
837 	/* allocate inode page for new inode */
838 	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
839 
840 	/* caller should f2fs_put_page(page, 1); */
841 	return new_node_page(&dn, 0, NULL);
842 }
843 
844 struct page *new_node_page(struct dnode_of_data *dn,
845 				unsigned int ofs, struct page *ipage)
846 {
847 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
848 	struct node_info old_ni, new_ni;
849 	struct page *page;
850 	int err;
851 
852 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
853 		return ERR_PTR(-EPERM);
854 
855 	page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
856 	if (!page)
857 		return ERR_PTR(-ENOMEM);
858 
859 	if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
860 		err = -ENOSPC;
861 		goto fail;
862 	}
863 
864 	get_node_info(sbi, dn->nid, &old_ni);
865 
866 	/* Reinitialize old_ni with new node page */
867 	f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
868 	new_ni = old_ni;
869 	new_ni.ino = dn->inode->i_ino;
870 	set_node_addr(sbi, &new_ni, NEW_ADDR);
871 
872 	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
873 	set_cold_node(dn->inode, page);
874 	SetPageUptodate(page);
875 	set_page_dirty(page);
876 
877 	if (ofs == XATTR_NODE_OFFSET)
878 		F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
879 
880 	dn->node_page = page;
881 	if (ipage)
882 		update_inode(dn->inode, ipage);
883 	else
884 		sync_inode_page(dn);
885 	if (ofs == 0)
886 		inc_valid_inode_count(sbi);
887 
888 	return page;
889 
890 fail:
891 	clear_node_page_dirty(page);
892 	f2fs_put_page(page, 1);
893 	return ERR_PTR(err);
894 }
895 
896 /*
897  * Based on the return value, the caller should do the following:
898  * 0: f2fs_put_page(page, 0)
899  * LOCKED_PAGE: f2fs_put_page(page, 1)
900  * error: nothing
901  */
902 static int read_node_page(struct page *page, int rw)
903 {
904 	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
905 	struct node_info ni;
906 
907 	get_node_info(sbi, page->index, &ni);
908 
909 	if (unlikely(ni.blk_addr == NULL_ADDR)) {
910 		f2fs_put_page(page, 1);
911 		return -ENOENT;
912 	}
913 
914 	if (PageUptodate(page))
915 		return LOCKED_PAGE;
916 
917 	return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
918 }
919 
920 /*
921  * Readahead a node page
922  */
923 void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
924 {
925 	struct page *apage;
926 	int err;
927 
928 	apage = find_get_page(NODE_MAPPING(sbi), nid);
929 	if (apage && PageUptodate(apage)) {
930 		f2fs_put_page(apage, 0);
931 		return;
932 	}
933 	f2fs_put_page(apage, 0);
934 
935 	apage = grab_cache_page(NODE_MAPPING(sbi), nid);
936 	if (!apage)
937 		return;
938 
939 	err = read_node_page(apage, READA);
940 	if (err == 0)
941 		f2fs_put_page(apage, 0);
942 	else if (err == LOCKED_PAGE)
943 		f2fs_put_page(apage, 1);
944 }
945 
946 struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
947 {
948 	struct page *page;
949 	int err;
950 repeat:
951 	page = grab_cache_page(NODE_MAPPING(sbi), nid);
952 	if (!page)
953 		return ERR_PTR(-ENOMEM);
954 
955 	err = read_node_page(page, READ_SYNC);
956 	if (err < 0)
957 		return ERR_PTR(err);
958 	else if (err == LOCKED_PAGE)
959 		goto got_it;
960 
961 	lock_page(page);
962 	if (unlikely(!PageUptodate(page))) {
963 		f2fs_put_page(page, 1);
964 		return ERR_PTR(-EIO);
965 	}
966 	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
967 		f2fs_put_page(page, 1);
968 		goto repeat;
969 	}
970 got_it:
971 	f2fs_bug_on(nid != nid_of_node(page));
972 	mark_page_accessed(page);
973 	return page;
974 }
975 
976 /*
977  * Return a locked page for the desired node page.
978  * Also, readahead MAX_RA_NODE node pages.
979  */
980 struct page *get_node_page_ra(struct page *parent, int start)
981 {
982 	struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
983 	struct blk_plug plug;
984 	struct page *page;
985 	int err, i, end;
986 	nid_t nid;
987 
988 	/* First, try getting the desired direct node. */
989 	nid = get_nid(parent, start, false);
990 	if (!nid)
991 		return ERR_PTR(-ENOENT);
992 repeat:
993 	page = grab_cache_page(NODE_MAPPING(sbi), nid);
994 	if (!page)
995 		return ERR_PTR(-ENOMEM);
996 
997 	err = read_node_page(page, READ_SYNC);
998 	if (err < 0)
999 		return ERR_PTR(err);
1000 	else if (err == LOCKED_PAGE)
1001 		goto page_hit;
1002 
1003 	blk_start_plug(&plug);
1004 
1005 	/* Then, try readahead for siblings of the desired node */
1006 	end = start + MAX_RA_NODE;
1007 	end = min(end, NIDS_PER_BLOCK);
1008 	for (i = start + 1; i < end; i++) {
1009 		nid = get_nid(parent, i, false);
1010 		if (!nid)
1011 			continue;
1012 		ra_node_page(sbi, nid);
1013 	}
1014 
1015 	blk_finish_plug(&plug);
1016 
1017 	lock_page(page);
1018 	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1019 		f2fs_put_page(page, 1);
1020 		goto repeat;
1021 	}
1022 page_hit:
1023 	if (unlikely(!PageUptodate(page))) {
1024 		f2fs_put_page(page, 1);
1025 		return ERR_PTR(-EIO);
1026 	}
1027 	mark_page_accessed(page);
1028 	return page;
1029 }
1030 
1031 void sync_inode_page(struct dnode_of_data *dn)
1032 {
1033 	if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
1034 		update_inode(dn->inode, dn->node_page);
1035 	} else if (dn->inode_page) {
1036 		if (!dn->inode_page_locked)
1037 			lock_page(dn->inode_page);
1038 		update_inode(dn->inode, dn->inode_page);
1039 		if (!dn->inode_page_locked)
1040 			unlock_page(dn->inode_page);
1041 	} else {
1042 		update_inode_page(dn->inode);
1043 	}
1044 }
1045 
1046 int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
1047 					struct writeback_control *wbc)
1048 {
1049 	pgoff_t index, end;
1050 	struct pagevec pvec;
1051 	int step = ino ? 2 : 0;
1052 	int nwritten = 0, wrote = 0;
1053 
1054 	pagevec_init(&pvec, 0);
1055 
1056 next_step:
1057 	index = 0;
1058 	end = LONG_MAX;
1059 
1060 	while (index <= end) {
1061 		int i, nr_pages;
1062 		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1063 				PAGECACHE_TAG_DIRTY,
1064 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1065 		if (nr_pages == 0)
1066 			break;
1067 
1068 		for (i = 0; i < nr_pages; i++) {
1069 			struct page *page = pvec.pages[i];
1070 
1071 			/*
1072 			 * flushing sequence with step:
1073 			 * 0. indirect nodes
1074 			 * 1. dentry dnodes
1075 			 * 2. file dnodes
1076 			 */
1077 			if (step == 0 && IS_DNODE(page))
1078 				continue;
1079 			if (step == 1 && (!IS_DNODE(page) ||
1080 						is_cold_node(page)))
1081 				continue;
1082 			if (step == 2 && (!IS_DNODE(page) ||
1083 						!is_cold_node(page)))
1084 				continue;
1085 
1086 			/*
1087 			 * In fsync mode,
1088 			 * we should not skip writing node pages.
1089 			 */
1090 			if (ino && ino_of_node(page) == ino)
1091 				lock_page(page);
1092 			else if (!trylock_page(page))
1093 				continue;
1094 
1095 			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1096 continue_unlock:
1097 				unlock_page(page);
1098 				continue;
1099 			}
1100 			if (ino && ino_of_node(page) != ino)
1101 				goto continue_unlock;
1102 
1103 			if (!PageDirty(page)) {
1104 				/* someone wrote it for us */
1105 				goto continue_unlock;
1106 			}
1107 
1108 			if (!clear_page_dirty_for_io(page))
1109 				goto continue_unlock;
1110 
1111 			/* called by fsync() */
1112 			if (ino && IS_DNODE(page)) {
1113 				int mark = !is_checkpointed_node(sbi, ino);
1114 				set_fsync_mark(page, 1);
1115 				if (IS_INODE(page))
1116 					set_dentry_mark(page, mark);
1117 				nwritten++;
1118 			} else {
1119 				set_fsync_mark(page, 0);
1120 				set_dentry_mark(page, 0);
1121 			}
1122 			NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
1123 			wrote++;
1124 
1125 			if (--wbc->nr_to_write == 0)
1126 				break;
1127 		}
1128 		pagevec_release(&pvec);
1129 		cond_resched();
1130 
1131 		if (wbc->nr_to_write == 0) {
1132 			step = 2;
1133 			break;
1134 		}
1135 	}
1136 
1137 	if (step < 2) {
1138 		step++;
1139 		goto next_step;
1140 	}
1141 
1142 	if (wrote)
1143 		f2fs_submit_merged_bio(sbi, NODE, WRITE);
1144 	return nwritten;
1145 }
1146 
1147 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1148 {
1149 	pgoff_t index = 0, end = LONG_MAX;
1150 	struct pagevec pvec;
1151 	int ret2 = 0, ret = 0;
1152 
1153 	pagevec_init(&pvec, 0);
1154 
1155 	while (index <= end) {
1156 		int i, nr_pages;
1157 		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1158 				PAGECACHE_TAG_WRITEBACK,
1159 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1160 		if (nr_pages == 0)
1161 			break;
1162 
1163 		for (i = 0; i < nr_pages; i++) {
1164 			struct page *page = pvec.pages[i];
1165 
1166 			/* until radix tree lookup accepts end_index */
1167 			if (unlikely(page->index > end))
1168 				continue;
1169 
1170 			if (ino && ino_of_node(page) == ino) {
1171 				wait_on_page_writeback(page);
1172 				if (TestClearPageError(page))
1173 					ret = -EIO;
1174 			}
1175 		}
1176 		pagevec_release(&pvec);
1177 		cond_resched();
1178 	}
1179 
1180 	if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
1181 		ret2 = -ENOSPC;
1182 	if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
1183 		ret2 = -EIO;
1184 	if (!ret)
1185 		ret = ret2;
1186 	return ret;
1187 }
1188 
1189 static int f2fs_write_node_page(struct page *page,
1190 				struct writeback_control *wbc)
1191 {
1192 	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1193 	nid_t nid;
1194 	block_t new_addr;
1195 	struct node_info ni;
1196 	struct f2fs_io_info fio = {
1197 		.type = NODE,
1198 		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
1199 	};
1200 
1201 	if (unlikely(sbi->por_doing))
1202 		goto redirty_out;
1203 
1204 	wait_on_page_writeback(page);
1205 
1206 	/* get old block addr of this node page */
1207 	nid = nid_of_node(page);
1208 	f2fs_bug_on(page->index != nid);
1209 
1210 	get_node_info(sbi, nid, &ni);
1211 
1212 	/* This page is already truncated */
1213 	if (unlikely(ni.blk_addr == NULL_ADDR)) {
1214 		dec_page_count(sbi, F2FS_DIRTY_NODES);
1215 		unlock_page(page);
1216 		return 0;
1217 	}
1218 
1219 	if (wbc->for_reclaim)
1220 		goto redirty_out;
1221 
1222 	mutex_lock(&sbi->node_write);
1223 	set_page_writeback(page);
1224 	write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1225 	set_node_addr(sbi, &ni, new_addr);
1226 	dec_page_count(sbi, F2FS_DIRTY_NODES);
1227 	mutex_unlock(&sbi->node_write);
1228 	unlock_page(page);
1229 	return 0;
1230 
1231 redirty_out:
1232 	dec_page_count(sbi, F2FS_DIRTY_NODES);
1233 	wbc->pages_skipped++;
1234 	set_page_dirty(page);
1235 	return AOP_WRITEPAGE_ACTIVATE;
1236 }
1237 
1238 /*
1239  * It is very important to gather dirty pages and write at once, so that we can
1240  * submit a big bio without interfering with other data writes.
1241  * By default, 512 pages (2MB) * 3 node types is a reasonable amount.
1242  */
1243 #define COLLECT_DIRTY_NODES	1536
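/*
 * Illustrative note, not part of the original file: 1536 is 512 pages per
 * node type * 3 node types; with 4KB pages that is 2MB per type, i.e. about
 * 6MB of dirty node pages are gathered before writeback is kicked off.
 */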
1244 static int f2fs_write_node_pages(struct address_space *mapping,
1245 			    struct writeback_control *wbc)
1246 {
1247 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1248 	long nr_to_write = wbc->nr_to_write;
1249 
1250 	/* balancing f2fs's metadata in background */
1251 	f2fs_balance_fs_bg(sbi);
1252 
1253 	/* collect a number of dirty node pages and write together */
1254 	if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
1255 		return 0;
1256 
1257 	/* if mounting failed, skip writing node pages */
1258 	wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1259 	wbc->sync_mode = WB_SYNC_NONE;
1260 	sync_node_pages(sbi, 0, wbc);
1261 	wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
1262 						wbc->nr_to_write);
1263 	return 0;
1264 }
1265 
1266 static int f2fs_set_node_page_dirty(struct page *page)
1267 {
1268 	struct address_space *mapping = page->mapping;
1269 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1270 
1271 	trace_f2fs_set_page_dirty(page, NODE);
1272 
1273 	SetPageUptodate(page);
1274 	if (!PageDirty(page)) {
1275 		__set_page_dirty_nobuffers(page);
1276 		inc_page_count(sbi, F2FS_DIRTY_NODES);
1277 		SetPagePrivate(page);
1278 		return 1;
1279 	}
1280 	return 0;
1281 }
1282 
1283 static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1284 				      unsigned int length)
1285 {
1286 	struct inode *inode = page->mapping->host;
1287 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1288 	if (PageDirty(page))
1289 		dec_page_count(sbi, F2FS_DIRTY_NODES);
1290 	ClearPagePrivate(page);
1291 }
1292 
1293 static int f2fs_release_node_page(struct page *page, gfp_t wait)
1294 {
1295 	ClearPagePrivate(page);
1296 	return 1;
1297 }
1298 
1299 /*
1300  * Structure of the f2fs node operations
1301  */
1302 const struct address_space_operations f2fs_node_aops = {
1303 	.writepage	= f2fs_write_node_page,
1304 	.writepages	= f2fs_write_node_pages,
1305 	.set_page_dirty	= f2fs_set_node_page_dirty,
1306 	.invalidatepage	= f2fs_invalidate_node_page,
1307 	.releasepage	= f2fs_release_node_page,
1308 };
1309 
1310 static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
1311 {
1312 	struct list_head *this;
1313 	struct free_nid *i;
1314 	list_for_each(this, head) {
1315 		i = list_entry(this, struct free_nid, list);
1316 		if (i->nid == n)
1317 			return i;
1318 	}
1319 	return NULL;
1320 }
1321 
1322 static void __del_from_free_nid_list(struct free_nid *i)
1323 {
1324 	list_del(&i->list);
1325 	kmem_cache_free(free_nid_slab, i);
1326 }
1327 
1328 static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1329 {
1330 	struct free_nid *i;
1331 	struct nat_entry *ne;
1332 	bool allocated = false;
1333 
1334 	if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
1335 		return -1;
1336 
1337 	/* 0 nid should not be used */
1338 	if (unlikely(nid == 0))
1339 		return 0;
1340 
1341 	if (build) {
1342 		/* do not add allocated nids */
1343 		read_lock(&nm_i->nat_tree_lock);
1344 		ne = __lookup_nat_cache(nm_i, nid);
1345 		if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
1346 			allocated = true;
1347 		read_unlock(&nm_i->nat_tree_lock);
1348 		if (allocated)
1349 			return 0;
1350 	}
1351 
1352 	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
1353 	i->nid = nid;
1354 	i->state = NID_NEW;
1355 
1356 	spin_lock(&nm_i->free_nid_list_lock);
1357 	if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
1358 		spin_unlock(&nm_i->free_nid_list_lock);
1359 		kmem_cache_free(free_nid_slab, i);
1360 		return 0;
1361 	}
1362 	list_add_tail(&i->list, &nm_i->free_nid_list);
1363 	nm_i->fcnt++;
1364 	spin_unlock(&nm_i->free_nid_list_lock);
1365 	return 1;
1366 }
1367 
1368 static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1369 {
1370 	struct free_nid *i;
1371 	spin_lock(&nm_i->free_nid_list_lock);
1372 	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1373 	if (i && i->state == NID_NEW) {
1374 		__del_from_free_nid_list(i);
1375 		nm_i->fcnt--;
1376 	}
1377 	spin_unlock(&nm_i->free_nid_list_lock);
1378 }
1379 
1380 static void scan_nat_page(struct f2fs_nm_info *nm_i,
1381 			struct page *nat_page, nid_t start_nid)
1382 {
1383 	struct f2fs_nat_block *nat_blk = page_address(nat_page);
1384 	block_t blk_addr;
1385 	int i;
1386 
1387 	i = start_nid % NAT_ENTRY_PER_BLOCK;
1388 
1389 	for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1390 
1391 		if (unlikely(start_nid >= nm_i->max_nid))
1392 			break;
1393 
1394 		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1395 		f2fs_bug_on(blk_addr == NEW_ADDR);
1396 		if (blk_addr == NULL_ADDR) {
1397 			if (add_free_nid(nm_i, start_nid, true) < 0)
1398 				break;
1399 		}
1400 	}
1401 }
1402 
1403 static void build_free_nids(struct f2fs_sb_info *sbi)
1404 {
1405 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1406 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1407 	struct f2fs_summary_block *sum = curseg->sum_blk;
1408 	int i = 0;
1409 	nid_t nid = nm_i->next_scan_nid;
1410 
1411 	/* Enough entries */
1412 	if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
1413 		return;
1414 
1415 	/* readahead nat pages to be scanned */
1416 	ra_nat_pages(sbi, nid);
1417 
1418 	while (1) {
1419 		struct page *page = get_current_nat_page(sbi, nid);
1420 
1421 		scan_nat_page(nm_i, page, nid);
1422 		f2fs_put_page(page, 1);
1423 
1424 		nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1425 		if (unlikely(nid >= nm_i->max_nid))
1426 			nid = 0;
1427 
1428 		if (i++ == FREE_NID_PAGES)
1429 			break;
1430 	}
1431 
1432 	/* go to the next nat page in order to find more free nids */
1433 	nm_i->next_scan_nid = nid;
1434 
1435 	/* find free nids from current sum_pages */
1436 	mutex_lock(&curseg->curseg_mutex);
1437 	for (i = 0; i < nats_in_cursum(sum); i++) {
1438 		block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
1439 		nid = le32_to_cpu(nid_in_journal(sum, i));
1440 		if (addr == NULL_ADDR)
1441 			add_free_nid(nm_i, nid, true);
1442 		else
1443 			remove_free_nid(nm_i, nid);
1444 	}
1445 	mutex_unlock(&curseg->curseg_mutex);
1446 }
1447 
1448 /*
1449  * If this function returns success, the caller can obtain a new nid
1450  * from the second parameter of this function.
1451  * The returned nid can be used as an ino as well as a nid when an inode is created.
1452  */
1453 bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1454 {
1455 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1456 	struct free_nid *i = NULL;
1457 	struct list_head *this;
1458 retry:
1459 	if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
1460 		return false;
1461 
1462 	spin_lock(&nm_i->free_nid_list_lock);
1463 
1464 	/* We should not use stale free nids created by build_free_nids */
1465 	if (nm_i->fcnt && !sbi->on_build_free_nids) {
1466 		f2fs_bug_on(list_empty(&nm_i->free_nid_list));
1467 		list_for_each(this, &nm_i->free_nid_list) {
1468 			i = list_entry(this, struct free_nid, list);
1469 			if (i->state == NID_NEW)
1470 				break;
1471 		}
1472 
1473 		f2fs_bug_on(i->state != NID_NEW);
1474 		*nid = i->nid;
1475 		i->state = NID_ALLOC;
1476 		nm_i->fcnt--;
1477 		spin_unlock(&nm_i->free_nid_list_lock);
1478 		return true;
1479 	}
1480 	spin_unlock(&nm_i->free_nid_list_lock);
1481 
1482 	/* Let's scan nat pages and their caches to get free nids */
1483 	mutex_lock(&nm_i->build_lock);
1484 	sbi->on_build_free_nids = true;
1485 	build_free_nids(sbi);
1486 	sbi->on_build_free_nids = false;
1487 	mutex_unlock(&nm_i->build_lock);
1488 	goto retry;
1489 }
1490 
1491 /*
1492  * alloc_nid() should be called prior to this function.
1493  */
1494 void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1495 {
1496 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1497 	struct free_nid *i;
1498 
1499 	spin_lock(&nm_i->free_nid_list_lock);
1500 	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1501 	f2fs_bug_on(!i || i->state != NID_ALLOC);
1502 	__del_from_free_nid_list(i);
1503 	spin_unlock(&nm_i->free_nid_list_lock);
1504 }
1505 
1506 /*
1507  * alloc_nid() should be called prior to this function.
1508  */
1509 void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1510 {
1511 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1512 	struct free_nid *i;
1513 
1514 	if (!nid)
1515 		return;
1516 
1517 	spin_lock(&nm_i->free_nid_list_lock);
1518 	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1519 	f2fs_bug_on(!i || i->state != NID_ALLOC);
1520 	if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
1521 		__del_from_free_nid_list(i);
1522 	} else {
1523 		i->state = NID_NEW;
1524 		nm_i->fcnt++;
1525 	}
1526 	spin_unlock(&nm_i->free_nid_list_lock);
1527 }
1528 
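/*
 * Illustrative sketch, not part of the original file: the nid allocation
 * lifecycle expected by alloc_nid()/alloc_nid_done()/alloc_nid_failed(),
 * mirroring the pattern used in get_dnode_of_data() above.  Error handling
 * and parent-node updates are simplified; example_alloc_node() is
 * hypothetical and assumes @dn was set up with set_new_dnode().
 */
#if 0	/* example only, never compiled */
static int example_alloc_node(struct f2fs_sb_info *sbi,
				struct dnode_of_data *dn, unsigned int node_ofs)
{
	struct page *npage;
	nid_t nid;

	if (!alloc_nid(sbi, &nid))		/* reserve a free nid */
		return -ENOSPC;

	dn->nid = nid;
	npage = new_node_page(dn, node_ofs, NULL);
	if (IS_ERR(npage)) {
		alloc_nid_failed(sbi, nid);	/* put the nid back */
		return PTR_ERR(npage);
	}

	alloc_nid_done(sbi, nid);		/* drop it from the free nid list */
	f2fs_put_page(npage, 1);
	return 0;
}
#endif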
1529 void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1530 		struct f2fs_summary *sum, struct node_info *ni,
1531 		block_t new_blkaddr)
1532 {
1533 	rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1534 	set_node_addr(sbi, ni, new_blkaddr);
1535 	clear_node_page_dirty(page);
1536 }
1537 
1538 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1539 {
1540 	struct f2fs_inode *src, *dst;
1541 	nid_t ino = ino_of_node(page);
1542 	struct node_info old_ni, new_ni;
1543 	struct page *ipage;
1544 
1545 	ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
1546 	if (!ipage)
1547 		return -ENOMEM;
1548 
1549 	/* this ino should not be reused from the free nid list */
1550 	remove_free_nid(NM_I(sbi), ino);
1551 
1552 	get_node_info(sbi, ino, &old_ni);
1553 	SetPageUptodate(ipage);
1554 	fill_node_footer(ipage, ino, ino, 0, true);
1555 
1556 	src = F2FS_INODE(page);
1557 	dst = F2FS_INODE(ipage);
1558 
1559 	memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
1560 	dst->i_size = 0;
1561 	dst->i_blocks = cpu_to_le64(1);
1562 	dst->i_links = cpu_to_le32(1);
1563 	dst->i_xattr_nid = 0;
1564 
1565 	new_ni = old_ni;
1566 	new_ni.ino = ino;
1567 
1568 	if (unlikely(!inc_valid_node_count(sbi, NULL)))
1569 		WARN_ON(1);
1570 	set_node_addr(sbi, &new_ni, NEW_ADDR);
1571 	inc_valid_inode_count(sbi);
1572 	f2fs_put_page(ipage, 1);
1573 	return 0;
1574 }
1575 
1576 /*
1577  * ra_sum_pages() merges contiguous pages into one bio and submits it.
1578  * These pre-read pages are linked in the pages list.
1579  */
1580 static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1581 				int start, int nrpages)
1582 {
1583 	struct page *page;
1584 	int page_idx = start;
1585 	struct f2fs_io_info fio = {
1586 		.type = META,
1587 		.rw = READ_SYNC | REQ_META | REQ_PRIO
1588 	};
1589 
1590 	for (; page_idx < start + nrpages; page_idx++) {
1591 		/* alloc a temporary page to read node summary info */
1592 		page = alloc_page(GFP_F2FS_ZERO);
1593 		if (!page) {
1594 			struct page *tmp;
1595 			list_for_each_entry_safe(page, tmp, pages, lru) {
1596 				list_del(&page->lru);
1597 				unlock_page(page);
1598 				__free_pages(page, 0);
1599 			}
1600 			return -ENOMEM;
1601 		}
1602 
1603 		lock_page(page);
1604 		page->index = page_idx;
1605 		list_add_tail(&page->lru, pages);
1606 	}
1607 
1608 	list_for_each_entry(page, pages, lru)
1609 		f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610 
1611 	f2fs_submit_merged_bio(sbi, META, READ);
1612 	return 0;
1613 }
1614 
1615 int restore_node_summary(struct f2fs_sb_info *sbi,
1616 			unsigned int segno, struct f2fs_summary_block *sum)
1617 {
1618 	struct f2fs_node *rn;
1619 	struct f2fs_summary *sum_entry;
1620 	struct page *page, *tmp;
1621 	block_t addr;
1622 	int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1623 	int i, last_offset, nrpages, err = 0;
1624 	LIST_HEAD(page_list);
1625 
1626 	/* scan the node segment */
1627 	last_offset = sbi->blocks_per_seg;
1628 	addr = START_BLOCK(sbi, segno);
1629 	sum_entry = &sum->entries[0];
1630 
1631 	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
1632 		nrpages = min(last_offset - i, bio_blocks);
1633 
1634 		/* read ahead node pages */
1635 		err = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 		if (err)
1637 			return err;
1638 
1639 		list_for_each_entry_safe(page, tmp, &page_list, lru) {
1640 
1641 			lock_page(page);
1642 			if (unlikely(!PageUptodate(page))) {
1643 				err = -EIO;
1644 			} else {
1645 				rn = F2FS_NODE(page);
1646 				sum_entry->nid = rn->footer.nid;
1647 				sum_entry->version = 0;
1648 				sum_entry->ofs_in_node = 0;
1649 				sum_entry++;
1650 			}
1651 
1652 			list_del(&page->lru);
1653 			unlock_page(page);
1654 			__free_pages(page, 0);
1655 		}
1656 	}
1657 	return err;
1658 }
1659 
1660 static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
1661 {
1662 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1663 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1664 	struct f2fs_summary_block *sum = curseg->sum_blk;
1665 	int i;
1666 
1667 	mutex_lock(&curseg->curseg_mutex);
1668 
1669 	if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
1670 		mutex_unlock(&curseg->curseg_mutex);
1671 		return false;
1672 	}
1673 
1674 	for (i = 0; i < nats_in_cursum(sum); i++) {
1675 		struct nat_entry *ne;
1676 		struct f2fs_nat_entry raw_ne;
1677 		nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
1678 
1679 		raw_ne = nat_in_journal(sum, i);
1680 retry:
1681 		write_lock(&nm_i->nat_tree_lock);
1682 		ne = __lookup_nat_cache(nm_i, nid);
1683 		if (ne) {
1684 			__set_nat_cache_dirty(nm_i, ne);
1685 			write_unlock(&nm_i->nat_tree_lock);
1686 			continue;
1687 		}
1688 		ne = grab_nat_entry(nm_i, nid);
1689 		if (!ne) {
1690 			write_unlock(&nm_i->nat_tree_lock);
1691 			goto retry;
1692 		}
1693 		nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
1694 		nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
1695 		nat_set_version(ne, raw_ne.version);
1696 		__set_nat_cache_dirty(nm_i, ne);
1697 		write_unlock(&nm_i->nat_tree_lock);
1698 	}
1699 	update_nats_in_cursum(sum, -i);
1700 	mutex_unlock(&curseg->curseg_mutex);
1701 	return true;
1702 }
1703 
1704 /*
1705  * This function is called during the checkpointing process.
1706  */
1707 void flush_nat_entries(struct f2fs_sb_info *sbi)
1708 {
1709 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1710 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1711 	struct f2fs_summary_block *sum = curseg->sum_blk;
1712 	struct list_head *cur, *n;
1713 	struct page *page = NULL;
1714 	struct f2fs_nat_block *nat_blk = NULL;
1715 	nid_t start_nid = 0, end_nid = 0;
1716 	bool flushed;
1717 
1718 	flushed = flush_nats_in_journal(sbi);
1719 
1720 	if (!flushed)
1721 		mutex_lock(&curseg->curseg_mutex);
1722 
1723 	/* 1) flush dirty nat caches */
1724 	list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
1725 		struct nat_entry *ne;
1726 		nid_t nid;
1727 		struct f2fs_nat_entry raw_ne;
1728 		int offset = -1;
1729 		block_t new_blkaddr;
1730 
1731 		ne = list_entry(cur, struct nat_entry, list);
1732 		nid = nat_get_nid(ne);
1733 
1734 		if (nat_get_blkaddr(ne) == NEW_ADDR)
1735 			continue;
1736 		if (flushed)
1737 			goto to_nat_page;
1738 
1739 		/* if there is room for nat entries in the curseg summary page */
1740 		offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
1741 		if (offset >= 0) {
1742 			raw_ne = nat_in_journal(sum, offset);
1743 			goto flush_now;
1744 		}
1745 to_nat_page:
1746 		if (!page || (start_nid > nid || nid > end_nid)) {
1747 			if (page) {
1748 				f2fs_put_page(page, 1);
1749 				page = NULL;
1750 			}
1751 			start_nid = START_NID(nid);
1752 			end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
1753 
1754 			/*
1755 			 * get the nat block page: dirtied, with an increased
1756 			 * reference count, mapped and locked
1757 			 */
1758 			page = get_next_nat_page(sbi, start_nid);
1759 			nat_blk = page_address(page);
1760 		}
1761 
1762 		f2fs_bug_on(!nat_blk);
1763 		raw_ne = nat_blk->entries[nid - start_nid];
1764 flush_now:
1765 		new_blkaddr = nat_get_blkaddr(ne);
1766 
1767 		raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
1768 		raw_ne.block_addr = cpu_to_le32(new_blkaddr);
1769 		raw_ne.version = nat_get_version(ne);
1770 
1771 		if (offset < 0) {
1772 			nat_blk->entries[nid - start_nid] = raw_ne;
1773 		} else {
1774 			nat_in_journal(sum, offset) = raw_ne;
1775 			nid_in_journal(sum, offset) = cpu_to_le32(nid);
1776 		}
1777 
1778 		if (nat_get_blkaddr(ne) == NULL_ADDR &&
1779 				add_free_nid(NM_I(sbi), nid, false) <= 0) {
1780 			write_lock(&nm_i->nat_tree_lock);
1781 			__del_from_nat_cache(nm_i, ne);
1782 			write_unlock(&nm_i->nat_tree_lock);
1783 		} else {
1784 			write_lock(&nm_i->nat_tree_lock);
1785 			__clear_nat_cache_dirty(nm_i, ne);
1786 			ne->checkpointed = true;
1787 			write_unlock(&nm_i->nat_tree_lock);
1788 		}
1789 	}
1790 	if (!flushed)
1791 		mutex_unlock(&curseg->curseg_mutex);
1792 	f2fs_put_page(page, 1);
1793 
1794 	/* 2) shrink nat caches if necessary */
1795 	try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
1796 }
1797 
1798 static int init_node_manager(struct f2fs_sb_info *sbi)
1799 {
1800 	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
1801 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1802 	unsigned char *version_bitmap;
1803 	unsigned int nat_segs, nat_blocks;
1804 
1805 	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
1806 
1807 	/* segment_count_nat includes pair segments, so divide by 2 */
1808 	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
1809 	nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
1810 	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
1811 	nm_i->fcnt = 0;
1812 	nm_i->nat_cnt = 0;
1813 
1814 	INIT_LIST_HEAD(&nm_i->free_nid_list);
1815 	INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1816 	INIT_LIST_HEAD(&nm_i->nat_entries);
1817 	INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1818 
1819 	mutex_init(&nm_i->build_lock);
1820 	spin_lock_init(&nm_i->free_nid_list_lock);
1821 	rwlock_init(&nm_i->nat_tree_lock);
1822 
1823 	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
1824 	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
1825 	version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
1826 	if (!version_bitmap)
1827 		return -EFAULT;
1828 
1829 	nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
1830 					GFP_KERNEL);
1831 	if (!nm_i->nat_bitmap)
1832 		return -ENOMEM;
1833 	return 0;
1834 }
1835 
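/*
 * Illustrative note, not part of the original file: a worked example of the
 * max_nid calculation in init_node_manager(), assuming the default geometry
 * (4KB blocks, log_blocks_per_seg = 9, so 512 blocks per segment) where
 * NAT_ENTRY_PER_BLOCK works out to 4096 / 9 = 455.  With a segment_count_nat
 * of 2 (one pair): nat_segs = 1, nat_blocks = 512, and
 * max_nid = 455 * 512 = 232960 addressable node ids.
 */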
1836 int build_node_manager(struct f2fs_sb_info *sbi)
1837 {
1838 	int err;
1839 
1840 	sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
1841 	if (!sbi->nm_info)
1842 		return -ENOMEM;
1843 
1844 	err = init_node_manager(sbi);
1845 	if (err)
1846 		return err;
1847 
1848 	build_free_nids(sbi);
1849 	return 0;
1850 }
1851 
1852 void destroy_node_manager(struct f2fs_sb_info *sbi)
1853 {
1854 	struct f2fs_nm_info *nm_i = NM_I(sbi);
1855 	struct free_nid *i, *next_i;
1856 	struct nat_entry *natvec[NATVEC_SIZE];
1857 	nid_t nid = 0;
1858 	unsigned int found;
1859 
1860 	if (!nm_i)
1861 		return;
1862 
1863 	/* destroy free nid list */
1864 	spin_lock(&nm_i->free_nid_list_lock);
1865 	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
1866 		f2fs_bug_on(i->state == NID_ALLOC);
1867 		__del_from_free_nid_list(i);
1868 		nm_i->fcnt--;
1869 	}
1870 	f2fs_bug_on(nm_i->fcnt);
1871 	spin_unlock(&nm_i->free_nid_list_lock);
1872 
1873 	/* destroy nat cache */
1874 	write_lock(&nm_i->nat_tree_lock);
1875 	while ((found = __gang_lookup_nat_cache(nm_i,
1876 					nid, NATVEC_SIZE, natvec))) {
1877 		unsigned idx;
1878 		for (idx = 0; idx < found; idx++) {
1879 			struct nat_entry *e = natvec[idx];
1880 			nid = nat_get_nid(e) + 1;
1881 			__del_from_nat_cache(nm_i, e);
1882 		}
1883 	}
1884 	f2fs_bug_on(nm_i->nat_cnt);
1885 	write_unlock(&nm_i->nat_tree_lock);
1886 
1887 	kfree(nm_i->nat_bitmap);
1888 	sbi->nm_info = NULL;
1889 	kfree(nm_i);
1890 }
1891 
1892 int __init create_node_manager_caches(void)
1893 {
1894 	nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1895 			sizeof(struct nat_entry), NULL);
1896 	if (!nat_entry_slab)
1897 		return -ENOMEM;
1898 
1899 	free_nid_slab = f2fs_kmem_cache_create("free_nid",
1900 			sizeof(struct free_nid), NULL);
1901 	if (!free_nid_slab) {
1902 		kmem_cache_destroy(nat_entry_slab);
1903 		return -ENOMEM;
1904 	}
1905 	return 0;
1906 }
1907 
1908 void destroy_node_manager_caches(void)
1909 {
1910 	kmem_cache_destroy(free_nid_slab);
1911 	kmem_cache_destroy(nat_entry_slab);
1912 }
1913