xref: /openbmc/linux/fs/f2fs/checkpoint.c (revision 275876e2)
/*
 * fs/f2fs/checkpoint.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/f2fs_fs.h>
#include <linux/pagevec.h>
#include <linux/swap.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include <trace/events/f2fs.h>

static struct kmem_cache *ino_entry_slab;
static struct kmem_cache *inode_entry_slab;

/*
 * We guarantee no failure on the returned page.
 */
struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
	struct address_space *mapping = META_MAPPING(sbi);
	struct page *page = NULL;
repeat:
	page = grab_cache_page(mapping, index);
	if (!page) {
		cond_resched();
		goto repeat;
	}
	f2fs_wait_on_page_writeback(page, META);
	SetPageUptodate(page);
	return page;
}

/*
 * We guarantee no failure on the returned page.
 */
struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
	struct address_space *mapping = META_MAPPING(sbi);
	struct page *page;
repeat:
	page = grab_cache_page(mapping, index);
	if (!page) {
		cond_resched();
		goto repeat;
	}
	if (PageUptodate(page))
		goto out;

	if (f2fs_submit_page_bio(sbi, page, index,
				READ_SYNC | REQ_META | REQ_PRIO))
		goto repeat;

	lock_page(page);
	if (unlikely(page->mapping != mapping)) {
		f2fs_put_page(page, 1);
		goto repeat;
	}
out:
	return page;
}

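/*
 * Return the number of valid meta blocks for the given area type.
 * The NAT and SIT areas have a block count derived from max_nid and
 * SIT_BLK_CNT(); SSA and CP return 0, meaning the readahead loop below
 * does not enforce an upper bound for them.
 */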
static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
{
	switch (type) {
	case META_NAT:
		return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
	case META_SIT:
		return SIT_BLK_CNT(sbi);
	case META_SSA:
	case META_CP:
		return 0;
	default:
		BUG();
	}
}

/*
 * Readahead CP/NAT/SIT/SSA pages
 */
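/*
 * Example from this file: recover_orphan_inodes() calls
 * ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP) to prefetch the
 * orphan blocks of the current CP pack before reading them one by one
 * with get_meta_page().
 */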
int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
{
	block_t prev_blk_addr = 0;
	struct page *page;
	int blkno = start;
	int max_blks = get_max_meta_blks(sbi, type);

	struct f2fs_io_info fio = {
		.type = META,
		.rw = READ_SYNC | REQ_META | REQ_PRIO
	};

	for (; nrpages-- > 0; blkno++) {
		block_t blk_addr;

		switch (type) {
		case META_NAT:
			/* get nat block addr */
			if (unlikely(blkno >= max_blks))
				blkno = 0;
			blk_addr = current_nat_addr(sbi,
					blkno * NAT_ENTRY_PER_BLOCK);
			break;
		case META_SIT:
			/* get sit block addr */
			if (unlikely(blkno >= max_blks))
				goto out;
			blk_addr = current_sit_addr(sbi,
					blkno * SIT_ENTRY_PER_BLOCK);
			if (blkno != start && prev_blk_addr + 1 != blk_addr)
				goto out;
			prev_blk_addr = blk_addr;
			break;
		case META_SSA:
		case META_CP:
			/* get ssa/cp block addr */
			blk_addr = blkno;
			break;
		default:
			BUG();
		}

		page = grab_cache_page(META_MAPPING(sbi), blk_addr);
		if (!page)
			continue;
		if (PageUptodate(page)) {
			f2fs_put_page(page, 1);
			continue;
		}

		f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
		f2fs_put_page(page, 0);
	}
out:
	f2fs_submit_merged_bio(sbi, META, READ);
	return blkno - start;
}

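/*
 * writepage callback for the meta address space: write a single dirty
 * meta page, or redirty it while recovery (POR) is in progress or when
 * called for reclaim.
 */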
static int f2fs_write_meta_page(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);

	trace_f2fs_writepage(page, META);

	if (unlikely(sbi->por_doing))
		goto redirty_out;
	if (wbc->for_reclaim)
		goto redirty_out;

	/* should not write any meta pages if an IO error has occurred */
	if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
		goto no_write;

	f2fs_wait_on_page_writeback(page, META);
	write_meta_page(sbi, page);
no_write:
	dec_page_count(sbi, F2FS_DIRTY_META);
	unlock_page(page);
	return 0;

redirty_out:
	redirty_page_for_writepage(wbc, page);
	return AOP_WRITEPAGE_ACTIVATE;
}

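/*
 * writepages callback for the meta address space: batch dirty meta pages
 * and write them together under cp_mutex, skipping the flush when only a
 * few pages are dirty or when kupdate-style writeback is requested.
 */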
static int f2fs_write_meta_pages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
	long diff, written;

	trace_f2fs_writepages(mapping->host, wbc, META);

	/* collect a number of dirty meta pages and write them together */
	if (wbc->for_kupdate ||
		get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
		goto skip_write;

	/* if mounting failed, skip writing node pages */
	mutex_lock(&sbi->cp_mutex);
	diff = nr_pages_to_write(sbi, META, wbc);
	written = sync_meta_pages(sbi, META, wbc->nr_to_write);
	mutex_unlock(&sbi->cp_mutex);
	wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
	return 0;

skip_write:
	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
	return 0;
}

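/*
 * Walk the meta mapping with a pagevec, writing back up to nr_to_write
 * dirty meta pages, and submit the merged bio of the given type when
 * anything was written.  Returns the number of pages written.
 */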
long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
						long nr_to_write)
{
	struct address_space *mapping = META_MAPPING(sbi);
	pgoff_t index = 0, end = LONG_MAX;
	struct pagevec pvec;
	long nwritten = 0;
	struct writeback_control wbc = {
		.for_reclaim = 0,
	};

	pagevec_init(&pvec, 0);

	while (index <= end) {
		int i, nr_pages;
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
				PAGECACHE_TAG_DIRTY,
				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (unlikely(nr_pages == 0))
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);

			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}
			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			if (f2fs_write_meta_page(page, &wbc)) {
				unlock_page(page);
				break;
			}
			nwritten++;
			if (unlikely(nwritten >= nr_to_write))
				break;
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	if (nwritten)
		f2fs_submit_merged_bio(sbi, type, WRITE);

	return nwritten;
}

static int f2fs_set_meta_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);

	trace_f2fs_set_page_dirty(page, META);

	SetPageUptodate(page);
	if (!PageDirty(page)) {
		__set_page_dirty_nobuffers(page);
		inc_page_count(sbi, F2FS_DIRTY_META);
		return 1;
	}
	return 0;
}

const struct address_space_operations f2fs_meta_aops = {
	.writepage	= f2fs_write_meta_page,
	.writepages	= f2fs_write_meta_pages,
	.set_page_dirty	= f2fs_set_meta_page_dirty,
};

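/*
 * Track an ino of the given type (such as ORPHAN_INO, APPEND_INO or
 * UPDATE_INO) in both the per-type radix tree and list.  Allocation and
 * insertion are retried until they succeed, so callers may assume the
 * entry exists on return.
 */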
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	struct ino_entry *e;
retry:
	spin_lock(&sbi->ino_lock[type]);

	e = radix_tree_lookup(&sbi->ino_root[type], ino);
	if (!e) {
		e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
		if (!e) {
			spin_unlock(&sbi->ino_lock[type]);
			goto retry;
		}
		if (radix_tree_insert(&sbi->ino_root[type], ino, e)) {
			spin_unlock(&sbi->ino_lock[type]);
			kmem_cache_free(ino_entry_slab, e);
			goto retry;
		}
		memset(e, 0, sizeof(struct ino_entry));
		e->ino = ino;

		list_add_tail(&e->list, &sbi->ino_list[type]);
	}
	spin_unlock(&sbi->ino_lock[type]);
}

static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	struct ino_entry *e;

	spin_lock(&sbi->ino_lock[type]);
	e = radix_tree_lookup(&sbi->ino_root[type], ino);
	if (e) {
		list_del(&e->list);
		radix_tree_delete(&sbi->ino_root[type], ino);
		if (type == ORPHAN_INO)
			sbi->n_orphans--;
		spin_unlock(&sbi->ino_lock[type]);
		kmem_cache_free(ino_entry_slab, e);
		return;
	}
	spin_unlock(&sbi->ino_lock[type]);
}

void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* add new dirty ino entry into list */
	__add_ino_entry(sbi, ino, type);
}

void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* remove dirty ino entry from list */
	__remove_ino_entry(sbi, ino, type);
}

/* mode should be APPEND_INO or UPDATE_INO */
bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
	struct ino_entry *e;
	spin_lock(&sbi->ino_lock[mode]);
	e = radix_tree_lookup(&sbi->ino_root[mode], ino);
	spin_unlock(&sbi->ino_lock[mode]);
	return e ? true : false;
}

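/*
 * Drop all APPEND_INO and UPDATE_INO entries; called from do_checkpoint()
 * once the CP pack has been written without the error flag set.
 */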
static void release_dirty_inode(struct f2fs_sb_info *sbi)
{
	struct ino_entry *e, *tmp;
	int i;

	for (i = APPEND_INO; i <= UPDATE_INO; i++) {
		spin_lock(&sbi->ino_lock[i]);
		list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) {
			list_del(&e->list);
			radix_tree_delete(&sbi->ino_root[i], e->ino);
			kmem_cache_free(ino_entry_slab, e);
		}
		spin_unlock(&sbi->ino_lock[i]);
	}
}

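/*
 * Reserve one slot in the orphan list; fails with -ENOSPC once
 * max_orphans entries are in use.
 */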
int acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
	int err = 0;

	spin_lock(&sbi->ino_lock[ORPHAN_INO]);
	if (unlikely(sbi->n_orphans >= sbi->max_orphans))
		err = -ENOSPC;
	else
		sbi->n_orphans++;
	spin_unlock(&sbi->ino_lock[ORPHAN_INO]);

	return err;
}

void release_orphan_inode(struct f2fs_sb_info *sbi)
{
	spin_lock(&sbi->ino_lock[ORPHAN_INO]);
	f2fs_bug_on(sbi->n_orphans == 0);
	sbi->n_orphans--;
	spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
}

void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	/* add new orphan ino entry into list */
	__add_ino_entry(sbi, ino, ORPHAN_INO);
}

void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	/* remove orphan entry from orphan list */
	__remove_ino_entry(sbi, ino, ORPHAN_INO);
}

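/* drop the last link of an orphan ino so that iput() truncates and frees it */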
static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	struct inode *inode = f2fs_iget(sbi->sb, ino);
	f2fs_bug_on(IS_ERR(inode));
	clear_nlink(inode);

	/* truncate all the data during iput */
	iput(inode);
}

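/*
 * During mount, walk the orphan blocks recorded in the current CP pack,
 * reclaim every inode listed there and then clear the orphan flag.
 */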
void recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
	block_t start_blk, orphan_blkaddr, i, j;

	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
		return;

	sbi->por_doing = true;

	start_blk = __start_cp_addr(sbi) + 1 +
		le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
	orphan_blkaddr = __start_sum_addr(sbi) - 1;

	ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);

	for (i = 0; i < orphan_blkaddr; i++) {
		struct page *page = get_meta_page(sbi, start_blk + i);
		struct f2fs_orphan_block *orphan_blk;

		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
		for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
			nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
			recover_orphan_inode(sbi, ino);
		}
		f2fs_put_page(page, 1);
	}
	/* clear Orphan Flag */
	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
	sbi->por_doing = false;
	return;
}

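/*
 * Pack the in-memory orphan ino list into f2fs_orphan_block meta pages
 * starting at start_blk; each block records its index, the total block
 * count and its own entry count.
 */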
static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
{
	struct list_head *head;
	struct f2fs_orphan_block *orphan_blk = NULL;
	unsigned int nentries = 0;
	unsigned short index;
	unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
		(F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
	struct page *page = NULL;
	struct ino_entry *orphan = NULL;

	for (index = 0; index < orphan_blocks; index++)
		grab_meta_page(sbi, start_blk + index);

	index = 1;
	spin_lock(&sbi->ino_lock[ORPHAN_INO]);
	head = &sbi->ino_list[ORPHAN_INO];

	/* loop over each orphan inode entry and write it into the journal block */
	list_for_each_entry(orphan, head, list) {
		if (!page) {
			page = find_get_page(META_MAPPING(sbi), start_blk++);
			f2fs_bug_on(!page);
			orphan_blk =
				(struct f2fs_orphan_block *)page_address(page);
			memset(orphan_blk, 0, sizeof(*orphan_blk));
			f2fs_put_page(page, 0);
		}

		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);

		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
			/*
			 * when an orphan block is full of 1020 entries,
			 * we need to flush the current orphan block
			 * and bring another one into memory
			 */
			orphan_blk->blk_addr = cpu_to_le16(index);
			orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
			orphan_blk->entry_count = cpu_to_le32(nentries);
			set_page_dirty(page);
			f2fs_put_page(page, 1);
			index++;
			nentries = 0;
			page = NULL;
		}
	}

	if (page) {
		orphan_blk->blk_addr = cpu_to_le16(index);
		orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
		orphan_blk->entry_count = cpu_to_le32(nentries);
		set_page_dirty(page);
		f2fs_put_page(page, 1);
	}

	spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
}

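/*
 * Read the first and last blocks of a CP pack, verify both CRCs and make
 * sure the two version numbers match.  On success the first cp page is
 * returned with *version set; otherwise NULL.
 */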
static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
				block_t cp_addr, unsigned long long *version)
{
	struct page *cp_page_1, *cp_page_2 = NULL;
	unsigned long blk_size = sbi->blocksize;
	struct f2fs_checkpoint *cp_block;
	unsigned long long cur_version = 0, pre_version = 0;
	size_t crc_offset;
	__u32 crc = 0;

	/* Read the 1st cp block in this CP pack */
	cp_page_1 = get_meta_page(sbi, cp_addr);

	/* get the version number */
	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
	crc_offset = le32_to_cpu(cp_block->checksum_offset);
	if (crc_offset >= blk_size)
		goto invalid_cp1;

	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
		goto invalid_cp1;

	pre_version = cur_cp_version(cp_block);

	/* Read the 2nd cp block in this CP pack */
	cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
	cp_page_2 = get_meta_page(sbi, cp_addr);

	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
	crc_offset = le32_to_cpu(cp_block->checksum_offset);
	if (crc_offset >= blk_size)
		goto invalid_cp2;

	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
		goto invalid_cp2;

	cur_version = cur_cp_version(cp_block);

	if (cur_version == pre_version) {
		*version = cur_version;
		f2fs_put_page(cp_page_2, 1);
		return cp_page_1;
	}
invalid_cp2:
	f2fs_put_page(cp_page_2, 1);
invalid_cp1:
	f2fs_put_page(cp_page_1, 1);
	return NULL;
}

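/*
 * Validate both CP packs and keep the newer valid one in sbi->ckpt,
 * including any extra cp payload blocks that follow the checkpoint block.
 */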
int get_valid_checkpoint(struct f2fs_sb_info *sbi)
{
	struct f2fs_checkpoint *cp_block;
	struct f2fs_super_block *fsb = sbi->raw_super;
	struct page *cp1, *cp2, *cur_page;
	unsigned long blk_size = sbi->blocksize;
	unsigned long long cp1_version = 0, cp2_version = 0;
	unsigned long long cp_start_blk_no;
	unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
	block_t cp_blk_no;
	int i;

	sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL);
	if (!sbi->ckpt)
		return -ENOMEM;
	/*
	 * Finding the valid cp block involves reading both
	 * sets (cp pack 1 and cp pack 2)
	 */
	cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);

	/* The second checkpoint pack should start at the next segment */
	cp_start_blk_no += ((unsigned long long)1) <<
				le32_to_cpu(fsb->log_blocks_per_seg);
	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);

	if (cp1 && cp2) {
		if (ver_after(cp2_version, cp1_version))
			cur_page = cp2;
		else
			cur_page = cp1;
	} else if (cp1) {
		cur_page = cp1;
	} else if (cp2) {
		cur_page = cp2;
	} else {
		goto fail_no_cp;
	}

	cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
	memcpy(sbi->ckpt, cp_block, blk_size);

	if (cp_blks <= 1)
		goto done;

	cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
	if (cur_page == cp2)
		cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);

	for (i = 1; i < cp_blks; i++) {
		void *sit_bitmap_ptr;
		unsigned char *ckpt = (unsigned char *)sbi->ckpt;

		cur_page = get_meta_page(sbi, cp_blk_no + i);
		sit_bitmap_ptr = page_address(cur_page);
		memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
		f2fs_put_page(cur_page, 1);
	}
done:
	f2fs_put_page(cp1, 1);
	f2fs_put_page(cp2, 1);
	return 0;

fail_no_cp:
	kfree(sbi->ckpt);
	return -EINVAL;
}

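/*
 * Link a directory inode into sbi->dir_inode_list; returns -EEXIST if the
 * inode is already tracked as a dirty dir.  Called with dir_inode_lock held.
 */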
static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);

	if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
		return -EEXIST;

	set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
	F2FS_I(inode)->dirty_dir = new;
	list_add_tail(&new->list, &sbi->dir_inode_list);
	stat_inc_dirty_dir(sbi);
	return 0;
}

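/*
 * Called when a dentry page of a directory becomes dirty: bump the dirty
 * dent count, mark the page private and make sure the directory is on the
 * dirty dir list.
 */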
void set_dirty_dir_page(struct inode *inode, struct page *page)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	struct dir_inode_entry *new;
	int ret = 0;

	if (!S_ISDIR(inode->i_mode))
		return;

	new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
	new->inode = inode;
	INIT_LIST_HEAD(&new->list);

	spin_lock(&sbi->dir_inode_lock);
	ret = __add_dirty_inode(inode, new);
	inode_inc_dirty_dents(inode);
	SetPagePrivate(page);
	spin_unlock(&sbi->dir_inode_lock);

	if (ret)
		kmem_cache_free(inode_entry_slab, new);
}

void add_dirty_dir_inode(struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	struct dir_inode_entry *new =
			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
	int ret = 0;

	new->inode = inode;
	INIT_LIST_HEAD(&new->list);

	spin_lock(&sbi->dir_inode_lock);
	ret = __add_dirty_inode(inode, new);
	spin_unlock(&sbi->dir_inode_lock);

	if (ret)
		kmem_cache_free(inode_entry_slab, new);
}

void remove_dirty_dir_inode(struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	struct dir_inode_entry *entry;

	if (!S_ISDIR(inode->i_mode))
		return;

	spin_lock(&sbi->dir_inode_lock);
	if (get_dirty_dents(inode) ||
			!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
		spin_unlock(&sbi->dir_inode_lock);
		return;
	}

	entry = F2FS_I(inode)->dirty_dir;
	list_del(&entry->list);
	F2FS_I(inode)->dirty_dir = NULL;
	clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
	stat_dec_dirty_dir(sbi);
	spin_unlock(&sbi->dir_inode_lock);
	kmem_cache_free(inode_entry_slab, entry);

	/* Only from the recovery routine */
	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
		iput(inode);
	}
}

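/*
 * Write back the dentry pages of every directory on the dirty dir list,
 * retrying until the list becomes empty.
 */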
void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
{
	struct list_head *head;
	struct dir_inode_entry *entry;
	struct inode *inode;
retry:
	spin_lock(&sbi->dir_inode_lock);

	head = &sbi->dir_inode_list;
	if (list_empty(head)) {
		spin_unlock(&sbi->dir_inode_lock);
		return;
	}
	entry = list_entry(head->next, struct dir_inode_entry, list);
	inode = igrab(entry->inode);
	spin_unlock(&sbi->dir_inode_lock);
	if (inode) {
		filemap_fdatawrite(inode->i_mapping);
		iput(inode);
	} else {
		/*
		 * We should submit the bio, since there may be several
		 * dentry pages under writeback in the inode being freed.
		 */
		f2fs_submit_merged_bio(sbi, DATA, WRITE);
	}
	goto retry;
}

/*
 * Freeze all the FS-operations for checkpoint.
 */
static void block_operations(struct f2fs_sb_info *sbi)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.for_reclaim = 0,
	};
	struct blk_plug plug;

	blk_start_plug(&plug);

retry_flush_dents:
	f2fs_lock_all(sbi);
	/* write all the dirty dentry pages */
	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
		f2fs_unlock_all(sbi);
		sync_dirty_dir_inodes(sbi);
		goto retry_flush_dents;
	}

	/*
	 * POR: we should ensure that there are no dirty node pages
	 * until the nat/sit flush is finished.
	 */
retry_flush_nodes:
	down_write(&sbi->node_write);

	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
		up_write(&sbi->node_write);
		sync_node_pages(sbi, 0, &wbc);
		goto retry_flush_nodes;
	}
	blk_finish_plug(&plug);
}

static void unblock_operations(struct f2fs_sb_info *sbi)
{
	up_write(&sbi->node_write);
	f2fs_unlock_all(sbi);
}

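/* sleep on cp_wait until no pages of this sb are under writeback */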
static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);

		if (!get_pages(sbi, F2FS_WRITEBACK))
			break;

		io_schedule();
	}
	finish_wait(&sbi->cp_wait, &wait);
}

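/*
 * Build and write one CP pack: the checkpoint block (plus cp payload
 * blocks), orphan blocks, data summaries, node summaries on umount, and
 * finally a second copy of the checkpoint block that commits the pack.
 */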
static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
	nid_t last_nid = 0;
	block_t start_blk;
	struct page *cp_page;
	unsigned int data_sum_blocks, orphan_blocks;
	__u32 crc32 = 0;
	void *kaddr;
	int i;
	int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);

	/*
	 * This avoids conducting wrong roll-forward operations and uses
	 * meta pages, so it should be called prior to sync_meta_pages below.
	 */
	discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));

	/* Flush all the NAT/SIT pages */
	while (get_pages(sbi, F2FS_DIRTY_META))
		sync_meta_pages(sbi, META, LONG_MAX);

	next_free_nid(sbi, &last_nid);

	/*
	 * modify checkpoint
	 * version number is already updated
	 */
	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
	for (i = 0; i < 3; i++) {
		ckpt->cur_node_segno[i] =
			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
		ckpt->cur_node_blkoff[i] =
			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
		ckpt->alloc_type[i + CURSEG_HOT_NODE] =
				curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
	}
	for (i = 0; i < 3; i++) {
		ckpt->cur_data_segno[i] =
			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
		ckpt->cur_data_blkoff[i] =
			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
		ckpt->alloc_type[i + CURSEG_HOT_DATA] =
				curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
	}

	ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
	ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
	ckpt->next_free_nid = cpu_to_le32(last_nid);

	/* 2 cp + n data seg summary + orphan inode blocks */
	data_sum_blocks = npages_for_summary_flush(sbi);
	if (data_sum_blocks < 3)
		set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	else
		clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);

	orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
					/ F2FS_ORPHANS_PER_BLOCK;
	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
			orphan_blocks);

	if (is_umount) {
		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
		ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks + NR_CURSEG_NODE_TYPE);
	} else {
		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
		ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks);
	}

	if (sbi->n_orphans)
		set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
	else
		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);

	/* update SIT/NAT bitmap */
	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));

	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
	*((__le32 *)((unsigned char *)ckpt +
				le32_to_cpu(ckpt->checksum_offset)))
				= cpu_to_le32(crc32);

	start_blk = __start_cp_addr(sbi);

	/* write out the checkpoint buffer at block 0 */
	cp_page = grab_meta_page(sbi, start_blk++);
	kaddr = page_address(cp_page);
	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
	set_page_dirty(cp_page);
	f2fs_put_page(cp_page, 1);

	for (i = 1; i < 1 + cp_payload_blks; i++) {
		cp_page = grab_meta_page(sbi, start_blk++);
		kaddr = page_address(cp_page);
		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
				(1 << sbi->log_blocksize));
		set_page_dirty(cp_page);
		f2fs_put_page(cp_page, 1);
	}

	if (sbi->n_orphans) {
		write_orphan_inodes(sbi, start_blk);
		start_blk += orphan_blocks;
	}

	write_data_summaries(sbi, start_blk);
	start_blk += data_sum_blocks;
	if (is_umount) {
		write_node_summaries(sbi, start_blk);
		start_blk += NR_CURSEG_NODE_TYPE;
	}

	/* write out the trailing checkpoint block */
	cp_page = grab_meta_page(sbi, start_blk);
	kaddr = page_address(cp_page);
	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
	set_page_dirty(cp_page);
	f2fs_put_page(cp_page, 1);

	/* wait for writeback of previously submitted node/meta pages */
	wait_on_all_pages_writeback(sbi);

	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
	filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);

	/* update user_block_counts */
	sbi->last_valid_block_count = sbi->total_valid_block_count;
	sbi->alloc_valid_block_count = 0;

	/* Here, we have only one bio containing the CP pack */
	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);

	if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
		clear_prefree_segments(sbi);
		release_dirty_inode(sbi);
		F2FS_RESET_SB_DIRT(sbi);
	}
}

/*
 * We guarantee that this checkpoint procedure will not fail.
 */
void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	unsigned long long ckpt_ver;

	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");

	mutex_lock(&sbi->cp_mutex);
	block_operations(sbi);

	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");

	f2fs_submit_merged_bio(sbi, DATA, WRITE);
	f2fs_submit_merged_bio(sbi, NODE, WRITE);
	f2fs_submit_merged_bio(sbi, META, WRITE);

	/*
	 * update checkpoint pack index
	 * Increase the version number so that
	 * SIT entries and seg summaries are written in the correct place
	 */
	ckpt_ver = cur_cp_version(ckpt);
	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);

	/* write cached NAT/SIT entries to NAT/SIT area */
	flush_nat_entries(sbi);
	flush_sit_entries(sbi);

	/* unlock all the fs_lock[] in do_checkpoint() */
	do_checkpoint(sbi, is_umount);

	unblock_operations(sbi);
	mutex_unlock(&sbi->cp_mutex);

	stat_inc_cp_count(sbi->stat_info);
	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
}

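/* set up the per-type ino radix trees/lists and the orphan limit */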
void init_ino_entry_info(struct f2fs_sb_info *sbi)
{
	int i;

	for (i = 0; i < MAX_INO_ENTRY; i++) {
		INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC);
		spin_lock_init(&sbi->ino_lock[i]);
		INIT_LIST_HEAD(&sbi->ino_list[i]);
	}

	/*
	 * Considering 512 blocks in a segment, 8 blocks are needed for cp
	 * and log segment summaries. The remaining blocks are used to keep
	 * orphan entries; with the limitation of one reserved segment
	 * for the cp pack, we can have at most 1020 * 504 orphan entries.
	 */
	sbi->n_orphans = 0;
	sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
				* F2FS_ORPHANS_PER_BLOCK;
}

int __init create_checkpoint_caches(void)
{
	ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
			sizeof(struct ino_entry));
	if (!ino_entry_slab)
		return -ENOMEM;
	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
			sizeof(struct dir_inode_entry));
	if (!inode_entry_slab) {
		kmem_cache_destroy(ino_entry_slab);
		return -ENOMEM;
	}
	return 0;
}

void destroy_checkpoint_caches(void)
{
	kmem_cache_destroy(ino_entry_slab);
	kmem_cache_destroy(inode_entry_slab);
}