xref: /openbmc/linux/fs/f2fs/checkpoint.c (revision 867a0e05)
1 /*
2  * fs/f2fs/checkpoint.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/bio.h>
13 #include <linux/mpage.h>
14 #include <linux/writeback.h>
15 #include <linux/blkdev.h>
16 #include <linux/f2fs_fs.h>
17 #include <linux/pagevec.h>
18 #include <linux/swap.h>
19 
20 #include "f2fs.h"
21 #include "node.h"
22 #include "segment.h"
23 #include <trace/events/f2fs.h>
24 
/* Slab cache for struct ino_entry objects (see create_checkpoint_caches()). */
static struct kmem_cache *ino_entry_slab;
/* Slab cache for struct dir_inode_entry objects tracking dirty directories. */
static struct kmem_cache *inode_entry_slab;
27 
28 /*
29  * We guarantee no failure on the returned page.
30  */
31 struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
32 {
33 	struct address_space *mapping = META_MAPPING(sbi);
34 	struct page *page = NULL;
35 repeat:
36 	page = grab_cache_page(mapping, index);
37 	if (!page) {
38 		cond_resched();
39 		goto repeat;
40 	}
41 	f2fs_wait_on_page_writeback(page, META);
42 	SetPageUptodate(page);
43 	return page;
44 }
45 
46 /*
47  * We guarantee no failure on the returned page.
48  */
49 struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
50 {
51 	struct address_space *mapping = META_MAPPING(sbi);
52 	struct page *page;
53 repeat:
54 	page = grab_cache_page(mapping, index);
55 	if (!page) {
56 		cond_resched();
57 		goto repeat;
58 	}
59 	if (PageUptodate(page))
60 		goto out;
61 
62 	if (f2fs_submit_page_bio(sbi, page, index,
63 				READ_SYNC | REQ_META | REQ_PRIO))
64 		goto repeat;
65 
66 	lock_page(page);
67 	if (unlikely(page->mapping != mapping)) {
68 		f2fs_put_page(page, 1);
69 		goto repeat;
70 	}
71 out:
72 	return page;
73 }
74 
75 static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi,
76 						block_t blkaddr, int type)
77 {
78 	switch (type) {
79 	case META_NAT:
80 		break;
81 	case META_SIT:
82 		if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
83 			return false;
84 		break;
85 	case META_SSA:
86 		if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
87 			blkaddr < SM_I(sbi)->ssa_blkaddr))
88 			return false;
89 		break;
90 	case META_CP:
91 		if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
92 			blkaddr < __start_cp_addr(sbi)))
93 			return false;
94 		break;
95 	case META_POR:
96 		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
97 			blkaddr < MAIN_BLKADDR(sbi)))
98 			return false;
99 		break;
100 	default:
101 		BUG();
102 	}
103 
104 	return true;
105 }
106 
107 /*
108  * Readahead CP/NAT/SIT/SSA pages
109  */
110 int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
111 {
112 	block_t prev_blk_addr = 0;
113 	struct page *page;
114 	block_t blkno = start;
115 
116 	struct f2fs_io_info fio = {
117 		.type = META,
118 		.rw = READ_SYNC | REQ_META | REQ_PRIO
119 	};
120 
121 	for (; nrpages-- > 0; blkno++) {
122 		block_t blk_addr;
123 
124 		if (!is_valid_blkaddr(sbi, blkno, type))
125 			goto out;
126 
127 		switch (type) {
128 		case META_NAT:
129 			if (unlikely(blkno >=
130 					NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
131 				blkno = 0;
132 			/* get nat block addr */
133 			blk_addr = current_nat_addr(sbi,
134 					blkno * NAT_ENTRY_PER_BLOCK);
135 			break;
136 		case META_SIT:
137 			/* get sit block addr */
138 			blk_addr = current_sit_addr(sbi,
139 					blkno * SIT_ENTRY_PER_BLOCK);
140 			if (blkno != start && prev_blk_addr + 1 != blk_addr)
141 				goto out;
142 			prev_blk_addr = blk_addr;
143 			break;
144 		case META_SSA:
145 		case META_CP:
146 		case META_POR:
147 			blk_addr = blkno;
148 			break;
149 		default:
150 			BUG();
151 		}
152 
153 		page = grab_cache_page(META_MAPPING(sbi), blk_addr);
154 		if (!page)
155 			continue;
156 		if (PageUptodate(page)) {
157 			f2fs_put_page(page, 1);
158 			continue;
159 		}
160 
161 		f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
162 		f2fs_put_page(page, 0);
163 	}
164 out:
165 	f2fs_submit_merged_bio(sbi, META, READ);
166 	return blkno - start;
167 }
168 
169 void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
170 {
171 	struct page *page;
172 	bool readahead = false;
173 
174 	page = find_get_page(META_MAPPING(sbi), index);
175 	if (!page || (page && !PageUptodate(page)))
176 		readahead = true;
177 	f2fs_put_page(page, 0);
178 
179 	if (readahead)
180 		ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
181 }
182 
183 static int f2fs_write_meta_page(struct page *page,
184 				struct writeback_control *wbc)
185 {
186 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
187 
188 	trace_f2fs_writepage(page, META);
189 
190 	if (unlikely(sbi->por_doing))
191 		goto redirty_out;
192 	if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
193 		goto redirty_out;
194 	if (unlikely(f2fs_cp_error(sbi)))
195 		goto redirty_out;
196 
197 	f2fs_wait_on_page_writeback(page, META);
198 	write_meta_page(sbi, page);
199 	dec_page_count(sbi, F2FS_DIRTY_META);
200 	unlock_page(page);
201 
202 	if (wbc->for_reclaim)
203 		f2fs_submit_merged_bio(sbi, META, WRITE);
204 	return 0;
205 
206 redirty_out:
207 	redirty_page_for_writepage(wbc, page);
208 	return AOP_WRITEPAGE_ACTIVATE;
209 }
210 
211 static int f2fs_write_meta_pages(struct address_space *mapping,
212 				struct writeback_control *wbc)
213 {
214 	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
215 	long diff, written;
216 
217 	trace_f2fs_writepages(mapping->host, wbc, META);
218 
219 	/* collect a number of dirty meta pages and write together */
220 	if (wbc->for_kupdate ||
221 		get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
222 		goto skip_write;
223 
224 	/* if mounting is failed, skip writing node pages */
225 	mutex_lock(&sbi->cp_mutex);
226 	diff = nr_pages_to_write(sbi, META, wbc);
227 	written = sync_meta_pages(sbi, META, wbc->nr_to_write);
228 	mutex_unlock(&sbi->cp_mutex);
229 	wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
230 	return 0;
231 
232 skip_write:
233 	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
234 	return 0;
235 }
236 
237 long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
238 						long nr_to_write)
239 {
240 	struct address_space *mapping = META_MAPPING(sbi);
241 	pgoff_t index = 0, end = LONG_MAX;
242 	struct pagevec pvec;
243 	long nwritten = 0;
244 	struct writeback_control wbc = {
245 		.for_reclaim = 0,
246 	};
247 
248 	pagevec_init(&pvec, 0);
249 
250 	while (index <= end) {
251 		int i, nr_pages;
252 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
253 				PAGECACHE_TAG_DIRTY,
254 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
255 		if (unlikely(nr_pages == 0))
256 			break;
257 
258 		for (i = 0; i < nr_pages; i++) {
259 			struct page *page = pvec.pages[i];
260 
261 			lock_page(page);
262 
263 			if (unlikely(page->mapping != mapping)) {
264 continue_unlock:
265 				unlock_page(page);
266 				continue;
267 			}
268 			if (!PageDirty(page)) {
269 				/* someone wrote it for us */
270 				goto continue_unlock;
271 			}
272 
273 			if (!clear_page_dirty_for_io(page))
274 				goto continue_unlock;
275 
276 			if (f2fs_write_meta_page(page, &wbc)) {
277 				unlock_page(page);
278 				break;
279 			}
280 			nwritten++;
281 			if (unlikely(nwritten >= nr_to_write))
282 				break;
283 		}
284 		pagevec_release(&pvec);
285 		cond_resched();
286 	}
287 
288 	if (nwritten)
289 		f2fs_submit_merged_bio(sbi, type, WRITE);
290 
291 	return nwritten;
292 }
293 
294 static int f2fs_set_meta_page_dirty(struct page *page)
295 {
296 	trace_f2fs_set_page_dirty(page, META);
297 
298 	SetPageUptodate(page);
299 	if (!PageDirty(page)) {
300 		__set_page_dirty_nobuffers(page);
301 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
302 		return 1;
303 	}
304 	return 0;
305 }
306 
/* Address-space operations wired to the meta-page handlers defined above. */
const struct address_space_operations f2fs_meta_aops = {
	.writepage	= f2fs_write_meta_page,
	.writepages	= f2fs_write_meta_pages,
	.set_page_dirty	= f2fs_set_meta_page_dirty,
};
312 
313 static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
314 {
315 	struct inode_management *im = &sbi->im[type];
316 	struct ino_entry *e;
317 retry:
318 	if (radix_tree_preload(GFP_NOFS)) {
319 		cond_resched();
320 		goto retry;
321 	}
322 
323 	spin_lock(&im->ino_lock);
324 
325 	e = radix_tree_lookup(&im->ino_root, ino);
326 	if (!e) {
327 		e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
328 		if (!e) {
329 			spin_unlock(&im->ino_lock);
330 			radix_tree_preload_end();
331 			goto retry;
332 		}
333 		if (radix_tree_insert(&im->ino_root, ino, e)) {
334 			spin_unlock(&im->ino_lock);
335 			kmem_cache_free(ino_entry_slab, e);
336 			radix_tree_preload_end();
337 			goto retry;
338 		}
339 		memset(e, 0, sizeof(struct ino_entry));
340 		e->ino = ino;
341 
342 		list_add_tail(&e->list, &im->ino_list);
343 		if (type != ORPHAN_INO)
344 			im->ino_num++;
345 	}
346 	spin_unlock(&im->ino_lock);
347 	radix_tree_preload_end();
348 }
349 
350 static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
351 {
352 	struct inode_management *im = &sbi->im[type];
353 	struct ino_entry *e;
354 
355 	spin_lock(&im->ino_lock);
356 	e = radix_tree_lookup(&im->ino_root, ino);
357 	if (e) {
358 		list_del(&e->list);
359 		radix_tree_delete(&im->ino_root, ino);
360 		im->ino_num--;
361 		spin_unlock(&im->ino_lock);
362 		kmem_cache_free(ino_entry_slab, e);
363 		return;
364 	}
365 	spin_unlock(&im->ino_lock);
366 }
367 
/*
 * Track @ino in the inode-management set selected by @type.
 * Thin wrapper around __add_ino_entry().
 */
void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* add new dirty ino entry into list */
	__add_ino_entry(sbi, ino, type);
}
373 
/*
 * Stop tracking @ino in the inode-management set selected by @type.
 * Thin wrapper around __remove_ino_entry().
 */
void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* remove dirty ino entry from list */
	__remove_ino_entry(sbi, ino, type);
}
379 
380 /* mode should be APPEND_INO or UPDATE_INO */
381 bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
382 {
383 	struct inode_management *im = &sbi->im[mode];
384 	struct ino_entry *e;
385 
386 	spin_lock(&im->ino_lock);
387 	e = radix_tree_lookup(&im->ino_root, ino);
388 	spin_unlock(&im->ino_lock);
389 	return e ? true : false;
390 }
391 
392 void release_dirty_inode(struct f2fs_sb_info *sbi)
393 {
394 	struct ino_entry *e, *tmp;
395 	int i;
396 
397 	for (i = APPEND_INO; i <= UPDATE_INO; i++) {
398 		struct inode_management *im = &sbi->im[i];
399 
400 		spin_lock(&im->ino_lock);
401 		list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
402 			list_del(&e->list);
403 			radix_tree_delete(&im->ino_root, e->ino);
404 			kmem_cache_free(ino_entry_slab, e);
405 			im->ino_num--;
406 		}
407 		spin_unlock(&im->ino_lock);
408 	}
409 }
410 
411 int acquire_orphan_inode(struct f2fs_sb_info *sbi)
412 {
413 	struct inode_management *im = &sbi->im[ORPHAN_INO];
414 	int err = 0;
415 
416 	spin_lock(&im->ino_lock);
417 	if (unlikely(im->ino_num >= sbi->max_orphans))
418 		err = -ENOSPC;
419 	else
420 		im->ino_num++;
421 	spin_unlock(&im->ino_lock);
422 
423 	return err;
424 }
425 
/*
 * Give back one orphan slot: decrement the orphan count under ino_lock.
 * A count that is already zero trips f2fs_bug_on().
 * NOTE(review): presumably pairs with acquire_orphan_inode() when the
 * reserved slot ends up unused - confirm against callers.
 */
void release_orphan_inode(struct f2fs_sb_info *sbi)
{
	struct inode_management *im = &sbi->im[ORPHAN_INO];

	spin_lock(&im->ino_lock);
	f2fs_bug_on(sbi, im->ino_num == 0);
	im->ino_num--;
	spin_unlock(&im->ino_lock);
}
435 
/*
 * Track @ino in the ORPHAN_INO set.  __add_ino_entry() does not bump
 * ino_num for this type; the count is raised by acquire_orphan_inode().
 */
void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	/* add new orphan ino entry into list */
	__add_ino_entry(sbi, ino, ORPHAN_INO);
}
441 
/*
 * Stop tracking @ino in the ORPHAN_INO set; when an entry is found,
 * __remove_ino_entry() also decrements the orphan count.
 */
void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	/* remove orphan entry from orphan list */
	__remove_ino_entry(sbi, ino, ORPHAN_INO);
}
447 
448 static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
449 {
450 	struct inode *inode = f2fs_iget(sbi->sb, ino);
451 	f2fs_bug_on(sbi, IS_ERR(inode));
452 	clear_nlink(inode);
453 
454 	/* truncate all the data during iput */
455 	iput(inode);
456 }
457 
458 void recover_orphan_inodes(struct f2fs_sb_info *sbi)
459 {
460 	block_t start_blk, orphan_blkaddr, i, j;
461 
462 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
463 		return;
464 
465 	sbi->por_doing = true;
466 
467 	start_blk = __start_cp_addr(sbi) + 1 +
468 		le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
469 	orphan_blkaddr = __start_sum_addr(sbi) - 1;
470 
471 	ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
472 
473 	for (i = 0; i < orphan_blkaddr; i++) {
474 		struct page *page = get_meta_page(sbi, start_blk + i);
475 		struct f2fs_orphan_block *orphan_blk;
476 
477 		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
478 		for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
479 			nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
480 			recover_orphan_inode(sbi, ino);
481 		}
482 		f2fs_put_page(page, 1);
483 	}
484 	/* clear Orphan Flag */
485 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
486 	sbi->por_doing = false;
487 	return;
488 }
489 
490 static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
491 {
492 	struct list_head *head;
493 	struct f2fs_orphan_block *orphan_blk = NULL;
494 	unsigned int nentries = 0;
495 	unsigned short index;
496 	unsigned short orphan_blocks;
497 	struct page *page = NULL;
498 	struct ino_entry *orphan = NULL;
499 	struct inode_management *im = &sbi->im[ORPHAN_INO];
500 
501 	orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
502 
503 	for (index = 0; index < orphan_blocks; index++)
504 		grab_meta_page(sbi, start_blk + index);
505 
506 	index = 1;
507 	spin_lock(&im->ino_lock);
508 	head = &im->ino_list;
509 
510 	/* loop for each orphan inode entry and write them in Jornal block */
511 	list_for_each_entry(orphan, head, list) {
512 		if (!page) {
513 			page = find_get_page(META_MAPPING(sbi), start_blk++);
514 			f2fs_bug_on(sbi, !page);
515 			orphan_blk =
516 				(struct f2fs_orphan_block *)page_address(page);
517 			memset(orphan_blk, 0, sizeof(*orphan_blk));
518 			f2fs_put_page(page, 0);
519 		}
520 
521 		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
522 
523 		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
524 			/*
525 			 * an orphan block is full of 1020 entries,
526 			 * then we need to flush current orphan blocks
527 			 * and bring another one in memory
528 			 */
529 			orphan_blk->blk_addr = cpu_to_le16(index);
530 			orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
531 			orphan_blk->entry_count = cpu_to_le32(nentries);
532 			set_page_dirty(page);
533 			f2fs_put_page(page, 1);
534 			index++;
535 			nentries = 0;
536 			page = NULL;
537 		}
538 	}
539 
540 	if (page) {
541 		orphan_blk->blk_addr = cpu_to_le16(index);
542 		orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
543 		orphan_blk->entry_count = cpu_to_le32(nentries);
544 		set_page_dirty(page);
545 		f2fs_put_page(page, 1);
546 	}
547 
548 	spin_unlock(&im->ino_lock);
549 }
550 
551 static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
552 				block_t cp_addr, unsigned long long *version)
553 {
554 	struct page *cp_page_1, *cp_page_2 = NULL;
555 	unsigned long blk_size = sbi->blocksize;
556 	struct f2fs_checkpoint *cp_block;
557 	unsigned long long cur_version = 0, pre_version = 0;
558 	size_t crc_offset;
559 	__u32 crc = 0;
560 
561 	/* Read the 1st cp block in this CP pack */
562 	cp_page_1 = get_meta_page(sbi, cp_addr);
563 
564 	/* get the version number */
565 	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
566 	crc_offset = le32_to_cpu(cp_block->checksum_offset);
567 	if (crc_offset >= blk_size)
568 		goto invalid_cp1;
569 
570 	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
571 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
572 		goto invalid_cp1;
573 
574 	pre_version = cur_cp_version(cp_block);
575 
576 	/* Read the 2nd cp block in this CP pack */
577 	cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
578 	cp_page_2 = get_meta_page(sbi, cp_addr);
579 
580 	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
581 	crc_offset = le32_to_cpu(cp_block->checksum_offset);
582 	if (crc_offset >= blk_size)
583 		goto invalid_cp2;
584 
585 	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
586 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
587 		goto invalid_cp2;
588 
589 	cur_version = cur_cp_version(cp_block);
590 
591 	if (cur_version == pre_version) {
592 		*version = cur_version;
593 		f2fs_put_page(cp_page_2, 1);
594 		return cp_page_1;
595 	}
596 invalid_cp2:
597 	f2fs_put_page(cp_page_2, 1);
598 invalid_cp1:
599 	f2fs_put_page(cp_page_1, 1);
600 	return NULL;
601 }
602 
603 int get_valid_checkpoint(struct f2fs_sb_info *sbi)
604 {
605 	struct f2fs_checkpoint *cp_block;
606 	struct f2fs_super_block *fsb = sbi->raw_super;
607 	struct page *cp1, *cp2, *cur_page;
608 	unsigned long blk_size = sbi->blocksize;
609 	unsigned long long cp1_version = 0, cp2_version = 0;
610 	unsigned long long cp_start_blk_no;
611 	unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
612 	block_t cp_blk_no;
613 	int i;
614 
615 	sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL);
616 	if (!sbi->ckpt)
617 		return -ENOMEM;
618 	/*
619 	 * Finding out valid cp block involves read both
620 	 * sets( cp pack1 and cp pack 2)
621 	 */
622 	cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
623 	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
624 
625 	/* The second checkpoint pack should start at the next segment */
626 	cp_start_blk_no += ((unsigned long long)1) <<
627 				le32_to_cpu(fsb->log_blocks_per_seg);
628 	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
629 
630 	if (cp1 && cp2) {
631 		if (ver_after(cp2_version, cp1_version))
632 			cur_page = cp2;
633 		else
634 			cur_page = cp1;
635 	} else if (cp1) {
636 		cur_page = cp1;
637 	} else if (cp2) {
638 		cur_page = cp2;
639 	} else {
640 		goto fail_no_cp;
641 	}
642 
643 	cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
644 	memcpy(sbi->ckpt, cp_block, blk_size);
645 
646 	if (cp_blks <= 1)
647 		goto done;
648 
649 	cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
650 	if (cur_page == cp2)
651 		cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
652 
653 	for (i = 1; i < cp_blks; i++) {
654 		void *sit_bitmap_ptr;
655 		unsigned char *ckpt = (unsigned char *)sbi->ckpt;
656 
657 		cur_page = get_meta_page(sbi, cp_blk_no + i);
658 		sit_bitmap_ptr = page_address(cur_page);
659 		memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
660 		f2fs_put_page(cur_page, 1);
661 	}
662 done:
663 	f2fs_put_page(cp1, 1);
664 	f2fs_put_page(cp2, 1);
665 	return 0;
666 
667 fail_no_cp:
668 	kfree(sbi->ckpt);
669 	return -EINVAL;
670 }
671 
/*
 * Link @new into the dirty directory list on behalf of @inode.
 *
 * Returns -EEXIST without touching @new when the inode is already flagged
 * FI_DIRTY_DIR; the caller then frees @new.  Otherwise the flag is set,
 * @new is remembered in the inode, appended to sbi->dir_inode_list, and 0
 * is returned.  Callers in this file hold sbi->dir_inode_lock.
 */
static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);

	if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
		return -EEXIST;

	set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
	F2FS_I(inode)->dirty_dir = new;
	list_add_tail(&new->list, &sbi->dir_inode_list);
	stat_inc_dirty_dir(sbi);
	return 0;
}
685 
686 void update_dirty_page(struct inode *inode, struct page *page)
687 {
688 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
689 	struct dir_inode_entry *new;
690 	int ret = 0;
691 
692 	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
693 		return;
694 
695 	if (!S_ISDIR(inode->i_mode)) {
696 		inode_inc_dirty_pages(inode);
697 		goto out;
698 	}
699 
700 	new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
701 	new->inode = inode;
702 	INIT_LIST_HEAD(&new->list);
703 
704 	spin_lock(&sbi->dir_inode_lock);
705 	ret = __add_dirty_inode(inode, new);
706 	inode_inc_dirty_pages(inode);
707 	spin_unlock(&sbi->dir_inode_lock);
708 
709 	if (ret)
710 		kmem_cache_free(inode_entry_slab, new);
711 out:
712 	SetPagePrivate(page);
713 }
714 
715 void add_dirty_dir_inode(struct inode *inode)
716 {
717 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
718 	struct dir_inode_entry *new =
719 			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
720 	int ret = 0;
721 
722 	new->inode = inode;
723 	INIT_LIST_HEAD(&new->list);
724 
725 	spin_lock(&sbi->dir_inode_lock);
726 	ret = __add_dirty_inode(inode, new);
727 	spin_unlock(&sbi->dir_inode_lock);
728 
729 	if (ret)
730 		kmem_cache_free(inode_entry_slab, new);
731 }
732 
733 void remove_dirty_dir_inode(struct inode *inode)
734 {
735 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
736 	struct dir_inode_entry *entry;
737 
738 	if (!S_ISDIR(inode->i_mode))
739 		return;
740 
741 	spin_lock(&sbi->dir_inode_lock);
742 	if (get_dirty_pages(inode) ||
743 			!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
744 		spin_unlock(&sbi->dir_inode_lock);
745 		return;
746 	}
747 
748 	entry = F2FS_I(inode)->dirty_dir;
749 	list_del(&entry->list);
750 	F2FS_I(inode)->dirty_dir = NULL;
751 	clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
752 	stat_dec_dirty_dir(sbi);
753 	spin_unlock(&sbi->dir_inode_lock);
754 	kmem_cache_free(inode_entry_slab, entry);
755 
756 	/* Only from the recovery routine */
757 	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
758 		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
759 		iput(inode);
760 	}
761 }
762 
763 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
764 {
765 	struct list_head *head;
766 	struct dir_inode_entry *entry;
767 	struct inode *inode;
768 retry:
769 	if (unlikely(f2fs_cp_error(sbi)))
770 		return;
771 
772 	spin_lock(&sbi->dir_inode_lock);
773 
774 	head = &sbi->dir_inode_list;
775 	if (list_empty(head)) {
776 		spin_unlock(&sbi->dir_inode_lock);
777 		return;
778 	}
779 	entry = list_entry(head->next, struct dir_inode_entry, list);
780 	inode = igrab(entry->inode);
781 	spin_unlock(&sbi->dir_inode_lock);
782 	if (inode) {
783 		filemap_fdatawrite(inode->i_mapping);
784 		iput(inode);
785 	} else {
786 		/*
787 		 * We should submit bio, since it exists several
788 		 * wribacking dentry pages in the freeing inode.
789 		 */
790 		f2fs_submit_merged_bio(sbi, DATA, WRITE);
791 	}
792 	goto retry;
793 }
794 
795 /*
796  * Freeze all the FS-operations for checkpoint.
797  */
798 static int block_operations(struct f2fs_sb_info *sbi)
799 {
800 	struct writeback_control wbc = {
801 		.sync_mode = WB_SYNC_ALL,
802 		.nr_to_write = LONG_MAX,
803 		.for_reclaim = 0,
804 	};
805 	struct blk_plug plug;
806 	int err = 0;
807 
808 	blk_start_plug(&plug);
809 
810 retry_flush_dents:
811 	f2fs_lock_all(sbi);
812 	/* write all the dirty dentry pages */
813 	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
814 		f2fs_unlock_all(sbi);
815 		sync_dirty_dir_inodes(sbi);
816 		if (unlikely(f2fs_cp_error(sbi))) {
817 			err = -EIO;
818 			goto out;
819 		}
820 		goto retry_flush_dents;
821 	}
822 
823 	/*
824 	 * POR: we should ensure that there are no dirty node pages
825 	 * until finishing nat/sit flush.
826 	 */
827 retry_flush_nodes:
828 	down_write(&sbi->node_write);
829 
830 	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
831 		up_write(&sbi->node_write);
832 		sync_node_pages(sbi, 0, &wbc);
833 		if (unlikely(f2fs_cp_error(sbi))) {
834 			f2fs_unlock_all(sbi);
835 			err = -EIO;
836 			goto out;
837 		}
838 		goto retry_flush_nodes;
839 	}
840 out:
841 	blk_finish_plug(&plug);
842 	return err;
843 }
844 
/*
 * Undo a successful block_operations(): release the node_write semaphore
 * and then the lock taken with f2fs_lock_all().
 */
static void unblock_operations(struct f2fs_sb_info *sbi)
{
	up_write(&sbi->node_write);
	f2fs_unlock_all(sbi);
}
850 
851 static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
852 {
853 	DEFINE_WAIT(wait);
854 
855 	for (;;) {
856 		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
857 
858 		if (!get_pages(sbi, F2FS_WRITEBACK))
859 			break;
860 
861 		io_schedule();
862 	}
863 	finish_wait(&sbi->cp_wait, &wait);
864 }
865 
/*
 * Build and write one checkpoint pack.
 *
 * After flushing the dirty meta pages, the in-memory checkpoint block is
 * refreshed (current segments, counters, flags, bitmaps, CRC) and the pack
 * is laid out from __start_cp_addr(): checkpoint block, cp_payload blocks,
 * orphan blocks, data summaries, node summaries (CP_UMOUNT only) and a
 * trailing copy of the checkpoint block.  That trailing block is written
 * last, through sync_meta_pages(META_FLUSH), once earlier writeback has
 * completed.
 *
 * Returns early, without finishing the pack, whenever a checkpoint error
 * is flagged.
 */
static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
	nid_t last_nid = nm_i->next_scan_nid;
	block_t start_blk;
	struct page *cp_page;
	unsigned int data_sum_blocks, orphan_blocks;
	__u32 crc32 = 0;
	void *kaddr;
	int i;
	int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);

	/*
	 * This avoids to conduct wrong roll-forward operations and uses
	 * metapages, so should be called prior to sync_meta_pages below.
	 */
	discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));

	/* Flush all the NAT/SIT pages */
	while (get_pages(sbi, F2FS_DIRTY_META)) {
		sync_meta_pages(sbi, META, LONG_MAX);
		if (unlikely(f2fs_cp_error(sbi)))
			return;
	}

	next_free_nid(sbi, &last_nid);

	/*
	 * modify checkpoint
	 * version number is already updated
	 */
	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
	/* record the current node segments ... */
	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
		ckpt->cur_node_segno[i] =
			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
		ckpt->cur_node_blkoff[i] =
			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
		ckpt->alloc_type[i + CURSEG_HOT_NODE] =
				curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
	}
	/* ... and the current data segments */
	for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
		ckpt->cur_data_segno[i] =
			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
		ckpt->cur_data_blkoff[i] =
			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
		ckpt->alloc_type[i + CURSEG_HOT_DATA] =
				curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
	}

	ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
	ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
	ckpt->next_free_nid = cpu_to_le32(last_nid);

	/* 2 cp  + n data seg summary + orphan inode blocks */
	data_sum_blocks = npages_for_summary_flush(sbi);
	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
		set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	else
		clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);

	/* summaries start right after the cp block, payload and orphans */
	orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
			orphan_blocks);

	/* node summaries are part of the pack only on umount */
	if (cpc->reason == CP_UMOUNT) {
		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
				cp_payload_blks + data_sum_blocks +
				orphan_blocks + NR_CURSEG_NODE_TYPE);
	} else {
		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks);
	}

	if (orphan_num)
		set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
	else
		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);

	if (sbi->need_fsck)
		set_ckpt_flags(ckpt, CP_FSCK_FLAG);

	/* update SIT/NAT bitmap */
	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));

	/* store the CRC of everything before checksum_offset at that offset */
	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
	*((__le32 *)((unsigned char *)ckpt +
				le32_to_cpu(ckpt->checksum_offset)))
				= cpu_to_le32(crc32);

	start_blk = __start_cp_addr(sbi);

	/* write out checkpoint buffer at block 0 */
	cp_page = grab_meta_page(sbi, start_blk++);
	kaddr = page_address(cp_page);
	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
	set_page_dirty(cp_page);
	f2fs_put_page(cp_page, 1);

	/* the payload blocks follow, copied from the same buffer */
	for (i = 1; i < 1 + cp_payload_blks; i++) {
		cp_page = grab_meta_page(sbi, start_blk++);
		kaddr = page_address(cp_page);
		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
				(1 << sbi->log_blocksize));
		set_page_dirty(cp_page);
		f2fs_put_page(cp_page, 1);
	}

	if (orphan_num) {
		write_orphan_inodes(sbi, start_blk);
		start_blk += orphan_blocks;
	}

	write_data_summaries(sbi, start_blk);
	start_blk += data_sum_blocks;
	if (cpc->reason == CP_UMOUNT) {
		write_node_summaries(sbi, start_blk);
		start_blk += NR_CURSEG_NODE_TYPE;
	}

	/* writeout checkpoint block */
	cp_page = grab_meta_page(sbi, start_blk);
	kaddr = page_address(cp_page);
	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
	set_page_dirty(cp_page);
	f2fs_put_page(cp_page, 1);

	/* wait for previous submitted node/meta pages writeback */
	wait_on_all_pages_writeback(sbi);

	if (unlikely(f2fs_cp_error(sbi)))
		return;

	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
	filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);

	/* update user_block_counts */
	sbi->last_valid_block_count = sbi->total_valid_block_count;
	sbi->alloc_valid_block_count = 0;

	/* Here, we only have one bio having CP pack */
	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);

	/* wait for previous submitted meta pages writeback */
	wait_on_all_pages_writeback(sbi);

	release_dirty_inode(sbi);

	if (unlikely(f2fs_cp_error(sbi)))
		return;

	clear_prefree_segments(sbi);
	F2FS_RESET_SB_DIRT(sbi);
}
1028 
/*
 * Write a checkpoint under cp_mutex.
 *
 * The checkpoint is skipped when the superblock is not dirty (unless the
 * reason is CP_DISCARD), when a checkpoint error is already flagged, or
 * when block_operations() fails.  Otherwise pending bios are submitted,
 * the checkpoint version is incremented, cached NAT/SIT entries are
 * flushed, and do_checkpoint() writes the pack.  This function reports
 * no status to its caller.
 */
void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	unsigned long long ckpt_ver;

	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");

	mutex_lock(&sbi->cp_mutex);

	if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
		goto out;
	if (unlikely(f2fs_cp_error(sbi)))
		goto out;
	if (block_operations(sbi))
		goto out;

	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");

	f2fs_submit_merged_bio(sbi, DATA, WRITE);
	f2fs_submit_merged_bio(sbi, NODE, WRITE);
	f2fs_submit_merged_bio(sbi, META, WRITE);

	/*
	 * update checkpoint pack index
	 * Increase the version number so that
	 * SIT entries and seg summaries are written at correct place
	 */
	ckpt_ver = cur_cp_version(ckpt);
	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);

	/* write cached NAT/SIT entries to NAT/SIT area */
	flush_nat_entries(sbi);
	flush_sit_entries(sbi, cpc);

	/* write the pack; the locks stay held until unblock_operations() */
	do_checkpoint(sbi, cpc);

	unblock_operations(sbi);
	stat_inc_cp_count(sbi->stat_info);
out:
	mutex_unlock(&sbi->cp_mutex);
	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
}
1075 
1076 void init_ino_entry_info(struct f2fs_sb_info *sbi)
1077 {
1078 	int i;
1079 
1080 	for (i = 0; i < MAX_INO_ENTRY; i++) {
1081 		struct inode_management *im = &sbi->im[i];
1082 
1083 		INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
1084 		spin_lock_init(&im->ino_lock);
1085 		INIT_LIST_HEAD(&im->ino_list);
1086 		im->ino_num = 0;
1087 	}
1088 
1089 	/*
1090 	 * considering 512 blocks in a segment 8 blocks are needed for cp
1091 	 * and log segment summaries. Remaining blocks are used to keep
1092 	 * orphan entries with the limitation one reserved segment
1093 	 * for cp pack we can have max 1020*504 orphan entries
1094 	 */
1095 	sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
1096 			NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
1097 }
1098 
1099 int __init create_checkpoint_caches(void)
1100 {
1101 	ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
1102 			sizeof(struct ino_entry));
1103 	if (!ino_entry_slab)
1104 		return -ENOMEM;
1105 	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
1106 			sizeof(struct dir_inode_entry));
1107 	if (!inode_entry_slab) {
1108 		kmem_cache_destroy(ino_entry_slab);
1109 		return -ENOMEM;
1110 	}
1111 	return 0;
1112 }
1113 
/* Destroy the slab caches set up by create_checkpoint_caches(). */
void destroy_checkpoint_caches(void)
{
	kmem_cache_destroy(ino_entry_slab);
	kmem_cache_destroy(inode_entry_slab);
}
1119