/*
 * fs/f2fs/node.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/mpage.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/swap.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include <trace/events/f2fs.h>

#define on_build_free_nids(nmi) mutex_is_locked(&(nmi)->build_lock)

static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
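/*
 * available_free_memory() below caps the in-memory footprint of each
 * metadata cache against ram_thresh percent of total RAM: free nids and
 * NAT entries each get a quarter of that budget and dirty dentry pages
 * get half (the ">> 12" converts a byte count to 4 KiB pages).
 *
 * A worked example (values are illustrative, not from the source): with
 * 1 GiB of RAM (262144 pages of 4 KiB) and ram_thresh = 10, the budget
 * is 262144 * 10 / 100 = 26214 pages, so FREE_NIDS and NAT_ENTRIES may
 * each grow to 26214 >> 2 = 6553 pages (~25 MiB), and DIRTY_DENTS to
 * 26214 >> 1 = 13107 pages, before the caches stop growing.
 */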
bool available_free_memory(struct f2fs_sb_info *sbi, int type)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct sysinfo val;
	unsigned long mem_size = 0;
	bool res = false;

	si_meminfo(&val);
	/* give 25%, 25%, 50% of memory to each component respectively */
	if (type == FREE_NIDS) {
		mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12;
		res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
	} else if (type == NAT_ENTRIES) {
		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12;
		res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
	} else if (type == DIRTY_DENTS) {
		if (sbi->sb->s_bdi->dirty_exceeded)
			return false;
		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
		res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1);
	}
	return res;
}

static void clear_node_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
	unsigned long flags;

	if (PageDirty(page)) {
		spin_lock_irqsave(&mapping->tree_lock, flags);
		radix_tree_tag_clear(&mapping->page_tree,
				page_index(page),
				PAGECACHE_TAG_DIRTY);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);

		clear_page_dirty_for_io(page);
		dec_page_count(sbi, F2FS_DIRTY_NODES);
	}
	ClearPageUptodate(page);
}

static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
	pgoff_t index = current_nat_addr(sbi, nid);
	return get_meta_page(sbi, index);
}

static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct page *src_page;
	struct page *dst_page;
	pgoff_t src_off;
	pgoff_t dst_off;
	void *src_addr;
	void *dst_addr;
	struct f2fs_nm_info *nm_i = NM_I(sbi);

	src_off = current_nat_addr(sbi, nid);
	dst_off = next_nat_addr(sbi, src_off);

	/* get current nat block page with lock */
	src_page = get_meta_page(sbi, src_off);

	/* Dirty src_page means that it is already the new target NAT page. */
	if (PageDirty(src_page))
		return src_page;

	dst_page = grab_meta_page(sbi, dst_off);

	src_addr = page_address(src_page);
	dst_addr = page_address(dst_page);
	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
	set_page_dirty(dst_page);
	f2fs_put_page(src_page, 1);

	set_to_next_nat(nm_i, nid);

	return dst_page;
}

static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
{
	return radix_tree_lookup(&nm_i->nat_root, n);
}

static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
		nid_t start, unsigned int nr, struct nat_entry **ep)
{
	return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
}

static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
{
	list_del(&e->list);
	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
	nm_i->nat_cnt--;
	kmem_cache_free(nat_entry_slab, e);
}

int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct nat_entry *e;
	int is_cp = 1;

	read_lock(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, nid);
	if (e && !e->checkpointed)
		is_cp = 0;
	read_unlock(&nm_i->nat_tree_lock);
	return is_cp;
}

bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct nat_entry *e;
	bool fsync_done = false;

	read_lock(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, nid);
	if (e)
		fsync_done = e->fsync_done;
	read_unlock(&nm_i->nat_tree_lock);
	return fsync_done;
}

void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct nat_entry *e;

	write_lock(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, nid);
	if (e)
		e->fsync_done = false;
	write_unlock(&nm_i->nat_tree_lock);
}

static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
{
	struct nat_entry *new;

	new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
	if (!new)
		return NULL;
	if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
		kmem_cache_free(nat_entry_slab, new);
		return NULL;
	}
	memset(new, 0, sizeof(struct nat_entry));
	nat_set_nid(new, nid);
	new->checkpointed = true;
	list_add_tail(&new->list, &nm_i->nat_entries);
	nm_i->nat_cnt++;
	return new;
}

static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
						struct f2fs_nat_entry *ne)
{
	struct nat_entry *e;
retry:
	write_lock(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, nid);
	if (!e) {
		e = grab_nat_entry(nm_i, nid);
		if (!e) {
			write_unlock(&nm_i->nat_tree_lock);
			goto retry;
		}
		node_info_from_raw_nat(&e->ni, ne);
	}
	write_unlock(&nm_i->nat_tree_lock);
}
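/*
 * set_node_addr() below drives the life cycle of a cached NAT entry.
 * Its sanity checks encode the only legal block-address transitions:
 * NULL_ADDR (free) -> NEW_ADDR (allocated but not yet written) -> a real
 * block address (written) -> NULL_ADDR again (freed).  The node version
 * is bumped when a written node is freed, so a reused nid can be told
 * apart from its previous incarnation during recovery.
 */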
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
			block_t new_blkaddr, bool fsync_done)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct nat_entry *e;
retry:
	write_lock(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, ni->nid);
	if (!e) {
		e = grab_nat_entry(nm_i, ni->nid);
		if (!e) {
			write_unlock(&nm_i->nat_tree_lock);
			goto retry;
		}
		e->ni = *ni;
		f2fs_bug_on(ni->blk_addr == NEW_ADDR);
	} else if (new_blkaddr == NEW_ADDR) {
		/*
		 * when a nid is reallocated, the stale nat entry can
		 * remain in the nat cache.
		 * So, reinitialize it with the new information.
		 */
		e->ni = *ni;
		f2fs_bug_on(ni->blk_addr != NULL_ADDR);
	}

	/* sanity check */
	f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
	f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
			new_blkaddr == NULL_ADDR);
	f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
			new_blkaddr == NEW_ADDR);
	f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
			nat_get_blkaddr(e) != NULL_ADDR &&
			new_blkaddr == NEW_ADDR);

	/* increment version no as node is removed */
	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
		unsigned char version = nat_get_version(e);
		nat_set_version(e, inc_node_version(version));
	}

	/* change address */
	nat_set_blkaddr(e, new_blkaddr);
	__set_nat_cache_dirty(nm_i, e);

	/* update fsync_mark if its inode nat entry is still alive */
	e = __lookup_nat_cache(nm_i, ni->ino);
	if (e)
		e->fsync_done = fsync_done;
	write_unlock(&nm_i->nat_tree_lock);
}

int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);

	if (available_free_memory(sbi, NAT_ENTRIES))
		return 0;

	write_lock(&nm_i->nat_tree_lock);
	while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
		struct nat_entry *ne;
		ne = list_first_entry(&nm_i->nat_entries,
					struct nat_entry, list);
		__del_from_nat_cache(nm_i, ne);
		nr_shrink--;
	}
	write_unlock(&nm_i->nat_tree_lock);
	return nr_shrink;
}

/*
 * This function always succeeds.  It looks up the node info for @nid in
 * three places, cheapest first: the in-memory NAT cache, then the NAT
 * journal kept in the current hot-data summary block, and finally the
 * on-disk NAT block.
 */
void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_summary_block *sum = curseg->sum_blk;
	nid_t start_nid = START_NID(nid);
	struct f2fs_nat_block *nat_blk;
	struct page *page = NULL;
	struct f2fs_nat_entry ne;
	struct nat_entry *e;
	int i;

	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
	ni->nid = nid;

	/* Check nat cache */
	read_lock(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, nid);
	if (e) {
		ni->ino = nat_get_ino(e);
		ni->blk_addr = nat_get_blkaddr(e);
		ni->version = nat_get_version(e);
	}
	read_unlock(&nm_i->nat_tree_lock);
	if (e)
		return;

	/* Check current segment summary */
	mutex_lock(&curseg->curseg_mutex);
	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
	if (i >= 0) {
		ne = nat_in_journal(sum, i);
		node_info_from_raw_nat(ni, &ne);
	}
	mutex_unlock(&curseg->curseg_mutex);
	if (i >= 0)
		goto cache;

	/* Fill node_info from nat page */
	page = get_current_nat_page(sbi, start_nid);
	nat_blk = (struct f2fs_nat_block *)page_address(page);
	ne = nat_blk->entries[nid - start_nid];
	node_info_from_raw_nat(ni, &ne);
	f2fs_put_page(page, 1);
cache:
	/* cache nat entry */
	cache_nat_entry(NM_I(sbi), nid, &ne);
}

/*
 * The maximum depth is four.
 * Offset[0] will have the raw inode offset.
 */
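/*
 * A worked example of the mapping below (block counts assume the usual
 * 4 KiB block geometry where ADDRS_PER_BLOCK == NIDS_PER_BLOCK == 1018;
 * ADDRS_PER_INODE depends on the inline xattr space).  For a file block
 * just past the inode's direct pointers, say block == ADDRS_PER_INODE + 5,
 * the function returns level 1 with offset[0] = NODE_DIR1_BLOCK and
 * offset[1] = 5: the block is the sixth pointer of the first direct node.
 * A block in double-indirect territory yields level 3: offset[0] =
 * NODE_DIND_BLOCK, offset[1] picks the indirect node, offset[2] the
 * direct node, and offset[3] the slot inside it, while noffset[] holds
 * the matching logical node-page numbers used for allocation.
 */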
static int get_node_path(struct f2fs_inode_info *fi, long block,
				int offset[4], unsigned int noffset[4])
{
	const long direct_index = ADDRS_PER_INODE(fi);
	const long direct_blks = ADDRS_PER_BLOCK;
	const long dptrs_per_blk = NIDS_PER_BLOCK;
	const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
	const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
	int n = 0;
	int level = 0;

	noffset[0] = 0;

	if (block < direct_index) {
		offset[n] = block;
		goto got;
	}
	block -= direct_index;
	if (block < direct_blks) {
		offset[n++] = NODE_DIR1_BLOCK;
		noffset[n] = 1;
		offset[n] = block;
		level = 1;
		goto got;
	}
	block -= direct_blks;
	if (block < direct_blks) {
		offset[n++] = NODE_DIR2_BLOCK;
		noffset[n] = 2;
		offset[n] = block;
		level = 1;
		goto got;
	}
	block -= direct_blks;
	if (block < indirect_blks) {
		offset[n++] = NODE_IND1_BLOCK;
		noffset[n] = 3;
		offset[n++] = block / direct_blks;
		noffset[n] = 4 + offset[n - 1];
		offset[n] = block % direct_blks;
		level = 2;
		goto got;
	}
	block -= indirect_blks;
	if (block < indirect_blks) {
		offset[n++] = NODE_IND2_BLOCK;
		noffset[n] = 4 + dptrs_per_blk;
		offset[n++] = block / direct_blks;
		noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
		offset[n] = block % direct_blks;
		level = 2;
		goto got;
	}
	block -= indirect_blks;
	if (block < dindirect_blks) {
		offset[n++] = NODE_DIND_BLOCK;
		noffset[n] = 5 + (dptrs_per_blk * 2);
		offset[n++] = block / indirect_blks;
		noffset[n] = 6 + (dptrs_per_blk * 2) +
			      offset[n - 1] * (dptrs_per_blk + 1);
		offset[n++] = (block / direct_blks) % dptrs_per_blk;
		noffset[n] = 7 + (dptrs_per_blk * 2) +
			      offset[n - 2] * (dptrs_per_blk + 1) +
			      offset[n - 1];
		offset[n] = block % direct_blks;
		level = 3;
		goto got;
	} else {
		BUG();
	}
got:
	return level;
}

/*
 * Caller should call f2fs_put_dnode(dn).
 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
 * f2fs_unlock_op(), but only if mode is set to ALLOC_NODE; the lookup
 * modes do not need to care about that lock.
 */
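/*
 * A minimal caller sketch (mirroring how the rest of f2fs uses this
 * API; error handling elided for brevity):
 *
 *	struct dnode_of_data dn;
 *	int err;
 *
 *	set_new_dnode(&dn, inode, NULL, NULL, 0);
 *	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
 *	if (err)
 *		return err;
 *	blkaddr = dn.data_blkaddr;	// block address at @index
 *	f2fs_put_dnode(&dn);
 */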
int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
	struct page *npage[4];
	struct page *parent;
	int offset[4];
	unsigned int noffset[4];
	nid_t nids[4];
	int level, i;
	int err = 0;

	level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);

	nids[0] = dn->inode->i_ino;
	npage[0] = dn->inode_page;

	if (!npage[0]) {
		npage[0] = get_node_page(sbi, nids[0]);
		if (IS_ERR(npage[0]))
			return PTR_ERR(npage[0]);
	}
	parent = npage[0];
	if (level != 0)
		nids[1] = get_nid(parent, offset[0], true);
	dn->inode_page = npage[0];
	dn->inode_page_locked = true;

	/* get indirect or direct nodes */
	for (i = 1; i <= level; i++) {
		bool done = false;

		if (!nids[i] && mode == ALLOC_NODE) {
			/* alloc new node */
			if (!alloc_nid(sbi, &(nids[i]))) {
				err = -ENOSPC;
				goto release_pages;
			}

			dn->nid = nids[i];
			npage[i] = new_node_page(dn, noffset[i], NULL);
			if (IS_ERR(npage[i])) {
				alloc_nid_failed(sbi, nids[i]);
				err = PTR_ERR(npage[i]);
				goto release_pages;
			}

			set_nid(parent, offset[i - 1], nids[i], i == 1);
			alloc_nid_done(sbi, nids[i]);
			done = true;
		} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
			npage[i] = get_node_page_ra(parent, offset[i - 1]);
			if (IS_ERR(npage[i])) {
				err = PTR_ERR(npage[i]);
				goto release_pages;
			}
			done = true;
		}
		if (i == 1) {
			dn->inode_page_locked = false;
			unlock_page(parent);
		} else {
			f2fs_put_page(parent, 1);
		}

		if (!done) {
			npage[i] = get_node_page(sbi, nids[i]);
			if (IS_ERR(npage[i])) {
				err = PTR_ERR(npage[i]);
				f2fs_put_page(npage[0], 0);
				goto release_out;
			}
		}
		if (i < level) {
			parent = npage[i];
			nids[i + 1] = get_nid(parent, offset[i], false);
		}
	}
	dn->nid = nids[level];
	dn->ofs_in_node = offset[level];
	dn->node_page = npage[level];
	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
	return 0;

release_pages:
	f2fs_put_page(parent, 1);
	if (i > 1)
		f2fs_put_page(npage[0], 0);
release_out:
	dn->inode_page = NULL;
	dn->node_page = NULL;
	return err;
}

static void truncate_node(struct dnode_of_data *dn)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
	struct node_info ni;

	get_node_info(sbi, dn->nid, &ni);
	if (dn->inode->i_blocks == 0) {
		f2fs_bug_on(ni.blk_addr != NULL_ADDR);
		goto invalidate;
	}
	f2fs_bug_on(ni.blk_addr == NULL_ADDR);

	/* Deallocate node address */
	invalidate_blocks(sbi, ni.blk_addr);
	dec_valid_node_count(sbi, dn->inode);
	set_node_addr(sbi, &ni, NULL_ADDR, false);

	if (dn->nid == dn->inode->i_ino) {
		remove_orphan_inode(sbi, dn->nid);
		dec_valid_inode_count(sbi);
	} else {
		sync_inode_page(dn);
	}
invalidate:
	clear_node_page_dirty(dn->node_page);
	F2FS_SET_SB_DIRT(sbi);

	f2fs_put_page(dn->node_page, 1);

	invalidate_mapping_pages(NODE_MAPPING(sbi),
			dn->node_page->index, dn->node_page->index);

	dn->node_page = NULL;
	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
}
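/*
 * truncate_dnode() and truncate_nodes() below return the number of
 * logical node-page slots they advanced past, not an exact free count:
 * a direct node slot counts as 1, and a whole indirect subtree counts
 * as NIDS_PER_BLOCK + 1 (NIDS_PER_BLOCK direct children plus the
 * indirect node itself), even when some slots were already empty.
 * truncate_inode_blocks() adds the return value to nofs, its running
 * node-page offset.
 */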
static int truncate_dnode(struct dnode_of_data *dn)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
	struct page *page;

	if (dn->nid == 0)
		return 1;

	/* get direct node */
	page = get_node_page(sbi, dn->nid);
	if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
		return 1;
	else if (IS_ERR(page))
		return PTR_ERR(page);

	/* Make dnode_of_data for parameter */
	dn->node_page = page;
	dn->ofs_in_node = 0;
	truncate_data_blocks(dn);
	truncate_node(dn);
	return 1;
}

static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
						int ofs, int depth)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
	struct dnode_of_data rdn = *dn;
	struct page *page;
	struct f2fs_node *rn;
	nid_t child_nid;
	unsigned int child_nofs;
	int freed = 0;
	int i, ret;

	if (dn->nid == 0)
		return NIDS_PER_BLOCK + 1;

	trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);

	page = get_node_page(sbi, dn->nid);
	if (IS_ERR(page)) {
		trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
		return PTR_ERR(page);
	}

	rn = F2FS_NODE(page);
	if (depth < 3) {
		for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
			child_nid = le32_to_cpu(rn->in.nid[i]);
			if (child_nid == 0)
				continue;
			rdn.nid = child_nid;
			ret = truncate_dnode(&rdn);
			if (ret < 0)
				goto out_err;
			set_nid(page, i, 0, false);
		}
	} else {
		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
		for (i = ofs; i < NIDS_PER_BLOCK; i++) {
			child_nid = le32_to_cpu(rn->in.nid[i]);
			if (child_nid == 0) {
				child_nofs += NIDS_PER_BLOCK + 1;
				continue;
			}
			rdn.nid = child_nid;
			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
			if (ret == (NIDS_PER_BLOCK + 1)) {
				set_nid(page, i, 0, false);
				child_nofs += ret;
			} else if (ret < 0 && ret != -ENOENT) {
				goto out_err;
			}
		}
		freed = child_nofs;
	}

	if (!ofs) {
		/* remove current indirect node */
		dn->node_page = page;
		truncate_node(dn);
		freed++;
	} else {
		f2fs_put_page(page, 1);
	}
	trace_f2fs_truncate_nodes_exit(dn->inode, freed);
	return freed;

out_err:
	f2fs_put_page(page, 1);
	trace_f2fs_truncate_nodes_exit(dn->inode, ret);
	return ret;
}
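/*
 * truncate_partial_nodes() below handles the boundary of a truncation
 * that starts in the middle of an indirect path (depth 2 or 3): it
 * walks down the partially-truncated branch, frees the direct nodes
 * from the boundary slot onward, and drops the bottom indirect node
 * itself only when the cut begins at its first slot.
 */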
static int truncate_partial_nodes(struct dnode_of_data *dn,
			struct f2fs_inode *ri, int *offset, int depth)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
	struct page *pages[2];
	nid_t nid[3];
	nid_t child_nid;
	int err = 0;
	int i;
	int idx = depth - 2;

	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
	if (!nid[0])
		return 0;

	/* get indirect nodes in the path */
	for (i = 0; i < idx + 1; i++) {
		/* reference count will be increased */
		pages[i] = get_node_page(sbi, nid[i]);
		if (IS_ERR(pages[i])) {
			err = PTR_ERR(pages[i]);
			idx = i - 1;
			goto fail;
		}
		nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
	}

	/* free direct nodes linked to a partial indirect node */
	for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
		child_nid = get_nid(pages[idx], i, false);
		if (!child_nid)
			continue;
		dn->nid = child_nid;
		err = truncate_dnode(dn);
		if (err < 0)
			goto fail;
		set_nid(pages[idx], i, 0, false);
	}

	if (offset[idx + 1] == 0) {
		dn->node_page = pages[idx];
		dn->nid = nid[idx];
		truncate_node(dn);
	} else {
		f2fs_put_page(pages[idx], 1);
	}
	offset[idx]++;
	offset[idx + 1] = 0;
	idx--;
fail:
	for (i = idx; i >= 0; i--)
		f2fs_put_page(pages[i], 1);

	trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);

	return err;
}

/*
 * All the block addresses of data and nodes should be nullified.
 */
int truncate_inode_blocks(struct inode *inode, pgoff_t from)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	int err = 0, cont = 1;
	int level, offset[4], noffset[4];
	unsigned int nofs = 0;
	struct f2fs_inode *ri;
	struct dnode_of_data dn;
	struct page *page;

	trace_f2fs_truncate_inode_blocks_enter(inode, from);

	level = get_node_path(F2FS_I(inode), from, offset, noffset);
restart:
	page = get_node_page(sbi, inode->i_ino);
	if (IS_ERR(page)) {
		trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
		return PTR_ERR(page);
	}

	set_new_dnode(&dn, inode, page, NULL, 0);
	unlock_page(page);

	ri = F2FS_INODE(page);
	switch (level) {
	case 0:
	case 1:
		nofs = noffset[1];
		break;
	case 2:
		nofs = noffset[1];
		if (!offset[level - 1])
			goto skip_partial;
		err = truncate_partial_nodes(&dn, ri, offset, level);
		if (err < 0 && err != -ENOENT)
			goto fail;
		nofs += 1 + NIDS_PER_BLOCK;
		break;
	case 3:
		nofs = 5 + 2 * NIDS_PER_BLOCK;
		if (!offset[level - 1])
			goto skip_partial;
		err = truncate_partial_nodes(&dn, ri, offset, level);
		if (err < 0 && err != -ENOENT)
			goto fail;
		break;
	default:
		BUG();
	}

skip_partial:
	while (cont) {
		dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
		switch (offset[0]) {
		case NODE_DIR1_BLOCK:
		case NODE_DIR2_BLOCK:
			err = truncate_dnode(&dn);
			break;

		case NODE_IND1_BLOCK:
		case NODE_IND2_BLOCK:
			err = truncate_nodes(&dn, nofs, offset[1], 2);
			break;

		case NODE_DIND_BLOCK:
			err = truncate_nodes(&dn, nofs, offset[1], 3);
			cont = 0;
			break;

		default:
			BUG();
		}
		if (err < 0 && err != -ENOENT)
			goto fail;
		if (offset[1] == 0 &&
				ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
			lock_page(page);
			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
				f2fs_put_page(page, 1);
				goto restart;
			}
			f2fs_wait_on_page_writeback(page, NODE);
			ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
			set_page_dirty(page);
			unlock_page(page);
		}
		offset[1] = 0;
		offset[0]++;
		nofs += err;
	}
fail:
	f2fs_put_page(page, 0);
	trace_f2fs_truncate_inode_blocks_exit(inode, err);
	return err > 0 ? 0 : err;
}
int truncate_xattr_node(struct inode *inode, struct page *page)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	nid_t nid = F2FS_I(inode)->i_xattr_nid;
	struct dnode_of_data dn;
	struct page *npage;

	if (!nid)
		return 0;

	npage = get_node_page(sbi, nid);
	if (IS_ERR(npage))
		return PTR_ERR(npage);

	F2FS_I(inode)->i_xattr_nid = 0;

	/* need to do checkpoint during fsync */
	F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));

	set_new_dnode(&dn, inode, page, npage, nid);

	if (page)
		dn.inode_page_locked = true;
	truncate_node(&dn);
	return 0;
}

/*
 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
 * f2fs_unlock_op().
 */
void remove_inode_page(struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	struct page *page;
	nid_t ino = inode->i_ino;
	struct dnode_of_data dn;

	page = get_node_page(sbi, ino);
	if (IS_ERR(page))
		return;

	if (truncate_xattr_node(inode, page)) {
		f2fs_put_page(page, 1);
		return;
	}
	/* 0 is possible, after f2fs_new_inode() has failed */
	f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
	set_new_dnode(&dn, inode, page, page, ino);
	truncate_node(&dn);
}

struct page *new_inode_page(struct inode *inode, const struct qstr *name)
{
	struct dnode_of_data dn;

	/* allocate inode page for new inode */
	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);

	/* caller should f2fs_put_page(page, 1); */
	return new_node_page(&dn, 0, NULL);
}
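/*
 * In new_node_page() below, @ofs is the logical node-page number that
 * fill_node_footer() records in the node footer: 0 means the inode page
 * itself, f2fs_has_xattr_block(ofs) identifies the dedicated xattr node,
 * and the remaining values are the noffset[] positions computed by
 * get_node_path().
 */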
struct page *new_node_page(struct dnode_of_data *dn,
				unsigned int ofs, struct page *ipage)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
	struct node_info old_ni, new_ni;
	struct page *page;
	int err;

	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
		return ERR_PTR(-EPERM);

	page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
	if (!page)
		return ERR_PTR(-ENOMEM);

	if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
		err = -ENOSPC;
		goto fail;
	}

	get_node_info(sbi, dn->nid, &old_ni);

	/* Reinitialize old_ni with new node page */
	f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
	new_ni = old_ni;
	new_ni.ino = dn->inode->i_ino;
	set_node_addr(sbi, &new_ni, NEW_ADDR, false);

	f2fs_wait_on_page_writeback(page, NODE);
	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
	set_cold_node(dn->inode, page);
	SetPageUptodate(page);
	set_page_dirty(page);

	if (f2fs_has_xattr_block(ofs))
		F2FS_I(dn->inode)->i_xattr_nid = dn->nid;

	dn->node_page = page;
	if (ipage)
		update_inode(dn->inode, ipage);
	else
		sync_inode_page(dn);
	if (ofs == 0)
		inc_valid_inode_count(sbi);

	return page;

fail:
	clear_node_page_dirty(page);
	f2fs_put_page(page, 1);
	return ERR_PTR(err);
}

/*
 * Callers should put the page according to the return value:
 * 0: f2fs_put_page(page, 0)
 * LOCKED_PAGE: f2fs_put_page(page, 1)
 * error: nothing
 */
static int read_node_page(struct page *page, int rw)
{
	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
	struct node_info ni;

	get_node_info(sbi, page->index, &ni);

	if (unlikely(ni.blk_addr == NULL_ADDR)) {
		f2fs_put_page(page, 1);
		return -ENOENT;
	}

	if (PageUptodate(page))
		return LOCKED_PAGE;

	return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
}

/*
 * Readahead a node page
 */
void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct page *apage;
	int err;

	apage = find_get_page(NODE_MAPPING(sbi), nid);
	if (apage && PageUptodate(apage)) {
		f2fs_put_page(apage, 0);
		return;
	}
	f2fs_put_page(apage, 0);

	apage = grab_cache_page(NODE_MAPPING(sbi), nid);
	if (!apage)
		return;

	err = read_node_page(apage, READA);
	if (err == 0)
		f2fs_put_page(apage, 0);
	else if (err == LOCKED_PAGE)
		f2fs_put_page(apage, 1);
}

struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
{
	struct page *page;
	int err;
repeat:
	page = grab_cache_page(NODE_MAPPING(sbi), nid);
	if (!page)
		return ERR_PTR(-ENOMEM);

	err = read_node_page(page, READ_SYNC);
	if (err < 0)
		return ERR_PTR(err);
	else if (err == LOCKED_PAGE)
		goto got_it;

	lock_page(page);
	if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
		f2fs_put_page(page, 1);
		return ERR_PTR(-EIO);
	}
	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
		f2fs_put_page(page, 1);
		goto repeat;
	}
got_it:
	return page;
}

/*
 * Return a locked page for the desired node page.
 * And, readahead MAX_RA_NODE number of node pages.
 */
struct page *get_node_page_ra(struct page *parent, int start)
{
	struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
	struct blk_plug plug;
	struct page *page;
	int err, i, end;
	nid_t nid;

	/* First, try getting the desired direct node. */
	nid = get_nid(parent, start, false);
	if (!nid)
		return ERR_PTR(-ENOENT);
repeat:
	page = grab_cache_page(NODE_MAPPING(sbi), nid);
	if (!page)
		return ERR_PTR(-ENOMEM);

	err = read_node_page(page, READ_SYNC);
	if (err < 0)
		return ERR_PTR(err);
	else if (err == LOCKED_PAGE)
		goto page_hit;

	blk_start_plug(&plug);

	/* Then, try readahead for siblings of the desired node */
	end = start + MAX_RA_NODE;
	end = min(end, NIDS_PER_BLOCK);
	for (i = start + 1; i < end; i++) {
		nid = get_nid(parent, i, false);
		if (!nid)
			continue;
		ra_node_page(sbi, nid);
	}

	blk_finish_plug(&plug);

	lock_page(page);
	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
		f2fs_put_page(page, 1);
		goto repeat;
	}
page_hit:
	if (unlikely(!PageUptodate(page))) {
		f2fs_put_page(page, 1);
		return ERR_PTR(-EIO);
	}
	return page;
}

void sync_inode_page(struct dnode_of_data *dn)
{
	if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
		update_inode(dn->inode, dn->node_page);
	} else if (dn->inode_page) {
		if (!dn->inode_page_locked)
			lock_page(dn->inode_page);
		update_inode(dn->inode, dn->inode_page);
		if (!dn->inode_page_locked)
			unlock_page(dn->inode_page);
	} else {
		update_inode_page(dn->inode);
	}
}
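/*
 * sync_node_pages() below flushes dirty node pages in up to three passes
 * over the page cache, selected by @step: pass 0 writes non-dnode pages
 * (indirect and double-indirect nodes), pass 1 writes dnodes that are
 * not marked cold (directory dnodes), and pass 2 writes cold dnodes
 * (those of regular files).  When @ino is given (as in fsync), the scan
 * starts directly at pass 2 and only pages of that inode are written.
 */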
int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
					struct writeback_control *wbc)
{
	pgoff_t index, end;
	struct pagevec pvec;
	int step = ino ? 2 : 0;
	int nwritten = 0, wrote = 0;

	pagevec_init(&pvec, 0);

next_step:
	index = 0;
	end = LONG_MAX;

	while (index <= end) {
		int i, nr_pages;
		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
				PAGECACHE_TAG_DIRTY,
				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * flushing sequence with step:
			 * 0. indirect nodes
			 * 1. dentry dnodes
			 * 2. file dnodes
			 */
			if (step == 0 && IS_DNODE(page))
				continue;
			if (step == 1 && (!IS_DNODE(page) ||
						is_cold_node(page)))
				continue;
			if (step == 2 && (!IS_DNODE(page) ||
						!is_cold_node(page)))
				continue;

			/*
			 * In fsync mode, we should not skip writing
			 * node pages of the target inode.
			 */
			if (ino && ino_of_node(page) == ino)
				lock_page(page);
			else if (!trylock_page(page))
				continue;

			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
continue_unlock:
				unlock_page(page);
				continue;
			}
			if (ino && ino_of_node(page) != ino)
				goto continue_unlock;

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			/* called by fsync() */
			if (ino && IS_DNODE(page)) {
				int mark = !is_checkpointed_node(sbi, ino);
				set_fsync_mark(page, 1);
				if (IS_INODE(page))
					set_dentry_mark(page, mark);
				nwritten++;
			} else {
				set_fsync_mark(page, 0);
				set_dentry_mark(page, 0);
			}
			NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
			wrote++;

			if (--wbc->nr_to_write == 0)
				break;
		}
		pagevec_release(&pvec);
		cond_resched();

		if (wbc->nr_to_write == 0) {
			step = 2;
			break;
		}
	}

	if (step < 2) {
		step++;
		goto next_step;
	}

	if (wrote)
		f2fs_submit_merged_bio(sbi, NODE, WRITE);
	return nwritten;
}

int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
{
	pgoff_t index = 0, end = LONG_MAX;
	struct pagevec pvec;
	int ret2 = 0, ret = 0;

	pagevec_init(&pvec, 0);

	while (index <= end) {
		int i, nr_pages;
		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
				PAGECACHE_TAG_WRITEBACK,
				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (unlikely(page->index > end))
				continue;

			if (ino && ino_of_node(page) == ino) {
				f2fs_wait_on_page_writeback(page, NODE);
				if (TestClearPageError(page))
					ret = -EIO;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
		ret2 = -ENOSPC;
	if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
		ret2 = -EIO;
	if (!ret)
		ret = ret2;
	return ret;
}
static int f2fs_write_node_page(struct page *page,
				struct writeback_control *wbc)
{
	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
	nid_t nid;
	block_t new_addr;
	struct node_info ni;
	struct f2fs_io_info fio = {
		.type = NODE,
		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
	};

	trace_f2fs_writepage(page, NODE);

	if (unlikely(sbi->por_doing))
		goto redirty_out;

	f2fs_wait_on_page_writeback(page, NODE);

	/* get old block addr of this node page */
	nid = nid_of_node(page);
	f2fs_bug_on(page->index != nid);

	get_node_info(sbi, nid, &ni);

	/* This page is already truncated */
	if (unlikely(ni.blk_addr == NULL_ADDR)) {
		dec_page_count(sbi, F2FS_DIRTY_NODES);
		unlock_page(page);
		return 0;
	}

	if (wbc->for_reclaim)
		goto redirty_out;

	mutex_lock(&sbi->node_write);
	set_page_writeback(page);
	write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
	set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
	dec_page_count(sbi, F2FS_DIRTY_NODES);
	mutex_unlock(&sbi->node_write);
	unlock_page(page);
	return 0;

redirty_out:
	redirty_page_for_writepage(wbc, page);
	return AOP_WRITEPAGE_ACTIVATE;
}

static int f2fs_write_node_pages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
	long diff;

	trace_f2fs_writepages(mapping->host, wbc, NODE);

	/* balancing f2fs's metadata in background */
	f2fs_balance_fs_bg(sbi);

	/* collect a number of dirty node pages and write together */
	if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
		goto skip_write;

	diff = nr_pages_to_write(sbi, NODE, wbc);
	wbc->sync_mode = WB_SYNC_NONE;
	sync_node_pages(sbi, 0, wbc);
	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
	return 0;

skip_write:
	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
	return 0;
}

static int f2fs_set_node_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);

	trace_f2fs_set_page_dirty(page, NODE);

	SetPageUptodate(page);
	if (!PageDirty(page)) {
		__set_page_dirty_nobuffers(page);
		inc_page_count(sbi, F2FS_DIRTY_NODES);
		SetPagePrivate(page);
		return 1;
	}
	return 0;
}

static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
				      unsigned int length)
{
	struct inode *inode = page->mapping->host;
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	if (PageDirty(page))
		dec_page_count(sbi, F2FS_DIRTY_NODES);
	ClearPagePrivate(page);
}

static int f2fs_release_node_page(struct page *page, gfp_t wait)
{
	ClearPagePrivate(page);
	return 1;
}

/*
 * Structure of the f2fs node operations
 */
const struct address_space_operations f2fs_node_aops = {
	.writepage	= f2fs_write_node_page,
	.writepages	= f2fs_write_node_pages,
	.set_page_dirty	= f2fs_set_node_page_dirty,
	.invalidatepage	= f2fs_invalidate_node_page,
	.releasepage	= f2fs_release_node_page,
};

static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
						nid_t n)
{
	return radix_tree_lookup(&nm_i->free_nid_root, n);
}

static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
						struct free_nid *i)
{
	list_del(&i->list);
	radix_tree_delete(&nm_i->free_nid_root, i->nid);
}
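/*
 * add_free_nid() below returns 1 if @nid was added to the free list, 0
 * if it was skipped (nid 0, already allocated, or already present), and
 * -1 if the free nid cache is over its memory budget.
 */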
static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct free_nid *i;
	struct nat_entry *ne;
	bool allocated = false;

	if (!available_free_memory(sbi, FREE_NIDS))
		return -1;

	/* 0 nid should not be used */
	if (unlikely(nid == 0))
		return 0;

	if (build) {
		/* do not add allocated nids */
		read_lock(&nm_i->nat_tree_lock);
		ne = __lookup_nat_cache(nm_i, nid);
		if (ne &&
			(!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
			allocated = true;
		read_unlock(&nm_i->nat_tree_lock);
		if (allocated)
			return 0;
	}

	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
	i->nid = nid;
	i->state = NID_NEW;

	spin_lock(&nm_i->free_nid_list_lock);
	if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
		spin_unlock(&nm_i->free_nid_list_lock);
		kmem_cache_free(free_nid_slab, i);
		return 0;
	}
	list_add_tail(&i->list, &nm_i->free_nid_list);
	nm_i->fcnt++;
	spin_unlock(&nm_i->free_nid_list_lock);
	return 1;
}

static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
{
	struct free_nid *i;
	bool need_free = false;

	spin_lock(&nm_i->free_nid_list_lock);
	i = __lookup_free_nid_list(nm_i, nid);
	if (i && i->state == NID_NEW) {
		__del_from_free_nid_list(nm_i, i);
		nm_i->fcnt--;
		need_free = true;
	}
	spin_unlock(&nm_i->free_nid_list_lock);

	if (need_free)
		kmem_cache_free(free_nid_slab, i);
}

static void scan_nat_page(struct f2fs_sb_info *sbi,
			struct page *nat_page, nid_t start_nid)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct f2fs_nat_block *nat_blk = page_address(nat_page);
	block_t blk_addr;
	int i;

	i = start_nid % NAT_ENTRY_PER_BLOCK;

	for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {

		if (unlikely(start_nid >= nm_i->max_nid))
			break;

		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
		f2fs_bug_on(blk_addr == NEW_ADDR);
		if (blk_addr == NULL_ADDR) {
			if (add_free_nid(sbi, start_nid, true) < 0)
				break;
		}
	}
}

static void build_free_nids(struct f2fs_sb_info *sbi)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_summary_block *sum = curseg->sum_blk;
	int i = 0;
	nid_t nid = nm_i->next_scan_nid;

	/* Enough entries */
	if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
		return;

	/* readahead nat pages to be scanned */
	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);

	while (1) {
		struct page *page = get_current_nat_page(sbi, nid);

		scan_nat_page(sbi, page, nid);
		f2fs_put_page(page, 1);

		nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
		if (unlikely(nid >= nm_i->max_nid))
			nid = 0;

		if (i++ == FREE_NID_PAGES)
			break;
	}

	/* go to the next free nat pages to find free nids abundantly */
	nm_i->next_scan_nid = nid;

	/* find free nids from current sum_pages */
	mutex_lock(&curseg->curseg_mutex);
	for (i = 0; i < nats_in_cursum(sum); i++) {
		block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
		nid = le32_to_cpu(nid_in_journal(sum, i));
		if (addr == NULL_ADDR)
			add_free_nid(sbi, nid, true);
		else
			remove_free_nid(nm_i, nid);
	}
	mutex_unlock(&curseg->curseg_mutex);
}
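/*
 * A free nid moves through two states: NID_NEW while it sits on the
 * free list waiting to be handed out, and NID_ALLOC once alloc_nid()
 * has reserved it.  alloc_nid_done() then drops the entry for good,
 * while alloc_nid_failed() either returns it to NID_NEW or frees it
 * when the cache is over budget.
 */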
/*
 * If this function returns success, the caller can obtain a new nid
 * from the second parameter of this function.
 * The returned nid can be used as an ino as well as a nid when an inode
 * is created.
 */
bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct free_nid *i = NULL;
retry:
	if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
		return false;

	spin_lock(&nm_i->free_nid_list_lock);

	/* We should not use stale free nids created by build_free_nids */
	if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
		f2fs_bug_on(list_empty(&nm_i->free_nid_list));
		list_for_each_entry(i, &nm_i->free_nid_list, list)
			if (i->state == NID_NEW)
				break;

		f2fs_bug_on(i->state != NID_NEW);
		*nid = i->nid;
		i->state = NID_ALLOC;
		nm_i->fcnt--;
		spin_unlock(&nm_i->free_nid_list_lock);
		return true;
	}
	spin_unlock(&nm_i->free_nid_list_lock);

	/* Let's scan nat pages and its caches to get free nids */
	mutex_lock(&nm_i->build_lock);
	build_free_nids(sbi);
	mutex_unlock(&nm_i->build_lock);
	goto retry;
}

/*
 * alloc_nid() should be called prior to this function.
 */
void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct free_nid *i;

	spin_lock(&nm_i->free_nid_list_lock);
	i = __lookup_free_nid_list(nm_i, nid);
	f2fs_bug_on(!i || i->state != NID_ALLOC);
	__del_from_free_nid_list(nm_i, i);
	spin_unlock(&nm_i->free_nid_list_lock);

	kmem_cache_free(free_nid_slab, i);
}

/*
 * alloc_nid() should be called prior to this function.
 */
void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct free_nid *i;
	bool need_free = false;

	if (!nid)
		return;

	spin_lock(&nm_i->free_nid_list_lock);
	i = __lookup_free_nid_list(nm_i, nid);
	f2fs_bug_on(!i || i->state != NID_ALLOC);
	if (!available_free_memory(sbi, FREE_NIDS)) {
		__del_from_free_nid_list(nm_i, i);
		need_free = true;
	} else {
		i->state = NID_NEW;
		nm_i->fcnt++;
	}
	spin_unlock(&nm_i->free_nid_list_lock);

	if (need_free)
		kmem_cache_free(free_nid_slab, i);
}
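/*
 * A minimal allocation sketch (the same pattern get_dnode_of_data()
 * uses when it allocates a missing node):
 *
 *	nid_t nid;
 *
 *	if (!alloc_nid(sbi, &nid))
 *		return -ENOSPC;
 *	page = new_node_page(dn, noffset, NULL);
 *	if (IS_ERR(page))
 *		alloc_nid_failed(sbi, nid);	// back to the free list
 *	else
 *		alloc_nid_done(sbi, nid);	// nid is now in use
 */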
void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
		struct f2fs_summary *sum, struct node_info *ni,
		block_t new_blkaddr)
{
	rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
	set_node_addr(sbi, ni, new_blkaddr, false);
	clear_node_page_dirty(page);
}

static void recover_inline_xattr(struct inode *inode, struct page *page)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	void *src_addr, *dst_addr;
	size_t inline_size;
	struct page *ipage;
	struct f2fs_inode *ri;

	if (!f2fs_has_inline_xattr(inode))
		return;

	if (!IS_INODE(page))
		return;

	ri = F2FS_INODE(page);
	if (!(ri->i_inline & F2FS_INLINE_XATTR))
		return;

	ipage = get_node_page(sbi, inode->i_ino);
	f2fs_bug_on(IS_ERR(ipage));

	dst_addr = inline_xattr_addr(ipage);
	src_addr = inline_xattr_addr(page);
	inline_size = inline_xattr_size(inode);

	f2fs_wait_on_page_writeback(ipage, NODE);
	memcpy(dst_addr, src_addr, inline_size);

	update_inode(inode, ipage);
	f2fs_put_page(ipage, 1);
}

bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
{
	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
	nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
	nid_t new_xnid = nid_of_node(page);
	struct node_info ni;

	recover_inline_xattr(inode, page);

	if (!f2fs_has_xattr_block(ofs_of_node(page)))
		return false;

	/* 1: invalidate the previous xattr nid */
	if (!prev_xnid)
		goto recover_xnid;

	/* Deallocate node address */
	get_node_info(sbi, prev_xnid, &ni);
	f2fs_bug_on(ni.blk_addr == NULL_ADDR);
	invalidate_blocks(sbi, ni.blk_addr);
	dec_valid_node_count(sbi, inode);
	set_node_addr(sbi, &ni, NULL_ADDR, false);

recover_xnid:
	/* 2: allocate new xattr nid */
	if (unlikely(!inc_valid_node_count(sbi, inode)))
		f2fs_bug_on(1);

	remove_free_nid(NM_I(sbi), new_xnid);
	get_node_info(sbi, new_xnid, &ni);
	ni.ino = inode->i_ino;
	set_node_addr(sbi, &ni, NEW_ADDR, false);
	F2FS_I(inode)->i_xattr_nid = new_xnid;

	/* 3: update xattr blkaddr */
	refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
	set_node_addr(sbi, &ni, blkaddr, false);

	update_inode_page(inode);
	return true;
}

int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
{
	struct f2fs_inode *src, *dst;
	nid_t ino = ino_of_node(page);
	struct node_info old_ni, new_ni;
	struct page *ipage;

	get_node_info(sbi, ino, &old_ni);

	if (unlikely(old_ni.blk_addr != NULL_ADDR))
		return -EINVAL;

	ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
	if (!ipage)
		return -ENOMEM;

	/* Should not use this inode from free nid list */
	remove_free_nid(NM_I(sbi), ino);

	SetPageUptodate(ipage);
	fill_node_footer(ipage, ino, ino, 0, true);

	src = F2FS_INODE(page);
	dst = F2FS_INODE(ipage);

	memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
	dst->i_size = 0;
	dst->i_blocks = cpu_to_le64(1);
	dst->i_links = cpu_to_le32(1);
	dst->i_xattr_nid = 0;

	new_ni = old_ni;
	new_ni.ino = ino;

	if (unlikely(!inc_valid_node_count(sbi, NULL)))
		WARN_ON(1);
	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
	inc_valid_inode_count(sbi);
	f2fs_put_page(ipage, 1);
	return 0;
}
/*
 * ra_sum_pages() merges contiguous pages into one bio and submits it.
 * These pre-read pages are allocated in the mapping tree of bd_inode.
 */
static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
				int start, int nrpages)
{
	struct inode *inode = sbi->sb->s_bdev->bd_inode;
	struct address_space *mapping = inode->i_mapping;
	int i, page_idx = start;
	struct f2fs_io_info fio = {
		.type = META,
		.rw = READ_SYNC | REQ_META | REQ_PRIO
	};

	for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
		/* alloc page in bd_inode for reading node summary info */
		pages[i] = grab_cache_page(mapping, page_idx);
		if (!pages[i])
			break;
		f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
	}

	f2fs_submit_merged_bio(sbi, META, READ);
	return i;
}

int restore_node_summary(struct f2fs_sb_info *sbi,
			unsigned int segno, struct f2fs_summary_block *sum)
{
	struct f2fs_node *rn;
	struct f2fs_summary *sum_entry;
	struct inode *inode = sbi->sb->s_bdev->bd_inode;
	block_t addr;
	int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
	struct page *pages[bio_blocks];
	int i, idx, last_offset, nrpages, err = 0;

	/* scan the node segment */
	last_offset = sbi->blocks_per_seg;
	addr = START_BLOCK(sbi, segno);
	sum_entry = &sum->entries[0];

	for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
		nrpages = min(last_offset - i, bio_blocks);

		/* read ahead node pages */
		nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
		if (!nrpages)
			return -ENOMEM;

		for (idx = 0; idx < nrpages; idx++) {
			if (err)
				goto skip;

			lock_page(pages[idx]);
			if (unlikely(!PageUptodate(pages[idx]))) {
				err = -EIO;
			} else {
				rn = F2FS_NODE(pages[idx]);
				sum_entry->nid = rn->footer.nid;
				sum_entry->version = 0;
				sum_entry->ofs_in_node = 0;
				sum_entry++;
			}
			unlock_page(pages[idx]);
skip:
			page_cache_release(pages[idx]);
		}

		invalidate_mapping_pages(inode->i_mapping, addr,
							addr + nrpages);
	}
	return err;
}

static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_summary_block *sum = curseg->sum_blk;
	int i;

	mutex_lock(&curseg->curseg_mutex);

	if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
		mutex_unlock(&curseg->curseg_mutex);
		return false;
	}

	for (i = 0; i < nats_in_cursum(sum); i++) {
		struct nat_entry *ne;
		struct f2fs_nat_entry raw_ne;
		nid_t nid = le32_to_cpu(nid_in_journal(sum, i));

		raw_ne = nat_in_journal(sum, i);
retry:
		write_lock(&nm_i->nat_tree_lock);
		ne = __lookup_nat_cache(nm_i, nid);
		if (ne) {
			__set_nat_cache_dirty(nm_i, ne);
			write_unlock(&nm_i->nat_tree_lock);
			continue;
		}
		ne = grab_nat_entry(nm_i, nid);
		if (!ne) {
			write_unlock(&nm_i->nat_tree_lock);
			goto retry;
		}
		node_info_from_raw_nat(&ne->ni, &raw_ne);
		__set_nat_cache_dirty(nm_i, ne);
		write_unlock(&nm_i->nat_tree_lock);
	}
	update_nats_in_cursum(sum, -i);
	mutex_unlock(&curseg->curseg_mutex);
	return true;
}
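/*
 * flush_nats_in_journal() above reports whether the NAT journal in the
 * hot-data summary block had to be drained: when the journal is full,
 * every journalled entry is pulled back into the dirty NAT cache so
 * that flush_nat_entries() below writes all dirty entries to their NAT
 * blocks; otherwise dirty entries are placed into the journal for as
 * long as room remains.
 */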
/*
 * This function is called during the checkpointing process.
 */
void flush_nat_entries(struct f2fs_sb_info *sbi)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_summary_block *sum = curseg->sum_blk;
	struct nat_entry *ne, *cur;
	struct page *page = NULL;
	struct f2fs_nat_block *nat_blk = NULL;
	nid_t start_nid = 0, end_nid = 0;
	bool flushed;

	flushed = flush_nats_in_journal(sbi);

	if (!flushed)
		mutex_lock(&curseg->curseg_mutex);

	/* 1) flush dirty nat caches */
	list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
		nid_t nid;
		struct f2fs_nat_entry raw_ne;
		int offset = -1;

		if (nat_get_blkaddr(ne) == NEW_ADDR)
			continue;

		nid = nat_get_nid(ne);

		if (flushed)
			goto to_nat_page;

		/* if there is room for nat entries in curseg->sumpage */
		offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
		if (offset >= 0) {
			raw_ne = nat_in_journal(sum, offset);
			goto flush_now;
		}
to_nat_page:
		if (!page || (start_nid > nid || nid > end_nid)) {
			if (page) {
				f2fs_put_page(page, 1);
				page = NULL;
			}
			start_nid = START_NID(nid);
			end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;

			/*
			 * get nat block with dirty flag, increased reference
			 * count, mapped and locked
			 */
			page = get_next_nat_page(sbi, start_nid);
			nat_blk = page_address(page);
		}

		f2fs_bug_on(!nat_blk);
		raw_ne = nat_blk->entries[nid - start_nid];
flush_now:
		raw_nat_from_node_info(&raw_ne, &ne->ni);

		if (offset < 0) {
			nat_blk->entries[nid - start_nid] = raw_ne;
		} else {
			nat_in_journal(sum, offset) = raw_ne;
			nid_in_journal(sum, offset) = cpu_to_le32(nid);
		}

		if (nat_get_blkaddr(ne) == NULL_ADDR &&
				add_free_nid(sbi, nid, false) <= 0) {
			write_lock(&nm_i->nat_tree_lock);
			__del_from_nat_cache(nm_i, ne);
			write_unlock(&nm_i->nat_tree_lock);
		} else {
			write_lock(&nm_i->nat_tree_lock);
			__clear_nat_cache_dirty(nm_i, ne);
			write_unlock(&nm_i->nat_tree_lock);
		}
	}
	if (!flushed)
		mutex_unlock(&curseg->curseg_mutex);
	f2fs_put_page(page, 1);
}
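/*
 * init_node_manager() below sizes the nid space from the superblock.
 * For example (illustrative numbers, not from the source): each 4 KiB
 * NAT block holds NAT_ENTRY_PER_BLOCK entries, so with 6 NAT segments
 * (3 usable after halving for the paired copies) and 512 blocks per
 * segment, max_nid is 3 * 512 * NAT_ENTRY_PER_BLOCK.  Three nids are
 * then reserved (0 and the node/meta inodes), giving available_nids.
 */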
static int init_node_manager(struct f2fs_sb_info *sbi)
{
	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	unsigned char *version_bitmap;
	unsigned int nat_segs, nat_blocks;

	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);

	/* segment_count_nat includes pair segment so divide to 2 */
	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
	nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);

	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;

	/* not used nids: 0, node, meta, (and root counted as valid node) */
	nm_i->available_nids = nm_i->max_nid - 3;
	nm_i->fcnt = 0;
	nm_i->nat_cnt = 0;
	nm_i->ram_thresh = DEF_RAM_THRESHOLD;

	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
	INIT_LIST_HEAD(&nm_i->free_nid_list);
	INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
	INIT_LIST_HEAD(&nm_i->nat_entries);
	INIT_LIST_HEAD(&nm_i->dirty_nat_entries);

	mutex_init(&nm_i->build_lock);
	spin_lock_init(&nm_i->free_nid_list_lock);
	rwlock_init(&nm_i->nat_tree_lock);

	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
	version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
	if (!version_bitmap)
		return -EFAULT;

	nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
					GFP_KERNEL);
	if (!nm_i->nat_bitmap)
		return -ENOMEM;
	return 0;
}

int build_node_manager(struct f2fs_sb_info *sbi)
{
	int err;

	sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
	if (!sbi->nm_info)
		return -ENOMEM;

	err = init_node_manager(sbi);
	if (err)
		return err;

	build_free_nids(sbi);
	return 0;
}

void destroy_node_manager(struct f2fs_sb_info *sbi)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct free_nid *i, *next_i;
	struct nat_entry *natvec[NATVEC_SIZE];
	nid_t nid = 0;
	unsigned int found;

	if (!nm_i)
		return;

	/* destroy free nid list */
	spin_lock(&nm_i->free_nid_list_lock);
	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
		f2fs_bug_on(i->state == NID_ALLOC);
		__del_from_free_nid_list(nm_i, i);
		nm_i->fcnt--;
		spin_unlock(&nm_i->free_nid_list_lock);
		kmem_cache_free(free_nid_slab, i);
		spin_lock(&nm_i->free_nid_list_lock);
	}
	f2fs_bug_on(nm_i->fcnt);
	spin_unlock(&nm_i->free_nid_list_lock);

	/* destroy nat cache */
	write_lock(&nm_i->nat_tree_lock);
	while ((found = __gang_lookup_nat_cache(nm_i,
			nid, NATVEC_SIZE, natvec))) {
		unsigned idx;
		nid = nat_get_nid(natvec[found - 1]) + 1;
		for (idx = 0; idx < found; idx++)
			__del_from_nat_cache(nm_i, natvec[idx]);
	}
	f2fs_bug_on(nm_i->nat_cnt);
	write_unlock(&nm_i->nat_tree_lock);

	kfree(nm_i->nat_bitmap);
	sbi->nm_info = NULL;
	kfree(nm_i);
}

int __init create_node_manager_caches(void)
{
	nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
			sizeof(struct nat_entry));
	if (!nat_entry_slab)
		return -ENOMEM;

	free_nid_slab = f2fs_kmem_cache_create("free_nid",
			sizeof(struct free_nid));
	if (!free_nid_slab) {
		kmem_cache_destroy(nat_entry_slab);
		return -ENOMEM;
	}
	return 0;
}

void destroy_node_manager_caches(void)
{
	kmem_cache_destroy(free_nid_slab);
	kmem_cache_destroy(nat_entry_slab);
}