/*
 * mdt.c - meta data file for NILFS
 *
 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Written by Ryusuke Konishi <ryusuke@osrg.net>
 */

#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/mm.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
#include "page.h"
#include "mdt.h"

#include <trace/events/nilfs2.h>

#define NILFS_MDT_MAX_RA_BLOCKS		(16 - 1)


static int
nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
			   struct buffer_head *bh,
			   void (*init_block)(struct inode *,
					      struct buffer_head *, void *))
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	void *kaddr;
	int ret;

	/* Callers exclude read accesses using the page lock */

	/* set_buffer_new(bh); */
	bh->b_blocknr = 0;

	ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
	if (unlikely(ret))
		return ret;

	set_buffer_mapped(bh);

	kaddr = kmap_atomic(bh->b_page);
	memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
	if (init_block)
		init_block(inode, bh, kaddr);
	flush_dcache_page(bh->b_page);
	kunmap_atomic(kaddr);

	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	nilfs_mdt_mark_dirty(inode);

	trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);

	return 0;
}

static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
				  struct buffer_head **out_bh,
				  void (*init_block)(struct inode *,
						     struct buffer_head *,
						     void *))
{
	struct super_block *sb = inode->i_sb;
	struct nilfs_transaction_info ti;
	struct buffer_head *bh;
	int err;

	nilfs_transaction_begin(sb, &ti, 0);

	err = -ENOMEM;
	bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
	if (unlikely(!bh))
		goto failed_unlock;

	err = -EEXIST;
	if (buffer_uptodate(bh))
		goto failed_bh;

	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		goto failed_bh;

	bh->b_bdev = sb->s_bdev;
	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
	if (likely(!err)) {
		get_bh(bh);
		*out_bh = bh;
	}

 failed_bh:
	unlock_page(bh->b_page);
	page_cache_release(bh->b_page);
	brelse(bh);

 failed_unlock:
	if (likely(!err))
		err = nilfs_transaction_commit(sb);
	else
		nilfs_transaction_abort(sb);

	return err;
}
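/*
 * Illustrative sketch (not part of this file): an @init_block callback
 * as passed to nilfs_mdt_create_block() typically formats the freshly
 * zeroed block while the page is still kmapped; @kaddr is the mapped
 * page base, so the block data sits at kaddr + bh_offset(bh).  The
 * callback name and header layout below are hypothetical.
 *
 *	static void my_mdt_init_block(struct inode *inode,
 *				      struct buffer_head *bh, void *kaddr)
 *	{
 *		struct my_block_header *hdr = kaddr + bh_offset(bh);
 *
 *		hdr->h_magic = cpu_to_le32(MY_BLOCK_MAGIC);
 *	}
 *
 * The block arrives zeroed and mapped; only field setup is needed, and
 * the caller takes care of marking the buffer and the inode dirty.
 */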
static int
nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
		       int mode, struct buffer_head **out_bh)
{
	struct buffer_head *bh;
	__u64 blknum = 0;
	int ret = -ENOMEM;

	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
	if (unlikely(!bh))
		goto failed;

	ret = -EEXIST; /* internal code */
	if (buffer_uptodate(bh))
		goto out;

	if (mode == READA) {
		if (!trylock_buffer(bh)) {
			ret = -EBUSY;
			goto failed_bh;
		}
	} else /* mode == READ */
		lock_buffer(bh);

	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		goto out;
	}

	ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
	if (unlikely(ret)) {
		unlock_buffer(bh);
		goto failed_bh;
	}
	map_bh(bh, inode->i_sb, (sector_t)blknum);

	bh->b_end_io = end_buffer_read_sync;
	get_bh(bh);
	submit_bh(mode, bh);
	ret = 0;

	trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
 out:
	get_bh(bh);
	*out_bh = bh;

 failed_bh:
	unlock_page(bh->b_page);
	page_cache_release(bh->b_page);
	brelse(bh);
 failed:
	return ret;
}

static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
				int readahead, struct buffer_head **out_bh)
{
	struct buffer_head *first_bh, *bh;
	unsigned long blkoff;
	int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
	int err;

	err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
	if (err == -EEXIST) /* internal code */
		goto out;

	if (unlikely(err))
		goto failed;

	if (readahead) {
		blkoff = block + 1;
		for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
			err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
			if (likely(!err || err == -EEXIST))
				brelse(bh);
			else if (err != -EBUSY)
				break;
				/* abort readahead if bmap lookup failed */
			if (!buffer_locked(first_bh))
				goto out_no_wait;
		}
	}

	wait_on_buffer(first_bh);

 out_no_wait:
	err = -EIO;
	if (!buffer_uptodate(first_bh))
		goto failed_bh;
 out:
	*out_bh = first_bh;
	return 0;

 failed_bh:
	brelse(first_bh);
 failed:
	return err;
}

/**
 * nilfs_mdt_get_block - read or create a buffer on meta data file.
 * @inode: inode of the meta data file
 * @blkoff: block offset
 * @create: create flag
 * @init_block: initializer used for newly allocated block
 * @out_bh: output of a pointer to the buffer_head
 *
 * nilfs_mdt_get_block() looks up the specified buffer and tries to create
 * a new buffer if @create is nonzero.  On success, the returned buffer is
 * assured to be either existing or formatted; exclusion is ensured with
 * the buffer lock.  @out_bh is substituted only when zero is returned.
 *
 * Return Value: On success, it returns 0. On error, one of the following
 * negative error codes is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 *
 * %-EIO - I/O error
 *
 * %-ENOENT - the specified block does not exist (hole block)
 *
 * %-EROFS - Read only filesystem (for create mode)
 */
int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
			void (*init_block)(struct inode *,
					   struct buffer_head *, void *),
			struct buffer_head **out_bh)
{
	int ret;

	/* Should be rewritten by merging with nilfs_mdt_read_block() */
 retry:
	ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
	if (!create || ret != -ENOENT)
		return ret;

	ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
	if (unlikely(ret == -EEXIST)) {
		/* create = 0; */	/* limit read-create loop retries */
		goto retry;
	}
	return ret;
}
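/*
 * Illustrative sketch (not part of this file): a typical caller inside a
 * metadata file implementation.  The block offset variable and the
 * initializer are hypothetical; the calling convention is the one
 * documented above.
 *
 *	struct buffer_head *bh;
 *	int err;
 *
 *	err = nilfs_mdt_get_block(inode, blkoff, 1, my_mdt_init_block, &bh);
 *	if (err)
 *		return err;	// -EROFS, -ENOMEM, -EIO, ...
 *	// ... read or modify the block through bh->b_data ...
 *	mark_buffer_dirty(bh);
 *	nilfs_mdt_mark_dirty(inode);
 *	brelse(bh);		// drop the reference taken on our behalf
 *
 * With @create == 0, a hole yields -ENOENT instead of allocating a new
 * block.
 */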
/**
 * nilfs_mdt_find_block - find and get a buffer on meta data file.
 * @inode: inode of the meta data file
 * @start: start block offset (inclusive)
 * @end: end block offset (inclusive)
 * @blkoff: place to store the block offset of the block found
 * @out_bh: place to store a pointer to buffer_head struct
 *
 * nilfs_mdt_find_block() looks up an existing block in the range of
 * [@start, @end] and stores a pointer to the buffer head of the block
 * in @out_bh and its block offset in @blkoff.  @out_bh and @blkoff are
 * substituted only when zero is returned.
 *
 * Return Value: On success, it returns 0. On error, one of the following
 * negative error codes is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 *
 * %-EIO - I/O error
 *
 * %-ENOENT - no block was found in the range
 */
int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
			 unsigned long end, unsigned long *blkoff,
			 struct buffer_head **out_bh)
{
	__u64 next;
	int ret;

	if (unlikely(start > end))
		return -ENOENT;

	ret = nilfs_mdt_read_block(inode, start, true, out_bh);
	if (!ret) {
		*blkoff = start;
		goto out;
	}
	if (unlikely(ret != -ENOENT || start == ULONG_MAX))
		goto out;

	ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next);
	if (!ret) {
		if (next <= end) {
			ret = nilfs_mdt_read_block(inode, next, true, out_bh);
			if (!ret)
				*blkoff = next;
		} else {
			ret = -ENOENT;
		}
	}
 out:
	return ret;
}

/**
 * nilfs_mdt_delete_block - make a hole on the meta data file.
 * @inode: inode of the meta data file
 * @block: block offset
 *
 * Return Value: On success, zero is returned.
 * On error, one of the following negative error codes is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 *
 * %-EIO - I/O error
 */
int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	int err;

	err = nilfs_bmap_delete(ii->i_bmap, block);
	if (!err || err == -ENOENT) {
		nilfs_mdt_mark_dirty(inode);
		nilfs_mdt_forget_block(inode, block);
	}
	return err;
}
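/*
 * Illustrative sketch (not part of this file): scanning all allocated
 * blocks of a sparse metadata file with nilfs_mdt_find_block().  The
 * handler below is hypothetical.
 *
 *	unsigned long blkoff = 0, found;
 *	struct buffer_head *bh;
 *
 *	while (!nilfs_mdt_find_block(inode, blkoff, ULONG_MAX,
 *				     &found, &bh)) {
 *		process_block(bh);	// hypothetical handler
 *		brelse(bh);
 *		if (found == ULONG_MAX)
 *			break;		// avoid wrapping the offset
 *		blkoff = found + 1;
 *	}
 *
 * Holes are skipped via nilfs_bmap_seek_key(), so unallocated offsets
 * are never touched.
 */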
/**
 * nilfs_mdt_forget_block - discard dirty state and try to remove the page
 * @inode: inode of the meta data file
 * @block: block offset
 *
 * nilfs_mdt_forget_block() clears the dirty flag of the specified buffer,
 * and tries to release the page including the buffer from the page cache.
 *
 * Return Value: On success, 0 is returned. On error, one of the following
 * negative error codes is returned.
 *
 * %-EBUSY - page has an active buffer.
 *
 * %-ENOENT - page cache has no page addressed by the offset.
 */
int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
{
	pgoff_t index = (pgoff_t)block >>
		(PAGE_CACHE_SHIFT - inode->i_blkbits);
	struct page *page;
	unsigned long first_block;
	int ret = 0;
	int still_dirty;

	page = find_lock_page(inode->i_mapping, index);
	if (!page)
		return -ENOENT;

	wait_on_page_writeback(page);

	first_block = (unsigned long)index <<
		(PAGE_CACHE_SHIFT - inode->i_blkbits);
	if (page_has_buffers(page)) {
		struct buffer_head *bh;

		bh = nilfs_page_get_nth_block(page, block - first_block);
		nilfs_forget_buffer(bh);
	}
	still_dirty = PageDirty(page);
	unlock_page(page);
	page_cache_release(page);

	if (still_dirty ||
	    invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
		ret = -EBUSY;
	return ret;
}

/**
 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
 * @inode: inode of the meta data file
 * @block: block offset
 *
 * Return Value: On success, it returns 0. On error, one of the following
 * negative error codes is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 *
 * %-EIO - I/O error
 *
 * %-ENOENT - the specified block does not exist (hole block)
 */
int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_mdt_read_block(inode, block, 0, &bh);
	if (unlikely(err))
		return err;
	mark_buffer_dirty(bh);
	nilfs_mdt_mark_dirty(inode);
	brelse(bh);
	return 0;
}

/*
 * Propagate bmap dirtiness to the inode state and report whether the
 * metadata file has dirty state to be written out.
 */
int nilfs_mdt_fetch_dirty(struct inode *inode)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);

	if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
		set_bit(NILFS_I_DIRTY, &ii->i_state);
		return 1;
	}
	return test_bit(NILFS_I_DIRTY, &ii->i_state);
}
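/*
 * Illustrative sketch (not part of this file): redirtying an
 * already-allocated block, e.g. after updating an entry in place.  The
 * offset variable is hypothetical.
 *
 *	int err = nilfs_mdt_mark_block_dirty(inode, entry_blkoff);
 *
 *	if (err == -ENOENT)
 *		;		// hole: nothing allocated at this offset
 *	else if (err)
 *		return err;	// -ENOMEM or -EIO from the read path
 *
 * Unlike nilfs_mdt_get_block() with @create set, this never allocates;
 * a hole is reported rather than filled.
 */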
static int
nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct super_block *sb;
	int err = 0;

	if (inode && (inode->i_sb->s_flags & MS_RDONLY)) {
		/*
		 * The filesystem was remounted read-only because of an
		 * error or metadata corruption, but dirty pages are
		 * still being flushed in the background.  Simply
		 * discard this dirty page.
		 */
		nilfs_clear_dirty_page(page, false);
		unlock_page(page);
		return -EROFS;
	}

	redirty_page_for_writepage(wbc, page);
	unlock_page(page);

	if (!inode)
		return 0;

	sb = inode->i_sb;

	if (wbc->sync_mode == WB_SYNC_ALL)
		err = nilfs_construct_segment(sb);
	else if (wbc->for_reclaim)
		nilfs_flush_segment(sb, inode->i_ino);

	return err;
}


static const struct address_space_operations def_mdt_aops = {
	.writepage		= nilfs_mdt_write_page,
};

static const struct inode_operations def_mdt_iops;
static const struct file_operations def_mdt_fops;


int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
{
	struct nilfs_mdt_info *mi;

	mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
	if (!mi)
		return -ENOMEM;

	init_rwsem(&mi->mi_sem);
	inode->i_private = mi;

	inode->i_mode = S_IFREG;
	mapping_set_gfp_mask(inode->i_mapping, gfp_mask);

	inode->i_op = &def_mdt_iops;
	inode->i_fop = &def_mdt_fops;
	inode->i_mapping->a_ops = &def_mdt_aops;

	return 0;
}

void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
			      unsigned header_size)
{
	struct nilfs_mdt_info *mi = NILFS_MDT(inode);

	mi->mi_entry_size = entry_size;
	mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
}
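/*
 * Worked example (not from this file; the numbers are illustrative):
 * with a 4096-byte block size, a 128-byte entry size and a 40-byte
 * header, the calculation above yields
 *
 *	mi_entries_per_block  = 4096 / 128            = 32
 *	mi_first_entry_offset = DIV_ROUND_UP(40, 128) = 1
 *
 * i.e. entries are addressed in units of entry_size, and the header
 * consumes the first entry slot of the first block.
 */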
/**
 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
 * @inode: inode of the metadata file
 * @shadow: shadow mapping
 */
int nilfs_mdt_setup_shadow_map(struct inode *inode,
			       struct nilfs_shadow_map *shadow)
{
	struct nilfs_mdt_info *mi = NILFS_MDT(inode);

	INIT_LIST_HEAD(&shadow->frozen_buffers);
	address_space_init_once(&shadow->frozen_data);
	nilfs_mapping_init(&shadow->frozen_data, inode);
	address_space_init_once(&shadow->frozen_btnodes);
	nilfs_mapping_init(&shadow->frozen_btnodes, inode);
	mi->mi_shadow = shadow;
	return 0;
}

/**
 * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
 * @inode: inode of the metadata file
 */
int nilfs_mdt_save_to_shadow_map(struct inode *inode)
{
	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
	struct nilfs_inode_info *ii = NILFS_I(inode);
	struct nilfs_shadow_map *shadow = mi->mi_shadow;
	int ret;

	ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
	if (ret)
		goto out;

	ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
				     &ii->i_btnode_cache);
	if (ret)
		goto out;

	nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
 out:
	return ret;
}

int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
{
	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
	struct buffer_head *bh_frozen;
	struct page *page;
	int blkbits = inode->i_blkbits;

	page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
	if (!page)
		return -ENOMEM;

	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << blkbits, 0);

	bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);

	if (!buffer_uptodate(bh_frozen))
		nilfs_copy_buffer(bh_frozen, bh);
	if (list_empty(&bh_frozen->b_assoc_buffers)) {
		list_add_tail(&bh_frozen->b_assoc_buffers,
			      &shadow->frozen_buffers);
		set_buffer_nilfs_redirected(bh);
	} else {
		brelse(bh_frozen); /* already frozen */
	}

	unlock_page(page);
	page_cache_release(page);
	return 0;
}

struct buffer_head *
nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
{
	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
	struct buffer_head *bh_frozen = NULL;
	struct page *page;
	int n;

	page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
	if (page) {
		if (page_has_buffers(page)) {
			n = bh_offset(bh) >> inode->i_blkbits;
			bh_frozen = nilfs_page_get_nth_block(page, n);
		}
		unlock_page(page);
		page_cache_release(page);
	}
	return bh_frozen;
}

static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
{
	struct list_head *head = &shadow->frozen_buffers;
	struct buffer_head *bh;

	while (!list_empty(head)) {
		bh = list_first_entry(head, struct buffer_head,
				      b_assoc_buffers);
		list_del_init(&bh->b_assoc_buffers);
		brelse(bh); /* drop ref-count to make it releasable */
	}
}

/**
 * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
 * @inode: inode of the metadata file
 */
void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
{
	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
	struct nilfs_inode_info *ii = NILFS_I(inode);
	struct nilfs_shadow_map *shadow = mi->mi_shadow;

	down_write(&mi->mi_sem);

	if (mi->mi_palloc_cache)
		nilfs_palloc_clear_cache(inode);

	nilfs_clear_dirty_pages(inode->i_mapping, true);
	nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);

	nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
	nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);

	nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);

	up_write(&mi->mi_sem);
}

/**
 * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
 * @inode: inode of the metadata file
 */
void nilfs_mdt_clear_shadow_map(struct inode *inode)
{
	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
	struct nilfs_shadow_map *shadow = mi->mi_shadow;

	down_write(&mi->mi_sem);
	nilfs_release_frozen_buffers(shadow);
	truncate_inode_pages(&shadow->frozen_data, 0);
	truncate_inode_pages(&shadow->frozen_btnodes, 0);
	up_write(&mi->mi_sem);
}
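/*
 * Illustrative sketch (not part of this file): the shadow map lifecycle
 * as suggested by the helpers above.  A caller that speculatively
 * rewrites a metadata file would roughly do:
 *
 *	nilfs_mdt_save_to_shadow_map(inode);	// snapshot dirty state
 *	// ... update the metadata file ...
 *	if (failed)				// hypothetical flag
 *		nilfs_mdt_restore_from_shadow_map(inode); // roll back
 *	nilfs_mdt_clear_shadow_map(inode);	// drop the snapshot
 *
 * nilfs_mdt_setup_shadow_map() must have bound a nilfs_shadow_map to
 * the inode beforehand.
 */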