1 /* 2 * mdt.c - meta data file for NILFS 3 * 4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 * 20 * Written by Ryusuke Konishi <ryusuke@osrg.net> 21 */ 22 23 #include <linux/buffer_head.h> 24 #include <linux/mpage.h> 25 #include <linux/mm.h> 26 #include <linux/writeback.h> 27 #include <linux/backing-dev.h> 28 #include <linux/swap.h> 29 #include "nilfs.h" 30 #include "segment.h" 31 #include "page.h" 32 #include "mdt.h" 33 34 35 #define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) 36 37 #define INIT_UNUSED_INODE_FIELDS 38 39 static int 40 nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, 41 struct buffer_head *bh, 42 void (*init_block)(struct inode *, 43 struct buffer_head *, void *)) 44 { 45 struct nilfs_inode_info *ii = NILFS_I(inode); 46 void *kaddr; 47 int ret; 48 49 /* Caller exclude read accesses using page lock */ 50 51 /* set_buffer_new(bh); */ 52 bh->b_blocknr = 0; 53 54 ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); 55 if (unlikely(ret)) 56 return ret; 57 58 set_buffer_mapped(bh); 59 60 kaddr = kmap_atomic(bh->b_page, KM_USER0); 61 memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); 62 if (init_block) 63 init_block(inode, bh, kaddr); 64 flush_dcache_page(bh->b_page); 65 kunmap_atomic(kaddr, KM_USER0); 66 67 set_buffer_uptodate(bh); 68 nilfs_mark_buffer_dirty(bh); 69 nilfs_mdt_mark_dirty(inode); 70 return 0; 71 } 72 73 static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, 74 struct buffer_head **out_bh, 75 void (*init_block)(struct inode *, 76 struct buffer_head *, 77 void *)) 78 { 79 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; 80 struct super_block *sb = inode->i_sb; 81 struct nilfs_transaction_info ti; 82 struct buffer_head *bh; 83 int err; 84 85 if (!sb) { 86 /* 87 * Make sure this function is not called from any 88 * read-only context. 89 */ 90 if (!nilfs->ns_writer) { 91 WARN_ON(1); 92 err = -EROFS; 93 goto out; 94 } 95 sb = nilfs->ns_writer->s_super; 96 } 97 98 nilfs_transaction_begin(sb, &ti, 0); 99 100 err = -ENOMEM; 101 bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); 102 if (unlikely(!bh)) 103 goto failed_unlock; 104 105 err = -EEXIST; 106 if (buffer_uptodate(bh)) 107 goto failed_bh; 108 109 wait_on_buffer(bh); 110 if (buffer_uptodate(bh)) 111 goto failed_bh; 112 113 bh->b_bdev = nilfs->ns_bdev; 114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 115 if (likely(!err)) { 116 get_bh(bh); 117 *out_bh = bh; 118 } 119 120 failed_bh: 121 unlock_page(bh->b_page); 122 page_cache_release(bh->b_page); 123 brelse(bh); 124 125 failed_unlock: 126 if (likely(!err)) 127 err = nilfs_transaction_commit(sb); 128 else 129 nilfs_transaction_abort(sb); 130 out: 131 return err; 132 } 133 134 static int 135 nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, 136 int mode, struct buffer_head **out_bh) 137 { 138 struct buffer_head *bh; 139 __u64 blknum = 0; 140 int ret = -ENOMEM; 141 142 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); 143 if (unlikely(!bh)) 144 goto failed; 145 146 ret = -EEXIST; /* internal code */ 147 if (buffer_uptodate(bh)) 148 goto out; 149 150 if (mode == READA) { 151 if (!trylock_buffer(bh)) { 152 ret = -EBUSY; 153 goto failed_bh; 154 } 155 } else /* mode == READ */ 156 lock_buffer(bh); 157 158 if (buffer_uptodate(bh)) { 159 unlock_buffer(bh); 160 goto out; 161 } 162 163 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); 164 if (unlikely(ret)) { 165 unlock_buffer(bh); 166 goto failed_bh; 167 } 168 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; 169 bh->b_blocknr = (sector_t)blknum; 170 set_buffer_mapped(bh); 171 172 bh->b_end_io = end_buffer_read_sync; 173 get_bh(bh); 174 submit_bh(mode, bh); 175 ret = 0; 176 out: 177 get_bh(bh); 178 *out_bh = bh; 179 180 failed_bh: 181 unlock_page(bh->b_page); 182 page_cache_release(bh->b_page); 183 brelse(bh); 184 failed: 185 return ret; 186 } 187 188 static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, 189 int readahead, struct buffer_head **out_bh) 190 { 191 struct buffer_head *first_bh, *bh; 192 unsigned long blkoff; 193 int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; 194 int err; 195 196 err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); 197 if (err == -EEXIST) /* internal code */ 198 goto out; 199 200 if (unlikely(err)) 201 goto failed; 202 203 if (readahead) { 204 blkoff = block + 1; 205 for (i = 0; i < nr_ra_blocks; i++, blkoff++) { 206 err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); 207 if (likely(!err || err == -EEXIST)) 208 brelse(bh); 209 else if (err != -EBUSY) 210 break; 211 /* abort readahead if bmap lookup failed */ 212 if (!buffer_locked(first_bh)) 213 goto out_no_wait; 214 } 215 } 216 217 wait_on_buffer(first_bh); 218 219 out_no_wait: 220 err = -EIO; 221 if (!buffer_uptodate(first_bh)) 222 goto failed_bh; 223 out: 224 *out_bh = first_bh; 225 return 0; 226 227 failed_bh: 228 brelse(first_bh); 229 failed: 230 return err; 231 } 232 233 /** 234 * nilfs_mdt_get_block - read or create a buffer on meta data file. 235 * @inode: inode of the meta data file 236 * @blkoff: block offset 237 * @create: create flag 238 * @init_block: initializer used for newly allocated block 239 * @out_bh: output of a pointer to the buffer_head 240 * 241 * nilfs_mdt_get_block() looks up the specified buffer and tries to create 242 * a new buffer if @create is not zero. On success, the returned buffer is 243 * assured to be either existing or formatted using a buffer lock on success. 244 * @out_bh is substituted only when zero is returned. 245 * 246 * Return Value: On success, it returns 0. On error, the following negative 247 * error code is returned. 248 * 249 * %-ENOMEM - Insufficient memory available. 250 * 251 * %-EIO - I/O error 252 * 253 * %-ENOENT - the specified block does not exist (hole block) 254 * 255 * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) 256 * 257 * %-EROFS - Read only filesystem (for create mode) 258 */ 259 int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, 260 void (*init_block)(struct inode *, 261 struct buffer_head *, void *), 262 struct buffer_head **out_bh) 263 { 264 int ret; 265 266 /* Should be rewritten with merging nilfs_mdt_read_block() */ 267 retry: 268 ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh); 269 if (!create || ret != -ENOENT) 270 return ret; 271 272 ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); 273 if (unlikely(ret == -EEXIST)) { 274 /* create = 0; */ /* limit read-create loop retries */ 275 goto retry; 276 } 277 return ret; 278 } 279 280 /** 281 * nilfs_mdt_delete_block - make a hole on the meta data file. 282 * @inode: inode of the meta data file 283 * @block: block offset 284 * 285 * Return Value: On success, zero is returned. 286 * On error, one of the following negative error code is returned. 287 * 288 * %-ENOMEM - Insufficient memory available. 289 * 290 * %-EIO - I/O error 291 * 292 * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) 293 */ 294 int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) 295 { 296 struct nilfs_inode_info *ii = NILFS_I(inode); 297 int err; 298 299 err = nilfs_bmap_delete(ii->i_bmap, block); 300 if (!err || err == -ENOENT) { 301 nilfs_mdt_mark_dirty(inode); 302 nilfs_mdt_forget_block(inode, block); 303 } 304 return err; 305 } 306 307 /** 308 * nilfs_mdt_forget_block - discard dirty state and try to remove the page 309 * @inode: inode of the meta data file 310 * @block: block offset 311 * 312 * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and 313 * tries to release the page including the buffer from a page cache. 314 * 315 * Return Value: On success, 0 is returned. On error, one of the following 316 * negative error code is returned. 317 * 318 * %-EBUSY - page has an active buffer. 319 * 320 * %-ENOENT - page cache has no page addressed by the offset. 321 */ 322 int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) 323 { 324 pgoff_t index = (pgoff_t)block >> 325 (PAGE_CACHE_SHIFT - inode->i_blkbits); 326 struct page *page; 327 unsigned long first_block; 328 int ret = 0; 329 int still_dirty; 330 331 page = find_lock_page(inode->i_mapping, index); 332 if (!page) 333 return -ENOENT; 334 335 wait_on_page_writeback(page); 336 337 first_block = (unsigned long)index << 338 (PAGE_CACHE_SHIFT - inode->i_blkbits); 339 if (page_has_buffers(page)) { 340 struct buffer_head *bh; 341 342 bh = nilfs_page_get_nth_block(page, block - first_block); 343 nilfs_forget_buffer(bh); 344 } 345 still_dirty = PageDirty(page); 346 unlock_page(page); 347 page_cache_release(page); 348 349 if (still_dirty || 350 invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) 351 ret = -EBUSY; 352 return ret; 353 } 354 355 /** 356 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. 357 * @inode: inode of the meta data file 358 * @block: block offset 359 * 360 * Return Value: On success, it returns 0. On error, the following negative 361 * error code is returned. 362 * 363 * %-ENOMEM - Insufficient memory available. 364 * 365 * %-EIO - I/O error 366 * 367 * %-ENOENT - the specified block does not exist (hole block) 368 * 369 * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) 370 */ 371 int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) 372 { 373 struct buffer_head *bh; 374 int err; 375 376 err = nilfs_mdt_read_block(inode, block, 0, &bh); 377 if (unlikely(err)) 378 return err; 379 nilfs_mark_buffer_dirty(bh); 380 nilfs_mdt_mark_dirty(inode); 381 brelse(bh); 382 return 0; 383 } 384 385 int nilfs_mdt_fetch_dirty(struct inode *inode) 386 { 387 struct nilfs_inode_info *ii = NILFS_I(inode); 388 389 if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { 390 set_bit(NILFS_I_DIRTY, &ii->i_state); 391 return 1; 392 } 393 return test_bit(NILFS_I_DIRTY, &ii->i_state); 394 } 395 396 static int 397 nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) 398 { 399 struct inode *inode = container_of(page->mapping, 400 struct inode, i_data); 401 struct super_block *sb = inode->i_sb; 402 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; 403 struct nilfs_sb_info *writer = NULL; 404 int err = 0; 405 406 redirty_page_for_writepage(wbc, page); 407 unlock_page(page); 408 409 if (page->mapping->assoc_mapping) 410 return 0; /* Do not request flush for shadow page cache */ 411 if (!sb) { 412 down_read(&nilfs->ns_writer_sem); 413 writer = nilfs->ns_writer; 414 if (!writer) { 415 up_read(&nilfs->ns_writer_sem); 416 return -EROFS; 417 } 418 sb = writer->s_super; 419 } 420 421 if (wbc->sync_mode == WB_SYNC_ALL) 422 err = nilfs_construct_segment(sb); 423 else if (wbc->for_reclaim) 424 nilfs_flush_segment(sb, inode->i_ino); 425 426 if (writer) 427 up_read(&nilfs->ns_writer_sem); 428 return err; 429 } 430 431 432 static const struct address_space_operations def_mdt_aops = { 433 .writepage = nilfs_mdt_write_page, 434 .sync_page = block_sync_page, 435 }; 436 437 static const struct inode_operations def_mdt_iops; 438 static const struct file_operations def_mdt_fops; 439 440 /* 441 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 442 * ifile, or gcinodes. This allows the B-tree code and segment constructor 443 * to treat them like regular files, and this helps to simplify the 444 * implementation. 445 * On the other hand, some of the pseudo inodes have an irregular point: 446 * They don't have valid inode->i_sb pointer because their lifetimes are 447 * longer than those of the super block structs; they may continue for 448 * several consecutive mounts/umounts. This would need discussions. 449 */ 450 /** 451 * nilfs_mdt_new_common - allocate a pseudo inode for metadata file 452 * @nilfs: nilfs object 453 * @sb: super block instance the metadata file belongs to 454 * @ino: inode number 455 * @gfp_mask: gfp mask for data pages 456 * @objsz: size of the private object attached to inode->i_private 457 */ 458 struct inode * 459 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, 460 ino_t ino, gfp_t gfp_mask, size_t objsz) 461 { 462 struct inode *inode = nilfs_alloc_inode_common(nilfs); 463 464 if (!inode) 465 return NULL; 466 else { 467 struct address_space * const mapping = &inode->i_data; 468 struct nilfs_mdt_info *mi; 469 470 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); 471 if (!mi) { 472 nilfs_destroy_inode(inode); 473 return NULL; 474 } 475 mi->mi_nilfs = nilfs; 476 init_rwsem(&mi->mi_sem); 477 478 inode->i_sb = sb; /* sb may be NULL for some meta data files */ 479 inode->i_blkbits = nilfs->ns_blocksize_bits; 480 inode->i_flags = 0; 481 atomic_set(&inode->i_count, 1); 482 inode->i_nlink = 1; 483 inode->i_ino = ino; 484 inode->i_mode = S_IFREG; 485 inode->i_private = mi; 486 487 #ifdef INIT_UNUSED_INODE_FIELDS 488 atomic_set(&inode->i_writecount, 0); 489 inode->i_size = 0; 490 inode->i_blocks = 0; 491 inode->i_bytes = 0; 492 inode->i_generation = 0; 493 #ifdef CONFIG_QUOTA 494 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 495 #endif 496 inode->i_pipe = NULL; 497 inode->i_bdev = NULL; 498 inode->i_cdev = NULL; 499 inode->i_rdev = 0; 500 #ifdef CONFIG_SECURITY 501 inode->i_security = NULL; 502 #endif 503 inode->dirtied_when = 0; 504 505 INIT_LIST_HEAD(&inode->i_list); 506 INIT_LIST_HEAD(&inode->i_sb_list); 507 inode->i_state = 0; 508 #endif 509 510 spin_lock_init(&inode->i_lock); 511 mutex_init(&inode->i_mutex); 512 init_rwsem(&inode->i_alloc_sem); 513 514 mapping->host = NULL; /* instead of inode */ 515 mapping->flags = 0; 516 mapping_set_gfp_mask(mapping, gfp_mask); 517 mapping->assoc_mapping = NULL; 518 mapping->backing_dev_info = nilfs->ns_bdi; 519 520 inode->i_mapping = mapping; 521 } 522 523 return inode; 524 } 525 526 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 527 ino_t ino, size_t objsz) 528 { 529 struct inode *inode; 530 531 inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz); 532 if (!inode) 533 return NULL; 534 535 inode->i_op = &def_mdt_iops; 536 inode->i_fop = &def_mdt_fops; 537 inode->i_mapping->a_ops = &def_mdt_aops; 538 return inode; 539 } 540 541 void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, 542 unsigned header_size) 543 { 544 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 545 546 mi->mi_entry_size = entry_size; 547 mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; 548 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 549 } 550 551 void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) 552 { 553 shadow->i_mapping->assoc_mapping = orig->i_mapping; 554 NILFS_I(shadow)->i_btnode_cache.assoc_mapping = 555 &NILFS_I(orig)->i_btnode_cache; 556 } 557 558 static void nilfs_mdt_clear(struct inode *inode) 559 { 560 struct nilfs_inode_info *ii = NILFS_I(inode); 561 562 invalidate_mapping_pages(inode->i_mapping, 0, -1); 563 truncate_inode_pages(inode->i_mapping, 0); 564 565 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 566 nilfs_bmap_clear(ii->i_bmap); 567 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 568 } 569 570 void nilfs_mdt_destroy(struct inode *inode) 571 { 572 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 573 574 if (mdi->mi_palloc_cache) 575 nilfs_palloc_destroy_cache(inode); 576 nilfs_mdt_clear(inode); 577 578 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ 579 kfree(mdi); 580 nilfs_destroy_inode(inode); 581 } 582