1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public 17 * License along with this program; if not, write to the 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 19 * Boston, MA 021110-1307, USA. 20 */ 21 22 #include <linux/fs.h> 23 #include <linux/slab.h> 24 #include <linux/highmem.h> 25 #include <linux/pagemap.h> 26 #include <asm/byteorder.h> 27 #include <linux/swap.h> 28 #include <linux/pipe_fs_i.h> 29 30 #define MLOG_MASK_PREFIX ML_FILE_IO 31 #include <cluster/masklog.h> 32 33 #include "ocfs2.h" 34 35 #include "alloc.h" 36 #include "aops.h" 37 #include "dlmglue.h" 38 #include "extent_map.h" 39 #include "file.h" 40 #include "inode.h" 41 #include "journal.h" 42 #include "suballoc.h" 43 #include "super.h" 44 #include "symlink.h" 45 46 #include "buffer_head_io.h" 47 48 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, 49 struct buffer_head *bh_result, int create) 50 { 51 int err = -EIO; 52 int status; 53 struct ocfs2_dinode *fe = NULL; 54 struct buffer_head *bh = NULL; 55 struct buffer_head *buffer_cache_bh = NULL; 56 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 57 void *kaddr; 58 59 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 60 (unsigned long long)iblock, bh_result, create); 61 62 BUG_ON(ocfs2_inode_is_fast_symlink(inode)); 63 64 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { 65 mlog(ML_ERROR, "block offset > PATH_MAX: %llu", 66 (unsigned long long)iblock); 67 goto bail; 68 } 69 70 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 71 OCFS2_I(inode)->ip_blkno, 72 &bh, OCFS2_BH_CACHED, inode); 73 if (status < 0) { 74 mlog_errno(status); 75 goto bail; 76 } 77 fe = (struct ocfs2_dinode *) bh->b_data; 78 79 if (!OCFS2_IS_VALID_DINODE(fe)) { 80 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 81 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 82 fe->i_signature); 83 goto bail; 84 } 85 86 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, 87 le32_to_cpu(fe->i_clusters))) { 88 mlog(ML_ERROR, "block offset is outside the allocated size: " 89 "%llu\n", (unsigned long long)iblock); 90 goto bail; 91 } 92 93 /* We don't use the page cache to create symlink data, so if 94 * need be, copy it over from the buffer cache. */ 95 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { 96 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + 97 iblock; 98 buffer_cache_bh = sb_getblk(osb->sb, blkno); 99 if (!buffer_cache_bh) { 100 mlog(ML_ERROR, "couldn't getblock for symlink!\n"); 101 goto bail; 102 } 103 104 /* we haven't locked out transactions, so a commit 105 * could've happened. Since we've got a reference on 106 * the bh, even if it commits while we're doing the 107 * copy, the data is still good. */ 108 if (buffer_jbd(buffer_cache_bh) 109 && ocfs2_inode_is_new(inode)) { 110 kaddr = kmap_atomic(bh_result->b_page, KM_USER0); 111 if (!kaddr) { 112 mlog(ML_ERROR, "couldn't kmap!\n"); 113 goto bail; 114 } 115 memcpy(kaddr + (bh_result->b_size * iblock), 116 buffer_cache_bh->b_data, 117 bh_result->b_size); 118 kunmap_atomic(kaddr, KM_USER0); 119 set_buffer_uptodate(bh_result); 120 } 121 brelse(buffer_cache_bh); 122 } 123 124 map_bh(bh_result, inode->i_sb, 125 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); 126 127 err = 0; 128 129 bail: 130 if (bh) 131 brelse(bh); 132 133 mlog_exit(err); 134 return err; 135 } 136 137 static int ocfs2_get_block(struct inode *inode, sector_t iblock, 138 struct buffer_head *bh_result, int create) 139 { 140 int err = 0; 141 unsigned int ext_flags; 142 u64 p_blkno, past_eof; 143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 144 145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 146 (unsigned long long)iblock, bh_result, create); 147 148 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 149 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", 150 inode, inode->i_ino); 151 152 if (S_ISLNK(inode->i_mode)) { 153 /* this always does I/O for some reason. */ 154 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); 155 goto bail; 156 } 157 158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 159 &ext_flags); 160 if (err) { 161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 162 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 163 (unsigned long long)p_blkno); 164 goto bail; 165 } 166 167 /* 168 * ocfs2 never allocates in this function - the only time we 169 * need to use BH_New is when we're extending i_size on a file 170 * system which doesn't support holes, in which case BH_New 171 * allows block_prepare_write() to zero. 172 */ 173 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), 174 "ino %lu, iblock %llu\n", inode->i_ino, 175 (unsigned long long)iblock); 176 177 /* Treat the unwritten extent as a hole for zeroing purposes. */ 178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 179 map_bh(bh_result, inode->i_sb, p_blkno); 180 181 if (!ocfs2_sparse_alloc(osb)) { 182 if (p_blkno == 0) { 183 err = -EIO; 184 mlog(ML_ERROR, 185 "iblock = %llu p_blkno = %llu blkno=(%llu)\n", 186 (unsigned long long)iblock, 187 (unsigned long long)p_blkno, 188 (unsigned long long)OCFS2_I(inode)->ip_blkno); 189 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); 190 dump_stack(); 191 } 192 193 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 194 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 195 (unsigned long long)past_eof); 196 197 if (create && (iblock >= past_eof)) 198 set_buffer_new(bh_result); 199 } 200 201 bail: 202 if (err < 0) 203 err = -EIO; 204 205 mlog_exit(err); 206 return err; 207 } 208 209 static int ocfs2_readpage(struct file *file, struct page *page) 210 { 211 struct inode *inode = page->mapping->host; 212 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 213 int ret, unlock = 1; 214 215 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 216 217 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 218 if (ret != 0) { 219 if (ret == AOP_TRUNCATED_PAGE) 220 unlock = 0; 221 mlog_errno(ret); 222 goto out; 223 } 224 225 down_read(&OCFS2_I(inode)->ip_alloc_sem); 226 227 /* 228 * i_size might have just been updated as we grabed the meta lock. We 229 * might now be discovering a truncate that hit on another node. 230 * block_read_full_page->get_block freaks out if it is asked to read 231 * beyond the end of a file, so we check here. Callers 232 * (generic_file_read, fault->nopage) are clever enough to check i_size 233 * and notice that the page they just read isn't needed. 234 * 235 * XXX sys_readahead() seems to get that wrong? 236 */ 237 if (start >= i_size_read(inode)) { 238 char *addr = kmap(page); 239 memset(addr, 0, PAGE_SIZE); 240 flush_dcache_page(page); 241 kunmap(page); 242 SetPageUptodate(page); 243 ret = 0; 244 goto out_alloc; 245 } 246 247 ret = ocfs2_data_lock_with_page(inode, 0, page); 248 if (ret != 0) { 249 if (ret == AOP_TRUNCATED_PAGE) 250 unlock = 0; 251 mlog_errno(ret); 252 goto out_alloc; 253 } 254 255 ret = block_read_full_page(page, ocfs2_get_block); 256 unlock = 0; 257 258 ocfs2_data_unlock(inode, 0); 259 out_alloc: 260 up_read(&OCFS2_I(inode)->ip_alloc_sem); 261 ocfs2_meta_unlock(inode, 0); 262 out: 263 if (unlock) 264 unlock_page(page); 265 mlog_exit(ret); 266 return ret; 267 } 268 269 /* Note: Because we don't support holes, our allocation has 270 * already happened (allocation writes zeros to the file data) 271 * so we don't have to worry about ordered writes in 272 * ocfs2_writepage. 273 * 274 * ->writepage is called during the process of invalidating the page cache 275 * during blocked lock processing. It can't block on any cluster locks 276 * to during block mapping. It's relying on the fact that the block 277 * mapping can't have disappeared under the dirty pages that it is 278 * being asked to write back. 279 */ 280 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) 281 { 282 int ret; 283 284 mlog_entry("(0x%p)\n", page); 285 286 ret = block_write_full_page(page, ocfs2_get_block, wbc); 287 288 mlog_exit(ret); 289 290 return ret; 291 } 292 293 /* 294 * This is called from ocfs2_write_zero_page() which has handled it's 295 * own cluster locking and has ensured allocation exists for those 296 * blocks to be written. 297 */ 298 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 299 unsigned from, unsigned to) 300 { 301 int ret; 302 303 down_read(&OCFS2_I(inode)->ip_alloc_sem); 304 305 ret = block_prepare_write(page, from, to, ocfs2_get_block); 306 307 up_read(&OCFS2_I(inode)->ip_alloc_sem); 308 309 return ret; 310 } 311 312 /* Taken from ext3. We don't necessarily need the full blown 313 * functionality yet, but IMHO it's better to cut and paste the whole 314 * thing so we can avoid introducing our own bugs (and easily pick up 315 * their fixes when they happen) --Mark */ 316 int walk_page_buffers( handle_t *handle, 317 struct buffer_head *head, 318 unsigned from, 319 unsigned to, 320 int *partial, 321 int (*fn)( handle_t *handle, 322 struct buffer_head *bh)) 323 { 324 struct buffer_head *bh; 325 unsigned block_start, block_end; 326 unsigned blocksize = head->b_size; 327 int err, ret = 0; 328 struct buffer_head *next; 329 330 for ( bh = head, block_start = 0; 331 ret == 0 && (bh != head || !block_start); 332 block_start = block_end, bh = next) 333 { 334 next = bh->b_this_page; 335 block_end = block_start + blocksize; 336 if (block_end <= from || block_start >= to) { 337 if (partial && !buffer_uptodate(bh)) 338 *partial = 1; 339 continue; 340 } 341 err = (*fn)(handle, bh); 342 if (!ret) 343 ret = err; 344 } 345 return ret; 346 } 347 348 handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 349 struct page *page, 350 unsigned from, 351 unsigned to) 352 { 353 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 354 handle_t *handle = NULL; 355 int ret = 0; 356 357 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 358 if (!handle) { 359 ret = -ENOMEM; 360 mlog_errno(ret); 361 goto out; 362 } 363 364 if (ocfs2_should_order_data(inode)) { 365 ret = walk_page_buffers(handle, 366 page_buffers(page), 367 from, to, NULL, 368 ocfs2_journal_dirty_data); 369 if (ret < 0) 370 mlog_errno(ret); 371 } 372 out: 373 if (ret) { 374 if (handle) 375 ocfs2_commit_trans(osb, handle); 376 handle = ERR_PTR(ret); 377 } 378 return handle; 379 } 380 381 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 382 { 383 sector_t status; 384 u64 p_blkno = 0; 385 int err = 0; 386 struct inode *inode = mapping->host; 387 388 mlog_entry("(block = %llu)\n", (unsigned long long)block); 389 390 /* We don't need to lock journal system files, since they aren't 391 * accessed concurrently from multiple nodes. 392 */ 393 if (!INODE_JOURNAL(inode)) { 394 err = ocfs2_meta_lock(inode, NULL, 0); 395 if (err) { 396 if (err != -ENOENT) 397 mlog_errno(err); 398 goto bail; 399 } 400 down_read(&OCFS2_I(inode)->ip_alloc_sem); 401 } 402 403 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); 404 405 if (!INODE_JOURNAL(inode)) { 406 up_read(&OCFS2_I(inode)->ip_alloc_sem); 407 ocfs2_meta_unlock(inode, 0); 408 } 409 410 if (err) { 411 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", 412 (unsigned long long)block); 413 mlog_errno(err); 414 goto bail; 415 } 416 417 418 bail: 419 status = err ? 0 : p_blkno; 420 421 mlog_exit((int)status); 422 423 return status; 424 } 425 426 /* 427 * TODO: Make this into a generic get_blocks function. 428 * 429 * From do_direct_io in direct-io.c: 430 * "So what we do is to permit the ->get_blocks function to populate 431 * bh.b_size with the size of IO which is permitted at this offset and 432 * this i_blkbits." 433 * 434 * This function is called directly from get_more_blocks in direct-io.c. 435 * 436 * called like this: dio->get_blocks(dio->inode, fs_startblk, 437 * fs_count, map_bh, dio->rw == WRITE); 438 */ 439 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 440 struct buffer_head *bh_result, int create) 441 { 442 int ret; 443 u64 p_blkno, inode_blocks, contig_blocks; 444 unsigned int ext_flags; 445 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 446 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 447 448 /* This function won't even be called if the request isn't all 449 * nicely aligned and of the right size, so there's no need 450 * for us to check any of that. */ 451 452 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 453 454 /* 455 * Any write past EOF is not allowed because we'd be extending. 456 */ 457 if (create && (iblock + max_blocks) > inode_blocks) { 458 ret = -EIO; 459 goto bail; 460 } 461 462 /* This figures out the size of the next contiguous block, and 463 * our logical offset */ 464 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 465 &contig_blocks, &ext_flags); 466 if (ret) { 467 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 468 (unsigned long long)iblock); 469 ret = -EIO; 470 goto bail; 471 } 472 473 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 474 ocfs2_error(inode->i_sb, 475 "Inode %llu has a hole at block %llu\n", 476 (unsigned long long)OCFS2_I(inode)->ip_blkno, 477 (unsigned long long)iblock); 478 ret = -EROFS; 479 goto bail; 480 } 481 482 /* 483 * get_more_blocks() expects us to describe a hole by clearing 484 * the mapped bit on bh_result(). 485 * 486 * Consider an unwritten extent as a hole. 487 */ 488 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 489 map_bh(bh_result, inode->i_sb, p_blkno); 490 else { 491 /* 492 * ocfs2_prepare_inode_for_write() should have caught 493 * the case where we'd be filling a hole and triggered 494 * a buffered write instead. 495 */ 496 if (create) { 497 ret = -EIO; 498 mlog_errno(ret); 499 goto bail; 500 } 501 502 clear_buffer_mapped(bh_result); 503 } 504 505 /* make sure we don't map more than max_blocks blocks here as 506 that's all the kernel will handle at this point. */ 507 if (max_blocks < contig_blocks) 508 contig_blocks = max_blocks; 509 bh_result->b_size = contig_blocks << blocksize_bits; 510 bail: 511 return ret; 512 } 513 514 /* 515 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 516 * particularly interested in the aio/dio case. Like the core uses 517 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 518 * truncation on another. 519 */ 520 static void ocfs2_dio_end_io(struct kiocb *iocb, 521 loff_t offset, 522 ssize_t bytes, 523 void *private) 524 { 525 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 526 int level; 527 528 /* this io's submitter should not have unlocked this before we could */ 529 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 530 531 ocfs2_iocb_clear_rw_locked(iocb); 532 533 level = ocfs2_iocb_rw_locked_level(iocb); 534 if (!level) 535 up_read(&inode->i_alloc_sem); 536 ocfs2_rw_unlock(inode, level); 537 } 538 539 /* 540 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen 541 * from ext3. PageChecked() bits have been removed as OCFS2 does not 542 * do journalled data. 543 */ 544 static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 545 { 546 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 547 548 journal_invalidatepage(journal, page, offset); 549 } 550 551 static int ocfs2_releasepage(struct page *page, gfp_t wait) 552 { 553 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 554 555 if (!page_has_buffers(page)) 556 return 0; 557 return journal_try_to_free_buffers(journal, page, wait); 558 } 559 560 static ssize_t ocfs2_direct_IO(int rw, 561 struct kiocb *iocb, 562 const struct iovec *iov, 563 loff_t offset, 564 unsigned long nr_segs) 565 { 566 struct file *file = iocb->ki_filp; 567 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 568 int ret; 569 570 mlog_entry_void(); 571 572 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 573 /* 574 * We get PR data locks even for O_DIRECT. This 575 * allows concurrent O_DIRECT I/O but doesn't let 576 * O_DIRECT with extending and buffered zeroing writes 577 * race. If they did race then the buffered zeroing 578 * could be written back after the O_DIRECT I/O. It's 579 * one thing to tell people not to mix buffered and 580 * O_DIRECT writes, but expecting them to understand 581 * that file extension is also an implicit buffered 582 * write is too much. By getting the PR we force 583 * writeback of the buffered zeroing before 584 * proceeding. 585 */ 586 ret = ocfs2_data_lock(inode, 0); 587 if (ret < 0) { 588 mlog_errno(ret); 589 goto out; 590 } 591 ocfs2_data_unlock(inode, 0); 592 } 593 594 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 595 inode->i_sb->s_bdev, iov, offset, 596 nr_segs, 597 ocfs2_direct_IO_get_blocks, 598 ocfs2_dio_end_io); 599 out: 600 mlog_exit(ret); 601 return ret; 602 } 603 604 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 605 u32 cpos, 606 unsigned int *start, 607 unsigned int *end) 608 { 609 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; 610 611 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { 612 unsigned int cpp; 613 614 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); 615 616 cluster_start = cpos % cpp; 617 cluster_start = cluster_start << osb->s_clustersize_bits; 618 619 cluster_end = cluster_start + osb->s_clustersize; 620 } 621 622 BUG_ON(cluster_start > PAGE_SIZE); 623 BUG_ON(cluster_end > PAGE_SIZE); 624 625 if (start) 626 *start = cluster_start; 627 if (end) 628 *end = cluster_end; 629 } 630 631 /* 632 * 'from' and 'to' are the region in the page to avoid zeroing. 633 * 634 * If pagesize > clustersize, this function will avoid zeroing outside 635 * of the cluster boundary. 636 * 637 * from == to == 0 is code for "zero the entire cluster region" 638 */ 639 static void ocfs2_clear_page_regions(struct page *page, 640 struct ocfs2_super *osb, u32 cpos, 641 unsigned from, unsigned to) 642 { 643 void *kaddr; 644 unsigned int cluster_start, cluster_end; 645 646 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); 647 648 kaddr = kmap_atomic(page, KM_USER0); 649 650 if (from || to) { 651 if (from > cluster_start) 652 memset(kaddr + cluster_start, 0, from - cluster_start); 653 if (to < cluster_end) 654 memset(kaddr + to, 0, cluster_end - to); 655 } else { 656 memset(kaddr + cluster_start, 0, cluster_end - cluster_start); 657 } 658 659 kunmap_atomic(kaddr, KM_USER0); 660 } 661 662 /* 663 * Some of this taken from block_prepare_write(). We already have our 664 * mapping by now though, and the entire write will be allocating or 665 * it won't, so not much need to use BH_New. 666 * 667 * This will also skip zeroing, which is handled externally. 668 */ 669 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, 670 struct inode *inode, unsigned int from, 671 unsigned int to, int new) 672 { 673 int ret = 0; 674 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; 675 unsigned int block_end, block_start; 676 unsigned int bsize = 1 << inode->i_blkbits; 677 678 if (!page_has_buffers(page)) 679 create_empty_buffers(page, bsize, 0); 680 681 head = page_buffers(page); 682 for (bh = head, block_start = 0; bh != head || !block_start; 683 bh = bh->b_this_page, block_start += bsize) { 684 block_end = block_start + bsize; 685 686 /* 687 * Ignore blocks outside of our i/o range - 688 * they may belong to unallocated clusters. 689 */ 690 if (block_start >= to || block_end <= from) { 691 if (PageUptodate(page)) 692 set_buffer_uptodate(bh); 693 continue; 694 } 695 696 /* 697 * For an allocating write with cluster size >= page 698 * size, we always write the entire page. 699 */ 700 701 if (buffer_new(bh)) 702 clear_buffer_new(bh); 703 704 if (!buffer_mapped(bh)) { 705 map_bh(bh, inode->i_sb, *p_blkno); 706 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 707 } 708 709 if (PageUptodate(page)) { 710 if (!buffer_uptodate(bh)) 711 set_buffer_uptodate(bh); 712 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 713 (block_start < from || block_end > to)) { 714 ll_rw_block(READ, 1, &bh); 715 *wait_bh++=bh; 716 } 717 718 *p_blkno = *p_blkno + 1; 719 } 720 721 /* 722 * If we issued read requests - let them complete. 723 */ 724 while(wait_bh > wait) { 725 wait_on_buffer(*--wait_bh); 726 if (!buffer_uptodate(*wait_bh)) 727 ret = -EIO; 728 } 729 730 if (ret == 0 || !new) 731 return ret; 732 733 /* 734 * If we get -EIO above, zero out any newly allocated blocks 735 * to avoid exposing stale data. 736 */ 737 bh = head; 738 block_start = 0; 739 do { 740 void *kaddr; 741 742 block_end = block_start + bsize; 743 if (block_end <= from) 744 goto next_bh; 745 if (block_start >= to) 746 break; 747 748 kaddr = kmap_atomic(page, KM_USER0); 749 memset(kaddr+block_start, 0, bh->b_size); 750 flush_dcache_page(page); 751 kunmap_atomic(kaddr, KM_USER0); 752 set_buffer_uptodate(bh); 753 mark_buffer_dirty(bh); 754 755 next_bh: 756 block_start = block_end; 757 bh = bh->b_this_page; 758 } while (bh != head); 759 760 return ret; 761 } 762 763 /* 764 * This will copy user data from the buffer page in the splice 765 * context. 766 * 767 * For now, we ignore SPLICE_F_MOVE as that would require some extra 768 * communication out all the way to ocfs2_write(). 769 */ 770 int ocfs2_map_and_write_splice_data(struct inode *inode, 771 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 772 unsigned int *ret_from, unsigned int *ret_to) 773 { 774 int ret; 775 unsigned int to, from, cluster_start, cluster_end; 776 char *src, *dst; 777 struct ocfs2_splice_write_priv *sp = wc->w_private; 778 struct pipe_buffer *buf = sp->s_buf; 779 unsigned long bytes, src_from; 780 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 781 782 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 783 &cluster_end); 784 785 from = sp->s_offset; 786 src_from = sp->s_buf_offset; 787 bytes = wc->w_count; 788 789 if (wc->w_large_pages) { 790 /* 791 * For cluster size < page size, we have to 792 * calculate pos within the cluster and obey 793 * the rightmost boundary. 794 */ 795 bytes = min(bytes, (unsigned long)(osb->s_clustersize 796 - (wc->w_pos & (osb->s_clustersize - 1)))); 797 } 798 to = from + bytes; 799 800 if (wc->w_this_page_new) 801 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 802 cluster_start, cluster_end, 1); 803 else 804 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 805 from, to, 0); 806 if (ret) { 807 mlog_errno(ret); 808 goto out; 809 } 810 811 BUG_ON(from > PAGE_CACHE_SIZE); 812 BUG_ON(to > PAGE_CACHE_SIZE); 813 BUG_ON(from > osb->s_clustersize); 814 BUG_ON(to > osb->s_clustersize); 815 816 src = buf->ops->map(sp->s_pipe, buf, 1); 817 dst = kmap_atomic(wc->w_this_page, KM_USER1); 818 memcpy(dst + from, src + src_from, bytes); 819 kunmap_atomic(wc->w_this_page, KM_USER1); 820 buf->ops->unmap(sp->s_pipe, buf, src); 821 822 wc->w_finished_copy = 1; 823 824 *ret_from = from; 825 *ret_to = to; 826 out: 827 828 return bytes ? (unsigned int)bytes : ret; 829 } 830 831 /* 832 * This will copy user data from the iovec in the buffered write 833 * context. 834 */ 835 int ocfs2_map_and_write_user_data(struct inode *inode, 836 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 837 unsigned int *ret_from, unsigned int *ret_to) 838 { 839 int ret; 840 unsigned int to, from, cluster_start, cluster_end; 841 unsigned long bytes, src_from; 842 char *dst; 843 struct ocfs2_buffered_write_priv *bp = wc->w_private; 844 const struct iovec *cur_iov = bp->b_cur_iov; 845 char __user *buf; 846 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 847 848 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 849 &cluster_end); 850 851 buf = cur_iov->iov_base + bp->b_cur_off; 852 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 853 854 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 855 856 /* 857 * This is a lot of comparisons, but it reads quite 858 * easily, which is important here. 859 */ 860 /* Stay within the src page */ 861 bytes = PAGE_SIZE - src_from; 862 /* Stay within the vector */ 863 bytes = min(bytes, 864 (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); 865 /* Stay within count */ 866 bytes = min(bytes, (unsigned long)wc->w_count); 867 /* 868 * For clustersize > page size, just stay within 869 * target page, otherwise we have to calculate pos 870 * within the cluster and obey the rightmost 871 * boundary. 872 */ 873 if (wc->w_large_pages) { 874 /* 875 * For cluster size < page size, we have to 876 * calculate pos within the cluster and obey 877 * the rightmost boundary. 878 */ 879 bytes = min(bytes, (unsigned long)(osb->s_clustersize 880 - (wc->w_pos & (osb->s_clustersize - 1)))); 881 } else { 882 /* 883 * cluster size > page size is the most common 884 * case - we just stay within the target page 885 * boundary. 886 */ 887 bytes = min(bytes, PAGE_CACHE_SIZE - from); 888 } 889 890 to = from + bytes; 891 892 if (wc->w_this_page_new) 893 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 894 cluster_start, cluster_end, 1); 895 else 896 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 897 from, to, 0); 898 if (ret) { 899 mlog_errno(ret); 900 goto out; 901 } 902 903 BUG_ON(from > PAGE_CACHE_SIZE); 904 BUG_ON(to > PAGE_CACHE_SIZE); 905 BUG_ON(from > osb->s_clustersize); 906 BUG_ON(to > osb->s_clustersize); 907 908 dst = kmap(wc->w_this_page); 909 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 910 kunmap(wc->w_this_page); 911 912 /* 913 * XXX: This is slow, but simple. The caller of 914 * ocfs2_buffered_write_cluster() is responsible for 915 * passing through the iovecs, so it's difficult to 916 * predict what our next step is in here after our 917 * initial write. A future version should be pushing 918 * that iovec manipulation further down. 919 * 920 * By setting this, we indicate that a copy from user 921 * data was done, and subsequent calls for this 922 * cluster will skip copying more data. 923 */ 924 wc->w_finished_copy = 1; 925 926 *ret_from = from; 927 *ret_to = to; 928 out: 929 930 return bytes ? (unsigned int)bytes : ret; 931 } 932 933 /* 934 * Map, fill and write a page to disk. 935 * 936 * The work of copying data is done via callback. Newly allocated 937 * pages which don't take user data will be zero'd (set 'new' to 938 * indicate an allocating write) 939 * 940 * Returns a negative error code or the number of bytes copied into 941 * the page. 942 */ 943 static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, 944 u64 *p_blkno, struct page *page, 945 struct ocfs2_write_ctxt *wc, int new) 946 { 947 int ret, copied = 0; 948 unsigned int from = 0, to = 0; 949 unsigned int cluster_start, cluster_end; 950 unsigned int zero_from = 0, zero_to = 0; 951 952 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 953 &cluster_start, &cluster_end); 954 955 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 956 && !wc->w_finished_copy) { 957 958 wc->w_this_page = page; 959 wc->w_this_page_new = new; 960 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 961 if (ret < 0) { 962 mlog_errno(ret); 963 goto out; 964 } 965 966 copied = ret; 967 968 zero_from = from; 969 zero_to = to; 970 if (new) { 971 from = cluster_start; 972 to = cluster_end; 973 } 974 } else { 975 /* 976 * If we haven't allocated the new page yet, we 977 * shouldn't be writing it out without copying user 978 * data. This is likely a math error from the caller. 979 */ 980 BUG_ON(!new); 981 982 from = cluster_start; 983 to = cluster_end; 984 985 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 986 cluster_start, cluster_end, 1); 987 if (ret) { 988 mlog_errno(ret); 989 goto out; 990 } 991 } 992 993 /* 994 * Parts of newly allocated pages need to be zero'd. 995 * 996 * Above, we have also rewritten 'to' and 'from' - as far as 997 * the rest of the function is concerned, the entire cluster 998 * range inside of a page needs to be written. 999 * 1000 * We can skip this if the page is up to date - it's already 1001 * been zero'd from being read in as a hole. 1002 */ 1003 if (new && !PageUptodate(page)) 1004 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1005 wc->w_cpos, zero_from, zero_to); 1006 1007 flush_dcache_page(page); 1008 1009 if (ocfs2_should_order_data(inode)) { 1010 ret = walk_page_buffers(handle, 1011 page_buffers(page), 1012 from, to, NULL, 1013 ocfs2_journal_dirty_data); 1014 if (ret < 0) 1015 mlog_errno(ret); 1016 } 1017 1018 /* 1019 * We don't use generic_commit_write() because we need to 1020 * handle our own i_size update. 1021 */ 1022 ret = block_commit_write(page, from, to); 1023 if (ret) 1024 mlog_errno(ret); 1025 out: 1026 1027 return copied ? copied : ret; 1028 } 1029 1030 /* 1031 * Do the actual write of some data into an inode. Optionally allocate 1032 * in order to fulfill the write. 1033 * 1034 * cpos is the logical cluster offset within the file to write at 1035 * 1036 * 'phys' is the physical mapping of that offset. a 'phys' value of 1037 * zero indicates that allocation is required. In this case, data_ac 1038 * and meta_ac should be valid (meta_ac can be null if metadata 1039 * allocation isn't required). 1040 */ 1041 static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1042 struct buffer_head *di_bh, 1043 struct ocfs2_alloc_context *data_ac, 1044 struct ocfs2_alloc_context *meta_ac, 1045 struct ocfs2_write_ctxt *wc) 1046 { 1047 int ret, i, numpages = 1, new; 1048 unsigned int copied = 0; 1049 u32 tmp_pos; 1050 u64 v_blkno, p_blkno; 1051 struct address_space *mapping = file->f_mapping; 1052 struct inode *inode = mapping->host; 1053 unsigned long index, start; 1054 struct page **cpages; 1055 1056 new = phys == 0 ? 1 : 0; 1057 1058 /* 1059 * Figure out how many pages we'll be manipulating here. For 1060 * non allocating write, we just change the one 1061 * page. Otherwise, we'll need a whole clusters worth. 1062 */ 1063 if (new) 1064 numpages = ocfs2_pages_per_cluster(inode->i_sb); 1065 1066 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); 1067 if (!cpages) { 1068 ret = -ENOMEM; 1069 mlog_errno(ret); 1070 return ret; 1071 } 1072 1073 /* 1074 * Fill our page array first. That way we've grabbed enough so 1075 * that we can zero and flush if we error after adding the 1076 * extent. 1077 */ 1078 if (new) { 1079 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1080 wc->w_cpos); 1081 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); 1082 } else { 1083 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1084 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1085 } 1086 1087 for(i = 0; i < numpages; i++) { 1088 index = start + i; 1089 1090 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1091 if (!cpages[i]) { 1092 ret = -ENOMEM; 1093 mlog_errno(ret); 1094 goto out; 1095 } 1096 } 1097 1098 if (new) { 1099 /* 1100 * This is safe to call with the page locks - it won't take 1101 * any additional semaphores or cluster locks. 1102 */ 1103 tmp_pos = wc->w_cpos; 1104 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1105 &tmp_pos, 1, di_bh, handle, 1106 data_ac, meta_ac, NULL); 1107 /* 1108 * This shouldn't happen because we must have already 1109 * calculated the correct meta data allocation required. The 1110 * internal tree allocation code should know how to increase 1111 * transaction credits itself. 1112 * 1113 * If need be, we could handle -EAGAIN for a 1114 * RESTART_TRANS here. 1115 */ 1116 mlog_bug_on_msg(ret == -EAGAIN, 1117 "Inode %llu: EAGAIN return during allocation.\n", 1118 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1119 if (ret < 0) { 1120 mlog_errno(ret); 1121 goto out; 1122 } 1123 } 1124 1125 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1126 NULL); 1127 if (ret < 0) { 1128 1129 /* 1130 * XXX: Should we go readonly here? 1131 */ 1132 1133 mlog_errno(ret); 1134 goto out; 1135 } 1136 1137 BUG_ON(p_blkno == 0); 1138 1139 for(i = 0; i < numpages; i++) { 1140 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1141 wc, new); 1142 if (ret < 0) { 1143 mlog_errno(ret); 1144 goto out; 1145 } 1146 1147 copied += ret; 1148 } 1149 1150 out: 1151 for(i = 0; i < numpages; i++) { 1152 unlock_page(cpages[i]); 1153 mark_page_accessed(cpages[i]); 1154 page_cache_release(cpages[i]); 1155 } 1156 kfree(cpages); 1157 1158 return copied ? copied : ret; 1159 } 1160 1161 static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1162 struct ocfs2_super *osb, loff_t pos, 1163 size_t count, ocfs2_page_writer *cb, 1164 void *cb_priv) 1165 { 1166 wc->w_count = count; 1167 wc->w_pos = pos; 1168 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; 1169 wc->w_finished_copy = 0; 1170 1171 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1172 wc->w_large_pages = 1; 1173 else 1174 wc->w_large_pages = 0; 1175 1176 wc->w_write_data_page = cb; 1177 wc->w_private = cb_priv; 1178 } 1179 1180 /* 1181 * Write a cluster to an inode. The cluster may not be allocated yet, 1182 * in which case it will be. This only exists for buffered writes - 1183 * O_DIRECT takes a more "traditional" path through the kernel. 1184 * 1185 * The caller is responsible for incrementing pos, written counts, etc 1186 * 1187 * For file systems that don't support sparse files, pre-allocation 1188 * and page zeroing up until cpos should be done prior to this 1189 * function call. 1190 * 1191 * Callers should be holding i_sem, and the rw cluster lock. 1192 * 1193 * Returns the number of user bytes written, or less than zero for 1194 * error. 1195 */ 1196 ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1197 size_t count, ocfs2_page_writer *actor, 1198 void *priv) 1199 { 1200 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1201 ssize_t written = 0; 1202 u32 phys; 1203 struct inode *inode = file->f_mapping->host; 1204 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1205 struct buffer_head *di_bh = NULL; 1206 struct ocfs2_dinode *di; 1207 struct ocfs2_alloc_context *data_ac = NULL; 1208 struct ocfs2_alloc_context *meta_ac = NULL; 1209 handle_t *handle; 1210 struct ocfs2_write_ctxt wc; 1211 1212 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); 1213 1214 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1215 if (ret) { 1216 mlog_errno(ret); 1217 goto out; 1218 } 1219 di = (struct ocfs2_dinode *)di_bh->b_data; 1220 1221 /* 1222 * Take alloc sem here to prevent concurrent lookups. That way 1223 * the mapping, zeroing and tree manipulation within 1224 * ocfs2_write() will be safe against ->readpage(). This 1225 * should also serve to lock out allocation from a shared 1226 * writeable region. 1227 */ 1228 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1229 1230 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1231 if (ret) { 1232 mlog_errno(ret); 1233 goto out_meta; 1234 } 1235 1236 /* phys == 0 means that allocation is required. */ 1237 if (phys == 0) { 1238 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1239 if (ret) { 1240 mlog_errno(ret); 1241 goto out_meta; 1242 } 1243 1244 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1245 } 1246 1247 ret = ocfs2_data_lock(inode, 1); 1248 if (ret) { 1249 mlog_errno(ret); 1250 goto out_meta; 1251 } 1252 1253 handle = ocfs2_start_trans(osb, credits); 1254 if (IS_ERR(handle)) { 1255 ret = PTR_ERR(handle); 1256 mlog_errno(ret); 1257 goto out_data; 1258 } 1259 1260 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1261 meta_ac, &wc); 1262 if (written < 0) { 1263 ret = written; 1264 mlog_errno(ret); 1265 goto out_commit; 1266 } 1267 1268 ret = ocfs2_journal_access(handle, inode, di_bh, 1269 OCFS2_JOURNAL_ACCESS_WRITE); 1270 if (ret) { 1271 mlog_errno(ret); 1272 goto out_commit; 1273 } 1274 1275 pos += written; 1276 if (pos > inode->i_size) { 1277 i_size_write(inode, pos); 1278 mark_inode_dirty(inode); 1279 } 1280 inode->i_blocks = ocfs2_inode_sector_count(inode); 1281 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 1282 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1283 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1284 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1285 1286 ret = ocfs2_journal_dirty(handle, di_bh); 1287 if (ret) 1288 mlog_errno(ret); 1289 1290 out_commit: 1291 ocfs2_commit_trans(osb, handle); 1292 1293 out_data: 1294 ocfs2_data_unlock(inode, 1); 1295 1296 out_meta: 1297 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1298 ocfs2_meta_unlock(inode, 1); 1299 1300 out: 1301 brelse(di_bh); 1302 if (data_ac) 1303 ocfs2_free_alloc_context(data_ac); 1304 if (meta_ac) 1305 ocfs2_free_alloc_context(meta_ac); 1306 1307 return written ? written : ret; 1308 } 1309 1310 const struct address_space_operations ocfs2_aops = { 1311 .readpage = ocfs2_readpage, 1312 .writepage = ocfs2_writepage, 1313 .bmap = ocfs2_bmap, 1314 .sync_page = block_sync_page, 1315 .direct_IO = ocfs2_direct_IO, 1316 .invalidatepage = ocfs2_invalidatepage, 1317 .releasepage = ocfs2_releasepage, 1318 .migratepage = buffer_migrate_page, 1319 }; 1320