/*
 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program/include file is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program (in the main directory of the Linux-NTFS
 * distribution in the file COPYING); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>

#include <asm/page.h>
#include <asm/uaccess.h>

#include "attrib.h"
#include "bitmap.h"
#include "inode.h"
#include "debug.h"
#include "lcnalloc.h"
#include "malloc.h"
#include "mft.h"
#include "ntfs.h"

/**
 * ntfs_file_open - called when an inode is about to be opened
 * @vi:   inode to be opened
 * @filp: file structure describing the inode
 *
 * Limit file size to the page cache limit on architectures where unsigned long
 * is 32-bits. This is the most we can do for now without overflowing the page
 * cache page index. Doing it this way means we don't run into problems because
 * of existing too large files. It would be better to allow the user to read
 * the beginning of the file but I doubt very much anyone is going to hit this
 * check on a 32-bit architecture, so there is no point in adding the extra
 * complexity required to support this.
 *
 * On 64-bit architectures, the check is hopefully optimized away by the
 * compiler.
 *
 * After the check passes, just call generic_file_open() to do its work.
 */
static int ntfs_file_open(struct inode *vi, struct file *filp)
{
	if (sizeof(unsigned long) < 8) {
		if (i_size_read(vi) > MAX_LFS_FILESIZE)
			return -EOVERFLOW;
	}
	return generic_file_open(vi, filp);
}

#ifdef NTFS_RW

/**
 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
 * @ni:            ntfs inode of the attribute to extend
 * @new_init_size: requested new initialized size in bytes
 *
 * Extend the initialized size of an attribute described by the ntfs inode @ni
 * to @new_init_size bytes. This involves zeroing any non-sparse space between
 * the old initialized size and @new_init_size both in the page cache and on
 * disk (if relevant complete pages are already uptodate in the page cache then
 * these are simply marked dirty).
 *
 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
 * in the resident attribute case, it is tied to the initialized size and, in
 * the non-resident attribute case, it may not fall below the initialized size.
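 *
 * For example (illustrative sizes): if a non-resident attribute has an
 * i_size of 8192 bytes and its initialized size is extended to 16384 bytes,
 * the region from byte 8192 to byte 16383 is zeroed in the page cache and
 * i_size is bumped to 16384 so that it does not fall below the new
 * initialized size.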
 *
 * Note that if the attribute is resident, we do not need to touch the page
 * cache at all. This is because if the page cache page is not uptodate we
 * bring it uptodate later, when doing the write to the mft record since we
 * then already have the page mapped. And if the page is uptodate, the
 * non-initialized region will already have been zeroed when the page was
 * brought uptodate and the region may in fact already have been overwritten
 * with new data via mmap() based writes, so we cannot just zero it. And since
 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
 * is unspecified, we choose not to do zeroing and thus we do not need to touch
 * the page at all. For a more detailed explanation see ntfs_truncate() in
 * fs/ntfs/inode.c.
 *
 * Return 0 on success and -errno on error. In the case that an error is
 * encountered it is possible that the initialized size will already have been
 * incremented some way towards @new_init_size but it is guaranteed that if
 * this is the case, the necessary zeroing will also have happened and that all
 * metadata is self-consistent.
 *
 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must
 * be held by the caller.
 */
static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
{
	s64 old_init_size;
	loff_t old_i_size;
	pgoff_t index, end_index;
	unsigned long flags;
	struct inode *vi = VFS_I(ni);
	ntfs_inode *base_ni;
	MFT_RECORD *m = NULL;
	ATTR_RECORD *a;
	ntfs_attr_search_ctx *ctx = NULL;
	struct address_space *mapping;
	struct page *page = NULL;
	u8 *kattr;
	int err;
	u32 attr_len;

	read_lock_irqsave(&ni->size_lock, flags);
	old_init_size = ni->initialized_size;
	old_i_size = i_size_read(vi);
	BUG_ON(new_init_size > ni->allocated_size);
	read_unlock_irqrestore(&ni->size_lock, flags);
	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
			"old_initialized_size 0x%llx, "
			"new_initialized_size 0x%llx, i_size 0x%llx.",
			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
			(unsigned long long)old_init_size,
			(unsigned long long)new_init_size, old_i_size);
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Use goto to reduce indentation and we need the label below anyway. */
	if (NInoNonResident(ni))
		goto do_non_resident_extend;
	BUG_ON(old_init_size != old_i_size);
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		goto err_out;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		if (err == -ENOENT)
			err = -EIO;
		goto err_out;
	}
	m = ctx->mrec;
	a = ctx->attr;
	BUG_ON(a->non_resident);
	/* The total length of the attribute value. */
	attr_len = le32_to_cpu(a->data.resident.value_length);
	BUG_ON(old_i_size != (loff_t)attr_len);
	/*
	 * Do the zeroing in the mft record and update the attribute size in
	 * the mft record.
	 */
	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
	memset(kattr + attr_len, 0, new_init_size - attr_len);
	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
	/* Finally, update the sizes in the vfs and ntfs inodes.
*/ 178 write_lock_irqsave(&ni->size_lock, flags); 179 i_size_write(vi, new_init_size); 180 ni->initialized_size = new_init_size; 181 write_unlock_irqrestore(&ni->size_lock, flags); 182 goto done; 183 do_non_resident_extend: 184 /* 185 * If the new initialized size @new_init_size exceeds the current file 186 * size (vfs inode->i_size), we need to extend the file size to the 187 * new initialized size. 188 */ 189 if (new_init_size > old_i_size) { 190 m = map_mft_record(base_ni); 191 if (IS_ERR(m)) { 192 err = PTR_ERR(m); 193 m = NULL; 194 goto err_out; 195 } 196 ctx = ntfs_attr_get_search_ctx(base_ni, m); 197 if (unlikely(!ctx)) { 198 err = -ENOMEM; 199 goto err_out; 200 } 201 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 202 CASE_SENSITIVE, 0, NULL, 0, ctx); 203 if (unlikely(err)) { 204 if (err == -ENOENT) 205 err = -EIO; 206 goto err_out; 207 } 208 m = ctx->mrec; 209 a = ctx->attr; 210 BUG_ON(!a->non_resident); 211 BUG_ON(old_i_size != (loff_t) 212 sle64_to_cpu(a->data.non_resident.data_size)); 213 a->data.non_resident.data_size = cpu_to_sle64(new_init_size); 214 flush_dcache_mft_record_page(ctx->ntfs_ino); 215 mark_mft_record_dirty(ctx->ntfs_ino); 216 /* Update the file size in the vfs inode. */ 217 i_size_write(vi, new_init_size); 218 ntfs_attr_put_search_ctx(ctx); 219 ctx = NULL; 220 unmap_mft_record(base_ni); 221 m = NULL; 222 } 223 mapping = vi->i_mapping; 224 index = old_init_size >> PAGE_CACHE_SHIFT; 225 end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 226 do { 227 /* 228 * Read the page. If the page is not present, this will zero 229 * the uninitialized regions for us. 230 */ 231 page = read_mapping_page(mapping, index, NULL); 232 if (IS_ERR(page)) { 233 err = PTR_ERR(page); 234 goto init_err_out; 235 } 236 if (unlikely(PageError(page))) { 237 page_cache_release(page); 238 err = -EIO; 239 goto init_err_out; 240 } 241 /* 242 * Update the initialized size in the ntfs inode. This is 243 * enough to make ntfs_writepage() work. 244 */ 245 write_lock_irqsave(&ni->size_lock, flags); 246 ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; 247 if (ni->initialized_size > new_init_size) 248 ni->initialized_size = new_init_size; 249 write_unlock_irqrestore(&ni->size_lock, flags); 250 /* Set the page dirty so it gets written out. */ 251 set_page_dirty(page); 252 page_cache_release(page); 253 /* 254 * Play nice with the vm and the rest of the system. This is 255 * very much needed as we can potentially be modifying the 256 * initialised size from a very small value to a really huge 257 * value, e.g. 258 * f = open(somefile, O_TRUNC); 259 * truncate(f, 10GiB); 260 * seek(f, 10GiB); 261 * write(f, 1); 262 * And this would mean we would be marking dirty hundreds of 263 * thousands of pages or as in the above example more than 264 * two and a half million pages! 265 * 266 * TODO: For sparse pages could optimize this workload by using 267 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This 268 * would be set in readpage for sparse pages and here we would 269 * not need to mark dirty any pages which have this bit set. 270 * The only caveat is that we have to clear the bit everywhere 271 * where we allocate any clusters that lie in the page or that 272 * contain the page. 273 * 274 * TODO: An even greater optimization would be for us to only 275 * call readpage() on pages which are not in sparse regions as 276 * determined from the runlist. This would greatly reduce the 277 * number of pages we read and make dirty in the case of sparse 278 * files. 
		 */
		balance_dirty_pages_ratelimited(mapping);
		cond_resched();
	} while (++index < end_index);
	read_lock_irqsave(&ni->size_lock, flags);
	BUG_ON(ni->initialized_size != new_init_size);
	read_unlock_irqrestore(&ni->size_lock, flags);
	/* Now bring in sync the initialized_size in the mft record. */
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		goto init_err_out;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto init_err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		if (err == -ENOENT)
			err = -EIO;
		goto init_err_out;
	}
	m = ctx->mrec;
	a = ctx->attr;
	BUG_ON(!a->non_resident);
	a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
done:
	flush_dcache_mft_record_page(ctx->ntfs_ino);
	mark_mft_record_dirty(ctx->ntfs_ino);
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (m)
		unmap_mft_record(base_ni);
	ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
			(unsigned long long)new_init_size, i_size_read(vi));
	return 0;
init_err_out:
	write_lock_irqsave(&ni->size_lock, flags);
	ni->initialized_size = old_init_size;
	write_unlock_irqrestore(&ni->size_lock, flags);
err_out:
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (m)
		unmap_mft_record(base_ni);
	ntfs_debug("Failed. Returning error code %i.", err);
	return err;
}

/**
 * ntfs_fault_in_pages_readable - fault a range of userspace pages into pagetables
 *
 * Fault a number of userspace pages into pagetables.
 *
 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
 * with more than two userspace pages as well as handling the single page case
 * elegantly.
 *
 * If you find this difficult to understand, then think of the while loop being
 * the following code, except that we do without the integer variable ret:
 *
 *	do {
 *		ret = __get_user(c, uaddr);
 *		uaddr += PAGE_SIZE;
 *	} while (!ret && uaddr < end);
 *
 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
 * this is only a read and not a write, and since it is still in the same page,
 * it should not matter and this makes the code much simpler.
 */
static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
		int bytes)
{
	const char __user *end;
	volatile char c;

	/* Set @end to the first byte outside the last page we care about. */
	end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);

	while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
		;
}

/**
 * ntfs_fault_in_pages_readable_iovec - fault in userspace pages backing an array of iovecs
 *
 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
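 *
 * As an illustration (made-up values), with iov = { { base0, 0x1000 },
 * { base1, 0x3000 } }, @iov_ofs 0x800 and @bytes 0x2000, this faults in the
 * last 0x800 bytes of the first segment and the first 0x1800 bytes of the
 * second segment.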
371 */ 372 static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, 373 size_t iov_ofs, int bytes) 374 { 375 do { 376 const char __user *buf; 377 unsigned len; 378 379 buf = iov->iov_base + iov_ofs; 380 len = iov->iov_len - iov_ofs; 381 if (len > bytes) 382 len = bytes; 383 ntfs_fault_in_pages_readable(buf, len); 384 bytes -= len; 385 iov++; 386 iov_ofs = 0; 387 } while (bytes); 388 } 389 390 /** 391 * __ntfs_grab_cache_pages - obtain a number of locked pages 392 * @mapping: address space mapping from which to obtain page cache pages 393 * @index: starting index in @mapping at which to begin obtaining pages 394 * @nr_pages: number of page cache pages to obtain 395 * @pages: array of pages in which to return the obtained page cache pages 396 * @cached_page: allocated but as yet unused page 397 * @lru_pvec: lru-buffering pagevec of caller 398 * 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 400 * starting at index @index. 401 * 402 * If a page is newly created, add it to lru list 403 * 404 * Note, the page locks are obtained in ascending page index order. 405 */ 406 static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 407 pgoff_t index, const unsigned nr_pages, struct page **pages, 408 struct page **cached_page) 409 { 410 int err, nr; 411 412 BUG_ON(!nr_pages); 413 err = nr = 0; 414 do { 415 pages[nr] = find_lock_page(mapping, index); 416 if (!pages[nr]) { 417 if (!*cached_page) { 418 *cached_page = page_cache_alloc(mapping); 419 if (unlikely(!*cached_page)) { 420 err = -ENOMEM; 421 goto err_out; 422 } 423 } 424 err = add_to_page_cache_lru(*cached_page, mapping, index, 425 GFP_KERNEL); 426 if (unlikely(err)) { 427 if (err == -EEXIST) 428 continue; 429 goto err_out; 430 } 431 pages[nr] = *cached_page; 432 *cached_page = NULL; 433 } 434 index++; 435 nr++; 436 } while (nr < nr_pages); 437 out: 438 return err; 439 err_out: 440 while (nr > 0) { 441 unlock_page(pages[--nr]); 442 page_cache_release(pages[nr]); 443 } 444 goto out; 445 } 446 447 static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) 448 { 449 lock_buffer(bh); 450 get_bh(bh); 451 bh->b_end_io = end_buffer_read_sync; 452 return submit_bh(READ, bh); 453 } 454 455 /** 456 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data 457 * @pages: array of destination pages 458 * @nr_pages: number of pages in @pages 459 * @pos: byte position in file at which the write begins 460 * @bytes: number of bytes to be written 461 * 462 * This is called for non-resident attributes from ntfs_file_buffered_write() 463 * with i_mutex held on the inode (@pages[0]->mapping->host). There are 464 * @nr_pages pages in @pages which are locked but not kmap()ped. The source 465 * data has not yet been copied into the @pages. 466 * 467 * Need to fill any holes with actual clusters, allocate buffers if necessary, 468 * ensure all the buffers are mapped, and bring uptodate any buffers that are 469 * only partially being written to. 470 * 471 * If @nr_pages is greater than one, we are guaranteed that the cluster size is 472 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside 473 * the same cluster and that they are the entirety of that cluster, and that 474 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. 475 * 476 * i_size is not to be modified yet. 477 * 478 * Return 0 on success or -errno on error. 
479 */ 480 static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, 481 unsigned nr_pages, s64 pos, size_t bytes) 482 { 483 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; 484 LCN lcn; 485 s64 bh_pos, vcn_len, end, initialized_size; 486 sector_t lcn_block; 487 struct page *page; 488 struct inode *vi; 489 ntfs_inode *ni, *base_ni = NULL; 490 ntfs_volume *vol; 491 runlist_element *rl, *rl2; 492 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; 493 ntfs_attr_search_ctx *ctx = NULL; 494 MFT_RECORD *m = NULL; 495 ATTR_RECORD *a = NULL; 496 unsigned long flags; 497 u32 attr_rec_len = 0; 498 unsigned blocksize, u; 499 int err, mp_size; 500 bool rl_write_locked, was_hole, is_retry; 501 unsigned char blocksize_bits; 502 struct { 503 u8 runlist_merged:1; 504 u8 mft_attr_mapped:1; 505 u8 mp_rebuilt:1; 506 u8 attr_switched:1; 507 } status = { 0, 0, 0, 0 }; 508 509 BUG_ON(!nr_pages); 510 BUG_ON(!pages); 511 BUG_ON(!*pages); 512 vi = pages[0]->mapping->host; 513 ni = NTFS_I(vi); 514 vol = ni->vol; 515 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 516 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 517 vi->i_ino, ni->type, pages[0]->index, nr_pages, 518 (long long)pos, bytes); 519 blocksize = vol->sb->s_blocksize; 520 blocksize_bits = vol->sb->s_blocksize_bits; 521 u = 0; 522 do { 523 page = pages[u]; 524 BUG_ON(!page); 525 /* 526 * create_empty_buffers() will create uptodate/dirty buffers if 527 * the page is uptodate/dirty. 528 */ 529 if (!page_has_buffers(page)) { 530 create_empty_buffers(page, blocksize, 0); 531 if (unlikely(!page_has_buffers(page))) 532 return -ENOMEM; 533 } 534 } while (++u < nr_pages); 535 rl_write_locked = false; 536 rl = NULL; 537 err = 0; 538 vcn = lcn = -1; 539 vcn_len = 0; 540 lcn_block = -1; 541 was_hole = false; 542 cpos = pos >> vol->cluster_size_bits; 543 end = pos + bytes; 544 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; 545 /* 546 * Loop over each page and for each page over each buffer. Use goto to 547 * reduce indentation. 548 */ 549 u = 0; 550 do_next_page: 551 page = pages[u]; 552 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; 553 bh = head = page_buffers(page); 554 do { 555 VCN cdelta; 556 s64 bh_end; 557 unsigned bh_cofs; 558 559 /* Clear buffer_new on all buffers to reinitialise state. */ 560 if (buffer_new(bh)) 561 clear_buffer_new(bh); 562 bh_end = bh_pos + blocksize; 563 bh_cpos = bh_pos >> vol->cluster_size_bits; 564 bh_cofs = bh_pos & vol->cluster_size_mask; 565 if (buffer_mapped(bh)) { 566 /* 567 * The buffer is already mapped. If it is uptodate, 568 * ignore it. 569 */ 570 if (buffer_uptodate(bh)) 571 continue; 572 /* 573 * The buffer is not uptodate. If the page is uptodate 574 * set the buffer uptodate and otherwise ignore it. 575 */ 576 if (PageUptodate(page)) { 577 set_buffer_uptodate(bh); 578 continue; 579 } 580 /* 581 * Neither the page nor the buffer are uptodate. If 582 * the buffer is only partially being written to, we 583 * need to read it in before the write, i.e. now. 584 */ 585 if ((bh_pos < pos && bh_end > pos) || 586 (bh_pos < end && bh_end > end)) { 587 /* 588 * If the buffer is fully or partially within 589 * the initialized size, do an actual read. 590 * Otherwise, simply zero the buffer. 
591 */ 592 read_lock_irqsave(&ni->size_lock, flags); 593 initialized_size = ni->initialized_size; 594 read_unlock_irqrestore(&ni->size_lock, flags); 595 if (bh_pos < initialized_size) { 596 ntfs_submit_bh_for_read(bh); 597 *wait_bh++ = bh; 598 } else { 599 zero_user(page, bh_offset(bh), 600 blocksize); 601 set_buffer_uptodate(bh); 602 } 603 } 604 continue; 605 } 606 /* Unmapped buffer. Need to map it. */ 607 bh->b_bdev = vol->sb->s_bdev; 608 /* 609 * If the current buffer is in the same clusters as the map 610 * cache, there is no need to check the runlist again. The 611 * map cache is made up of @vcn, which is the first cached file 612 * cluster, @vcn_len which is the number of cached file 613 * clusters, @lcn is the device cluster corresponding to @vcn, 614 * and @lcn_block is the block number corresponding to @lcn. 615 */ 616 cdelta = bh_cpos - vcn; 617 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { 618 map_buffer_cached: 619 BUG_ON(lcn < 0); 620 bh->b_blocknr = lcn_block + 621 (cdelta << (vol->cluster_size_bits - 622 blocksize_bits)) + 623 (bh_cofs >> blocksize_bits); 624 set_buffer_mapped(bh); 625 /* 626 * If the page is uptodate so is the buffer. If the 627 * buffer is fully outside the write, we ignore it if 628 * it was already allocated and we mark it dirty so it 629 * gets written out if we allocated it. On the other 630 * hand, if we allocated the buffer but we are not 631 * marking it dirty we set buffer_new so we can do 632 * error recovery. 633 */ 634 if (PageUptodate(page)) { 635 if (!buffer_uptodate(bh)) 636 set_buffer_uptodate(bh); 637 if (unlikely(was_hole)) { 638 /* We allocated the buffer. */ 639 unmap_underlying_metadata(bh->b_bdev, 640 bh->b_blocknr); 641 if (bh_end <= pos || bh_pos >= end) 642 mark_buffer_dirty(bh); 643 else 644 set_buffer_new(bh); 645 } 646 continue; 647 } 648 /* Page is _not_ uptodate. */ 649 if (likely(!was_hole)) { 650 /* 651 * Buffer was already allocated. If it is not 652 * uptodate and is only partially being written 653 * to, we need to read it in before the write, 654 * i.e. now. 655 */ 656 if (!buffer_uptodate(bh) && bh_pos < end && 657 bh_end > pos && 658 (bh_pos < pos || 659 bh_end > end)) { 660 /* 661 * If the buffer is fully or partially 662 * within the initialized size, do an 663 * actual read. Otherwise, simply zero 664 * the buffer. 665 */ 666 read_lock_irqsave(&ni->size_lock, 667 flags); 668 initialized_size = ni->initialized_size; 669 read_unlock_irqrestore(&ni->size_lock, 670 flags); 671 if (bh_pos < initialized_size) { 672 ntfs_submit_bh_for_read(bh); 673 *wait_bh++ = bh; 674 } else { 675 zero_user(page, bh_offset(bh), 676 blocksize); 677 set_buffer_uptodate(bh); 678 } 679 } 680 continue; 681 } 682 /* We allocated the buffer. */ 683 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 684 /* 685 * If the buffer is fully outside the write, zero it, 686 * set it uptodate, and mark it dirty so it gets 687 * written out. If it is partially being written to, 688 * zero region surrounding the write but leave it to 689 * commit write to do anything else. Finally, if the 690 * buffer is fully being overwritten, do nothing. 
691 */ 692 if (bh_end <= pos || bh_pos >= end) { 693 if (!buffer_uptodate(bh)) { 694 zero_user(page, bh_offset(bh), 695 blocksize); 696 set_buffer_uptodate(bh); 697 } 698 mark_buffer_dirty(bh); 699 continue; 700 } 701 set_buffer_new(bh); 702 if (!buffer_uptodate(bh) && 703 (bh_pos < pos || bh_end > end)) { 704 u8 *kaddr; 705 unsigned pofs; 706 707 kaddr = kmap_atomic(page); 708 if (bh_pos < pos) { 709 pofs = bh_pos & ~PAGE_CACHE_MASK; 710 memset(kaddr + pofs, 0, pos - bh_pos); 711 } 712 if (bh_end > end) { 713 pofs = end & ~PAGE_CACHE_MASK; 714 memset(kaddr + pofs, 0, bh_end - end); 715 } 716 kunmap_atomic(kaddr); 717 flush_dcache_page(page); 718 } 719 continue; 720 } 721 /* 722 * Slow path: this is the first buffer in the cluster. If it 723 * is outside allocated size and is not uptodate, zero it and 724 * set it uptodate. 725 */ 726 read_lock_irqsave(&ni->size_lock, flags); 727 initialized_size = ni->allocated_size; 728 read_unlock_irqrestore(&ni->size_lock, flags); 729 if (bh_pos > initialized_size) { 730 if (PageUptodate(page)) { 731 if (!buffer_uptodate(bh)) 732 set_buffer_uptodate(bh); 733 } else if (!buffer_uptodate(bh)) { 734 zero_user(page, bh_offset(bh), blocksize); 735 set_buffer_uptodate(bh); 736 } 737 continue; 738 } 739 is_retry = false; 740 if (!rl) { 741 down_read(&ni->runlist.lock); 742 retry_remap: 743 rl = ni->runlist.rl; 744 } 745 if (likely(rl != NULL)) { 746 /* Seek to element containing target cluster. */ 747 while (rl->length && rl[1].vcn <= bh_cpos) 748 rl++; 749 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); 750 if (likely(lcn >= 0)) { 751 /* 752 * Successful remap, setup the map cache and 753 * use that to deal with the buffer. 754 */ 755 was_hole = false; 756 vcn = bh_cpos; 757 vcn_len = rl[1].vcn - vcn; 758 lcn_block = lcn << (vol->cluster_size_bits - 759 blocksize_bits); 760 cdelta = 0; 761 /* 762 * If the number of remaining clusters touched 763 * by the write is smaller or equal to the 764 * number of cached clusters, unlock the 765 * runlist as the map cache will be used from 766 * now on. 767 */ 768 if (likely(vcn + vcn_len >= cend)) { 769 if (rl_write_locked) { 770 up_write(&ni->runlist.lock); 771 rl_write_locked = false; 772 } else 773 up_read(&ni->runlist.lock); 774 rl = NULL; 775 } 776 goto map_buffer_cached; 777 } 778 } else 779 lcn = LCN_RL_NOT_MAPPED; 780 /* 781 * If it is not a hole and not out of bounds, the runlist is 782 * probably unmapped so try to map it now. 783 */ 784 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { 785 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { 786 /* Attempt to map runlist. */ 787 if (!rl_write_locked) { 788 /* 789 * We need the runlist locked for 790 * writing, so if it is locked for 791 * reading relock it now and retry in 792 * case it changed whilst we dropped 793 * the lock. 794 */ 795 up_read(&ni->runlist.lock); 796 down_write(&ni->runlist.lock); 797 rl_write_locked = true; 798 goto retry_remap; 799 } 800 err = ntfs_map_runlist_nolock(ni, bh_cpos, 801 NULL); 802 if (likely(!err)) { 803 is_retry = true; 804 goto retry_remap; 805 } 806 /* 807 * If @vcn is out of bounds, pretend @lcn is 808 * LCN_ENOENT. As long as the buffer is out 809 * of bounds this will work fine. 810 */ 811 if (err == -ENOENT) { 812 lcn = LCN_ENOENT; 813 err = 0; 814 goto rl_not_mapped_enoent; 815 } 816 } else 817 err = -EIO; 818 /* Failed to map the buffer, even after retrying. 
*/ 819 bh->b_blocknr = -1; 820 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " 821 "attribute type 0x%x, vcn 0x%llx, " 822 "vcn offset 0x%x, because its " 823 "location on disk could not be " 824 "determined%s (error code %i).", 825 ni->mft_no, ni->type, 826 (unsigned long long)bh_cpos, 827 (unsigned)bh_pos & 828 vol->cluster_size_mask, 829 is_retry ? " even after retrying" : "", 830 err); 831 break; 832 } 833 rl_not_mapped_enoent: 834 /* 835 * The buffer is in a hole or out of bounds. We need to fill 836 * the hole, unless the buffer is in a cluster which is not 837 * touched by the write, in which case we just leave the buffer 838 * unmapped. This can only happen when the cluster size is 839 * less than the page cache size. 840 */ 841 if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) { 842 bh_cend = (bh_end + vol->cluster_size - 1) >> 843 vol->cluster_size_bits; 844 if ((bh_cend <= cpos || bh_cpos >= cend)) { 845 bh->b_blocknr = -1; 846 /* 847 * If the buffer is uptodate we skip it. If it 848 * is not but the page is uptodate, we can set 849 * the buffer uptodate. If the page is not 850 * uptodate, we can clear the buffer and set it 851 * uptodate. Whether this is worthwhile is 852 * debatable and this could be removed. 853 */ 854 if (PageUptodate(page)) { 855 if (!buffer_uptodate(bh)) 856 set_buffer_uptodate(bh); 857 } else if (!buffer_uptodate(bh)) { 858 zero_user(page, bh_offset(bh), 859 blocksize); 860 set_buffer_uptodate(bh); 861 } 862 continue; 863 } 864 } 865 /* 866 * Out of bounds buffer is invalid if it was not really out of 867 * bounds. 868 */ 869 BUG_ON(lcn != LCN_HOLE); 870 /* 871 * We need the runlist locked for writing, so if it is locked 872 * for reading relock it now and retry in case it changed 873 * whilst we dropped the lock. 874 */ 875 BUG_ON(!rl); 876 if (!rl_write_locked) { 877 up_read(&ni->runlist.lock); 878 down_write(&ni->runlist.lock); 879 rl_write_locked = true; 880 goto retry_remap; 881 } 882 /* Find the previous last allocated cluster. */ 883 BUG_ON(rl->lcn != LCN_HOLE); 884 lcn = -1; 885 rl2 = rl; 886 while (--rl2 >= ni->runlist.rl) { 887 if (rl2->lcn >= 0) { 888 lcn = rl2->lcn + rl2->length; 889 break; 890 } 891 } 892 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, 893 false); 894 if (IS_ERR(rl2)) { 895 err = PTR_ERR(rl2); 896 ntfs_debug("Failed to allocate cluster, error code %i.", 897 err); 898 break; 899 } 900 lcn = rl2->lcn; 901 rl = ntfs_runlists_merge(ni->runlist.rl, rl2); 902 if (IS_ERR(rl)) { 903 err = PTR_ERR(rl); 904 if (err != -ENOMEM) 905 err = -EIO; 906 if (ntfs_cluster_free_from_rl(vol, rl2)) { 907 ntfs_error(vol->sb, "Failed to release " 908 "allocated cluster in error " 909 "code path. Run chkdsk to " 910 "recover the lost cluster."); 911 NVolSetErrors(vol); 912 } 913 ntfs_free(rl2); 914 break; 915 } 916 ni->runlist.rl = rl; 917 status.runlist_merged = 1; 918 ntfs_debug("Allocated cluster, lcn 0x%llx.", 919 (unsigned long long)lcn); 920 /* Map and lock the mft record and get the attribute record. 
*/ 921 if (!NInoAttr(ni)) 922 base_ni = ni; 923 else 924 base_ni = ni->ext.base_ntfs_ino; 925 m = map_mft_record(base_ni); 926 if (IS_ERR(m)) { 927 err = PTR_ERR(m); 928 break; 929 } 930 ctx = ntfs_attr_get_search_ctx(base_ni, m); 931 if (unlikely(!ctx)) { 932 err = -ENOMEM; 933 unmap_mft_record(base_ni); 934 break; 935 } 936 status.mft_attr_mapped = 1; 937 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 938 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); 939 if (unlikely(err)) { 940 if (err == -ENOENT) 941 err = -EIO; 942 break; 943 } 944 m = ctx->mrec; 945 a = ctx->attr; 946 /* 947 * Find the runlist element with which the attribute extent 948 * starts. Note, we cannot use the _attr_ version because we 949 * have mapped the mft record. That is ok because we know the 950 * runlist fragment must be mapped already to have ever gotten 951 * here, so we can just use the _rl_ version. 952 */ 953 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); 954 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); 955 BUG_ON(!rl2); 956 BUG_ON(!rl2->length); 957 BUG_ON(rl2->lcn < LCN_HOLE); 958 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); 959 /* 960 * If @highest_vcn is zero, calculate the real highest_vcn 961 * (which can really be zero). 962 */ 963 if (!highest_vcn) 964 highest_vcn = (sle64_to_cpu( 965 a->data.non_resident.allocated_size) >> 966 vol->cluster_size_bits) - 1; 967 /* 968 * Determine the size of the mapping pairs array for the new 969 * extent, i.e. the old extent with the hole filled. 970 */ 971 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, 972 highest_vcn); 973 if (unlikely(mp_size <= 0)) { 974 if (!(err = mp_size)) 975 err = -EIO; 976 ntfs_debug("Failed to get size for mapping pairs " 977 "array, error code %i.", err); 978 break; 979 } 980 /* 981 * Resize the attribute record to fit the new mapping pairs 982 * array. 983 */ 984 attr_rec_len = le32_to_cpu(a->length); 985 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( 986 a->data.non_resident.mapping_pairs_offset)); 987 if (unlikely(err)) { 988 BUG_ON(err != -ENOSPC); 989 // TODO: Deal with this by using the current attribute 990 // and fill it with as much of the mapping pairs 991 // array as possible. Then loop over each attribute 992 // extent rewriting the mapping pairs arrays as we go 993 // along and if when we reach the end we have not 994 // enough space, try to resize the last attribute 995 // extent and if even that fails, add a new attribute 996 // extent. 997 // We could also try to resize at each step in the hope 998 // that we will not need to rewrite every single extent. 999 // Note, we may need to decompress some extents to fill 1000 // the runlist as we are walking the extents... 1001 ntfs_error(vol->sb, "Not enough space in the mft " 1002 "record for the extended attribute " 1003 "record. This case is not " 1004 "implemented yet."); 1005 err = -EOPNOTSUPP; 1006 break ; 1007 } 1008 status.mp_rebuilt = 1; 1009 /* 1010 * Generate the mapping pairs array directly into the attribute 1011 * record. 1012 */ 1013 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1014 a->data.non_resident.mapping_pairs_offset), 1015 mp_size, rl2, vcn, highest_vcn, NULL); 1016 if (unlikely(err)) { 1017 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " 1018 "attribute type 0x%x, because building " 1019 "the mapping pairs failed with error " 1020 "code %i.", vi->i_ino, 1021 (unsigned)le32_to_cpu(ni->type), err); 1022 err = -EIO; 1023 break; 1024 } 1025 /* Update the highest_vcn but only if it was not set. 
*/ 1026 if (unlikely(!a->data.non_resident.highest_vcn)) 1027 a->data.non_resident.highest_vcn = 1028 cpu_to_sle64(highest_vcn); 1029 /* 1030 * If the attribute is sparse/compressed, update the compressed 1031 * size in the ntfs_inode structure and the attribute record. 1032 */ 1033 if (likely(NInoSparse(ni) || NInoCompressed(ni))) { 1034 /* 1035 * If we are not in the first attribute extent, switch 1036 * to it, but first ensure the changes will make it to 1037 * disk later. 1038 */ 1039 if (a->data.non_resident.lowest_vcn) { 1040 flush_dcache_mft_record_page(ctx->ntfs_ino); 1041 mark_mft_record_dirty(ctx->ntfs_ino); 1042 ntfs_attr_reinit_search_ctx(ctx); 1043 err = ntfs_attr_lookup(ni->type, ni->name, 1044 ni->name_len, CASE_SENSITIVE, 1045 0, NULL, 0, ctx); 1046 if (unlikely(err)) { 1047 status.attr_switched = 1; 1048 break; 1049 } 1050 /* @m is not used any more so do not set it. */ 1051 a = ctx->attr; 1052 } 1053 write_lock_irqsave(&ni->size_lock, flags); 1054 ni->itype.compressed.size += vol->cluster_size; 1055 a->data.non_resident.compressed_size = 1056 cpu_to_sle64(ni->itype.compressed.size); 1057 write_unlock_irqrestore(&ni->size_lock, flags); 1058 } 1059 /* Ensure the changes make it to disk. */ 1060 flush_dcache_mft_record_page(ctx->ntfs_ino); 1061 mark_mft_record_dirty(ctx->ntfs_ino); 1062 ntfs_attr_put_search_ctx(ctx); 1063 unmap_mft_record(base_ni); 1064 /* Successfully filled the hole. */ 1065 status.runlist_merged = 0; 1066 status.mft_attr_mapped = 0; 1067 status.mp_rebuilt = 0; 1068 /* Setup the map cache and use that to deal with the buffer. */ 1069 was_hole = true; 1070 vcn = bh_cpos; 1071 vcn_len = 1; 1072 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); 1073 cdelta = 0; 1074 /* 1075 * If the number of remaining clusters in the @pages is smaller 1076 * or equal to the number of cached clusters, unlock the 1077 * runlist as the map cache will be used from now on. 1078 */ 1079 if (likely(vcn + vcn_len >= cend)) { 1080 up_write(&ni->runlist.lock); 1081 rl_write_locked = false; 1082 rl = NULL; 1083 } 1084 goto map_buffer_cached; 1085 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1086 /* If there are no errors, do the next page. */ 1087 if (likely(!err && ++u < nr_pages)) 1088 goto do_next_page; 1089 /* If there are no errors, release the runlist lock if we took it. */ 1090 if (likely(!err)) { 1091 if (unlikely(rl_write_locked)) { 1092 up_write(&ni->runlist.lock); 1093 rl_write_locked = false; 1094 } else if (unlikely(rl)) 1095 up_read(&ni->runlist.lock); 1096 rl = NULL; 1097 } 1098 /* If we issued read requests, let them complete. */ 1099 read_lock_irqsave(&ni->size_lock, flags); 1100 initialized_size = ni->initialized_size; 1101 read_unlock_irqrestore(&ni->size_lock, flags); 1102 while (wait_bh > wait) { 1103 bh = *--wait_bh; 1104 wait_on_buffer(bh); 1105 if (likely(buffer_uptodate(bh))) { 1106 page = bh->b_page; 1107 bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) + 1108 bh_offset(bh); 1109 /* 1110 * If the buffer overflows the initialized size, need 1111 * to zero the overflowing region. 1112 */ 1113 if (unlikely(bh_pos + blocksize > initialized_size)) { 1114 int ofs = 0; 1115 1116 if (likely(bh_pos < initialized_size)) 1117 ofs = initialized_size - bh_pos; 1118 zero_user_segment(page, bh_offset(bh) + ofs, 1119 blocksize); 1120 } 1121 } else /* if (unlikely(!buffer_uptodate(bh))) */ 1122 err = -EIO; 1123 } 1124 if (likely(!err)) { 1125 /* Clear buffer_new on all buffers. 
*/ 1126 u = 0; 1127 do { 1128 bh = head = page_buffers(pages[u]); 1129 do { 1130 if (buffer_new(bh)) 1131 clear_buffer_new(bh); 1132 } while ((bh = bh->b_this_page) != head); 1133 } while (++u < nr_pages); 1134 ntfs_debug("Done."); 1135 return err; 1136 } 1137 if (status.attr_switched) { 1138 /* Get back to the attribute extent we modified. */ 1139 ntfs_attr_reinit_search_ctx(ctx); 1140 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1141 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { 1142 ntfs_error(vol->sb, "Failed to find required " 1143 "attribute extent of attribute in " 1144 "error code path. Run chkdsk to " 1145 "recover."); 1146 write_lock_irqsave(&ni->size_lock, flags); 1147 ni->itype.compressed.size += vol->cluster_size; 1148 write_unlock_irqrestore(&ni->size_lock, flags); 1149 flush_dcache_mft_record_page(ctx->ntfs_ino); 1150 mark_mft_record_dirty(ctx->ntfs_ino); 1151 /* 1152 * The only thing that is now wrong is the compressed 1153 * size of the base attribute extent which chkdsk 1154 * should be able to fix. 1155 */ 1156 NVolSetErrors(vol); 1157 } else { 1158 m = ctx->mrec; 1159 a = ctx->attr; 1160 status.attr_switched = 0; 1161 } 1162 } 1163 /* 1164 * If the runlist has been modified, need to restore it by punching a 1165 * hole into it and we then need to deallocate the on-disk cluster as 1166 * well. Note, we only modify the runlist if we are able to generate a 1167 * new mapping pairs array, i.e. only when the mapped attribute extent 1168 * is not switched. 1169 */ 1170 if (status.runlist_merged && !status.attr_switched) { 1171 BUG_ON(!rl_write_locked); 1172 /* Make the file cluster we allocated sparse in the runlist. */ 1173 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { 1174 ntfs_error(vol->sb, "Failed to punch hole into " 1175 "attribute runlist in error code " 1176 "path. Run chkdsk to recover the " 1177 "lost cluster."); 1178 NVolSetErrors(vol); 1179 } else /* if (success) */ { 1180 status.runlist_merged = 0; 1181 /* 1182 * Deallocate the on-disk cluster we allocated but only 1183 * if we succeeded in punching its vcn out of the 1184 * runlist. 1185 */ 1186 down_write(&vol->lcnbmp_lock); 1187 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { 1188 ntfs_error(vol->sb, "Failed to release " 1189 "allocated cluster in error " 1190 "code path. Run chkdsk to " 1191 "recover the lost cluster."); 1192 NVolSetErrors(vol); 1193 } 1194 up_write(&vol->lcnbmp_lock); 1195 } 1196 } 1197 /* 1198 * Resize the attribute record to its old size and rebuild the mapping 1199 * pairs array. Note, we only can do this if the runlist has been 1200 * restored to its old state which also implies that the mapped 1201 * attribute extent is not switched. 1202 */ 1203 if (status.mp_rebuilt && !status.runlist_merged) { 1204 if (ntfs_attr_record_resize(m, a, attr_rec_len)) { 1205 ntfs_error(vol->sb, "Failed to restore attribute " 1206 "record in error code path. Run " 1207 "chkdsk to recover."); 1208 NVolSetErrors(vol); 1209 } else /* if (success) */ { 1210 if (ntfs_mapping_pairs_build(vol, (u8*)a + 1211 le16_to_cpu(a->data.non_resident. 1212 mapping_pairs_offset), attr_rec_len - 1213 le16_to_cpu(a->data.non_resident. 1214 mapping_pairs_offset), ni->runlist.rl, 1215 vcn, highest_vcn, NULL)) { 1216 ntfs_error(vol->sb, "Failed to restore " 1217 "mapping pairs array in error " 1218 "code path. 
Run chkdsk to " 1219 "recover."); 1220 NVolSetErrors(vol); 1221 } 1222 flush_dcache_mft_record_page(ctx->ntfs_ino); 1223 mark_mft_record_dirty(ctx->ntfs_ino); 1224 } 1225 } 1226 /* Release the mft record and the attribute. */ 1227 if (status.mft_attr_mapped) { 1228 ntfs_attr_put_search_ctx(ctx); 1229 unmap_mft_record(base_ni); 1230 } 1231 /* Release the runlist lock. */ 1232 if (rl_write_locked) 1233 up_write(&ni->runlist.lock); 1234 else if (rl) 1235 up_read(&ni->runlist.lock); 1236 /* 1237 * Zero out any newly allocated blocks to avoid exposing stale data. 1238 * If BH_New is set, we know that the block was newly allocated above 1239 * and that it has not been fully zeroed and marked dirty yet. 1240 */ 1241 nr_pages = u; 1242 u = 0; 1243 end = bh_cpos << vol->cluster_size_bits; 1244 do { 1245 page = pages[u]; 1246 bh = head = page_buffers(page); 1247 do { 1248 if (u == nr_pages && 1249 ((s64)page->index << PAGE_CACHE_SHIFT) + 1250 bh_offset(bh) >= end) 1251 break; 1252 if (!buffer_new(bh)) 1253 continue; 1254 clear_buffer_new(bh); 1255 if (!buffer_uptodate(bh)) { 1256 if (PageUptodate(page)) 1257 set_buffer_uptodate(bh); 1258 else { 1259 zero_user(page, bh_offset(bh), 1260 blocksize); 1261 set_buffer_uptodate(bh); 1262 } 1263 } 1264 mark_buffer_dirty(bh); 1265 } while ((bh = bh->b_this_page) != head); 1266 } while (++u <= nr_pages); 1267 ntfs_error(vol->sb, "Failed. Returning error code %i.", err); 1268 return err; 1269 } 1270 1271 /* 1272 * Copy as much as we can into the pages and return the number of bytes which 1273 * were successfully copied. If a fault is encountered then clear the pages 1274 * out to (ofs + bytes) and return the number of bytes which were copied. 1275 */ 1276 static inline size_t ntfs_copy_from_user(struct page **pages, 1277 unsigned nr_pages, unsigned ofs, const char __user *buf, 1278 size_t bytes) 1279 { 1280 struct page **last_page = pages + nr_pages; 1281 char *addr; 1282 size_t total = 0; 1283 unsigned len; 1284 int left; 1285 1286 do { 1287 len = PAGE_CACHE_SIZE - ofs; 1288 if (len > bytes) 1289 len = bytes; 1290 addr = kmap_atomic(*pages); 1291 left = __copy_from_user_inatomic(addr + ofs, buf, len); 1292 kunmap_atomic(addr); 1293 if (unlikely(left)) { 1294 /* Do it the slow way. */ 1295 addr = kmap(*pages); 1296 left = __copy_from_user(addr + ofs, buf, len); 1297 kunmap(*pages); 1298 if (unlikely(left)) 1299 goto err_out; 1300 } 1301 total += len; 1302 bytes -= len; 1303 if (!bytes) 1304 break; 1305 buf += len; 1306 ofs = 0; 1307 } while (++pages < last_page); 1308 out: 1309 return total; 1310 err_out: 1311 total += len - left; 1312 /* Zero the rest of the target like __copy_from_user(). 
*/ 1313 while (++pages < last_page) { 1314 bytes -= len; 1315 if (!bytes) 1316 break; 1317 len = PAGE_CACHE_SIZE; 1318 if (len > bytes) 1319 len = bytes; 1320 zero_user(*pages, 0, len); 1321 } 1322 goto out; 1323 } 1324 1325 static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr, 1326 const struct iovec *iov, size_t iov_ofs, size_t bytes) 1327 { 1328 size_t total = 0; 1329 1330 while (1) { 1331 const char __user *buf = iov->iov_base + iov_ofs; 1332 unsigned len; 1333 size_t left; 1334 1335 len = iov->iov_len - iov_ofs; 1336 if (len > bytes) 1337 len = bytes; 1338 left = __copy_from_user_inatomic(vaddr, buf, len); 1339 total += len; 1340 bytes -= len; 1341 vaddr += len; 1342 if (unlikely(left)) { 1343 total -= left; 1344 break; 1345 } 1346 if (!bytes) 1347 break; 1348 iov++; 1349 iov_ofs = 0; 1350 } 1351 return total; 1352 } 1353 1354 static inline void ntfs_set_next_iovec(const struct iovec **iovp, 1355 size_t *iov_ofsp, size_t bytes) 1356 { 1357 const struct iovec *iov = *iovp; 1358 size_t iov_ofs = *iov_ofsp; 1359 1360 while (bytes) { 1361 unsigned len; 1362 1363 len = iov->iov_len - iov_ofs; 1364 if (len > bytes) 1365 len = bytes; 1366 bytes -= len; 1367 iov_ofs += len; 1368 if (iov->iov_len == iov_ofs) { 1369 iov++; 1370 iov_ofs = 0; 1371 } 1372 } 1373 *iovp = iov; 1374 *iov_ofsp = iov_ofs; 1375 } 1376 1377 /* 1378 * This has the same side-effects and return value as ntfs_copy_from_user(). 1379 * The difference is that on a fault we need to memset the remainder of the 1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1381 * single-segment behaviour. 1382 * 1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when 1384 * atomic and when not atomic. This is ok because it calls 1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In 1386 * fact, the only difference between __copy_from_user_inatomic() and 1387 * __copy_from_user() is that the latter calls might_sleep() and the former 1388 * should not zero the tail of the buffer on error. And on many architectures 1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it 1390 * makes no difference at all on those architectures. 1391 */ 1392 static inline size_t ntfs_copy_from_user_iovec(struct page **pages, 1393 unsigned nr_pages, unsigned ofs, const struct iovec **iov, 1394 size_t *iov_ofs, size_t bytes) 1395 { 1396 struct page **last_page = pages + nr_pages; 1397 char *addr; 1398 size_t copied, len, total = 0; 1399 1400 do { 1401 len = PAGE_CACHE_SIZE - ofs; 1402 if (len > bytes) 1403 len = bytes; 1404 addr = kmap_atomic(*pages); 1405 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, 1406 *iov, *iov_ofs, len); 1407 kunmap_atomic(addr); 1408 if (unlikely(copied != len)) { 1409 /* Do it the slow way. */ 1410 addr = kmap(*pages); 1411 copied = __ntfs_copy_from_user_iovec_inatomic(addr + 1412 ofs, *iov, *iov_ofs, len); 1413 if (unlikely(copied != len)) 1414 goto err_out; 1415 kunmap(*pages); 1416 } 1417 total += len; 1418 ntfs_set_next_iovec(iov, iov_ofs, len); 1419 bytes -= len; 1420 if (!bytes) 1421 break; 1422 ofs = 0; 1423 } while (++pages < last_page); 1424 out: 1425 return total; 1426 err_out: 1427 BUG_ON(copied > len); 1428 /* Zero the rest of the target like __copy_from_user(). 
*/ 1429 memset(addr + ofs + copied, 0, len - copied); 1430 kunmap(*pages); 1431 total += copied; 1432 ntfs_set_next_iovec(iov, iov_ofs, copied); 1433 while (++pages < last_page) { 1434 bytes -= len; 1435 if (!bytes) 1436 break; 1437 len = PAGE_CACHE_SIZE; 1438 if (len > bytes) 1439 len = bytes; 1440 zero_user(*pages, 0, len); 1441 } 1442 goto out; 1443 } 1444 1445 static inline void ntfs_flush_dcache_pages(struct page **pages, 1446 unsigned nr_pages) 1447 { 1448 BUG_ON(!nr_pages); 1449 /* 1450 * Warning: Do not do the decrement at the same time as the call to 1451 * flush_dcache_page() because it is a NULL macro on i386 and hence the 1452 * decrement never happens so the loop never terminates. 1453 */ 1454 do { 1455 --nr_pages; 1456 flush_dcache_page(pages[nr_pages]); 1457 } while (nr_pages > 0); 1458 } 1459 1460 /** 1461 * ntfs_commit_pages_after_non_resident_write - commit the received data 1462 * @pages: array of destination pages 1463 * @nr_pages: number of pages in @pages 1464 * @pos: byte position in file at which the write begins 1465 * @bytes: number of bytes to be written 1466 * 1467 * See description of ntfs_commit_pages_after_write(), below. 1468 */ 1469 static inline int ntfs_commit_pages_after_non_resident_write( 1470 struct page **pages, const unsigned nr_pages, 1471 s64 pos, size_t bytes) 1472 { 1473 s64 end, initialized_size; 1474 struct inode *vi; 1475 ntfs_inode *ni, *base_ni; 1476 struct buffer_head *bh, *head; 1477 ntfs_attr_search_ctx *ctx; 1478 MFT_RECORD *m; 1479 ATTR_RECORD *a; 1480 unsigned long flags; 1481 unsigned blocksize, u; 1482 int err; 1483 1484 vi = pages[0]->mapping->host; 1485 ni = NTFS_I(vi); 1486 blocksize = vi->i_sb->s_blocksize; 1487 end = pos + bytes; 1488 u = 0; 1489 do { 1490 s64 bh_pos; 1491 struct page *page; 1492 bool partial; 1493 1494 page = pages[u]; 1495 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; 1496 bh = head = page_buffers(page); 1497 partial = false; 1498 do { 1499 s64 bh_end; 1500 1501 bh_end = bh_pos + blocksize; 1502 if (bh_end <= pos || bh_pos >= end) { 1503 if (!buffer_uptodate(bh)) 1504 partial = true; 1505 } else { 1506 set_buffer_uptodate(bh); 1507 mark_buffer_dirty(bh); 1508 } 1509 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1510 /* 1511 * If all buffers are now uptodate but the page is not, set the 1512 * page uptodate. 1513 */ 1514 if (!partial && !PageUptodate(page)) 1515 SetPageUptodate(page); 1516 } while (++u < nr_pages); 1517 /* 1518 * Finally, if we do not need to update initialized_size or i_size we 1519 * are finished. 1520 */ 1521 read_lock_irqsave(&ni->size_lock, flags); 1522 initialized_size = ni->initialized_size; 1523 read_unlock_irqrestore(&ni->size_lock, flags); 1524 if (end <= initialized_size) { 1525 ntfs_debug("Done."); 1526 return 0; 1527 } 1528 /* 1529 * Update initialized_size/i_size as appropriate, both in the inode and 1530 * the mft record. 1531 */ 1532 if (!NInoAttr(ni)) 1533 base_ni = ni; 1534 else 1535 base_ni = ni->ext.base_ntfs_ino; 1536 /* Map, pin, and lock the mft record. 
*/ 1537 m = map_mft_record(base_ni); 1538 if (IS_ERR(m)) { 1539 err = PTR_ERR(m); 1540 m = NULL; 1541 ctx = NULL; 1542 goto err_out; 1543 } 1544 BUG_ON(!NInoNonResident(ni)); 1545 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1546 if (unlikely(!ctx)) { 1547 err = -ENOMEM; 1548 goto err_out; 1549 } 1550 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1551 CASE_SENSITIVE, 0, NULL, 0, ctx); 1552 if (unlikely(err)) { 1553 if (err == -ENOENT) 1554 err = -EIO; 1555 goto err_out; 1556 } 1557 a = ctx->attr; 1558 BUG_ON(!a->non_resident); 1559 write_lock_irqsave(&ni->size_lock, flags); 1560 BUG_ON(end > ni->allocated_size); 1561 ni->initialized_size = end; 1562 a->data.non_resident.initialized_size = cpu_to_sle64(end); 1563 if (end > i_size_read(vi)) { 1564 i_size_write(vi, end); 1565 a->data.non_resident.data_size = 1566 a->data.non_resident.initialized_size; 1567 } 1568 write_unlock_irqrestore(&ni->size_lock, flags); 1569 /* Mark the mft record dirty, so it gets written back. */ 1570 flush_dcache_mft_record_page(ctx->ntfs_ino); 1571 mark_mft_record_dirty(ctx->ntfs_ino); 1572 ntfs_attr_put_search_ctx(ctx); 1573 unmap_mft_record(base_ni); 1574 ntfs_debug("Done."); 1575 return 0; 1576 err_out: 1577 if (ctx) 1578 ntfs_attr_put_search_ctx(ctx); 1579 if (m) 1580 unmap_mft_record(base_ni); 1581 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " 1582 "code %i).", err); 1583 if (err != -ENOMEM) 1584 NVolSetErrors(ni->vol); 1585 return err; 1586 } 1587 1588 /** 1589 * ntfs_commit_pages_after_write - commit the received data 1590 * @pages: array of destination pages 1591 * @nr_pages: number of pages in @pages 1592 * @pos: byte position in file at which the write begins 1593 * @bytes: number of bytes to be written 1594 * 1595 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode 1596 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are 1597 * locked but not kmap()ped. The source data has already been copied into the 1598 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before 1599 * the data was copied (for non-resident attributes only) and it returned 1600 * success. 1601 * 1602 * Need to set uptodate and mark dirty all buffers within the boundary of the 1603 * write. If all buffers in a page are uptodate we set the page uptodate, too. 1604 * 1605 * Setting the buffers dirty ensures that they get written out later when 1606 * ntfs_writepage() is invoked by the VM. 1607 * 1608 * Finally, we need to update i_size and initialized_size as appropriate both 1609 * in the inode and the mft record. 1610 * 1611 * This is modelled after fs/buffer.c::generic_commit_write(), which marks 1612 * buffers uptodate and dirty, sets the page uptodate if all buffers in the 1613 * page are uptodate, and updates i_size if the end of io is beyond i_size. In 1614 * that case, it also marks the inode dirty. 1615 * 1616 * If things have gone as outlined in 1617 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page 1618 * content modifications here for non-resident attributes. For resident 1619 * attributes we need to do the uptodate bringing here which we combine with 1620 * the copying into the mft record which means we save one atomic kmap. 1621 * 1622 * Return 0 on success or -errno on error. 
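 *
 * For a resident attribute, for example, a small write is committed by
 * copying the received bytes from the (single) page straight into the
 * attribute value in the mft record and marking the mft record dirty; the
 * page itself is only brought uptodate, no buffers are involved.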
1623 */ 1624 static int ntfs_commit_pages_after_write(struct page **pages, 1625 const unsigned nr_pages, s64 pos, size_t bytes) 1626 { 1627 s64 end, initialized_size; 1628 loff_t i_size; 1629 struct inode *vi; 1630 ntfs_inode *ni, *base_ni; 1631 struct page *page; 1632 ntfs_attr_search_ctx *ctx; 1633 MFT_RECORD *m; 1634 ATTR_RECORD *a; 1635 char *kattr, *kaddr; 1636 unsigned long flags; 1637 u32 attr_len; 1638 int err; 1639 1640 BUG_ON(!nr_pages); 1641 BUG_ON(!pages); 1642 page = pages[0]; 1643 BUG_ON(!page); 1644 vi = page->mapping->host; 1645 ni = NTFS_I(vi); 1646 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 1647 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 1648 vi->i_ino, ni->type, page->index, nr_pages, 1649 (long long)pos, bytes); 1650 if (NInoNonResident(ni)) 1651 return ntfs_commit_pages_after_non_resident_write(pages, 1652 nr_pages, pos, bytes); 1653 BUG_ON(nr_pages > 1); 1654 /* 1655 * Attribute is resident, implying it is not compressed, encrypted, or 1656 * sparse. 1657 */ 1658 if (!NInoAttr(ni)) 1659 base_ni = ni; 1660 else 1661 base_ni = ni->ext.base_ntfs_ino; 1662 BUG_ON(NInoNonResident(ni)); 1663 /* Map, pin, and lock the mft record. */ 1664 m = map_mft_record(base_ni); 1665 if (IS_ERR(m)) { 1666 err = PTR_ERR(m); 1667 m = NULL; 1668 ctx = NULL; 1669 goto err_out; 1670 } 1671 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1672 if (unlikely(!ctx)) { 1673 err = -ENOMEM; 1674 goto err_out; 1675 } 1676 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1677 CASE_SENSITIVE, 0, NULL, 0, ctx); 1678 if (unlikely(err)) { 1679 if (err == -ENOENT) 1680 err = -EIO; 1681 goto err_out; 1682 } 1683 a = ctx->attr; 1684 BUG_ON(a->non_resident); 1685 /* The total length of the attribute value. */ 1686 attr_len = le32_to_cpu(a->data.resident.value_length); 1687 i_size = i_size_read(vi); 1688 BUG_ON(attr_len != i_size); 1689 BUG_ON(pos > attr_len); 1690 end = pos + bytes; 1691 BUG_ON(end > le32_to_cpu(a->length) - 1692 le16_to_cpu(a->data.resident.value_offset)); 1693 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 1694 kaddr = kmap_atomic(page); 1695 /* Copy the received data from the page to the mft record. */ 1696 memcpy(kattr + pos, kaddr + pos, bytes); 1697 /* Update the attribute length if necessary. */ 1698 if (end > attr_len) { 1699 attr_len = end; 1700 a->data.resident.value_length = cpu_to_le32(attr_len); 1701 } 1702 /* 1703 * If the page is not uptodate, bring the out of bounds area(s) 1704 * uptodate by copying data from the mft record to the page. 1705 */ 1706 if (!PageUptodate(page)) { 1707 if (pos > 0) 1708 memcpy(kaddr, kattr, pos); 1709 if (end < attr_len) 1710 memcpy(kaddr + end, kattr + end, attr_len - end); 1711 /* Zero the region outside the end of the attribute value. */ 1712 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); 1713 flush_dcache_page(page); 1714 SetPageUptodate(page); 1715 } 1716 kunmap_atomic(kaddr); 1717 /* Update initialized_size/i_size if necessary. */ 1718 read_lock_irqsave(&ni->size_lock, flags); 1719 initialized_size = ni->initialized_size; 1720 BUG_ON(end > ni->allocated_size); 1721 read_unlock_irqrestore(&ni->size_lock, flags); 1722 BUG_ON(initialized_size != i_size); 1723 if (end > initialized_size) { 1724 write_lock_irqsave(&ni->size_lock, flags); 1725 ni->initialized_size = end; 1726 i_size_write(vi, end); 1727 write_unlock_irqrestore(&ni->size_lock, flags); 1728 } 1729 /* Mark the mft record dirty, so it gets written back. 
*/ 1730 flush_dcache_mft_record_page(ctx->ntfs_ino); 1731 mark_mft_record_dirty(ctx->ntfs_ino); 1732 ntfs_attr_put_search_ctx(ctx); 1733 unmap_mft_record(base_ni); 1734 ntfs_debug("Done."); 1735 return 0; 1736 err_out: 1737 if (err == -ENOMEM) { 1738 ntfs_warning(vi->i_sb, "Error allocating memory required to " 1739 "commit the write."); 1740 if (PageUptodate(page)) { 1741 ntfs_warning(vi->i_sb, "Page is uptodate, setting " 1742 "dirty so the write will be retried " 1743 "later on by the VM."); 1744 /* 1745 * Put the page on mapping->dirty_pages, but leave its 1746 * buffers' dirty state as-is. 1747 */ 1748 __set_page_dirty_nobuffers(page); 1749 err = 0; 1750 } else 1751 ntfs_error(vi->i_sb, "Page is not uptodate. Written " 1752 "data has been lost."); 1753 } else { 1754 ntfs_error(vi->i_sb, "Resident attribute commit write failed " 1755 "with error %i.", err); 1756 NVolSetErrors(ni->vol); 1757 } 1758 if (ctx) 1759 ntfs_attr_put_search_ctx(ctx); 1760 if (m) 1761 unmap_mft_record(base_ni); 1762 return err; 1763 } 1764 1765 /** 1766 * ntfs_file_buffered_write - 1767 * 1768 * Locking: The vfs is holding ->i_mutex on the inode. 1769 */ 1770 static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, 1771 const struct iovec *iov, unsigned long nr_segs, 1772 loff_t pos, loff_t *ppos, size_t count) 1773 { 1774 struct file *file = iocb->ki_filp; 1775 struct address_space *mapping = file->f_mapping; 1776 struct inode *vi = mapping->host; 1777 ntfs_inode *ni = NTFS_I(vi); 1778 ntfs_volume *vol = ni->vol; 1779 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; 1780 struct page *cached_page = NULL; 1781 char __user *buf = NULL; 1782 s64 end, ll; 1783 VCN last_vcn; 1784 LCN lcn; 1785 unsigned long flags; 1786 size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */ 1787 ssize_t status, written; 1788 unsigned nr_pages; 1789 int err; 1790 1791 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 "pos 0x%llx, count 0x%lx.", 1793 vi->i_ino, (unsigned)le32_to_cpu(ni->type), 1794 (unsigned long long)pos, (unsigned long)count); 1795 if (unlikely(!count)) 1796 return 0; 1797 BUG_ON(NInoMstProtected(ni)); 1798 /* 1799 * If the attribute is not an index root and it is encrypted or 1800 * compressed, we cannot write to it yet. Note we need to check for 1801 * AT_INDEX_ALLOCATION since this is the type of both directory and 1802 * index inodes. 1803 */ 1804 if (ni->type != AT_INDEX_ALLOCATION) { 1805 /* If file is encrypted, deny access, just like NT4. */ 1806 if (NInoEncrypted(ni)) { 1807 /* 1808 * Reminder for later: Encrypted files are _always_ 1809 * non-resident so that the content can always be 1810 * encrypted. 1811 */ 1812 ntfs_debug("Denying write access to encrypted file."); 1813 return -EACCES; 1814 } 1815 if (NInoCompressed(ni)) { 1816 /* Only unnamed $DATA attribute can be compressed. */ 1817 BUG_ON(ni->type != AT_DATA); 1818 BUG_ON(ni->name_len); 1819 /* 1820 * Reminder for later: If resident, the data is not 1821 * actually compressed. Only on the switch to non- 1822 * resident does compression kick in. This is in 1823 * contrast to encrypted files (see above). 1824 */ 1825 ntfs_error(vi->i_sb, "Writing to compressed files is " 1826 "not implemented yet. Sorry."); 1827 return -EOPNOTSUPP; 1828 } 1829 } 1830 /* 1831 * If a previous ntfs_truncate() failed, repeat it and abort if it 1832 * fails again. 
	/*
	 * If a previous ntfs_truncate() failed, repeat it and abort if it
	 * fails again.
	 */
	if (unlikely(NInoTruncateFailed(ni))) {
		inode_dio_wait(vi);
		err = ntfs_truncate(vi);
		if (err || NInoTruncateFailed(ni)) {
			if (!err)
				err = -EIO;
			ntfs_error(vol->sb, "Cannot perform write to inode "
					"0x%lx, attribute type 0x%x, because "
					"ntfs_truncate() failed (error code "
					"%i).", vi->i_ino,
					(unsigned)le32_to_cpu(ni->type), err);
			return err;
		}
	}
	/* The first byte after the write. */
	end = pos + count;
	/*
	 * If the write goes beyond the allocated size, extend the allocation
	 * to cover the whole of the write, rounded up to the nearest cluster.
	 */
	read_lock_irqsave(&ni->size_lock, flags);
	ll = ni->allocated_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (end > ll) {
		/* Extend the allocation without changing the data size. */
		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
		if (likely(ll >= 0)) {
			BUG_ON(pos >= ll);
			/* If the extension was partial, truncate the write. */
			if (end > ll) {
				ntfs_debug("Truncating write to inode 0x%lx, "
						"attribute type 0x%x, because "
						"the allocation was only "
						"partially extended.",
						vi->i_ino, (unsigned)
						le32_to_cpu(ni->type));
				end = ll;
				count = ll - pos;
			}
		} else {
			err = ll;
			read_lock_irqsave(&ni->size_lock, flags);
			ll = ni->allocated_size;
			read_unlock_irqrestore(&ni->size_lock, flags);
			/* Perform a partial write if possible or fail. */
			if (pos < ll) {
				ntfs_debug("Truncating write to inode 0x%lx, "
						"attribute type 0x%x, because "
						"extending the allocation "
						"failed (error code %i).",
						vi->i_ino, (unsigned)
						le32_to_cpu(ni->type), err);
				end = ll;
				count = ll - pos;
			} else {
				ntfs_error(vol->sb, "Cannot perform write to "
						"inode 0x%lx, attribute type "
						"0x%x, because extending the "
						"allocation failed (error "
						"code %i).", vi->i_ino,
						(unsigned)
						le32_to_cpu(ni->type), err);
				return err;
			}
		}
	}
	written = 0;
	/*
	 * If the write starts beyond the initialized size, extend it up to the
	 * beginning of the write and initialize all non-sparse space between
	 * the old initialized size and the new one. This automatically also
	 * increments the vfs inode->i_size to keep it above or equal to the
	 * initialized_size.
	 */
	read_lock_irqsave(&ni->size_lock, flags);
	ll = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (pos > ll) {
		err = ntfs_attr_extend_initialized(ni, pos);
		if (err < 0) {
			ntfs_error(vol->sb, "Cannot perform write to inode "
					"0x%lx, attribute type 0x%x, because "
					"extending the initialized size "
					"failed (error code %i).", vi->i_ino,
					(unsigned)le32_to_cpu(ni->type), err);
			status = err;
			goto err_out;
		}
	}
	/*
	 * Determine the number of pages per cluster for non-resident
	 * attributes.
	 */
	nr_pages = 1;
	if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
		nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
	/* Finally, perform the actual write. */
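	/*
	 * Each iteration of the loop below grabs and locks the page cache
	 * pages covering the current chunk (a single page, or the whole
	 * cluster when writing into a hole of an attribute whose cluster size
	 * exceeds PAGE_CACHE_SIZE), copies the user data into them, and
	 * commits the chunk before moving on to the next one.
	 */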
	last_vcn = -1;
	if (likely(nr_segs == 1))
		buf = iov->iov_base;
	do {
		VCN vcn;
		pgoff_t idx, start_idx;
		unsigned ofs, do_pages, u;
		size_t copied;

		start_idx = idx = pos >> PAGE_CACHE_SHIFT;
		ofs = pos & ~PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - ofs;
		do_pages = 1;
		if (nr_pages > 1) {
			vcn = pos >> vol->cluster_size_bits;
			if (vcn != last_vcn) {
				last_vcn = vcn;
				/*
				 * Get the lcn of the vcn the write is in. If
				 * it is a hole, need to lock down all pages in
				 * the cluster.
				 */
				down_read(&ni->runlist.lock);
				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
						vol->cluster_size_bits, false);
				up_read(&ni->runlist.lock);
				if (unlikely(lcn < LCN_HOLE)) {
					status = -EIO;
					if (lcn == LCN_ENOMEM)
						status = -ENOMEM;
					else
						ntfs_error(vol->sb, "Cannot "
							"perform write to "
							"inode 0x%lx, "
							"attribute type 0x%x, "
							"because the attribute "
							"is corrupt.",
							vi->i_ino, (unsigned)
							le32_to_cpu(ni->type));
					break;
				}
				if (lcn == LCN_HOLE) {
					start_idx = (pos & ~(s64)
							vol->cluster_size_mask)
							>> PAGE_CACHE_SHIFT;
					bytes = vol->cluster_size - (pos &
							vol->cluster_size_mask);
					do_pages = nr_pages;
				}
			}
		}
		if (bytes > count)
			bytes = count;
		/*
		 * Bring in the user page(s) that we will copy from _first_.
		 * Otherwise there is a nasty deadlock on copying from the same
		 * page(s) as we are writing to, without it/them being marked
		 * up-to-date. Note, at present there is nothing to stop the
		 * pages being swapped out between us bringing them into memory
		 * and doing the actual copying.
		 */
		if (likely(nr_segs == 1))
			ntfs_fault_in_pages_readable(buf, bytes);
		else
			ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
		/* Get and lock @do_pages starting at index @start_idx. */
		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
				pages, &cached_page);
		if (unlikely(status))
			break;
		/*
		 * For non-resident attributes, we need to fill any holes with
		 * actual clusters and ensure all buffers are mapped. We also
		 * need to bring uptodate any buffers that are only partially
		 * being written to.
		 */
		if (NInoNonResident(ni)) {
			status = ntfs_prepare_pages_for_non_resident_write(
					pages, do_pages, pos, bytes);
			if (unlikely(status)) {
				loff_t i_size;

				do {
					unlock_page(pages[--do_pages]);
					page_cache_release(pages[do_pages]);
				} while (do_pages);
				/*
				 * The write preparation may have instantiated
				 * allocated space outside i_size. Trim this
				 * off again. We can ignore any errors in this
				 * case as we will just be wasting a bit of
				 * allocated space, which is not a disaster.
				 */
				i_size = i_size_read(vi);
				if (pos + bytes > i_size)
					vmtruncate(vi, i_size);
				break;
			}
		}
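		/*
		 * Work out the index into @pages of the page containing @pos;
		 * when a hole forced us to grab the whole cluster, the write
		 * need not start in the first of the locked pages.
		 */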
		u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
		if (likely(nr_segs == 1)) {
			copied = ntfs_copy_from_user(pages + u, do_pages - u,
					ofs, buf, bytes);
			buf += copied;
		} else
			copied = ntfs_copy_from_user_iovec(pages + u,
					do_pages - u, ofs, &iov, &iov_ofs,
					bytes);
		ntfs_flush_dcache_pages(pages + u, do_pages - u);
		status = ntfs_commit_pages_after_write(pages, do_pages, pos,
				bytes);
		if (likely(!status)) {
			written += copied;
			count -= copied;
			pos += copied;
			if (unlikely(copied != bytes))
				status = -EFAULT;
		}
		do {
			unlock_page(pages[--do_pages]);
			mark_page_accessed(pages[do_pages]);
			page_cache_release(pages[do_pages]);
		} while (do_pages);
		if (unlikely(status))
			break;
		balance_dirty_pages_ratelimited(mapping);
		cond_resched();
	} while (count);
err_out:
	*ppos = pos;
	if (cached_page)
		page_cache_release(cached_page);
	ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
			written ? "written" : "status", (unsigned long)written,
			(long)status);
	return written ? written : status;
}

/**
 * ntfs_file_aio_write_nolock -
 */
static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
		const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos;
	size_t count;		/* after file limit checks */
	ssize_t written, err;

	count = 0;
	err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
	if (err)
		return err;
	pos = *ppos;
	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
	/* We can write back this queue in page reclaim. */
	current->backing_dev_info = mapping->backing_dev_info;
	written = 0;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out;
	if (!count)
		goto out;
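	/*
	 * Strip any setuid/setgid bits and update the file times before the
	 * new data is written.
	 */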
	err = file_remove_suid(file);
	if (err)
		goto out;
	err = file_update_time(file);
	if (err)
		goto out;
	written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
			count);
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}

/**
 * ntfs_file_aio_write -
 */
static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	BUG_ON(iocb->ki_pos != pos);

	mutex_lock(&inode->i_mutex);
	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);
	if (ret > 0) {
		int err = generic_write_sync(file, pos, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}

/**
 * ntfs_file_fsync - sync a file to disk
 * @filp:	file to be synced
 * @datasync:	if non-zero only flush user data and not metadata
 *
 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
 * system calls. This function is inspired by fs/buffer.c::file_fsync().
 *
 * If @datasync is false, write the mft record and all associated extent mft
 * records as well as the $DATA attribute and then sync the block device.
 *
 * If @datasync is true and the attribute is non-resident, we skip the writing
 * of the mft record and all associated extent mft records (this might still
 * happen due to the write_inode_now() call).
 *
 * Also, if @datasync is true, we do not wait on the inode to be written out
 * but we always wait on the page cache pages to be written out.
 *
 * Locking: Caller must hold i_mutex on the inode.
 *
 * TODO: We should probably also write all attribute/index inodes associated
 * with this inode but since we have no simple way of getting to them we ignore
 * this problem for now.
 */
static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
		int datasync)
{
	struct inode *vi = filp->f_mapping->host;
	int err, ret = 0;

	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);

	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
	if (err)
		return err;
	mutex_lock(&vi->i_mutex);

	BUG_ON(S_ISDIR(vi->i_mode));
	if (!datasync || !NInoNonResident(NTFS_I(vi)))
		ret = __ntfs_write_inode(vi, 1);
	write_inode_now(vi, !datasync);
	/*
	 * NOTE: If we were to use mapping->private_list (see ext2 and
	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
	 * sync_mapping_buffers(vi->i_mapping).
	 */
	err = sync_blockdev(vi->i_sb->s_bdev);
	if (unlikely(err && !ret))
		ret = err;
	if (likely(!ret))
		ntfs_debug("Done.");
	else
		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
	mutex_unlock(&vi->i_mutex);
	return ret;
}

#endif /* NTFS_RW */

const struct file_operations ntfs_file_ops = {
	.llseek		= generic_file_llseek,	  /* Seek inside file. */
	.read		= do_sync_read,		  /* Read from file. */
	.aio_read	= generic_file_aio_read,  /* Async read from file. */
#ifdef NTFS_RW
	.write		= do_sync_write,	  /* Write to file. */
	.aio_write	= ntfs_file_aio_write,	  /* Async write to file. */
	/*.release	= ,*/			  /* Last file is closed. See
						     fs/ext2/file.c::
						     ext2_release_file() for
						     how to use this to discard
						     preallocated space for
						     write opened files. */
	.fsync		= ntfs_file_fsync,	  /* Sync a file to disk. */
	/*.aio_fsync	= ,*/			  /* Sync all outstanding async
						     i/o operations on a
						     kiocb. */
#endif /* NTFS_RW */
	/*.ioctl	= ,*/			  /* Perform function on the
						     mounted filesystem. */
	.mmap		= generic_file_mmap,	  /* Mmap file. */
	.open		= ntfs_file_open,	  /* Open file. */
	.splice_read	= generic_file_splice_read /* Zero-copy data send with
						      the data source being on
						      the ntfs partition. We do
						      not need to care about the
						      data destination. */
	/*.sendpage	= ,*/			  /* Zero-copy data send with
						     the data destination being
						     on the ntfs partition. We
						     do not need to care about
						     the data source. */
};

const struct inode_operations ntfs_file_inode_ops = {
#ifdef NTFS_RW
	.truncate	= ntfs_truncate_vfs,
	.setattr	= ntfs_setattr,
#endif /* NTFS_RW */
};

const struct file_operations ntfs_empty_file_ops = {};

const struct inode_operations ntfs_empty_inode_ops = {};