/*
 * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program/include file is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program (in the main directory of the Linux-NTFS
 * distribution in the file COPYING); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
#include <linux/aio.h>

#include <asm/page.h>
#include <asm/uaccess.h>

#include "attrib.h"
#include "bitmap.h"
#include "inode.h"
#include "debug.h"
#include "lcnalloc.h"
#include "malloc.h"
#include "mft.h"
#include "ntfs.h"

/**
 * ntfs_file_open - called when an inode is about to be opened
 * @vi:		inode to be opened
 * @filp:	file structure describing the inode
 *
 * Limit file size to the page cache limit on architectures where unsigned long
 * is 32-bits.  This is the most we can do for now without overflowing the page
 * cache page index.  Doing it this way means we do not run into problems
 * because of existing files that are too large.  It would be better to allow
 * the user to read the beginning of the file but I doubt very much anyone is
 * going to hit this check on a 32-bit architecture, so there is no point in
 * adding the extra complexity required to support this.
 *
 * On 64-bit architectures, the check is hopefully optimized away by the
 * compiler.
 *
 * After the check passes, just call generic_file_open() to do its work.
 */
static int ntfs_file_open(struct inode *vi, struct file *filp)
{
        if (sizeof(unsigned long) < 8) {
                if (i_size_read(vi) > MAX_LFS_FILESIZE)
                        return -EOVERFLOW;
        }
        return generic_file_open(vi, filp);
}

#ifdef NTFS_RW

/**
 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
 * @ni:			ntfs inode of the attribute to extend
 * @new_init_size:	requested new initialized size in bytes
 *
 * Extend the initialized size of an attribute described by the ntfs inode @ni
 * to @new_init_size bytes.  This involves zeroing any non-sparse space between
 * the old initialized size and @new_init_size both in the page cache and on
 * disk (if relevant complete pages are already uptodate in the page cache then
 * these are simply marked dirty).
 *
 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
 * in the resident attribute case, it is tied to the initialized size and, in
 * the non-resident attribute case, it may not fall below the initialized size.
 *
 * Note that if the attribute is resident, we do not need to touch the page
 * cache at all.  This is because if the page cache page is not uptodate we
 * bring it uptodate later, when doing the write to the mft record since we
 * then already have the page mapped.  And if the page is uptodate, the
 * non-initialized region will already have been zeroed when the page was
 * brought uptodate and the region may in fact already have been overwritten
 * with new data via mmap() based writes, so we cannot just zero it.  And since
 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
 * is unspecified, we choose not to do zeroing and thus we do not need to touch
 * the page at all.  For a more detailed explanation see ntfs_truncate() in
 * fs/ntfs/inode.c.
 *
 * Return 0 on success and -errno on error.  In the case that an error is
 * encountered it is possible that the initialized size will already have been
 * incremented some way towards @new_init_size but it is guaranteed that if
 * this is the case, the necessary zeroing will also have happened and that all
 * metadata is self-consistent.
 *
 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must
 * be held by the caller.
 */
static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
{
        s64 old_init_size;
        loff_t old_i_size;
        pgoff_t index, end_index;
        unsigned long flags;
        struct inode *vi = VFS_I(ni);
        ntfs_inode *base_ni;
        MFT_RECORD *m = NULL;
        ATTR_RECORD *a;
        ntfs_attr_search_ctx *ctx = NULL;
        struct address_space *mapping;
        struct page *page = NULL;
        u8 *kattr;
        int err;
        u32 attr_len;

        read_lock_irqsave(&ni->size_lock, flags);
        old_init_size = ni->initialized_size;
        old_i_size = i_size_read(vi);
        BUG_ON(new_init_size > ni->allocated_size);
        read_unlock_irqrestore(&ni->size_lock, flags);
        ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
                        "old_initialized_size 0x%llx, "
                        "new_initialized_size 0x%llx, i_size 0x%llx.",
                        vi->i_ino, (unsigned)le32_to_cpu(ni->type),
                        (unsigned long long)old_init_size,
                        (unsigned long long)new_init_size, old_i_size);
        if (!NInoAttr(ni))
                base_ni = ni;
        else
                base_ni = ni->ext.base_ntfs_ino;
        /* Use goto to reduce indentation and we need the label below anyway. */
        if (NInoNonResident(ni))
                goto do_non_resident_extend;
        BUG_ON(old_init_size != old_i_size);
        m = map_mft_record(base_ni);
        if (IS_ERR(m)) {
                err = PTR_ERR(m);
                m = NULL;
                goto err_out;
        }
        ctx = ntfs_attr_get_search_ctx(base_ni, m);
        if (unlikely(!ctx)) {
                err = -ENOMEM;
                goto err_out;
        }
        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
                        CASE_SENSITIVE, 0, NULL, 0, ctx);
        if (unlikely(err)) {
                if (err == -ENOENT)
                        err = -EIO;
                goto err_out;
        }
        m = ctx->mrec;
        a = ctx->attr;
        BUG_ON(a->non_resident);
        /* The total length of the attribute value. */
        attr_len = le32_to_cpu(a->data.resident.value_length);
        BUG_ON(old_i_size != (loff_t)attr_len);
        /*
         * Do the zeroing in the mft record and update the attribute size in
         * the mft record.
         */
        kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
        memset(kattr + attr_len, 0, new_init_size - attr_len);
        a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
        /* Finally, update the sizes in the vfs and ntfs inodes.
*/ 177 write_lock_irqsave(&ni->size_lock, flags); 178 i_size_write(vi, new_init_size); 179 ni->initialized_size = new_init_size; 180 write_unlock_irqrestore(&ni->size_lock, flags); 181 goto done; 182 do_non_resident_extend: 183 /* 184 * If the new initialized size @new_init_size exceeds the current file 185 * size (vfs inode->i_size), we need to extend the file size to the 186 * new initialized size. 187 */ 188 if (new_init_size > old_i_size) { 189 m = map_mft_record(base_ni); 190 if (IS_ERR(m)) { 191 err = PTR_ERR(m); 192 m = NULL; 193 goto err_out; 194 } 195 ctx = ntfs_attr_get_search_ctx(base_ni, m); 196 if (unlikely(!ctx)) { 197 err = -ENOMEM; 198 goto err_out; 199 } 200 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 201 CASE_SENSITIVE, 0, NULL, 0, ctx); 202 if (unlikely(err)) { 203 if (err == -ENOENT) 204 err = -EIO; 205 goto err_out; 206 } 207 m = ctx->mrec; 208 a = ctx->attr; 209 BUG_ON(!a->non_resident); 210 BUG_ON(old_i_size != (loff_t) 211 sle64_to_cpu(a->data.non_resident.data_size)); 212 a->data.non_resident.data_size = cpu_to_sle64(new_init_size); 213 flush_dcache_mft_record_page(ctx->ntfs_ino); 214 mark_mft_record_dirty(ctx->ntfs_ino); 215 /* Update the file size in the vfs inode. */ 216 i_size_write(vi, new_init_size); 217 ntfs_attr_put_search_ctx(ctx); 218 ctx = NULL; 219 unmap_mft_record(base_ni); 220 m = NULL; 221 } 222 mapping = vi->i_mapping; 223 index = old_init_size >> PAGE_CACHE_SHIFT; 224 end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 225 do { 226 /* 227 * Read the page. If the page is not present, this will zero 228 * the uninitialized regions for us. 229 */ 230 page = read_mapping_page(mapping, index, NULL); 231 if (IS_ERR(page)) { 232 err = PTR_ERR(page); 233 goto init_err_out; 234 } 235 if (unlikely(PageError(page))) { 236 page_cache_release(page); 237 err = -EIO; 238 goto init_err_out; 239 } 240 /* 241 * Update the initialized size in the ntfs inode. This is 242 * enough to make ntfs_writepage() work. 243 */ 244 write_lock_irqsave(&ni->size_lock, flags); 245 ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; 246 if (ni->initialized_size > new_init_size) 247 ni->initialized_size = new_init_size; 248 write_unlock_irqrestore(&ni->size_lock, flags); 249 /* Set the page dirty so it gets written out. */ 250 set_page_dirty(page); 251 page_cache_release(page); 252 /* 253 * Play nice with the vm and the rest of the system. This is 254 * very much needed as we can potentially be modifying the 255 * initialised size from a very small value to a really huge 256 * value, e.g. 257 * f = open(somefile, O_TRUNC); 258 * truncate(f, 10GiB); 259 * seek(f, 10GiB); 260 * write(f, 1); 261 * And this would mean we would be marking dirty hundreds of 262 * thousands of pages or as in the above example more than 263 * two and a half million pages! 264 * 265 * TODO: For sparse pages could optimize this workload by using 266 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This 267 * would be set in readpage for sparse pages and here we would 268 * not need to mark dirty any pages which have this bit set. 269 * The only caveat is that we have to clear the bit everywhere 270 * where we allocate any clusters that lie in the page or that 271 * contain the page. 272 * 273 * TODO: An even greater optimization would be for us to only 274 * call readpage() on pages which are not in sparse regions as 275 * determined from the runlist. This would greatly reduce the 276 * number of pages we read and make dirty in the case of sparse 277 * files. 
278 */ 279 balance_dirty_pages_ratelimited(mapping); 280 cond_resched(); 281 } while (++index < end_index); 282 read_lock_irqsave(&ni->size_lock, flags); 283 BUG_ON(ni->initialized_size != new_init_size); 284 read_unlock_irqrestore(&ni->size_lock, flags); 285 /* Now bring in sync the initialized_size in the mft record. */ 286 m = map_mft_record(base_ni); 287 if (IS_ERR(m)) { 288 err = PTR_ERR(m); 289 m = NULL; 290 goto init_err_out; 291 } 292 ctx = ntfs_attr_get_search_ctx(base_ni, m); 293 if (unlikely(!ctx)) { 294 err = -ENOMEM; 295 goto init_err_out; 296 } 297 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 298 CASE_SENSITIVE, 0, NULL, 0, ctx); 299 if (unlikely(err)) { 300 if (err == -ENOENT) 301 err = -EIO; 302 goto init_err_out; 303 } 304 m = ctx->mrec; 305 a = ctx->attr; 306 BUG_ON(!a->non_resident); 307 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); 308 done: 309 flush_dcache_mft_record_page(ctx->ntfs_ino); 310 mark_mft_record_dirty(ctx->ntfs_ino); 311 if (ctx) 312 ntfs_attr_put_search_ctx(ctx); 313 if (m) 314 unmap_mft_record(base_ni); 315 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", 316 (unsigned long long)new_init_size, i_size_read(vi)); 317 return 0; 318 init_err_out: 319 write_lock_irqsave(&ni->size_lock, flags); 320 ni->initialized_size = old_init_size; 321 write_unlock_irqrestore(&ni->size_lock, flags); 322 err_out: 323 if (ctx) 324 ntfs_attr_put_search_ctx(ctx); 325 if (m) 326 unmap_mft_record(base_ni); 327 ntfs_debug("Failed. Returning error code %i.", err); 328 return err; 329 } 330 331 /** 332 * ntfs_fault_in_pages_readable - 333 * 334 * Fault a number of userspace pages into pagetables. 335 * 336 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes 337 * with more than two userspace pages as well as handling the single page case 338 * elegantly. 339 * 340 * If you find this difficult to understand, then think of the while loop being 341 * the following code, except that we do without the integer variable ret: 342 * 343 * do { 344 * ret = __get_user(c, uaddr); 345 * uaddr += PAGE_SIZE; 346 * } while (!ret && uaddr < end); 347 * 348 * Note, the final __get_user() may well run out-of-bounds of the user buffer, 349 * but _not_ out-of-bounds of the page the user buffer belongs to, and since 350 * this is only a read and not a write, and since it is still in the same page, 351 * it should not matter and this makes the code much simpler. 352 */ 353 static inline void ntfs_fault_in_pages_readable(const char __user *uaddr, 354 int bytes) 355 { 356 const char __user *end; 357 volatile char c; 358 359 /* Set @end to the first byte outside the last page we care about. */ 360 end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes); 361 362 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end)) 363 ; 364 } 365 366 /** 367 * ntfs_fault_in_pages_readable_iovec - 368 * 369 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs. 
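 *
 * As an illustration only (hypothetical sizes, not part of the interface):
 * with two iovecs of 3000 and 5000 bytes, @iov_ofs 1000 and @bytes 6000, the
 * loop below faults in the remaining 2000 bytes of the first iovec and then
 * the first 4000 bytes of the second one, i.e. it behaves like:
 *
 *	ntfs_fault_in_pages_readable(iov[0].iov_base + 1000, 2000);
 *	ntfs_fault_in_pages_readable(iov[1].iov_base, 4000);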
370 */ 371 static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, 372 size_t iov_ofs, int bytes) 373 { 374 do { 375 const char __user *buf; 376 unsigned len; 377 378 buf = iov->iov_base + iov_ofs; 379 len = iov->iov_len - iov_ofs; 380 if (len > bytes) 381 len = bytes; 382 ntfs_fault_in_pages_readable(buf, len); 383 bytes -= len; 384 iov++; 385 iov_ofs = 0; 386 } while (bytes); 387 } 388 389 /** 390 * __ntfs_grab_cache_pages - obtain a number of locked pages 391 * @mapping: address space mapping from which to obtain page cache pages 392 * @index: starting index in @mapping at which to begin obtaining pages 393 * @nr_pages: number of page cache pages to obtain 394 * @pages: array of pages in which to return the obtained page cache pages 395 * @cached_page: allocated but as yet unused page 396 * 397 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 398 * starting at index @index. 399 * 400 * If a page is newly created, add it to lru list 401 * 402 * Note, the page locks are obtained in ascending page index order. 403 */ 404 static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 405 pgoff_t index, const unsigned nr_pages, struct page **pages, 406 struct page **cached_page) 407 { 408 int err, nr; 409 410 BUG_ON(!nr_pages); 411 err = nr = 0; 412 do { 413 pages[nr] = find_lock_page(mapping, index); 414 if (!pages[nr]) { 415 if (!*cached_page) { 416 *cached_page = page_cache_alloc(mapping); 417 if (unlikely(!*cached_page)) { 418 err = -ENOMEM; 419 goto err_out; 420 } 421 } 422 err = add_to_page_cache_lru(*cached_page, mapping, index, 423 GFP_KERNEL); 424 if (unlikely(err)) { 425 if (err == -EEXIST) 426 continue; 427 goto err_out; 428 } 429 pages[nr] = *cached_page; 430 *cached_page = NULL; 431 } 432 index++; 433 nr++; 434 } while (nr < nr_pages); 435 out: 436 return err; 437 err_out: 438 while (nr > 0) { 439 unlock_page(pages[--nr]); 440 page_cache_release(pages[nr]); 441 } 442 goto out; 443 } 444 445 static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) 446 { 447 lock_buffer(bh); 448 get_bh(bh); 449 bh->b_end_io = end_buffer_read_sync; 450 return submit_bh(READ, bh); 451 } 452 453 /** 454 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data 455 * @pages: array of destination pages 456 * @nr_pages: number of pages in @pages 457 * @pos: byte position in file at which the write begins 458 * @bytes: number of bytes to be written 459 * 460 * This is called for non-resident attributes from ntfs_file_buffered_write() 461 * with i_mutex held on the inode (@pages[0]->mapping->host). There are 462 * @nr_pages pages in @pages which are locked but not kmap()ped. The source 463 * data has not yet been copied into the @pages. 464 * 465 * Need to fill any holes with actual clusters, allocate buffers if necessary, 466 * ensure all the buffers are mapped, and bring uptodate any buffers that are 467 * only partially being written to. 468 * 469 * If @nr_pages is greater than one, we are guaranteed that the cluster size is 470 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside 471 * the same cluster and that they are the entirety of that cluster, and that 472 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. 473 * 474 * i_size is not to be modified yet. 475 * 476 * Return 0 on success or -errno on error. 
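 *
 * In outline, and with arguments simplified, the buffered write path described
 * above uses this function roughly as follows (a sketch of the existing call
 * sequence, not an additional API):
 *
 *	__ntfs_grab_cache_pages(mapping, start_idx, do_pages, pages,
 *			&cached_page);
 *	ntfs_prepare_pages_for_non_resident_write(pages, do_pages, pos, bytes);
 *	ntfs_copy_from_user(pages, do_pages, ofs, buf, bytes);
 *	ntfs_commit_pages_after_write(pages, do_pages, pos, bytes);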
477 */ 478 static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, 479 unsigned nr_pages, s64 pos, size_t bytes) 480 { 481 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; 482 LCN lcn; 483 s64 bh_pos, vcn_len, end, initialized_size; 484 sector_t lcn_block; 485 struct page *page; 486 struct inode *vi; 487 ntfs_inode *ni, *base_ni = NULL; 488 ntfs_volume *vol; 489 runlist_element *rl, *rl2; 490 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; 491 ntfs_attr_search_ctx *ctx = NULL; 492 MFT_RECORD *m = NULL; 493 ATTR_RECORD *a = NULL; 494 unsigned long flags; 495 u32 attr_rec_len = 0; 496 unsigned blocksize, u; 497 int err, mp_size; 498 bool rl_write_locked, was_hole, is_retry; 499 unsigned char blocksize_bits; 500 struct { 501 u8 runlist_merged:1; 502 u8 mft_attr_mapped:1; 503 u8 mp_rebuilt:1; 504 u8 attr_switched:1; 505 } status = { 0, 0, 0, 0 }; 506 507 BUG_ON(!nr_pages); 508 BUG_ON(!pages); 509 BUG_ON(!*pages); 510 vi = pages[0]->mapping->host; 511 ni = NTFS_I(vi); 512 vol = ni->vol; 513 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 514 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 515 vi->i_ino, ni->type, pages[0]->index, nr_pages, 516 (long long)pos, bytes); 517 blocksize = vol->sb->s_blocksize; 518 blocksize_bits = vol->sb->s_blocksize_bits; 519 u = 0; 520 do { 521 page = pages[u]; 522 BUG_ON(!page); 523 /* 524 * create_empty_buffers() will create uptodate/dirty buffers if 525 * the page is uptodate/dirty. 526 */ 527 if (!page_has_buffers(page)) { 528 create_empty_buffers(page, blocksize, 0); 529 if (unlikely(!page_has_buffers(page))) 530 return -ENOMEM; 531 } 532 } while (++u < nr_pages); 533 rl_write_locked = false; 534 rl = NULL; 535 err = 0; 536 vcn = lcn = -1; 537 vcn_len = 0; 538 lcn_block = -1; 539 was_hole = false; 540 cpos = pos >> vol->cluster_size_bits; 541 end = pos + bytes; 542 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; 543 /* 544 * Loop over each page and for each page over each buffer. Use goto to 545 * reduce indentation. 546 */ 547 u = 0; 548 do_next_page: 549 page = pages[u]; 550 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; 551 bh = head = page_buffers(page); 552 do { 553 VCN cdelta; 554 s64 bh_end; 555 unsigned bh_cofs; 556 557 /* Clear buffer_new on all buffers to reinitialise state. */ 558 if (buffer_new(bh)) 559 clear_buffer_new(bh); 560 bh_end = bh_pos + blocksize; 561 bh_cpos = bh_pos >> vol->cluster_size_bits; 562 bh_cofs = bh_pos & vol->cluster_size_mask; 563 if (buffer_mapped(bh)) { 564 /* 565 * The buffer is already mapped. If it is uptodate, 566 * ignore it. 567 */ 568 if (buffer_uptodate(bh)) 569 continue; 570 /* 571 * The buffer is not uptodate. If the page is uptodate 572 * set the buffer uptodate and otherwise ignore it. 573 */ 574 if (PageUptodate(page)) { 575 set_buffer_uptodate(bh); 576 continue; 577 } 578 /* 579 * Neither the page nor the buffer are uptodate. If 580 * the buffer is only partially being written to, we 581 * need to read it in before the write, i.e. now. 582 */ 583 if ((bh_pos < pos && bh_end > pos) || 584 (bh_pos < end && bh_end > end)) { 585 /* 586 * If the buffer is fully or partially within 587 * the initialized size, do an actual read. 588 * Otherwise, simply zero the buffer. 
589 */ 590 read_lock_irqsave(&ni->size_lock, flags); 591 initialized_size = ni->initialized_size; 592 read_unlock_irqrestore(&ni->size_lock, flags); 593 if (bh_pos < initialized_size) { 594 ntfs_submit_bh_for_read(bh); 595 *wait_bh++ = bh; 596 } else { 597 zero_user(page, bh_offset(bh), 598 blocksize); 599 set_buffer_uptodate(bh); 600 } 601 } 602 continue; 603 } 604 /* Unmapped buffer. Need to map it. */ 605 bh->b_bdev = vol->sb->s_bdev; 606 /* 607 * If the current buffer is in the same clusters as the map 608 * cache, there is no need to check the runlist again. The 609 * map cache is made up of @vcn, which is the first cached file 610 * cluster, @vcn_len which is the number of cached file 611 * clusters, @lcn is the device cluster corresponding to @vcn, 612 * and @lcn_block is the block number corresponding to @lcn. 613 */ 614 cdelta = bh_cpos - vcn; 615 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { 616 map_buffer_cached: 617 BUG_ON(lcn < 0); 618 bh->b_blocknr = lcn_block + 619 (cdelta << (vol->cluster_size_bits - 620 blocksize_bits)) + 621 (bh_cofs >> blocksize_bits); 622 set_buffer_mapped(bh); 623 /* 624 * If the page is uptodate so is the buffer. If the 625 * buffer is fully outside the write, we ignore it if 626 * it was already allocated and we mark it dirty so it 627 * gets written out if we allocated it. On the other 628 * hand, if we allocated the buffer but we are not 629 * marking it dirty we set buffer_new so we can do 630 * error recovery. 631 */ 632 if (PageUptodate(page)) { 633 if (!buffer_uptodate(bh)) 634 set_buffer_uptodate(bh); 635 if (unlikely(was_hole)) { 636 /* We allocated the buffer. */ 637 unmap_underlying_metadata(bh->b_bdev, 638 bh->b_blocknr); 639 if (bh_end <= pos || bh_pos >= end) 640 mark_buffer_dirty(bh); 641 else 642 set_buffer_new(bh); 643 } 644 continue; 645 } 646 /* Page is _not_ uptodate. */ 647 if (likely(!was_hole)) { 648 /* 649 * Buffer was already allocated. If it is not 650 * uptodate and is only partially being written 651 * to, we need to read it in before the write, 652 * i.e. now. 653 */ 654 if (!buffer_uptodate(bh) && bh_pos < end && 655 bh_end > pos && 656 (bh_pos < pos || 657 bh_end > end)) { 658 /* 659 * If the buffer is fully or partially 660 * within the initialized size, do an 661 * actual read. Otherwise, simply zero 662 * the buffer. 663 */ 664 read_lock_irqsave(&ni->size_lock, 665 flags); 666 initialized_size = ni->initialized_size; 667 read_unlock_irqrestore(&ni->size_lock, 668 flags); 669 if (bh_pos < initialized_size) { 670 ntfs_submit_bh_for_read(bh); 671 *wait_bh++ = bh; 672 } else { 673 zero_user(page, bh_offset(bh), 674 blocksize); 675 set_buffer_uptodate(bh); 676 } 677 } 678 continue; 679 } 680 /* We allocated the buffer. */ 681 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 682 /* 683 * If the buffer is fully outside the write, zero it, 684 * set it uptodate, and mark it dirty so it gets 685 * written out. If it is partially being written to, 686 * zero region surrounding the write but leave it to 687 * commit write to do anything else. Finally, if the 688 * buffer is fully being overwritten, do nothing. 
689 */ 690 if (bh_end <= pos || bh_pos >= end) { 691 if (!buffer_uptodate(bh)) { 692 zero_user(page, bh_offset(bh), 693 blocksize); 694 set_buffer_uptodate(bh); 695 } 696 mark_buffer_dirty(bh); 697 continue; 698 } 699 set_buffer_new(bh); 700 if (!buffer_uptodate(bh) && 701 (bh_pos < pos || bh_end > end)) { 702 u8 *kaddr; 703 unsigned pofs; 704 705 kaddr = kmap_atomic(page); 706 if (bh_pos < pos) { 707 pofs = bh_pos & ~PAGE_CACHE_MASK; 708 memset(kaddr + pofs, 0, pos - bh_pos); 709 } 710 if (bh_end > end) { 711 pofs = end & ~PAGE_CACHE_MASK; 712 memset(kaddr + pofs, 0, bh_end - end); 713 } 714 kunmap_atomic(kaddr); 715 flush_dcache_page(page); 716 } 717 continue; 718 } 719 /* 720 * Slow path: this is the first buffer in the cluster. If it 721 * is outside allocated size and is not uptodate, zero it and 722 * set it uptodate. 723 */ 724 read_lock_irqsave(&ni->size_lock, flags); 725 initialized_size = ni->allocated_size; 726 read_unlock_irqrestore(&ni->size_lock, flags); 727 if (bh_pos > initialized_size) { 728 if (PageUptodate(page)) { 729 if (!buffer_uptodate(bh)) 730 set_buffer_uptodate(bh); 731 } else if (!buffer_uptodate(bh)) { 732 zero_user(page, bh_offset(bh), blocksize); 733 set_buffer_uptodate(bh); 734 } 735 continue; 736 } 737 is_retry = false; 738 if (!rl) { 739 down_read(&ni->runlist.lock); 740 retry_remap: 741 rl = ni->runlist.rl; 742 } 743 if (likely(rl != NULL)) { 744 /* Seek to element containing target cluster. */ 745 while (rl->length && rl[1].vcn <= bh_cpos) 746 rl++; 747 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); 748 if (likely(lcn >= 0)) { 749 /* 750 * Successful remap, setup the map cache and 751 * use that to deal with the buffer. 752 */ 753 was_hole = false; 754 vcn = bh_cpos; 755 vcn_len = rl[1].vcn - vcn; 756 lcn_block = lcn << (vol->cluster_size_bits - 757 blocksize_bits); 758 cdelta = 0; 759 /* 760 * If the number of remaining clusters touched 761 * by the write is smaller or equal to the 762 * number of cached clusters, unlock the 763 * runlist as the map cache will be used from 764 * now on. 765 */ 766 if (likely(vcn + vcn_len >= cend)) { 767 if (rl_write_locked) { 768 up_write(&ni->runlist.lock); 769 rl_write_locked = false; 770 } else 771 up_read(&ni->runlist.lock); 772 rl = NULL; 773 } 774 goto map_buffer_cached; 775 } 776 } else 777 lcn = LCN_RL_NOT_MAPPED; 778 /* 779 * If it is not a hole and not out of bounds, the runlist is 780 * probably unmapped so try to map it now. 781 */ 782 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { 783 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { 784 /* Attempt to map runlist. */ 785 if (!rl_write_locked) { 786 /* 787 * We need the runlist locked for 788 * writing, so if it is locked for 789 * reading relock it now and retry in 790 * case it changed whilst we dropped 791 * the lock. 792 */ 793 up_read(&ni->runlist.lock); 794 down_write(&ni->runlist.lock); 795 rl_write_locked = true; 796 goto retry_remap; 797 } 798 err = ntfs_map_runlist_nolock(ni, bh_cpos, 799 NULL); 800 if (likely(!err)) { 801 is_retry = true; 802 goto retry_remap; 803 } 804 /* 805 * If @vcn is out of bounds, pretend @lcn is 806 * LCN_ENOENT. As long as the buffer is out 807 * of bounds this will work fine. 808 */ 809 if (err == -ENOENT) { 810 lcn = LCN_ENOENT; 811 err = 0; 812 goto rl_not_mapped_enoent; 813 } 814 } else 815 err = -EIO; 816 /* Failed to map the buffer, even after retrying. 
*/ 817 bh->b_blocknr = -1; 818 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " 819 "attribute type 0x%x, vcn 0x%llx, " 820 "vcn offset 0x%x, because its " 821 "location on disk could not be " 822 "determined%s (error code %i).", 823 ni->mft_no, ni->type, 824 (unsigned long long)bh_cpos, 825 (unsigned)bh_pos & 826 vol->cluster_size_mask, 827 is_retry ? " even after retrying" : "", 828 err); 829 break; 830 } 831 rl_not_mapped_enoent: 832 /* 833 * The buffer is in a hole or out of bounds. We need to fill 834 * the hole, unless the buffer is in a cluster which is not 835 * touched by the write, in which case we just leave the buffer 836 * unmapped. This can only happen when the cluster size is 837 * less than the page cache size. 838 */ 839 if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) { 840 bh_cend = (bh_end + vol->cluster_size - 1) >> 841 vol->cluster_size_bits; 842 if ((bh_cend <= cpos || bh_cpos >= cend)) { 843 bh->b_blocknr = -1; 844 /* 845 * If the buffer is uptodate we skip it. If it 846 * is not but the page is uptodate, we can set 847 * the buffer uptodate. If the page is not 848 * uptodate, we can clear the buffer and set it 849 * uptodate. Whether this is worthwhile is 850 * debatable and this could be removed. 851 */ 852 if (PageUptodate(page)) { 853 if (!buffer_uptodate(bh)) 854 set_buffer_uptodate(bh); 855 } else if (!buffer_uptodate(bh)) { 856 zero_user(page, bh_offset(bh), 857 blocksize); 858 set_buffer_uptodate(bh); 859 } 860 continue; 861 } 862 } 863 /* 864 * Out of bounds buffer is invalid if it was not really out of 865 * bounds. 866 */ 867 BUG_ON(lcn != LCN_HOLE); 868 /* 869 * We need the runlist locked for writing, so if it is locked 870 * for reading relock it now and retry in case it changed 871 * whilst we dropped the lock. 872 */ 873 BUG_ON(!rl); 874 if (!rl_write_locked) { 875 up_read(&ni->runlist.lock); 876 down_write(&ni->runlist.lock); 877 rl_write_locked = true; 878 goto retry_remap; 879 } 880 /* Find the previous last allocated cluster. */ 881 BUG_ON(rl->lcn != LCN_HOLE); 882 lcn = -1; 883 rl2 = rl; 884 while (--rl2 >= ni->runlist.rl) { 885 if (rl2->lcn >= 0) { 886 lcn = rl2->lcn + rl2->length; 887 break; 888 } 889 } 890 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, 891 false); 892 if (IS_ERR(rl2)) { 893 err = PTR_ERR(rl2); 894 ntfs_debug("Failed to allocate cluster, error code %i.", 895 err); 896 break; 897 } 898 lcn = rl2->lcn; 899 rl = ntfs_runlists_merge(ni->runlist.rl, rl2); 900 if (IS_ERR(rl)) { 901 err = PTR_ERR(rl); 902 if (err != -ENOMEM) 903 err = -EIO; 904 if (ntfs_cluster_free_from_rl(vol, rl2)) { 905 ntfs_error(vol->sb, "Failed to release " 906 "allocated cluster in error " 907 "code path. Run chkdsk to " 908 "recover the lost cluster."); 909 NVolSetErrors(vol); 910 } 911 ntfs_free(rl2); 912 break; 913 } 914 ni->runlist.rl = rl; 915 status.runlist_merged = 1; 916 ntfs_debug("Allocated cluster, lcn 0x%llx.", 917 (unsigned long long)lcn); 918 /* Map and lock the mft record and get the attribute record. 
                 */
                if (!NInoAttr(ni))
                        base_ni = ni;
                else
                        base_ni = ni->ext.base_ntfs_ino;
                m = map_mft_record(base_ni);
                if (IS_ERR(m)) {
                        err = PTR_ERR(m);
                        break;
                }
                ctx = ntfs_attr_get_search_ctx(base_ni, m);
                if (unlikely(!ctx)) {
                        err = -ENOMEM;
                        unmap_mft_record(base_ni);
                        break;
                }
                status.mft_attr_mapped = 1;
                err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
                                CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
                if (unlikely(err)) {
                        if (err == -ENOENT)
                                err = -EIO;
                        break;
                }
                m = ctx->mrec;
                a = ctx->attr;
                /*
                 * Find the runlist element with which the attribute extent
                 * starts.  Note, we cannot use the _attr_ version because we
                 * have mapped the mft record.  That is ok because we know the
                 * runlist fragment must be mapped already to have ever gotten
                 * here, so we can just use the _rl_ version.
                 */
                vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
                rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
                BUG_ON(!rl2);
                BUG_ON(!rl2->length);
                BUG_ON(rl2->lcn < LCN_HOLE);
                highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
                /*
                 * If @highest_vcn is zero, calculate the real highest_vcn
                 * (which can really be zero).
                 */
                if (!highest_vcn)
                        highest_vcn = (sle64_to_cpu(
                                        a->data.non_resident.allocated_size) >>
                                        vol->cluster_size_bits) - 1;
                /*
                 * Determine the size of the mapping pairs array for the new
                 * extent, i.e. the old extent with the hole filled.
                 */
                mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
                                highest_vcn);
                if (unlikely(mp_size <= 0)) {
                        if (!(err = mp_size))
                                err = -EIO;
                        ntfs_debug("Failed to get size for mapping pairs "
                                        "array, error code %i.", err);
                        break;
                }
                /*
                 * Resize the attribute record to fit the new mapping pairs
                 * array.
                 */
                attr_rec_len = le32_to_cpu(a->length);
                err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
                                a->data.non_resident.mapping_pairs_offset));
                if (unlikely(err)) {
                        BUG_ON(err != -ENOSPC);
                        // TODO: Deal with this by using the current attribute
                        // and filling it with as much of the mapping pairs
                        // array as possible.  Then loop over each attribute
                        // extent rewriting the mapping pairs arrays as we go
                        // along and if, when we reach the end, we do not have
                        // enough space, try to resize the last attribute
                        // extent and if even that fails, add a new attribute
                        // extent.
                        // We could also try to resize at each step in the hope
                        // that we will not need to rewrite every single extent.
                        // Note, we may need to decompress some extents to fill
                        // the runlist as we are walking the extents...
                        ntfs_error(vol->sb, "Not enough space in the mft "
                                        "record for the extended attribute "
                                        "record.  This case is not "
                                        "implemented yet.");
                        err = -EOPNOTSUPP;
                        break;
                }
                status.mp_rebuilt = 1;
                /*
                 * Generate the mapping pairs array directly into the attribute
                 * record.
                 */
                err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
                                a->data.non_resident.mapping_pairs_offset),
                                mp_size, rl2, vcn, highest_vcn, NULL);
                if (unlikely(err)) {
                        ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
                                        "attribute type 0x%x, because building "
                                        "the mapping pairs failed with error "
                                        "code %i.", vi->i_ino,
                                        (unsigned)le32_to_cpu(ni->type), err);
                        err = -EIO;
                        break;
                }
                /* Update the highest_vcn but only if it was not set.
*/ 1024 if (unlikely(!a->data.non_resident.highest_vcn)) 1025 a->data.non_resident.highest_vcn = 1026 cpu_to_sle64(highest_vcn); 1027 /* 1028 * If the attribute is sparse/compressed, update the compressed 1029 * size in the ntfs_inode structure and the attribute record. 1030 */ 1031 if (likely(NInoSparse(ni) || NInoCompressed(ni))) { 1032 /* 1033 * If we are not in the first attribute extent, switch 1034 * to it, but first ensure the changes will make it to 1035 * disk later. 1036 */ 1037 if (a->data.non_resident.lowest_vcn) { 1038 flush_dcache_mft_record_page(ctx->ntfs_ino); 1039 mark_mft_record_dirty(ctx->ntfs_ino); 1040 ntfs_attr_reinit_search_ctx(ctx); 1041 err = ntfs_attr_lookup(ni->type, ni->name, 1042 ni->name_len, CASE_SENSITIVE, 1043 0, NULL, 0, ctx); 1044 if (unlikely(err)) { 1045 status.attr_switched = 1; 1046 break; 1047 } 1048 /* @m is not used any more so do not set it. */ 1049 a = ctx->attr; 1050 } 1051 write_lock_irqsave(&ni->size_lock, flags); 1052 ni->itype.compressed.size += vol->cluster_size; 1053 a->data.non_resident.compressed_size = 1054 cpu_to_sle64(ni->itype.compressed.size); 1055 write_unlock_irqrestore(&ni->size_lock, flags); 1056 } 1057 /* Ensure the changes make it to disk. */ 1058 flush_dcache_mft_record_page(ctx->ntfs_ino); 1059 mark_mft_record_dirty(ctx->ntfs_ino); 1060 ntfs_attr_put_search_ctx(ctx); 1061 unmap_mft_record(base_ni); 1062 /* Successfully filled the hole. */ 1063 status.runlist_merged = 0; 1064 status.mft_attr_mapped = 0; 1065 status.mp_rebuilt = 0; 1066 /* Setup the map cache and use that to deal with the buffer. */ 1067 was_hole = true; 1068 vcn = bh_cpos; 1069 vcn_len = 1; 1070 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); 1071 cdelta = 0; 1072 /* 1073 * If the number of remaining clusters in the @pages is smaller 1074 * or equal to the number of cached clusters, unlock the 1075 * runlist as the map cache will be used from now on. 1076 */ 1077 if (likely(vcn + vcn_len >= cend)) { 1078 up_write(&ni->runlist.lock); 1079 rl_write_locked = false; 1080 rl = NULL; 1081 } 1082 goto map_buffer_cached; 1083 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1084 /* If there are no errors, do the next page. */ 1085 if (likely(!err && ++u < nr_pages)) 1086 goto do_next_page; 1087 /* If there are no errors, release the runlist lock if we took it. */ 1088 if (likely(!err)) { 1089 if (unlikely(rl_write_locked)) { 1090 up_write(&ni->runlist.lock); 1091 rl_write_locked = false; 1092 } else if (unlikely(rl)) 1093 up_read(&ni->runlist.lock); 1094 rl = NULL; 1095 } 1096 /* If we issued read requests, let them complete. */ 1097 read_lock_irqsave(&ni->size_lock, flags); 1098 initialized_size = ni->initialized_size; 1099 read_unlock_irqrestore(&ni->size_lock, flags); 1100 while (wait_bh > wait) { 1101 bh = *--wait_bh; 1102 wait_on_buffer(bh); 1103 if (likely(buffer_uptodate(bh))) { 1104 page = bh->b_page; 1105 bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) + 1106 bh_offset(bh); 1107 /* 1108 * If the buffer overflows the initialized size, need 1109 * to zero the overflowing region. 1110 */ 1111 if (unlikely(bh_pos + blocksize > initialized_size)) { 1112 int ofs = 0; 1113 1114 if (likely(bh_pos < initialized_size)) 1115 ofs = initialized_size - bh_pos; 1116 zero_user_segment(page, bh_offset(bh) + ofs, 1117 blocksize); 1118 } 1119 } else /* if (unlikely(!buffer_uptodate(bh))) */ 1120 err = -EIO; 1121 } 1122 if (likely(!err)) { 1123 /* Clear buffer_new on all buffers. 
*/ 1124 u = 0; 1125 do { 1126 bh = head = page_buffers(pages[u]); 1127 do { 1128 if (buffer_new(bh)) 1129 clear_buffer_new(bh); 1130 } while ((bh = bh->b_this_page) != head); 1131 } while (++u < nr_pages); 1132 ntfs_debug("Done."); 1133 return err; 1134 } 1135 if (status.attr_switched) { 1136 /* Get back to the attribute extent we modified. */ 1137 ntfs_attr_reinit_search_ctx(ctx); 1138 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1139 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { 1140 ntfs_error(vol->sb, "Failed to find required " 1141 "attribute extent of attribute in " 1142 "error code path. Run chkdsk to " 1143 "recover."); 1144 write_lock_irqsave(&ni->size_lock, flags); 1145 ni->itype.compressed.size += vol->cluster_size; 1146 write_unlock_irqrestore(&ni->size_lock, flags); 1147 flush_dcache_mft_record_page(ctx->ntfs_ino); 1148 mark_mft_record_dirty(ctx->ntfs_ino); 1149 /* 1150 * The only thing that is now wrong is the compressed 1151 * size of the base attribute extent which chkdsk 1152 * should be able to fix. 1153 */ 1154 NVolSetErrors(vol); 1155 } else { 1156 m = ctx->mrec; 1157 a = ctx->attr; 1158 status.attr_switched = 0; 1159 } 1160 } 1161 /* 1162 * If the runlist has been modified, need to restore it by punching a 1163 * hole into it and we then need to deallocate the on-disk cluster as 1164 * well. Note, we only modify the runlist if we are able to generate a 1165 * new mapping pairs array, i.e. only when the mapped attribute extent 1166 * is not switched. 1167 */ 1168 if (status.runlist_merged && !status.attr_switched) { 1169 BUG_ON(!rl_write_locked); 1170 /* Make the file cluster we allocated sparse in the runlist. */ 1171 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { 1172 ntfs_error(vol->sb, "Failed to punch hole into " 1173 "attribute runlist in error code " 1174 "path. Run chkdsk to recover the " 1175 "lost cluster."); 1176 NVolSetErrors(vol); 1177 } else /* if (success) */ { 1178 status.runlist_merged = 0; 1179 /* 1180 * Deallocate the on-disk cluster we allocated but only 1181 * if we succeeded in punching its vcn out of the 1182 * runlist. 1183 */ 1184 down_write(&vol->lcnbmp_lock); 1185 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { 1186 ntfs_error(vol->sb, "Failed to release " 1187 "allocated cluster in error " 1188 "code path. Run chkdsk to " 1189 "recover the lost cluster."); 1190 NVolSetErrors(vol); 1191 } 1192 up_write(&vol->lcnbmp_lock); 1193 } 1194 } 1195 /* 1196 * Resize the attribute record to its old size and rebuild the mapping 1197 * pairs array. Note, we only can do this if the runlist has been 1198 * restored to its old state which also implies that the mapped 1199 * attribute extent is not switched. 1200 */ 1201 if (status.mp_rebuilt && !status.runlist_merged) { 1202 if (ntfs_attr_record_resize(m, a, attr_rec_len)) { 1203 ntfs_error(vol->sb, "Failed to restore attribute " 1204 "record in error code path. Run " 1205 "chkdsk to recover."); 1206 NVolSetErrors(vol); 1207 } else /* if (success) */ { 1208 if (ntfs_mapping_pairs_build(vol, (u8*)a + 1209 le16_to_cpu(a->data.non_resident. 1210 mapping_pairs_offset), attr_rec_len - 1211 le16_to_cpu(a->data.non_resident. 1212 mapping_pairs_offset), ni->runlist.rl, 1213 vcn, highest_vcn, NULL)) { 1214 ntfs_error(vol->sb, "Failed to restore " 1215 "mapping pairs array in error " 1216 "code path. 
Run chkdsk to " 1217 "recover."); 1218 NVolSetErrors(vol); 1219 } 1220 flush_dcache_mft_record_page(ctx->ntfs_ino); 1221 mark_mft_record_dirty(ctx->ntfs_ino); 1222 } 1223 } 1224 /* Release the mft record and the attribute. */ 1225 if (status.mft_attr_mapped) { 1226 ntfs_attr_put_search_ctx(ctx); 1227 unmap_mft_record(base_ni); 1228 } 1229 /* Release the runlist lock. */ 1230 if (rl_write_locked) 1231 up_write(&ni->runlist.lock); 1232 else if (rl) 1233 up_read(&ni->runlist.lock); 1234 /* 1235 * Zero out any newly allocated blocks to avoid exposing stale data. 1236 * If BH_New is set, we know that the block was newly allocated above 1237 * and that it has not been fully zeroed and marked dirty yet. 1238 */ 1239 nr_pages = u; 1240 u = 0; 1241 end = bh_cpos << vol->cluster_size_bits; 1242 do { 1243 page = pages[u]; 1244 bh = head = page_buffers(page); 1245 do { 1246 if (u == nr_pages && 1247 ((s64)page->index << PAGE_CACHE_SHIFT) + 1248 bh_offset(bh) >= end) 1249 break; 1250 if (!buffer_new(bh)) 1251 continue; 1252 clear_buffer_new(bh); 1253 if (!buffer_uptodate(bh)) { 1254 if (PageUptodate(page)) 1255 set_buffer_uptodate(bh); 1256 else { 1257 zero_user(page, bh_offset(bh), 1258 blocksize); 1259 set_buffer_uptodate(bh); 1260 } 1261 } 1262 mark_buffer_dirty(bh); 1263 } while ((bh = bh->b_this_page) != head); 1264 } while (++u <= nr_pages); 1265 ntfs_error(vol->sb, "Failed. Returning error code %i.", err); 1266 return err; 1267 } 1268 1269 /* 1270 * Copy as much as we can into the pages and return the number of bytes which 1271 * were successfully copied. If a fault is encountered then clear the pages 1272 * out to (ofs + bytes) and return the number of bytes which were copied. 1273 */ 1274 static inline size_t ntfs_copy_from_user(struct page **pages, 1275 unsigned nr_pages, unsigned ofs, const char __user *buf, 1276 size_t bytes) 1277 { 1278 struct page **last_page = pages + nr_pages; 1279 char *addr; 1280 size_t total = 0; 1281 unsigned len; 1282 int left; 1283 1284 do { 1285 len = PAGE_CACHE_SIZE - ofs; 1286 if (len > bytes) 1287 len = bytes; 1288 addr = kmap_atomic(*pages); 1289 left = __copy_from_user_inatomic(addr + ofs, buf, len); 1290 kunmap_atomic(addr); 1291 if (unlikely(left)) { 1292 /* Do it the slow way. */ 1293 addr = kmap(*pages); 1294 left = __copy_from_user(addr + ofs, buf, len); 1295 kunmap(*pages); 1296 if (unlikely(left)) 1297 goto err_out; 1298 } 1299 total += len; 1300 bytes -= len; 1301 if (!bytes) 1302 break; 1303 buf += len; 1304 ofs = 0; 1305 } while (++pages < last_page); 1306 out: 1307 return total; 1308 err_out: 1309 total += len - left; 1310 /* Zero the rest of the target like __copy_from_user(). 
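         * For example (hypothetical numbers): if @len was 4096 and the copy
         * faulted with 1000 bytes left uncopied, @total above was advanced by
         * 4096 - 1000 = 3096 bytes and the loop below zeroes the rest of the
         * destination pages, mirroring __copy_from_user()'s behaviour.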
*/ 1311 while (++pages < last_page) { 1312 bytes -= len; 1313 if (!bytes) 1314 break; 1315 len = PAGE_CACHE_SIZE; 1316 if (len > bytes) 1317 len = bytes; 1318 zero_user(*pages, 0, len); 1319 } 1320 goto out; 1321 } 1322 1323 static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr, 1324 const struct iovec *iov, size_t iov_ofs, size_t bytes) 1325 { 1326 size_t total = 0; 1327 1328 while (1) { 1329 const char __user *buf = iov->iov_base + iov_ofs; 1330 unsigned len; 1331 size_t left; 1332 1333 len = iov->iov_len - iov_ofs; 1334 if (len > bytes) 1335 len = bytes; 1336 left = __copy_from_user_inatomic(vaddr, buf, len); 1337 total += len; 1338 bytes -= len; 1339 vaddr += len; 1340 if (unlikely(left)) { 1341 total -= left; 1342 break; 1343 } 1344 if (!bytes) 1345 break; 1346 iov++; 1347 iov_ofs = 0; 1348 } 1349 return total; 1350 } 1351 1352 static inline void ntfs_set_next_iovec(const struct iovec **iovp, 1353 size_t *iov_ofsp, size_t bytes) 1354 { 1355 const struct iovec *iov = *iovp; 1356 size_t iov_ofs = *iov_ofsp; 1357 1358 while (bytes) { 1359 unsigned len; 1360 1361 len = iov->iov_len - iov_ofs; 1362 if (len > bytes) 1363 len = bytes; 1364 bytes -= len; 1365 iov_ofs += len; 1366 if (iov->iov_len == iov_ofs) { 1367 iov++; 1368 iov_ofs = 0; 1369 } 1370 } 1371 *iovp = iov; 1372 *iov_ofsp = iov_ofs; 1373 } 1374 1375 /* 1376 * This has the same side-effects and return value as ntfs_copy_from_user(). 1377 * The difference is that on a fault we need to memset the remainder of the 1378 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1379 * single-segment behaviour. 1380 * 1381 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when 1382 * atomic and when not atomic. This is ok because it calls 1383 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In 1384 * fact, the only difference between __copy_from_user_inatomic() and 1385 * __copy_from_user() is that the latter calls might_sleep() and the former 1386 * should not zero the tail of the buffer on error. And on many architectures 1387 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it 1388 * makes no difference at all on those architectures. 1389 */ 1390 static inline size_t ntfs_copy_from_user_iovec(struct page **pages, 1391 unsigned nr_pages, unsigned ofs, const struct iovec **iov, 1392 size_t *iov_ofs, size_t bytes) 1393 { 1394 struct page **last_page = pages + nr_pages; 1395 char *addr; 1396 size_t copied, len, total = 0; 1397 1398 do { 1399 len = PAGE_CACHE_SIZE - ofs; 1400 if (len > bytes) 1401 len = bytes; 1402 addr = kmap_atomic(*pages); 1403 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, 1404 *iov, *iov_ofs, len); 1405 kunmap_atomic(addr); 1406 if (unlikely(copied != len)) { 1407 /* Do it the slow way. */ 1408 addr = kmap(*pages); 1409 copied = __ntfs_copy_from_user_iovec_inatomic(addr + 1410 ofs, *iov, *iov_ofs, len); 1411 if (unlikely(copied != len)) 1412 goto err_out; 1413 kunmap(*pages); 1414 } 1415 total += len; 1416 ntfs_set_next_iovec(iov, iov_ofs, len); 1417 bytes -= len; 1418 if (!bytes) 1419 break; 1420 ofs = 0; 1421 } while (++pages < last_page); 1422 out: 1423 return total; 1424 err_out: 1425 BUG_ON(copied > len); 1426 /* Zero the rest of the target like __copy_from_user(). 
*/ 1427 memset(addr + ofs + copied, 0, len - copied); 1428 kunmap(*pages); 1429 total += copied; 1430 ntfs_set_next_iovec(iov, iov_ofs, copied); 1431 while (++pages < last_page) { 1432 bytes -= len; 1433 if (!bytes) 1434 break; 1435 len = PAGE_CACHE_SIZE; 1436 if (len > bytes) 1437 len = bytes; 1438 zero_user(*pages, 0, len); 1439 } 1440 goto out; 1441 } 1442 1443 static inline void ntfs_flush_dcache_pages(struct page **pages, 1444 unsigned nr_pages) 1445 { 1446 BUG_ON(!nr_pages); 1447 /* 1448 * Warning: Do not do the decrement at the same time as the call to 1449 * flush_dcache_page() because it is a NULL macro on i386 and hence the 1450 * decrement never happens so the loop never terminates. 1451 */ 1452 do { 1453 --nr_pages; 1454 flush_dcache_page(pages[nr_pages]); 1455 } while (nr_pages > 0); 1456 } 1457 1458 /** 1459 * ntfs_commit_pages_after_non_resident_write - commit the received data 1460 * @pages: array of destination pages 1461 * @nr_pages: number of pages in @pages 1462 * @pos: byte position in file at which the write begins 1463 * @bytes: number of bytes to be written 1464 * 1465 * See description of ntfs_commit_pages_after_write(), below. 1466 */ 1467 static inline int ntfs_commit_pages_after_non_resident_write( 1468 struct page **pages, const unsigned nr_pages, 1469 s64 pos, size_t bytes) 1470 { 1471 s64 end, initialized_size; 1472 struct inode *vi; 1473 ntfs_inode *ni, *base_ni; 1474 struct buffer_head *bh, *head; 1475 ntfs_attr_search_ctx *ctx; 1476 MFT_RECORD *m; 1477 ATTR_RECORD *a; 1478 unsigned long flags; 1479 unsigned blocksize, u; 1480 int err; 1481 1482 vi = pages[0]->mapping->host; 1483 ni = NTFS_I(vi); 1484 blocksize = vi->i_sb->s_blocksize; 1485 end = pos + bytes; 1486 u = 0; 1487 do { 1488 s64 bh_pos; 1489 struct page *page; 1490 bool partial; 1491 1492 page = pages[u]; 1493 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; 1494 bh = head = page_buffers(page); 1495 partial = false; 1496 do { 1497 s64 bh_end; 1498 1499 bh_end = bh_pos + blocksize; 1500 if (bh_end <= pos || bh_pos >= end) { 1501 if (!buffer_uptodate(bh)) 1502 partial = true; 1503 } else { 1504 set_buffer_uptodate(bh); 1505 mark_buffer_dirty(bh); 1506 } 1507 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1508 /* 1509 * If all buffers are now uptodate but the page is not, set the 1510 * page uptodate. 1511 */ 1512 if (!partial && !PageUptodate(page)) 1513 SetPageUptodate(page); 1514 } while (++u < nr_pages); 1515 /* 1516 * Finally, if we do not need to update initialized_size or i_size we 1517 * are finished. 1518 */ 1519 read_lock_irqsave(&ni->size_lock, flags); 1520 initialized_size = ni->initialized_size; 1521 read_unlock_irqrestore(&ni->size_lock, flags); 1522 if (end <= initialized_size) { 1523 ntfs_debug("Done."); 1524 return 0; 1525 } 1526 /* 1527 * Update initialized_size/i_size as appropriate, both in the inode and 1528 * the mft record. 1529 */ 1530 if (!NInoAttr(ni)) 1531 base_ni = ni; 1532 else 1533 base_ni = ni->ext.base_ntfs_ino; 1534 /* Map, pin, and lock the mft record. 
*/ 1535 m = map_mft_record(base_ni); 1536 if (IS_ERR(m)) { 1537 err = PTR_ERR(m); 1538 m = NULL; 1539 ctx = NULL; 1540 goto err_out; 1541 } 1542 BUG_ON(!NInoNonResident(ni)); 1543 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1544 if (unlikely(!ctx)) { 1545 err = -ENOMEM; 1546 goto err_out; 1547 } 1548 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1549 CASE_SENSITIVE, 0, NULL, 0, ctx); 1550 if (unlikely(err)) { 1551 if (err == -ENOENT) 1552 err = -EIO; 1553 goto err_out; 1554 } 1555 a = ctx->attr; 1556 BUG_ON(!a->non_resident); 1557 write_lock_irqsave(&ni->size_lock, flags); 1558 BUG_ON(end > ni->allocated_size); 1559 ni->initialized_size = end; 1560 a->data.non_resident.initialized_size = cpu_to_sle64(end); 1561 if (end > i_size_read(vi)) { 1562 i_size_write(vi, end); 1563 a->data.non_resident.data_size = 1564 a->data.non_resident.initialized_size; 1565 } 1566 write_unlock_irqrestore(&ni->size_lock, flags); 1567 /* Mark the mft record dirty, so it gets written back. */ 1568 flush_dcache_mft_record_page(ctx->ntfs_ino); 1569 mark_mft_record_dirty(ctx->ntfs_ino); 1570 ntfs_attr_put_search_ctx(ctx); 1571 unmap_mft_record(base_ni); 1572 ntfs_debug("Done."); 1573 return 0; 1574 err_out: 1575 if (ctx) 1576 ntfs_attr_put_search_ctx(ctx); 1577 if (m) 1578 unmap_mft_record(base_ni); 1579 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " 1580 "code %i).", err); 1581 if (err != -ENOMEM) 1582 NVolSetErrors(ni->vol); 1583 return err; 1584 } 1585 1586 /** 1587 * ntfs_commit_pages_after_write - commit the received data 1588 * @pages: array of destination pages 1589 * @nr_pages: number of pages in @pages 1590 * @pos: byte position in file at which the write begins 1591 * @bytes: number of bytes to be written 1592 * 1593 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode 1594 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are 1595 * locked but not kmap()ped. The source data has already been copied into the 1596 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before 1597 * the data was copied (for non-resident attributes only) and it returned 1598 * success. 1599 * 1600 * Need to set uptodate and mark dirty all buffers within the boundary of the 1601 * write. If all buffers in a page are uptodate we set the page uptodate, too. 1602 * 1603 * Setting the buffers dirty ensures that they get written out later when 1604 * ntfs_writepage() is invoked by the VM. 1605 * 1606 * Finally, we need to update i_size and initialized_size as appropriate both 1607 * in the inode and the mft record. 1608 * 1609 * This is modelled after fs/buffer.c::generic_commit_write(), which marks 1610 * buffers uptodate and dirty, sets the page uptodate if all buffers in the 1611 * page are uptodate, and updates i_size if the end of io is beyond i_size. In 1612 * that case, it also marks the inode dirty. 1613 * 1614 * If things have gone as outlined in 1615 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page 1616 * content modifications here for non-resident attributes. For resident 1617 * attributes we need to do the uptodate bringing here which we combine with 1618 * the copying into the mft record which means we save one atomic kmap. 1619 * 1620 * Return 0 on success or -errno on error. 
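 *
 * For the resident case, the heart of the commit below is, in outline (a
 * simplified sketch of the code that follows, not a separate helper):
 *
 *	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
 *	kaddr = kmap_atomic(page);
 *	memcpy(kattr + pos, kaddr + pos, bytes);
 *	kunmap_atomic(kaddr);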
1621 */ 1622 static int ntfs_commit_pages_after_write(struct page **pages, 1623 const unsigned nr_pages, s64 pos, size_t bytes) 1624 { 1625 s64 end, initialized_size; 1626 loff_t i_size; 1627 struct inode *vi; 1628 ntfs_inode *ni, *base_ni; 1629 struct page *page; 1630 ntfs_attr_search_ctx *ctx; 1631 MFT_RECORD *m; 1632 ATTR_RECORD *a; 1633 char *kattr, *kaddr; 1634 unsigned long flags; 1635 u32 attr_len; 1636 int err; 1637 1638 BUG_ON(!nr_pages); 1639 BUG_ON(!pages); 1640 page = pages[0]; 1641 BUG_ON(!page); 1642 vi = page->mapping->host; 1643 ni = NTFS_I(vi); 1644 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 1645 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 1646 vi->i_ino, ni->type, page->index, nr_pages, 1647 (long long)pos, bytes); 1648 if (NInoNonResident(ni)) 1649 return ntfs_commit_pages_after_non_resident_write(pages, 1650 nr_pages, pos, bytes); 1651 BUG_ON(nr_pages > 1); 1652 /* 1653 * Attribute is resident, implying it is not compressed, encrypted, or 1654 * sparse. 1655 */ 1656 if (!NInoAttr(ni)) 1657 base_ni = ni; 1658 else 1659 base_ni = ni->ext.base_ntfs_ino; 1660 BUG_ON(NInoNonResident(ni)); 1661 /* Map, pin, and lock the mft record. */ 1662 m = map_mft_record(base_ni); 1663 if (IS_ERR(m)) { 1664 err = PTR_ERR(m); 1665 m = NULL; 1666 ctx = NULL; 1667 goto err_out; 1668 } 1669 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1670 if (unlikely(!ctx)) { 1671 err = -ENOMEM; 1672 goto err_out; 1673 } 1674 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1675 CASE_SENSITIVE, 0, NULL, 0, ctx); 1676 if (unlikely(err)) { 1677 if (err == -ENOENT) 1678 err = -EIO; 1679 goto err_out; 1680 } 1681 a = ctx->attr; 1682 BUG_ON(a->non_resident); 1683 /* The total length of the attribute value. */ 1684 attr_len = le32_to_cpu(a->data.resident.value_length); 1685 i_size = i_size_read(vi); 1686 BUG_ON(attr_len != i_size); 1687 BUG_ON(pos > attr_len); 1688 end = pos + bytes; 1689 BUG_ON(end > le32_to_cpu(a->length) - 1690 le16_to_cpu(a->data.resident.value_offset)); 1691 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 1692 kaddr = kmap_atomic(page); 1693 /* Copy the received data from the page to the mft record. */ 1694 memcpy(kattr + pos, kaddr + pos, bytes); 1695 /* Update the attribute length if necessary. */ 1696 if (end > attr_len) { 1697 attr_len = end; 1698 a->data.resident.value_length = cpu_to_le32(attr_len); 1699 } 1700 /* 1701 * If the page is not uptodate, bring the out of bounds area(s) 1702 * uptodate by copying data from the mft record to the page. 1703 */ 1704 if (!PageUptodate(page)) { 1705 if (pos > 0) 1706 memcpy(kaddr, kattr, pos); 1707 if (end < attr_len) 1708 memcpy(kaddr + end, kattr + end, attr_len - end); 1709 /* Zero the region outside the end of the attribute value. */ 1710 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); 1711 flush_dcache_page(page); 1712 SetPageUptodate(page); 1713 } 1714 kunmap_atomic(kaddr); 1715 /* Update initialized_size/i_size if necessary. */ 1716 read_lock_irqsave(&ni->size_lock, flags); 1717 initialized_size = ni->initialized_size; 1718 BUG_ON(end > ni->allocated_size); 1719 read_unlock_irqrestore(&ni->size_lock, flags); 1720 BUG_ON(initialized_size != i_size); 1721 if (end > initialized_size) { 1722 write_lock_irqsave(&ni->size_lock, flags); 1723 ni->initialized_size = end; 1724 i_size_write(vi, end); 1725 write_unlock_irqrestore(&ni->size_lock, flags); 1726 } 1727 /* Mark the mft record dirty, so it gets written back. 
*/ 1728 flush_dcache_mft_record_page(ctx->ntfs_ino); 1729 mark_mft_record_dirty(ctx->ntfs_ino); 1730 ntfs_attr_put_search_ctx(ctx); 1731 unmap_mft_record(base_ni); 1732 ntfs_debug("Done."); 1733 return 0; 1734 err_out: 1735 if (err == -ENOMEM) { 1736 ntfs_warning(vi->i_sb, "Error allocating memory required to " 1737 "commit the write."); 1738 if (PageUptodate(page)) { 1739 ntfs_warning(vi->i_sb, "Page is uptodate, setting " 1740 "dirty so the write will be retried " 1741 "later on by the VM."); 1742 /* 1743 * Put the page on mapping->dirty_pages, but leave its 1744 * buffers' dirty state as-is. 1745 */ 1746 __set_page_dirty_nobuffers(page); 1747 err = 0; 1748 } else 1749 ntfs_error(vi->i_sb, "Page is not uptodate. Written " 1750 "data has been lost."); 1751 } else { 1752 ntfs_error(vi->i_sb, "Resident attribute commit write failed " 1753 "with error %i.", err); 1754 NVolSetErrors(ni->vol); 1755 } 1756 if (ctx) 1757 ntfs_attr_put_search_ctx(ctx); 1758 if (m) 1759 unmap_mft_record(base_ni); 1760 return err; 1761 } 1762 1763 static void ntfs_write_failed(struct address_space *mapping, loff_t to) 1764 { 1765 struct inode *inode = mapping->host; 1766 1767 if (to > inode->i_size) { 1768 truncate_pagecache(inode, inode->i_size); 1769 ntfs_truncate_vfs(inode); 1770 } 1771 } 1772 1773 /** 1774 * ntfs_file_buffered_write - 1775 * 1776 * Locking: The vfs is holding ->i_mutex on the inode. 1777 */ 1778 static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, 1779 const struct iovec *iov, unsigned long nr_segs, 1780 loff_t pos, loff_t *ppos, size_t count) 1781 { 1782 struct file *file = iocb->ki_filp; 1783 struct address_space *mapping = file->f_mapping; 1784 struct inode *vi = mapping->host; 1785 ntfs_inode *ni = NTFS_I(vi); 1786 ntfs_volume *vol = ni->vol; 1787 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; 1788 struct page *cached_page = NULL; 1789 char __user *buf = NULL; 1790 s64 end, ll; 1791 VCN last_vcn; 1792 LCN lcn; 1793 unsigned long flags; 1794 size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */ 1795 ssize_t status, written; 1796 unsigned nr_pages; 1797 int err; 1798 1799 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1800 "pos 0x%llx, count 0x%lx.", 1801 vi->i_ino, (unsigned)le32_to_cpu(ni->type), 1802 (unsigned long long)pos, (unsigned long)count); 1803 if (unlikely(!count)) 1804 return 0; 1805 BUG_ON(NInoMstProtected(ni)); 1806 /* 1807 * If the attribute is not an index root and it is encrypted or 1808 * compressed, we cannot write to it yet. Note we need to check for 1809 * AT_INDEX_ALLOCATION since this is the type of both directory and 1810 * index inodes. 1811 */ 1812 if (ni->type != AT_INDEX_ALLOCATION) { 1813 /* If file is encrypted, deny access, just like NT4. */ 1814 if (NInoEncrypted(ni)) { 1815 /* 1816 * Reminder for later: Encrypted files are _always_ 1817 * non-resident so that the content can always be 1818 * encrypted. 1819 */ 1820 ntfs_debug("Denying write access to encrypted file."); 1821 return -EACCES; 1822 } 1823 if (NInoCompressed(ni)) { 1824 /* Only unnamed $DATA attribute can be compressed. */ 1825 BUG_ON(ni->type != AT_DATA); 1826 BUG_ON(ni->name_len); 1827 /* 1828 * Reminder for later: If resident, the data is not 1829 * actually compressed. Only on the switch to non- 1830 * resident does compression kick in. This is in 1831 * contrast to encrypted files (see above). 1832 */ 1833 ntfs_error(vi->i_sb, "Writing to compressed files is " 1834 "not implemented yet. 
Sorry."); 1835 return -EOPNOTSUPP; 1836 } 1837 } 1838 /* 1839 * If a previous ntfs_truncate() failed, repeat it and abort if it 1840 * fails again. 1841 */ 1842 if (unlikely(NInoTruncateFailed(ni))) { 1843 inode_dio_wait(vi); 1844 err = ntfs_truncate(vi); 1845 if (err || NInoTruncateFailed(ni)) { 1846 if (!err) 1847 err = -EIO; 1848 ntfs_error(vol->sb, "Cannot perform write to inode " 1849 "0x%lx, attribute type 0x%x, because " 1850 "ntfs_truncate() failed (error code " 1851 "%i).", vi->i_ino, 1852 (unsigned)le32_to_cpu(ni->type), err); 1853 return err; 1854 } 1855 } 1856 /* The first byte after the write. */ 1857 end = pos + count; 1858 /* 1859 * If the write goes beyond the allocated size, extend the allocation 1860 * to cover the whole of the write, rounded up to the nearest cluster. 1861 */ 1862 read_lock_irqsave(&ni->size_lock, flags); 1863 ll = ni->allocated_size; 1864 read_unlock_irqrestore(&ni->size_lock, flags); 1865 if (end > ll) { 1866 /* Extend the allocation without changing the data size. */ 1867 ll = ntfs_attr_extend_allocation(ni, end, -1, pos); 1868 if (likely(ll >= 0)) { 1869 BUG_ON(pos >= ll); 1870 /* If the extension was partial truncate the write. */ 1871 if (end > ll) { 1872 ntfs_debug("Truncating write to inode 0x%lx, " 1873 "attribute type 0x%x, because " 1874 "the allocation was only " 1875 "partially extended.", 1876 vi->i_ino, (unsigned) 1877 le32_to_cpu(ni->type)); 1878 end = ll; 1879 count = ll - pos; 1880 } 1881 } else { 1882 err = ll; 1883 read_lock_irqsave(&ni->size_lock, flags); 1884 ll = ni->allocated_size; 1885 read_unlock_irqrestore(&ni->size_lock, flags); 1886 /* Perform a partial write if possible or fail. */ 1887 if (pos < ll) { 1888 ntfs_debug("Truncating write to inode 0x%lx, " 1889 "attribute type 0x%x, because " 1890 "extending the allocation " 1891 "failed (error code %i).", 1892 vi->i_ino, (unsigned) 1893 le32_to_cpu(ni->type), err); 1894 end = ll; 1895 count = ll - pos; 1896 } else { 1897 ntfs_error(vol->sb, "Cannot perform write to " 1898 "inode 0x%lx, attribute type " 1899 "0x%x, because extending the " 1900 "allocation failed (error " 1901 "code %i).", vi->i_ino, 1902 (unsigned) 1903 le32_to_cpu(ni->type), err); 1904 return err; 1905 } 1906 } 1907 } 1908 written = 0; 1909 /* 1910 * If the write starts beyond the initialized size, extend it up to the 1911 * beginning of the write and initialize all non-sparse space between 1912 * the old initialized size and the new one. This automatically also 1913 * increments the vfs inode->i_size to keep it above or equal to the 1914 * initialized_size. 1915 */ 1916 read_lock_irqsave(&ni->size_lock, flags); 1917 ll = ni->initialized_size; 1918 read_unlock_irqrestore(&ni->size_lock, flags); 1919 if (pos > ll) { 1920 err = ntfs_attr_extend_initialized(ni, pos); 1921 if (err < 0) { 1922 ntfs_error(vol->sb, "Cannot perform write to inode " 1923 "0x%lx, attribute type 0x%x, because " 1924 "extending the initialized size " 1925 "failed (error code %i).", vi->i_ino, 1926 (unsigned)le32_to_cpu(ni->type), err); 1927 status = err; 1928 goto err_out; 1929 } 1930 } 1931 /* 1932 * Determine the number of pages per cluster for non-resident 1933 * attributes. 1934 */ 1935 nr_pages = 1; 1936 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni)) 1937 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT; 1938 /* Finally, perform the actual write. 
*/ 1939 last_vcn = -1; 1940 if (likely(nr_segs == 1)) 1941 buf = iov->iov_base; 1942 do { 1943 VCN vcn; 1944 pgoff_t idx, start_idx; 1945 unsigned ofs, do_pages, u; 1946 size_t copied; 1947 1948 start_idx = idx = pos >> PAGE_CACHE_SHIFT; 1949 ofs = pos & ~PAGE_CACHE_MASK; 1950 bytes = PAGE_CACHE_SIZE - ofs; 1951 do_pages = 1; 1952 if (nr_pages > 1) { 1953 vcn = pos >> vol->cluster_size_bits; 1954 if (vcn != last_vcn) { 1955 last_vcn = vcn; 1956 /* 1957 * Get the lcn of the vcn the write is in. If 1958 * it is a hole, need to lock down all pages in 1959 * the cluster. 1960 */ 1961 down_read(&ni->runlist.lock); 1962 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> 1963 vol->cluster_size_bits, false); 1964 up_read(&ni->runlist.lock); 1965 if (unlikely(lcn < LCN_HOLE)) { 1966 status = -EIO; 1967 if (lcn == LCN_ENOMEM) 1968 status = -ENOMEM; 1969 else 1970 ntfs_error(vol->sb, "Cannot " 1971 "perform write to " 1972 "inode 0x%lx, " 1973 "attribute type 0x%x, " 1974 "because the attribute " 1975 "is corrupt.", 1976 vi->i_ino, (unsigned) 1977 le32_to_cpu(ni->type)); 1978 break; 1979 } 1980 if (lcn == LCN_HOLE) { 1981 start_idx = (pos & ~(s64) 1982 vol->cluster_size_mask) 1983 >> PAGE_CACHE_SHIFT; 1984 bytes = vol->cluster_size - (pos & 1985 vol->cluster_size_mask); 1986 do_pages = nr_pages; 1987 } 1988 } 1989 } 1990 if (bytes > count) 1991 bytes = count; 1992 /* 1993 * Bring in the user page(s) that we will copy from _first_. 1994 * Otherwise there is a nasty deadlock on copying from the same 1995 * page(s) as we are writing to, without it/them being marked 1996 * up-to-date. Note, at present there is nothing to stop the 1997 * pages being swapped out between us bringing them into memory 1998 * and doing the actual copying. 1999 */ 2000 if (likely(nr_segs == 1)) 2001 ntfs_fault_in_pages_readable(buf, bytes); 2002 else 2003 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 2004 /* Get and lock @do_pages starting at index @start_idx. */ 2005 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 2006 pages, &cached_page); 2007 if (unlikely(status)) 2008 break; 2009 /* 2010 * For non-resident attributes, we need to fill any holes with 2011 * actual clusters and ensure all buffers are mapped. We also 2012 * need to bring uptodate any buffers that are only partially 2013 * being written to. 2014 */ 2015 if (NInoNonResident(ni)) { 2016 status = ntfs_prepare_pages_for_non_resident_write( 2017 pages, do_pages, pos, bytes); 2018 if (unlikely(status)) { 2019 loff_t i_size; 2020 2021 do { 2022 unlock_page(pages[--do_pages]); 2023 page_cache_release(pages[do_pages]); 2024 } while (do_pages); 2025 /* 2026 * The write preparation may have instantiated 2027 * allocated space outside i_size. Trim this 2028 * off again. We can ignore any errors in this 2029 * case as we will just be wasting a bit of 2030 * allocated space, which is not a disaster. 
2031 */ 2032 i_size = i_size_read(vi); 2033 if (pos + bytes > i_size) { 2034 ntfs_write_failed(mapping, pos + bytes); 2035 } 2036 break; 2037 } 2038 } 2039 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index; 2040 if (likely(nr_segs == 1)) { 2041 copied = ntfs_copy_from_user(pages + u, do_pages - u, 2042 ofs, buf, bytes); 2043 buf += copied; 2044 } else 2045 copied = ntfs_copy_from_user_iovec(pages + u, 2046 do_pages - u, ofs, &iov, &iov_ofs, 2047 bytes); 2048 ntfs_flush_dcache_pages(pages + u, do_pages - u); 2049 status = ntfs_commit_pages_after_write(pages, do_pages, pos, 2050 bytes); 2051 if (likely(!status)) { 2052 written += copied; 2053 count -= copied; 2054 pos += copied; 2055 if (unlikely(copied != bytes)) 2056 status = -EFAULT; 2057 } 2058 do { 2059 unlock_page(pages[--do_pages]); 2060 page_cache_release(pages[do_pages]); 2061 } while (do_pages); 2062 if (unlikely(status)) 2063 break; 2064 balance_dirty_pages_ratelimited(mapping); 2065 cond_resched(); 2066 } while (count); 2067 err_out: 2068 *ppos = pos; 2069 if (cached_page) 2070 page_cache_release(cached_page); 2071 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2072 written ? "written" : "status", (unsigned long)written, 2073 (long)status); 2074 return written ? written : status; 2075 } 2076 2077 /** 2078 * ntfs_file_aio_write_nolock - 2079 */ 2080 static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, 2081 const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) 2082 { 2083 struct file *file = iocb->ki_filp; 2084 struct address_space *mapping = file->f_mapping; 2085 struct inode *inode = mapping->host; 2086 loff_t pos; 2087 size_t count; /* after file limit checks */ 2088 ssize_t written, err; 2089 2090 count = iov_length(iov, nr_segs); 2091 pos = *ppos; 2092 /* We can write back this queue in page reclaim. */ 2093 current->backing_dev_info = mapping->backing_dev_info; 2094 written = 0; 2095 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2096 if (err) 2097 goto out; 2098 if (!count) 2099 goto out; 2100 err = file_remove_suid(file); 2101 if (err) 2102 goto out; 2103 err = file_update_time(file); 2104 if (err) 2105 goto out; 2106 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, 2107 count); 2108 out: 2109 current->backing_dev_info = NULL; 2110 return written ? written : err; 2111 } 2112 2113 /** 2114 * ntfs_file_aio_write - 2115 */ 2116 static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2117 unsigned long nr_segs, loff_t pos) 2118 { 2119 struct file *file = iocb->ki_filp; 2120 struct address_space *mapping = file->f_mapping; 2121 struct inode *inode = mapping->host; 2122 ssize_t ret; 2123 2124 BUG_ON(iocb->ki_pos != pos); 2125 2126 mutex_lock(&inode->i_mutex); 2127 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); 2128 mutex_unlock(&inode->i_mutex); 2129 if (ret > 0) { 2130 int err = generic_write_sync(file, iocb->ki_pos - ret, ret); 2131 if (err < 0) 2132 ret = err; 2133 } 2134 return ret; 2135 } 2136 2137 /** 2138 * ntfs_file_fsync - sync a file to disk 2139 * @filp: file to be synced 2140 * @datasync: if non-zero only flush user data and not metadata 2141 * 2142 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2143 * system calls. This function is inspired by fs/buffer.c::file_fsync(). 2144 * 2145 * If @datasync is false, write the mft record and all associated extent mft 2146 * records as well as the $DATA attribute and then sync the block device. 
2147 * 2148 * If @datasync is true and the attribute is non-resident, we skip the writing 2149 * of the mft record and all associated extent mft records (this might still 2150 * happen due to the write_inode_now() call). 2151 * 2152 * Also, if @datasync is true, we do not wait on the inode to be written out 2153 * but we always wait on the page cache pages to be written out. 2154 * 2155 * Locking: Caller must hold i_mutex on the inode. 2156 * 2157 * TODO: We should probably also write all attribute/index inodes associated 2158 * with this inode but since we have no simple way of getting to them we ignore 2159 * this problem for now. 2160 */ 2161 static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, 2162 int datasync) 2163 { 2164 struct inode *vi = filp->f_mapping->host; 2165 int err, ret = 0; 2166 2167 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2168 2169 err = filemap_write_and_wait_range(vi->i_mapping, start, end); 2170 if (err) 2171 return err; 2172 mutex_lock(&vi->i_mutex); 2173 2174 BUG_ON(S_ISDIR(vi->i_mode)); 2175 if (!datasync || !NInoNonResident(NTFS_I(vi))) 2176 ret = __ntfs_write_inode(vi, 1); 2177 write_inode_now(vi, !datasync); 2178 /* 2179 * NOTE: If we were to use mapping->private_list (see ext2 and 2180 * fs/buffer.c) for dirty blocks then we could optimize the below to be 2181 * sync_mapping_buffers(vi->i_mapping). 2182 */ 2183 err = sync_blockdev(vi->i_sb->s_bdev); 2184 if (unlikely(err && !ret)) 2185 ret = err; 2186 if (likely(!ret)) 2187 ntfs_debug("Done."); 2188 else 2189 ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " 2190 "%u.", datasync ? "data" : "", vi->i_ino, -ret); 2191 mutex_unlock(&vi->i_mutex); 2192 return ret; 2193 } 2194 2195 #endif /* NTFS_RW */ 2196 2197 const struct file_operations ntfs_file_ops = { 2198 .llseek = generic_file_llseek, /* Seek inside file. */ 2199 .read = new_sync_read, /* Read from file. */ 2200 .read_iter = generic_file_read_iter, /* Async read from file. */ 2201 #ifdef NTFS_RW 2202 .write = do_sync_write, /* Write to file. */ 2203 .aio_write = ntfs_file_aio_write, /* Async write to file. */ 2204 /*.release = ,*/ /* Last file is closed. See 2205 fs/ext2/file.c:: 2206 ext2_release_file() for 2207 how to use this to discard 2208 preallocated space for 2209 write opened files. */ 2210 .fsync = ntfs_file_fsync, /* Sync a file to disk. */ 2211 /*.aio_fsync = ,*/ /* Sync all outstanding async 2212 i/o operations on a 2213 kiocb. */ 2214 #endif /* NTFS_RW */ 2215 /*.ioctl = ,*/ /* Perform function on the 2216 mounted filesystem. */ 2217 .mmap = generic_file_mmap, /* Mmap file. */ 2218 .open = ntfs_file_open, /* Open file. */ 2219 .splice_read = generic_file_splice_read /* Zero-copy data send with 2220 the data source being on 2221 the ntfs partition. We do 2222 not need to care about the 2223 data destination. */ 2224 /*.sendpage = ,*/ /* Zero-copy data send with 2225 the data destination being 2226 on the ntfs partition. We 2227 do not need to care about 2228 the data source. */ 2229 }; 2230 2231 const struct inode_operations ntfs_file_inode_ops = { 2232 #ifdef NTFS_RW 2233 .setattr = ntfs_setattr, 2234 #endif /* NTFS_RW */ 2235 }; 2236 2237 const struct file_operations ntfs_empty_file_ops = {}; 2238 2239 const struct inode_operations ntfs_empty_inode_ops = {}; 2240
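/*
 * Illustrative sketch only, not part of the driver: the constant operation
 * tables defined above only take effect once the filesystem's inode-setup
 * path points a regular file's VFS inode at them.  For NTFS that wiring is
 * done in fs/ntfs/inode.c; the hypothetical helper below merely shows the
 * generic VFS pattern it follows, assuming @vi is a freshly initialized
 * inode for a regular file on an ntfs volume.
 */
static inline void example_install_ntfs_file_ops(struct inode *vi)
{
	/* Inode operations: with NTFS_RW this currently only adds ->setattr. */
	vi->i_op = &ntfs_file_inode_ops;
	/* File operations: llseek, the read/write paths, mmap, open, fsync. */
	vi->i_fop = &ntfs_file_ops;
}
/*
 * The empty tables (ntfs_empty_file_ops and ntfs_empty_inode_ops) can be
 * installed in the same way for inodes on which no operations should ever
 * succeed; every method pointer is then NULL and the VFS falls back to its
 * default error handling.
 */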