1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 4 * 5 * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. 6 */ 7 8 #include <linux/blkdev.h> 9 #include <linux/backing-dev.h> 10 #include <linux/buffer_head.h> 11 #include <linux/gfp.h> 12 #include <linux/pagemap.h> 13 #include <linux/pagevec.h> 14 #include <linux/sched/signal.h> 15 #include <linux/swap.h> 16 #include <linux/uio.h> 17 #include <linux/writeback.h> 18 19 #include <asm/page.h> 20 #include <linux/uaccess.h> 21 22 #include "attrib.h" 23 #include "bitmap.h" 24 #include "inode.h" 25 #include "debug.h" 26 #include "lcnalloc.h" 27 #include "malloc.h" 28 #include "mft.h" 29 #include "ntfs.h" 30 31 /** 32 * ntfs_file_open - called when an inode is about to be opened 33 * @vi: inode to be opened 34 * @filp: file structure describing the inode 35 * 36 * Limit file size to the page cache limit on architectures where unsigned long 37 * is 32-bits. This is the most we can do for now without overflowing the page 38 * cache page index. Doing it this way means we don't run into problems because 39 * of existing too large files. It would be better to allow the user to read 40 * the beginning of the file but I doubt very much anyone is going to hit this 41 * check on a 32-bit architecture, so there is no point in adding the extra 42 * complexity required to support this. 43 * 44 * On 64-bit architectures, the check is hopefully optimized away by the 45 * compiler. 46 * 47 * After the check passes, just call generic_file_open() to do its work. 48 */ 49 static int ntfs_file_open(struct inode *vi, struct file *filp) 50 { 51 if (sizeof(unsigned long) < 8) { 52 if (i_size_read(vi) > MAX_LFS_FILESIZE) 53 return -EOVERFLOW; 54 } 55 return generic_file_open(vi, filp); 56 } 57 58 #ifdef NTFS_RW 59 60 /** 61 * ntfs_attr_extend_initialized - extend the initialized size of an attribute 62 * @ni: ntfs inode of the attribute to extend 63 * @new_init_size: requested new initialized size in bytes 64 * 65 * Extend the initialized size of an attribute described by the ntfs inode @ni 66 * to @new_init_size bytes. This involves zeroing any non-sparse space between 67 * the old initialized size and @new_init_size both in the page cache and on 68 * disk (if relevant complete pages are already uptodate in the page cache then 69 * these are simply marked dirty). 70 * 71 * As a side-effect, the file size (vfs inode->i_size) may be incremented as, 72 * in the resident attribute case, it is tied to the initialized size and, in 73 * the non-resident attribute case, it may not fall below the initialized size. 74 * 75 * Note that if the attribute is resident, we do not need to touch the page 76 * cache at all. This is because if the page cache page is not uptodate we 77 * bring it uptodate later, when doing the write to the mft record since we 78 * then already have the page mapped. And if the page is uptodate, the 79 * non-initialized region will already have been zeroed when the page was 80 * brought uptodate and the region may in fact already have been overwritten 81 * with new data via mmap() based writes, so we cannot just zero it. And since 82 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped 83 * is unspecified, we choose not to do zeroing and thus we do not need to touch 84 * the page at all. For a more detailed explanation see ntfs_truncate() in 85 * fs/ntfs/inode.c. 86 * 87 * Return 0 on success and -errno on error. 
In the case that an error is
 * encountered it is possible that the initialized size will already have been
 * incremented some way towards @new_init_size but it is guaranteed that if
 * this is the case, the necessary zeroing will also have happened and that all
 * metadata is self-consistent.
 *
 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
 * held by the caller.
 */
static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
{
	s64 old_init_size;
	loff_t old_i_size;
	pgoff_t index, end_index;
	unsigned long flags;
	struct inode *vi = VFS_I(ni);
	ntfs_inode *base_ni;
	MFT_RECORD *m = NULL;
	ATTR_RECORD *a;
	ntfs_attr_search_ctx *ctx = NULL;
	struct address_space *mapping;
	struct page *page = NULL;
	u8 *kattr;
	int err;
	u32 attr_len;

	read_lock_irqsave(&ni->size_lock, flags);
	old_init_size = ni->initialized_size;
	old_i_size = i_size_read(vi);
	BUG_ON(new_init_size > ni->allocated_size);
	read_unlock_irqrestore(&ni->size_lock, flags);
	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
			"old_initialized_size 0x%llx, "
			"new_initialized_size 0x%llx, i_size 0x%llx.",
			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
			(unsigned long long)old_init_size,
			(unsigned long long)new_init_size, old_i_size);
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Use goto to reduce indentation and we need the label below anyway. */
	if (NInoNonResident(ni))
		goto do_non_resident_extend;
	BUG_ON(old_init_size != old_i_size);
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		goto err_out;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		if (err == -ENOENT)
			err = -EIO;
		goto err_out;
	}
	m = ctx->mrec;
	a = ctx->attr;
	BUG_ON(a->non_resident);
	/* The total length of the attribute value. */
	attr_len = le32_to_cpu(a->data.resident.value_length);
	BUG_ON(old_i_size != (loff_t)attr_len);
	/*
	 * Do the zeroing in the mft record and update the attribute size in
	 * the mft record.
	 */
	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
	memset(kattr + attr_len, 0, new_init_size - attr_len);
	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
	/* Finally, update the sizes in the vfs and ntfs inodes. */
	write_lock_irqsave(&ni->size_lock, flags);
	i_size_write(vi, new_init_size);
	ni->initialized_size = new_init_size;
	write_unlock_irqrestore(&ni->size_lock, flags);
	goto done;
do_non_resident_extend:
	/*
	 * If the new initialized size @new_init_size exceeds the current file
	 * size (vfs inode->i_size), we need to extend the file size to the
	 * new initialized size.
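	 *
	 * The update below keeps the usual ordering of the three sizes
	 * tracked in the ntfs inode, i.e. (in the terms used throughout this
	 * file):
	 *
	 *	initialized_size <= data_size (i_size) <= allocated_size
	 *
	 * As a worked example: extending initialized_size from 0x1000 to
	 * 0x5000 on an attribute whose data_size is 0x3000 must also push
	 * data_size/i_size up to 0x5000, while allocated_size (already >=
	 * @new_init_size, see the BUG_ON() near the top of this function) is
	 * left untouched.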
174 */ 175 if (new_init_size > old_i_size) { 176 m = map_mft_record(base_ni); 177 if (IS_ERR(m)) { 178 err = PTR_ERR(m); 179 m = NULL; 180 goto err_out; 181 } 182 ctx = ntfs_attr_get_search_ctx(base_ni, m); 183 if (unlikely(!ctx)) { 184 err = -ENOMEM; 185 goto err_out; 186 } 187 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 188 CASE_SENSITIVE, 0, NULL, 0, ctx); 189 if (unlikely(err)) { 190 if (err == -ENOENT) 191 err = -EIO; 192 goto err_out; 193 } 194 m = ctx->mrec; 195 a = ctx->attr; 196 BUG_ON(!a->non_resident); 197 BUG_ON(old_i_size != (loff_t) 198 sle64_to_cpu(a->data.non_resident.data_size)); 199 a->data.non_resident.data_size = cpu_to_sle64(new_init_size); 200 flush_dcache_mft_record_page(ctx->ntfs_ino); 201 mark_mft_record_dirty(ctx->ntfs_ino); 202 /* Update the file size in the vfs inode. */ 203 i_size_write(vi, new_init_size); 204 ntfs_attr_put_search_ctx(ctx); 205 ctx = NULL; 206 unmap_mft_record(base_ni); 207 m = NULL; 208 } 209 mapping = vi->i_mapping; 210 index = old_init_size >> PAGE_SHIFT; 211 end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 212 do { 213 /* 214 * Read the page. If the page is not present, this will zero 215 * the uninitialized regions for us. 216 */ 217 page = read_mapping_page(mapping, index, NULL); 218 if (IS_ERR(page)) { 219 err = PTR_ERR(page); 220 goto init_err_out; 221 } 222 if (unlikely(PageError(page))) { 223 put_page(page); 224 err = -EIO; 225 goto init_err_out; 226 } 227 /* 228 * Update the initialized size in the ntfs inode. This is 229 * enough to make ntfs_writepage() work. 230 */ 231 write_lock_irqsave(&ni->size_lock, flags); 232 ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; 233 if (ni->initialized_size > new_init_size) 234 ni->initialized_size = new_init_size; 235 write_unlock_irqrestore(&ni->size_lock, flags); 236 /* Set the page dirty so it gets written out. */ 237 set_page_dirty(page); 238 put_page(page); 239 /* 240 * Play nice with the vm and the rest of the system. This is 241 * very much needed as we can potentially be modifying the 242 * initialised size from a very small value to a really huge 243 * value, e.g. 244 * f = open(somefile, O_TRUNC); 245 * truncate(f, 10GiB); 246 * seek(f, 10GiB); 247 * write(f, 1); 248 * And this would mean we would be marking dirty hundreds of 249 * thousands of pages or as in the above example more than 250 * two and a half million pages! 251 * 252 * TODO: For sparse pages could optimize this workload by using 253 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This 254 * would be set in read_folio for sparse pages and here we would 255 * not need to mark dirty any pages which have this bit set. 256 * The only caveat is that we have to clear the bit everywhere 257 * where we allocate any clusters that lie in the page or that 258 * contain the page. 259 * 260 * TODO: An even greater optimization would be for us to only 261 * call read_folio() on pages which are not in sparse regions as 262 * determined from the runlist. This would greatly reduce the 263 * number of pages we read and make dirty in the case of sparse 264 * files. 265 */ 266 balance_dirty_pages_ratelimited(mapping); 267 cond_resched(); 268 } while (++index < end_index); 269 read_lock_irqsave(&ni->size_lock, flags); 270 BUG_ON(ni->initialized_size != new_init_size); 271 read_unlock_irqrestore(&ni->size_lock, flags); 272 /* Now bring in sync the initialized_size in the mft record. 
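	 *
	 * What follows is one instance of the attribute update pattern used
	 * throughout this file; stripped of error handling it is, in outline:
	 *
	 *	m = map_mft_record(base_ni);
	 *	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	 *	ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
	 *			CASE_SENSITIVE, 0, NULL, 0, ctx);
	 *	... modify ctx->attr ...
	 *	flush_dcache_mft_record_page(ctx->ntfs_ino);
	 *	mark_mft_record_dirty(ctx->ntfs_ino);
	 *	ntfs_attr_put_search_ctx(ctx);
	 *	unmap_mft_record(base_ni);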
*/ 273 m = map_mft_record(base_ni); 274 if (IS_ERR(m)) { 275 err = PTR_ERR(m); 276 m = NULL; 277 goto init_err_out; 278 } 279 ctx = ntfs_attr_get_search_ctx(base_ni, m); 280 if (unlikely(!ctx)) { 281 err = -ENOMEM; 282 goto init_err_out; 283 } 284 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 285 CASE_SENSITIVE, 0, NULL, 0, ctx); 286 if (unlikely(err)) { 287 if (err == -ENOENT) 288 err = -EIO; 289 goto init_err_out; 290 } 291 m = ctx->mrec; 292 a = ctx->attr; 293 BUG_ON(!a->non_resident); 294 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); 295 done: 296 flush_dcache_mft_record_page(ctx->ntfs_ino); 297 mark_mft_record_dirty(ctx->ntfs_ino); 298 if (ctx) 299 ntfs_attr_put_search_ctx(ctx); 300 if (m) 301 unmap_mft_record(base_ni); 302 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", 303 (unsigned long long)new_init_size, i_size_read(vi)); 304 return 0; 305 init_err_out: 306 write_lock_irqsave(&ni->size_lock, flags); 307 ni->initialized_size = old_init_size; 308 write_unlock_irqrestore(&ni->size_lock, flags); 309 err_out: 310 if (ctx) 311 ntfs_attr_put_search_ctx(ctx); 312 if (m) 313 unmap_mft_record(base_ni); 314 ntfs_debug("Failed. Returning error code %i.", err); 315 return err; 316 } 317 318 static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, 319 struct iov_iter *from) 320 { 321 loff_t pos; 322 s64 end, ll; 323 ssize_t err; 324 unsigned long flags; 325 struct file *file = iocb->ki_filp; 326 struct inode *vi = file_inode(file); 327 ntfs_inode *ni = NTFS_I(vi); 328 ntfs_volume *vol = ni->vol; 329 330 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 331 "0x%llx, count 0x%zx.", vi->i_ino, 332 (unsigned)le32_to_cpu(ni->type), 333 (unsigned long long)iocb->ki_pos, 334 iov_iter_count(from)); 335 err = generic_write_checks(iocb, from); 336 if (unlikely(err <= 0)) 337 goto out; 338 /* 339 * All checks have passed. Before we start doing any writing we want 340 * to abort any totally illegal writes. 341 */ 342 BUG_ON(NInoMstProtected(ni)); 343 BUG_ON(ni->type != AT_DATA); 344 /* If file is encrypted, deny access, just like NT4. */ 345 if (NInoEncrypted(ni)) { 346 /* Only $DATA attributes can be encrypted. */ 347 /* 348 * Reminder for later: Encrypted files are _always_ 349 * non-resident so that the content can always be encrypted. 350 */ 351 ntfs_debug("Denying write access to encrypted file."); 352 err = -EACCES; 353 goto out; 354 } 355 if (NInoCompressed(ni)) { 356 /* Only unnamed $DATA attribute can be compressed. */ 357 BUG_ON(ni->name_len); 358 /* 359 * Reminder for later: If resident, the data is not actually 360 * compressed. Only on the switch to non-resident does 361 * compression kick in. This is in contrast to encrypted files 362 * (see above). 363 */ 364 ntfs_error(vi->i_sb, "Writing to compressed files is not " 365 "implemented yet. Sorry."); 366 err = -EOPNOTSUPP; 367 goto out; 368 } 369 err = file_remove_privs(file); 370 if (unlikely(err)) 371 goto out; 372 /* 373 * Our ->update_time method always succeeds thus file_update_time() 374 * cannot fail either so there is no need to check the return code. 375 */ 376 file_update_time(file); 377 pos = iocb->ki_pos; 378 /* The first byte after the last cluster being written to. */ 379 end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & 380 ~(u64)vol->cluster_size_mask; 381 /* 382 * If the write goes beyond the allocated size, extend the allocation 383 * to cover the whole of the write, rounded up to the nearest cluster. 
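	 *
	 * As a worked example of the rounding above (assuming a 4096 byte
	 * cluster size, i.e. cluster_size_mask == 0xfff): pos == 0x1f00 and a
	 * 0x300 byte write give
	 *
	 *	end = (0x1f00 + 0x300 + 0xfff) & ~0xfff = 0x3000
	 *
	 * i.e. the end of the write (0x2200) rounded up to the next cluster
	 * boundary.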
384 */ 385 read_lock_irqsave(&ni->size_lock, flags); 386 ll = ni->allocated_size; 387 read_unlock_irqrestore(&ni->size_lock, flags); 388 if (end > ll) { 389 /* 390 * Extend the allocation without changing the data size. 391 * 392 * Note we ensure the allocation is big enough to at least 393 * write some data but we do not require the allocation to be 394 * complete, i.e. it may be partial. 395 */ 396 ll = ntfs_attr_extend_allocation(ni, end, -1, pos); 397 if (likely(ll >= 0)) { 398 BUG_ON(pos >= ll); 399 /* If the extension was partial truncate the write. */ 400 if (end > ll) { 401 ntfs_debug("Truncating write to inode 0x%lx, " 402 "attribute type 0x%x, because " 403 "the allocation was only " 404 "partially extended.", 405 vi->i_ino, (unsigned) 406 le32_to_cpu(ni->type)); 407 iov_iter_truncate(from, ll - pos); 408 } 409 } else { 410 err = ll; 411 read_lock_irqsave(&ni->size_lock, flags); 412 ll = ni->allocated_size; 413 read_unlock_irqrestore(&ni->size_lock, flags); 414 /* Perform a partial write if possible or fail. */ 415 if (pos < ll) { 416 ntfs_debug("Truncating write to inode 0x%lx " 417 "attribute type 0x%x, because " 418 "extending the allocation " 419 "failed (error %d).", 420 vi->i_ino, (unsigned) 421 le32_to_cpu(ni->type), 422 (int)-err); 423 iov_iter_truncate(from, ll - pos); 424 } else { 425 if (err != -ENOSPC) 426 ntfs_error(vi->i_sb, "Cannot perform " 427 "write to inode " 428 "0x%lx, attribute " 429 "type 0x%x, because " 430 "extending the " 431 "allocation failed " 432 "(error %ld).", 433 vi->i_ino, (unsigned) 434 le32_to_cpu(ni->type), 435 (long)-err); 436 else 437 ntfs_debug("Cannot perform write to " 438 "inode 0x%lx, " 439 "attribute type 0x%x, " 440 "because there is not " 441 "space left.", 442 vi->i_ino, (unsigned) 443 le32_to_cpu(ni->type)); 444 goto out; 445 } 446 } 447 } 448 /* 449 * If the write starts beyond the initialized size, extend it up to the 450 * beginning of the write and initialize all non-sparse space between 451 * the old initialized size and the new one. This automatically also 452 * increments the vfs inode->i_size to keep it above or equal to the 453 * initialized_size. 454 */ 455 read_lock_irqsave(&ni->size_lock, flags); 456 ll = ni->initialized_size; 457 read_unlock_irqrestore(&ni->size_lock, flags); 458 if (pos > ll) { 459 /* 460 * Wait for ongoing direct i/o to complete before proceeding. 461 * New direct i/o cannot start as we hold i_mutex. 462 */ 463 inode_dio_wait(vi); 464 err = ntfs_attr_extend_initialized(ni, pos); 465 if (unlikely(err < 0)) 466 ntfs_error(vi->i_sb, "Cannot perform write to inode " 467 "0x%lx, attribute type 0x%x, because " 468 "extending the initialized size " 469 "failed (error %d).", vi->i_ino, 470 (unsigned)le32_to_cpu(ni->type), 471 (int)-err); 472 } 473 out: 474 return err; 475 } 476 477 /** 478 * __ntfs_grab_cache_pages - obtain a number of locked pages 479 * @mapping: address space mapping from which to obtain page cache pages 480 * @index: starting index in @mapping at which to begin obtaining pages 481 * @nr_pages: number of page cache pages to obtain 482 * @pages: array of pages in which to return the obtained page cache pages 483 * @cached_page: allocated but as yet unused page 484 * 485 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 486 * starting at index @index. 487 * 488 * If a page is newly created, add it to lru list 489 * 490 * Note, the page locks are obtained in ascending page index order. 
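 *
 * A minimal usage sketch, modelled on the caller ntfs_perform_write() below
 * (error checking of the call itself omitted):
 *
 *	struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
 *	struct page *cached_page = NULL;
 *
 *	err = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, pages,
 *			&cached_page);
 *	... copy data into the locked pages ...
 *	do {
 *		unlock_page(pages[--do_pages]);
 *		put_page(pages[do_pages]);
 *	} while (do_pages);
 *	if (cached_page)
 *		put_page(cached_page);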
491 */ 492 static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 493 pgoff_t index, const unsigned nr_pages, struct page **pages, 494 struct page **cached_page) 495 { 496 int err, nr; 497 498 BUG_ON(!nr_pages); 499 err = nr = 0; 500 do { 501 pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | 502 FGP_ACCESSED); 503 if (!pages[nr]) { 504 if (!*cached_page) { 505 *cached_page = page_cache_alloc(mapping); 506 if (unlikely(!*cached_page)) { 507 err = -ENOMEM; 508 goto err_out; 509 } 510 } 511 err = add_to_page_cache_lru(*cached_page, mapping, 512 index, 513 mapping_gfp_constraint(mapping, GFP_KERNEL)); 514 if (unlikely(err)) { 515 if (err == -EEXIST) 516 continue; 517 goto err_out; 518 } 519 pages[nr] = *cached_page; 520 *cached_page = NULL; 521 } 522 index++; 523 nr++; 524 } while (nr < nr_pages); 525 out: 526 return err; 527 err_out: 528 while (nr > 0) { 529 unlock_page(pages[--nr]); 530 put_page(pages[nr]); 531 } 532 goto out; 533 } 534 535 static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) 536 { 537 lock_buffer(bh); 538 get_bh(bh); 539 bh->b_end_io = end_buffer_read_sync; 540 return submit_bh(REQ_OP_READ, 0, bh); 541 } 542 543 /** 544 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data 545 * @pages: array of destination pages 546 * @nr_pages: number of pages in @pages 547 * @pos: byte position in file at which the write begins 548 * @bytes: number of bytes to be written 549 * 550 * This is called for non-resident attributes from ntfs_file_buffered_write() 551 * with i_mutex held on the inode (@pages[0]->mapping->host). There are 552 * @nr_pages pages in @pages which are locked but not kmap()ped. The source 553 * data has not yet been copied into the @pages. 554 * 555 * Need to fill any holes with actual clusters, allocate buffers if necessary, 556 * ensure all the buffers are mapped, and bring uptodate any buffers that are 557 * only partially being written to. 558 * 559 * If @nr_pages is greater than one, we are guaranteed that the cluster size is 560 * greater than PAGE_SIZE, that all pages in @pages are entirely inside 561 * the same cluster and that they are the entirety of that cluster, and that 562 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. 563 * 564 * i_size is not to be modified yet. 565 * 566 * Return 0 on success or -errno on error. 
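 *
 * For example, assuming a 64kiB cluster size and 4kiB pages,
 * ntfs_perform_write() below passes nr_pages = vol->cluster_size >>
 * PAGE_SHIFT = 16, i.e. all sixteen pages making up the single sparse
 * cluster the write falls into, so the hole can be filled in one go.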
567 */ 568 static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, 569 unsigned nr_pages, s64 pos, size_t bytes) 570 { 571 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; 572 LCN lcn; 573 s64 bh_pos, vcn_len, end, initialized_size; 574 sector_t lcn_block; 575 struct page *page; 576 struct inode *vi; 577 ntfs_inode *ni, *base_ni = NULL; 578 ntfs_volume *vol; 579 runlist_element *rl, *rl2; 580 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; 581 ntfs_attr_search_ctx *ctx = NULL; 582 MFT_RECORD *m = NULL; 583 ATTR_RECORD *a = NULL; 584 unsigned long flags; 585 u32 attr_rec_len = 0; 586 unsigned blocksize, u; 587 int err, mp_size; 588 bool rl_write_locked, was_hole, is_retry; 589 unsigned char blocksize_bits; 590 struct { 591 u8 runlist_merged:1; 592 u8 mft_attr_mapped:1; 593 u8 mp_rebuilt:1; 594 u8 attr_switched:1; 595 } status = { 0, 0, 0, 0 }; 596 597 BUG_ON(!nr_pages); 598 BUG_ON(!pages); 599 BUG_ON(!*pages); 600 vi = pages[0]->mapping->host; 601 ni = NTFS_I(vi); 602 vol = ni->vol; 603 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 604 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 605 vi->i_ino, ni->type, pages[0]->index, nr_pages, 606 (long long)pos, bytes); 607 blocksize = vol->sb->s_blocksize; 608 blocksize_bits = vol->sb->s_blocksize_bits; 609 u = 0; 610 do { 611 page = pages[u]; 612 BUG_ON(!page); 613 /* 614 * create_empty_buffers() will create uptodate/dirty buffers if 615 * the page is uptodate/dirty. 616 */ 617 if (!page_has_buffers(page)) { 618 create_empty_buffers(page, blocksize, 0); 619 if (unlikely(!page_has_buffers(page))) 620 return -ENOMEM; 621 } 622 } while (++u < nr_pages); 623 rl_write_locked = false; 624 rl = NULL; 625 err = 0; 626 vcn = lcn = -1; 627 vcn_len = 0; 628 lcn_block = -1; 629 was_hole = false; 630 cpos = pos >> vol->cluster_size_bits; 631 end = pos + bytes; 632 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; 633 /* 634 * Loop over each page and for each page over each buffer. Use goto to 635 * reduce indentation. 636 */ 637 u = 0; 638 do_next_page: 639 page = pages[u]; 640 bh_pos = (s64)page->index << PAGE_SHIFT; 641 bh = head = page_buffers(page); 642 do { 643 VCN cdelta; 644 s64 bh_end; 645 unsigned bh_cofs; 646 647 /* Clear buffer_new on all buffers to reinitialise state. */ 648 if (buffer_new(bh)) 649 clear_buffer_new(bh); 650 bh_end = bh_pos + blocksize; 651 bh_cpos = bh_pos >> vol->cluster_size_bits; 652 bh_cofs = bh_pos & vol->cluster_size_mask; 653 if (buffer_mapped(bh)) { 654 /* 655 * The buffer is already mapped. If it is uptodate, 656 * ignore it. 657 */ 658 if (buffer_uptodate(bh)) 659 continue; 660 /* 661 * The buffer is not uptodate. If the page is uptodate 662 * set the buffer uptodate and otherwise ignore it. 663 */ 664 if (PageUptodate(page)) { 665 set_buffer_uptodate(bh); 666 continue; 667 } 668 /* 669 * Neither the page nor the buffer are uptodate. If 670 * the buffer is only partially being written to, we 671 * need to read it in before the write, i.e. now. 672 */ 673 if ((bh_pos < pos && bh_end > pos) || 674 (bh_pos < end && bh_end > end)) { 675 /* 676 * If the buffer is fully or partially within 677 * the initialized size, do an actual read. 678 * Otherwise, simply zero the buffer. 
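			 *
			 * As a concrete example of the condition checked
			 * above (assuming a 512 byte block size): for a write
			 * with pos == 0x100 and end == 0x300, the buffer
			 * covering [0x000, 0x200) straddles the start of the
			 * write and the buffer covering [0x200, 0x400)
			 * straddles its end, so both get read in here (when
			 * below the initialized size); a buffer lying
			 * entirely inside [pos, end) is about to be
			 * overwritten and is never read.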
679 */ 680 read_lock_irqsave(&ni->size_lock, flags); 681 initialized_size = ni->initialized_size; 682 read_unlock_irqrestore(&ni->size_lock, flags); 683 if (bh_pos < initialized_size) { 684 ntfs_submit_bh_for_read(bh); 685 *wait_bh++ = bh; 686 } else { 687 zero_user(page, bh_offset(bh), 688 blocksize); 689 set_buffer_uptodate(bh); 690 } 691 } 692 continue; 693 } 694 /* Unmapped buffer. Need to map it. */ 695 bh->b_bdev = vol->sb->s_bdev; 696 /* 697 * If the current buffer is in the same clusters as the map 698 * cache, there is no need to check the runlist again. The 699 * map cache is made up of @vcn, which is the first cached file 700 * cluster, @vcn_len which is the number of cached file 701 * clusters, @lcn is the device cluster corresponding to @vcn, 702 * and @lcn_block is the block number corresponding to @lcn. 703 */ 704 cdelta = bh_cpos - vcn; 705 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { 706 map_buffer_cached: 707 BUG_ON(lcn < 0); 708 bh->b_blocknr = lcn_block + 709 (cdelta << (vol->cluster_size_bits - 710 blocksize_bits)) + 711 (bh_cofs >> blocksize_bits); 712 set_buffer_mapped(bh); 713 /* 714 * If the page is uptodate so is the buffer. If the 715 * buffer is fully outside the write, we ignore it if 716 * it was already allocated and we mark it dirty so it 717 * gets written out if we allocated it. On the other 718 * hand, if we allocated the buffer but we are not 719 * marking it dirty we set buffer_new so we can do 720 * error recovery. 721 */ 722 if (PageUptodate(page)) { 723 if (!buffer_uptodate(bh)) 724 set_buffer_uptodate(bh); 725 if (unlikely(was_hole)) { 726 /* We allocated the buffer. */ 727 clean_bdev_bh_alias(bh); 728 if (bh_end <= pos || bh_pos >= end) 729 mark_buffer_dirty(bh); 730 else 731 set_buffer_new(bh); 732 } 733 continue; 734 } 735 /* Page is _not_ uptodate. */ 736 if (likely(!was_hole)) { 737 /* 738 * Buffer was already allocated. If it is not 739 * uptodate and is only partially being written 740 * to, we need to read it in before the write, 741 * i.e. now. 742 */ 743 if (!buffer_uptodate(bh) && bh_pos < end && 744 bh_end > pos && 745 (bh_pos < pos || 746 bh_end > end)) { 747 /* 748 * If the buffer is fully or partially 749 * within the initialized size, do an 750 * actual read. Otherwise, simply zero 751 * the buffer. 752 */ 753 read_lock_irqsave(&ni->size_lock, 754 flags); 755 initialized_size = ni->initialized_size; 756 read_unlock_irqrestore(&ni->size_lock, 757 flags); 758 if (bh_pos < initialized_size) { 759 ntfs_submit_bh_for_read(bh); 760 *wait_bh++ = bh; 761 } else { 762 zero_user(page, bh_offset(bh), 763 blocksize); 764 set_buffer_uptodate(bh); 765 } 766 } 767 continue; 768 } 769 /* We allocated the buffer. */ 770 clean_bdev_bh_alias(bh); 771 /* 772 * If the buffer is fully outside the write, zero it, 773 * set it uptodate, and mark it dirty so it gets 774 * written out. If it is partially being written to, 775 * zero region surrounding the write but leave it to 776 * commit write to do anything else. Finally, if the 777 * buffer is fully being overwritten, do nothing. 
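			 *
			 * For instance (assuming 512 byte blocks and a write
			 * with pos == 0x100 and end == 0x900 inside one
			 * page): the buffer at [0x000, 0x200) has only
			 * [0x000, 0x100) zeroed, buffers lying entirely
			 * inside the write are not zeroed at all as they are
			 * about to be overwritten, the buffer at
			 * [0x800, 0xa00) has [0x900, 0xa00) zeroed, and
			 * buffers from 0xa00 onwards are zeroed in full, set
			 * uptodate, and marked dirty.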
778 */ 779 if (bh_end <= pos || bh_pos >= end) { 780 if (!buffer_uptodate(bh)) { 781 zero_user(page, bh_offset(bh), 782 blocksize); 783 set_buffer_uptodate(bh); 784 } 785 mark_buffer_dirty(bh); 786 continue; 787 } 788 set_buffer_new(bh); 789 if (!buffer_uptodate(bh) && 790 (bh_pos < pos || bh_end > end)) { 791 u8 *kaddr; 792 unsigned pofs; 793 794 kaddr = kmap_atomic(page); 795 if (bh_pos < pos) { 796 pofs = bh_pos & ~PAGE_MASK; 797 memset(kaddr + pofs, 0, pos - bh_pos); 798 } 799 if (bh_end > end) { 800 pofs = end & ~PAGE_MASK; 801 memset(kaddr + pofs, 0, bh_end - end); 802 } 803 kunmap_atomic(kaddr); 804 flush_dcache_page(page); 805 } 806 continue; 807 } 808 /* 809 * Slow path: this is the first buffer in the cluster. If it 810 * is outside allocated size and is not uptodate, zero it and 811 * set it uptodate. 812 */ 813 read_lock_irqsave(&ni->size_lock, flags); 814 initialized_size = ni->allocated_size; 815 read_unlock_irqrestore(&ni->size_lock, flags); 816 if (bh_pos > initialized_size) { 817 if (PageUptodate(page)) { 818 if (!buffer_uptodate(bh)) 819 set_buffer_uptodate(bh); 820 } else if (!buffer_uptodate(bh)) { 821 zero_user(page, bh_offset(bh), blocksize); 822 set_buffer_uptodate(bh); 823 } 824 continue; 825 } 826 is_retry = false; 827 if (!rl) { 828 down_read(&ni->runlist.lock); 829 retry_remap: 830 rl = ni->runlist.rl; 831 } 832 if (likely(rl != NULL)) { 833 /* Seek to element containing target cluster. */ 834 while (rl->length && rl[1].vcn <= bh_cpos) 835 rl++; 836 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); 837 if (likely(lcn >= 0)) { 838 /* 839 * Successful remap, setup the map cache and 840 * use that to deal with the buffer. 841 */ 842 was_hole = false; 843 vcn = bh_cpos; 844 vcn_len = rl[1].vcn - vcn; 845 lcn_block = lcn << (vol->cluster_size_bits - 846 blocksize_bits); 847 cdelta = 0; 848 /* 849 * If the number of remaining clusters touched 850 * by the write is smaller or equal to the 851 * number of cached clusters, unlock the 852 * runlist as the map cache will be used from 853 * now on. 854 */ 855 if (likely(vcn + vcn_len >= cend)) { 856 if (rl_write_locked) { 857 up_write(&ni->runlist.lock); 858 rl_write_locked = false; 859 } else 860 up_read(&ni->runlist.lock); 861 rl = NULL; 862 } 863 goto map_buffer_cached; 864 } 865 } else 866 lcn = LCN_RL_NOT_MAPPED; 867 /* 868 * If it is not a hole and not out of bounds, the runlist is 869 * probably unmapped so try to map it now. 870 */ 871 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { 872 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { 873 /* Attempt to map runlist. */ 874 if (!rl_write_locked) { 875 /* 876 * We need the runlist locked for 877 * writing, so if it is locked for 878 * reading relock it now and retry in 879 * case it changed whilst we dropped 880 * the lock. 881 */ 882 up_read(&ni->runlist.lock); 883 down_write(&ni->runlist.lock); 884 rl_write_locked = true; 885 goto retry_remap; 886 } 887 err = ntfs_map_runlist_nolock(ni, bh_cpos, 888 NULL); 889 if (likely(!err)) { 890 is_retry = true; 891 goto retry_remap; 892 } 893 /* 894 * If @vcn is out of bounds, pretend @lcn is 895 * LCN_ENOENT. As long as the buffer is out 896 * of bounds this will work fine. 897 */ 898 if (err == -ENOENT) { 899 lcn = LCN_ENOENT; 900 err = 0; 901 goto rl_not_mapped_enoent; 902 } 903 } else 904 err = -EIO; 905 /* Failed to map the buffer, even after retrying. 
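		 *
		 * (The negative lcn values handled above have, as used
		 * elsewhere in this function, the following meanings:
		 * LCN_HOLE is a sparse cluster which is allocated further
		 * below, LCN_RL_NOT_MAPPED is a runlist fragment that has not
		 * been read in yet and is dealt with via
		 * ntfs_map_runlist_nolock(), and LCN_ENOENT means the vcn
		 * lies beyond the end of the attribute.)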
*/ 906 bh->b_blocknr = -1; 907 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " 908 "attribute type 0x%x, vcn 0x%llx, " 909 "vcn offset 0x%x, because its " 910 "location on disk could not be " 911 "determined%s (error code %i).", 912 ni->mft_no, ni->type, 913 (unsigned long long)bh_cpos, 914 (unsigned)bh_pos & 915 vol->cluster_size_mask, 916 is_retry ? " even after retrying" : "", 917 err); 918 break; 919 } 920 rl_not_mapped_enoent: 921 /* 922 * The buffer is in a hole or out of bounds. We need to fill 923 * the hole, unless the buffer is in a cluster which is not 924 * touched by the write, in which case we just leave the buffer 925 * unmapped. This can only happen when the cluster size is 926 * less than the page cache size. 927 */ 928 if (unlikely(vol->cluster_size < PAGE_SIZE)) { 929 bh_cend = (bh_end + vol->cluster_size - 1) >> 930 vol->cluster_size_bits; 931 if ((bh_cend <= cpos || bh_cpos >= cend)) { 932 bh->b_blocknr = -1; 933 /* 934 * If the buffer is uptodate we skip it. If it 935 * is not but the page is uptodate, we can set 936 * the buffer uptodate. If the page is not 937 * uptodate, we can clear the buffer and set it 938 * uptodate. Whether this is worthwhile is 939 * debatable and this could be removed. 940 */ 941 if (PageUptodate(page)) { 942 if (!buffer_uptodate(bh)) 943 set_buffer_uptodate(bh); 944 } else if (!buffer_uptodate(bh)) { 945 zero_user(page, bh_offset(bh), 946 blocksize); 947 set_buffer_uptodate(bh); 948 } 949 continue; 950 } 951 } 952 /* 953 * Out of bounds buffer is invalid if it was not really out of 954 * bounds. 955 */ 956 BUG_ON(lcn != LCN_HOLE); 957 /* 958 * We need the runlist locked for writing, so if it is locked 959 * for reading relock it now and retry in case it changed 960 * whilst we dropped the lock. 961 */ 962 BUG_ON(!rl); 963 if (!rl_write_locked) { 964 up_read(&ni->runlist.lock); 965 down_write(&ni->runlist.lock); 966 rl_write_locked = true; 967 goto retry_remap; 968 } 969 /* Find the previous last allocated cluster. */ 970 BUG_ON(rl->lcn != LCN_HOLE); 971 lcn = -1; 972 rl2 = rl; 973 while (--rl2 >= ni->runlist.rl) { 974 if (rl2->lcn >= 0) { 975 lcn = rl2->lcn + rl2->length; 976 break; 977 } 978 } 979 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, 980 false); 981 if (IS_ERR(rl2)) { 982 err = PTR_ERR(rl2); 983 ntfs_debug("Failed to allocate cluster, error code %i.", 984 err); 985 break; 986 } 987 lcn = rl2->lcn; 988 rl = ntfs_runlists_merge(ni->runlist.rl, rl2); 989 if (IS_ERR(rl)) { 990 err = PTR_ERR(rl); 991 if (err != -ENOMEM) 992 err = -EIO; 993 if (ntfs_cluster_free_from_rl(vol, rl2)) { 994 ntfs_error(vol->sb, "Failed to release " 995 "allocated cluster in error " 996 "code path. Run chkdsk to " 997 "recover the lost cluster."); 998 NVolSetErrors(vol); 999 } 1000 ntfs_free(rl2); 1001 break; 1002 } 1003 ni->runlist.rl = rl; 1004 status.runlist_merged = 1; 1005 ntfs_debug("Allocated cluster, lcn 0x%llx.", 1006 (unsigned long long)lcn); 1007 /* Map and lock the mft record and get the attribute record. 
*/ 1008 if (!NInoAttr(ni)) 1009 base_ni = ni; 1010 else 1011 base_ni = ni->ext.base_ntfs_ino; 1012 m = map_mft_record(base_ni); 1013 if (IS_ERR(m)) { 1014 err = PTR_ERR(m); 1015 break; 1016 } 1017 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1018 if (unlikely(!ctx)) { 1019 err = -ENOMEM; 1020 unmap_mft_record(base_ni); 1021 break; 1022 } 1023 status.mft_attr_mapped = 1; 1024 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1025 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); 1026 if (unlikely(err)) { 1027 if (err == -ENOENT) 1028 err = -EIO; 1029 break; 1030 } 1031 m = ctx->mrec; 1032 a = ctx->attr; 1033 /* 1034 * Find the runlist element with which the attribute extent 1035 * starts. Note, we cannot use the _attr_ version because we 1036 * have mapped the mft record. That is ok because we know the 1037 * runlist fragment must be mapped already to have ever gotten 1038 * here, so we can just use the _rl_ version. 1039 */ 1040 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); 1041 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); 1042 BUG_ON(!rl2); 1043 BUG_ON(!rl2->length); 1044 BUG_ON(rl2->lcn < LCN_HOLE); 1045 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); 1046 /* 1047 * If @highest_vcn is zero, calculate the real highest_vcn 1048 * (which can really be zero). 1049 */ 1050 if (!highest_vcn) 1051 highest_vcn = (sle64_to_cpu( 1052 a->data.non_resident.allocated_size) >> 1053 vol->cluster_size_bits) - 1; 1054 /* 1055 * Determine the size of the mapping pairs array for the new 1056 * extent, i.e. the old extent with the hole filled. 1057 */ 1058 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, 1059 highest_vcn); 1060 if (unlikely(mp_size <= 0)) { 1061 if (!(err = mp_size)) 1062 err = -EIO; 1063 ntfs_debug("Failed to get size for mapping pairs " 1064 "array, error code %i.", err); 1065 break; 1066 } 1067 /* 1068 * Resize the attribute record to fit the new mapping pairs 1069 * array. 1070 */ 1071 attr_rec_len = le32_to_cpu(a->length); 1072 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( 1073 a->data.non_resident.mapping_pairs_offset)); 1074 if (unlikely(err)) { 1075 BUG_ON(err != -ENOSPC); 1076 // TODO: Deal with this by using the current attribute 1077 // and fill it with as much of the mapping pairs 1078 // array as possible. Then loop over each attribute 1079 // extent rewriting the mapping pairs arrays as we go 1080 // along and if when we reach the end we have not 1081 // enough space, try to resize the last attribute 1082 // extent and if even that fails, add a new attribute 1083 // extent. 1084 // We could also try to resize at each step in the hope 1085 // that we will not need to rewrite every single extent. 1086 // Note, we may need to decompress some extents to fill 1087 // the runlist as we are walking the extents... 1088 ntfs_error(vol->sb, "Not enough space in the mft " 1089 "record for the extended attribute " 1090 "record. This case is not " 1091 "implemented yet."); 1092 err = -EOPNOTSUPP; 1093 break ; 1094 } 1095 status.mp_rebuilt = 1; 1096 /* 1097 * Generate the mapping pairs array directly into the attribute 1098 * record. 
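		 *
		 * Taken together with the two calls above, the hole-filling
		 * sequence is, in outline:
		 *
		 *	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2,
		 *			vcn, highest_vcn);
		 *	ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
		 *			a->data.non_resident.mapping_pairs_offset));
		 *	ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
		 *			a->data.non_resident.mapping_pairs_offset),
		 *			mp_size, rl2, vcn, highest_vcn, NULL);
		 *
		 * i.e. size the new mapping pairs array, make room for it in
		 * the attribute record, then generate it in place.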
1099 */ 1100 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1101 a->data.non_resident.mapping_pairs_offset), 1102 mp_size, rl2, vcn, highest_vcn, NULL); 1103 if (unlikely(err)) { 1104 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " 1105 "attribute type 0x%x, because building " 1106 "the mapping pairs failed with error " 1107 "code %i.", vi->i_ino, 1108 (unsigned)le32_to_cpu(ni->type), err); 1109 err = -EIO; 1110 break; 1111 } 1112 /* Update the highest_vcn but only if it was not set. */ 1113 if (unlikely(!a->data.non_resident.highest_vcn)) 1114 a->data.non_resident.highest_vcn = 1115 cpu_to_sle64(highest_vcn); 1116 /* 1117 * If the attribute is sparse/compressed, update the compressed 1118 * size in the ntfs_inode structure and the attribute record. 1119 */ 1120 if (likely(NInoSparse(ni) || NInoCompressed(ni))) { 1121 /* 1122 * If we are not in the first attribute extent, switch 1123 * to it, but first ensure the changes will make it to 1124 * disk later. 1125 */ 1126 if (a->data.non_resident.lowest_vcn) { 1127 flush_dcache_mft_record_page(ctx->ntfs_ino); 1128 mark_mft_record_dirty(ctx->ntfs_ino); 1129 ntfs_attr_reinit_search_ctx(ctx); 1130 err = ntfs_attr_lookup(ni->type, ni->name, 1131 ni->name_len, CASE_SENSITIVE, 1132 0, NULL, 0, ctx); 1133 if (unlikely(err)) { 1134 status.attr_switched = 1; 1135 break; 1136 } 1137 /* @m is not used any more so do not set it. */ 1138 a = ctx->attr; 1139 } 1140 write_lock_irqsave(&ni->size_lock, flags); 1141 ni->itype.compressed.size += vol->cluster_size; 1142 a->data.non_resident.compressed_size = 1143 cpu_to_sle64(ni->itype.compressed.size); 1144 write_unlock_irqrestore(&ni->size_lock, flags); 1145 } 1146 /* Ensure the changes make it to disk. */ 1147 flush_dcache_mft_record_page(ctx->ntfs_ino); 1148 mark_mft_record_dirty(ctx->ntfs_ino); 1149 ntfs_attr_put_search_ctx(ctx); 1150 unmap_mft_record(base_ni); 1151 /* Successfully filled the hole. */ 1152 status.runlist_merged = 0; 1153 status.mft_attr_mapped = 0; 1154 status.mp_rebuilt = 0; 1155 /* Setup the map cache and use that to deal with the buffer. */ 1156 was_hole = true; 1157 vcn = bh_cpos; 1158 vcn_len = 1; 1159 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); 1160 cdelta = 0; 1161 /* 1162 * If the number of remaining clusters in the @pages is smaller 1163 * or equal to the number of cached clusters, unlock the 1164 * runlist as the map cache will be used from now on. 1165 */ 1166 if (likely(vcn + vcn_len >= cend)) { 1167 up_write(&ni->runlist.lock); 1168 rl_write_locked = false; 1169 rl = NULL; 1170 } 1171 goto map_buffer_cached; 1172 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1173 /* If there are no errors, do the next page. */ 1174 if (likely(!err && ++u < nr_pages)) 1175 goto do_next_page; 1176 /* If there are no errors, release the runlist lock if we took it. */ 1177 if (likely(!err)) { 1178 if (unlikely(rl_write_locked)) { 1179 up_write(&ni->runlist.lock); 1180 rl_write_locked = false; 1181 } else if (unlikely(rl)) 1182 up_read(&ni->runlist.lock); 1183 rl = NULL; 1184 } 1185 /* If we issued read requests, let them complete. 
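	 *
	 * Note that @wait_bh points into the two element wait[] array
	 * declared at the top of this function; two entries are enough
	 * because reads are only ever submitted for buffers straddling @pos
	 * or @end, and there can be at most one of each per call.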
*/ 1186 read_lock_irqsave(&ni->size_lock, flags); 1187 initialized_size = ni->initialized_size; 1188 read_unlock_irqrestore(&ni->size_lock, flags); 1189 while (wait_bh > wait) { 1190 bh = *--wait_bh; 1191 wait_on_buffer(bh); 1192 if (likely(buffer_uptodate(bh))) { 1193 page = bh->b_page; 1194 bh_pos = ((s64)page->index << PAGE_SHIFT) + 1195 bh_offset(bh); 1196 /* 1197 * If the buffer overflows the initialized size, need 1198 * to zero the overflowing region. 1199 */ 1200 if (unlikely(bh_pos + blocksize > initialized_size)) { 1201 int ofs = 0; 1202 1203 if (likely(bh_pos < initialized_size)) 1204 ofs = initialized_size - bh_pos; 1205 zero_user_segment(page, bh_offset(bh) + ofs, 1206 blocksize); 1207 } 1208 } else /* if (unlikely(!buffer_uptodate(bh))) */ 1209 err = -EIO; 1210 } 1211 if (likely(!err)) { 1212 /* Clear buffer_new on all buffers. */ 1213 u = 0; 1214 do { 1215 bh = head = page_buffers(pages[u]); 1216 do { 1217 if (buffer_new(bh)) 1218 clear_buffer_new(bh); 1219 } while ((bh = bh->b_this_page) != head); 1220 } while (++u < nr_pages); 1221 ntfs_debug("Done."); 1222 return err; 1223 } 1224 if (status.attr_switched) { 1225 /* Get back to the attribute extent we modified. */ 1226 ntfs_attr_reinit_search_ctx(ctx); 1227 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1228 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { 1229 ntfs_error(vol->sb, "Failed to find required " 1230 "attribute extent of attribute in " 1231 "error code path. Run chkdsk to " 1232 "recover."); 1233 write_lock_irqsave(&ni->size_lock, flags); 1234 ni->itype.compressed.size += vol->cluster_size; 1235 write_unlock_irqrestore(&ni->size_lock, flags); 1236 flush_dcache_mft_record_page(ctx->ntfs_ino); 1237 mark_mft_record_dirty(ctx->ntfs_ino); 1238 /* 1239 * The only thing that is now wrong is the compressed 1240 * size of the base attribute extent which chkdsk 1241 * should be able to fix. 1242 */ 1243 NVolSetErrors(vol); 1244 } else { 1245 m = ctx->mrec; 1246 a = ctx->attr; 1247 status.attr_switched = 0; 1248 } 1249 } 1250 /* 1251 * If the runlist has been modified, need to restore it by punching a 1252 * hole into it and we then need to deallocate the on-disk cluster as 1253 * well. Note, we only modify the runlist if we are able to generate a 1254 * new mapping pairs array, i.e. only when the mapped attribute extent 1255 * is not switched. 1256 */ 1257 if (status.runlist_merged && !status.attr_switched) { 1258 BUG_ON(!rl_write_locked); 1259 /* Make the file cluster we allocated sparse in the runlist. */ 1260 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { 1261 ntfs_error(vol->sb, "Failed to punch hole into " 1262 "attribute runlist in error code " 1263 "path. Run chkdsk to recover the " 1264 "lost cluster."); 1265 NVolSetErrors(vol); 1266 } else /* if (success) */ { 1267 status.runlist_merged = 0; 1268 /* 1269 * Deallocate the on-disk cluster we allocated but only 1270 * if we succeeded in punching its vcn out of the 1271 * runlist. 1272 */ 1273 down_write(&vol->lcnbmp_lock); 1274 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { 1275 ntfs_error(vol->sb, "Failed to release " 1276 "allocated cluster in error " 1277 "code path. Run chkdsk to " 1278 "recover the lost cluster."); 1279 NVolSetErrors(vol); 1280 } 1281 up_write(&vol->lcnbmp_lock); 1282 } 1283 } 1284 /* 1285 * Resize the attribute record to its old size and rebuild the mapping 1286 * pairs array. 
Note, we only can do this if the runlist has been 1287 * restored to its old state which also implies that the mapped 1288 * attribute extent is not switched. 1289 */ 1290 if (status.mp_rebuilt && !status.runlist_merged) { 1291 if (ntfs_attr_record_resize(m, a, attr_rec_len)) { 1292 ntfs_error(vol->sb, "Failed to restore attribute " 1293 "record in error code path. Run " 1294 "chkdsk to recover."); 1295 NVolSetErrors(vol); 1296 } else /* if (success) */ { 1297 if (ntfs_mapping_pairs_build(vol, (u8*)a + 1298 le16_to_cpu(a->data.non_resident. 1299 mapping_pairs_offset), attr_rec_len - 1300 le16_to_cpu(a->data.non_resident. 1301 mapping_pairs_offset), ni->runlist.rl, 1302 vcn, highest_vcn, NULL)) { 1303 ntfs_error(vol->sb, "Failed to restore " 1304 "mapping pairs array in error " 1305 "code path. Run chkdsk to " 1306 "recover."); 1307 NVolSetErrors(vol); 1308 } 1309 flush_dcache_mft_record_page(ctx->ntfs_ino); 1310 mark_mft_record_dirty(ctx->ntfs_ino); 1311 } 1312 } 1313 /* Release the mft record and the attribute. */ 1314 if (status.mft_attr_mapped) { 1315 ntfs_attr_put_search_ctx(ctx); 1316 unmap_mft_record(base_ni); 1317 } 1318 /* Release the runlist lock. */ 1319 if (rl_write_locked) 1320 up_write(&ni->runlist.lock); 1321 else if (rl) 1322 up_read(&ni->runlist.lock); 1323 /* 1324 * Zero out any newly allocated blocks to avoid exposing stale data. 1325 * If BH_New is set, we know that the block was newly allocated above 1326 * and that it has not been fully zeroed and marked dirty yet. 1327 */ 1328 nr_pages = u; 1329 u = 0; 1330 end = bh_cpos << vol->cluster_size_bits; 1331 do { 1332 page = pages[u]; 1333 bh = head = page_buffers(page); 1334 do { 1335 if (u == nr_pages && 1336 ((s64)page->index << PAGE_SHIFT) + 1337 bh_offset(bh) >= end) 1338 break; 1339 if (!buffer_new(bh)) 1340 continue; 1341 clear_buffer_new(bh); 1342 if (!buffer_uptodate(bh)) { 1343 if (PageUptodate(page)) 1344 set_buffer_uptodate(bh); 1345 else { 1346 zero_user(page, bh_offset(bh), 1347 blocksize); 1348 set_buffer_uptodate(bh); 1349 } 1350 } 1351 mark_buffer_dirty(bh); 1352 } while ((bh = bh->b_this_page) != head); 1353 } while (++u <= nr_pages); 1354 ntfs_error(vol->sb, "Failed. Returning error code %i.", err); 1355 return err; 1356 } 1357 1358 static inline void ntfs_flush_dcache_pages(struct page **pages, 1359 unsigned nr_pages) 1360 { 1361 BUG_ON(!nr_pages); 1362 /* 1363 * Warning: Do not do the decrement at the same time as the call to 1364 * flush_dcache_page() because it is a NULL macro on i386 and hence the 1365 * decrement never happens so the loop never terminates. 1366 */ 1367 do { 1368 --nr_pages; 1369 flush_dcache_page(pages[nr_pages]); 1370 } while (nr_pages > 0); 1371 } 1372 1373 /** 1374 * ntfs_commit_pages_after_non_resident_write - commit the received data 1375 * @pages: array of destination pages 1376 * @nr_pages: number of pages in @pages 1377 * @pos: byte position in file at which the write begins 1378 * @bytes: number of bytes to be written 1379 * 1380 * See description of ntfs_commit_pages_after_write(), below. 
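 *
 * In short: buffers inside [@pos, @pos + @bytes) are marked uptodate and
 * dirty, each page is set uptodate once none of its buffers is left partial,
 * and, if the write goes past the old initialized size, both
 * initialized_size and (if necessary) i_size/data_size are pushed forward in
 * the ntfs inode and in the attribute record.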
1381 */ 1382 static inline int ntfs_commit_pages_after_non_resident_write( 1383 struct page **pages, const unsigned nr_pages, 1384 s64 pos, size_t bytes) 1385 { 1386 s64 end, initialized_size; 1387 struct inode *vi; 1388 ntfs_inode *ni, *base_ni; 1389 struct buffer_head *bh, *head; 1390 ntfs_attr_search_ctx *ctx; 1391 MFT_RECORD *m; 1392 ATTR_RECORD *a; 1393 unsigned long flags; 1394 unsigned blocksize, u; 1395 int err; 1396 1397 vi = pages[0]->mapping->host; 1398 ni = NTFS_I(vi); 1399 blocksize = vi->i_sb->s_blocksize; 1400 end = pos + bytes; 1401 u = 0; 1402 do { 1403 s64 bh_pos; 1404 struct page *page; 1405 bool partial; 1406 1407 page = pages[u]; 1408 bh_pos = (s64)page->index << PAGE_SHIFT; 1409 bh = head = page_buffers(page); 1410 partial = false; 1411 do { 1412 s64 bh_end; 1413 1414 bh_end = bh_pos + blocksize; 1415 if (bh_end <= pos || bh_pos >= end) { 1416 if (!buffer_uptodate(bh)) 1417 partial = true; 1418 } else { 1419 set_buffer_uptodate(bh); 1420 mark_buffer_dirty(bh); 1421 } 1422 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1423 /* 1424 * If all buffers are now uptodate but the page is not, set the 1425 * page uptodate. 1426 */ 1427 if (!partial && !PageUptodate(page)) 1428 SetPageUptodate(page); 1429 } while (++u < nr_pages); 1430 /* 1431 * Finally, if we do not need to update initialized_size or i_size we 1432 * are finished. 1433 */ 1434 read_lock_irqsave(&ni->size_lock, flags); 1435 initialized_size = ni->initialized_size; 1436 read_unlock_irqrestore(&ni->size_lock, flags); 1437 if (end <= initialized_size) { 1438 ntfs_debug("Done."); 1439 return 0; 1440 } 1441 /* 1442 * Update initialized_size/i_size as appropriate, both in the inode and 1443 * the mft record. 1444 */ 1445 if (!NInoAttr(ni)) 1446 base_ni = ni; 1447 else 1448 base_ni = ni->ext.base_ntfs_ino; 1449 /* Map, pin, and lock the mft record. */ 1450 m = map_mft_record(base_ni); 1451 if (IS_ERR(m)) { 1452 err = PTR_ERR(m); 1453 m = NULL; 1454 ctx = NULL; 1455 goto err_out; 1456 } 1457 BUG_ON(!NInoNonResident(ni)); 1458 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1459 if (unlikely(!ctx)) { 1460 err = -ENOMEM; 1461 goto err_out; 1462 } 1463 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1464 CASE_SENSITIVE, 0, NULL, 0, ctx); 1465 if (unlikely(err)) { 1466 if (err == -ENOENT) 1467 err = -EIO; 1468 goto err_out; 1469 } 1470 a = ctx->attr; 1471 BUG_ON(!a->non_resident); 1472 write_lock_irqsave(&ni->size_lock, flags); 1473 BUG_ON(end > ni->allocated_size); 1474 ni->initialized_size = end; 1475 a->data.non_resident.initialized_size = cpu_to_sle64(end); 1476 if (end > i_size_read(vi)) { 1477 i_size_write(vi, end); 1478 a->data.non_resident.data_size = 1479 a->data.non_resident.initialized_size; 1480 } 1481 write_unlock_irqrestore(&ni->size_lock, flags); 1482 /* Mark the mft record dirty, so it gets written back. 
*/ 1483 flush_dcache_mft_record_page(ctx->ntfs_ino); 1484 mark_mft_record_dirty(ctx->ntfs_ino); 1485 ntfs_attr_put_search_ctx(ctx); 1486 unmap_mft_record(base_ni); 1487 ntfs_debug("Done."); 1488 return 0; 1489 err_out: 1490 if (ctx) 1491 ntfs_attr_put_search_ctx(ctx); 1492 if (m) 1493 unmap_mft_record(base_ni); 1494 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " 1495 "code %i).", err); 1496 if (err != -ENOMEM) 1497 NVolSetErrors(ni->vol); 1498 return err; 1499 } 1500 1501 /** 1502 * ntfs_commit_pages_after_write - commit the received data 1503 * @pages: array of destination pages 1504 * @nr_pages: number of pages in @pages 1505 * @pos: byte position in file at which the write begins 1506 * @bytes: number of bytes to be written 1507 * 1508 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode 1509 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are 1510 * locked but not kmap()ped. The source data has already been copied into the 1511 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before 1512 * the data was copied (for non-resident attributes only) and it returned 1513 * success. 1514 * 1515 * Need to set uptodate and mark dirty all buffers within the boundary of the 1516 * write. If all buffers in a page are uptodate we set the page uptodate, too. 1517 * 1518 * Setting the buffers dirty ensures that they get written out later when 1519 * ntfs_writepage() is invoked by the VM. 1520 * 1521 * Finally, we need to update i_size and initialized_size as appropriate both 1522 * in the inode and the mft record. 1523 * 1524 * This is modelled after fs/buffer.c::generic_commit_write(), which marks 1525 * buffers uptodate and dirty, sets the page uptodate if all buffers in the 1526 * page are uptodate, and updates i_size if the end of io is beyond i_size. In 1527 * that case, it also marks the inode dirty. 1528 * 1529 * If things have gone as outlined in 1530 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page 1531 * content modifications here for non-resident attributes. For resident 1532 * attributes we need to do the uptodate bringing here which we combine with 1533 * the copying into the mft record which means we save one atomic kmap. 1534 * 1535 * Return 0 on success or -errno on error. 1536 */ 1537 static int ntfs_commit_pages_after_write(struct page **pages, 1538 const unsigned nr_pages, s64 pos, size_t bytes) 1539 { 1540 s64 end, initialized_size; 1541 loff_t i_size; 1542 struct inode *vi; 1543 ntfs_inode *ni, *base_ni; 1544 struct page *page; 1545 ntfs_attr_search_ctx *ctx; 1546 MFT_RECORD *m; 1547 ATTR_RECORD *a; 1548 char *kattr, *kaddr; 1549 unsigned long flags; 1550 u32 attr_len; 1551 int err; 1552 1553 BUG_ON(!nr_pages); 1554 BUG_ON(!pages); 1555 page = pages[0]; 1556 BUG_ON(!page); 1557 vi = page->mapping->host; 1558 ni = NTFS_I(vi); 1559 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 1560 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 1561 vi->i_ino, ni->type, page->index, nr_pages, 1562 (long long)pos, bytes); 1563 if (NInoNonResident(ni)) 1564 return ntfs_commit_pages_after_non_resident_write(pages, 1565 nr_pages, pos, bytes); 1566 BUG_ON(nr_pages > 1); 1567 /* 1568 * Attribute is resident, implying it is not compressed, encrypted, or 1569 * sparse. 1570 */ 1571 if (!NInoAttr(ni)) 1572 base_ni = ni; 1573 else 1574 base_ni = ni->ext.base_ntfs_ino; 1575 BUG_ON(NInoNonResident(ni)); 1576 /* Map, pin, and lock the mft record. 
*/ 1577 m = map_mft_record(base_ni); 1578 if (IS_ERR(m)) { 1579 err = PTR_ERR(m); 1580 m = NULL; 1581 ctx = NULL; 1582 goto err_out; 1583 } 1584 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1585 if (unlikely(!ctx)) { 1586 err = -ENOMEM; 1587 goto err_out; 1588 } 1589 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1590 CASE_SENSITIVE, 0, NULL, 0, ctx); 1591 if (unlikely(err)) { 1592 if (err == -ENOENT) 1593 err = -EIO; 1594 goto err_out; 1595 } 1596 a = ctx->attr; 1597 BUG_ON(a->non_resident); 1598 /* The total length of the attribute value. */ 1599 attr_len = le32_to_cpu(a->data.resident.value_length); 1600 i_size = i_size_read(vi); 1601 BUG_ON(attr_len != i_size); 1602 BUG_ON(pos > attr_len); 1603 end = pos + bytes; 1604 BUG_ON(end > le32_to_cpu(a->length) - 1605 le16_to_cpu(a->data.resident.value_offset)); 1606 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 1607 kaddr = kmap_atomic(page); 1608 /* Copy the received data from the page to the mft record. */ 1609 memcpy(kattr + pos, kaddr + pos, bytes); 1610 /* Update the attribute length if necessary. */ 1611 if (end > attr_len) { 1612 attr_len = end; 1613 a->data.resident.value_length = cpu_to_le32(attr_len); 1614 } 1615 /* 1616 * If the page is not uptodate, bring the out of bounds area(s) 1617 * uptodate by copying data from the mft record to the page. 1618 */ 1619 if (!PageUptodate(page)) { 1620 if (pos > 0) 1621 memcpy(kaddr, kattr, pos); 1622 if (end < attr_len) 1623 memcpy(kaddr + end, kattr + end, attr_len - end); 1624 /* Zero the region outside the end of the attribute value. */ 1625 memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); 1626 flush_dcache_page(page); 1627 SetPageUptodate(page); 1628 } 1629 kunmap_atomic(kaddr); 1630 /* Update initialized_size/i_size if necessary. */ 1631 read_lock_irqsave(&ni->size_lock, flags); 1632 initialized_size = ni->initialized_size; 1633 BUG_ON(end > ni->allocated_size); 1634 read_unlock_irqrestore(&ni->size_lock, flags); 1635 BUG_ON(initialized_size != i_size); 1636 if (end > initialized_size) { 1637 write_lock_irqsave(&ni->size_lock, flags); 1638 ni->initialized_size = end; 1639 i_size_write(vi, end); 1640 write_unlock_irqrestore(&ni->size_lock, flags); 1641 } 1642 /* Mark the mft record dirty, so it gets written back. */ 1643 flush_dcache_mft_record_page(ctx->ntfs_ino); 1644 mark_mft_record_dirty(ctx->ntfs_ino); 1645 ntfs_attr_put_search_ctx(ctx); 1646 unmap_mft_record(base_ni); 1647 ntfs_debug("Done."); 1648 return 0; 1649 err_out: 1650 if (err == -ENOMEM) { 1651 ntfs_warning(vi->i_sb, "Error allocating memory required to " 1652 "commit the write."); 1653 if (PageUptodate(page)) { 1654 ntfs_warning(vi->i_sb, "Page is uptodate, setting " 1655 "dirty so the write will be retried " 1656 "later on by the VM."); 1657 /* 1658 * Put the page on mapping->dirty_pages, but leave its 1659 * buffers' dirty state as-is. 1660 */ 1661 __set_page_dirty_nobuffers(page); 1662 err = 0; 1663 } else 1664 ntfs_error(vi->i_sb, "Page is not uptodate. Written " 1665 "data has been lost."); 1666 } else { 1667 ntfs_error(vi->i_sb, "Resident attribute commit write failed " 1668 "with error %i.", err); 1669 NVolSetErrors(ni->vol); 1670 } 1671 if (ctx) 1672 ntfs_attr_put_search_ctx(ctx); 1673 if (m) 1674 unmap_mft_record(base_ni); 1675 return err; 1676 } 1677 1678 /* 1679 * Copy as much as we can into the pages and return the number of bytes which 1680 * were successfully copied. 
If a fault is encountered then clear the pages 1681 * out to (ofs + bytes) and return the number of bytes which were copied. 1682 */ 1683 static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, 1684 unsigned ofs, struct iov_iter *i, size_t bytes) 1685 { 1686 struct page **last_page = pages + nr_pages; 1687 size_t total = 0; 1688 unsigned len, copied; 1689 1690 do { 1691 len = PAGE_SIZE - ofs; 1692 if (len > bytes) 1693 len = bytes; 1694 copied = copy_page_from_iter_atomic(*pages, ofs, len, i); 1695 total += copied; 1696 bytes -= copied; 1697 if (!bytes) 1698 break; 1699 if (copied < len) 1700 goto err; 1701 ofs = 0; 1702 } while (++pages < last_page); 1703 out: 1704 return total; 1705 err: 1706 /* Zero the rest of the target like __copy_from_user(). */ 1707 len = PAGE_SIZE - copied; 1708 do { 1709 if (len > bytes) 1710 len = bytes; 1711 zero_user(*pages, copied, len); 1712 bytes -= len; 1713 copied = 0; 1714 len = PAGE_SIZE; 1715 } while (++pages < last_page); 1716 goto out; 1717 } 1718 1719 /** 1720 * ntfs_perform_write - perform buffered write to a file 1721 * @file: file to write to 1722 * @i: iov_iter with data to write 1723 * @pos: byte offset in file at which to begin writing to 1724 */ 1725 static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, 1726 loff_t pos) 1727 { 1728 struct address_space *mapping = file->f_mapping; 1729 struct inode *vi = mapping->host; 1730 ntfs_inode *ni = NTFS_I(vi); 1731 ntfs_volume *vol = ni->vol; 1732 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; 1733 struct page *cached_page = NULL; 1734 VCN last_vcn; 1735 LCN lcn; 1736 size_t bytes; 1737 ssize_t status, written = 0; 1738 unsigned nr_pages; 1739 1740 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 1741 "0x%llx, count 0x%lx.", vi->i_ino, 1742 (unsigned)le32_to_cpu(ni->type), 1743 (unsigned long long)pos, 1744 (unsigned long)iov_iter_count(i)); 1745 /* 1746 * If a previous ntfs_truncate() failed, repeat it and abort if it 1747 * fails again. 1748 */ 1749 if (unlikely(NInoTruncateFailed(ni))) { 1750 int err; 1751 1752 inode_dio_wait(vi); 1753 err = ntfs_truncate(vi); 1754 if (err || NInoTruncateFailed(ni)) { 1755 if (!err) 1756 err = -EIO; 1757 ntfs_error(vol->sb, "Cannot perform write to inode " 1758 "0x%lx, attribute type 0x%x, because " 1759 "ntfs_truncate() failed (error code " 1760 "%i).", vi->i_ino, 1761 (unsigned)le32_to_cpu(ni->type), err); 1762 return err; 1763 } 1764 } 1765 /* 1766 * Determine the number of pages per cluster for non-resident 1767 * attributes. 1768 */ 1769 nr_pages = 1; 1770 if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) 1771 nr_pages = vol->cluster_size >> PAGE_SHIFT; 1772 last_vcn = -1; 1773 do { 1774 VCN vcn; 1775 pgoff_t start_idx; 1776 unsigned ofs, do_pages, u; 1777 size_t copied; 1778 1779 start_idx = pos >> PAGE_SHIFT; 1780 ofs = pos & ~PAGE_MASK; 1781 bytes = PAGE_SIZE - ofs; 1782 do_pages = 1; 1783 if (nr_pages > 1) { 1784 vcn = pos >> vol->cluster_size_bits; 1785 if (vcn != last_vcn) { 1786 last_vcn = vcn; 1787 /* 1788 * Get the lcn of the vcn the write is in. If 1789 * it is a hole, need to lock down all pages in 1790 * the cluster. 
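				 *
				 * For example, with a 64kiB cluster size,
				 * 4kiB pages and pos == 0x11800 falling
				 * inside a hole: start_idx becomes
				 * (pos & ~cluster_size_mask) >> PAGE_SHIFT
				 * = 16, bytes becomes 0x10000 - 0x1800 =
				 * 0xe800 and do_pages becomes nr_pages = 16,
				 * i.e. the whole cluster containing the write
				 * is locked down and prepared in one go.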
				down_read(&ni->runlist.lock);
				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
						vol->cluster_size_bits, false);
				up_read(&ni->runlist.lock);
				if (unlikely(lcn < LCN_HOLE)) {
					if (lcn == LCN_ENOMEM)
						status = -ENOMEM;
					else {
						status = -EIO;
						ntfs_error(vol->sb, "Cannot "
							"perform write to "
							"inode 0x%lx, "
							"attribute type 0x%x, "
							"because the attribute "
							"is corrupt.",
							vi->i_ino, (unsigned)
							le32_to_cpu(ni->type));
					}
					break;
				}
				if (lcn == LCN_HOLE) {
					start_idx = (pos & ~(s64)
							vol->cluster_size_mask)
							>> PAGE_SHIFT;
					bytes = vol->cluster_size - (pos &
							vol->cluster_size_mask);
					do_pages = nr_pages;
				}
			}
		}
		if (bytes > iov_iter_count(i))
			bytes = iov_iter_count(i);
again:
		/*
		 * Bring in the user page(s) that we will copy from _first_.
		 * Otherwise there is a nasty deadlock on copying from the same
		 * page(s) as we are writing to, without it/them being marked
		 * up-to-date. Note, at present there is nothing to stop the
		 * pages being swapped out between us bringing them into memory
		 * and doing the actual copying.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}
		/* Get and lock @do_pages starting at index @start_idx. */
		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
				pages, &cached_page);
		if (unlikely(status))
			break;
		/*
		 * For non-resident attributes, we need to fill any holes with
		 * actual clusters and ensure all buffers are mapped. We also
		 * need to bring uptodate any buffers that are only partially
		 * being written to.
		 */
		if (NInoNonResident(ni)) {
			status = ntfs_prepare_pages_for_non_resident_write(
					pages, do_pages, pos, bytes);
			if (unlikely(status)) {
				do {
					unlock_page(pages[--do_pages]);
					put_page(pages[do_pages]);
				} while (do_pages);
				break;
			}
		}
		u = (pos >> PAGE_SHIFT) - pages[0]->index;
		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
				i, bytes);
		ntfs_flush_dcache_pages(pages + u, do_pages - u);
		status = 0;
		if (likely(copied == bytes)) {
			status = ntfs_commit_pages_after_write(pages, do_pages,
					pos, bytes);
		}
		do {
			unlock_page(pages[--do_pages]);
			put_page(pages[do_pages]);
		} while (do_pages);
		if (unlikely(status < 0)) {
			iov_iter_revert(i, copied);
			break;
		}
		cond_resched();
		if (unlikely(copied < bytes)) {
			iov_iter_revert(i, copied);
			if (copied)
				bytes = copied;
			else if (bytes > PAGE_SIZE - ofs)
				bytes = PAGE_SIZE - ofs;
			goto again;
		}
		pos += copied;
		written += copied;
		balance_dirty_pages_ratelimited(mapping);
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}
	} while (iov_iter_count(i));
	if (cached_page)
		put_page(cached_page);
	ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
			written ? "written" : "status", (unsigned long)written,
			(long)status);
	return written ? written : status;
}
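
/*
 * Note on the short copy handling in ntfs_perform_write() above (the numbers
 * are illustrative only): if ntfs_copy_from_user_iter() faults after copying
 * only, say, 0x64 of the 0x1000 requested bytes, nothing is committed; the
 * iov_iter is rewound by the 0x64 bytes that were copied, bytes is shrunk to
 * 0x64 and the loop retries from the "again" label after faulting the source
 * pages back in.  If nothing at all was copied, bytes is instead clamped to
 * the remainder of the first page (PAGE_SIZE - ofs) before retrying.
 */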

/**
 * ntfs_file_write_iter - handle a write to an open ntfs file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * Basically the same as generic_file_write_iter() except that it ends up
 * calling ntfs_perform_write() instead of generic_perform_write() and that
 * O_DIRECT is not implemented.
 */
static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *vi = file_inode(file);
	ssize_t written = 0;
	ssize_t err;

	inode_lock(vi);
	/* We can write back this queue in page reclaim. */
	current->backing_dev_info = inode_to_bdi(vi);
	err = ntfs_prepare_file_for_write(iocb, from);
	if (iov_iter_count(from) && !err)
		written = ntfs_perform_write(file, from, iocb->ki_pos);
	current->backing_dev_info = NULL;
	inode_unlock(vi);
	iocb->ki_pos += written;
	if (likely(written > 0))
		written = generic_write_sync(iocb, written);
	return written ? written : err;
}

/**
 * ntfs_file_fsync - sync a file to disk
 * @filp:	file to be synced
 * @start:	byte offset of the start of the range to sync
 * @end:	byte offset of the end (inclusive) of the range to sync
 * @datasync:	if non-zero only flush user data and not metadata
 *
 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
 * system calls. This function is inspired by fs/buffer.c::file_fsync().
 *
 * If @datasync is false, write the mft record and all associated extent mft
 * records as well as the $DATA attribute and then sync the block device.
 *
 * If @datasync is true and the attribute is non-resident, we skip the writing
 * of the mft record and all associated extent mft records (this might still
 * happen due to the write_inode_now() call).
 *
 * Also, if @datasync is true, we do not wait on the inode to be written out
 * but we always wait on the page cache pages to be written out.
 *
 * Locking: Caller must hold i_mutex on the inode.
 *
 * TODO: We should probably also write all attribute/index inodes associated
 * with this inode but since we have no simple way of getting to them we ignore
 * this problem for now.
 */
static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
		int datasync)
{
	struct inode *vi = filp->f_mapping->host;
	int err, ret = 0;

	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);

	err = file_write_and_wait_range(filp, start, end);
	if (err)
		return err;
	inode_lock(vi);

	BUG_ON(S_ISDIR(vi->i_mode));
	if (!datasync || !NInoNonResident(NTFS_I(vi)))
		ret = __ntfs_write_inode(vi, 1);
	write_inode_now(vi, !datasync);
	/*
	 * NOTE: If we were to use mapping->private_list (see ext2 and
	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
	 * sync_mapping_buffers(vi->i_mapping).
	 */
	err = sync_blockdev(vi->i_sb->s_bdev);
	if (unlikely(err && !ret))
		ret = err;
	if (likely(!ret))
		ntfs_debug("Done.");
	else
		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
				"%u.", datasync ? "data" : "", vi->i_ino,
				-ret);
	inode_unlock(vi);
	return ret;
}
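
/*
 * For example (this merely restates the behaviour documented above, it is
 * not a separate code path): fsync(fd) reaches ntfs_file_fsync() with
 * @datasync == 0, so __ntfs_write_inode() is called for the mft record(s)
 * and the inode write is waited upon, whereas fdatasync(fd) passes
 * @datasync == 1 and, for a non-resident attribute, skips the explicit mft
 * record write (it may still happen as a side effect of write_inode_now()).
 */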
"data" : "", vi->i_ino, -ret); 1985 inode_unlock(vi); 1986 return ret; 1987 } 1988 1989 #endif /* NTFS_RW */ 1990 1991 const struct file_operations ntfs_file_ops = { 1992 .llseek = generic_file_llseek, 1993 .read_iter = generic_file_read_iter, 1994 #ifdef NTFS_RW 1995 .write_iter = ntfs_file_write_iter, 1996 .fsync = ntfs_file_fsync, 1997 #endif /* NTFS_RW */ 1998 .mmap = generic_file_mmap, 1999 .open = ntfs_file_open, 2000 .splice_read = generic_file_splice_read, 2001 }; 2002 2003 const struct inode_operations ntfs_file_inode_ops = { 2004 #ifdef NTFS_RW 2005 .setattr = ntfs_setattr, 2006 #endif /* NTFS_RW */ 2007 }; 2008 2009 const struct file_operations ntfs_empty_file_ops = {}; 2010 2011 const struct inode_operations ntfs_empty_inode_ops = {}; 2012