1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 4 * 5 * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. 6 */ 7 8 #include <linux/blkdev.h> 9 #include <linux/backing-dev.h> 10 #include <linux/buffer_head.h> 11 #include <linux/gfp.h> 12 #include <linux/pagemap.h> 13 #include <linux/pagevec.h> 14 #include <linux/sched/signal.h> 15 #include <linux/swap.h> 16 #include <linux/uio.h> 17 #include <linux/writeback.h> 18 19 #include <asm/page.h> 20 #include <linux/uaccess.h> 21 22 #include "attrib.h" 23 #include "bitmap.h" 24 #include "inode.h" 25 #include "debug.h" 26 #include "lcnalloc.h" 27 #include "malloc.h" 28 #include "mft.h" 29 #include "ntfs.h" 30 31 /** 32 * ntfs_file_open - called when an inode is about to be opened 33 * @vi: inode to be opened 34 * @filp: file structure describing the inode 35 * 36 * Limit file size to the page cache limit on architectures where unsigned long 37 * is 32-bits. This is the most we can do for now without overflowing the page 38 * cache page index. Doing it this way means we don't run into problems because 39 * of existing too large files. It would be better to allow the user to read 40 * the beginning of the file but I doubt very much anyone is going to hit this 41 * check on a 32-bit architecture, so there is no point in adding the extra 42 * complexity required to support this. 43 * 44 * On 64-bit architectures, the check is hopefully optimized away by the 45 * compiler. 46 * 47 * After the check passes, just call generic_file_open() to do its work. 48 */ 49 static int ntfs_file_open(struct inode *vi, struct file *filp) 50 { 51 if (sizeof(unsigned long) < 8) { 52 if (i_size_read(vi) > MAX_LFS_FILESIZE) 53 return -EOVERFLOW; 54 } 55 return generic_file_open(vi, filp); 56 } 57 58 #ifdef NTFS_RW 59 60 /** 61 * ntfs_attr_extend_initialized - extend the initialized size of an attribute 62 * @ni: ntfs inode of the attribute to extend 63 * @new_init_size: requested new initialized size in bytes 64 * 65 * Extend the initialized size of an attribute described by the ntfs inode @ni 66 * to @new_init_size bytes. This involves zeroing any non-sparse space between 67 * the old initialized size and @new_init_size both in the page cache and on 68 * disk (if relevant complete pages are already uptodate in the page cache then 69 * these are simply marked dirty). 70 * 71 * As a side-effect, the file size (vfs inode->i_size) may be incremented as, 72 * in the resident attribute case, it is tied to the initialized size and, in 73 * the non-resident attribute case, it may not fall below the initialized size. 74 * 75 * Note that if the attribute is resident, we do not need to touch the page 76 * cache at all. This is because if the page cache page is not uptodate we 77 * bring it uptodate later, when doing the write to the mft record since we 78 * then already have the page mapped. And if the page is uptodate, the 79 * non-initialized region will already have been zeroed when the page was 80 * brought uptodate and the region may in fact already have been overwritten 81 * with new data via mmap() based writes, so we cannot just zero it. And since 82 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped 83 * is unspecified, we choose not to do zeroing and thus we do not need to touch 84 * the page at all. For a more detailed explanation see ntfs_truncate() in 85 * fs/ntfs/inode.c. 86 * 87 * Return 0 on success and -errno on error. In the case that an error is 88 * encountered it is possible that the initialized size will already have been 89 * incremented some way towards @new_init_size but it is guaranteed that if 90 * this is the case, the necessary zeroing will also have happened and that all 91 * metadata is self-consistent. 92 * 93 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be 94 * held by the caller. 95 */ 96 static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) 97 { 98 s64 old_init_size; 99 loff_t old_i_size; 100 pgoff_t index, end_index; 101 unsigned long flags; 102 struct inode *vi = VFS_I(ni); 103 ntfs_inode *base_ni; 104 MFT_RECORD *m = NULL; 105 ATTR_RECORD *a; 106 ntfs_attr_search_ctx *ctx = NULL; 107 struct address_space *mapping; 108 struct page *page = NULL; 109 u8 *kattr; 110 int err; 111 u32 attr_len; 112 113 read_lock_irqsave(&ni->size_lock, flags); 114 old_init_size = ni->initialized_size; 115 old_i_size = i_size_read(vi); 116 BUG_ON(new_init_size > ni->allocated_size); 117 read_unlock_irqrestore(&ni->size_lock, flags); 118 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 119 "old_initialized_size 0x%llx, " 120 "new_initialized_size 0x%llx, i_size 0x%llx.", 121 vi->i_ino, (unsigned)le32_to_cpu(ni->type), 122 (unsigned long long)old_init_size, 123 (unsigned long long)new_init_size, old_i_size); 124 if (!NInoAttr(ni)) 125 base_ni = ni; 126 else 127 base_ni = ni->ext.base_ntfs_ino; 128 /* Use goto to reduce indentation and we need the label below anyway. */ 129 if (NInoNonResident(ni)) 130 goto do_non_resident_extend; 131 BUG_ON(old_init_size != old_i_size); 132 m = map_mft_record(base_ni); 133 if (IS_ERR(m)) { 134 err = PTR_ERR(m); 135 m = NULL; 136 goto err_out; 137 } 138 ctx = ntfs_attr_get_search_ctx(base_ni, m); 139 if (unlikely(!ctx)) { 140 err = -ENOMEM; 141 goto err_out; 142 } 143 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 144 CASE_SENSITIVE, 0, NULL, 0, ctx); 145 if (unlikely(err)) { 146 if (err == -ENOENT) 147 err = -EIO; 148 goto err_out; 149 } 150 m = ctx->mrec; 151 a = ctx->attr; 152 BUG_ON(a->non_resident); 153 /* The total length of the attribute value. */ 154 attr_len = le32_to_cpu(a->data.resident.value_length); 155 BUG_ON(old_i_size != (loff_t)attr_len); 156 /* 157 * Do the zeroing in the mft record and update the attribute size in 158 * the mft record. 159 */ 160 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 161 memset(kattr + attr_len, 0, new_init_size - attr_len); 162 a->data.resident.value_length = cpu_to_le32((u32)new_init_size); 163 /* Finally, update the sizes in the vfs and ntfs inodes. */ 164 write_lock_irqsave(&ni->size_lock, flags); 165 i_size_write(vi, new_init_size); 166 ni->initialized_size = new_init_size; 167 write_unlock_irqrestore(&ni->size_lock, flags); 168 goto done; 169 do_non_resident_extend: 170 /* 171 * If the new initialized size @new_init_size exceeds the current file 172 * size (vfs inode->i_size), we need to extend the file size to the 173 * new initialized size. 174 */ 175 if (new_init_size > old_i_size) { 176 m = map_mft_record(base_ni); 177 if (IS_ERR(m)) { 178 err = PTR_ERR(m); 179 m = NULL; 180 goto err_out; 181 } 182 ctx = ntfs_attr_get_search_ctx(base_ni, m); 183 if (unlikely(!ctx)) { 184 err = -ENOMEM; 185 goto err_out; 186 } 187 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 188 CASE_SENSITIVE, 0, NULL, 0, ctx); 189 if (unlikely(err)) { 190 if (err == -ENOENT) 191 err = -EIO; 192 goto err_out; 193 } 194 m = ctx->mrec; 195 a = ctx->attr; 196 BUG_ON(!a->non_resident); 197 BUG_ON(old_i_size != (loff_t) 198 sle64_to_cpu(a->data.non_resident.data_size)); 199 a->data.non_resident.data_size = cpu_to_sle64(new_init_size); 200 flush_dcache_mft_record_page(ctx->ntfs_ino); 201 mark_mft_record_dirty(ctx->ntfs_ino); 202 /* Update the file size in the vfs inode. */ 203 i_size_write(vi, new_init_size); 204 ntfs_attr_put_search_ctx(ctx); 205 ctx = NULL; 206 unmap_mft_record(base_ni); 207 m = NULL; 208 } 209 mapping = vi->i_mapping; 210 index = old_init_size >> PAGE_SHIFT; 211 end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 212 do { 213 /* 214 * Read the page. If the page is not present, this will zero 215 * the uninitialized regions for us. 216 */ 217 page = read_mapping_page(mapping, index, NULL); 218 if (IS_ERR(page)) { 219 err = PTR_ERR(page); 220 goto init_err_out; 221 } 222 /* 223 * Update the initialized size in the ntfs inode. This is 224 * enough to make ntfs_writepage() work. 225 */ 226 write_lock_irqsave(&ni->size_lock, flags); 227 ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; 228 if (ni->initialized_size > new_init_size) 229 ni->initialized_size = new_init_size; 230 write_unlock_irqrestore(&ni->size_lock, flags); 231 /* Set the page dirty so it gets written out. */ 232 set_page_dirty(page); 233 put_page(page); 234 /* 235 * Play nice with the vm and the rest of the system. This is 236 * very much needed as we can potentially be modifying the 237 * initialised size from a very small value to a really huge 238 * value, e.g. 239 * f = open(somefile, O_TRUNC); 240 * truncate(f, 10GiB); 241 * seek(f, 10GiB); 242 * write(f, 1); 243 * And this would mean we would be marking dirty hundreds of 244 * thousands of pages or as in the above example more than 245 * two and a half million pages! 246 * 247 * TODO: For sparse pages could optimize this workload by using 248 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This 249 * would be set in read_folio for sparse pages and here we would 250 * not need to mark dirty any pages which have this bit set. 251 * The only caveat is that we have to clear the bit everywhere 252 * where we allocate any clusters that lie in the page or that 253 * contain the page. 254 * 255 * TODO: An even greater optimization would be for us to only 256 * call read_folio() on pages which are not in sparse regions as 257 * determined from the runlist. This would greatly reduce the 258 * number of pages we read and make dirty in the case of sparse 259 * files. 260 */ 261 balance_dirty_pages_ratelimited(mapping); 262 cond_resched(); 263 } while (++index < end_index); 264 read_lock_irqsave(&ni->size_lock, flags); 265 BUG_ON(ni->initialized_size != new_init_size); 266 read_unlock_irqrestore(&ni->size_lock, flags); 267 /* Now bring in sync the initialized_size in the mft record. */ 268 m = map_mft_record(base_ni); 269 if (IS_ERR(m)) { 270 err = PTR_ERR(m); 271 m = NULL; 272 goto init_err_out; 273 } 274 ctx = ntfs_attr_get_search_ctx(base_ni, m); 275 if (unlikely(!ctx)) { 276 err = -ENOMEM; 277 goto init_err_out; 278 } 279 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 280 CASE_SENSITIVE, 0, NULL, 0, ctx); 281 if (unlikely(err)) { 282 if (err == -ENOENT) 283 err = -EIO; 284 goto init_err_out; 285 } 286 m = ctx->mrec; 287 a = ctx->attr; 288 BUG_ON(!a->non_resident); 289 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); 290 done: 291 flush_dcache_mft_record_page(ctx->ntfs_ino); 292 mark_mft_record_dirty(ctx->ntfs_ino); 293 if (ctx) 294 ntfs_attr_put_search_ctx(ctx); 295 if (m) 296 unmap_mft_record(base_ni); 297 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", 298 (unsigned long long)new_init_size, i_size_read(vi)); 299 return 0; 300 init_err_out: 301 write_lock_irqsave(&ni->size_lock, flags); 302 ni->initialized_size = old_init_size; 303 write_unlock_irqrestore(&ni->size_lock, flags); 304 err_out: 305 if (ctx) 306 ntfs_attr_put_search_ctx(ctx); 307 if (m) 308 unmap_mft_record(base_ni); 309 ntfs_debug("Failed. Returning error code %i.", err); 310 return err; 311 } 312 313 static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, 314 struct iov_iter *from) 315 { 316 loff_t pos; 317 s64 end, ll; 318 ssize_t err; 319 unsigned long flags; 320 struct file *file = iocb->ki_filp; 321 struct inode *vi = file_inode(file); 322 ntfs_inode *ni = NTFS_I(vi); 323 ntfs_volume *vol = ni->vol; 324 325 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 326 "0x%llx, count 0x%zx.", vi->i_ino, 327 (unsigned)le32_to_cpu(ni->type), 328 (unsigned long long)iocb->ki_pos, 329 iov_iter_count(from)); 330 err = generic_write_checks(iocb, from); 331 if (unlikely(err <= 0)) 332 goto out; 333 /* 334 * All checks have passed. Before we start doing any writing we want 335 * to abort any totally illegal writes. 336 */ 337 BUG_ON(NInoMstProtected(ni)); 338 BUG_ON(ni->type != AT_DATA); 339 /* If file is encrypted, deny access, just like NT4. */ 340 if (NInoEncrypted(ni)) { 341 /* Only $DATA attributes can be encrypted. */ 342 /* 343 * Reminder for later: Encrypted files are _always_ 344 * non-resident so that the content can always be encrypted. 345 */ 346 ntfs_debug("Denying write access to encrypted file."); 347 err = -EACCES; 348 goto out; 349 } 350 if (NInoCompressed(ni)) { 351 /* Only unnamed $DATA attribute can be compressed. */ 352 BUG_ON(ni->name_len); 353 /* 354 * Reminder for later: If resident, the data is not actually 355 * compressed. Only on the switch to non-resident does 356 * compression kick in. This is in contrast to encrypted files 357 * (see above). 358 */ 359 ntfs_error(vi->i_sb, "Writing to compressed files is not " 360 "implemented yet. Sorry."); 361 err = -EOPNOTSUPP; 362 goto out; 363 } 364 err = file_remove_privs(file); 365 if (unlikely(err)) 366 goto out; 367 /* 368 * Our ->update_time method always succeeds thus file_update_time() 369 * cannot fail either so there is no need to check the return code. 370 */ 371 file_update_time(file); 372 pos = iocb->ki_pos; 373 /* The first byte after the last cluster being written to. */ 374 end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & 375 ~(u64)vol->cluster_size_mask; 376 /* 377 * If the write goes beyond the allocated size, extend the allocation 378 * to cover the whole of the write, rounded up to the nearest cluster. 379 */ 380 read_lock_irqsave(&ni->size_lock, flags); 381 ll = ni->allocated_size; 382 read_unlock_irqrestore(&ni->size_lock, flags); 383 if (end > ll) { 384 /* 385 * Extend the allocation without changing the data size. 386 * 387 * Note we ensure the allocation is big enough to at least 388 * write some data but we do not require the allocation to be 389 * complete, i.e. it may be partial. 390 */ 391 ll = ntfs_attr_extend_allocation(ni, end, -1, pos); 392 if (likely(ll >= 0)) { 393 BUG_ON(pos >= ll); 394 /* If the extension was partial truncate the write. */ 395 if (end > ll) { 396 ntfs_debug("Truncating write to inode 0x%lx, " 397 "attribute type 0x%x, because " 398 "the allocation was only " 399 "partially extended.", 400 vi->i_ino, (unsigned) 401 le32_to_cpu(ni->type)); 402 iov_iter_truncate(from, ll - pos); 403 } 404 } else { 405 err = ll; 406 read_lock_irqsave(&ni->size_lock, flags); 407 ll = ni->allocated_size; 408 read_unlock_irqrestore(&ni->size_lock, flags); 409 /* Perform a partial write if possible or fail. */ 410 if (pos < ll) { 411 ntfs_debug("Truncating write to inode 0x%lx " 412 "attribute type 0x%x, because " 413 "extending the allocation " 414 "failed (error %d).", 415 vi->i_ino, (unsigned) 416 le32_to_cpu(ni->type), 417 (int)-err); 418 iov_iter_truncate(from, ll - pos); 419 } else { 420 if (err != -ENOSPC) 421 ntfs_error(vi->i_sb, "Cannot perform " 422 "write to inode " 423 "0x%lx, attribute " 424 "type 0x%x, because " 425 "extending the " 426 "allocation failed " 427 "(error %ld).", 428 vi->i_ino, (unsigned) 429 le32_to_cpu(ni->type), 430 (long)-err); 431 else 432 ntfs_debug("Cannot perform write to " 433 "inode 0x%lx, " 434 "attribute type 0x%x, " 435 "because there is not " 436 "space left.", 437 vi->i_ino, (unsigned) 438 le32_to_cpu(ni->type)); 439 goto out; 440 } 441 } 442 } 443 /* 444 * If the write starts beyond the initialized size, extend it up to the 445 * beginning of the write and initialize all non-sparse space between 446 * the old initialized size and the new one. This automatically also 447 * increments the vfs inode->i_size to keep it above or equal to the 448 * initialized_size. 449 */ 450 read_lock_irqsave(&ni->size_lock, flags); 451 ll = ni->initialized_size; 452 read_unlock_irqrestore(&ni->size_lock, flags); 453 if (pos > ll) { 454 /* 455 * Wait for ongoing direct i/o to complete before proceeding. 456 * New direct i/o cannot start as we hold i_mutex. 457 */ 458 inode_dio_wait(vi); 459 err = ntfs_attr_extend_initialized(ni, pos); 460 if (unlikely(err < 0)) 461 ntfs_error(vi->i_sb, "Cannot perform write to inode " 462 "0x%lx, attribute type 0x%x, because " 463 "extending the initialized size " 464 "failed (error %d).", vi->i_ino, 465 (unsigned)le32_to_cpu(ni->type), 466 (int)-err); 467 } 468 out: 469 return err; 470 } 471 472 /** 473 * __ntfs_grab_cache_pages - obtain a number of locked pages 474 * @mapping: address space mapping from which to obtain page cache pages 475 * @index: starting index in @mapping at which to begin obtaining pages 476 * @nr_pages: number of page cache pages to obtain 477 * @pages: array of pages in which to return the obtained page cache pages 478 * @cached_page: allocated but as yet unused page 479 * 480 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 481 * starting at index @index. 482 * 483 * If a page is newly created, add it to lru list 484 * 485 * Note, the page locks are obtained in ascending page index order. 486 */ 487 static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 488 pgoff_t index, const unsigned nr_pages, struct page **pages, 489 struct page **cached_page) 490 { 491 int err, nr; 492 493 BUG_ON(!nr_pages); 494 err = nr = 0; 495 do { 496 pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | 497 FGP_ACCESSED); 498 if (!pages[nr]) { 499 if (!*cached_page) { 500 *cached_page = page_cache_alloc(mapping); 501 if (unlikely(!*cached_page)) { 502 err = -ENOMEM; 503 goto err_out; 504 } 505 } 506 err = add_to_page_cache_lru(*cached_page, mapping, 507 index, 508 mapping_gfp_constraint(mapping, GFP_KERNEL)); 509 if (unlikely(err)) { 510 if (err == -EEXIST) 511 continue; 512 goto err_out; 513 } 514 pages[nr] = *cached_page; 515 *cached_page = NULL; 516 } 517 index++; 518 nr++; 519 } while (nr < nr_pages); 520 out: 521 return err; 522 err_out: 523 while (nr > 0) { 524 unlock_page(pages[--nr]); 525 put_page(pages[nr]); 526 } 527 goto out; 528 } 529 530 static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) 531 { 532 lock_buffer(bh); 533 get_bh(bh); 534 bh->b_end_io = end_buffer_read_sync; 535 return submit_bh(REQ_OP_READ, bh); 536 } 537 538 /** 539 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data 540 * @pages: array of destination pages 541 * @nr_pages: number of pages in @pages 542 * @pos: byte position in file at which the write begins 543 * @bytes: number of bytes to be written 544 * 545 * This is called for non-resident attributes from ntfs_file_buffered_write() 546 * with i_mutex held on the inode (@pages[0]->mapping->host). There are 547 * @nr_pages pages in @pages which are locked but not kmap()ped. The source 548 * data has not yet been copied into the @pages. 549 * 550 * Need to fill any holes with actual clusters, allocate buffers if necessary, 551 * ensure all the buffers are mapped, and bring uptodate any buffers that are 552 * only partially being written to. 553 * 554 * If @nr_pages is greater than one, we are guaranteed that the cluster size is 555 * greater than PAGE_SIZE, that all pages in @pages are entirely inside 556 * the same cluster and that they are the entirety of that cluster, and that 557 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. 558 * 559 * i_size is not to be modified yet. 560 * 561 * Return 0 on success or -errno on error. 562 */ 563 static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, 564 unsigned nr_pages, s64 pos, size_t bytes) 565 { 566 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; 567 LCN lcn; 568 s64 bh_pos, vcn_len, end, initialized_size; 569 sector_t lcn_block; 570 struct page *page; 571 struct inode *vi; 572 ntfs_inode *ni, *base_ni = NULL; 573 ntfs_volume *vol; 574 runlist_element *rl, *rl2; 575 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; 576 ntfs_attr_search_ctx *ctx = NULL; 577 MFT_RECORD *m = NULL; 578 ATTR_RECORD *a = NULL; 579 unsigned long flags; 580 u32 attr_rec_len = 0; 581 unsigned blocksize, u; 582 int err, mp_size; 583 bool rl_write_locked, was_hole, is_retry; 584 unsigned char blocksize_bits; 585 struct { 586 u8 runlist_merged:1; 587 u8 mft_attr_mapped:1; 588 u8 mp_rebuilt:1; 589 u8 attr_switched:1; 590 } status = { 0, 0, 0, 0 }; 591 592 BUG_ON(!nr_pages); 593 BUG_ON(!pages); 594 BUG_ON(!*pages); 595 vi = pages[0]->mapping->host; 596 ni = NTFS_I(vi); 597 vol = ni->vol; 598 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 599 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 600 vi->i_ino, ni->type, pages[0]->index, nr_pages, 601 (long long)pos, bytes); 602 blocksize = vol->sb->s_blocksize; 603 blocksize_bits = vol->sb->s_blocksize_bits; 604 u = 0; 605 do { 606 page = pages[u]; 607 BUG_ON(!page); 608 /* 609 * create_empty_buffers() will create uptodate/dirty buffers if 610 * the page is uptodate/dirty. 611 */ 612 if (!page_has_buffers(page)) { 613 create_empty_buffers(page, blocksize, 0); 614 if (unlikely(!page_has_buffers(page))) 615 return -ENOMEM; 616 } 617 } while (++u < nr_pages); 618 rl_write_locked = false; 619 rl = NULL; 620 err = 0; 621 vcn = lcn = -1; 622 vcn_len = 0; 623 lcn_block = -1; 624 was_hole = false; 625 cpos = pos >> vol->cluster_size_bits; 626 end = pos + bytes; 627 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; 628 /* 629 * Loop over each page and for each page over each buffer. Use goto to 630 * reduce indentation. 631 */ 632 u = 0; 633 do_next_page: 634 page = pages[u]; 635 bh_pos = (s64)page->index << PAGE_SHIFT; 636 bh = head = page_buffers(page); 637 do { 638 VCN cdelta; 639 s64 bh_end; 640 unsigned bh_cofs; 641 642 /* Clear buffer_new on all buffers to reinitialise state. */ 643 if (buffer_new(bh)) 644 clear_buffer_new(bh); 645 bh_end = bh_pos + blocksize; 646 bh_cpos = bh_pos >> vol->cluster_size_bits; 647 bh_cofs = bh_pos & vol->cluster_size_mask; 648 if (buffer_mapped(bh)) { 649 /* 650 * The buffer is already mapped. If it is uptodate, 651 * ignore it. 652 */ 653 if (buffer_uptodate(bh)) 654 continue; 655 /* 656 * The buffer is not uptodate. If the page is uptodate 657 * set the buffer uptodate and otherwise ignore it. 658 */ 659 if (PageUptodate(page)) { 660 set_buffer_uptodate(bh); 661 continue; 662 } 663 /* 664 * Neither the page nor the buffer are uptodate. If 665 * the buffer is only partially being written to, we 666 * need to read it in before the write, i.e. now. 667 */ 668 if ((bh_pos < pos && bh_end > pos) || 669 (bh_pos < end && bh_end > end)) { 670 /* 671 * If the buffer is fully or partially within 672 * the initialized size, do an actual read. 673 * Otherwise, simply zero the buffer. 674 */ 675 read_lock_irqsave(&ni->size_lock, flags); 676 initialized_size = ni->initialized_size; 677 read_unlock_irqrestore(&ni->size_lock, flags); 678 if (bh_pos < initialized_size) { 679 ntfs_submit_bh_for_read(bh); 680 *wait_bh++ = bh; 681 } else { 682 zero_user(page, bh_offset(bh), 683 blocksize); 684 set_buffer_uptodate(bh); 685 } 686 } 687 continue; 688 } 689 /* Unmapped buffer. Need to map it. */ 690 bh->b_bdev = vol->sb->s_bdev; 691 /* 692 * If the current buffer is in the same clusters as the map 693 * cache, there is no need to check the runlist again. The 694 * map cache is made up of @vcn, which is the first cached file 695 * cluster, @vcn_len which is the number of cached file 696 * clusters, @lcn is the device cluster corresponding to @vcn, 697 * and @lcn_block is the block number corresponding to @lcn. 698 */ 699 cdelta = bh_cpos - vcn; 700 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { 701 map_buffer_cached: 702 BUG_ON(lcn < 0); 703 bh->b_blocknr = lcn_block + 704 (cdelta << (vol->cluster_size_bits - 705 blocksize_bits)) + 706 (bh_cofs >> blocksize_bits); 707 set_buffer_mapped(bh); 708 /* 709 * If the page is uptodate so is the buffer. If the 710 * buffer is fully outside the write, we ignore it if 711 * it was already allocated and we mark it dirty so it 712 * gets written out if we allocated it. On the other 713 * hand, if we allocated the buffer but we are not 714 * marking it dirty we set buffer_new so we can do 715 * error recovery. 716 */ 717 if (PageUptodate(page)) { 718 if (!buffer_uptodate(bh)) 719 set_buffer_uptodate(bh); 720 if (unlikely(was_hole)) { 721 /* We allocated the buffer. */ 722 clean_bdev_bh_alias(bh); 723 if (bh_end <= pos || bh_pos >= end) 724 mark_buffer_dirty(bh); 725 else 726 set_buffer_new(bh); 727 } 728 continue; 729 } 730 /* Page is _not_ uptodate. */ 731 if (likely(!was_hole)) { 732 /* 733 * Buffer was already allocated. If it is not 734 * uptodate and is only partially being written 735 * to, we need to read it in before the write, 736 * i.e. now. 737 */ 738 if (!buffer_uptodate(bh) && bh_pos < end && 739 bh_end > pos && 740 (bh_pos < pos || 741 bh_end > end)) { 742 /* 743 * If the buffer is fully or partially 744 * within the initialized size, do an 745 * actual read. Otherwise, simply zero 746 * the buffer. 747 */ 748 read_lock_irqsave(&ni->size_lock, 749 flags); 750 initialized_size = ni->initialized_size; 751 read_unlock_irqrestore(&ni->size_lock, 752 flags); 753 if (bh_pos < initialized_size) { 754 ntfs_submit_bh_for_read(bh); 755 *wait_bh++ = bh; 756 } else { 757 zero_user(page, bh_offset(bh), 758 blocksize); 759 set_buffer_uptodate(bh); 760 } 761 } 762 continue; 763 } 764 /* We allocated the buffer. */ 765 clean_bdev_bh_alias(bh); 766 /* 767 * If the buffer is fully outside the write, zero it, 768 * set it uptodate, and mark it dirty so it gets 769 * written out. If it is partially being written to, 770 * zero region surrounding the write but leave it to 771 * commit write to do anything else. Finally, if the 772 * buffer is fully being overwritten, do nothing. 773 */ 774 if (bh_end <= pos || bh_pos >= end) { 775 if (!buffer_uptodate(bh)) { 776 zero_user(page, bh_offset(bh), 777 blocksize); 778 set_buffer_uptodate(bh); 779 } 780 mark_buffer_dirty(bh); 781 continue; 782 } 783 set_buffer_new(bh); 784 if (!buffer_uptodate(bh) && 785 (bh_pos < pos || bh_end > end)) { 786 u8 *kaddr; 787 unsigned pofs; 788 789 kaddr = kmap_atomic(page); 790 if (bh_pos < pos) { 791 pofs = bh_pos & ~PAGE_MASK; 792 memset(kaddr + pofs, 0, pos - bh_pos); 793 } 794 if (bh_end > end) { 795 pofs = end & ~PAGE_MASK; 796 memset(kaddr + pofs, 0, bh_end - end); 797 } 798 kunmap_atomic(kaddr); 799 flush_dcache_page(page); 800 } 801 continue; 802 } 803 /* 804 * Slow path: this is the first buffer in the cluster. If it 805 * is outside allocated size and is not uptodate, zero it and 806 * set it uptodate. 807 */ 808 read_lock_irqsave(&ni->size_lock, flags); 809 initialized_size = ni->allocated_size; 810 read_unlock_irqrestore(&ni->size_lock, flags); 811 if (bh_pos > initialized_size) { 812 if (PageUptodate(page)) { 813 if (!buffer_uptodate(bh)) 814 set_buffer_uptodate(bh); 815 } else if (!buffer_uptodate(bh)) { 816 zero_user(page, bh_offset(bh), blocksize); 817 set_buffer_uptodate(bh); 818 } 819 continue; 820 } 821 is_retry = false; 822 if (!rl) { 823 down_read(&ni->runlist.lock); 824 retry_remap: 825 rl = ni->runlist.rl; 826 } 827 if (likely(rl != NULL)) { 828 /* Seek to element containing target cluster. */ 829 while (rl->length && rl[1].vcn <= bh_cpos) 830 rl++; 831 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); 832 if (likely(lcn >= 0)) { 833 /* 834 * Successful remap, setup the map cache and 835 * use that to deal with the buffer. 836 */ 837 was_hole = false; 838 vcn = bh_cpos; 839 vcn_len = rl[1].vcn - vcn; 840 lcn_block = lcn << (vol->cluster_size_bits - 841 blocksize_bits); 842 cdelta = 0; 843 /* 844 * If the number of remaining clusters touched 845 * by the write is smaller or equal to the 846 * number of cached clusters, unlock the 847 * runlist as the map cache will be used from 848 * now on. 849 */ 850 if (likely(vcn + vcn_len >= cend)) { 851 if (rl_write_locked) { 852 up_write(&ni->runlist.lock); 853 rl_write_locked = false; 854 } else 855 up_read(&ni->runlist.lock); 856 rl = NULL; 857 } 858 goto map_buffer_cached; 859 } 860 } else 861 lcn = LCN_RL_NOT_MAPPED; 862 /* 863 * If it is not a hole and not out of bounds, the runlist is 864 * probably unmapped so try to map it now. 865 */ 866 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { 867 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { 868 /* Attempt to map runlist. */ 869 if (!rl_write_locked) { 870 /* 871 * We need the runlist locked for 872 * writing, so if it is locked for 873 * reading relock it now and retry in 874 * case it changed whilst we dropped 875 * the lock. 876 */ 877 up_read(&ni->runlist.lock); 878 down_write(&ni->runlist.lock); 879 rl_write_locked = true; 880 goto retry_remap; 881 } 882 err = ntfs_map_runlist_nolock(ni, bh_cpos, 883 NULL); 884 if (likely(!err)) { 885 is_retry = true; 886 goto retry_remap; 887 } 888 /* 889 * If @vcn is out of bounds, pretend @lcn is 890 * LCN_ENOENT. As long as the buffer is out 891 * of bounds this will work fine. 892 */ 893 if (err == -ENOENT) { 894 lcn = LCN_ENOENT; 895 err = 0; 896 goto rl_not_mapped_enoent; 897 } 898 } else 899 err = -EIO; 900 /* Failed to map the buffer, even after retrying. */ 901 bh->b_blocknr = -1; 902 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " 903 "attribute type 0x%x, vcn 0x%llx, " 904 "vcn offset 0x%x, because its " 905 "location on disk could not be " 906 "determined%s (error code %i).", 907 ni->mft_no, ni->type, 908 (unsigned long long)bh_cpos, 909 (unsigned)bh_pos & 910 vol->cluster_size_mask, 911 is_retry ? " even after retrying" : "", 912 err); 913 break; 914 } 915 rl_not_mapped_enoent: 916 /* 917 * The buffer is in a hole or out of bounds. We need to fill 918 * the hole, unless the buffer is in a cluster which is not 919 * touched by the write, in which case we just leave the buffer 920 * unmapped. This can only happen when the cluster size is 921 * less than the page cache size. 922 */ 923 if (unlikely(vol->cluster_size < PAGE_SIZE)) { 924 bh_cend = (bh_end + vol->cluster_size - 1) >> 925 vol->cluster_size_bits; 926 if ((bh_cend <= cpos || bh_cpos >= cend)) { 927 bh->b_blocknr = -1; 928 /* 929 * If the buffer is uptodate we skip it. If it 930 * is not but the page is uptodate, we can set 931 * the buffer uptodate. If the page is not 932 * uptodate, we can clear the buffer and set it 933 * uptodate. Whether this is worthwhile is 934 * debatable and this could be removed. 935 */ 936 if (PageUptodate(page)) { 937 if (!buffer_uptodate(bh)) 938 set_buffer_uptodate(bh); 939 } else if (!buffer_uptodate(bh)) { 940 zero_user(page, bh_offset(bh), 941 blocksize); 942 set_buffer_uptodate(bh); 943 } 944 continue; 945 } 946 } 947 /* 948 * Out of bounds buffer is invalid if it was not really out of 949 * bounds. 950 */ 951 BUG_ON(lcn != LCN_HOLE); 952 /* 953 * We need the runlist locked for writing, so if it is locked 954 * for reading relock it now and retry in case it changed 955 * whilst we dropped the lock. 956 */ 957 BUG_ON(!rl); 958 if (!rl_write_locked) { 959 up_read(&ni->runlist.lock); 960 down_write(&ni->runlist.lock); 961 rl_write_locked = true; 962 goto retry_remap; 963 } 964 /* Find the previous last allocated cluster. */ 965 BUG_ON(rl->lcn != LCN_HOLE); 966 lcn = -1; 967 rl2 = rl; 968 while (--rl2 >= ni->runlist.rl) { 969 if (rl2->lcn >= 0) { 970 lcn = rl2->lcn + rl2->length; 971 break; 972 } 973 } 974 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, 975 false); 976 if (IS_ERR(rl2)) { 977 err = PTR_ERR(rl2); 978 ntfs_debug("Failed to allocate cluster, error code %i.", 979 err); 980 break; 981 } 982 lcn = rl2->lcn; 983 rl = ntfs_runlists_merge(ni->runlist.rl, rl2); 984 if (IS_ERR(rl)) { 985 err = PTR_ERR(rl); 986 if (err != -ENOMEM) 987 err = -EIO; 988 if (ntfs_cluster_free_from_rl(vol, rl2)) { 989 ntfs_error(vol->sb, "Failed to release " 990 "allocated cluster in error " 991 "code path. Run chkdsk to " 992 "recover the lost cluster."); 993 NVolSetErrors(vol); 994 } 995 ntfs_free(rl2); 996 break; 997 } 998 ni->runlist.rl = rl; 999 status.runlist_merged = 1; 1000 ntfs_debug("Allocated cluster, lcn 0x%llx.", 1001 (unsigned long long)lcn); 1002 /* Map and lock the mft record and get the attribute record. */ 1003 if (!NInoAttr(ni)) 1004 base_ni = ni; 1005 else 1006 base_ni = ni->ext.base_ntfs_ino; 1007 m = map_mft_record(base_ni); 1008 if (IS_ERR(m)) { 1009 err = PTR_ERR(m); 1010 break; 1011 } 1012 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1013 if (unlikely(!ctx)) { 1014 err = -ENOMEM; 1015 unmap_mft_record(base_ni); 1016 break; 1017 } 1018 status.mft_attr_mapped = 1; 1019 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1020 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); 1021 if (unlikely(err)) { 1022 if (err == -ENOENT) 1023 err = -EIO; 1024 break; 1025 } 1026 m = ctx->mrec; 1027 a = ctx->attr; 1028 /* 1029 * Find the runlist element with which the attribute extent 1030 * starts. Note, we cannot use the _attr_ version because we 1031 * have mapped the mft record. That is ok because we know the 1032 * runlist fragment must be mapped already to have ever gotten 1033 * here, so we can just use the _rl_ version. 1034 */ 1035 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); 1036 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); 1037 BUG_ON(!rl2); 1038 BUG_ON(!rl2->length); 1039 BUG_ON(rl2->lcn < LCN_HOLE); 1040 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); 1041 /* 1042 * If @highest_vcn is zero, calculate the real highest_vcn 1043 * (which can really be zero). 1044 */ 1045 if (!highest_vcn) 1046 highest_vcn = (sle64_to_cpu( 1047 a->data.non_resident.allocated_size) >> 1048 vol->cluster_size_bits) - 1; 1049 /* 1050 * Determine the size of the mapping pairs array for the new 1051 * extent, i.e. the old extent with the hole filled. 1052 */ 1053 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, 1054 highest_vcn); 1055 if (unlikely(mp_size <= 0)) { 1056 if (!(err = mp_size)) 1057 err = -EIO; 1058 ntfs_debug("Failed to get size for mapping pairs " 1059 "array, error code %i.", err); 1060 break; 1061 } 1062 /* 1063 * Resize the attribute record to fit the new mapping pairs 1064 * array. 1065 */ 1066 attr_rec_len = le32_to_cpu(a->length); 1067 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( 1068 a->data.non_resident.mapping_pairs_offset)); 1069 if (unlikely(err)) { 1070 BUG_ON(err != -ENOSPC); 1071 // TODO: Deal with this by using the current attribute 1072 // and fill it with as much of the mapping pairs 1073 // array as possible. Then loop over each attribute 1074 // extent rewriting the mapping pairs arrays as we go 1075 // along and if when we reach the end we have not 1076 // enough space, try to resize the last attribute 1077 // extent and if even that fails, add a new attribute 1078 // extent. 1079 // We could also try to resize at each step in the hope 1080 // that we will not need to rewrite every single extent. 1081 // Note, we may need to decompress some extents to fill 1082 // the runlist as we are walking the extents... 1083 ntfs_error(vol->sb, "Not enough space in the mft " 1084 "record for the extended attribute " 1085 "record. This case is not " 1086 "implemented yet."); 1087 err = -EOPNOTSUPP; 1088 break ; 1089 } 1090 status.mp_rebuilt = 1; 1091 /* 1092 * Generate the mapping pairs array directly into the attribute 1093 * record. 1094 */ 1095 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1096 a->data.non_resident.mapping_pairs_offset), 1097 mp_size, rl2, vcn, highest_vcn, NULL); 1098 if (unlikely(err)) { 1099 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " 1100 "attribute type 0x%x, because building " 1101 "the mapping pairs failed with error " 1102 "code %i.", vi->i_ino, 1103 (unsigned)le32_to_cpu(ni->type), err); 1104 err = -EIO; 1105 break; 1106 } 1107 /* Update the highest_vcn but only if it was not set. */ 1108 if (unlikely(!a->data.non_resident.highest_vcn)) 1109 a->data.non_resident.highest_vcn = 1110 cpu_to_sle64(highest_vcn); 1111 /* 1112 * If the attribute is sparse/compressed, update the compressed 1113 * size in the ntfs_inode structure and the attribute record. 1114 */ 1115 if (likely(NInoSparse(ni) || NInoCompressed(ni))) { 1116 /* 1117 * If we are not in the first attribute extent, switch 1118 * to it, but first ensure the changes will make it to 1119 * disk later. 1120 */ 1121 if (a->data.non_resident.lowest_vcn) { 1122 flush_dcache_mft_record_page(ctx->ntfs_ino); 1123 mark_mft_record_dirty(ctx->ntfs_ino); 1124 ntfs_attr_reinit_search_ctx(ctx); 1125 err = ntfs_attr_lookup(ni->type, ni->name, 1126 ni->name_len, CASE_SENSITIVE, 1127 0, NULL, 0, ctx); 1128 if (unlikely(err)) { 1129 status.attr_switched = 1; 1130 break; 1131 } 1132 /* @m is not used any more so do not set it. */ 1133 a = ctx->attr; 1134 } 1135 write_lock_irqsave(&ni->size_lock, flags); 1136 ni->itype.compressed.size += vol->cluster_size; 1137 a->data.non_resident.compressed_size = 1138 cpu_to_sle64(ni->itype.compressed.size); 1139 write_unlock_irqrestore(&ni->size_lock, flags); 1140 } 1141 /* Ensure the changes make it to disk. */ 1142 flush_dcache_mft_record_page(ctx->ntfs_ino); 1143 mark_mft_record_dirty(ctx->ntfs_ino); 1144 ntfs_attr_put_search_ctx(ctx); 1145 unmap_mft_record(base_ni); 1146 /* Successfully filled the hole. */ 1147 status.runlist_merged = 0; 1148 status.mft_attr_mapped = 0; 1149 status.mp_rebuilt = 0; 1150 /* Setup the map cache and use that to deal with the buffer. */ 1151 was_hole = true; 1152 vcn = bh_cpos; 1153 vcn_len = 1; 1154 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); 1155 cdelta = 0; 1156 /* 1157 * If the number of remaining clusters in the @pages is smaller 1158 * or equal to the number of cached clusters, unlock the 1159 * runlist as the map cache will be used from now on. 1160 */ 1161 if (likely(vcn + vcn_len >= cend)) { 1162 up_write(&ni->runlist.lock); 1163 rl_write_locked = false; 1164 rl = NULL; 1165 } 1166 goto map_buffer_cached; 1167 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1168 /* If there are no errors, do the next page. */ 1169 if (likely(!err && ++u < nr_pages)) 1170 goto do_next_page; 1171 /* If there are no errors, release the runlist lock if we took it. */ 1172 if (likely(!err)) { 1173 if (unlikely(rl_write_locked)) { 1174 up_write(&ni->runlist.lock); 1175 rl_write_locked = false; 1176 } else if (unlikely(rl)) 1177 up_read(&ni->runlist.lock); 1178 rl = NULL; 1179 } 1180 /* If we issued read requests, let them complete. */ 1181 read_lock_irqsave(&ni->size_lock, flags); 1182 initialized_size = ni->initialized_size; 1183 read_unlock_irqrestore(&ni->size_lock, flags); 1184 while (wait_bh > wait) { 1185 bh = *--wait_bh; 1186 wait_on_buffer(bh); 1187 if (likely(buffer_uptodate(bh))) { 1188 page = bh->b_page; 1189 bh_pos = ((s64)page->index << PAGE_SHIFT) + 1190 bh_offset(bh); 1191 /* 1192 * If the buffer overflows the initialized size, need 1193 * to zero the overflowing region. 1194 */ 1195 if (unlikely(bh_pos + blocksize > initialized_size)) { 1196 int ofs = 0; 1197 1198 if (likely(bh_pos < initialized_size)) 1199 ofs = initialized_size - bh_pos; 1200 zero_user_segment(page, bh_offset(bh) + ofs, 1201 blocksize); 1202 } 1203 } else /* if (unlikely(!buffer_uptodate(bh))) */ 1204 err = -EIO; 1205 } 1206 if (likely(!err)) { 1207 /* Clear buffer_new on all buffers. */ 1208 u = 0; 1209 do { 1210 bh = head = page_buffers(pages[u]); 1211 do { 1212 if (buffer_new(bh)) 1213 clear_buffer_new(bh); 1214 } while ((bh = bh->b_this_page) != head); 1215 } while (++u < nr_pages); 1216 ntfs_debug("Done."); 1217 return err; 1218 } 1219 if (status.attr_switched) { 1220 /* Get back to the attribute extent we modified. */ 1221 ntfs_attr_reinit_search_ctx(ctx); 1222 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1223 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { 1224 ntfs_error(vol->sb, "Failed to find required " 1225 "attribute extent of attribute in " 1226 "error code path. Run chkdsk to " 1227 "recover."); 1228 write_lock_irqsave(&ni->size_lock, flags); 1229 ni->itype.compressed.size += vol->cluster_size; 1230 write_unlock_irqrestore(&ni->size_lock, flags); 1231 flush_dcache_mft_record_page(ctx->ntfs_ino); 1232 mark_mft_record_dirty(ctx->ntfs_ino); 1233 /* 1234 * The only thing that is now wrong is the compressed 1235 * size of the base attribute extent which chkdsk 1236 * should be able to fix. 1237 */ 1238 NVolSetErrors(vol); 1239 } else { 1240 m = ctx->mrec; 1241 a = ctx->attr; 1242 status.attr_switched = 0; 1243 } 1244 } 1245 /* 1246 * If the runlist has been modified, need to restore it by punching a 1247 * hole into it and we then need to deallocate the on-disk cluster as 1248 * well. Note, we only modify the runlist if we are able to generate a 1249 * new mapping pairs array, i.e. only when the mapped attribute extent 1250 * is not switched. 1251 */ 1252 if (status.runlist_merged && !status.attr_switched) { 1253 BUG_ON(!rl_write_locked); 1254 /* Make the file cluster we allocated sparse in the runlist. */ 1255 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { 1256 ntfs_error(vol->sb, "Failed to punch hole into " 1257 "attribute runlist in error code " 1258 "path. Run chkdsk to recover the " 1259 "lost cluster."); 1260 NVolSetErrors(vol); 1261 } else /* if (success) */ { 1262 status.runlist_merged = 0; 1263 /* 1264 * Deallocate the on-disk cluster we allocated but only 1265 * if we succeeded in punching its vcn out of the 1266 * runlist. 1267 */ 1268 down_write(&vol->lcnbmp_lock); 1269 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { 1270 ntfs_error(vol->sb, "Failed to release " 1271 "allocated cluster in error " 1272 "code path. Run chkdsk to " 1273 "recover the lost cluster."); 1274 NVolSetErrors(vol); 1275 } 1276 up_write(&vol->lcnbmp_lock); 1277 } 1278 } 1279 /* 1280 * Resize the attribute record to its old size and rebuild the mapping 1281 * pairs array. Note, we only can do this if the runlist has been 1282 * restored to its old state which also implies that the mapped 1283 * attribute extent is not switched. 1284 */ 1285 if (status.mp_rebuilt && !status.runlist_merged) { 1286 if (ntfs_attr_record_resize(m, a, attr_rec_len)) { 1287 ntfs_error(vol->sb, "Failed to restore attribute " 1288 "record in error code path. Run " 1289 "chkdsk to recover."); 1290 NVolSetErrors(vol); 1291 } else /* if (success) */ { 1292 if (ntfs_mapping_pairs_build(vol, (u8*)a + 1293 le16_to_cpu(a->data.non_resident. 1294 mapping_pairs_offset), attr_rec_len - 1295 le16_to_cpu(a->data.non_resident. 1296 mapping_pairs_offset), ni->runlist.rl, 1297 vcn, highest_vcn, NULL)) { 1298 ntfs_error(vol->sb, "Failed to restore " 1299 "mapping pairs array in error " 1300 "code path. Run chkdsk to " 1301 "recover."); 1302 NVolSetErrors(vol); 1303 } 1304 flush_dcache_mft_record_page(ctx->ntfs_ino); 1305 mark_mft_record_dirty(ctx->ntfs_ino); 1306 } 1307 } 1308 /* Release the mft record and the attribute. */ 1309 if (status.mft_attr_mapped) { 1310 ntfs_attr_put_search_ctx(ctx); 1311 unmap_mft_record(base_ni); 1312 } 1313 /* Release the runlist lock. */ 1314 if (rl_write_locked) 1315 up_write(&ni->runlist.lock); 1316 else if (rl) 1317 up_read(&ni->runlist.lock); 1318 /* 1319 * Zero out any newly allocated blocks to avoid exposing stale data. 1320 * If BH_New is set, we know that the block was newly allocated above 1321 * and that it has not been fully zeroed and marked dirty yet. 1322 */ 1323 nr_pages = u; 1324 u = 0; 1325 end = bh_cpos << vol->cluster_size_bits; 1326 do { 1327 page = pages[u]; 1328 bh = head = page_buffers(page); 1329 do { 1330 if (u == nr_pages && 1331 ((s64)page->index << PAGE_SHIFT) + 1332 bh_offset(bh) >= end) 1333 break; 1334 if (!buffer_new(bh)) 1335 continue; 1336 clear_buffer_new(bh); 1337 if (!buffer_uptodate(bh)) { 1338 if (PageUptodate(page)) 1339 set_buffer_uptodate(bh); 1340 else { 1341 zero_user(page, bh_offset(bh), 1342 blocksize); 1343 set_buffer_uptodate(bh); 1344 } 1345 } 1346 mark_buffer_dirty(bh); 1347 } while ((bh = bh->b_this_page) != head); 1348 } while (++u <= nr_pages); 1349 ntfs_error(vol->sb, "Failed. Returning error code %i.", err); 1350 return err; 1351 } 1352 1353 static inline void ntfs_flush_dcache_pages(struct page **pages, 1354 unsigned nr_pages) 1355 { 1356 BUG_ON(!nr_pages); 1357 /* 1358 * Warning: Do not do the decrement at the same time as the call to 1359 * flush_dcache_page() because it is a NULL macro on i386 and hence the 1360 * decrement never happens so the loop never terminates. 1361 */ 1362 do { 1363 --nr_pages; 1364 flush_dcache_page(pages[nr_pages]); 1365 } while (nr_pages > 0); 1366 } 1367 1368 /** 1369 * ntfs_commit_pages_after_non_resident_write - commit the received data 1370 * @pages: array of destination pages 1371 * @nr_pages: number of pages in @pages 1372 * @pos: byte position in file at which the write begins 1373 * @bytes: number of bytes to be written 1374 * 1375 * See description of ntfs_commit_pages_after_write(), below. 1376 */ 1377 static inline int ntfs_commit_pages_after_non_resident_write( 1378 struct page **pages, const unsigned nr_pages, 1379 s64 pos, size_t bytes) 1380 { 1381 s64 end, initialized_size; 1382 struct inode *vi; 1383 ntfs_inode *ni, *base_ni; 1384 struct buffer_head *bh, *head; 1385 ntfs_attr_search_ctx *ctx; 1386 MFT_RECORD *m; 1387 ATTR_RECORD *a; 1388 unsigned long flags; 1389 unsigned blocksize, u; 1390 int err; 1391 1392 vi = pages[0]->mapping->host; 1393 ni = NTFS_I(vi); 1394 blocksize = vi->i_sb->s_blocksize; 1395 end = pos + bytes; 1396 u = 0; 1397 do { 1398 s64 bh_pos; 1399 struct page *page; 1400 bool partial; 1401 1402 page = pages[u]; 1403 bh_pos = (s64)page->index << PAGE_SHIFT; 1404 bh = head = page_buffers(page); 1405 partial = false; 1406 do { 1407 s64 bh_end; 1408 1409 bh_end = bh_pos + blocksize; 1410 if (bh_end <= pos || bh_pos >= end) { 1411 if (!buffer_uptodate(bh)) 1412 partial = true; 1413 } else { 1414 set_buffer_uptodate(bh); 1415 mark_buffer_dirty(bh); 1416 } 1417 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1418 /* 1419 * If all buffers are now uptodate but the page is not, set the 1420 * page uptodate. 1421 */ 1422 if (!partial && !PageUptodate(page)) 1423 SetPageUptodate(page); 1424 } while (++u < nr_pages); 1425 /* 1426 * Finally, if we do not need to update initialized_size or i_size we 1427 * are finished. 1428 */ 1429 read_lock_irqsave(&ni->size_lock, flags); 1430 initialized_size = ni->initialized_size; 1431 read_unlock_irqrestore(&ni->size_lock, flags); 1432 if (end <= initialized_size) { 1433 ntfs_debug("Done."); 1434 return 0; 1435 } 1436 /* 1437 * Update initialized_size/i_size as appropriate, both in the inode and 1438 * the mft record. 1439 */ 1440 if (!NInoAttr(ni)) 1441 base_ni = ni; 1442 else 1443 base_ni = ni->ext.base_ntfs_ino; 1444 /* Map, pin, and lock the mft record. */ 1445 m = map_mft_record(base_ni); 1446 if (IS_ERR(m)) { 1447 err = PTR_ERR(m); 1448 m = NULL; 1449 ctx = NULL; 1450 goto err_out; 1451 } 1452 BUG_ON(!NInoNonResident(ni)); 1453 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1454 if (unlikely(!ctx)) { 1455 err = -ENOMEM; 1456 goto err_out; 1457 } 1458 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1459 CASE_SENSITIVE, 0, NULL, 0, ctx); 1460 if (unlikely(err)) { 1461 if (err == -ENOENT) 1462 err = -EIO; 1463 goto err_out; 1464 } 1465 a = ctx->attr; 1466 BUG_ON(!a->non_resident); 1467 write_lock_irqsave(&ni->size_lock, flags); 1468 BUG_ON(end > ni->allocated_size); 1469 ni->initialized_size = end; 1470 a->data.non_resident.initialized_size = cpu_to_sle64(end); 1471 if (end > i_size_read(vi)) { 1472 i_size_write(vi, end); 1473 a->data.non_resident.data_size = 1474 a->data.non_resident.initialized_size; 1475 } 1476 write_unlock_irqrestore(&ni->size_lock, flags); 1477 /* Mark the mft record dirty, so it gets written back. */ 1478 flush_dcache_mft_record_page(ctx->ntfs_ino); 1479 mark_mft_record_dirty(ctx->ntfs_ino); 1480 ntfs_attr_put_search_ctx(ctx); 1481 unmap_mft_record(base_ni); 1482 ntfs_debug("Done."); 1483 return 0; 1484 err_out: 1485 if (ctx) 1486 ntfs_attr_put_search_ctx(ctx); 1487 if (m) 1488 unmap_mft_record(base_ni); 1489 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " 1490 "code %i).", err); 1491 if (err != -ENOMEM) 1492 NVolSetErrors(ni->vol); 1493 return err; 1494 } 1495 1496 /** 1497 * ntfs_commit_pages_after_write - commit the received data 1498 * @pages: array of destination pages 1499 * @nr_pages: number of pages in @pages 1500 * @pos: byte position in file at which the write begins 1501 * @bytes: number of bytes to be written 1502 * 1503 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode 1504 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are 1505 * locked but not kmap()ped. The source data has already been copied into the 1506 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before 1507 * the data was copied (for non-resident attributes only) and it returned 1508 * success. 1509 * 1510 * Need to set uptodate and mark dirty all buffers within the boundary of the 1511 * write. If all buffers in a page are uptodate we set the page uptodate, too. 1512 * 1513 * Setting the buffers dirty ensures that they get written out later when 1514 * ntfs_writepage() is invoked by the VM. 1515 * 1516 * Finally, we need to update i_size and initialized_size as appropriate both 1517 * in the inode and the mft record. 1518 * 1519 * This is modelled after fs/buffer.c::generic_commit_write(), which marks 1520 * buffers uptodate and dirty, sets the page uptodate if all buffers in the 1521 * page are uptodate, and updates i_size if the end of io is beyond i_size. In 1522 * that case, it also marks the inode dirty. 1523 * 1524 * If things have gone as outlined in 1525 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page 1526 * content modifications here for non-resident attributes. For resident 1527 * attributes we need to do the uptodate bringing here which we combine with 1528 * the copying into the mft record which means we save one atomic kmap. 1529 * 1530 * Return 0 on success or -errno on error. 1531 */ 1532 static int ntfs_commit_pages_after_write(struct page **pages, 1533 const unsigned nr_pages, s64 pos, size_t bytes) 1534 { 1535 s64 end, initialized_size; 1536 loff_t i_size; 1537 struct inode *vi; 1538 ntfs_inode *ni, *base_ni; 1539 struct page *page; 1540 ntfs_attr_search_ctx *ctx; 1541 MFT_RECORD *m; 1542 ATTR_RECORD *a; 1543 char *kattr, *kaddr; 1544 unsigned long flags; 1545 u32 attr_len; 1546 int err; 1547 1548 BUG_ON(!nr_pages); 1549 BUG_ON(!pages); 1550 page = pages[0]; 1551 BUG_ON(!page); 1552 vi = page->mapping->host; 1553 ni = NTFS_I(vi); 1554 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 1555 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 1556 vi->i_ino, ni->type, page->index, nr_pages, 1557 (long long)pos, bytes); 1558 if (NInoNonResident(ni)) 1559 return ntfs_commit_pages_after_non_resident_write(pages, 1560 nr_pages, pos, bytes); 1561 BUG_ON(nr_pages > 1); 1562 /* 1563 * Attribute is resident, implying it is not compressed, encrypted, or 1564 * sparse. 1565 */ 1566 if (!NInoAttr(ni)) 1567 base_ni = ni; 1568 else 1569 base_ni = ni->ext.base_ntfs_ino; 1570 BUG_ON(NInoNonResident(ni)); 1571 /* Map, pin, and lock the mft record. */ 1572 m = map_mft_record(base_ni); 1573 if (IS_ERR(m)) { 1574 err = PTR_ERR(m); 1575 m = NULL; 1576 ctx = NULL; 1577 goto err_out; 1578 } 1579 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1580 if (unlikely(!ctx)) { 1581 err = -ENOMEM; 1582 goto err_out; 1583 } 1584 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1585 CASE_SENSITIVE, 0, NULL, 0, ctx); 1586 if (unlikely(err)) { 1587 if (err == -ENOENT) 1588 err = -EIO; 1589 goto err_out; 1590 } 1591 a = ctx->attr; 1592 BUG_ON(a->non_resident); 1593 /* The total length of the attribute value. */ 1594 attr_len = le32_to_cpu(a->data.resident.value_length); 1595 i_size = i_size_read(vi); 1596 BUG_ON(attr_len != i_size); 1597 BUG_ON(pos > attr_len); 1598 end = pos + bytes; 1599 BUG_ON(end > le32_to_cpu(a->length) - 1600 le16_to_cpu(a->data.resident.value_offset)); 1601 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 1602 kaddr = kmap_atomic(page); 1603 /* Copy the received data from the page to the mft record. */ 1604 memcpy(kattr + pos, kaddr + pos, bytes); 1605 /* Update the attribute length if necessary. */ 1606 if (end > attr_len) { 1607 attr_len = end; 1608 a->data.resident.value_length = cpu_to_le32(attr_len); 1609 } 1610 /* 1611 * If the page is not uptodate, bring the out of bounds area(s) 1612 * uptodate by copying data from the mft record to the page. 1613 */ 1614 if (!PageUptodate(page)) { 1615 if (pos > 0) 1616 memcpy(kaddr, kattr, pos); 1617 if (end < attr_len) 1618 memcpy(kaddr + end, kattr + end, attr_len - end); 1619 /* Zero the region outside the end of the attribute value. */ 1620 memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); 1621 flush_dcache_page(page); 1622 SetPageUptodate(page); 1623 } 1624 kunmap_atomic(kaddr); 1625 /* Update initialized_size/i_size if necessary. */ 1626 read_lock_irqsave(&ni->size_lock, flags); 1627 initialized_size = ni->initialized_size; 1628 BUG_ON(end > ni->allocated_size); 1629 read_unlock_irqrestore(&ni->size_lock, flags); 1630 BUG_ON(initialized_size != i_size); 1631 if (end > initialized_size) { 1632 write_lock_irqsave(&ni->size_lock, flags); 1633 ni->initialized_size = end; 1634 i_size_write(vi, end); 1635 write_unlock_irqrestore(&ni->size_lock, flags); 1636 } 1637 /* Mark the mft record dirty, so it gets written back. */ 1638 flush_dcache_mft_record_page(ctx->ntfs_ino); 1639 mark_mft_record_dirty(ctx->ntfs_ino); 1640 ntfs_attr_put_search_ctx(ctx); 1641 unmap_mft_record(base_ni); 1642 ntfs_debug("Done."); 1643 return 0; 1644 err_out: 1645 if (err == -ENOMEM) { 1646 ntfs_warning(vi->i_sb, "Error allocating memory required to " 1647 "commit the write."); 1648 if (PageUptodate(page)) { 1649 ntfs_warning(vi->i_sb, "Page is uptodate, setting " 1650 "dirty so the write will be retried " 1651 "later on by the VM."); 1652 /* 1653 * Put the page on mapping->dirty_pages, but leave its 1654 * buffers' dirty state as-is. 1655 */ 1656 __set_page_dirty_nobuffers(page); 1657 err = 0; 1658 } else 1659 ntfs_error(vi->i_sb, "Page is not uptodate. Written " 1660 "data has been lost."); 1661 } else { 1662 ntfs_error(vi->i_sb, "Resident attribute commit write failed " 1663 "with error %i.", err); 1664 NVolSetErrors(ni->vol); 1665 } 1666 if (ctx) 1667 ntfs_attr_put_search_ctx(ctx); 1668 if (m) 1669 unmap_mft_record(base_ni); 1670 return err; 1671 } 1672 1673 /* 1674 * Copy as much as we can into the pages and return the number of bytes which 1675 * were successfully copied. If a fault is encountered then clear the pages 1676 * out to (ofs + bytes) and return the number of bytes which were copied. 1677 */ 1678 static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, 1679 unsigned ofs, struct iov_iter *i, size_t bytes) 1680 { 1681 struct page **last_page = pages + nr_pages; 1682 size_t total = 0; 1683 unsigned len, copied; 1684 1685 do { 1686 len = PAGE_SIZE - ofs; 1687 if (len > bytes) 1688 len = bytes; 1689 copied = copy_page_from_iter_atomic(*pages, ofs, len, i); 1690 total += copied; 1691 bytes -= copied; 1692 if (!bytes) 1693 break; 1694 if (copied < len) 1695 goto err; 1696 ofs = 0; 1697 } while (++pages < last_page); 1698 out: 1699 return total; 1700 err: 1701 /* Zero the rest of the target like __copy_from_user(). */ 1702 len = PAGE_SIZE - copied; 1703 do { 1704 if (len > bytes) 1705 len = bytes; 1706 zero_user(*pages, copied, len); 1707 bytes -= len; 1708 copied = 0; 1709 len = PAGE_SIZE; 1710 } while (++pages < last_page); 1711 goto out; 1712 } 1713 1714 /** 1715 * ntfs_perform_write - perform buffered write to a file 1716 * @file: file to write to 1717 * @i: iov_iter with data to write 1718 * @pos: byte offset in file at which to begin writing to 1719 */ 1720 static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, 1721 loff_t pos) 1722 { 1723 struct address_space *mapping = file->f_mapping; 1724 struct inode *vi = mapping->host; 1725 ntfs_inode *ni = NTFS_I(vi); 1726 ntfs_volume *vol = ni->vol; 1727 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; 1728 struct page *cached_page = NULL; 1729 VCN last_vcn; 1730 LCN lcn; 1731 size_t bytes; 1732 ssize_t status, written = 0; 1733 unsigned nr_pages; 1734 1735 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 1736 "0x%llx, count 0x%lx.", vi->i_ino, 1737 (unsigned)le32_to_cpu(ni->type), 1738 (unsigned long long)pos, 1739 (unsigned long)iov_iter_count(i)); 1740 /* 1741 * If a previous ntfs_truncate() failed, repeat it and abort if it 1742 * fails again. 1743 */ 1744 if (unlikely(NInoTruncateFailed(ni))) { 1745 int err; 1746 1747 inode_dio_wait(vi); 1748 err = ntfs_truncate(vi); 1749 if (err || NInoTruncateFailed(ni)) { 1750 if (!err) 1751 err = -EIO; 1752 ntfs_error(vol->sb, "Cannot perform write to inode " 1753 "0x%lx, attribute type 0x%x, because " 1754 "ntfs_truncate() failed (error code " 1755 "%i).", vi->i_ino, 1756 (unsigned)le32_to_cpu(ni->type), err); 1757 return err; 1758 } 1759 } 1760 /* 1761 * Determine the number of pages per cluster for non-resident 1762 * attributes. 1763 */ 1764 nr_pages = 1; 1765 if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) 1766 nr_pages = vol->cluster_size >> PAGE_SHIFT; 1767 last_vcn = -1; 1768 do { 1769 VCN vcn; 1770 pgoff_t start_idx; 1771 unsigned ofs, do_pages, u; 1772 size_t copied; 1773 1774 start_idx = pos >> PAGE_SHIFT; 1775 ofs = pos & ~PAGE_MASK; 1776 bytes = PAGE_SIZE - ofs; 1777 do_pages = 1; 1778 if (nr_pages > 1) { 1779 vcn = pos >> vol->cluster_size_bits; 1780 if (vcn != last_vcn) { 1781 last_vcn = vcn; 1782 /* 1783 * Get the lcn of the vcn the write is in. If 1784 * it is a hole, need to lock down all pages in 1785 * the cluster. 1786 */ 1787 down_read(&ni->runlist.lock); 1788 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> 1789 vol->cluster_size_bits, false); 1790 up_read(&ni->runlist.lock); 1791 if (unlikely(lcn < LCN_HOLE)) { 1792 if (lcn == LCN_ENOMEM) 1793 status = -ENOMEM; 1794 else { 1795 status = -EIO; 1796 ntfs_error(vol->sb, "Cannot " 1797 "perform write to " 1798 "inode 0x%lx, " 1799 "attribute type 0x%x, " 1800 "because the attribute " 1801 "is corrupt.", 1802 vi->i_ino, (unsigned) 1803 le32_to_cpu(ni->type)); 1804 } 1805 break; 1806 } 1807 if (lcn == LCN_HOLE) { 1808 start_idx = (pos & ~(s64) 1809 vol->cluster_size_mask) 1810 >> PAGE_SHIFT; 1811 bytes = vol->cluster_size - (pos & 1812 vol->cluster_size_mask); 1813 do_pages = nr_pages; 1814 } 1815 } 1816 } 1817 if (bytes > iov_iter_count(i)) 1818 bytes = iov_iter_count(i); 1819 again: 1820 /* 1821 * Bring in the user page(s) that we will copy from _first_. 1822 * Otherwise there is a nasty deadlock on copying from the same 1823 * page(s) as we are writing to, without it/them being marked 1824 * up-to-date. Note, at present there is nothing to stop the 1825 * pages being swapped out between us bringing them into memory 1826 * and doing the actual copying. 1827 */ 1828 if (unlikely(fault_in_iov_iter_readable(i, bytes))) { 1829 status = -EFAULT; 1830 break; 1831 } 1832 /* Get and lock @do_pages starting at index @start_idx. */ 1833 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1834 pages, &cached_page); 1835 if (unlikely(status)) 1836 break; 1837 /* 1838 * For non-resident attributes, we need to fill any holes with 1839 * actual clusters and ensure all bufferes are mapped. We also 1840 * need to bring uptodate any buffers that are only partially 1841 * being written to. 1842 */ 1843 if (NInoNonResident(ni)) { 1844 status = ntfs_prepare_pages_for_non_resident_write( 1845 pages, do_pages, pos, bytes); 1846 if (unlikely(status)) { 1847 do { 1848 unlock_page(pages[--do_pages]); 1849 put_page(pages[do_pages]); 1850 } while (do_pages); 1851 break; 1852 } 1853 } 1854 u = (pos >> PAGE_SHIFT) - pages[0]->index; 1855 copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, 1856 i, bytes); 1857 ntfs_flush_dcache_pages(pages + u, do_pages - u); 1858 status = 0; 1859 if (likely(copied == bytes)) { 1860 status = ntfs_commit_pages_after_write(pages, do_pages, 1861 pos, bytes); 1862 } 1863 do { 1864 unlock_page(pages[--do_pages]); 1865 put_page(pages[do_pages]); 1866 } while (do_pages); 1867 if (unlikely(status < 0)) { 1868 iov_iter_revert(i, copied); 1869 break; 1870 } 1871 cond_resched(); 1872 if (unlikely(copied < bytes)) { 1873 iov_iter_revert(i, copied); 1874 if (copied) 1875 bytes = copied; 1876 else if (bytes > PAGE_SIZE - ofs) 1877 bytes = PAGE_SIZE - ofs; 1878 goto again; 1879 } 1880 pos += copied; 1881 written += copied; 1882 balance_dirty_pages_ratelimited(mapping); 1883 if (fatal_signal_pending(current)) { 1884 status = -EINTR; 1885 break; 1886 } 1887 } while (iov_iter_count(i)); 1888 if (cached_page) 1889 put_page(cached_page); 1890 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 1891 written ? "written" : "status", (unsigned long)written, 1892 (long)status); 1893 return written ? written : status; 1894 } 1895 1896 /** 1897 * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() 1898 * @iocb: IO state structure 1899 * @from: iov_iter with data to write 1900 * 1901 * Basically the same as generic_file_write_iter() except that it ends up 1902 * up calling ntfs_perform_write() instead of generic_perform_write() and that 1903 * O_DIRECT is not implemented. 1904 */ 1905 static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 1906 { 1907 struct file *file = iocb->ki_filp; 1908 struct inode *vi = file_inode(file); 1909 ssize_t written = 0; 1910 ssize_t err; 1911 1912 inode_lock(vi); 1913 /* We can write back this queue in page reclaim. */ 1914 current->backing_dev_info = inode_to_bdi(vi); 1915 err = ntfs_prepare_file_for_write(iocb, from); 1916 if (iov_iter_count(from) && !err) 1917 written = ntfs_perform_write(file, from, iocb->ki_pos); 1918 current->backing_dev_info = NULL; 1919 inode_unlock(vi); 1920 iocb->ki_pos += written; 1921 if (likely(written > 0)) 1922 written = generic_write_sync(iocb, written); 1923 return written ? written : err; 1924 } 1925 1926 /** 1927 * ntfs_file_fsync - sync a file to disk 1928 * @filp: file to be synced 1929 * @datasync: if non-zero only flush user data and not metadata 1930 * 1931 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 1932 * system calls. This function is inspired by fs/buffer.c::file_fsync(). 1933 * 1934 * If @datasync is false, write the mft record and all associated extent mft 1935 * records as well as the $DATA attribute and then sync the block device. 1936 * 1937 * If @datasync is true and the attribute is non-resident, we skip the writing 1938 * of the mft record and all associated extent mft records (this might still 1939 * happen due to the write_inode_now() call). 1940 * 1941 * Also, if @datasync is true, we do not wait on the inode to be written out 1942 * but we always wait on the page cache pages to be written out. 1943 * 1944 * Locking: Caller must hold i_mutex on the inode. 1945 * 1946 * TODO: We should probably also write all attribute/index inodes associated 1947 * with this inode but since we have no simple way of getting to them we ignore 1948 * this problem for now. 1949 */ 1950 static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, 1951 int datasync) 1952 { 1953 struct inode *vi = filp->f_mapping->host; 1954 int err, ret = 0; 1955 1956 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 1957 1958 err = file_write_and_wait_range(filp, start, end); 1959 if (err) 1960 return err; 1961 inode_lock(vi); 1962 1963 BUG_ON(S_ISDIR(vi->i_mode)); 1964 if (!datasync || !NInoNonResident(NTFS_I(vi))) 1965 ret = __ntfs_write_inode(vi, 1); 1966 write_inode_now(vi, !datasync); 1967 /* 1968 * NOTE: If we were to use mapping->private_list (see ext2 and 1969 * fs/buffer.c) for dirty blocks then we could optimize the below to be 1970 * sync_mapping_buffers(vi->i_mapping). 1971 */ 1972 err = sync_blockdev(vi->i_sb->s_bdev); 1973 if (unlikely(err && !ret)) 1974 ret = err; 1975 if (likely(!ret)) 1976 ntfs_debug("Done."); 1977 else 1978 ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " 1979 "%u.", datasync ? "data" : "", vi->i_ino, -ret); 1980 inode_unlock(vi); 1981 return ret; 1982 } 1983 1984 #endif /* NTFS_RW */ 1985 1986 const struct file_operations ntfs_file_ops = { 1987 .llseek = generic_file_llseek, 1988 .read_iter = generic_file_read_iter, 1989 #ifdef NTFS_RW 1990 .write_iter = ntfs_file_write_iter, 1991 .fsync = ntfs_file_fsync, 1992 #endif /* NTFS_RW */ 1993 .mmap = generic_file_mmap, 1994 .open = ntfs_file_open, 1995 .splice_read = generic_file_splice_read, 1996 }; 1997 1998 const struct inode_operations ntfs_file_inode_ops = { 1999 #ifdef NTFS_RW 2000 .setattr = ntfs_setattr, 2001 #endif /* NTFS_RW */ 2002 }; 2003 2004 const struct file_operations ntfs_empty_file_ops = {}; 2005 2006 const struct inode_operations ntfs_empty_inode_ops = {}; 2007