// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
 */

#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>

#include <asm/page.h>
#include <linux/uaccess.h>

#include "attrib.h"
#include "bitmap.h"
#include "inode.h"
#include "debug.h"
#include "lcnalloc.h"
#include "malloc.h"
#include "mft.h"
#include "ntfs.h"

/**
 * ntfs_file_open - called when an inode is about to be opened
 * @vi:		inode to be opened
 * @filp:	file structure describing the inode
 *
 * Limit file size to the page cache limit on architectures where unsigned long
 * is 32-bits. This is the most we can do for now without overflowing the page
 * cache page index. Doing it this way means we don't run into problems because
 * of existing files that are too large. It would be better to allow the user
 * to read the beginning of the file but I doubt very much anyone is going to
 * hit this check on a 32-bit architecture, so there is no point in adding the
 * extra complexity required to support this.
 *
 * On 64-bit architectures, the check is hopefully optimized away by the
 * compiler.
 *
 * After the check passes, just call generic_file_open() to do its work.
 */
static int ntfs_file_open(struct inode *vi, struct file *filp)
{
	if (sizeof(unsigned long) < 8) {
		if (i_size_read(vi) > MAX_LFS_FILESIZE)
			return -EOVERFLOW;
	}
	return generic_file_open(vi, filp);
}

#ifdef NTFS_RW

/**
 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
 * @ni:			ntfs inode of the attribute to extend
 * @new_init_size:	requested new initialized size in bytes
 *
 * Extend the initialized size of an attribute described by the ntfs inode @ni
 * to @new_init_size bytes. This involves zeroing any non-sparse space between
 * the old initialized size and @new_init_size both in the page cache and on
 * disk (if relevant complete pages are already uptodate in the page cache then
 * these are simply marked dirty).
 *
 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
 * in the resident attribute case, it is tied to the initialized size and, in
 * the non-resident attribute case, it may not fall below the initialized size.
 *
 * Note that if the attribute is resident, we do not need to touch the page
 * cache at all. This is because if the page cache page is not uptodate we
 * bring it uptodate later, when doing the write to the mft record since we
 * then already have the page mapped. And if the page is uptodate, the
 * non-initialized region will already have been zeroed when the page was
 * brought uptodate and the region may in fact already have been overwritten
 * with new data via mmap() based writes, so we cannot just zero it. And since
 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
 * is unspecified, we choose not to do zeroing and thus we do not need to touch
 * the page at all. For a more detailed explanation see ntfs_truncate() in
 * fs/ntfs/inode.c.
 *
 * Return 0 on success and -errno on error.
 * In the case that an error is encountered it is possible that the initialized
 * size will already have been incremented some way towards @new_init_size but
 * it is guaranteed that if this is the case, the necessary zeroing will also
 * have happened and that all metadata is self-consistent.
 *
 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
 * held by the caller.
 */
static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
{
	s64 old_init_size;
	loff_t old_i_size;
	pgoff_t index, end_index;
	unsigned long flags;
	struct inode *vi = VFS_I(ni);
	ntfs_inode *base_ni;
	MFT_RECORD *m = NULL;
	ATTR_RECORD *a;
	ntfs_attr_search_ctx *ctx = NULL;
	struct address_space *mapping;
	struct page *page = NULL;
	u8 *kattr;
	int err;
	u32 attr_len;

	read_lock_irqsave(&ni->size_lock, flags);
	old_init_size = ni->initialized_size;
	old_i_size = i_size_read(vi);
	BUG_ON(new_init_size > ni->allocated_size);
	read_unlock_irqrestore(&ni->size_lock, flags);
	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
			"old_initialized_size 0x%llx, "
			"new_initialized_size 0x%llx, i_size 0x%llx.",
			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
			(unsigned long long)old_init_size,
			(unsigned long long)new_init_size, old_i_size);
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Use goto to reduce indentation and we need the label below anyway. */
	if (NInoNonResident(ni))
		goto do_non_resident_extend;
	BUG_ON(old_init_size != old_i_size);
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		goto err_out;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		if (err == -ENOENT)
			err = -EIO;
		goto err_out;
	}
	m = ctx->mrec;
	a = ctx->attr;
	BUG_ON(a->non_resident);
	/* The total length of the attribute value. */
	attr_len = le32_to_cpu(a->data.resident.value_length);
	BUG_ON(old_i_size != (loff_t)attr_len);
	/*
	 * Do the zeroing in the mft record and update the attribute size in
	 * the mft record.
	 */
	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
	memset(kattr + attr_len, 0, new_init_size - attr_len);
	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
	/* Finally, update the sizes in the vfs and ntfs inodes. */
	write_lock_irqsave(&ni->size_lock, flags);
	i_size_write(vi, new_init_size);
	ni->initialized_size = new_init_size;
	write_unlock_irqrestore(&ni->size_lock, flags);
	goto done;
do_non_resident_extend:
	/*
	 * If the new initialized size @new_init_size exceeds the current file
	 * size (vfs inode->i_size), we need to extend the file size to the
	 * new initialized size.
173 */ 174 if (new_init_size > old_i_size) { 175 m = map_mft_record(base_ni); 176 if (IS_ERR(m)) { 177 err = PTR_ERR(m); 178 m = NULL; 179 goto err_out; 180 } 181 ctx = ntfs_attr_get_search_ctx(base_ni, m); 182 if (unlikely(!ctx)) { 183 err = -ENOMEM; 184 goto err_out; 185 } 186 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 187 CASE_SENSITIVE, 0, NULL, 0, ctx); 188 if (unlikely(err)) { 189 if (err == -ENOENT) 190 err = -EIO; 191 goto err_out; 192 } 193 m = ctx->mrec; 194 a = ctx->attr; 195 BUG_ON(!a->non_resident); 196 BUG_ON(old_i_size != (loff_t) 197 sle64_to_cpu(a->data.non_resident.data_size)); 198 a->data.non_resident.data_size = cpu_to_sle64(new_init_size); 199 flush_dcache_mft_record_page(ctx->ntfs_ino); 200 mark_mft_record_dirty(ctx->ntfs_ino); 201 /* Update the file size in the vfs inode. */ 202 i_size_write(vi, new_init_size); 203 ntfs_attr_put_search_ctx(ctx); 204 ctx = NULL; 205 unmap_mft_record(base_ni); 206 m = NULL; 207 } 208 mapping = vi->i_mapping; 209 index = old_init_size >> PAGE_SHIFT; 210 end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 211 do { 212 /* 213 * Read the page. If the page is not present, this will zero 214 * the uninitialized regions for us. 215 */ 216 page = read_mapping_page(mapping, index, NULL); 217 if (IS_ERR(page)) { 218 err = PTR_ERR(page); 219 goto init_err_out; 220 } 221 if (unlikely(PageError(page))) { 222 put_page(page); 223 err = -EIO; 224 goto init_err_out; 225 } 226 /* 227 * Update the initialized size in the ntfs inode. This is 228 * enough to make ntfs_writepage() work. 229 */ 230 write_lock_irqsave(&ni->size_lock, flags); 231 ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; 232 if (ni->initialized_size > new_init_size) 233 ni->initialized_size = new_init_size; 234 write_unlock_irqrestore(&ni->size_lock, flags); 235 /* Set the page dirty so it gets written out. */ 236 set_page_dirty(page); 237 put_page(page); 238 /* 239 * Play nice with the vm and the rest of the system. This is 240 * very much needed as we can potentially be modifying the 241 * initialised size from a very small value to a really huge 242 * value, e.g. 243 * f = open(somefile, O_TRUNC); 244 * truncate(f, 10GiB); 245 * seek(f, 10GiB); 246 * write(f, 1); 247 * And this would mean we would be marking dirty hundreds of 248 * thousands of pages or as in the above example more than 249 * two and a half million pages! 250 * 251 * TODO: For sparse pages could optimize this workload by using 252 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This 253 * would be set in readpage for sparse pages and here we would 254 * not need to mark dirty any pages which have this bit set. 255 * The only caveat is that we have to clear the bit everywhere 256 * where we allocate any clusters that lie in the page or that 257 * contain the page. 258 * 259 * TODO: An even greater optimization would be for us to only 260 * call readpage() on pages which are not in sparse regions as 261 * determined from the runlist. This would greatly reduce the 262 * number of pages we read and make dirty in the case of sparse 263 * files. 264 */ 265 balance_dirty_pages_ratelimited(mapping); 266 cond_resched(); 267 } while (++index < end_index); 268 read_lock_irqsave(&ni->size_lock, flags); 269 BUG_ON(ni->initialized_size != new_init_size); 270 read_unlock_irqrestore(&ni->size_lock, flags); 271 /* Now bring in sync the initialized_size in the mft record. 
*/ 272 m = map_mft_record(base_ni); 273 if (IS_ERR(m)) { 274 err = PTR_ERR(m); 275 m = NULL; 276 goto init_err_out; 277 } 278 ctx = ntfs_attr_get_search_ctx(base_ni, m); 279 if (unlikely(!ctx)) { 280 err = -ENOMEM; 281 goto init_err_out; 282 } 283 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 284 CASE_SENSITIVE, 0, NULL, 0, ctx); 285 if (unlikely(err)) { 286 if (err == -ENOENT) 287 err = -EIO; 288 goto init_err_out; 289 } 290 m = ctx->mrec; 291 a = ctx->attr; 292 BUG_ON(!a->non_resident); 293 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); 294 done: 295 flush_dcache_mft_record_page(ctx->ntfs_ino); 296 mark_mft_record_dirty(ctx->ntfs_ino); 297 if (ctx) 298 ntfs_attr_put_search_ctx(ctx); 299 if (m) 300 unmap_mft_record(base_ni); 301 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", 302 (unsigned long long)new_init_size, i_size_read(vi)); 303 return 0; 304 init_err_out: 305 write_lock_irqsave(&ni->size_lock, flags); 306 ni->initialized_size = old_init_size; 307 write_unlock_irqrestore(&ni->size_lock, flags); 308 err_out: 309 if (ctx) 310 ntfs_attr_put_search_ctx(ctx); 311 if (m) 312 unmap_mft_record(base_ni); 313 ntfs_debug("Failed. Returning error code %i.", err); 314 return err; 315 } 316 317 static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, 318 struct iov_iter *from) 319 { 320 loff_t pos; 321 s64 end, ll; 322 ssize_t err; 323 unsigned long flags; 324 struct file *file = iocb->ki_filp; 325 struct inode *vi = file_inode(file); 326 ntfs_inode *ni = NTFS_I(vi); 327 ntfs_volume *vol = ni->vol; 328 329 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 330 "0x%llx, count 0x%zx.", vi->i_ino, 331 (unsigned)le32_to_cpu(ni->type), 332 (unsigned long long)iocb->ki_pos, 333 iov_iter_count(from)); 334 err = generic_write_checks(iocb, from); 335 if (unlikely(err <= 0)) 336 goto out; 337 /* 338 * All checks have passed. Before we start doing any writing we want 339 * to abort any totally illegal writes. 340 */ 341 BUG_ON(NInoMstProtected(ni)); 342 BUG_ON(ni->type != AT_DATA); 343 /* If file is encrypted, deny access, just like NT4. */ 344 if (NInoEncrypted(ni)) { 345 /* Only $DATA attributes can be encrypted. */ 346 /* 347 * Reminder for later: Encrypted files are _always_ 348 * non-resident so that the content can always be encrypted. 349 */ 350 ntfs_debug("Denying write access to encrypted file."); 351 err = -EACCES; 352 goto out; 353 } 354 if (NInoCompressed(ni)) { 355 /* Only unnamed $DATA attribute can be compressed. */ 356 BUG_ON(ni->name_len); 357 /* 358 * Reminder for later: If resident, the data is not actually 359 * compressed. Only on the switch to non-resident does 360 * compression kick in. This is in contrast to encrypted files 361 * (see above). 362 */ 363 ntfs_error(vi->i_sb, "Writing to compressed files is not " 364 "implemented yet. Sorry."); 365 err = -EOPNOTSUPP; 366 goto out; 367 } 368 err = file_remove_privs(file); 369 if (unlikely(err)) 370 goto out; 371 /* 372 * Our ->update_time method always succeeds thus file_update_time() 373 * cannot fail either so there is no need to check the return code. 374 */ 375 file_update_time(file); 376 pos = iocb->ki_pos; 377 /* The first byte after the last cluster being written to. */ 378 end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & 379 ~(u64)vol->cluster_size_mask; 380 /* 381 * If the write goes beyond the allocated size, extend the allocation 382 * to cover the whole of the write, rounded up to the nearest cluster. 
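	 * As a worked example (illustrative numbers only, not from the
	 * original source): assuming a 4096-byte cluster, i.e.
	 * cluster_size_mask == 0xfff, a 100-byte write at pos == 5000 gives
	 * pos + count == 5100 and hence end == (5100 + 0xfff) & ~0xfff == 8192,
	 * the first byte after the second cluster.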
	 */
	read_lock_irqsave(&ni->size_lock, flags);
	ll = ni->allocated_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (end > ll) {
		/*
		 * Extend the allocation without changing the data size.
		 *
		 * Note we ensure the allocation is big enough to at least
		 * write some data but we do not require the allocation to be
		 * complete, i.e. it may be partial.
		 */
		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
		if (likely(ll >= 0)) {
			BUG_ON(pos >= ll);
			/* If the extension was partial, truncate the write. */
			if (end > ll) {
				ntfs_debug("Truncating write to inode 0x%lx, "
						"attribute type 0x%x, because "
						"the allocation was only "
						"partially extended.",
						vi->i_ino, (unsigned)
						le32_to_cpu(ni->type));
				iov_iter_truncate(from, ll - pos);
			}
		} else {
			err = ll;
			read_lock_irqsave(&ni->size_lock, flags);
			ll = ni->allocated_size;
			read_unlock_irqrestore(&ni->size_lock, flags);
			/* Perform a partial write if possible or fail. */
			if (pos < ll) {
				ntfs_debug("Truncating write to inode 0x%lx "
						"attribute type 0x%x, because "
						"extending the allocation "
						"failed (error %d).",
						vi->i_ino, (unsigned)
						le32_to_cpu(ni->type),
						(int)-err);
				iov_iter_truncate(from, ll - pos);
			} else {
				if (err != -ENOSPC)
					ntfs_error(vi->i_sb, "Cannot perform "
							"write to inode "
							"0x%lx, attribute "
							"type 0x%x, because "
							"extending the "
							"allocation failed "
							"(error %ld).",
							vi->i_ino, (unsigned)
							le32_to_cpu(ni->type),
							(long)-err);
				else
					ntfs_debug("Cannot perform write to "
							"inode 0x%lx, "
							"attribute type 0x%x, "
							"because there is no "
							"space left.",
							vi->i_ino, (unsigned)
							le32_to_cpu(ni->type));
				goto out;
			}
		}
	}
	/*
	 * If the write starts beyond the initialized size, extend it up to the
	 * beginning of the write and initialize all non-sparse space between
	 * the old initialized size and the new one. This automatically also
	 * increments the vfs inode->i_size to keep it above or equal to the
	 * initialized_size.
	 */
	read_lock_irqsave(&ni->size_lock, flags);
	ll = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (pos > ll) {
		/*
		 * Wait for ongoing direct i/o to complete before proceeding.
		 * New direct i/o cannot start as we hold i_mutex.
		 */
		inode_dio_wait(vi);
		err = ntfs_attr_extend_initialized(ni, pos);
		if (unlikely(err < 0))
			ntfs_error(vi->i_sb, "Cannot perform write to inode "
					"0x%lx, attribute type 0x%x, because "
					"extending the initialized size "
					"failed (error %d).", vi->i_ino,
					(unsigned)le32_to_cpu(ni->type),
					(int)-err);
	}
out:
	return err;
}

/**
 * __ntfs_grab_cache_pages - obtain a number of locked pages
 * @mapping:	address space mapping from which to obtain page cache pages
 * @index:	starting index in @mapping at which to begin obtaining pages
 * @nr_pages:	number of page cache pages to obtain
 * @pages:	array of pages in which to return the obtained page cache pages
 * @cached_page: allocated but as yet unused page
 *
 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
 * starting at index @index.
 *
 * If a page is newly created, add it to the LRU list.
 *
 * Note, the page locks are obtained in ascending page index order.
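 *
 * Note, if add_to_page_cache_lru() fails with -EEXIST, i.e. another task
 * instantiated the page between our failed lookup and the insertion attempt,
 * the lookup is simply retried for the same index.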
490 */ 491 static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 492 pgoff_t index, const unsigned nr_pages, struct page **pages, 493 struct page **cached_page) 494 { 495 int err, nr; 496 497 BUG_ON(!nr_pages); 498 err = nr = 0; 499 do { 500 pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | 501 FGP_ACCESSED); 502 if (!pages[nr]) { 503 if (!*cached_page) { 504 *cached_page = page_cache_alloc(mapping); 505 if (unlikely(!*cached_page)) { 506 err = -ENOMEM; 507 goto err_out; 508 } 509 } 510 err = add_to_page_cache_lru(*cached_page, mapping, 511 index, 512 mapping_gfp_constraint(mapping, GFP_KERNEL)); 513 if (unlikely(err)) { 514 if (err == -EEXIST) 515 continue; 516 goto err_out; 517 } 518 pages[nr] = *cached_page; 519 *cached_page = NULL; 520 } 521 index++; 522 nr++; 523 } while (nr < nr_pages); 524 out: 525 return err; 526 err_out: 527 while (nr > 0) { 528 unlock_page(pages[--nr]); 529 put_page(pages[nr]); 530 } 531 goto out; 532 } 533 534 static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) 535 { 536 lock_buffer(bh); 537 get_bh(bh); 538 bh->b_end_io = end_buffer_read_sync; 539 return submit_bh(REQ_OP_READ, 0, bh); 540 } 541 542 /** 543 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data 544 * @pages: array of destination pages 545 * @nr_pages: number of pages in @pages 546 * @pos: byte position in file at which the write begins 547 * @bytes: number of bytes to be written 548 * 549 * This is called for non-resident attributes from ntfs_file_buffered_write() 550 * with i_mutex held on the inode (@pages[0]->mapping->host). There are 551 * @nr_pages pages in @pages which are locked but not kmap()ped. The source 552 * data has not yet been copied into the @pages. 553 * 554 * Need to fill any holes with actual clusters, allocate buffers if necessary, 555 * ensure all the buffers are mapped, and bring uptodate any buffers that are 556 * only partially being written to. 557 * 558 * If @nr_pages is greater than one, we are guaranteed that the cluster size is 559 * greater than PAGE_SIZE, that all pages in @pages are entirely inside 560 * the same cluster and that they are the entirety of that cluster, and that 561 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. 562 * 563 * i_size is not to be modified yet. 564 * 565 * Return 0 on success or -errno on error. 
566 */ 567 static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, 568 unsigned nr_pages, s64 pos, size_t bytes) 569 { 570 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; 571 LCN lcn; 572 s64 bh_pos, vcn_len, end, initialized_size; 573 sector_t lcn_block; 574 struct page *page; 575 struct inode *vi; 576 ntfs_inode *ni, *base_ni = NULL; 577 ntfs_volume *vol; 578 runlist_element *rl, *rl2; 579 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; 580 ntfs_attr_search_ctx *ctx = NULL; 581 MFT_RECORD *m = NULL; 582 ATTR_RECORD *a = NULL; 583 unsigned long flags; 584 u32 attr_rec_len = 0; 585 unsigned blocksize, u; 586 int err, mp_size; 587 bool rl_write_locked, was_hole, is_retry; 588 unsigned char blocksize_bits; 589 struct { 590 u8 runlist_merged:1; 591 u8 mft_attr_mapped:1; 592 u8 mp_rebuilt:1; 593 u8 attr_switched:1; 594 } status = { 0, 0, 0, 0 }; 595 596 BUG_ON(!nr_pages); 597 BUG_ON(!pages); 598 BUG_ON(!*pages); 599 vi = pages[0]->mapping->host; 600 ni = NTFS_I(vi); 601 vol = ni->vol; 602 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 603 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 604 vi->i_ino, ni->type, pages[0]->index, nr_pages, 605 (long long)pos, bytes); 606 blocksize = vol->sb->s_blocksize; 607 blocksize_bits = vol->sb->s_blocksize_bits; 608 u = 0; 609 do { 610 page = pages[u]; 611 BUG_ON(!page); 612 /* 613 * create_empty_buffers() will create uptodate/dirty buffers if 614 * the page is uptodate/dirty. 615 */ 616 if (!page_has_buffers(page)) { 617 create_empty_buffers(page, blocksize, 0); 618 if (unlikely(!page_has_buffers(page))) 619 return -ENOMEM; 620 } 621 } while (++u < nr_pages); 622 rl_write_locked = false; 623 rl = NULL; 624 err = 0; 625 vcn = lcn = -1; 626 vcn_len = 0; 627 lcn_block = -1; 628 was_hole = false; 629 cpos = pos >> vol->cluster_size_bits; 630 end = pos + bytes; 631 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; 632 /* 633 * Loop over each page and for each page over each buffer. Use goto to 634 * reduce indentation. 635 */ 636 u = 0; 637 do_next_page: 638 page = pages[u]; 639 bh_pos = (s64)page->index << PAGE_SHIFT; 640 bh = head = page_buffers(page); 641 do { 642 VCN cdelta; 643 s64 bh_end; 644 unsigned bh_cofs; 645 646 /* Clear buffer_new on all buffers to reinitialise state. */ 647 if (buffer_new(bh)) 648 clear_buffer_new(bh); 649 bh_end = bh_pos + blocksize; 650 bh_cpos = bh_pos >> vol->cluster_size_bits; 651 bh_cofs = bh_pos & vol->cluster_size_mask; 652 if (buffer_mapped(bh)) { 653 /* 654 * The buffer is already mapped. If it is uptodate, 655 * ignore it. 656 */ 657 if (buffer_uptodate(bh)) 658 continue; 659 /* 660 * The buffer is not uptodate. If the page is uptodate 661 * set the buffer uptodate and otherwise ignore it. 662 */ 663 if (PageUptodate(page)) { 664 set_buffer_uptodate(bh); 665 continue; 666 } 667 /* 668 * Neither the page nor the buffer are uptodate. If 669 * the buffer is only partially being written to, we 670 * need to read it in before the write, i.e. now. 671 */ 672 if ((bh_pos < pos && bh_end > pos) || 673 (bh_pos < end && bh_end > end)) { 674 /* 675 * If the buffer is fully or partially within 676 * the initialized size, do an actual read. 677 * Otherwise, simply zero the buffer. 
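				 * (The zeroing is safe because the region
				 * beyond the initialized size is defined to
				 * read back as zeroes, so no device i/o is
				 * needed for it.)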
678 */ 679 read_lock_irqsave(&ni->size_lock, flags); 680 initialized_size = ni->initialized_size; 681 read_unlock_irqrestore(&ni->size_lock, flags); 682 if (bh_pos < initialized_size) { 683 ntfs_submit_bh_for_read(bh); 684 *wait_bh++ = bh; 685 } else { 686 zero_user(page, bh_offset(bh), 687 blocksize); 688 set_buffer_uptodate(bh); 689 } 690 } 691 continue; 692 } 693 /* Unmapped buffer. Need to map it. */ 694 bh->b_bdev = vol->sb->s_bdev; 695 /* 696 * If the current buffer is in the same clusters as the map 697 * cache, there is no need to check the runlist again. The 698 * map cache is made up of @vcn, which is the first cached file 699 * cluster, @vcn_len which is the number of cached file 700 * clusters, @lcn is the device cluster corresponding to @vcn, 701 * and @lcn_block is the block number corresponding to @lcn. 702 */ 703 cdelta = bh_cpos - vcn; 704 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { 705 map_buffer_cached: 706 BUG_ON(lcn < 0); 707 bh->b_blocknr = lcn_block + 708 (cdelta << (vol->cluster_size_bits - 709 blocksize_bits)) + 710 (bh_cofs >> blocksize_bits); 711 set_buffer_mapped(bh); 712 /* 713 * If the page is uptodate so is the buffer. If the 714 * buffer is fully outside the write, we ignore it if 715 * it was already allocated and we mark it dirty so it 716 * gets written out if we allocated it. On the other 717 * hand, if we allocated the buffer but we are not 718 * marking it dirty we set buffer_new so we can do 719 * error recovery. 720 */ 721 if (PageUptodate(page)) { 722 if (!buffer_uptodate(bh)) 723 set_buffer_uptodate(bh); 724 if (unlikely(was_hole)) { 725 /* We allocated the buffer. */ 726 clean_bdev_bh_alias(bh); 727 if (bh_end <= pos || bh_pos >= end) 728 mark_buffer_dirty(bh); 729 else 730 set_buffer_new(bh); 731 } 732 continue; 733 } 734 /* Page is _not_ uptodate. */ 735 if (likely(!was_hole)) { 736 /* 737 * Buffer was already allocated. If it is not 738 * uptodate and is only partially being written 739 * to, we need to read it in before the write, 740 * i.e. now. 741 */ 742 if (!buffer_uptodate(bh) && bh_pos < end && 743 bh_end > pos && 744 (bh_pos < pos || 745 bh_end > end)) { 746 /* 747 * If the buffer is fully or partially 748 * within the initialized size, do an 749 * actual read. Otherwise, simply zero 750 * the buffer. 751 */ 752 read_lock_irqsave(&ni->size_lock, 753 flags); 754 initialized_size = ni->initialized_size; 755 read_unlock_irqrestore(&ni->size_lock, 756 flags); 757 if (bh_pos < initialized_size) { 758 ntfs_submit_bh_for_read(bh); 759 *wait_bh++ = bh; 760 } else { 761 zero_user(page, bh_offset(bh), 762 blocksize); 763 set_buffer_uptodate(bh); 764 } 765 } 766 continue; 767 } 768 /* We allocated the buffer. */ 769 clean_bdev_bh_alias(bh); 770 /* 771 * If the buffer is fully outside the write, zero it, 772 * set it uptodate, and mark it dirty so it gets 773 * written out. If it is partially being written to, 774 * zero region surrounding the write but leave it to 775 * commit write to do anything else. Finally, if the 776 * buffer is fully being overwritten, do nothing. 
777 */ 778 if (bh_end <= pos || bh_pos >= end) { 779 if (!buffer_uptodate(bh)) { 780 zero_user(page, bh_offset(bh), 781 blocksize); 782 set_buffer_uptodate(bh); 783 } 784 mark_buffer_dirty(bh); 785 continue; 786 } 787 set_buffer_new(bh); 788 if (!buffer_uptodate(bh) && 789 (bh_pos < pos || bh_end > end)) { 790 u8 *kaddr; 791 unsigned pofs; 792 793 kaddr = kmap_atomic(page); 794 if (bh_pos < pos) { 795 pofs = bh_pos & ~PAGE_MASK; 796 memset(kaddr + pofs, 0, pos - bh_pos); 797 } 798 if (bh_end > end) { 799 pofs = end & ~PAGE_MASK; 800 memset(kaddr + pofs, 0, bh_end - end); 801 } 802 kunmap_atomic(kaddr); 803 flush_dcache_page(page); 804 } 805 continue; 806 } 807 /* 808 * Slow path: this is the first buffer in the cluster. If it 809 * is outside allocated size and is not uptodate, zero it and 810 * set it uptodate. 811 */ 812 read_lock_irqsave(&ni->size_lock, flags); 813 initialized_size = ni->allocated_size; 814 read_unlock_irqrestore(&ni->size_lock, flags); 815 if (bh_pos > initialized_size) { 816 if (PageUptodate(page)) { 817 if (!buffer_uptodate(bh)) 818 set_buffer_uptodate(bh); 819 } else if (!buffer_uptodate(bh)) { 820 zero_user(page, bh_offset(bh), blocksize); 821 set_buffer_uptodate(bh); 822 } 823 continue; 824 } 825 is_retry = false; 826 if (!rl) { 827 down_read(&ni->runlist.lock); 828 retry_remap: 829 rl = ni->runlist.rl; 830 } 831 if (likely(rl != NULL)) { 832 /* Seek to element containing target cluster. */ 833 while (rl->length && rl[1].vcn <= bh_cpos) 834 rl++; 835 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); 836 if (likely(lcn >= 0)) { 837 /* 838 * Successful remap, setup the map cache and 839 * use that to deal with the buffer. 840 */ 841 was_hole = false; 842 vcn = bh_cpos; 843 vcn_len = rl[1].vcn - vcn; 844 lcn_block = lcn << (vol->cluster_size_bits - 845 blocksize_bits); 846 cdelta = 0; 847 /* 848 * If the number of remaining clusters touched 849 * by the write is smaller or equal to the 850 * number of cached clusters, unlock the 851 * runlist as the map cache will be used from 852 * now on. 853 */ 854 if (likely(vcn + vcn_len >= cend)) { 855 if (rl_write_locked) { 856 up_write(&ni->runlist.lock); 857 rl_write_locked = false; 858 } else 859 up_read(&ni->runlist.lock); 860 rl = NULL; 861 } 862 goto map_buffer_cached; 863 } 864 } else 865 lcn = LCN_RL_NOT_MAPPED; 866 /* 867 * If it is not a hole and not out of bounds, the runlist is 868 * probably unmapped so try to map it now. 869 */ 870 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { 871 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { 872 /* Attempt to map runlist. */ 873 if (!rl_write_locked) { 874 /* 875 * We need the runlist locked for 876 * writing, so if it is locked for 877 * reading relock it now and retry in 878 * case it changed whilst we dropped 879 * the lock. 880 */ 881 up_read(&ni->runlist.lock); 882 down_write(&ni->runlist.lock); 883 rl_write_locked = true; 884 goto retry_remap; 885 } 886 err = ntfs_map_runlist_nolock(ni, bh_cpos, 887 NULL); 888 if (likely(!err)) { 889 is_retry = true; 890 goto retry_remap; 891 } 892 /* 893 * If @vcn is out of bounds, pretend @lcn is 894 * LCN_ENOENT. As long as the buffer is out 895 * of bounds this will work fine. 896 */ 897 if (err == -ENOENT) { 898 lcn = LCN_ENOENT; 899 err = 0; 900 goto rl_not_mapped_enoent; 901 } 902 } else 903 err = -EIO; 904 /* Failed to map the buffer, even after retrying. 
*/ 905 bh->b_blocknr = -1; 906 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " 907 "attribute type 0x%x, vcn 0x%llx, " 908 "vcn offset 0x%x, because its " 909 "location on disk could not be " 910 "determined%s (error code %i).", 911 ni->mft_no, ni->type, 912 (unsigned long long)bh_cpos, 913 (unsigned)bh_pos & 914 vol->cluster_size_mask, 915 is_retry ? " even after retrying" : "", 916 err); 917 break; 918 } 919 rl_not_mapped_enoent: 920 /* 921 * The buffer is in a hole or out of bounds. We need to fill 922 * the hole, unless the buffer is in a cluster which is not 923 * touched by the write, in which case we just leave the buffer 924 * unmapped. This can only happen when the cluster size is 925 * less than the page cache size. 926 */ 927 if (unlikely(vol->cluster_size < PAGE_SIZE)) { 928 bh_cend = (bh_end + vol->cluster_size - 1) >> 929 vol->cluster_size_bits; 930 if ((bh_cend <= cpos || bh_cpos >= cend)) { 931 bh->b_blocknr = -1; 932 /* 933 * If the buffer is uptodate we skip it. If it 934 * is not but the page is uptodate, we can set 935 * the buffer uptodate. If the page is not 936 * uptodate, we can clear the buffer and set it 937 * uptodate. Whether this is worthwhile is 938 * debatable and this could be removed. 939 */ 940 if (PageUptodate(page)) { 941 if (!buffer_uptodate(bh)) 942 set_buffer_uptodate(bh); 943 } else if (!buffer_uptodate(bh)) { 944 zero_user(page, bh_offset(bh), 945 blocksize); 946 set_buffer_uptodate(bh); 947 } 948 continue; 949 } 950 } 951 /* 952 * Out of bounds buffer is invalid if it was not really out of 953 * bounds. 954 */ 955 BUG_ON(lcn != LCN_HOLE); 956 /* 957 * We need the runlist locked for writing, so if it is locked 958 * for reading relock it now and retry in case it changed 959 * whilst we dropped the lock. 960 */ 961 BUG_ON(!rl); 962 if (!rl_write_locked) { 963 up_read(&ni->runlist.lock); 964 down_write(&ni->runlist.lock); 965 rl_write_locked = true; 966 goto retry_remap; 967 } 968 /* Find the previous last allocated cluster. */ 969 BUG_ON(rl->lcn != LCN_HOLE); 970 lcn = -1; 971 rl2 = rl; 972 while (--rl2 >= ni->runlist.rl) { 973 if (rl2->lcn >= 0) { 974 lcn = rl2->lcn + rl2->length; 975 break; 976 } 977 } 978 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, 979 false); 980 if (IS_ERR(rl2)) { 981 err = PTR_ERR(rl2); 982 ntfs_debug("Failed to allocate cluster, error code %i.", 983 err); 984 break; 985 } 986 lcn = rl2->lcn; 987 rl = ntfs_runlists_merge(ni->runlist.rl, rl2); 988 if (IS_ERR(rl)) { 989 err = PTR_ERR(rl); 990 if (err != -ENOMEM) 991 err = -EIO; 992 if (ntfs_cluster_free_from_rl(vol, rl2)) { 993 ntfs_error(vol->sb, "Failed to release " 994 "allocated cluster in error " 995 "code path. Run chkdsk to " 996 "recover the lost cluster."); 997 NVolSetErrors(vol); 998 } 999 ntfs_free(rl2); 1000 break; 1001 } 1002 ni->runlist.rl = rl; 1003 status.runlist_merged = 1; 1004 ntfs_debug("Allocated cluster, lcn 0x%llx.", 1005 (unsigned long long)lcn); 1006 /* Map and lock the mft record and get the attribute record. 
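		 * Note, an attribute inode (NInoAttr()) has no mft record of
		 * its own, so the lookup below is done in the base inode's mft
		 * record, hence @base_ni.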
*/ 1007 if (!NInoAttr(ni)) 1008 base_ni = ni; 1009 else 1010 base_ni = ni->ext.base_ntfs_ino; 1011 m = map_mft_record(base_ni); 1012 if (IS_ERR(m)) { 1013 err = PTR_ERR(m); 1014 break; 1015 } 1016 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1017 if (unlikely(!ctx)) { 1018 err = -ENOMEM; 1019 unmap_mft_record(base_ni); 1020 break; 1021 } 1022 status.mft_attr_mapped = 1; 1023 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1024 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); 1025 if (unlikely(err)) { 1026 if (err == -ENOENT) 1027 err = -EIO; 1028 break; 1029 } 1030 m = ctx->mrec; 1031 a = ctx->attr; 1032 /* 1033 * Find the runlist element with which the attribute extent 1034 * starts. Note, we cannot use the _attr_ version because we 1035 * have mapped the mft record. That is ok because we know the 1036 * runlist fragment must be mapped already to have ever gotten 1037 * here, so we can just use the _rl_ version. 1038 */ 1039 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); 1040 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); 1041 BUG_ON(!rl2); 1042 BUG_ON(!rl2->length); 1043 BUG_ON(rl2->lcn < LCN_HOLE); 1044 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); 1045 /* 1046 * If @highest_vcn is zero, calculate the real highest_vcn 1047 * (which can really be zero). 1048 */ 1049 if (!highest_vcn) 1050 highest_vcn = (sle64_to_cpu( 1051 a->data.non_resident.allocated_size) >> 1052 vol->cluster_size_bits) - 1; 1053 /* 1054 * Determine the size of the mapping pairs array for the new 1055 * extent, i.e. the old extent with the hole filled. 1056 */ 1057 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, 1058 highest_vcn); 1059 if (unlikely(mp_size <= 0)) { 1060 if (!(err = mp_size)) 1061 err = -EIO; 1062 ntfs_debug("Failed to get size for mapping pairs " 1063 "array, error code %i.", err); 1064 break; 1065 } 1066 /* 1067 * Resize the attribute record to fit the new mapping pairs 1068 * array. 1069 */ 1070 attr_rec_len = le32_to_cpu(a->length); 1071 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( 1072 a->data.non_resident.mapping_pairs_offset)); 1073 if (unlikely(err)) { 1074 BUG_ON(err != -ENOSPC); 1075 // TODO: Deal with this by using the current attribute 1076 // and fill it with as much of the mapping pairs 1077 // array as possible. Then loop over each attribute 1078 // extent rewriting the mapping pairs arrays as we go 1079 // along and if when we reach the end we have not 1080 // enough space, try to resize the last attribute 1081 // extent and if even that fails, add a new attribute 1082 // extent. 1083 // We could also try to resize at each step in the hope 1084 // that we will not need to rewrite every single extent. 1085 // Note, we may need to decompress some extents to fill 1086 // the runlist as we are walking the extents... 1087 ntfs_error(vol->sb, "Not enough space in the mft " 1088 "record for the extended attribute " 1089 "record. This case is not " 1090 "implemented yet."); 1091 err = -EOPNOTSUPP; 1092 break ; 1093 } 1094 status.mp_rebuilt = 1; 1095 /* 1096 * Generate the mapping pairs array directly into the attribute 1097 * record. 
1098 */ 1099 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1100 a->data.non_resident.mapping_pairs_offset), 1101 mp_size, rl2, vcn, highest_vcn, NULL); 1102 if (unlikely(err)) { 1103 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " 1104 "attribute type 0x%x, because building " 1105 "the mapping pairs failed with error " 1106 "code %i.", vi->i_ino, 1107 (unsigned)le32_to_cpu(ni->type), err); 1108 err = -EIO; 1109 break; 1110 } 1111 /* Update the highest_vcn but only if it was not set. */ 1112 if (unlikely(!a->data.non_resident.highest_vcn)) 1113 a->data.non_resident.highest_vcn = 1114 cpu_to_sle64(highest_vcn); 1115 /* 1116 * If the attribute is sparse/compressed, update the compressed 1117 * size in the ntfs_inode structure and the attribute record. 1118 */ 1119 if (likely(NInoSparse(ni) || NInoCompressed(ni))) { 1120 /* 1121 * If we are not in the first attribute extent, switch 1122 * to it, but first ensure the changes will make it to 1123 * disk later. 1124 */ 1125 if (a->data.non_resident.lowest_vcn) { 1126 flush_dcache_mft_record_page(ctx->ntfs_ino); 1127 mark_mft_record_dirty(ctx->ntfs_ino); 1128 ntfs_attr_reinit_search_ctx(ctx); 1129 err = ntfs_attr_lookup(ni->type, ni->name, 1130 ni->name_len, CASE_SENSITIVE, 1131 0, NULL, 0, ctx); 1132 if (unlikely(err)) { 1133 status.attr_switched = 1; 1134 break; 1135 } 1136 /* @m is not used any more so do not set it. */ 1137 a = ctx->attr; 1138 } 1139 write_lock_irqsave(&ni->size_lock, flags); 1140 ni->itype.compressed.size += vol->cluster_size; 1141 a->data.non_resident.compressed_size = 1142 cpu_to_sle64(ni->itype.compressed.size); 1143 write_unlock_irqrestore(&ni->size_lock, flags); 1144 } 1145 /* Ensure the changes make it to disk. */ 1146 flush_dcache_mft_record_page(ctx->ntfs_ino); 1147 mark_mft_record_dirty(ctx->ntfs_ino); 1148 ntfs_attr_put_search_ctx(ctx); 1149 unmap_mft_record(base_ni); 1150 /* Successfully filled the hole. */ 1151 status.runlist_merged = 0; 1152 status.mft_attr_mapped = 0; 1153 status.mp_rebuilt = 0; 1154 /* Setup the map cache and use that to deal with the buffer. */ 1155 was_hole = true; 1156 vcn = bh_cpos; 1157 vcn_len = 1; 1158 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); 1159 cdelta = 0; 1160 /* 1161 * If the number of remaining clusters in the @pages is smaller 1162 * or equal to the number of cached clusters, unlock the 1163 * runlist as the map cache will be used from now on. 1164 */ 1165 if (likely(vcn + vcn_len >= cend)) { 1166 up_write(&ni->runlist.lock); 1167 rl_write_locked = false; 1168 rl = NULL; 1169 } 1170 goto map_buffer_cached; 1171 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1172 /* If there are no errors, do the next page. */ 1173 if (likely(!err && ++u < nr_pages)) 1174 goto do_next_page; 1175 /* If there are no errors, release the runlist lock if we took it. */ 1176 if (likely(!err)) { 1177 if (unlikely(rl_write_locked)) { 1178 up_write(&ni->runlist.lock); 1179 rl_write_locked = false; 1180 } else if (unlikely(rl)) 1181 up_read(&ni->runlist.lock); 1182 rl = NULL; 1183 } 1184 /* If we issued read requests, let them complete. 
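	 * At most two buffers can have been submitted for read here: the one
	 * straddling the start of the write and the one straddling its end,
	 * which is why the two-entry wait[] array above suffices.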
*/ 1185 read_lock_irqsave(&ni->size_lock, flags); 1186 initialized_size = ni->initialized_size; 1187 read_unlock_irqrestore(&ni->size_lock, flags); 1188 while (wait_bh > wait) { 1189 bh = *--wait_bh; 1190 wait_on_buffer(bh); 1191 if (likely(buffer_uptodate(bh))) { 1192 page = bh->b_page; 1193 bh_pos = ((s64)page->index << PAGE_SHIFT) + 1194 bh_offset(bh); 1195 /* 1196 * If the buffer overflows the initialized size, need 1197 * to zero the overflowing region. 1198 */ 1199 if (unlikely(bh_pos + blocksize > initialized_size)) { 1200 int ofs = 0; 1201 1202 if (likely(bh_pos < initialized_size)) 1203 ofs = initialized_size - bh_pos; 1204 zero_user_segment(page, bh_offset(bh) + ofs, 1205 blocksize); 1206 } 1207 } else /* if (unlikely(!buffer_uptodate(bh))) */ 1208 err = -EIO; 1209 } 1210 if (likely(!err)) { 1211 /* Clear buffer_new on all buffers. */ 1212 u = 0; 1213 do { 1214 bh = head = page_buffers(pages[u]); 1215 do { 1216 if (buffer_new(bh)) 1217 clear_buffer_new(bh); 1218 } while ((bh = bh->b_this_page) != head); 1219 } while (++u < nr_pages); 1220 ntfs_debug("Done."); 1221 return err; 1222 } 1223 if (status.attr_switched) { 1224 /* Get back to the attribute extent we modified. */ 1225 ntfs_attr_reinit_search_ctx(ctx); 1226 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1227 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { 1228 ntfs_error(vol->sb, "Failed to find required " 1229 "attribute extent of attribute in " 1230 "error code path. Run chkdsk to " 1231 "recover."); 1232 write_lock_irqsave(&ni->size_lock, flags); 1233 ni->itype.compressed.size += vol->cluster_size; 1234 write_unlock_irqrestore(&ni->size_lock, flags); 1235 flush_dcache_mft_record_page(ctx->ntfs_ino); 1236 mark_mft_record_dirty(ctx->ntfs_ino); 1237 /* 1238 * The only thing that is now wrong is the compressed 1239 * size of the base attribute extent which chkdsk 1240 * should be able to fix. 1241 */ 1242 NVolSetErrors(vol); 1243 } else { 1244 m = ctx->mrec; 1245 a = ctx->attr; 1246 status.attr_switched = 0; 1247 } 1248 } 1249 /* 1250 * If the runlist has been modified, need to restore it by punching a 1251 * hole into it and we then need to deallocate the on-disk cluster as 1252 * well. Note, we only modify the runlist if we are able to generate a 1253 * new mapping pairs array, i.e. only when the mapped attribute extent 1254 * is not switched. 1255 */ 1256 if (status.runlist_merged && !status.attr_switched) { 1257 BUG_ON(!rl_write_locked); 1258 /* Make the file cluster we allocated sparse in the runlist. */ 1259 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { 1260 ntfs_error(vol->sb, "Failed to punch hole into " 1261 "attribute runlist in error code " 1262 "path. Run chkdsk to recover the " 1263 "lost cluster."); 1264 NVolSetErrors(vol); 1265 } else /* if (success) */ { 1266 status.runlist_merged = 0; 1267 /* 1268 * Deallocate the on-disk cluster we allocated but only 1269 * if we succeeded in punching its vcn out of the 1270 * runlist. 1271 */ 1272 down_write(&vol->lcnbmp_lock); 1273 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { 1274 ntfs_error(vol->sb, "Failed to release " 1275 "allocated cluster in error " 1276 "code path. Run chkdsk to " 1277 "recover the lost cluster."); 1278 NVolSetErrors(vol); 1279 } 1280 up_write(&vol->lcnbmp_lock); 1281 } 1282 } 1283 /* 1284 * Resize the attribute record to its old size and rebuild the mapping 1285 * pairs array. 
Note, we only can do this if the runlist has been 1286 * restored to its old state which also implies that the mapped 1287 * attribute extent is not switched. 1288 */ 1289 if (status.mp_rebuilt && !status.runlist_merged) { 1290 if (ntfs_attr_record_resize(m, a, attr_rec_len)) { 1291 ntfs_error(vol->sb, "Failed to restore attribute " 1292 "record in error code path. Run " 1293 "chkdsk to recover."); 1294 NVolSetErrors(vol); 1295 } else /* if (success) */ { 1296 if (ntfs_mapping_pairs_build(vol, (u8*)a + 1297 le16_to_cpu(a->data.non_resident. 1298 mapping_pairs_offset), attr_rec_len - 1299 le16_to_cpu(a->data.non_resident. 1300 mapping_pairs_offset), ni->runlist.rl, 1301 vcn, highest_vcn, NULL)) { 1302 ntfs_error(vol->sb, "Failed to restore " 1303 "mapping pairs array in error " 1304 "code path. Run chkdsk to " 1305 "recover."); 1306 NVolSetErrors(vol); 1307 } 1308 flush_dcache_mft_record_page(ctx->ntfs_ino); 1309 mark_mft_record_dirty(ctx->ntfs_ino); 1310 } 1311 } 1312 /* Release the mft record and the attribute. */ 1313 if (status.mft_attr_mapped) { 1314 ntfs_attr_put_search_ctx(ctx); 1315 unmap_mft_record(base_ni); 1316 } 1317 /* Release the runlist lock. */ 1318 if (rl_write_locked) 1319 up_write(&ni->runlist.lock); 1320 else if (rl) 1321 up_read(&ni->runlist.lock); 1322 /* 1323 * Zero out any newly allocated blocks to avoid exposing stale data. 1324 * If BH_New is set, we know that the block was newly allocated above 1325 * and that it has not been fully zeroed and marked dirty yet. 1326 */ 1327 nr_pages = u; 1328 u = 0; 1329 end = bh_cpos << vol->cluster_size_bits; 1330 do { 1331 page = pages[u]; 1332 bh = head = page_buffers(page); 1333 do { 1334 if (u == nr_pages && 1335 ((s64)page->index << PAGE_SHIFT) + 1336 bh_offset(bh) >= end) 1337 break; 1338 if (!buffer_new(bh)) 1339 continue; 1340 clear_buffer_new(bh); 1341 if (!buffer_uptodate(bh)) { 1342 if (PageUptodate(page)) 1343 set_buffer_uptodate(bh); 1344 else { 1345 zero_user(page, bh_offset(bh), 1346 blocksize); 1347 set_buffer_uptodate(bh); 1348 } 1349 } 1350 mark_buffer_dirty(bh); 1351 } while ((bh = bh->b_this_page) != head); 1352 } while (++u <= nr_pages); 1353 ntfs_error(vol->sb, "Failed. Returning error code %i.", err); 1354 return err; 1355 } 1356 1357 static inline void ntfs_flush_dcache_pages(struct page **pages, 1358 unsigned nr_pages) 1359 { 1360 BUG_ON(!nr_pages); 1361 /* 1362 * Warning: Do not do the decrement at the same time as the call to 1363 * flush_dcache_page() because it is a NULL macro on i386 and hence the 1364 * decrement never happens so the loop never terminates. 1365 */ 1366 do { 1367 --nr_pages; 1368 flush_dcache_page(pages[nr_pages]); 1369 } while (nr_pages > 0); 1370 } 1371 1372 /** 1373 * ntfs_commit_pages_after_non_resident_write - commit the received data 1374 * @pages: array of destination pages 1375 * @nr_pages: number of pages in @pages 1376 * @pos: byte position in file at which the write begins 1377 * @bytes: number of bytes to be written 1378 * 1379 * See description of ntfs_commit_pages_after_write(), below. 
1380 */ 1381 static inline int ntfs_commit_pages_after_non_resident_write( 1382 struct page **pages, const unsigned nr_pages, 1383 s64 pos, size_t bytes) 1384 { 1385 s64 end, initialized_size; 1386 struct inode *vi; 1387 ntfs_inode *ni, *base_ni; 1388 struct buffer_head *bh, *head; 1389 ntfs_attr_search_ctx *ctx; 1390 MFT_RECORD *m; 1391 ATTR_RECORD *a; 1392 unsigned long flags; 1393 unsigned blocksize, u; 1394 int err; 1395 1396 vi = pages[0]->mapping->host; 1397 ni = NTFS_I(vi); 1398 blocksize = vi->i_sb->s_blocksize; 1399 end = pos + bytes; 1400 u = 0; 1401 do { 1402 s64 bh_pos; 1403 struct page *page; 1404 bool partial; 1405 1406 page = pages[u]; 1407 bh_pos = (s64)page->index << PAGE_SHIFT; 1408 bh = head = page_buffers(page); 1409 partial = false; 1410 do { 1411 s64 bh_end; 1412 1413 bh_end = bh_pos + blocksize; 1414 if (bh_end <= pos || bh_pos >= end) { 1415 if (!buffer_uptodate(bh)) 1416 partial = true; 1417 } else { 1418 set_buffer_uptodate(bh); 1419 mark_buffer_dirty(bh); 1420 } 1421 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1422 /* 1423 * If all buffers are now uptodate but the page is not, set the 1424 * page uptodate. 1425 */ 1426 if (!partial && !PageUptodate(page)) 1427 SetPageUptodate(page); 1428 } while (++u < nr_pages); 1429 /* 1430 * Finally, if we do not need to update initialized_size or i_size we 1431 * are finished. 1432 */ 1433 read_lock_irqsave(&ni->size_lock, flags); 1434 initialized_size = ni->initialized_size; 1435 read_unlock_irqrestore(&ni->size_lock, flags); 1436 if (end <= initialized_size) { 1437 ntfs_debug("Done."); 1438 return 0; 1439 } 1440 /* 1441 * Update initialized_size/i_size as appropriate, both in the inode and 1442 * the mft record. 1443 */ 1444 if (!NInoAttr(ni)) 1445 base_ni = ni; 1446 else 1447 base_ni = ni->ext.base_ntfs_ino; 1448 /* Map, pin, and lock the mft record. */ 1449 m = map_mft_record(base_ni); 1450 if (IS_ERR(m)) { 1451 err = PTR_ERR(m); 1452 m = NULL; 1453 ctx = NULL; 1454 goto err_out; 1455 } 1456 BUG_ON(!NInoNonResident(ni)); 1457 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1458 if (unlikely(!ctx)) { 1459 err = -ENOMEM; 1460 goto err_out; 1461 } 1462 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1463 CASE_SENSITIVE, 0, NULL, 0, ctx); 1464 if (unlikely(err)) { 1465 if (err == -ENOENT) 1466 err = -EIO; 1467 goto err_out; 1468 } 1469 a = ctx->attr; 1470 BUG_ON(!a->non_resident); 1471 write_lock_irqsave(&ni->size_lock, flags); 1472 BUG_ON(end > ni->allocated_size); 1473 ni->initialized_size = end; 1474 a->data.non_resident.initialized_size = cpu_to_sle64(end); 1475 if (end > i_size_read(vi)) { 1476 i_size_write(vi, end); 1477 a->data.non_resident.data_size = 1478 a->data.non_resident.initialized_size; 1479 } 1480 write_unlock_irqrestore(&ni->size_lock, flags); 1481 /* Mark the mft record dirty, so it gets written back. 
*/ 1482 flush_dcache_mft_record_page(ctx->ntfs_ino); 1483 mark_mft_record_dirty(ctx->ntfs_ino); 1484 ntfs_attr_put_search_ctx(ctx); 1485 unmap_mft_record(base_ni); 1486 ntfs_debug("Done."); 1487 return 0; 1488 err_out: 1489 if (ctx) 1490 ntfs_attr_put_search_ctx(ctx); 1491 if (m) 1492 unmap_mft_record(base_ni); 1493 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " 1494 "code %i).", err); 1495 if (err != -ENOMEM) 1496 NVolSetErrors(ni->vol); 1497 return err; 1498 } 1499 1500 /** 1501 * ntfs_commit_pages_after_write - commit the received data 1502 * @pages: array of destination pages 1503 * @nr_pages: number of pages in @pages 1504 * @pos: byte position in file at which the write begins 1505 * @bytes: number of bytes to be written 1506 * 1507 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode 1508 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are 1509 * locked but not kmap()ped. The source data has already been copied into the 1510 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before 1511 * the data was copied (for non-resident attributes only) and it returned 1512 * success. 1513 * 1514 * Need to set uptodate and mark dirty all buffers within the boundary of the 1515 * write. If all buffers in a page are uptodate we set the page uptodate, too. 1516 * 1517 * Setting the buffers dirty ensures that they get written out later when 1518 * ntfs_writepage() is invoked by the VM. 1519 * 1520 * Finally, we need to update i_size and initialized_size as appropriate both 1521 * in the inode and the mft record. 1522 * 1523 * This is modelled after fs/buffer.c::generic_commit_write(), which marks 1524 * buffers uptodate and dirty, sets the page uptodate if all buffers in the 1525 * page are uptodate, and updates i_size if the end of io is beyond i_size. In 1526 * that case, it also marks the inode dirty. 1527 * 1528 * If things have gone as outlined in 1529 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page 1530 * content modifications here for non-resident attributes. For resident 1531 * attributes we need to do the uptodate bringing here which we combine with 1532 * the copying into the mft record which means we save one atomic kmap. 1533 * 1534 * Return 0 on success or -errno on error. 1535 */ 1536 static int ntfs_commit_pages_after_write(struct page **pages, 1537 const unsigned nr_pages, s64 pos, size_t bytes) 1538 { 1539 s64 end, initialized_size; 1540 loff_t i_size; 1541 struct inode *vi; 1542 ntfs_inode *ni, *base_ni; 1543 struct page *page; 1544 ntfs_attr_search_ctx *ctx; 1545 MFT_RECORD *m; 1546 ATTR_RECORD *a; 1547 char *kattr, *kaddr; 1548 unsigned long flags; 1549 u32 attr_len; 1550 int err; 1551 1552 BUG_ON(!nr_pages); 1553 BUG_ON(!pages); 1554 page = pages[0]; 1555 BUG_ON(!page); 1556 vi = page->mapping->host; 1557 ni = NTFS_I(vi); 1558 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 1559 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 1560 vi->i_ino, ni->type, page->index, nr_pages, 1561 (long long)pos, bytes); 1562 if (NInoNonResident(ni)) 1563 return ntfs_commit_pages_after_non_resident_write(pages, 1564 nr_pages, pos, bytes); 1565 BUG_ON(nr_pages > 1); 1566 /* 1567 * Attribute is resident, implying it is not compressed, encrypted, or 1568 * sparse. 1569 */ 1570 if (!NInoAttr(ni)) 1571 base_ni = ni; 1572 else 1573 base_ni = ni->ext.base_ntfs_ino; 1574 BUG_ON(NInoNonResident(ni)); 1575 /* Map, pin, and lock the mft record. 
*/ 1576 m = map_mft_record(base_ni); 1577 if (IS_ERR(m)) { 1578 err = PTR_ERR(m); 1579 m = NULL; 1580 ctx = NULL; 1581 goto err_out; 1582 } 1583 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1584 if (unlikely(!ctx)) { 1585 err = -ENOMEM; 1586 goto err_out; 1587 } 1588 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1589 CASE_SENSITIVE, 0, NULL, 0, ctx); 1590 if (unlikely(err)) { 1591 if (err == -ENOENT) 1592 err = -EIO; 1593 goto err_out; 1594 } 1595 a = ctx->attr; 1596 BUG_ON(a->non_resident); 1597 /* The total length of the attribute value. */ 1598 attr_len = le32_to_cpu(a->data.resident.value_length); 1599 i_size = i_size_read(vi); 1600 BUG_ON(attr_len != i_size); 1601 BUG_ON(pos > attr_len); 1602 end = pos + bytes; 1603 BUG_ON(end > le32_to_cpu(a->length) - 1604 le16_to_cpu(a->data.resident.value_offset)); 1605 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 1606 kaddr = kmap_atomic(page); 1607 /* Copy the received data from the page to the mft record. */ 1608 memcpy(kattr + pos, kaddr + pos, bytes); 1609 /* Update the attribute length if necessary. */ 1610 if (end > attr_len) { 1611 attr_len = end; 1612 a->data.resident.value_length = cpu_to_le32(attr_len); 1613 } 1614 /* 1615 * If the page is not uptodate, bring the out of bounds area(s) 1616 * uptodate by copying data from the mft record to the page. 1617 */ 1618 if (!PageUptodate(page)) { 1619 if (pos > 0) 1620 memcpy(kaddr, kattr, pos); 1621 if (end < attr_len) 1622 memcpy(kaddr + end, kattr + end, attr_len - end); 1623 /* Zero the region outside the end of the attribute value. */ 1624 memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); 1625 flush_dcache_page(page); 1626 SetPageUptodate(page); 1627 } 1628 kunmap_atomic(kaddr); 1629 /* Update initialized_size/i_size if necessary. */ 1630 read_lock_irqsave(&ni->size_lock, flags); 1631 initialized_size = ni->initialized_size; 1632 BUG_ON(end > ni->allocated_size); 1633 read_unlock_irqrestore(&ni->size_lock, flags); 1634 BUG_ON(initialized_size != i_size); 1635 if (end > initialized_size) { 1636 write_lock_irqsave(&ni->size_lock, flags); 1637 ni->initialized_size = end; 1638 i_size_write(vi, end); 1639 write_unlock_irqrestore(&ni->size_lock, flags); 1640 } 1641 /* Mark the mft record dirty, so it gets written back. */ 1642 flush_dcache_mft_record_page(ctx->ntfs_ino); 1643 mark_mft_record_dirty(ctx->ntfs_ino); 1644 ntfs_attr_put_search_ctx(ctx); 1645 unmap_mft_record(base_ni); 1646 ntfs_debug("Done."); 1647 return 0; 1648 err_out: 1649 if (err == -ENOMEM) { 1650 ntfs_warning(vi->i_sb, "Error allocating memory required to " 1651 "commit the write."); 1652 if (PageUptodate(page)) { 1653 ntfs_warning(vi->i_sb, "Page is uptodate, setting " 1654 "dirty so the write will be retried " 1655 "later on by the VM."); 1656 /* 1657 * Put the page on mapping->dirty_pages, but leave its 1658 * buffers' dirty state as-is. 1659 */ 1660 __set_page_dirty_nobuffers(page); 1661 err = 0; 1662 } else 1663 ntfs_error(vi->i_sb, "Page is not uptodate. Written " 1664 "data has been lost."); 1665 } else { 1666 ntfs_error(vi->i_sb, "Resident attribute commit write failed " 1667 "with error %i.", err); 1668 NVolSetErrors(ni->vol); 1669 } 1670 if (ctx) 1671 ntfs_attr_put_search_ctx(ctx); 1672 if (m) 1673 unmap_mft_record(base_ni); 1674 return err; 1675 } 1676 1677 /* 1678 * Copy as much as we can into the pages and return the number of bytes which 1679 * were successfully copied. 
If a fault is encountered then clear the pages 1680 * out to (ofs + bytes) and return the number of bytes which were copied. 1681 */ 1682 static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, 1683 unsigned ofs, struct iov_iter *i, size_t bytes) 1684 { 1685 struct page **last_page = pages + nr_pages; 1686 size_t total = 0; 1687 struct iov_iter data = *i; 1688 unsigned len, copied; 1689 1690 do { 1691 len = PAGE_SIZE - ofs; 1692 if (len > bytes) 1693 len = bytes; 1694 copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs, 1695 len); 1696 total += copied; 1697 bytes -= copied; 1698 if (!bytes) 1699 break; 1700 iov_iter_advance(&data, copied); 1701 if (copied < len) 1702 goto err; 1703 ofs = 0; 1704 } while (++pages < last_page); 1705 out: 1706 return total; 1707 err: 1708 /* Zero the rest of the target like __copy_from_user(). */ 1709 len = PAGE_SIZE - copied; 1710 do { 1711 if (len > bytes) 1712 len = bytes; 1713 zero_user(*pages, copied, len); 1714 bytes -= len; 1715 copied = 0; 1716 len = PAGE_SIZE; 1717 } while (++pages < last_page); 1718 goto out; 1719 } 1720 1721 /** 1722 * ntfs_perform_write - perform buffered write to a file 1723 * @file: file to write to 1724 * @i: iov_iter with data to write 1725 * @pos: byte offset in file at which to begin writing to 1726 */ 1727 static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, 1728 loff_t pos) 1729 { 1730 struct address_space *mapping = file->f_mapping; 1731 struct inode *vi = mapping->host; 1732 ntfs_inode *ni = NTFS_I(vi); 1733 ntfs_volume *vol = ni->vol; 1734 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; 1735 struct page *cached_page = NULL; 1736 VCN last_vcn; 1737 LCN lcn; 1738 size_t bytes; 1739 ssize_t status, written = 0; 1740 unsigned nr_pages; 1741 1742 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 1743 "0x%llx, count 0x%lx.", vi->i_ino, 1744 (unsigned)le32_to_cpu(ni->type), 1745 (unsigned long long)pos, 1746 (unsigned long)iov_iter_count(i)); 1747 /* 1748 * If a previous ntfs_truncate() failed, repeat it and abort if it 1749 * fails again. 1750 */ 1751 if (unlikely(NInoTruncateFailed(ni))) { 1752 int err; 1753 1754 inode_dio_wait(vi); 1755 err = ntfs_truncate(vi); 1756 if (err || NInoTruncateFailed(ni)) { 1757 if (!err) 1758 err = -EIO; 1759 ntfs_error(vol->sb, "Cannot perform write to inode " 1760 "0x%lx, attribute type 0x%x, because " 1761 "ntfs_truncate() failed (error code " 1762 "%i).", vi->i_ino, 1763 (unsigned)le32_to_cpu(ni->type), err); 1764 return err; 1765 } 1766 } 1767 /* 1768 * Determine the number of pages per cluster for non-resident 1769 * attributes. 1770 */ 1771 nr_pages = 1; 1772 if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) 1773 nr_pages = vol->cluster_size >> PAGE_SHIFT; 1774 last_vcn = -1; 1775 do { 1776 VCN vcn; 1777 pgoff_t idx, start_idx; 1778 unsigned ofs, do_pages, u; 1779 size_t copied; 1780 1781 start_idx = idx = pos >> PAGE_SHIFT; 1782 ofs = pos & ~PAGE_MASK; 1783 bytes = PAGE_SIZE - ofs; 1784 do_pages = 1; 1785 if (nr_pages > 1) { 1786 vcn = pos >> vol->cluster_size_bits; 1787 if (vcn != last_vcn) { 1788 last_vcn = vcn; 1789 /* 1790 * Get the lcn of the vcn the write is in. If 1791 * it is a hole, need to lock down all pages in 1792 * the cluster. 
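				 * For example (illustrative numbers only):
				 * with 64 KiB clusters and 4 KiB pages,
				 * nr_pages is 16, so a write landing in a hole
				 * locks all 16 pages of that cluster and is
				 * clamped to end at the cluster boundary.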
				 */
				down_read(&ni->runlist.lock);
				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
						vol->cluster_size_bits, false);
				up_read(&ni->runlist.lock);
				if (unlikely(lcn < LCN_HOLE)) {
					if (lcn == LCN_ENOMEM)
						status = -ENOMEM;
					else {
						status = -EIO;
						ntfs_error(vol->sb, "Cannot "
							"perform write to "
							"inode 0x%lx, "
							"attribute type 0x%x, "
							"because the attribute "
							"is corrupt.",
							vi->i_ino, (unsigned)
							le32_to_cpu(ni->type));
					}
					break;
				}
				if (lcn == LCN_HOLE) {
					start_idx = (pos & ~(s64)
							vol->cluster_size_mask)
							>> PAGE_SHIFT;
					bytes = vol->cluster_size - (pos &
							vol->cluster_size_mask);
					do_pages = nr_pages;
				}
			}
		}
		if (bytes > iov_iter_count(i))
			bytes = iov_iter_count(i);
again:
		/*
		 * Bring in the user page(s) that we will copy from _first_.
		 * Otherwise there is a nasty deadlock on copying from the same
		 * page(s) as we are writing to, without it/them being marked
		 * up-to-date. Note, at present there is nothing to stop the
		 * pages being swapped out between us bringing them into memory
		 * and doing the actual copying.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}
		/* Get and lock @do_pages starting at index @start_idx. */
		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
				pages, &cached_page);
		if (unlikely(status))
			break;
		/*
		 * For non-resident attributes, we need to fill any holes with
		 * actual clusters and ensure all buffers are mapped. We also
		 * need to bring uptodate any buffers that are only partially
		 * being written to.
		 */
		if (NInoNonResident(ni)) {
			status = ntfs_prepare_pages_for_non_resident_write(
					pages, do_pages, pos, bytes);
			if (unlikely(status)) {
				do {
					unlock_page(pages[--do_pages]);
					put_page(pages[do_pages]);
				} while (do_pages);
				break;
			}
		}
		u = (pos >> PAGE_SHIFT) - pages[0]->index;
		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
				i, bytes);
		ntfs_flush_dcache_pages(pages + u, do_pages - u);
		status = 0;
		if (likely(copied == bytes)) {
			status = ntfs_commit_pages_after_write(pages, do_pages,
					pos, bytes);
			if (!status)
				status = bytes;
		}
		do {
			unlock_page(pages[--do_pages]);
			put_page(pages[do_pages]);
		} while (do_pages);
		if (unlikely(status < 0))
			break;
		copied = status;
		cond_resched();
		if (unlikely(!copied)) {
			size_t sc;

			/*
			 * We failed to copy anything. Fall back to single
			 * segment length write.
			 *
			 * This is needed to avoid possible livelock in the
			 * case that all segments in the iov cannot be copied
			 * at once without a pagefault.
			 */
			sc = iov_iter_single_seg_count(i);
			if (bytes > sc)
				bytes = sc;
			goto again;
		}
		iov_iter_advance(i, copied);
		pos += copied;
		written += copied;
		balance_dirty_pages_ratelimited(mapping);
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}
	} while (iov_iter_count(i));
	if (cached_page)
		put_page(cached_page);
	ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
			written ? "written" : "status", (unsigned long)written,
			(long)status);
	return written ? written : status;
}

/**
 * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Basically the same as generic_file_write_iter() except that it ends up
 * calling ntfs_perform_write() instead of generic_perform_write() and that
 * O_DIRECT is not implemented.
 */
static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *vi = file_inode(file);
	ssize_t written = 0;
	ssize_t err;

	inode_lock(vi);
	/* We can write back this queue in page reclaim. */
	current->backing_dev_info = inode_to_bdi(vi);
	err = ntfs_prepare_file_for_write(iocb, from);
	if (iov_iter_count(from) && !err)
		written = ntfs_perform_write(file, from, iocb->ki_pos);
	current->backing_dev_info = NULL;
	inode_unlock(vi);
	iocb->ki_pos += written;
	if (likely(written > 0))
		written = generic_write_sync(iocb, written);
	return written ? written : err;
}

/**
 * ntfs_file_fsync - sync a file to disk
 * @filp: file to be synced
 * @start: offset in bytes of the start of the data range to sync
 * @end: offset in bytes of the end of the data range (inclusive)
 * @datasync: if non-zero only flush user data and not metadata
 *
 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
 * system calls. This function is inspired by fs/buffer.c::file_fsync().
 *
 * If @datasync is false, write the mft record and all associated extent mft
 * records as well as the $DATA attribute and then sync the block device.
 *
 * If @datasync is true and the attribute is non-resident, we skip the writing
 * of the mft record and all associated extent mft records (this might still
 * happen due to the write_inode_now() call).
 *
 * Also, if @datasync is true, we do not wait on the inode to be written out
 * but we always wait on the page cache pages to be written out.
 *
 * Locking: Caller must hold i_mutex on the inode.
 *
 * TODO: We should probably also write all attribute/index inodes associated
 * with this inode but since we have no simple way of getting to them we ignore
 * this problem for now.
 */
static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
		int datasync)
{
	struct inode *vi = filp->f_mapping->host;
	int err, ret = 0;

	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);

	err = file_write_and_wait_range(filp, start, end);
	if (err)
		return err;
	inode_lock(vi);

	BUG_ON(S_ISDIR(vi->i_mode));
	if (!datasync || !NInoNonResident(NTFS_I(vi)))
		ret = __ntfs_write_inode(vi, 1);
	write_inode_now(vi, !datasync);
	/*
	 * NOTE: If we were to use mapping->private_list (see ext2 and
	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
	 * sync_mapping_buffers(vi->i_mapping).
	 */
	err = sync_blockdev(vi->i_sb->s_bdev);
	if (unlikely(err && !ret))
		ret = err;
	if (likely(!ret))
		ntfs_debug("Done.");
	else
		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
	inode_unlock(vi);
	return ret;
}

#endif /* NTFS_RW */

const struct file_operations ntfs_file_ops = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
#ifdef NTFS_RW
	.write_iter	= ntfs_file_write_iter,
	.fsync		= ntfs_file_fsync,
#endif /* NTFS_RW */
	.mmap		= generic_file_mmap,
	.open		= ntfs_file_open,
	.splice_read	= generic_file_splice_read,
};

const struct inode_operations ntfs_file_inode_ops = {
#ifdef NTFS_RW
	.setattr	= ntfs_setattr,
#endif /* NTFS_RW */
};

const struct file_operations ntfs_empty_file_ops = {};

const struct inode_operations ntfs_empty_inode_ops = {};