1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * (C) 2001 Clemson University and The University of Chicago 4 * Copyright 2018 Omnibond Systems, L.L.C. 5 * 6 * See COPYING in top-level directory. 7 */ 8 9 /* 10 * Linux VFS inode operations. 11 */ 12 13 #include <linux/blkdev.h> 14 #include <linux/fileattr.h> 15 #include "protocol.h" 16 #include "orangefs-kernel.h" 17 #include "orangefs-bufmap.h" 18 19 static int orangefs_writepage_locked(struct page *page, 20 struct writeback_control *wbc) 21 { 22 struct inode *inode = page->mapping->host; 23 struct orangefs_write_range *wr = NULL; 24 struct iov_iter iter; 25 struct bio_vec bv; 26 size_t len, wlen; 27 ssize_t ret; 28 loff_t off; 29 30 set_page_writeback(page); 31 32 len = i_size_read(inode); 33 if (PagePrivate(page)) { 34 wr = (struct orangefs_write_range *)page_private(page); 35 WARN_ON(wr->pos >= len); 36 off = wr->pos; 37 if (off + wr->len > len) 38 wlen = len - off; 39 else 40 wlen = wr->len; 41 } else { 42 WARN_ON(1); 43 off = page_offset(page); 44 if (off + PAGE_SIZE > len) 45 wlen = len - off; 46 else 47 wlen = PAGE_SIZE; 48 } 49 /* Should've been handled in orangefs_invalidate_folio. */ 50 WARN_ON(off == len || off + wlen > len); 51 52 bv.bv_page = page; 53 bv.bv_len = wlen; 54 bv.bv_offset = off % PAGE_SIZE; 55 WARN_ON(wlen == 0); 56 iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); 57 58 ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, 59 len, wr, NULL, NULL); 60 if (ret < 0) { 61 SetPageError(page); 62 mapping_set_error(page->mapping, ret); 63 } else { 64 ret = 0; 65 } 66 kfree(detach_page_private(page)); 67 return ret; 68 } 69 70 static int orangefs_writepage(struct page *page, struct writeback_control *wbc) 71 { 72 int ret; 73 ret = orangefs_writepage_locked(page, wbc); 74 unlock_page(page); 75 end_page_writeback(page); 76 return ret; 77 } 78 79 struct orangefs_writepages { 80 loff_t off; 81 size_t len; 82 kuid_t uid; 83 kgid_t gid; 84 int maxpages; 85 int npages; 86 struct page **pages; 87 struct bio_vec *bv; 88 }; 89 90 static int orangefs_writepages_work(struct orangefs_writepages *ow, 91 struct writeback_control *wbc) 92 { 93 struct inode *inode = ow->pages[0]->mapping->host; 94 struct orangefs_write_range *wrp, wr; 95 struct iov_iter iter; 96 ssize_t ret; 97 size_t len; 98 loff_t off; 99 int i; 100 101 len = i_size_read(inode); 102 103 for (i = 0; i < ow->npages; i++) { 104 set_page_writeback(ow->pages[i]); 105 ow->bv[i].bv_page = ow->pages[i]; 106 ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE, 107 ow->off + ow->len) - 108 max(ow->off, page_offset(ow->pages[i])); 109 if (i == 0) 110 ow->bv[i].bv_offset = ow->off - 111 page_offset(ow->pages[i]); 112 else 113 ow->bv[i].bv_offset = 0; 114 } 115 iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); 116 117 WARN_ON(ow->off >= len); 118 if (ow->off + ow->len > len) 119 ow->len = len - ow->off; 120 121 off = ow->off; 122 wr.uid = ow->uid; 123 wr.gid = ow->gid; 124 ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, 125 0, &wr, NULL, NULL); 126 if (ret < 0) { 127 for (i = 0; i < ow->npages; i++) { 128 SetPageError(ow->pages[i]); 129 mapping_set_error(ow->pages[i]->mapping, ret); 130 if (PagePrivate(ow->pages[i])) { 131 wrp = (struct orangefs_write_range *) 132 page_private(ow->pages[i]); 133 ClearPagePrivate(ow->pages[i]); 134 put_page(ow->pages[i]); 135 kfree(wrp); 136 } 137 end_page_writeback(ow->pages[i]); 138 unlock_page(ow->pages[i]); 139 } 140 } else { 141 ret = 0; 142 for (i = 0; i < ow->npages; i++) { 143 if (PagePrivate(ow->pages[i])) { 144 wrp = (struct orangefs_write_range *) 145 page_private(ow->pages[i]); 146 ClearPagePrivate(ow->pages[i]); 147 put_page(ow->pages[i]); 148 kfree(wrp); 149 } 150 end_page_writeback(ow->pages[i]); 151 unlock_page(ow->pages[i]); 152 } 153 } 154 return ret; 155 } 156 157 static int orangefs_writepages_callback(struct folio *folio, 158 struct writeback_control *wbc, void *data) 159 { 160 struct orangefs_writepages *ow = data; 161 struct orangefs_write_range *wr = folio->private; 162 int ret; 163 164 if (!wr) { 165 folio_unlock(folio); 166 /* It's not private so there's nothing to write, right? */ 167 printk("writepages_callback not private!\n"); 168 BUG(); 169 return 0; 170 } 171 172 ret = -1; 173 if (ow->npages == 0) { 174 ow->off = wr->pos; 175 ow->len = wr->len; 176 ow->uid = wr->uid; 177 ow->gid = wr->gid; 178 ow->pages[ow->npages++] = &folio->page; 179 ret = 0; 180 goto done; 181 } 182 if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) { 183 orangefs_writepages_work(ow, wbc); 184 ow->npages = 0; 185 ret = -1; 186 goto done; 187 } 188 if (ow->off + ow->len == wr->pos) { 189 ow->len += wr->len; 190 ow->pages[ow->npages++] = &folio->page; 191 ret = 0; 192 goto done; 193 } 194 done: 195 if (ret == -1) { 196 if (ow->npages) { 197 orangefs_writepages_work(ow, wbc); 198 ow->npages = 0; 199 } 200 ret = orangefs_writepage_locked(&folio->page, wbc); 201 mapping_set_error(folio->mapping, ret); 202 folio_unlock(folio); 203 folio_end_writeback(folio); 204 } else { 205 if (ow->npages == ow->maxpages) { 206 orangefs_writepages_work(ow, wbc); 207 ow->npages = 0; 208 } 209 } 210 return ret; 211 } 212 213 static int orangefs_writepages(struct address_space *mapping, 214 struct writeback_control *wbc) 215 { 216 struct orangefs_writepages *ow; 217 struct blk_plug plug; 218 int ret; 219 ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL); 220 if (!ow) 221 return -ENOMEM; 222 ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE; 223 ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL); 224 if (!ow->pages) { 225 kfree(ow); 226 return -ENOMEM; 227 } 228 ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL); 229 if (!ow->bv) { 230 kfree(ow->pages); 231 kfree(ow); 232 return -ENOMEM; 233 } 234 blk_start_plug(&plug); 235 ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow); 236 if (ow->npages) 237 ret = orangefs_writepages_work(ow, wbc); 238 blk_finish_plug(&plug); 239 kfree(ow->pages); 240 kfree(ow->bv); 241 kfree(ow); 242 return ret; 243 } 244 245 static int orangefs_launder_folio(struct folio *); 246 247 static void orangefs_readahead(struct readahead_control *rac) 248 { 249 loff_t offset; 250 struct iov_iter iter; 251 struct inode *inode = rac->mapping->host; 252 struct xarray *i_pages; 253 struct page *page; 254 loff_t new_start = readahead_pos(rac); 255 int ret; 256 size_t new_len = 0; 257 258 loff_t bytes_remaining = inode->i_size - readahead_pos(rac); 259 loff_t pages_remaining = bytes_remaining / PAGE_SIZE; 260 261 if (pages_remaining >= 1024) 262 new_len = 4194304; 263 else if (pages_remaining > readahead_count(rac)) 264 new_len = bytes_remaining; 265 266 if (new_len) 267 readahead_expand(rac, new_start, new_len); 268 269 offset = readahead_pos(rac); 270 i_pages = &rac->mapping->i_pages; 271 272 iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac)); 273 274 /* read in the pages. */ 275 if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, 276 &offset, &iter, readahead_length(rac), 277 inode->i_size, NULL, NULL, rac->file)) < 0) 278 gossip_debug(GOSSIP_FILE_DEBUG, 279 "%s: wait_for_direct_io failed. \n", __func__); 280 else 281 ret = 0; 282 283 /* clean up. */ 284 while ((page = readahead_page(rac))) { 285 page_endio(page, false, ret); 286 put_page(page); 287 } 288 } 289 290 static int orangefs_read_folio(struct file *file, struct folio *folio) 291 { 292 struct inode *inode = folio->mapping->host; 293 struct iov_iter iter; 294 struct bio_vec bv; 295 ssize_t ret; 296 loff_t off; /* offset of this folio in the file */ 297 298 if (folio_test_dirty(folio)) 299 orangefs_launder_folio(folio); 300 301 off = folio_pos(folio); 302 bv.bv_page = &folio->page; 303 bv.bv_len = folio_size(folio); 304 bv.bv_offset = 0; 305 iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio)); 306 307 ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, 308 folio_size(folio), inode->i_size, NULL, NULL, file); 309 /* this will only zero remaining unread portions of the folio data */ 310 iov_iter_zero(~0U, &iter); 311 /* takes care of potential aliasing */ 312 flush_dcache_folio(folio); 313 if (ret < 0) { 314 folio_set_error(folio); 315 } else { 316 folio_mark_uptodate(folio); 317 ret = 0; 318 } 319 /* unlock the folio after the ->read_folio() routine completes */ 320 folio_unlock(folio); 321 return ret; 322 } 323 324 static int orangefs_write_begin(struct file *file, 325 struct address_space *mapping, loff_t pos, unsigned len, 326 struct page **pagep, void **fsdata) 327 { 328 struct orangefs_write_range *wr; 329 struct folio *folio; 330 struct page *page; 331 pgoff_t index; 332 int ret; 333 334 index = pos >> PAGE_SHIFT; 335 336 page = grab_cache_page_write_begin(mapping, index); 337 if (!page) 338 return -ENOMEM; 339 340 *pagep = page; 341 folio = page_folio(page); 342 343 if (folio_test_dirty(folio) && !folio_test_private(folio)) { 344 /* 345 * Should be impossible. If it happens, launder the page 346 * since we don't know what's dirty. This will WARN in 347 * orangefs_writepage_locked. 348 */ 349 ret = orangefs_launder_folio(folio); 350 if (ret) 351 return ret; 352 } 353 if (folio_test_private(folio)) { 354 struct orangefs_write_range *wr; 355 wr = folio_get_private(folio); 356 if (wr->pos + wr->len == pos && 357 uid_eq(wr->uid, current_fsuid()) && 358 gid_eq(wr->gid, current_fsgid())) { 359 wr->len += len; 360 goto okay; 361 } else { 362 ret = orangefs_launder_folio(folio); 363 if (ret) 364 return ret; 365 } 366 } 367 368 wr = kmalloc(sizeof *wr, GFP_KERNEL); 369 if (!wr) 370 return -ENOMEM; 371 372 wr->pos = pos; 373 wr->len = len; 374 wr->uid = current_fsuid(); 375 wr->gid = current_fsgid(); 376 folio_attach_private(folio, wr); 377 okay: 378 return 0; 379 } 380 381 static int orangefs_write_end(struct file *file, struct address_space *mapping, 382 loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) 383 { 384 struct inode *inode = page->mapping->host; 385 loff_t last_pos = pos + copied; 386 387 /* 388 * No need to use i_size_read() here, the i_size 389 * cannot change under us because we hold the i_mutex. 390 */ 391 if (last_pos > inode->i_size) 392 i_size_write(inode, last_pos); 393 394 /* zero the stale part of the page if we did a short copy */ 395 if (!PageUptodate(page)) { 396 unsigned from = pos & (PAGE_SIZE - 1); 397 if (copied < len) { 398 zero_user(page, from + copied, len - copied); 399 } 400 /* Set fully written pages uptodate. */ 401 if (pos == page_offset(page) && 402 (len == PAGE_SIZE || pos + len == inode->i_size)) { 403 zero_user_segment(page, from + copied, PAGE_SIZE); 404 SetPageUptodate(page); 405 } 406 } 407 408 set_page_dirty(page); 409 unlock_page(page); 410 put_page(page); 411 412 mark_inode_dirty_sync(file_inode(file)); 413 return copied; 414 } 415 416 static void orangefs_invalidate_folio(struct folio *folio, 417 size_t offset, size_t length) 418 { 419 struct orangefs_write_range *wr = folio_get_private(folio); 420 421 if (offset == 0 && length == PAGE_SIZE) { 422 kfree(folio_detach_private(folio)); 423 return; 424 /* write range entirely within invalidate range (or equal) */ 425 } else if (folio_pos(folio) + offset <= wr->pos && 426 wr->pos + wr->len <= folio_pos(folio) + offset + length) { 427 kfree(folio_detach_private(folio)); 428 /* XXX is this right? only caller in fs */ 429 folio_cancel_dirty(folio); 430 return; 431 /* invalidate range chops off end of write range */ 432 } else if (wr->pos < folio_pos(folio) + offset && 433 wr->pos + wr->len <= folio_pos(folio) + offset + length && 434 folio_pos(folio) + offset < wr->pos + wr->len) { 435 size_t x; 436 x = wr->pos + wr->len - (folio_pos(folio) + offset); 437 WARN_ON(x > wr->len); 438 wr->len -= x; 439 wr->uid = current_fsuid(); 440 wr->gid = current_fsgid(); 441 /* invalidate range chops off beginning of write range */ 442 } else if (folio_pos(folio) + offset <= wr->pos && 443 folio_pos(folio) + offset + length < wr->pos + wr->len && 444 wr->pos < folio_pos(folio) + offset + length) { 445 size_t x; 446 x = folio_pos(folio) + offset + length - wr->pos; 447 WARN_ON(x > wr->len); 448 wr->pos += x; 449 wr->len -= x; 450 wr->uid = current_fsuid(); 451 wr->gid = current_fsgid(); 452 /* invalidate range entirely within write range (punch hole) */ 453 } else if (wr->pos < folio_pos(folio) + offset && 454 folio_pos(folio) + offset + length < wr->pos + wr->len) { 455 /* XXX what do we do here... should not WARN_ON */ 456 WARN_ON(1); 457 /* punch hole */ 458 /* 459 * should we just ignore this and write it out anyway? 460 * it hardly makes sense 461 */ 462 return; 463 /* non-overlapping ranges */ 464 } else { 465 /* WARN if they do overlap */ 466 if (!((folio_pos(folio) + offset + length <= wr->pos) ^ 467 (wr->pos + wr->len <= folio_pos(folio) + offset))) { 468 WARN_ON(1); 469 printk("invalidate range offset %llu length %zu\n", 470 folio_pos(folio) + offset, length); 471 printk("write range offset %llu length %zu\n", 472 wr->pos, wr->len); 473 } 474 return; 475 } 476 477 /* 478 * Above there are returns where wr is freed or where we WARN. 479 * Thus the following runs if wr was modified above. 480 */ 481 482 orangefs_launder_folio(folio); 483 } 484 485 static bool orangefs_release_folio(struct folio *folio, gfp_t foo) 486 { 487 return !folio_test_private(folio); 488 } 489 490 static void orangefs_free_folio(struct folio *folio) 491 { 492 kfree(folio_detach_private(folio)); 493 } 494 495 static int orangefs_launder_folio(struct folio *folio) 496 { 497 int r = 0; 498 struct writeback_control wbc = { 499 .sync_mode = WB_SYNC_ALL, 500 .nr_to_write = 0, 501 }; 502 folio_wait_writeback(folio); 503 if (folio_clear_dirty_for_io(folio)) { 504 r = orangefs_writepage_locked(&folio->page, &wbc); 505 folio_end_writeback(folio); 506 } 507 return r; 508 } 509 510 static ssize_t orangefs_direct_IO(struct kiocb *iocb, 511 struct iov_iter *iter) 512 { 513 /* 514 * Comment from original do_readv_writev: 515 * Common entry point for read/write/readv/writev 516 * This function will dispatch it to either the direct I/O 517 * or buffered I/O path depending on the mount options and/or 518 * augmented/extended metadata attached to the file. 519 * Note: File extended attributes override any mount options. 520 */ 521 struct file *file = iocb->ki_filp; 522 loff_t pos = iocb->ki_pos; 523 enum ORANGEFS_io_type type = iov_iter_rw(iter) == WRITE ? 524 ORANGEFS_IO_WRITE : ORANGEFS_IO_READ; 525 loff_t *offset = &pos; 526 struct inode *inode = file->f_mapping->host; 527 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 528 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 529 size_t count = iov_iter_count(iter); 530 ssize_t total_count = 0; 531 ssize_t ret = -EINVAL; 532 533 gossip_debug(GOSSIP_FILE_DEBUG, 534 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", 535 __func__, 536 handle, 537 (int)count); 538 539 if (type == ORANGEFS_IO_WRITE) { 540 gossip_debug(GOSSIP_FILE_DEBUG, 541 "%s(%pU): proceeding with offset : %llu, " 542 "size %d\n", 543 __func__, 544 handle, 545 llu(*offset), 546 (int)count); 547 } 548 549 if (count == 0) { 550 ret = 0; 551 goto out; 552 } 553 554 while (iov_iter_count(iter)) { 555 size_t each_count = iov_iter_count(iter); 556 size_t amt_complete; 557 558 /* how much to transfer in this loop iteration */ 559 if (each_count > orangefs_bufmap_size_query()) 560 each_count = orangefs_bufmap_size_query(); 561 562 gossip_debug(GOSSIP_FILE_DEBUG, 563 "%s(%pU): size of each_count(%d)\n", 564 __func__, 565 handle, 566 (int)each_count); 567 gossip_debug(GOSSIP_FILE_DEBUG, 568 "%s(%pU): BEFORE wait_for_io: offset is %d\n", 569 __func__, 570 handle, 571 (int)*offset); 572 573 ret = wait_for_direct_io(type, inode, offset, iter, 574 each_count, 0, NULL, NULL, file); 575 gossip_debug(GOSSIP_FILE_DEBUG, 576 "%s(%pU): return from wait_for_io:%d\n", 577 __func__, 578 handle, 579 (int)ret); 580 581 if (ret < 0) 582 goto out; 583 584 *offset += ret; 585 total_count += ret; 586 amt_complete = ret; 587 588 gossip_debug(GOSSIP_FILE_DEBUG, 589 "%s(%pU): AFTER wait_for_io: offset is %d\n", 590 __func__, 591 handle, 592 (int)*offset); 593 594 /* 595 * if we got a short I/O operations, 596 * fall out and return what we got so far 597 */ 598 if (amt_complete < each_count) 599 break; 600 } /*end while */ 601 602 out: 603 if (total_count > 0) 604 ret = total_count; 605 if (ret > 0) { 606 if (type == ORANGEFS_IO_READ) { 607 file_accessed(file); 608 } else { 609 file_update_time(file); 610 if (*offset > i_size_read(inode)) 611 i_size_write(inode, *offset); 612 } 613 } 614 615 gossip_debug(GOSSIP_FILE_DEBUG, 616 "%s(%pU): Value(%d) returned.\n", 617 __func__, 618 handle, 619 (int)ret); 620 621 return ret; 622 } 623 624 /** ORANGEFS2 implementation of address space operations */ 625 static const struct address_space_operations orangefs_address_operations = { 626 .writepage = orangefs_writepage, 627 .readahead = orangefs_readahead, 628 .read_folio = orangefs_read_folio, 629 .writepages = orangefs_writepages, 630 .dirty_folio = filemap_dirty_folio, 631 .write_begin = orangefs_write_begin, 632 .write_end = orangefs_write_end, 633 .invalidate_folio = orangefs_invalidate_folio, 634 .release_folio = orangefs_release_folio, 635 .free_folio = orangefs_free_folio, 636 .launder_folio = orangefs_launder_folio, 637 .direct_IO = orangefs_direct_IO, 638 }; 639 640 vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) 641 { 642 struct folio *folio = page_folio(vmf->page); 643 struct inode *inode = file_inode(vmf->vma->vm_file); 644 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 645 unsigned long *bitlock = &orangefs_inode->bitlock; 646 vm_fault_t ret; 647 struct orangefs_write_range *wr; 648 649 sb_start_pagefault(inode->i_sb); 650 651 if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) { 652 ret = VM_FAULT_RETRY; 653 goto out; 654 } 655 656 folio_lock(folio); 657 if (folio_test_dirty(folio) && !folio_test_private(folio)) { 658 /* 659 * Should be impossible. If it happens, launder the folio 660 * since we don't know what's dirty. This will WARN in 661 * orangefs_writepage_locked. 662 */ 663 if (orangefs_launder_folio(folio)) { 664 ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; 665 goto out; 666 } 667 } 668 if (folio_test_private(folio)) { 669 wr = folio_get_private(folio); 670 if (uid_eq(wr->uid, current_fsuid()) && 671 gid_eq(wr->gid, current_fsgid())) { 672 wr->pos = page_offset(vmf->page); 673 wr->len = PAGE_SIZE; 674 goto okay; 675 } else { 676 if (orangefs_launder_folio(folio)) { 677 ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; 678 goto out; 679 } 680 } 681 } 682 wr = kmalloc(sizeof *wr, GFP_KERNEL); 683 if (!wr) { 684 ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; 685 goto out; 686 } 687 wr->pos = page_offset(vmf->page); 688 wr->len = PAGE_SIZE; 689 wr->uid = current_fsuid(); 690 wr->gid = current_fsgid(); 691 folio_attach_private(folio, wr); 692 okay: 693 694 file_update_time(vmf->vma->vm_file); 695 if (folio->mapping != inode->i_mapping) { 696 folio_unlock(folio); 697 ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE; 698 goto out; 699 } 700 701 /* 702 * We mark the folio dirty already here so that when freeze is in 703 * progress, we are guaranteed that writeback during freezing will 704 * see the dirty folio and writeprotect it again. 705 */ 706 folio_mark_dirty(folio); 707 folio_wait_stable(folio); 708 ret = VM_FAULT_LOCKED; 709 out: 710 sb_end_pagefault(inode->i_sb); 711 return ret; 712 } 713 714 static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) 715 { 716 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 717 struct orangefs_kernel_op_s *new_op; 718 loff_t orig_size; 719 int ret = -EINVAL; 720 721 gossip_debug(GOSSIP_INODE_DEBUG, 722 "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n", 723 __func__, 724 get_khandle_from_ino(inode), 725 &orangefs_inode->refn.khandle, 726 orangefs_inode->refn.fs_id, 727 iattr->ia_size); 728 729 /* Ensure that we have a up to date size, so we know if it changed. */ 730 ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE); 731 if (ret == -ESTALE) 732 ret = -EIO; 733 if (ret) { 734 gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n", 735 __func__, ret); 736 return ret; 737 } 738 orig_size = i_size_read(inode); 739 740 /* This is truncate_setsize in a different order. */ 741 truncate_pagecache(inode, iattr->ia_size); 742 i_size_write(inode, iattr->ia_size); 743 if (iattr->ia_size > orig_size) 744 pagecache_isize_extended(inode, orig_size, iattr->ia_size); 745 746 new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE); 747 if (!new_op) 748 return -ENOMEM; 749 750 new_op->upcall.req.truncate.refn = orangefs_inode->refn; 751 new_op->upcall.req.truncate.size = (__s64) iattr->ia_size; 752 753 ret = service_operation(new_op, 754 __func__, 755 get_interruptible_flag(inode)); 756 757 /* 758 * the truncate has no downcall members to retrieve, but 759 * the status value tells us if it went through ok or not 760 */ 761 gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret); 762 763 op_release(new_op); 764 765 if (ret != 0) 766 return ret; 767 768 if (orig_size != i_size_read(inode)) 769 iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; 770 771 return ret; 772 } 773 774 int __orangefs_setattr(struct inode *inode, struct iattr *iattr) 775 { 776 int ret; 777 778 if (iattr->ia_valid & ATTR_MODE) { 779 if (iattr->ia_mode & (S_ISVTX)) { 780 if (is_root_handle(inode)) { 781 /* 782 * allow sticky bit to be set on root (since 783 * it shows up that way by default anyhow), 784 * but don't show it to the server 785 */ 786 iattr->ia_mode -= S_ISVTX; 787 } else { 788 gossip_debug(GOSSIP_UTILS_DEBUG, 789 "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); 790 ret = -EINVAL; 791 goto out; 792 } 793 } 794 if (iattr->ia_mode & (S_ISUID)) { 795 gossip_debug(GOSSIP_UTILS_DEBUG, 796 "Attempting to set setuid bit (not supported); returning EINVAL.\n"); 797 ret = -EINVAL; 798 goto out; 799 } 800 } 801 802 if (iattr->ia_valid & ATTR_SIZE) { 803 ret = orangefs_setattr_size(inode, iattr); 804 if (ret) 805 goto out; 806 } 807 808 again: 809 spin_lock(&inode->i_lock); 810 if (ORANGEFS_I(inode)->attr_valid) { 811 if (uid_eq(ORANGEFS_I(inode)->attr_uid, current_fsuid()) && 812 gid_eq(ORANGEFS_I(inode)->attr_gid, current_fsgid())) { 813 ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; 814 } else { 815 spin_unlock(&inode->i_lock); 816 write_inode_now(inode, 1); 817 goto again; 818 } 819 } else { 820 ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; 821 ORANGEFS_I(inode)->attr_uid = current_fsuid(); 822 ORANGEFS_I(inode)->attr_gid = current_fsgid(); 823 } 824 setattr_copy(&init_user_ns, inode, iattr); 825 spin_unlock(&inode->i_lock); 826 mark_inode_dirty(inode); 827 828 ret = 0; 829 out: 830 return ret; 831 } 832 833 int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr) 834 { 835 int ret; 836 struct inode *inode = d_inode(dentry); 837 838 ret = __orangefs_setattr(inode, iattr); 839 /* change mode on a file that has ACLs */ 840 if (!ret && (iattr->ia_valid & ATTR_MODE)) 841 ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); 842 return ret; 843 } 844 845 /* 846 * Change attributes of an object referenced by dentry. 847 */ 848 int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, 849 struct iattr *iattr) 850 { 851 int ret; 852 gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", 853 dentry); 854 ret = setattr_prepare(&init_user_ns, dentry, iattr); 855 if (ret) 856 goto out; 857 ret = __orangefs_setattr_mode(dentry, iattr); 858 sync_inode_metadata(d_inode(dentry), 1); 859 out: 860 gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", 861 ret); 862 return ret; 863 } 864 865 /* 866 * Obtain attributes of an object given a dentry 867 */ 868 int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, 869 struct kstat *stat, u32 request_mask, unsigned int flags) 870 { 871 int ret; 872 struct inode *inode = path->dentry->d_inode; 873 874 gossip_debug(GOSSIP_INODE_DEBUG, 875 "orangefs_getattr: called on %pd mask %u\n", 876 path->dentry, request_mask); 877 878 ret = orangefs_inode_getattr(inode, 879 request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); 880 if (ret == 0) { 881 generic_fillattr(&init_user_ns, inode, stat); 882 883 /* override block size reported to stat */ 884 if (!(request_mask & STATX_SIZE)) 885 stat->result_mask &= ~STATX_SIZE; 886 887 generic_fill_statx_attr(inode, stat); 888 } 889 return ret; 890 } 891 892 int orangefs_permission(struct user_namespace *mnt_userns, 893 struct inode *inode, int mask) 894 { 895 int ret; 896 897 if (mask & MAY_NOT_BLOCK) 898 return -ECHILD; 899 900 gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__); 901 902 /* Make sure the permission (and other common attrs) are up to date. */ 903 ret = orangefs_inode_getattr(inode, 0); 904 if (ret < 0) 905 return ret; 906 907 return generic_permission(&init_user_ns, inode, mask); 908 } 909 910 int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) 911 { 912 struct iattr iattr; 913 gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", 914 get_khandle_from_ino(inode)); 915 generic_update_time(inode, time, flags); 916 memset(&iattr, 0, sizeof iattr); 917 if (flags & S_ATIME) 918 iattr.ia_valid |= ATTR_ATIME; 919 if (flags & S_CTIME) 920 iattr.ia_valid |= ATTR_CTIME; 921 if (flags & S_MTIME) 922 iattr.ia_valid |= ATTR_MTIME; 923 return __orangefs_setattr(inode, &iattr); 924 } 925 926 static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) 927 { 928 u64 val = 0; 929 int ret; 930 931 gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__, 932 dentry); 933 934 ret = orangefs_inode_getxattr(d_inode(dentry), 935 "user.pvfs2.meta_hint", 936 &val, sizeof(val)); 937 if (ret < 0 && ret != -ENODATA) 938 return ret; 939 940 gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val); 941 942 fileattr_fill_flags(fa, val); 943 return 0; 944 } 945 946 static int orangefs_fileattr_set(struct user_namespace *mnt_userns, 947 struct dentry *dentry, struct fileattr *fa) 948 { 949 u64 val = 0; 950 951 gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__, 952 dentry); 953 /* 954 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode is 955 * turned on for a file. The user is not allowed to turn on this bit, 956 * but the bit is present if the user first gets the flags and then 957 * updates the flags with some new settings. So, we ignore it in the 958 * following edit. bligon. 959 */ 960 if (fileattr_has_fsx(fa) || 961 (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | ORANGEFS_MIRROR_FL))) { 962 gossip_err("%s: only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n", 963 __func__); 964 return -EOPNOTSUPP; 965 } 966 val = fa->flags; 967 gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val); 968 return orangefs_inode_setxattr(d_inode(dentry), 969 "user.pvfs2.meta_hint", 970 &val, sizeof(val), 0); 971 } 972 973 /* ORANGEFS2 implementation of VFS inode operations for files */ 974 static const struct inode_operations orangefs_file_inode_operations = { 975 .get_inode_acl = orangefs_get_acl, 976 .set_acl = orangefs_set_acl, 977 .setattr = orangefs_setattr, 978 .getattr = orangefs_getattr, 979 .listxattr = orangefs_listxattr, 980 .permission = orangefs_permission, 981 .update_time = orangefs_update_time, 982 .fileattr_get = orangefs_fileattr_get, 983 .fileattr_set = orangefs_fileattr_set, 984 }; 985 986 static int orangefs_init_iops(struct inode *inode) 987 { 988 inode->i_mapping->a_ops = &orangefs_address_operations; 989 990 switch (inode->i_mode & S_IFMT) { 991 case S_IFREG: 992 inode->i_op = &orangefs_file_inode_operations; 993 inode->i_fop = &orangefs_file_operations; 994 break; 995 case S_IFLNK: 996 inode->i_op = &orangefs_symlink_inode_operations; 997 break; 998 case S_IFDIR: 999 inode->i_op = &orangefs_dir_inode_operations; 1000 inode->i_fop = &orangefs_dir_operations; 1001 break; 1002 default: 1003 gossip_debug(GOSSIP_INODE_DEBUG, 1004 "%s: unsupported mode\n", 1005 __func__); 1006 return -EINVAL; 1007 } 1008 1009 return 0; 1010 } 1011 1012 /* 1013 * Given an ORANGEFS object identifier (fsid, handle), convert it into 1014 * a ino_t type that will be used as a hash-index from where the handle will 1015 * be searched for in the VFS hash table of inodes. 1016 */ 1017 static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref) 1018 { 1019 if (!ref) 1020 return 0; 1021 return orangefs_khandle_to_ino(&(ref->khandle)); 1022 } 1023 1024 /* 1025 * Called to set up an inode from iget5_locked. 1026 */ 1027 static int orangefs_set_inode(struct inode *inode, void *data) 1028 { 1029 struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data; 1030 ORANGEFS_I(inode)->refn.fs_id = ref->fs_id; 1031 ORANGEFS_I(inode)->refn.khandle = ref->khandle; 1032 ORANGEFS_I(inode)->attr_valid = 0; 1033 hash_init(ORANGEFS_I(inode)->xattr_cache); 1034 ORANGEFS_I(inode)->mapping_time = jiffies - 1; 1035 ORANGEFS_I(inode)->bitlock = 0; 1036 return 0; 1037 } 1038 1039 /* 1040 * Called to determine if handles match. 1041 */ 1042 static int orangefs_test_inode(struct inode *inode, void *data) 1043 { 1044 struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data; 1045 struct orangefs_inode_s *orangefs_inode = NULL; 1046 1047 orangefs_inode = ORANGEFS_I(inode); 1048 /* test handles and fs_ids... */ 1049 return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), 1050 &(ref->khandle)) && 1051 orangefs_inode->refn.fs_id == ref->fs_id); 1052 } 1053 1054 /* 1055 * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS 1056 * file handle. 1057 * 1058 * @sb: the file system super block instance. 1059 * @ref: The ORANGEFS object for which we are trying to locate an inode. 1060 */ 1061 struct inode *orangefs_iget(struct super_block *sb, 1062 struct orangefs_object_kref *ref) 1063 { 1064 struct inode *inode = NULL; 1065 unsigned long hash; 1066 int error; 1067 1068 hash = orangefs_handle_hash(ref); 1069 inode = iget5_locked(sb, 1070 hash, 1071 orangefs_test_inode, 1072 orangefs_set_inode, 1073 ref); 1074 1075 if (!inode) 1076 return ERR_PTR(-ENOMEM); 1077 1078 if (!(inode->i_state & I_NEW)) 1079 return inode; 1080 1081 error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); 1082 if (error) { 1083 iget_failed(inode); 1084 return ERR_PTR(error); 1085 } 1086 1087 inode->i_ino = hash; /* needed for stat etc */ 1088 orangefs_init_iops(inode); 1089 unlock_new_inode(inode); 1090 1091 gossip_debug(GOSSIP_INODE_DEBUG, 1092 "iget handle %pU, fsid %d hash %ld i_ino %lu\n", 1093 &ref->khandle, 1094 ref->fs_id, 1095 hash, 1096 inode->i_ino); 1097 1098 return inode; 1099 } 1100 1101 /* 1102 * Allocate an inode for a newly created file and insert it into the inode hash. 1103 */ 1104 struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, 1105 umode_t mode, dev_t dev, struct orangefs_object_kref *ref) 1106 { 1107 struct posix_acl *acl = NULL, *default_acl = NULL; 1108 unsigned long hash = orangefs_handle_hash(ref); 1109 struct inode *inode; 1110 int error; 1111 1112 gossip_debug(GOSSIP_INODE_DEBUG, 1113 "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n", 1114 __func__, 1115 sb, 1116 MAJOR(dev), 1117 MINOR(dev), 1118 mode); 1119 1120 inode = new_inode(sb); 1121 if (!inode) 1122 return ERR_PTR(-ENOMEM); 1123 1124 error = posix_acl_create(dir, &mode, &default_acl, &acl); 1125 if (error) 1126 goto out_iput; 1127 1128 orangefs_set_inode(inode, ref); 1129 inode->i_ino = hash; /* needed for stat etc */ 1130 1131 error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); 1132 if (error) 1133 goto out_iput; 1134 1135 orangefs_init_iops(inode); 1136 inode->i_rdev = dev; 1137 1138 if (default_acl) { 1139 error = __orangefs_set_acl(inode, default_acl, 1140 ACL_TYPE_DEFAULT); 1141 if (error) 1142 goto out_iput; 1143 } 1144 1145 if (acl) { 1146 error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); 1147 if (error) 1148 goto out_iput; 1149 } 1150 1151 error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref); 1152 if (error < 0) 1153 goto out_iput; 1154 1155 gossip_debug(GOSSIP_INODE_DEBUG, 1156 "Initializing ACL's for inode %pU\n", 1157 get_khandle_from_ino(inode)); 1158 if (mode != inode->i_mode) { 1159 struct iattr iattr = { 1160 .ia_mode = mode, 1161 .ia_valid = ATTR_MODE, 1162 }; 1163 inode->i_mode = mode; 1164 __orangefs_setattr(inode, &iattr); 1165 __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); 1166 } 1167 posix_acl_release(acl); 1168 posix_acl_release(default_acl); 1169 return inode; 1170 1171 out_iput: 1172 iput(inode); 1173 posix_acl_release(acl); 1174 posix_acl_release(default_acl); 1175 return ERR_PTR(error); 1176 } 1177