1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * (C) 2001 Clemson University and The University of Chicago 4 * 5 * See COPYING in top-level directory. 6 */ 7 8 /* 9 * Linux VFS file operations. 10 */ 11 12 #include "protocol.h" 13 #include "orangefs-kernel.h" 14 #include "orangefs-bufmap.h" 15 #include <linux/fs.h> 16 #include <linux/pagemap.h> 17 18 static int flush_racache(struct inode *inode) 19 { 20 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 21 struct orangefs_kernel_op_s *new_op; 22 int ret; 23 24 gossip_debug(GOSSIP_UTILS_DEBUG, 25 "%s: %pU: Handle is %pU | fs_id %d\n", __func__, 26 get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, 27 orangefs_inode->refn.fs_id); 28 29 new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); 30 if (!new_op) 31 return -ENOMEM; 32 new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; 33 34 ret = service_operation(new_op, "orangefs_flush_racache", 35 get_interruptible_flag(inode)); 36 37 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", 38 __func__, ret); 39 40 op_release(new_op); 41 return ret; 42 } 43 44 /* 45 * Post and wait for the I/O upcall to finish 46 */ 47 static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, 48 loff_t *offset, struct iov_iter *iter, 49 size_t total_size, loff_t readahead_size) 50 { 51 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 52 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 53 struct orangefs_kernel_op_s *new_op = NULL; 54 int buffer_index = -1; 55 ssize_t ret; 56 57 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); 58 if (!new_op) 59 return -ENOMEM; 60 61 /* synchronous I/O */ 62 new_op->upcall.req.io.readahead_size = readahead_size; 63 new_op->upcall.req.io.io_type = type; 64 new_op->upcall.req.io.refn = orangefs_inode->refn; 65 66 populate_shared_memory: 67 /* get a shared buffer index */ 68 buffer_index = orangefs_bufmap_get(); 69 if (buffer_index < 0) { 70 ret = buffer_index; 71 gossip_debug(GOSSIP_FILE_DEBUG, 72 "%s: orangefs_bufmap_get failure (%zd)\n", 73 __func__, ret); 74 goto out; 75 } 76 gossip_debug(GOSSIP_FILE_DEBUG, 77 "%s(%pU): GET op %p -> buffer_index %d\n", 78 __func__, 79 handle, 80 new_op, 81 buffer_index); 82 83 new_op->uses_shared_memory = 1; 84 new_op->upcall.req.io.buf_index = buffer_index; 85 new_op->upcall.req.io.count = total_size; 86 new_op->upcall.req.io.offset = *offset; 87 88 gossip_debug(GOSSIP_FILE_DEBUG, 89 "%s(%pU): offset: %llu total_size: %zd\n", 90 __func__, 91 handle, 92 llu(*offset), 93 total_size); 94 /* 95 * Stage 1: copy the buffers into client-core's address space 96 */ 97 if (type == ORANGEFS_IO_WRITE && total_size) { 98 ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index, 99 total_size); 100 if (ret < 0) { 101 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", 102 __func__, (long)ret); 103 goto out; 104 } 105 } 106 107 gossip_debug(GOSSIP_FILE_DEBUG, 108 "%s(%pU): Calling post_io_request with tag (%llu)\n", 109 __func__, 110 handle, 111 llu(new_op->tag)); 112 113 /* Stage 2: Service the I/O operation */ 114 ret = service_operation(new_op, 115 type == ORANGEFS_IO_WRITE ? 116 "file_write" : 117 "file_read", 118 get_interruptible_flag(inode)); 119 120 /* 121 * If service_operation() returns -EAGAIN #and# the operation was 122 * purged from orangefs_request_list or htable_ops_in_progress, then 123 * we know that the client was restarted, causing the shared memory 124 * area to be wiped clean. To restart a write operation in this 125 * case, we must re-copy the data from the user's iovec to a NEW 126 * shared memory location. To restart a read operation, we must get 127 * a new shared memory location. 128 */ 129 if (ret == -EAGAIN && op_state_purged(new_op)) { 130 orangefs_bufmap_put(buffer_index); 131 buffer_index = -1; 132 if (type == ORANGEFS_IO_WRITE) 133 iov_iter_revert(iter, total_size); 134 gossip_debug(GOSSIP_FILE_DEBUG, 135 "%s:going to repopulate_shared_memory.\n", 136 __func__); 137 goto populate_shared_memory; 138 } 139 140 if (ret < 0) { 141 if (ret == -EINTR) { 142 /* 143 * We can't return EINTR if any data was written, 144 * it's not POSIX. It is minimally acceptable 145 * to give a partial write, the way NFS does. 146 * 147 * It would be optimal to return all or nothing, 148 * but if a userspace write is bigger than 149 * an IO buffer, and the interrupt occurs 150 * between buffer writes, that would not be 151 * possible. 152 */ 153 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) { 154 /* 155 * If the op was waiting when the interrupt 156 * occurred, then the client-core did not 157 * trigger the write. 158 */ 159 case OP_VFS_STATE_WAITING: 160 if (*offset == 0) 161 ret = -EINTR; 162 else 163 ret = 0; 164 break; 165 /* 166 * If the op was in progress when the interrupt 167 * occurred, then the client-core was able to 168 * trigger the write. 169 */ 170 case OP_VFS_STATE_INPROGR: 171 ret = total_size; 172 break; 173 default: 174 gossip_err("%s: unexpected op state :%d:.\n", 175 __func__, 176 new_op->op_state); 177 ret = 0; 178 break; 179 } 180 gossip_debug(GOSSIP_FILE_DEBUG, 181 "%s: got EINTR, state:%d: %p\n", 182 __func__, 183 new_op->op_state, 184 new_op); 185 } else { 186 gossip_err("%s: error in %s handle %pU, returning %zd\n", 187 __func__, 188 type == ORANGEFS_IO_READ ? 189 "read from" : "write to", 190 handle, ret); 191 } 192 if (orangefs_cancel_op_in_progress(new_op)) 193 return ret; 194 195 goto out; 196 } 197 198 /* 199 * Stage 3: Post copy buffers from client-core's address space 200 */ 201 if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) { 202 /* 203 * NOTE: the iovector can either contain addresses which 204 * can futher be kernel-space or user-space addresses. 205 * or it can pointers to struct page's 206 */ 207 ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, 208 new_op->downcall.resp.io.amt_complete); 209 if (ret < 0) { 210 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", 211 __func__, (long)ret); 212 goto out; 213 } 214 } 215 gossip_debug(GOSSIP_FILE_DEBUG, 216 "%s(%pU): Amount %s, returned by the sys-io call:%d\n", 217 __func__, 218 handle, 219 type == ORANGEFS_IO_READ ? "read" : "written", 220 (int)new_op->downcall.resp.io.amt_complete); 221 222 ret = new_op->downcall.resp.io.amt_complete; 223 224 out: 225 if (buffer_index >= 0) { 226 orangefs_bufmap_put(buffer_index); 227 gossip_debug(GOSSIP_FILE_DEBUG, 228 "%s(%pU): PUT buffer_index %d\n", 229 __func__, handle, buffer_index); 230 buffer_index = -1; 231 } 232 op_release(new_op); 233 return ret; 234 } 235 236 /* 237 * Common entry point for read/write/readv/writev 238 * This function will dispatch it to either the direct I/O 239 * or buffered I/O path depending on the mount options and/or 240 * augmented/extended metadata attached to the file. 241 * Note: File extended attributes override any mount options. 242 */ 243 static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file, 244 loff_t *offset, struct iov_iter *iter) 245 { 246 struct inode *inode = file->f_mapping->host; 247 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 248 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 249 size_t count = iov_iter_count(iter); 250 ssize_t total_count = 0; 251 ssize_t ret = -EINVAL; 252 253 gossip_debug(GOSSIP_FILE_DEBUG, 254 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", 255 __func__, 256 handle, 257 (int)count); 258 259 if (type == ORANGEFS_IO_WRITE) { 260 gossip_debug(GOSSIP_FILE_DEBUG, 261 "%s(%pU): proceeding with offset : %llu, " 262 "size %d\n", 263 __func__, 264 handle, 265 llu(*offset), 266 (int)count); 267 } 268 269 if (count == 0) { 270 ret = 0; 271 goto out; 272 } 273 274 while (iov_iter_count(iter)) { 275 size_t each_count = iov_iter_count(iter); 276 size_t amt_complete; 277 278 /* how much to transfer in this loop iteration */ 279 if (each_count > orangefs_bufmap_size_query()) 280 each_count = orangefs_bufmap_size_query(); 281 282 gossip_debug(GOSSIP_FILE_DEBUG, 283 "%s(%pU): size of each_count(%d)\n", 284 __func__, 285 handle, 286 (int)each_count); 287 gossip_debug(GOSSIP_FILE_DEBUG, 288 "%s(%pU): BEFORE wait_for_io: offset is %d\n", 289 __func__, 290 handle, 291 (int)*offset); 292 293 ret = wait_for_direct_io(type, inode, offset, iter, 294 each_count, 0); 295 gossip_debug(GOSSIP_FILE_DEBUG, 296 "%s(%pU): return from wait_for_io:%d\n", 297 __func__, 298 handle, 299 (int)ret); 300 301 if (ret < 0) 302 goto out; 303 304 *offset += ret; 305 total_count += ret; 306 amt_complete = ret; 307 308 gossip_debug(GOSSIP_FILE_DEBUG, 309 "%s(%pU): AFTER wait_for_io: offset is %d\n", 310 __func__, 311 handle, 312 (int)*offset); 313 314 /* 315 * if we got a short I/O operations, 316 * fall out and return what we got so far 317 */ 318 if (amt_complete < each_count) 319 break; 320 } /*end while */ 321 322 out: 323 if (total_count > 0) 324 ret = total_count; 325 if (ret > 0) { 326 if (type == ORANGEFS_IO_READ) { 327 file_accessed(file); 328 } else { 329 file_update_time(file); 330 /* 331 * Must invalidate to ensure write loop doesn't 332 * prevent kernel from reading updated 333 * attribute. Size probably changed because of 334 * the write, and other clients could update 335 * any other attribute. 336 */ 337 orangefs_inode->getattr_time = jiffies - 1; 338 } 339 } 340 341 gossip_debug(GOSSIP_FILE_DEBUG, 342 "%s(%pU): Value(%d) returned.\n", 343 __func__, 344 handle, 345 (int)ret); 346 347 return ret; 348 } 349 350 /* 351 * Read data from a specified offset in a file (referenced by inode). 352 * Data may be placed either in a user or kernel buffer. 353 */ 354 ssize_t orangefs_inode_read(struct inode *inode, 355 struct iov_iter *iter, 356 loff_t *offset, 357 loff_t readahead_size) 358 { 359 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 360 size_t count = iov_iter_count(iter); 361 size_t bufmap_size; 362 ssize_t ret = -EINVAL; 363 364 orangefs_stats.reads++; 365 366 bufmap_size = orangefs_bufmap_size_query(); 367 if (count > bufmap_size) { 368 gossip_debug(GOSSIP_FILE_DEBUG, 369 "%s: count is too large (%zd/%zd)!\n", 370 __func__, count, bufmap_size); 371 return -EINVAL; 372 } 373 374 gossip_debug(GOSSIP_FILE_DEBUG, 375 "%s(%pU) %zd@%llu\n", 376 __func__, 377 &orangefs_inode->refn.khandle, 378 count, 379 llu(*offset)); 380 381 ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter, 382 count, readahead_size); 383 if (ret > 0) 384 *offset += ret; 385 386 gossip_debug(GOSSIP_FILE_DEBUG, 387 "%s(%pU): Value(%zd) returned.\n", 388 __func__, 389 &orangefs_inode->refn.khandle, 390 ret); 391 392 return ret; 393 } 394 395 static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 396 { 397 struct file *file = iocb->ki_filp; 398 loff_t pos = iocb->ki_pos; 399 ssize_t rc = 0; 400 401 BUG_ON(iocb->private); 402 403 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); 404 405 orangefs_stats.reads++; 406 407 rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); 408 iocb->ki_pos = pos; 409 410 return rc; 411 } 412 413 static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) 414 { 415 struct file *file = iocb->ki_filp; 416 loff_t pos; 417 ssize_t rc; 418 419 BUG_ON(iocb->private); 420 421 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); 422 423 inode_lock(file->f_mapping->host); 424 425 /* Make sure generic_write_checks sees an up to date inode size. */ 426 if (file->f_flags & O_APPEND) { 427 rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, 428 STATX_SIZE); 429 if (rc == -ESTALE) 430 rc = -EIO; 431 if (rc) { 432 gossip_err("%s: orangefs_inode_getattr failed, " 433 "rc:%zd:.\n", __func__, rc); 434 goto out; 435 } 436 } 437 438 rc = generic_write_checks(iocb, iter); 439 440 if (rc <= 0) { 441 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n", 442 __func__, rc); 443 goto out; 444 } 445 446 /* 447 * if we are appending, generic_write_checks would have updated 448 * pos to the end of the file, so we will wait till now to set 449 * pos... 450 */ 451 pos = iocb->ki_pos; 452 453 rc = do_readv_writev(ORANGEFS_IO_WRITE, 454 file, 455 &pos, 456 iter); 457 if (rc < 0) { 458 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n", 459 __func__, rc); 460 goto out; 461 } 462 463 iocb->ki_pos = pos; 464 orangefs_stats.writes++; 465 466 out: 467 468 inode_unlock(file->f_mapping->host); 469 return rc; 470 } 471 472 /* 473 * Perform a miscellaneous operation on a file. 474 */ 475 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 476 { 477 int ret = -ENOTTY; 478 __u64 val = 0; 479 unsigned long uval; 480 481 gossip_debug(GOSSIP_FILE_DEBUG, 482 "orangefs_ioctl: called with cmd %d\n", 483 cmd); 484 485 /* 486 * we understand some general ioctls on files, such as the immutable 487 * and append flags 488 */ 489 if (cmd == FS_IOC_GETFLAGS) { 490 val = 0; 491 ret = orangefs_inode_getxattr(file_inode(file), 492 "user.pvfs2.meta_hint", 493 &val, sizeof(val)); 494 if (ret < 0 && ret != -ENODATA) 495 return ret; 496 else if (ret == -ENODATA) 497 val = 0; 498 uval = val; 499 gossip_debug(GOSSIP_FILE_DEBUG, 500 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n", 501 (unsigned long long)uval); 502 return put_user(uval, (int __user *)arg); 503 } else if (cmd == FS_IOC_SETFLAGS) { 504 ret = 0; 505 if (get_user(uval, (int __user *)arg)) 506 return -EFAULT; 507 /* 508 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode 509 * is turned on for a file. The user is not allowed to turn 510 * on this bit, but the bit is present if the user first gets 511 * the flags and then updates the flags with some new 512 * settings. So, we ignore it in the following edit. bligon. 513 */ 514 if ((uval & ~ORANGEFS_MIRROR_FL) & 515 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) { 516 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n"); 517 return -EINVAL; 518 } 519 val = uval; 520 gossip_debug(GOSSIP_FILE_DEBUG, 521 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", 522 (unsigned long long)val); 523 ret = orangefs_inode_setxattr(file_inode(file), 524 "user.pvfs2.meta_hint", 525 &val, sizeof(val), 0); 526 } 527 528 return ret; 529 } 530 531 static vm_fault_t orangefs_fault(struct vm_fault *vmf) 532 { 533 struct file *file = vmf->vma->vm_file; 534 int ret; 535 536 ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, 537 STATX_SIZE); 538 if (ret == -ESTALE) 539 ret = -EIO; 540 if (ret) { 541 gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n", 542 __func__, ret); 543 return VM_FAULT_SIGBUS; 544 } 545 return filemap_fault(vmf); 546 } 547 548 static const struct vm_operations_struct orangefs_file_vm_ops = { 549 .fault = orangefs_fault, 550 .map_pages = filemap_map_pages, 551 .page_mkwrite = filemap_page_mkwrite, 552 }; 553 554 /* 555 * Memory map a region of a file. 556 */ 557 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) 558 { 559 gossip_debug(GOSSIP_FILE_DEBUG, 560 "orangefs_file_mmap: called on %s\n", 561 (file ? 562 (char *)file->f_path.dentry->d_name.name : 563 (char *)"Unknown")); 564 565 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 566 return -EINVAL; 567 568 /* set the sequential readahead hint */ 569 vma->vm_flags |= VM_SEQ_READ; 570 vma->vm_flags &= ~VM_RAND_READ; 571 572 file_accessed(file); 573 vma->vm_ops = &orangefs_file_vm_ops; 574 return 0; 575 } 576 577 #define mapping_nrpages(idata) ((idata)->nrpages) 578 579 /* 580 * Called to notify the module that there are no more references to 581 * this file (i.e. no processes have it open). 582 * 583 * \note Not called when each file is closed. 584 */ 585 static int orangefs_file_release(struct inode *inode, struct file *file) 586 { 587 gossip_debug(GOSSIP_FILE_DEBUG, 588 "orangefs_file_release: called on %pD\n", 589 file); 590 591 /* 592 * remove all associated inode pages from the page cache and 593 * readahead cache (if any); this forces an expensive refresh of 594 * data for the next caller of mmap (or 'get_block' accesses) 595 */ 596 if (file_inode(file) && 597 file_inode(file)->i_mapping && 598 mapping_nrpages(&file_inode(file)->i_data)) { 599 if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { 600 gossip_debug(GOSSIP_INODE_DEBUG, 601 "calling flush_racache on %pU\n", 602 get_khandle_from_ino(inode)); 603 flush_racache(inode); 604 gossip_debug(GOSSIP_INODE_DEBUG, 605 "flush_racache finished\n"); 606 } 607 truncate_inode_pages(file_inode(file)->i_mapping, 608 0); 609 } 610 return 0; 611 } 612 613 /* 614 * Push all data for a specific file onto permanent storage. 615 */ 616 static int orangefs_fsync(struct file *file, 617 loff_t start, 618 loff_t end, 619 int datasync) 620 { 621 int ret; 622 struct orangefs_inode_s *orangefs_inode = 623 ORANGEFS_I(file_inode(file)); 624 struct orangefs_kernel_op_s *new_op = NULL; 625 626 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); 627 if (!new_op) 628 return -ENOMEM; 629 new_op->upcall.req.fsync.refn = orangefs_inode->refn; 630 631 ret = service_operation(new_op, 632 "orangefs_fsync", 633 get_interruptible_flag(file_inode(file))); 634 635 gossip_debug(GOSSIP_FILE_DEBUG, 636 "orangefs_fsync got return value of %d\n", 637 ret); 638 639 op_release(new_op); 640 return ret; 641 } 642 643 /* 644 * Change the file pointer position for an instance of an open file. 645 * 646 * \note If .llseek is overriden, we must acquire lock as described in 647 * Documentation/filesystems/Locking. 648 * 649 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would 650 * require much changes to the FS 651 */ 652 static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) 653 { 654 int ret = -EINVAL; 655 struct inode *inode = file_inode(file); 656 657 if (origin == SEEK_END) { 658 /* 659 * revalidate the inode's file size. 660 * NOTE: We are only interested in file size here, 661 * so we set mask accordingly. 662 */ 663 ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, 664 STATX_SIZE); 665 if (ret == -ESTALE) 666 ret = -EIO; 667 if (ret) { 668 gossip_debug(GOSSIP_FILE_DEBUG, 669 "%s:%s:%d calling make bad inode\n", 670 __FILE__, 671 __func__, 672 __LINE__); 673 return ret; 674 } 675 } 676 677 gossip_debug(GOSSIP_FILE_DEBUG, 678 "orangefs_file_llseek: offset is %ld | origin is %d" 679 " | inode size is %lu\n", 680 (long)offset, 681 origin, 682 (unsigned long)i_size_read(inode)); 683 684 return generic_file_llseek(file, offset, origin); 685 } 686 687 /* 688 * Support local locks (locks that only this kernel knows about) 689 * if Orangefs was mounted -o local_lock. 690 */ 691 static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) 692 { 693 int rc = -EINVAL; 694 695 if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { 696 if (cmd == F_GETLK) { 697 rc = 0; 698 posix_test_lock(filp, fl); 699 } else { 700 rc = posix_lock_file(filp, fl, NULL); 701 } 702 } 703 704 return rc; 705 } 706 707 /** ORANGEFS implementation of VFS file operations */ 708 const struct file_operations orangefs_file_operations = { 709 .llseek = orangefs_file_llseek, 710 .read_iter = orangefs_file_read_iter, 711 .write_iter = orangefs_file_write_iter, 712 .lock = orangefs_lock, 713 .unlocked_ioctl = orangefs_ioctl, 714 .mmap = orangefs_file_mmap, 715 .open = generic_file_open, 716 .release = orangefs_file_release, 717 .fsync = orangefs_fsync, 718 }; 719