// SPDX-License-Identifier: GPL-2.0
/*
 * (C) 2001 Clemson University and The University of Chicago
 *
 * See COPYING in top-level directory.
 */

/*
 * Linux VFS file operations.
 */

#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
#include <linux/fs.h>
#include <linux/pagemap.h>

static int flush_racache(struct inode *inode)
{
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	struct orangefs_kernel_op_s *new_op;
	int ret;

	gossip_debug(GOSSIP_UTILS_DEBUG,
		     "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
		     get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
		     orangefs_inode->refn.fs_id);

	new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
	if (!new_op)
		return -ENOMEM;
	new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;

	ret = service_operation(new_op, "orangefs_flush_racache",
				get_interruptible_flag(inode));

	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
		     __func__, ret);

	op_release(new_op);
	return ret;
}

/*
 * Post and wait for the I/O upcall to finish
 */
static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
				  loff_t *offset, struct iov_iter *iter,
				  size_t total_size, loff_t readahead_size)
{
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
	struct orangefs_kernel_op_s *new_op = NULL;
	int buffer_index = -1;
	ssize_t ret;

	new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
	if (!new_op)
		return -ENOMEM;

	/* synchronous I/O */
	new_op->upcall.req.io.readahead_size = readahead_size;
	new_op->upcall.req.io.io_type = type;
	new_op->upcall.req.io.refn = orangefs_inode->refn;

populate_shared_memory:
	/* get a shared buffer index */
	buffer_index = orangefs_bufmap_get();
	if (buffer_index < 0) {
		ret = buffer_index;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s: orangefs_bufmap_get failure (%zd)\n",
			     __func__, ret);
		goto out;
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): GET op %p -> buffer_index %d\n",
		     __func__,
		     handle,
		     new_op,
		     buffer_index);

	new_op->uses_shared_memory = 1;
	new_op->upcall.req.io.buf_index = buffer_index;
	new_op->upcall.req.io.count = total_size;
	new_op->upcall.req.io.offset = *offset;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): offset: %llu total_size: %zd\n",
		     __func__,
		     handle,
		     llu(*offset),
		     total_size);
	/*
	 * Stage 1: copy the buffers into client-core's address space
	 */
	if (type == ORANGEFS_IO_WRITE && total_size) {
		ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index,
						      total_size);
		if (ret < 0) {
			gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
				   __func__, (long)ret);
			goto out;
		}
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Calling post_io_request with tag (%llu)\n",
		     __func__,
		     handle,
		     llu(new_op->tag));

	/* Stage 2: Service the I/O operation */
	ret = service_operation(new_op,
				type == ORANGEFS_IO_WRITE ?
					"file_write" :
					"file_read",
				get_interruptible_flag(inode));

	/*
	 * If service_operation() returns -EAGAIN #and# the operation was
	 * purged from orangefs_request_list or htable_ops_in_progress, then
	 * we know that the client was restarted, causing the shared memory
	 * area to be wiped clean.
	 * To restart a write operation in this case, we must re-copy the
	 * data from the user's iovec to a NEW shared memory location.
	 * To restart a read operation, we must get a new shared memory
	 * location.
	 */
	if (ret == -EAGAIN && op_state_purged(new_op)) {
		orangefs_bufmap_put(buffer_index);
		buffer_index = -1;
		if (type == ORANGEFS_IO_WRITE)
			iov_iter_revert(iter, total_size);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s:going to repopulate_shared_memory.\n",
			     __func__);
		goto populate_shared_memory;
	}

	if (ret < 0) {
		if (ret == -EINTR) {
			/*
			 * We can't return EINTR if any data was written,
			 * it's not POSIX. It is minimally acceptable
			 * to give a partial write, the way NFS does.
			 *
			 * It would be optimal to return all or nothing,
			 * but if a userspace write is bigger than
			 * an IO buffer, and the interrupt occurs
			 * between buffer writes, that would not be
			 * possible.
			 */
			switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
			/*
			 * If the op was waiting when the interrupt
			 * occurred, then the client-core did not
			 * trigger the write.
			 */
			case OP_VFS_STATE_WAITING:
				if (*offset == 0)
					ret = -EINTR;
				else
					ret = 0;
				break;
			/*
			 * If the op was in progress when the interrupt
			 * occurred, then the client-core was able to
			 * trigger the write.
			 */
			case OP_VFS_STATE_INPROGR:
				ret = total_size;
				break;
			default:
				gossip_err("%s: unexpected op state :%d:.\n",
					   __func__,
					   new_op->op_state);
				ret = 0;
				break;
			}
			gossip_debug(GOSSIP_FILE_DEBUG,
				     "%s: got EINTR, state:%d: %p\n",
				     __func__,
				     new_op->op_state,
				     new_op);
		} else {
			gossip_err("%s: error in %s handle %pU, returning %zd\n",
				   __func__,
				   type == ORANGEFS_IO_READ ?
					"read from" : "write to",
				   handle, ret);
		}
		if (orangefs_cancel_op_in_progress(new_op))
			return ret;

		goto out;
	}

	/*
	 * Stage 3: Copy the buffers back out of client-core's address space
	 */
	if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) {
		/*
		 * NOTE: the iovector can contain either addresses (which may
		 * further be kernel-space or user-space addresses) or
		 * pointers to struct pages.
		 */
		ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
						    new_op->downcall.resp.io.amt_complete);
		if (ret < 0) {
			gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
				   __func__, (long)ret);
			goto out;
		}
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
		     __func__,
		     handle,
		     type == ORANGEFS_IO_READ ? "read" : "written",
		     (int)new_op->downcall.resp.io.amt_complete);

	ret = new_op->downcall.resp.io.amt_complete;

out:
	if (buffer_index >= 0) {
		orangefs_bufmap_put(buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): PUT buffer_index %d\n",
			     __func__, handle, buffer_index);
		buffer_index = -1;
	}
	op_release(new_op);
	return ret;
}

/*
 * Common entry point for read/write/readv/writev
 * This function will dispatch the request to either the direct I/O
 * or buffered I/O path depending on the mount options and/or the
 * augmented/extended metadata attached to the file.
 * Note: File extended attributes override any mount options.
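 *
 * The transfer is split into chunks no larger than a single shared
 * buffer (orangefs_bufmap_size_query() bytes); each chunk is handed to
 * wait_for_direct_io(), and a short transfer ends the loop early.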
 */
static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
			       loff_t *offset, struct iov_iter *iter)
{
	struct inode *inode = file->f_mapping->host;
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
	size_t count = iov_iter_count(iter);
	ssize_t total_count = 0;
	ssize_t ret = -EINVAL;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
		     __func__,
		     handle,
		     (int)count);

	if (type == ORANGEFS_IO_WRITE) {
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): proceeding with offset : %llu, "
			     "size %d\n",
			     __func__,
			     handle,
			     llu(*offset),
			     (int)count);
	}

	if (count == 0) {
		ret = 0;
		goto out;
	}

	while (iov_iter_count(iter)) {
		size_t each_count = iov_iter_count(iter);
		size_t amt_complete;

		/* how much to transfer in this loop iteration */
		if (each_count > orangefs_bufmap_size_query())
			each_count = orangefs_bufmap_size_query();

		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): size of each_count(%d)\n",
			     __func__,
			     handle,
			     (int)each_count);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): BEFORE wait_for_io: offset is %d\n",
			     __func__,
			     handle,
			     (int)*offset);

		ret = wait_for_direct_io(type, inode, offset, iter,
					 each_count, 0);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): return from wait_for_io:%d\n",
			     __func__,
			     handle,
			     (int)ret);

		if (ret < 0)
			goto out;

		*offset += ret;
		total_count += ret;
		amt_complete = ret;

		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): AFTER wait_for_io: offset is %d\n",
			     __func__,
			     handle,
			     (int)*offset);

		/*
		 * if we got a short I/O operation,
		 * fall out and return what we got so far
		 */
		if (amt_complete < each_count)
			break;
	} /* end while */

out:
	if (total_count > 0)
		ret = total_count;
	if (ret > 0) {
		if (type == ORANGEFS_IO_READ) {
			file_accessed(file);
		} else {
			file_update_time(file);
			/*
			 * Must invalidate to ensure write loop doesn't
			 * prevent kernel from reading updated
			 * attribute. Size probably changed because of
			 * the write, and other clients could update
			 * any other attribute.
			 */
			orangefs_inode->getattr_time = jiffies - 1;
		}
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Value(%d) returned.\n",
		     __func__,
		     handle,
		     (int)ret);

	return ret;
}

/*
 * Read data from a specified offset in a file (referenced by inode).
 * Data may be placed either in a user or kernel buffer.
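 * The count must fit within a single shared buffer
 * (orangefs_bufmap_size_query()); larger requests fail with -EINVAL.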
 */
ssize_t orangefs_inode_read(struct inode *inode,
			    struct iov_iter *iter,
			    loff_t *offset,
			    loff_t readahead_size)
{
	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
	size_t count = iov_iter_count(iter);
	size_t bufmap_size;
	ssize_t ret = -EINVAL;

	orangefs_stats.reads++;

	bufmap_size = orangefs_bufmap_size_query();
	if (count > bufmap_size) {
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s: count is too large (%zd/%zd)!\n",
			     __func__, count, bufmap_size);
		return -EINVAL;
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU) %zd@%llu\n",
		     __func__,
		     &orangefs_inode->refn.khandle,
		     count,
		     llu(*offset));

	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
				 count, readahead_size);
	if (ret > 0)
		*offset += ret;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Value(%zd) returned.\n",
		     __func__,
		     &orangefs_inode->refn.khandle,
		     ret);

	return ret;
}

static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	ssize_t rc = 0;

	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");

	orangefs_stats.reads++;

	rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
	iocb->ki_pos = pos;

	return rc;
}

static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	loff_t pos;
	ssize_t rc;

	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");

	inode_lock(file->f_mapping->host);

	/* Make sure generic_write_checks sees an up-to-date inode size. */
	if (file->f_flags & O_APPEND) {
		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
					    STATX_SIZE);
		if (rc == -ESTALE)
			rc = -EIO;
		if (rc) {
			gossip_err("%s: orangefs_inode_getattr failed, "
				   "rc:%zd:.\n", __func__, rc);
			goto out;
		}
	}

	rc = generic_write_checks(iocb, iter);

	if (rc <= 0) {
		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
			   __func__, rc);
		goto out;
	}

	/*
	 * If we are appending, generic_write_checks would have updated
	 * pos to the end of the file, so we wait until now to set pos.
	 */
	pos = iocb->ki_pos;

	rc = do_readv_writev(ORANGEFS_IO_WRITE,
			     file,
			     &pos,
			     iter);
	if (rc < 0) {
		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
			   __func__, rc);
		goto out;
	}

	iocb->ki_pos = pos;
	orangefs_stats.writes++;

out:
	inode_unlock(file->f_mapping->host);
	return rc;
}

/*
 * Perform a miscellaneous operation on a file.
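 * Only FS_IOC_GETFLAGS and FS_IOC_SETFLAGS are handled here; the flag
 * bits are stored in the "user.pvfs2.meta_hint" extended attribute.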
 */
static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	int ret = -ENOTTY;
	__u64 val = 0;
	unsigned long uval;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_ioctl: called with cmd %d\n",
		     cmd);

	/*
	 * we understand some general ioctls on files, such as the immutable
	 * and append flags
	 */
	if (cmd == FS_IOC_GETFLAGS) {
		val = 0;
		ret = orangefs_inode_getxattr(file_inode(file),
					      "user.pvfs2.meta_hint",
					      &val, sizeof(val));
		if (ret < 0 && ret != -ENODATA)
			return ret;
		else if (ret == -ENODATA)
			val = 0;
		uval = val;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
			     (unsigned long long)uval);
		return put_user(uval, (int __user *)arg);
	} else if (cmd == FS_IOC_SETFLAGS) {
		ret = 0;
		if (get_user(uval, (int __user *)arg))
			return -EFAULT;
		/*
		 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
		 * is turned on for a file. The user is not allowed to turn
		 * on this bit, but the bit is present if the user first gets
		 * the flags and then updates the flags with some new
		 * settings. So, we ignore it in the following edit. bligon.
		 */
		if ((uval & ~ORANGEFS_MIRROR_FL) &
		    (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
			gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
			return -EINVAL;
		}
		val = uval;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
			     (unsigned long long)val);
		ret = orangefs_inode_setxattr(file_inode(file),
					      "user.pvfs2.meta_hint",
					      &val, sizeof(val), 0);
	}

	return ret;
}

static vm_fault_t orangefs_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	int ret;

	ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
				     STATX_SIZE);
	if (ret == -ESTALE)
		ret = -EIO;
	if (ret) {
		gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
			   __func__, ret);
		return VM_FAULT_SIGBUS;
	}
	return filemap_fault(vmf);
}

static const struct vm_operations_struct orangefs_file_vm_ops = {
	.fault = orangefs_fault,
	.map_pages = filemap_map_pages,
	.page_mkwrite = filemap_page_mkwrite,
};

/*
 * Memory map a region of a file.
 */
static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_file_mmap: called on %s\n",
		     (file ?
		      (char *)file->f_path.dentry->d_name.name :
		      (char *)"Unknown"));

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;

	/* set the sequential readahead hint */
	vma->vm_flags |= VM_SEQ_READ;
	vma->vm_flags &= ~VM_RAND_READ;

	file_accessed(file);
	vma->vm_ops = &orangefs_file_vm_ops;
	return 0;
}

#define mapping_nrpages(idata) ((idata)->nrpages)

/*
 * Called to notify the module that there are no more references to
 * this file (i.e. no processes have it open).
 *
 * \note Not called when each file is closed.
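 *
 * Cached pages for the inode are truncated here, and the client-core's
 * readahead cache is flushed when the ORANGEFS_FEATURE_READAHEAD
 * feature bit is set.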
 */
static int orangefs_file_release(struct inode *inode, struct file *file)
{
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_file_release: called on %pD\n",
		     file);

	/*
	 * remove all associated inode pages from the page cache and
	 * readahead cache (if any); this forces an expensive refresh of
	 * data for the next caller of mmap (or 'get_block' accesses)
	 */
	if (file_inode(file) &&
	    file_inode(file)->i_mapping &&
	    mapping_nrpages(&file_inode(file)->i_data)) {
		if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
			gossip_debug(GOSSIP_INODE_DEBUG,
				     "calling flush_racache on %pU\n",
				     get_khandle_from_ino(inode));
			flush_racache(inode);
			gossip_debug(GOSSIP_INODE_DEBUG,
				     "flush_racache finished\n");
		}
		truncate_inode_pages(file_inode(file)->i_mapping,
				     0);
	}
	return 0;
}

/*
 * Push all data for a specific file onto permanent storage.
 */
static int orangefs_fsync(struct file *file,
			  loff_t start,
			  loff_t end,
			  int datasync)
{
	int ret;
	struct orangefs_inode_s *orangefs_inode =
		ORANGEFS_I(file_inode(file));
	struct orangefs_kernel_op_s *new_op = NULL;

	new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
	if (!new_op)
		return -ENOMEM;
	new_op->upcall.req.fsync.refn = orangefs_inode->refn;

	ret = service_operation(new_op,
				"orangefs_fsync",
				get_interruptible_flag(file_inode(file)));

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_fsync got return value of %d\n",
		     ret);

	op_release(new_op);
	return ret;
}

/*
 * Change the file pointer position for an instance of an open file.
 *
 * \note If .llseek is overridden, we must acquire lock as described in
 *       Documentation/filesystems/Locking.
 *
 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
 * require many changes to the FS
 */
static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
{
	int ret = -EINVAL;
	struct inode *inode = file_inode(file);

	if (origin == SEEK_END) {
		/*
		 * revalidate the inode's file size.
		 * NOTE: We are only interested in file size here,
		 * so we set mask accordingly.
		 */
		ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
					     STATX_SIZE);
		if (ret == -ESTALE)
			ret = -EIO;
		if (ret) {
			gossip_debug(GOSSIP_FILE_DEBUG,
				     "%s:%s:%d calling make bad inode\n",
				     __FILE__,
				     __func__,
				     __LINE__);
			return ret;
		}
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_file_llseek: offset is %ld | origin is %d"
		     " | inode size is %lu\n",
		     (long)offset,
		     origin,
		     (unsigned long)i_size_read(inode));

	return generic_file_llseek(file, offset, origin);
}

/*
 * Support local locks (locks that only this kernel knows about)
 * if Orangefs was mounted -o local_lock.
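 * Without that mount option every lock request fails with -EINVAL, and
 * locks granted here are never visible to other clients.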
 */
static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
	int rc = -EINVAL;

	if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
		if (cmd == F_GETLK) {
			rc = 0;
			posix_test_lock(filp, fl);
		} else {
			rc = posix_lock_file(filp, fl, NULL);
		}
	}

	return rc;
}

/** ORANGEFS implementation of VFS file operations */
const struct file_operations orangefs_file_operations = {
	.llseek = orangefs_file_llseek,
	.read_iter = orangefs_file_read_iter,
	.write_iter = orangefs_file_write_iter,
	.lock = orangefs_lock,
	.unlocked_ioctl = orangefs_ioctl,
	.mmap = orangefs_file_mmap,
	.open = generic_file_open,
	.release = orangefs_file_release,
	.fsync = orangefs_fsync,
};