1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * (C) 2001 Clemson University and The University of Chicago 4 * Copyright 2018 Omnibond Systems, L.L.C. 5 * 6 * See COPYING in top-level directory. 7 */ 8 9 /* 10 * Linux VFS file operations. 11 */ 12 13 #include "protocol.h" 14 #include "orangefs-kernel.h" 15 #include "orangefs-bufmap.h" 16 #include <linux/fs.h> 17 #include <linux/pagemap.h> 18 19 static int flush_racache(struct inode *inode) 20 { 21 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 22 struct orangefs_kernel_op_s *new_op; 23 int ret; 24 25 gossip_debug(GOSSIP_UTILS_DEBUG, 26 "%s: %pU: Handle is %pU | fs_id %d\n", __func__, 27 get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, 28 orangefs_inode->refn.fs_id); 29 30 new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); 31 if (!new_op) 32 return -ENOMEM; 33 new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; 34 35 ret = service_operation(new_op, "orangefs_flush_racache", 36 get_interruptible_flag(inode)); 37 38 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", 39 __func__, ret); 40 41 op_release(new_op); 42 return ret; 43 } 44 45 /* 46 * Post and wait for the I/O upcall to finish 47 */ 48 ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, 49 loff_t *offset, struct iov_iter *iter, size_t total_size, 50 loff_t readahead_size, struct orangefs_write_range *wr, int *index_return) 51 { 52 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 53 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 54 struct orangefs_kernel_op_s *new_op = NULL; 55 int buffer_index; 56 ssize_t ret; 57 size_t copy_amount; 58 59 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); 60 if (!new_op) 61 return -ENOMEM; 62 63 /* synchronous I/O */ 64 new_op->upcall.req.io.readahead_size = readahead_size; 65 new_op->upcall.req.io.io_type = type; 66 new_op->upcall.req.io.refn = orangefs_inode->refn; 67 68 populate_shared_memory: 69 /* get a shared buffer index */ 70 buffer_index = orangefs_bufmap_get(); 71 if (buffer_index < 0) { 72 ret = buffer_index; 73 gossip_debug(GOSSIP_FILE_DEBUG, 74 "%s: orangefs_bufmap_get failure (%zd)\n", 75 __func__, ret); 76 goto out; 77 } 78 gossip_debug(GOSSIP_FILE_DEBUG, 79 "%s(%pU): GET op %p -> buffer_index %d\n", 80 __func__, 81 handle, 82 new_op, 83 buffer_index); 84 85 new_op->uses_shared_memory = 1; 86 new_op->upcall.req.io.buf_index = buffer_index; 87 new_op->upcall.req.io.count = total_size; 88 new_op->upcall.req.io.offset = *offset; 89 if (type == ORANGEFS_IO_WRITE && wr) { 90 new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); 91 new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); 92 } 93 94 gossip_debug(GOSSIP_FILE_DEBUG, 95 "%s(%pU): offset: %llu total_size: %zd\n", 96 __func__, 97 handle, 98 llu(*offset), 99 total_size); 100 /* 101 * Stage 1: copy the buffers into client-core's address space 102 */ 103 if (type == ORANGEFS_IO_WRITE && total_size) { 104 ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index, 105 total_size); 106 if (ret < 0) { 107 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", 108 __func__, (long)ret); 109 goto out; 110 } 111 } 112 113 gossip_debug(GOSSIP_FILE_DEBUG, 114 "%s(%pU): Calling post_io_request with tag (%llu)\n", 115 __func__, 116 handle, 117 llu(new_op->tag)); 118 119 /* Stage 2: Service the I/O operation */ 120 ret = service_operation(new_op, 121 type == ORANGEFS_IO_WRITE ? 122 "file_write" : 123 "file_read", 124 get_interruptible_flag(inode)); 125 126 /* 127 * If service_operation() returns -EAGAIN #and# the operation was 128 * purged from orangefs_request_list or htable_ops_in_progress, then 129 * we know that the client was restarted, causing the shared memory 130 * area to be wiped clean. To restart a write operation in this 131 * case, we must re-copy the data from the user's iovec to a NEW 132 * shared memory location. To restart a read operation, we must get 133 * a new shared memory location. 134 */ 135 if (ret == -EAGAIN && op_state_purged(new_op)) { 136 orangefs_bufmap_put(buffer_index); 137 if (type == ORANGEFS_IO_WRITE) 138 iov_iter_revert(iter, total_size); 139 gossip_debug(GOSSIP_FILE_DEBUG, 140 "%s:going to repopulate_shared_memory.\n", 141 __func__); 142 goto populate_shared_memory; 143 } 144 145 if (ret < 0) { 146 if (ret == -EINTR) { 147 /* 148 * We can't return EINTR if any data was written, 149 * it's not POSIX. It is minimally acceptable 150 * to give a partial write, the way NFS does. 151 * 152 * It would be optimal to return all or nothing, 153 * but if a userspace write is bigger than 154 * an IO buffer, and the interrupt occurs 155 * between buffer writes, that would not be 156 * possible. 157 */ 158 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) { 159 /* 160 * If the op was waiting when the interrupt 161 * occurred, then the client-core did not 162 * trigger the write. 163 */ 164 case OP_VFS_STATE_WAITING: 165 if (*offset == 0) 166 ret = -EINTR; 167 else 168 ret = 0; 169 break; 170 /* 171 * If the op was in progress when the interrupt 172 * occurred, then the client-core was able to 173 * trigger the write. 174 */ 175 case OP_VFS_STATE_INPROGR: 176 if (type == ORANGEFS_IO_READ) 177 ret = -EINTR; 178 else 179 ret = total_size; 180 break; 181 default: 182 gossip_err("%s: unexpected op state :%d:.\n", 183 __func__, 184 new_op->op_state); 185 ret = 0; 186 break; 187 } 188 gossip_debug(GOSSIP_FILE_DEBUG, 189 "%s: got EINTR, state:%d: %p\n", 190 __func__, 191 new_op->op_state, 192 new_op); 193 } else { 194 gossip_err("%s: error in %s handle %pU, returning %zd\n", 195 __func__, 196 type == ORANGEFS_IO_READ ? 197 "read from" : "write to", 198 handle, ret); 199 } 200 if (orangefs_cancel_op_in_progress(new_op)) 201 return ret; 202 203 goto out; 204 } 205 206 /* 207 * Stage 3: Post copy buffers from client-core's address space 208 */ 209 if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) { 210 /* 211 * NOTE: the iovector can either contain addresses which 212 * can futher be kernel-space or user-space addresses. 213 * or it can pointers to struct page's 214 */ 215 216 /* 217 * When reading, readahead_size will only be zero when 218 * we're doing O_DIRECT, otherwise we got here from 219 * orangefs_readpage. 220 * 221 * If we got here from orangefs_readpage we want to 222 * copy either a page or the whole file into the io 223 * vector, whichever is smaller. 224 */ 225 if (readahead_size) 226 copy_amount = 227 min(new_op->downcall.resp.io.amt_complete, 228 (__s64)PAGE_SIZE); 229 else 230 copy_amount = new_op->downcall.resp.io.amt_complete; 231 232 ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, 233 copy_amount); 234 if (ret < 0) { 235 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", 236 __func__, (long)ret); 237 goto out; 238 } 239 } 240 gossip_debug(GOSSIP_FILE_DEBUG, 241 "%s(%pU): Amount %s, returned by the sys-io call:%d\n", 242 __func__, 243 handle, 244 type == ORANGEFS_IO_READ ? "read" : "written", 245 (int)new_op->downcall.resp.io.amt_complete); 246 247 ret = new_op->downcall.resp.io.amt_complete; 248 249 out: 250 if (buffer_index >= 0) { 251 if ((readahead_size) && (type == ORANGEFS_IO_READ)) { 252 /* readpage */ 253 *index_return = buffer_index; 254 gossip_debug(GOSSIP_FILE_DEBUG, 255 "%s: hold on to buffer_index :%d:\n", 256 __func__, buffer_index); 257 } else { 258 /* O_DIRECT */ 259 orangefs_bufmap_put(buffer_index); 260 gossip_debug(GOSSIP_FILE_DEBUG, 261 "%s(%pU): PUT buffer_index %d\n", 262 __func__, handle, buffer_index); 263 } 264 } 265 op_release(new_op); 266 return ret; 267 } 268 269 int orangefs_revalidate_mapping(struct inode *inode) 270 { 271 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 272 struct address_space *mapping = inode->i_mapping; 273 unsigned long *bitlock = &orangefs_inode->bitlock; 274 int ret; 275 276 while (1) { 277 ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); 278 if (ret) 279 return ret; 280 spin_lock(&inode->i_lock); 281 if (test_bit(1, bitlock)) { 282 spin_unlock(&inode->i_lock); 283 continue; 284 } 285 if (!time_before(jiffies, orangefs_inode->mapping_time)) 286 break; 287 spin_unlock(&inode->i_lock); 288 return 0; 289 } 290 291 set_bit(1, bitlock); 292 smp_wmb(); 293 spin_unlock(&inode->i_lock); 294 295 unmap_mapping_range(mapping, 0, 0, 0); 296 ret = filemap_write_and_wait(mapping); 297 if (!ret) 298 ret = invalidate_inode_pages2(mapping); 299 300 orangefs_inode->mapping_time = jiffies + 301 orangefs_cache_timeout_msecs*HZ/1000; 302 303 clear_bit(1, bitlock); 304 smp_mb__after_atomic(); 305 wake_up_bit(bitlock, 1); 306 307 return ret; 308 } 309 310 static ssize_t orangefs_file_read_iter(struct kiocb *iocb, 311 struct iov_iter *iter) 312 { 313 int ret; 314 struct orangefs_read_options *ro; 315 316 orangefs_stats.reads++; 317 318 /* 319 * Remember how they set "count" in read(2) or pread(2) or whatever - 320 * users can use count as a knob to control orangefs io size and later 321 * we can try to help them fill as many pages as possible in readpage. 322 */ 323 if (!iocb->ki_filp->private_data) { 324 iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL); 325 if (!iocb->ki_filp->private_data) 326 return(ENOMEM); 327 ro = iocb->ki_filp->private_data; 328 ro->blksiz = iter->count; 329 } 330 331 down_read(&file_inode(iocb->ki_filp)->i_rwsem); 332 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 333 if (ret) 334 goto out; 335 336 ret = generic_file_read_iter(iocb, iter); 337 out: 338 up_read(&file_inode(iocb->ki_filp)->i_rwsem); 339 return ret; 340 } 341 342 static ssize_t orangefs_file_write_iter(struct kiocb *iocb, 343 struct iov_iter *iter) 344 { 345 int ret; 346 orangefs_stats.writes++; 347 348 if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { 349 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 350 if (ret) 351 return ret; 352 } 353 354 ret = generic_file_write_iter(iocb, iter); 355 return ret; 356 } 357 358 static int orangefs_getflags(struct inode *inode, unsigned long *uval) 359 { 360 __u64 val = 0; 361 int ret; 362 363 ret = orangefs_inode_getxattr(inode, 364 "user.pvfs2.meta_hint", 365 &val, sizeof(val)); 366 if (ret < 0 && ret != -ENODATA) 367 return ret; 368 else if (ret == -ENODATA) 369 val = 0; 370 *uval = val; 371 return 0; 372 } 373 374 /* 375 * Perform a miscellaneous operation on a file. 376 */ 377 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 378 { 379 struct inode *inode = file_inode(file); 380 int ret = -ENOTTY; 381 __u64 val = 0; 382 unsigned long uval; 383 384 gossip_debug(GOSSIP_FILE_DEBUG, 385 "orangefs_ioctl: called with cmd %d\n", 386 cmd); 387 388 /* 389 * we understand some general ioctls on files, such as the immutable 390 * and append flags 391 */ 392 if (cmd == FS_IOC_GETFLAGS) { 393 ret = orangefs_getflags(inode, &uval); 394 if (ret) 395 return ret; 396 gossip_debug(GOSSIP_FILE_DEBUG, 397 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n", 398 (unsigned long long)uval); 399 return put_user(uval, (int __user *)arg); 400 } else if (cmd == FS_IOC_SETFLAGS) { 401 unsigned long old_uval; 402 403 ret = 0; 404 if (get_user(uval, (int __user *)arg)) 405 return -EFAULT; 406 /* 407 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode 408 * is turned on for a file. The user is not allowed to turn 409 * on this bit, but the bit is present if the user first gets 410 * the flags and then updates the flags with some new 411 * settings. So, we ignore it in the following edit. bligon. 412 */ 413 if ((uval & ~ORANGEFS_MIRROR_FL) & 414 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) { 415 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n"); 416 return -EINVAL; 417 } 418 ret = orangefs_getflags(inode, &old_uval); 419 if (ret) 420 return ret; 421 ret = vfs_ioc_setflags_prepare(inode, old_uval, uval); 422 if (ret) 423 return ret; 424 val = uval; 425 gossip_debug(GOSSIP_FILE_DEBUG, 426 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", 427 (unsigned long long)val); 428 ret = orangefs_inode_setxattr(inode, 429 "user.pvfs2.meta_hint", 430 &val, sizeof(val), 0); 431 } 432 433 return ret; 434 } 435 436 static vm_fault_t orangefs_fault(struct vm_fault *vmf) 437 { 438 struct file *file = vmf->vma->vm_file; 439 int ret; 440 ret = orangefs_inode_getattr(file->f_mapping->host, 441 ORANGEFS_GETATTR_SIZE); 442 if (ret == -ESTALE) 443 ret = -EIO; 444 if (ret) { 445 gossip_err("%s: orangefs_inode_getattr failed, " 446 "ret:%d:.\n", __func__, ret); 447 return VM_FAULT_SIGBUS; 448 } 449 return filemap_fault(vmf); 450 } 451 452 static const struct vm_operations_struct orangefs_file_vm_ops = { 453 .fault = orangefs_fault, 454 .map_pages = filemap_map_pages, 455 .page_mkwrite = orangefs_page_mkwrite, 456 }; 457 458 /* 459 * Memory map a region of a file. 460 */ 461 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) 462 { 463 int ret; 464 465 ret = orangefs_revalidate_mapping(file_inode(file)); 466 if (ret) 467 return ret; 468 469 gossip_debug(GOSSIP_FILE_DEBUG, 470 "orangefs_file_mmap: called on %s\n", 471 (file ? 472 (char *)file->f_path.dentry->d_name.name : 473 (char *)"Unknown")); 474 475 /* set the sequential readahead hint */ 476 vma->vm_flags |= VM_SEQ_READ; 477 vma->vm_flags &= ~VM_RAND_READ; 478 479 file_accessed(file); 480 vma->vm_ops = &orangefs_file_vm_ops; 481 return 0; 482 } 483 484 #define mapping_nrpages(idata) ((idata)->nrpages) 485 486 /* 487 * Called to notify the module that there are no more references to 488 * this file (i.e. no processes have it open). 489 * 490 * \note Not called when each file is closed. 491 */ 492 static int orangefs_file_release(struct inode *inode, struct file *file) 493 { 494 gossip_debug(GOSSIP_FILE_DEBUG, 495 "orangefs_file_release: called on %pD\n", 496 file); 497 498 /* 499 * remove all associated inode pages from the page cache and 500 * readahead cache (if any); this forces an expensive refresh of 501 * data for the next caller of mmap (or 'get_block' accesses) 502 */ 503 if (file_inode(file) && 504 file_inode(file)->i_mapping && 505 mapping_nrpages(&file_inode(file)->i_data)) { 506 if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { 507 gossip_debug(GOSSIP_INODE_DEBUG, 508 "calling flush_racache on %pU\n", 509 get_khandle_from_ino(inode)); 510 flush_racache(inode); 511 gossip_debug(GOSSIP_INODE_DEBUG, 512 "flush_racache finished\n"); 513 } 514 515 } 516 return 0; 517 } 518 519 /* 520 * Push all data for a specific file onto permanent storage. 521 */ 522 static int orangefs_fsync(struct file *file, 523 loff_t start, 524 loff_t end, 525 int datasync) 526 { 527 int ret; 528 struct orangefs_inode_s *orangefs_inode = 529 ORANGEFS_I(file_inode(file)); 530 struct orangefs_kernel_op_s *new_op = NULL; 531 532 ret = filemap_write_and_wait_range(file_inode(file)->i_mapping, 533 start, end); 534 if (ret < 0) 535 return ret; 536 537 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); 538 if (!new_op) 539 return -ENOMEM; 540 new_op->upcall.req.fsync.refn = orangefs_inode->refn; 541 542 ret = service_operation(new_op, 543 "orangefs_fsync", 544 get_interruptible_flag(file_inode(file))); 545 546 gossip_debug(GOSSIP_FILE_DEBUG, 547 "orangefs_fsync got return value of %d\n", 548 ret); 549 550 op_release(new_op); 551 return ret; 552 } 553 554 /* 555 * Change the file pointer position for an instance of an open file. 556 * 557 * \note If .llseek is overriden, we must acquire lock as described in 558 * Documentation/filesystems/locking.rst. 559 * 560 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would 561 * require much changes to the FS 562 */ 563 static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) 564 { 565 int ret = -EINVAL; 566 struct inode *inode = file_inode(file); 567 568 if (origin == SEEK_END) { 569 /* 570 * revalidate the inode's file size. 571 * NOTE: We are only interested in file size here, 572 * so we set mask accordingly. 573 */ 574 ret = orangefs_inode_getattr(file->f_mapping->host, 575 ORANGEFS_GETATTR_SIZE); 576 if (ret == -ESTALE) 577 ret = -EIO; 578 if (ret) { 579 gossip_debug(GOSSIP_FILE_DEBUG, 580 "%s:%s:%d calling make bad inode\n", 581 __FILE__, 582 __func__, 583 __LINE__); 584 return ret; 585 } 586 } 587 588 gossip_debug(GOSSIP_FILE_DEBUG, 589 "orangefs_file_llseek: offset is %ld | origin is %d" 590 " | inode size is %lu\n", 591 (long)offset, 592 origin, 593 (unsigned long)i_size_read(inode)); 594 595 return generic_file_llseek(file, offset, origin); 596 } 597 598 /* 599 * Support local locks (locks that only this kernel knows about) 600 * if Orangefs was mounted -o local_lock. 601 */ 602 static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) 603 { 604 int rc = -EINVAL; 605 606 if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { 607 if (cmd == F_GETLK) { 608 rc = 0; 609 posix_test_lock(filp, fl); 610 } else { 611 rc = posix_lock_file(filp, fl, NULL); 612 } 613 } 614 615 return rc; 616 } 617 618 static int orangefs_file_open(struct inode * inode, struct file *file) 619 { 620 file->private_data = NULL; 621 return generic_file_open(inode, file); 622 } 623 624 static int orangefs_flush(struct file *file, fl_owner_t id) 625 { 626 /* 627 * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the 628 * service_operation in orangefs_fsync. 629 * 630 * Do not send fsync to OrangeFS server on a close. Do send fsync 631 * on an explicit fsync call. This duplicates historical OrangeFS 632 * behavior. 633 */ 634 struct inode *inode = file->f_mapping->host; 635 int r; 636 637 kfree(file->private_data); 638 file->private_data = NULL; 639 640 if (inode->i_state & I_DIRTY_TIME) { 641 spin_lock(&inode->i_lock); 642 inode->i_state &= ~I_DIRTY_TIME; 643 spin_unlock(&inode->i_lock); 644 mark_inode_dirty_sync(inode); 645 } 646 647 r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); 648 if (r > 0) 649 return 0; 650 else 651 return r; 652 } 653 654 /** ORANGEFS implementation of VFS file operations */ 655 const struct file_operations orangefs_file_operations = { 656 .llseek = orangefs_file_llseek, 657 .read_iter = orangefs_file_read_iter, 658 .write_iter = orangefs_file_write_iter, 659 .lock = orangefs_lock, 660 .unlocked_ioctl = orangefs_ioctl, 661 .mmap = orangefs_file_mmap, 662 .open = orangefs_file_open, 663 .flush = orangefs_flush, 664 .release = orangefs_file_release, 665 .fsync = orangefs_fsync, 666 }; 667