1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * (C) 2001 Clemson University and The University of Chicago 4 * Copyright 2018 Omnibond Systems, L.L.C. 5 * 6 * See COPYING in top-level directory. 7 */ 8 9 /* 10 * Linux VFS file operations. 11 */ 12 13 #include "protocol.h" 14 #include "orangefs-kernel.h" 15 #include "orangefs-bufmap.h" 16 #include <linux/fs.h> 17 #include <linux/pagemap.h> 18 19 static int flush_racache(struct inode *inode) 20 { 21 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 22 struct orangefs_kernel_op_s *new_op; 23 int ret; 24 25 gossip_debug(GOSSIP_UTILS_DEBUG, 26 "%s: %pU: Handle is %pU | fs_id %d\n", __func__, 27 get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, 28 orangefs_inode->refn.fs_id); 29 30 new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); 31 if (!new_op) 32 return -ENOMEM; 33 new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; 34 35 ret = service_operation(new_op, "orangefs_flush_racache", 36 get_interruptible_flag(inode)); 37 38 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", 39 __func__, ret); 40 41 op_release(new_op); 42 return ret; 43 } 44 45 /* 46 * Post and wait for the I/O upcall to finish 47 */ 48 ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, 49 loff_t *offset, struct iov_iter *iter, size_t total_size, 50 loff_t readahead_size, struct orangefs_write_range *wr, int *index_return) 51 { 52 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 53 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 54 struct orangefs_kernel_op_s *new_op = NULL; 55 int buffer_index = -1; 56 ssize_t ret; 57 size_t copy_amount; 58 59 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); 60 if (!new_op) 61 return -ENOMEM; 62 63 /* synchronous I/O */ 64 new_op->upcall.req.io.readahead_size = readahead_size; 65 new_op->upcall.req.io.io_type = type; 66 new_op->upcall.req.io.refn = orangefs_inode->refn; 67 68 populate_shared_memory: 69 /* get a shared buffer index */ 70 buffer_index = orangefs_bufmap_get(); 71 if (buffer_index < 0) { 72 ret = buffer_index; 73 gossip_debug(GOSSIP_FILE_DEBUG, 74 "%s: orangefs_bufmap_get failure (%zd)\n", 75 __func__, ret); 76 goto out; 77 } 78 gossip_debug(GOSSIP_FILE_DEBUG, 79 "%s(%pU): GET op %p -> buffer_index %d\n", 80 __func__, 81 handle, 82 new_op, 83 buffer_index); 84 85 new_op->uses_shared_memory = 1; 86 new_op->upcall.req.io.buf_index = buffer_index; 87 new_op->upcall.req.io.count = total_size; 88 new_op->upcall.req.io.offset = *offset; 89 if (type == ORANGEFS_IO_WRITE && wr) { 90 new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); 91 new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); 92 } 93 94 gossip_debug(GOSSIP_FILE_DEBUG, 95 "%s(%pU): offset: %llu total_size: %zd\n", 96 __func__, 97 handle, 98 llu(*offset), 99 total_size); 100 /* 101 * Stage 1: copy the buffers into client-core's address space 102 */ 103 if (type == ORANGEFS_IO_WRITE && total_size) { 104 ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index, 105 total_size); 106 if (ret < 0) { 107 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", 108 __func__, (long)ret); 109 goto out; 110 } 111 } 112 113 gossip_debug(GOSSIP_FILE_DEBUG, 114 "%s(%pU): Calling post_io_request with tag (%llu)\n", 115 __func__, 116 handle, 117 llu(new_op->tag)); 118 119 /* Stage 2: Service the I/O operation */ 120 ret = service_operation(new_op, 121 type == ORANGEFS_IO_WRITE ? 122 "file_write" : 123 "file_read", 124 get_interruptible_flag(inode)); 125 126 /* 127 * If service_operation() returns -EAGAIN #and# the operation was 128 * purged from orangefs_request_list or htable_ops_in_progress, then 129 * we know that the client was restarted, causing the shared memory 130 * area to be wiped clean. To restart a write operation in this 131 * case, we must re-copy the data from the user's iovec to a NEW 132 * shared memory location. To restart a read operation, we must get 133 * a new shared memory location. 134 */ 135 if (ret == -EAGAIN && op_state_purged(new_op)) { 136 orangefs_bufmap_put(buffer_index); 137 buffer_index = -1; 138 if (type == ORANGEFS_IO_WRITE) 139 iov_iter_revert(iter, total_size); 140 gossip_debug(GOSSIP_FILE_DEBUG, 141 "%s:going to repopulate_shared_memory.\n", 142 __func__); 143 goto populate_shared_memory; 144 } 145 146 if (ret < 0) { 147 if (ret == -EINTR) { 148 /* 149 * We can't return EINTR if any data was written, 150 * it's not POSIX. It is minimally acceptable 151 * to give a partial write, the way NFS does. 152 * 153 * It would be optimal to return all or nothing, 154 * but if a userspace write is bigger than 155 * an IO buffer, and the interrupt occurs 156 * between buffer writes, that would not be 157 * possible. 158 */ 159 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) { 160 /* 161 * If the op was waiting when the interrupt 162 * occurred, then the client-core did not 163 * trigger the write. 164 */ 165 case OP_VFS_STATE_WAITING: 166 if (*offset == 0) 167 ret = -EINTR; 168 else 169 ret = 0; 170 break; 171 /* 172 * If the op was in progress when the interrupt 173 * occurred, then the client-core was able to 174 * trigger the write. 175 */ 176 case OP_VFS_STATE_INPROGR: 177 if (type == ORANGEFS_IO_READ) 178 ret = -EINTR; 179 else 180 ret = total_size; 181 break; 182 default: 183 gossip_err("%s: unexpected op state :%d:.\n", 184 __func__, 185 new_op->op_state); 186 ret = 0; 187 break; 188 } 189 gossip_debug(GOSSIP_FILE_DEBUG, 190 "%s: got EINTR, state:%d: %p\n", 191 __func__, 192 new_op->op_state, 193 new_op); 194 } else { 195 gossip_err("%s: error in %s handle %pU, returning %zd\n", 196 __func__, 197 type == ORANGEFS_IO_READ ? 198 "read from" : "write to", 199 handle, ret); 200 } 201 if (orangefs_cancel_op_in_progress(new_op)) 202 return ret; 203 204 goto out; 205 } 206 207 /* 208 * Stage 3: Post copy buffers from client-core's address space 209 */ 210 if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) { 211 /* 212 * NOTE: the iovector can either contain addresses which 213 * can futher be kernel-space or user-space addresses. 214 * or it can pointers to struct page's 215 */ 216 217 /* 218 * When reading, readahead_size will only be zero when 219 * we're doing O_DIRECT, otherwise we got here from 220 * orangefs_readpage. 221 * 222 * If we got here from orangefs_readpage we want to 223 * copy either a page or the whole file into the io 224 * vector, whichever is smaller. 225 */ 226 if (readahead_size) 227 copy_amount = 228 min(new_op->downcall.resp.io.amt_complete, 229 (__s64)PAGE_SIZE); 230 else 231 copy_amount = new_op->downcall.resp.io.amt_complete; 232 233 ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, 234 copy_amount); 235 if (ret < 0) { 236 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", 237 __func__, (long)ret); 238 goto out; 239 } 240 } 241 gossip_debug(GOSSIP_FILE_DEBUG, 242 "%s(%pU): Amount %s, returned by the sys-io call:%d\n", 243 __func__, 244 handle, 245 type == ORANGEFS_IO_READ ? "read" : "written", 246 (int)new_op->downcall.resp.io.amt_complete); 247 248 ret = new_op->downcall.resp.io.amt_complete; 249 250 out: 251 if (buffer_index >= 0) { 252 if ((readahead_size) && (type == ORANGEFS_IO_READ)) { 253 /* readpage */ 254 *index_return = buffer_index; 255 gossip_debug(GOSSIP_FILE_DEBUG, 256 "%s: hold on to buffer_index :%d:\n", 257 __func__, buffer_index); 258 } else { 259 /* O_DIRECT */ 260 orangefs_bufmap_put(buffer_index); 261 gossip_debug(GOSSIP_FILE_DEBUG, 262 "%s(%pU): PUT buffer_index %d\n", 263 __func__, handle, buffer_index); 264 } 265 buffer_index = -1; 266 } 267 op_release(new_op); 268 return ret; 269 } 270 271 int orangefs_revalidate_mapping(struct inode *inode) 272 { 273 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 274 struct address_space *mapping = inode->i_mapping; 275 unsigned long *bitlock = &orangefs_inode->bitlock; 276 int ret; 277 278 while (1) { 279 ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); 280 if (ret) 281 return ret; 282 spin_lock(&inode->i_lock); 283 if (test_bit(1, bitlock)) { 284 spin_unlock(&inode->i_lock); 285 continue; 286 } 287 if (!time_before(jiffies, orangefs_inode->mapping_time)) 288 break; 289 spin_unlock(&inode->i_lock); 290 return 0; 291 } 292 293 set_bit(1, bitlock); 294 smp_wmb(); 295 spin_unlock(&inode->i_lock); 296 297 unmap_mapping_range(mapping, 0, 0, 0); 298 ret = filemap_write_and_wait(mapping); 299 if (!ret) 300 ret = invalidate_inode_pages2(mapping); 301 302 orangefs_inode->mapping_time = jiffies + 303 orangefs_cache_timeout_msecs*HZ/1000; 304 305 clear_bit(1, bitlock); 306 smp_mb__after_atomic(); 307 wake_up_bit(bitlock, 1); 308 309 return ret; 310 } 311 312 static ssize_t orangefs_file_read_iter(struct kiocb *iocb, 313 struct iov_iter *iter) 314 { 315 int ret; 316 struct orangefs_read_options *ro; 317 318 orangefs_stats.reads++; 319 320 /* 321 * Remember how they set "count" in read(2) or pread(2) or whatever - 322 * users can use count as a knob to control orangefs io size and later 323 * we can try to help them fill as many pages as possible in readpage. 324 */ 325 if (!iocb->ki_filp->private_data) { 326 iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL); 327 if (!iocb->ki_filp->private_data) 328 return(ENOMEM); 329 ro = iocb->ki_filp->private_data; 330 ro->blksiz = iter->count; 331 } 332 333 down_read(&file_inode(iocb->ki_filp)->i_rwsem); 334 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 335 if (ret) 336 goto out; 337 338 ret = generic_file_read_iter(iocb, iter); 339 out: 340 up_read(&file_inode(iocb->ki_filp)->i_rwsem); 341 return ret; 342 } 343 344 static ssize_t orangefs_file_write_iter(struct kiocb *iocb, 345 struct iov_iter *iter) 346 { 347 int ret; 348 orangefs_stats.writes++; 349 350 if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { 351 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 352 if (ret) 353 return ret; 354 } 355 356 ret = generic_file_write_iter(iocb, iter); 357 return ret; 358 } 359 360 /* 361 * Perform a miscellaneous operation on a file. 362 */ 363 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 364 { 365 int ret = -ENOTTY; 366 __u64 val = 0; 367 unsigned long uval; 368 369 gossip_debug(GOSSIP_FILE_DEBUG, 370 "orangefs_ioctl: called with cmd %d\n", 371 cmd); 372 373 /* 374 * we understand some general ioctls on files, such as the immutable 375 * and append flags 376 */ 377 if (cmd == FS_IOC_GETFLAGS) { 378 val = 0; 379 ret = orangefs_inode_getxattr(file_inode(file), 380 "user.pvfs2.meta_hint", 381 &val, sizeof(val)); 382 if (ret < 0 && ret != -ENODATA) 383 return ret; 384 else if (ret == -ENODATA) 385 val = 0; 386 uval = val; 387 gossip_debug(GOSSIP_FILE_DEBUG, 388 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n", 389 (unsigned long long)uval); 390 return put_user(uval, (int __user *)arg); 391 } else if (cmd == FS_IOC_SETFLAGS) { 392 ret = 0; 393 if (get_user(uval, (int __user *)arg)) 394 return -EFAULT; 395 /* 396 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode 397 * is turned on for a file. The user is not allowed to turn 398 * on this bit, but the bit is present if the user first gets 399 * the flags and then updates the flags with some new 400 * settings. So, we ignore it in the following edit. bligon. 401 */ 402 if ((uval & ~ORANGEFS_MIRROR_FL) & 403 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) { 404 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n"); 405 return -EINVAL; 406 } 407 val = uval; 408 gossip_debug(GOSSIP_FILE_DEBUG, 409 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", 410 (unsigned long long)val); 411 ret = orangefs_inode_setxattr(file_inode(file), 412 "user.pvfs2.meta_hint", 413 &val, sizeof(val), 0); 414 } 415 416 return ret; 417 } 418 419 static vm_fault_t orangefs_fault(struct vm_fault *vmf) 420 { 421 struct file *file = vmf->vma->vm_file; 422 int ret; 423 ret = orangefs_inode_getattr(file->f_mapping->host, 424 ORANGEFS_GETATTR_SIZE); 425 if (ret == -ESTALE) 426 ret = -EIO; 427 if (ret) { 428 gossip_err("%s: orangefs_inode_getattr failed, " 429 "ret:%d:.\n", __func__, ret); 430 return VM_FAULT_SIGBUS; 431 } 432 return filemap_fault(vmf); 433 } 434 435 static const struct vm_operations_struct orangefs_file_vm_ops = { 436 .fault = orangefs_fault, 437 .map_pages = filemap_map_pages, 438 .page_mkwrite = orangefs_page_mkwrite, 439 }; 440 441 /* 442 * Memory map a region of a file. 443 */ 444 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) 445 { 446 int ret; 447 448 ret = orangefs_revalidate_mapping(file_inode(file)); 449 if (ret) 450 return ret; 451 452 gossip_debug(GOSSIP_FILE_DEBUG, 453 "orangefs_file_mmap: called on %s\n", 454 (file ? 455 (char *)file->f_path.dentry->d_name.name : 456 (char *)"Unknown")); 457 458 /* set the sequential readahead hint */ 459 vma->vm_flags |= VM_SEQ_READ; 460 vma->vm_flags &= ~VM_RAND_READ; 461 462 file_accessed(file); 463 vma->vm_ops = &orangefs_file_vm_ops; 464 return 0; 465 } 466 467 #define mapping_nrpages(idata) ((idata)->nrpages) 468 469 /* 470 * Called to notify the module that there are no more references to 471 * this file (i.e. no processes have it open). 472 * 473 * \note Not called when each file is closed. 474 */ 475 static int orangefs_file_release(struct inode *inode, struct file *file) 476 { 477 gossip_debug(GOSSIP_FILE_DEBUG, 478 "orangefs_file_release: called on %pD\n", 479 file); 480 481 /* 482 * remove all associated inode pages from the page cache and 483 * readahead cache (if any); this forces an expensive refresh of 484 * data for the next caller of mmap (or 'get_block' accesses) 485 */ 486 if (file_inode(file) && 487 file_inode(file)->i_mapping && 488 mapping_nrpages(&file_inode(file)->i_data)) { 489 if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { 490 gossip_debug(GOSSIP_INODE_DEBUG, 491 "calling flush_racache on %pU\n", 492 get_khandle_from_ino(inode)); 493 flush_racache(inode); 494 gossip_debug(GOSSIP_INODE_DEBUG, 495 "flush_racache finished\n"); 496 } 497 498 } 499 return 0; 500 } 501 502 /* 503 * Push all data for a specific file onto permanent storage. 504 */ 505 static int orangefs_fsync(struct file *file, 506 loff_t start, 507 loff_t end, 508 int datasync) 509 { 510 int ret; 511 struct orangefs_inode_s *orangefs_inode = 512 ORANGEFS_I(file_inode(file)); 513 struct orangefs_kernel_op_s *new_op = NULL; 514 515 ret = filemap_write_and_wait_range(file_inode(file)->i_mapping, 516 start, end); 517 if (ret < 0) 518 return ret; 519 520 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); 521 if (!new_op) 522 return -ENOMEM; 523 new_op->upcall.req.fsync.refn = orangefs_inode->refn; 524 525 ret = service_operation(new_op, 526 "orangefs_fsync", 527 get_interruptible_flag(file_inode(file))); 528 529 gossip_debug(GOSSIP_FILE_DEBUG, 530 "orangefs_fsync got return value of %d\n", 531 ret); 532 533 op_release(new_op); 534 return ret; 535 } 536 537 /* 538 * Change the file pointer position for an instance of an open file. 539 * 540 * \note If .llseek is overriden, we must acquire lock as described in 541 * Documentation/filesystems/Locking. 542 * 543 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would 544 * require much changes to the FS 545 */ 546 static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) 547 { 548 int ret = -EINVAL; 549 struct inode *inode = file_inode(file); 550 551 if (origin == SEEK_END) { 552 /* 553 * revalidate the inode's file size. 554 * NOTE: We are only interested in file size here, 555 * so we set mask accordingly. 556 */ 557 ret = orangefs_inode_getattr(file->f_mapping->host, 558 ORANGEFS_GETATTR_SIZE); 559 if (ret == -ESTALE) 560 ret = -EIO; 561 if (ret) { 562 gossip_debug(GOSSIP_FILE_DEBUG, 563 "%s:%s:%d calling make bad inode\n", 564 __FILE__, 565 __func__, 566 __LINE__); 567 return ret; 568 } 569 } 570 571 gossip_debug(GOSSIP_FILE_DEBUG, 572 "orangefs_file_llseek: offset is %ld | origin is %d" 573 " | inode size is %lu\n", 574 (long)offset, 575 origin, 576 (unsigned long)i_size_read(inode)); 577 578 return generic_file_llseek(file, offset, origin); 579 } 580 581 /* 582 * Support local locks (locks that only this kernel knows about) 583 * if Orangefs was mounted -o local_lock. 584 */ 585 static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) 586 { 587 int rc = -EINVAL; 588 589 if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { 590 if (cmd == F_GETLK) { 591 rc = 0; 592 posix_test_lock(filp, fl); 593 } else { 594 rc = posix_lock_file(filp, fl, NULL); 595 } 596 } 597 598 return rc; 599 } 600 601 static int orangefs_file_open(struct inode * inode, struct file *file) 602 { 603 file->private_data = NULL; 604 return generic_file_open(inode, file); 605 } 606 607 static int orangefs_flush(struct file *file, fl_owner_t id) 608 { 609 /* 610 * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the 611 * service_operation in orangefs_fsync. 612 * 613 * Do not send fsync to OrangeFS server on a close. Do send fsync 614 * on an explicit fsync call. This duplicates historical OrangeFS 615 * behavior. 616 */ 617 struct inode *inode = file->f_mapping->host; 618 int r; 619 620 kfree(file->private_data); 621 file->private_data = NULL; 622 623 if (inode->i_state & I_DIRTY_TIME) { 624 spin_lock(&inode->i_lock); 625 inode->i_state &= ~I_DIRTY_TIME; 626 spin_unlock(&inode->i_lock); 627 mark_inode_dirty_sync(inode); 628 } 629 630 r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); 631 if (r > 0) 632 return 0; 633 else 634 return r; 635 } 636 637 /** ORANGEFS implementation of VFS file operations */ 638 const struct file_operations orangefs_file_operations = { 639 .llseek = orangefs_file_llseek, 640 .read_iter = orangefs_file_read_iter, 641 .write_iter = orangefs_file_write_iter, 642 .lock = orangefs_lock, 643 .unlocked_ioctl = orangefs_ioctl, 644 .mmap = orangefs_file_mmap, 645 .open = orangefs_file_open, 646 .flush = orangefs_flush, 647 .release = orangefs_file_release, 648 .fsync = orangefs_fsync, 649 }; 650