1 /* 2 * (C) 2001 Clemson University and The University of Chicago 3 * 4 * See COPYING in top-level directory. 5 */ 6 7 /* 8 * Linux VFS file operations. 9 */ 10 11 #include "protocol.h" 12 #include "orangefs-kernel.h" 13 #include "orangefs-bufmap.h" 14 #include <linux/fs.h> 15 #include <linux/pagemap.h> 16 17 /* 18 * Copy to client-core's address space from the buffers specified 19 * by the iovec upto total_size bytes. 20 * NOTE: the iovector can either contain addresses which 21 * can futher be kernel-space or user-space addresses. 22 * or it can pointers to struct page's 23 */ 24 static int precopy_buffers(int buffer_index, 25 struct iov_iter *iter, 26 size_t total_size) 27 { 28 int ret = 0; 29 /* 30 * copy data from application/kernel by pulling it out 31 * of the iovec. 32 */ 33 34 35 if (total_size) { 36 ret = orangefs_bufmap_copy_from_iovec(iter, 37 buffer_index, 38 total_size); 39 if (ret < 0) 40 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", 41 __func__, 42 (long)ret); 43 } 44 45 if (ret < 0) 46 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", 47 __func__, 48 (long)ret); 49 return ret; 50 } 51 52 /* 53 * Copy from client-core's address space to the buffers specified 54 * by the iovec upto total_size bytes. 55 * NOTE: the iovector can either contain addresses which 56 * can futher be kernel-space or user-space addresses. 57 * or it can pointers to struct page's 58 */ 59 static int postcopy_buffers(int buffer_index, 60 struct iov_iter *iter, 61 size_t total_size) 62 { 63 int ret = 0; 64 /* 65 * copy data to application/kernel by pushing it out to 66 * the iovec. NOTE; target buffers can be addresses or 67 * struct page pointers. 68 */ 69 if (total_size) { 70 ret = orangefs_bufmap_copy_to_iovec(iter, 71 buffer_index, 72 total_size); 73 if (ret < 0) 74 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", 75 __func__, 76 (long)ret); 77 } 78 return ret; 79 } 80 81 /* 82 * Post and wait for the I/O upcall to finish 83 */ 84 static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, 85 loff_t *offset, struct iov_iter *iter, 86 size_t total_size, loff_t readahead_size) 87 { 88 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 89 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 90 struct orangefs_kernel_op_s *new_op = NULL; 91 struct iov_iter saved = *iter; 92 int buffer_index = -1; 93 ssize_t ret; 94 95 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); 96 if (!new_op) 97 return -ENOMEM; 98 99 /* synchronous I/O */ 100 new_op->upcall.req.io.readahead_size = readahead_size; 101 new_op->upcall.req.io.io_type = type; 102 new_op->upcall.req.io.refn = orangefs_inode->refn; 103 104 populate_shared_memory: 105 /* get a shared buffer index */ 106 buffer_index = orangefs_bufmap_get(); 107 if (buffer_index < 0) { 108 ret = buffer_index; 109 gossip_debug(GOSSIP_FILE_DEBUG, 110 "%s: orangefs_bufmap_get failure (%zd)\n", 111 __func__, ret); 112 goto out; 113 } 114 gossip_debug(GOSSIP_FILE_DEBUG, 115 "%s(%pU): GET op %p -> buffer_index %d\n", 116 __func__, 117 handle, 118 new_op, 119 buffer_index); 120 121 new_op->uses_shared_memory = 1; 122 new_op->upcall.req.io.buf_index = buffer_index; 123 new_op->upcall.req.io.count = total_size; 124 new_op->upcall.req.io.offset = *offset; 125 126 gossip_debug(GOSSIP_FILE_DEBUG, 127 "%s(%pU): offset: %llu total_size: %zd\n", 128 __func__, 129 handle, 130 llu(*offset), 131 total_size); 132 /* 133 * Stage 1: copy the buffers into client-core's address space 134 * precopy_buffers only pertains to writes. 135 */ 136 if (type == ORANGEFS_IO_WRITE) { 137 ret = precopy_buffers(buffer_index, 138 iter, 139 total_size); 140 if (ret < 0) 141 goto out; 142 } 143 144 gossip_debug(GOSSIP_FILE_DEBUG, 145 "%s(%pU): Calling post_io_request with tag (%llu)\n", 146 __func__, 147 handle, 148 llu(new_op->tag)); 149 150 /* Stage 2: Service the I/O operation */ 151 ret = service_operation(new_op, 152 type == ORANGEFS_IO_WRITE ? 153 "file_write" : 154 "file_read", 155 get_interruptible_flag(inode)); 156 157 /* 158 * If service_operation() returns -EAGAIN #and# the operation was 159 * purged from orangefs_request_list or htable_ops_in_progress, then 160 * we know that the client was restarted, causing the shared memory 161 * area to be wiped clean. To restart a write operation in this 162 * case, we must re-copy the data from the user's iovec to a NEW 163 * shared memory location. To restart a read operation, we must get 164 * a new shared memory location. 165 */ 166 if (ret == -EAGAIN && op_state_purged(new_op)) { 167 orangefs_bufmap_put(buffer_index); 168 buffer_index = -1; 169 if (type == ORANGEFS_IO_WRITE) 170 *iter = saved; 171 gossip_debug(GOSSIP_FILE_DEBUG, 172 "%s:going to repopulate_shared_memory.\n", 173 __func__); 174 goto populate_shared_memory; 175 } 176 177 if (ret < 0) { 178 if (ret == -EINTR) { 179 /* 180 * We can't return EINTR if any data was written, 181 * it's not POSIX. It is minimally acceptable 182 * to give a partial write, the way NFS does. 183 * 184 * It would be optimal to return all or nothing, 185 * but if a userspace write is bigger than 186 * an IO buffer, and the interrupt occurs 187 * between buffer writes, that would not be 188 * possible. 189 */ 190 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) { 191 /* 192 * If the op was waiting when the interrupt 193 * occurred, then the client-core did not 194 * trigger the write. 195 */ 196 case OP_VFS_STATE_WAITING: 197 if (*offset == 0) 198 ret = -EINTR; 199 else 200 ret = 0; 201 break; 202 /* 203 * If the op was in progress when the interrupt 204 * occurred, then the client-core was able to 205 * trigger the write. 206 */ 207 case OP_VFS_STATE_INPROGR: 208 ret = total_size; 209 break; 210 default: 211 gossip_err("%s: unexpected op state :%d:.\n", 212 __func__, 213 new_op->op_state); 214 ret = 0; 215 break; 216 } 217 gossip_debug(GOSSIP_FILE_DEBUG, 218 "%s: got EINTR, state:%d: %p\n", 219 __func__, 220 new_op->op_state, 221 new_op); 222 } else { 223 gossip_err("%s: error in %s handle %pU, returning %zd\n", 224 __func__, 225 type == ORANGEFS_IO_READ ? 226 "read from" : "write to", 227 handle, ret); 228 } 229 if (orangefs_cancel_op_in_progress(new_op)) 230 return ret; 231 232 goto out; 233 } 234 235 /* 236 * Stage 3: Post copy buffers from client-core's address space 237 * postcopy_buffers only pertains to reads. 238 */ 239 if (type == ORANGEFS_IO_READ) { 240 ret = postcopy_buffers(buffer_index, 241 iter, 242 new_op->downcall.resp.io.amt_complete); 243 if (ret < 0) 244 goto out; 245 } 246 gossip_debug(GOSSIP_FILE_DEBUG, 247 "%s(%pU): Amount %s, returned by the sys-io call:%d\n", 248 __func__, 249 handle, 250 type == ORANGEFS_IO_READ ? "read" : "written", 251 (int)new_op->downcall.resp.io.amt_complete); 252 253 ret = new_op->downcall.resp.io.amt_complete; 254 255 out: 256 if (buffer_index >= 0) { 257 orangefs_bufmap_put(buffer_index); 258 gossip_debug(GOSSIP_FILE_DEBUG, 259 "%s(%pU): PUT buffer_index %d\n", 260 __func__, handle, buffer_index); 261 buffer_index = -1; 262 } 263 op_release(new_op); 264 return ret; 265 } 266 267 /* 268 * Common entry point for read/write/readv/writev 269 * This function will dispatch it to either the direct I/O 270 * or buffered I/O path depending on the mount options and/or 271 * augmented/extended metadata attached to the file. 272 * Note: File extended attributes override any mount options. 273 */ 274 static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file, 275 loff_t *offset, struct iov_iter *iter) 276 { 277 struct inode *inode = file->f_mapping->host; 278 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 279 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 280 size_t count = iov_iter_count(iter); 281 ssize_t total_count = 0; 282 ssize_t ret = -EINVAL; 283 284 gossip_debug(GOSSIP_FILE_DEBUG, 285 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", 286 __func__, 287 handle, 288 (int)count); 289 290 if (type == ORANGEFS_IO_WRITE) { 291 gossip_debug(GOSSIP_FILE_DEBUG, 292 "%s(%pU): proceeding with offset : %llu, " 293 "size %d\n", 294 __func__, 295 handle, 296 llu(*offset), 297 (int)count); 298 } 299 300 if (count == 0) { 301 ret = 0; 302 goto out; 303 } 304 305 while (iov_iter_count(iter)) { 306 size_t each_count = iov_iter_count(iter); 307 size_t amt_complete; 308 309 /* how much to transfer in this loop iteration */ 310 if (each_count > orangefs_bufmap_size_query()) 311 each_count = orangefs_bufmap_size_query(); 312 313 gossip_debug(GOSSIP_FILE_DEBUG, 314 "%s(%pU): size of each_count(%d)\n", 315 __func__, 316 handle, 317 (int)each_count); 318 gossip_debug(GOSSIP_FILE_DEBUG, 319 "%s(%pU): BEFORE wait_for_io: offset is %d\n", 320 __func__, 321 handle, 322 (int)*offset); 323 324 ret = wait_for_direct_io(type, inode, offset, iter, 325 each_count, 0); 326 gossip_debug(GOSSIP_FILE_DEBUG, 327 "%s(%pU): return from wait_for_io:%d\n", 328 __func__, 329 handle, 330 (int)ret); 331 332 if (ret < 0) 333 goto out; 334 335 *offset += ret; 336 total_count += ret; 337 amt_complete = ret; 338 339 gossip_debug(GOSSIP_FILE_DEBUG, 340 "%s(%pU): AFTER wait_for_io: offset is %d\n", 341 __func__, 342 handle, 343 (int)*offset); 344 345 /* 346 * if we got a short I/O operations, 347 * fall out and return what we got so far 348 */ 349 if (amt_complete < each_count) 350 break; 351 } /*end while */ 352 353 out: 354 if (total_count > 0) 355 ret = total_count; 356 if (ret > 0) { 357 if (type == ORANGEFS_IO_READ) { 358 file_accessed(file); 359 } else { 360 SetMtimeFlag(orangefs_inode); 361 inode->i_mtime = CURRENT_TIME; 362 mark_inode_dirty_sync(inode); 363 } 364 } 365 366 gossip_debug(GOSSIP_FILE_DEBUG, 367 "%s(%pU): Value(%d) returned.\n", 368 __func__, 369 handle, 370 (int)ret); 371 372 return ret; 373 } 374 375 /* 376 * Read data from a specified offset in a file (referenced by inode). 377 * Data may be placed either in a user or kernel buffer. 378 */ 379 ssize_t orangefs_inode_read(struct inode *inode, 380 struct iov_iter *iter, 381 loff_t *offset, 382 loff_t readahead_size) 383 { 384 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 385 size_t count = iov_iter_count(iter); 386 size_t bufmap_size; 387 ssize_t ret = -EINVAL; 388 389 g_orangefs_stats.reads++; 390 391 bufmap_size = orangefs_bufmap_size_query(); 392 if (count > bufmap_size) { 393 gossip_debug(GOSSIP_FILE_DEBUG, 394 "%s: count is too large (%zd/%zd)!\n", 395 __func__, count, bufmap_size); 396 return -EINVAL; 397 } 398 399 gossip_debug(GOSSIP_FILE_DEBUG, 400 "%s(%pU) %zd@%llu\n", 401 __func__, 402 &orangefs_inode->refn.khandle, 403 count, 404 llu(*offset)); 405 406 ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter, 407 count, readahead_size); 408 if (ret > 0) 409 *offset += ret; 410 411 gossip_debug(GOSSIP_FILE_DEBUG, 412 "%s(%pU): Value(%zd) returned.\n", 413 __func__, 414 &orangefs_inode->refn.khandle, 415 ret); 416 417 return ret; 418 } 419 420 static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 421 { 422 struct file *file = iocb->ki_filp; 423 loff_t pos = *(&iocb->ki_pos); 424 ssize_t rc = 0; 425 426 BUG_ON(iocb->private); 427 428 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); 429 430 g_orangefs_stats.reads++; 431 432 rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); 433 iocb->ki_pos = pos; 434 435 return rc; 436 } 437 438 static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) 439 { 440 struct file *file = iocb->ki_filp; 441 loff_t pos; 442 ssize_t rc; 443 444 BUG_ON(iocb->private); 445 446 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); 447 448 inode_lock(file->f_mapping->host); 449 450 /* Make sure generic_write_checks sees an up to date inode size. */ 451 if (file->f_flags & O_APPEND) { 452 rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1); 453 if (rc == -ESTALE) 454 rc = -EIO; 455 if (rc) { 456 gossip_err("%s: orangefs_inode_getattr failed, " 457 "rc:%zd:.\n", __func__, rc); 458 goto out; 459 } 460 } 461 462 if (file->f_pos > i_size_read(file->f_mapping->host)) 463 orangefs_i_size_write(file->f_mapping->host, file->f_pos); 464 465 rc = generic_write_checks(iocb, iter); 466 467 if (rc <= 0) { 468 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n", 469 __func__, rc); 470 goto out; 471 } 472 473 /* 474 * if we are appending, generic_write_checks would have updated 475 * pos to the end of the file, so we will wait till now to set 476 * pos... 477 */ 478 pos = *(&iocb->ki_pos); 479 480 rc = do_readv_writev(ORANGEFS_IO_WRITE, 481 file, 482 &pos, 483 iter); 484 if (rc < 0) { 485 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n", 486 __func__, rc); 487 goto out; 488 } 489 490 iocb->ki_pos = pos; 491 g_orangefs_stats.writes++; 492 493 out: 494 495 inode_unlock(file->f_mapping->host); 496 return rc; 497 } 498 499 /* 500 * Perform a miscellaneous operation on a file. 501 */ 502 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 503 { 504 int ret = -ENOTTY; 505 __u64 val = 0; 506 unsigned long uval; 507 508 gossip_debug(GOSSIP_FILE_DEBUG, 509 "orangefs_ioctl: called with cmd %d\n", 510 cmd); 511 512 /* 513 * we understand some general ioctls on files, such as the immutable 514 * and append flags 515 */ 516 if (cmd == FS_IOC_GETFLAGS) { 517 val = 0; 518 ret = orangefs_inode_getxattr(file_inode(file), 519 "user.pvfs2.meta_hint", 520 &val, sizeof(val)); 521 if (ret < 0 && ret != -ENODATA) 522 return ret; 523 else if (ret == -ENODATA) 524 val = 0; 525 uval = val; 526 gossip_debug(GOSSIP_FILE_DEBUG, 527 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n", 528 (unsigned long long)uval); 529 return put_user(uval, (int __user *)arg); 530 } else if (cmd == FS_IOC_SETFLAGS) { 531 ret = 0; 532 if (get_user(uval, (int __user *)arg)) 533 return -EFAULT; 534 /* 535 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode 536 * is turned on for a file. The user is not allowed to turn 537 * on this bit, but the bit is present if the user first gets 538 * the flags and then updates the flags with some new 539 * settings. So, we ignore it in the following edit. bligon. 540 */ 541 if ((uval & ~ORANGEFS_MIRROR_FL) & 542 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) { 543 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n"); 544 return -EINVAL; 545 } 546 val = uval; 547 gossip_debug(GOSSIP_FILE_DEBUG, 548 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", 549 (unsigned long long)val); 550 ret = orangefs_inode_setxattr(file_inode(file), 551 "user.pvfs2.meta_hint", 552 &val, sizeof(val), 0); 553 } 554 555 return ret; 556 } 557 558 /* 559 * Memory map a region of a file. 560 */ 561 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) 562 { 563 gossip_debug(GOSSIP_FILE_DEBUG, 564 "orangefs_file_mmap: called on %s\n", 565 (file ? 566 (char *)file->f_path.dentry->d_name.name : 567 (char *)"Unknown")); 568 569 /* set the sequential readahead hint */ 570 vma->vm_flags |= VM_SEQ_READ; 571 vma->vm_flags &= ~VM_RAND_READ; 572 573 /* Use readonly mmap since we cannot support writable maps. */ 574 return generic_file_readonly_mmap(file, vma); 575 } 576 577 #define mapping_nrpages(idata) ((idata)->nrpages) 578 579 /* 580 * Called to notify the module that there are no more references to 581 * this file (i.e. no processes have it open). 582 * 583 * \note Not called when each file is closed. 584 */ 585 static int orangefs_file_release(struct inode *inode, struct file *file) 586 { 587 gossip_debug(GOSSIP_FILE_DEBUG, 588 "orangefs_file_release: called on %s\n", 589 file->f_path.dentry->d_name.name); 590 591 orangefs_flush_inode(inode); 592 593 /* 594 * remove all associated inode pages from the page cache and mmap 595 * readahead cache (if any); this forces an expensive refresh of 596 * data for the next caller of mmap (or 'get_block' accesses) 597 */ 598 if (file->f_path.dentry->d_inode && 599 file->f_path.dentry->d_inode->i_mapping && 600 mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) 601 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping, 602 0); 603 return 0; 604 } 605 606 /* 607 * Push all data for a specific file onto permanent storage. 608 */ 609 static int orangefs_fsync(struct file *file, 610 loff_t start, 611 loff_t end, 612 int datasync) 613 { 614 int ret = -EINVAL; 615 struct orangefs_inode_s *orangefs_inode = 616 ORANGEFS_I(file->f_path.dentry->d_inode); 617 struct orangefs_kernel_op_s *new_op = NULL; 618 619 /* required call */ 620 filemap_write_and_wait_range(file->f_mapping, start, end); 621 622 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); 623 if (!new_op) 624 return -ENOMEM; 625 new_op->upcall.req.fsync.refn = orangefs_inode->refn; 626 627 ret = service_operation(new_op, 628 "orangefs_fsync", 629 get_interruptible_flag(file->f_path.dentry->d_inode)); 630 631 gossip_debug(GOSSIP_FILE_DEBUG, 632 "orangefs_fsync got return value of %d\n", 633 ret); 634 635 op_release(new_op); 636 637 orangefs_flush_inode(file->f_path.dentry->d_inode); 638 return ret; 639 } 640 641 /* 642 * Change the file pointer position for an instance of an open file. 643 * 644 * \note If .llseek is overriden, we must acquire lock as described in 645 * Documentation/filesystems/Locking. 646 * 647 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would 648 * require much changes to the FS 649 */ 650 static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) 651 { 652 int ret = -EINVAL; 653 struct inode *inode = file_inode(file); 654 655 if (origin == SEEK_END) { 656 /* 657 * revalidate the inode's file size. 658 * NOTE: We are only interested in file size here, 659 * so we set mask accordingly. 660 */ 661 ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1); 662 if (ret == -ESTALE) 663 ret = -EIO; 664 if (ret) { 665 gossip_debug(GOSSIP_FILE_DEBUG, 666 "%s:%s:%d calling make bad inode\n", 667 __FILE__, 668 __func__, 669 __LINE__); 670 return ret; 671 } 672 } 673 674 gossip_debug(GOSSIP_FILE_DEBUG, 675 "orangefs_file_llseek: offset is %ld | origin is %d" 676 " | inode size is %lu\n", 677 (long)offset, 678 origin, 679 (unsigned long)i_size_read(inode)); 680 681 return generic_file_llseek(file, offset, origin); 682 } 683 684 /* 685 * Support local locks (locks that only this kernel knows about) 686 * if Orangefs was mounted -o local_lock. 687 */ 688 static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) 689 { 690 int rc = -EINVAL; 691 692 if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { 693 if (cmd == F_GETLK) { 694 rc = 0; 695 posix_test_lock(filp, fl); 696 } else { 697 rc = posix_lock_file(filp, fl, NULL); 698 } 699 } 700 701 return rc; 702 } 703 704 /** ORANGEFS implementation of VFS file operations */ 705 const struct file_operations orangefs_file_operations = { 706 .llseek = orangefs_file_llseek, 707 .read_iter = orangefs_file_read_iter, 708 .write_iter = orangefs_file_write_iter, 709 .lock = orangefs_lock, 710 .unlocked_ioctl = orangefs_ioctl, 711 .mmap = orangefs_file_mmap, 712 .open = generic_file_open, 713 .release = orangefs_file_release, 714 .fsync = orangefs_fsync, 715 }; 716