/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/uaccess.h>
#include <linux/atomic.h>

#include "internal.h"
#include "iostat.h"
#include "pnfs.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static struct kmem_cache *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct nfs_lock_context	*l_ctx;		/* Lock context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	struct inode *		inode;		/* target file of i/o */

	/* completion state */
	atomic_t		io_count;	/* i/os we're waiting for */
	spinlock_t		lock;		/* protect completion state */
	ssize_t			count,		/* bytes actually processed */
				bytes_left,	/* bytes left to be sent */
				error;		/* any reported error */
	struct completion	completion;	/* wait for i/o completion */

	/* commit state */
	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
	struct work_struct	work;
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
};

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
static void nfs_direct_write_schedule_work(struct work_struct *work);
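
/*
 * Reference counting: dreq->kref keeps the nfs_direct_req itself alive
 * (nfs_direct_req_alloc() takes two references, one dropped by
 * nfs_direct_complete() and one by the issuing nfs_direct_read/write()
 * path), while dreq->io_count counts outstanding I/O requests.  Whoever
 * drops io_count to zero via put_dreq() is responsible for completing
 * the direct request.
 */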

static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
	return atomic_dec_and_test(&dreq->io_count);
}

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O. However, for most direct IO, we
 * shunt off direct read and write requests before the VFS gets them,
 * so this method is only ever called for swap.
 */
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
#ifndef CONFIG_NFS_SWAP
	dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
			iocb->ki_filp, (long long) pos, nr_segs);

	return -EINVAL;
#else
	VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);

	if (rw == READ || rw == KERNEL_READ)
		return nfs_file_direct_read(iocb, iov, nr_segs, pos,
				rw == READ ? true : false);
	return nfs_file_direct_write(iocb, iov, nr_segs, pos,
				rw == WRITE ? true : false);
#endif /* CONFIG_NFS_SWAP */
}

static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;
	for (i = 0; i < npages; i++)
		page_cache_release(pages[i]);
}

void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	cinfo->lock = &dreq->lock;
	cinfo->mds = &dreq->mds_cinfo;
	cinfo->ds = &dreq->ds_cinfo;
	cinfo->dreq = dreq;
	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	kref_get(&dreq->kref);
	init_completion(&dreq->completion);
	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
	spin_lock_init(&dreq->lock);

	return dreq;
}

static void nfs_direct_req_free(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	kref_put(&dreq->kref, nfs_direct_req_free);
}

ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
{
	return dreq->bytes_left;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_for_completion_killable(&dreq->completion);

	if (!result)
		result = dreq->error;
	if (!result)
		result = dreq->count;

out:
	return (ssize_t) result;
}

/*
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
{
	struct inode *inode = dreq->inode;

	if (dreq->iocb) {
		loff_t pos = dreq->iocb->ki_pos + dreq->count;
		long res = (long) dreq->error;
		if (!res)
			res = (long) dreq->count;

		if (write) {
			spin_lock(&inode->i_lock);
			if (i_size_read(inode) < pos)
				i_size_write(inode, pos);
			spin_unlock(&inode->i_lock);
		}

		aio_complete(dreq->iocb, res, 0);
	}
	complete_all(&dreq->completion);

	nfs_direct_req_release(dreq);
}

static void nfs_direct_readpage_release(struct nfs_page *req)
{
	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
		req->wb_context->dentry->d_inode->i_sb->s_id,
		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
		req->wb_bytes,
		(long long)req_offset(req));
	nfs_release_request(req);
}

static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;

	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
		goto out_put;

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
		dreq->error = hdr->error;
	else
		dreq->count += hdr->good_bytes;
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
		struct page *page = req->wb_page;

		if (!PageCompound(page) && bytes < hdr->good_bytes)
			set_page_dirty(page);
		bytes += req->wb_bytes;
		nfs_list_remove_request(req);
		nfs_direct_readpage_release(req);
	}
out_put:
	if (put_dreq(dreq))
		nfs_direct_complete(dreq, false);
	hdr->release(hdr);
}

static void nfs_read_sync_pgio_error(struct list_head *head)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
}

static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	get_dreq(hdr->dreq);
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.error_cleanup = nfs_read_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_read_completion,
};
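
/*
 * Worked example of the splitting done below (assuming 4 KiB pages):
 * with rsize = 32768 and a page-aligned 1 MiB user buffer, each pass of
 * the do/while loop pins min(max(rsize, PAGE_SIZE), count) = 32 KiB,
 * i.e. 8 user pages, and builds one nfs_page request per page; the
 * pageio descriptor then coalesces those requests into rsize'd READs.
 * A buffer that starts part-way into a page simply gets a non-zero
 * pgbase on its first request.
 */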

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
 * bail and stop sending more reads.  Read length accounting is
 * handled automatically by nfs_direct_read_result().  Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
						const struct iovec *iov,
						loff_t pos, bool uio)
{
	struct nfs_direct_req *dreq = desc->pg_dreq;
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	unsigned long user_addr = (unsigned long)iov->iov_base;
	size_t count = iov->iov_len;
	size_t rsize = NFS_SERVER(inode)->rsize;
	unsigned int pgbase;
	int result;
	ssize_t started = 0;
	struct page **pagevec = NULL;
	unsigned int npages;

	do {
		size_t bytes;
		int i;

		pgbase = user_addr & ~PAGE_MASK;
		bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);

		result = -ENOMEM;
		npages = nfs_page_array_len(pgbase, bytes);
		if (!pagevec)
			pagevec = kmalloc(npages * sizeof(struct page *),
					  GFP_KERNEL);
		if (!pagevec)
			break;
		if (uio) {
			down_read(&current->mm->mmap_sem);
			result = get_user_pages(current, current->mm, user_addr,
					npages, 1, 0, pagevec, NULL);
			up_read(&current->mm->mmap_sem);
			if (result < 0)
				break;
		} else {
			WARN_ON(npages != 1);
			result = get_kernel_page(user_addr, 1, pagevec);
			if (WARN_ON(result != 1))
				break;
		}

		if ((unsigned)result < npages) {
			bytes = result * PAGE_SIZE;
			if (bytes <= pgbase) {
				nfs_direct_release_pages(pagevec, result);
				break;
			}
			bytes -= pgbase;
			npages = result;
		}

		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_create_request(dreq->ctx, dreq->inode,
						 pagevec[i],
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(desc, req)) {
				result = desc->pg_error;
				nfs_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			started += req_len;
			user_addr += req_len;
			pos += req_len;
			count -= req_len;
			dreq->bytes_left -= req_len;
		}
		/* The nfs_page now hold references to these pages */
		nfs_direct_release_pages(pagevec, npages);
	} while (count != 0 && result >= 0);

	kfree(pagevec);

	if (started)
		return started;
	return result < 0 ? (ssize_t) result : -EFAULT;
}

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      const struct iovec *iov,
					      unsigned long nr_segs,
					      loff_t pos, bool uio)
{
	struct nfs_pageio_descriptor desc;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	unsigned long seg;

	NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;

	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *vec = &iov[seg];
		result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
		if (result < 0)
			break;
		requested_bytes += result;
		if ((size_t)result < vec->iov_len)
			break;
		pos += vec->iov_len;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_complete(dreq, false);
	return 0;
}

static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
			       unsigned long nr_segs, loff_t pos, bool uio)
{
	ssize_t result = -ENOMEM;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;

	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = iov_length(iov, nr_segs);
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	NFS_I(inode)->read_io += iov_length(iov, nr_segs);
	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
	if (!result)
		result = nfs_direct_wait(dreq);
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

static void nfs_inode_dio_write_done(struct inode *inode)
{
	nfs_zap_mapping(inode, inode->i_mapping);
	inode_dio_done(inode);
}

#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req, *tmp;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;
	LIST_HEAD(failed);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo);
	spin_lock(cinfo.lock);
	nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
	spin_unlock(cinfo.lock);

	dreq->count = 0;
	get_dreq(dreq);

	NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
		if (!nfs_pageio_add_request(&desc, req)) {
			nfs_list_remove_request(req);
			nfs_list_add_request(req, &failed);
			spin_lock(cinfo.lock);
			dreq->flags = 0;
			dreq->error = -EIO;
			spin_unlock(cinfo.lock);
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&failed)) {
		req = nfs_list_entry(failed.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, dreq->inode);
}
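
/*
 * Unstable write verifier handling: the verifier from the first
 * unstable WRITE reply is saved in dreq->verf (see
 * nfs_direct_write_completion()).  If a later WRITE reply or the
 * COMMIT reply below carries a different verifier, the server may
 * have rebooted and lost the data, so NFS_ODIRECT_RESCHED_WRITES is
 * set and the affected requests are re-sent by
 * nfs_direct_write_reschedule() above.
 */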
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	if (status < 0) {
		dprintk("NFS: %5u commit failed with error %d.\n",
			data->task.tk_pid, status);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
			/* Note the rewrite will go through mds */
			nfs_mark_request_commit(req, NULL, &cinfo);
		} else
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
		nfs_direct_write_complete(dreq, data->inode);
}

static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
{
	/* There is no lock to clear */
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
	.completion = nfs_direct_commit_complete,
	.error_cleanup = nfs_direct_error_cleanup,
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	int res;
	struct nfs_commit_info cinfo;
	LIST_HEAD(mds_list);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
	if (res < 0) /* res == -ENOMEM */
		nfs_direct_write_reschedule(dreq);
}

static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
			break;
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_inode_dio_write_done(dreq->inode);
			nfs_direct_complete(dreq, true);
	}
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}

#else
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_inode_dio_write_done(inode);
	nfs_direct_complete(dreq, true);
}
#endif

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
 * bail and stop sending more writes.  Write length accounting is
 * handled automatically by nfs_direct_write_result().  Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
						 const struct iovec *iov,
						 loff_t pos, bool uio)
{
	struct nfs_direct_req *dreq = desc->pg_dreq;
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	unsigned long user_addr = (unsigned long)iov->iov_base;
	size_t count = iov->iov_len;
	size_t wsize = NFS_SERVER(inode)->wsize;
	unsigned int pgbase;
	int result;
	ssize_t started = 0;
	struct page **pagevec = NULL;
	unsigned int npages;

	do {
		size_t bytes;
		int i;

		pgbase = user_addr & ~PAGE_MASK;
		bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);

		result = -ENOMEM;
		npages = nfs_page_array_len(pgbase, bytes);
		if (!pagevec)
			pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
		if (!pagevec)
			break;

		if (uio) {
			down_read(&current->mm->mmap_sem);
			result = get_user_pages(current, current->mm, user_addr,
						npages, 0, 0, pagevec, NULL);
			up_read(&current->mm->mmap_sem);
			if (result < 0)
				break;
		} else {
			WARN_ON(npages != 1);
			result = get_kernel_page(user_addr, 0, pagevec);
			if (WARN_ON(result != 1))
				break;
		}

		if ((unsigned)result < npages) {
			bytes = result * PAGE_SIZE;
			if (bytes <= pgbase) {
				nfs_direct_release_pages(pagevec, result);
				break;
			}
			bytes -= pgbase;
			npages = result;
		}

		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

			req = nfs_create_request(dreq->ctx, dreq->inode,
						 pagevec[i],
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			nfs_lock_request(req);
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(desc, req)) {
				result = desc->pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			started += req_len;
			user_addr += req_len;
			pos += req_len;
			count -= req_len;
			dreq->bytes_left -= req_len;
		}
		/* The nfs_page now hold references to these pages */
		nfs_direct_release_pages(pagevec, npages);
	} while (count != 0 && result >= 0);

	kfree(pagevec);

	if (started)
		return started;
	return result < 0 ? (ssize_t) result : -EFAULT;
}
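
/*
 * Per-header write completion.  'bit' records the fate of the requests
 * in this header: for NFS_IOHDR_NEED_COMMIT and NFS_IOHDR_NEED_RESCHED
 * they are kept on the commit list, otherwise they are simply released.
 * dreq->flags records the matching decision for the direct request as a
 * whole (commit, or reschedule the writes) and is acted on later in
 * nfs_direct_write_schedule_work().
 */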
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_commit_info cinfo;
	int bit = -1;
	struct nfs_page *req = nfs_list_entry(hdr->pages.next);

	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
		goto out_put;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	spin_lock(&dreq->lock);

	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
		dreq->flags = 0;
		dreq->error = hdr->error;
	}
	if (dreq->error != 0)
		bit = NFS_IOHDR_ERROR;
	else {
		dreq->count += hdr->good_bytes;
		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			bit = NFS_IOHDR_NEED_RESCHED;
		} else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
				bit = NFS_IOHDR_NEED_RESCHED;
			else if (dreq->flags == 0) {
				memcpy(&dreq->verf, hdr->verf,
				       sizeof(dreq->verf));
				bit = NFS_IOHDR_NEED_COMMIT;
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
				if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
					bit = NFS_IOHDR_NEED_RESCHED;
				} else
					bit = NFS_IOHDR_NEED_COMMIT;
			}
		}
	}
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		switch (bit) {
		case NFS_IOHDR_NEED_RESCHED:
		case NFS_IOHDR_NEED_COMMIT:
			kref_get(&req->wb_kref);
			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
		}
		nfs_unlock_and_release_request(req);
	}

out_put:
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, hdr->inode);
	hdr->release(hdr);
}

static void nfs_write_sync_pgio_error(struct list_head *head)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
	.error_cleanup = nfs_write_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_write_completion,
};

static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       const struct iovec *iov,
					       unsigned long nr_segs,
					       loff_t pos, bool uio)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = 0;
	size_t requested_bytes = 0;
	unsigned long seg;

	NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
	get_dreq(dreq);
	atomic_inc(&inode->i_dio_count);

	NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *vec = &iov[seg];
		result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
		if (result < 0)
			break;
		requested_bytes += result;
		if ((size_t)result < vec->iov_len)
			break;
		pos += vec->iov_len;
	}
	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_done(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, dreq->inode);
	return 0;
}

static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos,
				size_t count, bool uio)
{
	ssize_t result = -ENOMEM;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = count;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
	if (!result)
		result = nfs_direct_wait(dreq);
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iov: vector of user buffers into which to read data
 * @nr_segs: size of iov vector
 * @pos: byte offset in file where reading starts
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos, bool uio)
{
	ssize_t retval = -EINVAL;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	size_t count;

	count = iov_length(iov, nr_segs);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
		file, count, (long long) pos);

	retval = 0;
	if (!count)
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	task_io_account_read(count);

	retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
	if (retval > 0)
		iocb->ki_pos = pos + retval;

out:
	return retval;
}
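
/*
 * Illustrative userspace sketch (not kernel code): an application
 * normally reaches nfs_file_direct_read/write by opening a file with
 * O_DIRECT and issuing ordinary read/write calls, for example:
 *
 *	int fd = open("/mnt/nfs/data", O_RDWR | O_DIRECT);
 *	void *buf;
 *	posix_memalign(&buf, 4096, 65536);
 *	pwrite(fd, buf, 65536, 0);
 *
 * As noted in the header comment, the client does not correct
 * unaligned requests; buffer and length alignment are the
 * application's responsibility.
 */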

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iov: vector of user buffers from which to write data
 * @nr_segs: size of iov vector
 * @pos: byte offset in file where writing starts
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos, bool uio)
{
	ssize_t retval = -EINVAL;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	size_t count;

	count = iov_length(iov, nr_segs);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
		file, count, (long long) pos);

	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
		goto out;

	retval = -EINVAL;
	if ((ssize_t) count < 0)
		goto out;
	retval = 0;
	if (!count)
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	task_io_account_write(count);

	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
	if (retval > 0) {
		struct inode *inode = mapping->host;

		iocb->ki_pos = pos + retval;
		spin_lock(&inode->i_lock);
		if (i_size_read(inode) < iocb->ki_pos)
			i_size_write(inode, iocb->ki_pos);
		spin_unlock(&inode->i_lock);
	}
out:
	return retval;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
						NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	kmem_cache_destroy(nfs_direct_cachep);
}