1 /* 2 * linux/fs/nfs/read.c 3 * 4 * Block I/O for NFS 5 * 6 * Partial copy of Linus' read cache modifications to fs/nfs/file.c 7 * modified for async RPC by okir@monad.swb.de 8 * 9 * We do an ugly hack here in order to return proper error codes to the 10 * user program when a read request failed: since generic_file_read 11 * only checks the return value of inode->i_op->readpage() which is always 0 12 * for async RPC, we set the error bit of the page to 1 when an error occurs, 13 * and make nfs_readpage transmit requests synchronously when encountering this. 14 * This is only a small problem, though, since we now retry all operations 15 * within the RPC code when root squashing is suspected. 16 */ 17 18 #include <linux/config.h> 19 #include <linux/time.h> 20 #include <linux/kernel.h> 21 #include <linux/errno.h> 22 #include <linux/fcntl.h> 23 #include <linux/stat.h> 24 #include <linux/mm.h> 25 #include <linux/slab.h> 26 #include <linux/pagemap.h> 27 #include <linux/sunrpc/clnt.h> 28 #include <linux/nfs_fs.h> 29 #include <linux/nfs_page.h> 30 #include <linux/smp_lock.h> 31 32 #include <asm/system.h> 33 34 #include "iostat.h" 35 36 #define NFSDBG_FACILITY NFSDBG_PAGECACHE 37 38 static int nfs_pagein_one(struct list_head *, struct inode *); 39 static const struct rpc_call_ops nfs_read_partial_ops; 40 static const struct rpc_call_ops nfs_read_full_ops; 41 42 static kmem_cache_t *nfs_rdata_cachep; 43 static mempool_t *nfs_rdata_mempool; 44 45 #define MIN_POOL_READ (32) 46 47 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) 48 { 49 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); 50 51 if (p) { 52 memset(p, 0, sizeof(*p)); 53 INIT_LIST_HEAD(&p->pages); 54 if (pagecount <= ARRAY_SIZE(p->page_array)) 55 p->pagevec = p->page_array; 56 else { 57 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); 58 if (!p->pagevec) { 59 mempool_free(p, nfs_rdata_mempool); 60 p = NULL; 61 } 62 } 63 } 64 return p; 65 } 66 67 void nfs_readdata_free(struct nfs_read_data *p) 68 { 69 if (p && (p->pagevec != &p->page_array[0])) 70 kfree(p->pagevec); 71 mempool_free(p, nfs_rdata_mempool); 72 } 73 74 void nfs_readdata_release(void *data) 75 { 76 nfs_readdata_free(data); 77 } 78 79 static 80 unsigned int nfs_page_length(struct inode *inode, struct page *page) 81 { 82 loff_t i_size = i_size_read(inode); 83 unsigned long idx; 84 85 if (i_size <= 0) 86 return 0; 87 idx = (i_size - 1) >> PAGE_CACHE_SHIFT; 88 if (page->index > idx) 89 return 0; 90 if (page->index != idx) 91 return PAGE_CACHE_SIZE; 92 return 1 + ((i_size - 1) & (PAGE_CACHE_SIZE - 1)); 93 } 94 95 static 96 int nfs_return_empty_page(struct page *page) 97 { 98 memclear_highpage_flush(page, 0, PAGE_CACHE_SIZE); 99 SetPageUptodate(page); 100 unlock_page(page); 101 return 0; 102 } 103 104 static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) 105 { 106 unsigned int remainder = data->args.count - data->res.count; 107 unsigned int base = data->args.pgbase + data->res.count; 108 unsigned int pglen; 109 struct page **pages; 110 111 if (data->res.eof == 0 || remainder == 0) 112 return; 113 /* 114 * Note: "remainder" can never be negative, since we check for 115 * this in the XDR code. 116 */ 117 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; 118 base &= ~PAGE_CACHE_MASK; 119 pglen = PAGE_CACHE_SIZE - base; 120 if (pglen < remainder) 121 memclear_highpage_flush(*pages, base, pglen); 122 else 123 memclear_highpage_flush(*pages, base, remainder); 124 } 125 126 /* 127 * Read a page synchronously. 128 */ 129 static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, 130 struct page *page) 131 { 132 unsigned int rsize = NFS_SERVER(inode)->rsize; 133 unsigned int count = PAGE_CACHE_SIZE; 134 int result; 135 struct nfs_read_data *rdata; 136 137 rdata = nfs_readdata_alloc(1); 138 if (!rdata) 139 return -ENOMEM; 140 141 memset(rdata, 0, sizeof(*rdata)); 142 rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); 143 rdata->cred = ctx->cred; 144 rdata->inode = inode; 145 INIT_LIST_HEAD(&rdata->pages); 146 rdata->args.fh = NFS_FH(inode); 147 rdata->args.context = ctx; 148 rdata->args.pages = &page; 149 rdata->args.pgbase = 0UL; 150 rdata->args.count = rsize; 151 rdata->res.fattr = &rdata->fattr; 152 153 dprintk("NFS: nfs_readpage_sync(%p)\n", page); 154 155 /* 156 * This works now because the socket layer never tries to DMA 157 * into this buffer directly. 158 */ 159 do { 160 if (count < rsize) 161 rdata->args.count = count; 162 rdata->res.count = rdata->args.count; 163 rdata->args.offset = page_offset(page) + rdata->args.pgbase; 164 165 dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n", 166 NFS_SERVER(inode)->hostname, 167 inode->i_sb->s_id, 168 (long long)NFS_FILEID(inode), 169 (unsigned long long)rdata->args.pgbase, 170 rdata->args.count); 171 172 lock_kernel(); 173 result = NFS_PROTO(inode)->read(rdata); 174 unlock_kernel(); 175 176 /* 177 * Even if we had a partial success we can't mark the page 178 * cache valid. 179 */ 180 if (result < 0) { 181 if (result == -EISDIR) 182 result = -EINVAL; 183 goto io_error; 184 } 185 count -= result; 186 rdata->args.pgbase += result; 187 nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, result); 188 189 /* Note: result == 0 should only happen if we're caching 190 * a write that extends the file and punches a hole. 191 */ 192 if (rdata->res.eof != 0 || result == 0) 193 break; 194 } while (count); 195 spin_lock(&inode->i_lock); 196 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 197 spin_unlock(&inode->i_lock); 198 199 nfs_readpage_truncate_uninitialised_page(rdata); 200 if (rdata->res.eof || rdata->res.count == rdata->args.count) 201 SetPageUptodate(page); 202 result = 0; 203 204 io_error: 205 unlock_page(page); 206 nfs_readdata_free(rdata); 207 return result; 208 } 209 210 static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 211 struct page *page) 212 { 213 LIST_HEAD(one_request); 214 struct nfs_page *new; 215 unsigned int len; 216 217 len = nfs_page_length(inode, page); 218 if (len == 0) 219 return nfs_return_empty_page(page); 220 new = nfs_create_request(ctx, inode, page, 0, len); 221 if (IS_ERR(new)) { 222 unlock_page(page); 223 return PTR_ERR(new); 224 } 225 if (len < PAGE_CACHE_SIZE) 226 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); 227 228 nfs_list_add_request(new, &one_request); 229 nfs_pagein_one(&one_request, inode); 230 return 0; 231 } 232 233 static void nfs_readpage_release(struct nfs_page *req) 234 { 235 unlock_page(req->wb_page); 236 237 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", 238 req->wb_context->dentry->d_inode->i_sb->s_id, 239 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 240 req->wb_bytes, 241 (long long)req_offset(req)); 242 nfs_clear_request(req); 243 nfs_release_request(req); 244 } 245 246 /* 247 * Set up the NFS read request struct 248 */ 249 static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, 250 const struct rpc_call_ops *call_ops, 251 unsigned int count, unsigned int offset) 252 { 253 struct inode *inode; 254 int flags; 255 256 data->req = req; 257 data->inode = inode = req->wb_context->dentry->d_inode; 258 data->cred = req->wb_context->cred; 259 260 data->args.fh = NFS_FH(inode); 261 data->args.offset = req_offset(req) + offset; 262 data->args.pgbase = req->wb_pgbase + offset; 263 data->args.pages = data->pagevec; 264 data->args.count = count; 265 data->args.context = req->wb_context; 266 267 data->res.fattr = &data->fattr; 268 data->res.count = count; 269 data->res.eof = 0; 270 nfs_fattr_init(&data->fattr); 271 272 /* Set up the initial task struct. */ 273 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); 274 rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data); 275 NFS_PROTO(inode)->read_setup(data); 276 277 data->task.tk_cookie = (unsigned long)inode; 278 279 dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", 280 data->task.tk_pid, 281 inode->i_sb->s_id, 282 (long long)NFS_FILEID(inode), 283 count, 284 (unsigned long long)data->args.offset); 285 } 286 287 static void 288 nfs_async_read_error(struct list_head *head) 289 { 290 struct nfs_page *req; 291 292 while (!list_empty(head)) { 293 req = nfs_list_entry(head->next); 294 nfs_list_remove_request(req); 295 SetPageError(req->wb_page); 296 nfs_readpage_release(req); 297 } 298 } 299 300 /* 301 * Start an async read operation 302 */ 303 static void nfs_execute_read(struct nfs_read_data *data) 304 { 305 struct rpc_clnt *clnt = NFS_CLIENT(data->inode); 306 sigset_t oldset; 307 308 rpc_clnt_sigmask(clnt, &oldset); 309 lock_kernel(); 310 rpc_execute(&data->task); 311 unlock_kernel(); 312 rpc_clnt_sigunmask(clnt, &oldset); 313 } 314 315 /* 316 * Generate multiple requests to fill a single page. 317 * 318 * We optimize to reduce the number of read operations on the wire. If we 319 * detect that we're reading a page, or an area of a page, that is past the 320 * end of file, we do not generate NFS read operations but just clear the 321 * parts of the page that would have come back zero from the server anyway. 322 * 323 * We rely on the cached value of i_size to make this determination; another 324 * client can fill pages on the server past our cached end-of-file, but we 325 * won't see the new data until our attribute cache is updated. This is more 326 * or less conventional NFS client behavior. 327 */ 328 static int nfs_pagein_multi(struct list_head *head, struct inode *inode) 329 { 330 struct nfs_page *req = nfs_list_entry(head->next); 331 struct page *page = req->wb_page; 332 struct nfs_read_data *data; 333 unsigned int rsize = NFS_SERVER(inode)->rsize; 334 unsigned int nbytes, offset; 335 int requests = 0; 336 LIST_HEAD(list); 337 338 nfs_list_remove_request(req); 339 340 nbytes = req->wb_bytes; 341 for(;;) { 342 data = nfs_readdata_alloc(1); 343 if (!data) 344 goto out_bad; 345 INIT_LIST_HEAD(&data->pages); 346 list_add(&data->pages, &list); 347 requests++; 348 if (nbytes <= rsize) 349 break; 350 nbytes -= rsize; 351 } 352 atomic_set(&req->wb_complete, requests); 353 354 ClearPageError(page); 355 offset = 0; 356 nbytes = req->wb_bytes; 357 do { 358 data = list_entry(list.next, struct nfs_read_data, pages); 359 list_del_init(&data->pages); 360 361 data->pagevec[0] = page; 362 363 if (nbytes > rsize) { 364 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 365 rsize, offset); 366 offset += rsize; 367 nbytes -= rsize; 368 } else { 369 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 370 nbytes, offset); 371 nbytes = 0; 372 } 373 nfs_execute_read(data); 374 } while (nbytes != 0); 375 376 return 0; 377 378 out_bad: 379 while (!list_empty(&list)) { 380 data = list_entry(list.next, struct nfs_read_data, pages); 381 list_del(&data->pages); 382 nfs_readdata_free(data); 383 } 384 SetPageError(page); 385 nfs_readpage_release(req); 386 return -ENOMEM; 387 } 388 389 static int nfs_pagein_one(struct list_head *head, struct inode *inode) 390 { 391 struct nfs_page *req; 392 struct page **pages; 393 struct nfs_read_data *data; 394 unsigned int count; 395 396 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 397 return nfs_pagein_multi(head, inode); 398 399 data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages); 400 if (!data) 401 goto out_bad; 402 403 INIT_LIST_HEAD(&data->pages); 404 pages = data->pagevec; 405 count = 0; 406 while (!list_empty(head)) { 407 req = nfs_list_entry(head->next); 408 nfs_list_remove_request(req); 409 nfs_list_add_request(req, &data->pages); 410 ClearPageError(req->wb_page); 411 *pages++ = req->wb_page; 412 count += req->wb_bytes; 413 } 414 req = nfs_list_entry(data->pages.next); 415 416 nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 417 418 nfs_execute_read(data); 419 return 0; 420 out_bad: 421 nfs_async_read_error(head); 422 return -ENOMEM; 423 } 424 425 static int 426 nfs_pagein_list(struct list_head *head, int rpages) 427 { 428 LIST_HEAD(one_request); 429 struct nfs_page *req; 430 int error = 0; 431 unsigned int pages = 0; 432 433 while (!list_empty(head)) { 434 pages += nfs_coalesce_requests(head, &one_request, rpages); 435 req = nfs_list_entry(one_request.next); 436 error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode); 437 if (error < 0) 438 break; 439 } 440 if (error >= 0) 441 return pages; 442 443 nfs_async_read_error(head); 444 return error; 445 } 446 447 /* 448 * Handle a read reply that fills part of a page. 449 */ 450 static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) 451 { 452 struct nfs_read_data *data = calldata; 453 struct nfs_page *req = data->req; 454 struct page *page = req->wb_page; 455 456 if (likely(task->tk_status >= 0)) 457 nfs_readpage_truncate_uninitialised_page(data); 458 else 459 SetPageError(page); 460 if (nfs_readpage_result(task, data) != 0) 461 return; 462 if (atomic_dec_and_test(&req->wb_complete)) { 463 if (!PageError(page)) 464 SetPageUptodate(page); 465 nfs_readpage_release(req); 466 } 467 } 468 469 static const struct rpc_call_ops nfs_read_partial_ops = { 470 .rpc_call_done = nfs_readpage_result_partial, 471 .rpc_release = nfs_readdata_release, 472 }; 473 474 static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) 475 { 476 unsigned int count = data->res.count; 477 unsigned int base = data->args.pgbase; 478 struct page **pages; 479 480 if (unlikely(count == 0)) 481 return; 482 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; 483 base &= ~PAGE_CACHE_MASK; 484 count += base; 485 for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) 486 SetPageUptodate(*pages); 487 /* 488 * Was this an eof or a short read? If the latter, don't mark the page 489 * as uptodate yet. 490 */ 491 if (count > 0 && (data->res.eof || data->args.count == data->res.count)) 492 SetPageUptodate(*pages); 493 } 494 495 static void nfs_readpage_set_pages_error(struct nfs_read_data *data) 496 { 497 unsigned int count = data->args.count; 498 unsigned int base = data->args.pgbase; 499 struct page **pages; 500 501 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; 502 base &= ~PAGE_CACHE_MASK; 503 count += base; 504 for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) 505 SetPageError(*pages); 506 } 507 508 /* 509 * This is the callback from RPC telling us whether a reply was 510 * received or some error occurred (timeout or socket shutdown). 511 */ 512 static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) 513 { 514 struct nfs_read_data *data = calldata; 515 516 /* 517 * Note: nfs_readpage_result may change the values of 518 * data->args. In the multi-page case, we therefore need 519 * to ensure that we call the next nfs_readpage_set_page_uptodate() 520 * first in the multi-page case. 521 */ 522 if (likely(task->tk_status >= 0)) { 523 nfs_readpage_truncate_uninitialised_page(data); 524 nfs_readpage_set_pages_uptodate(data); 525 } else 526 nfs_readpage_set_pages_error(data); 527 if (nfs_readpage_result(task, data) != 0) 528 return; 529 while (!list_empty(&data->pages)) { 530 struct nfs_page *req = nfs_list_entry(data->pages.next); 531 532 nfs_list_remove_request(req); 533 nfs_readpage_release(req); 534 } 535 } 536 537 static const struct rpc_call_ops nfs_read_full_ops = { 538 .rpc_call_done = nfs_readpage_result_full, 539 .rpc_release = nfs_readdata_release, 540 }; 541 542 /* 543 * This is the callback from RPC telling us whether a reply was 544 * received or some error occurred (timeout or socket shutdown). 545 */ 546 int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) 547 { 548 struct nfs_readargs *argp = &data->args; 549 struct nfs_readres *resp = &data->res; 550 int status; 551 552 dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", 553 task->tk_pid, task->tk_status); 554 555 status = NFS_PROTO(data->inode)->read_done(task, data); 556 if (status != 0) 557 return status; 558 559 nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count); 560 561 /* Is this a short read? */ 562 if (task->tk_status >= 0 && resp->count < argp->count && !resp->eof) { 563 nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); 564 /* Has the server at least made some progress? */ 565 if (resp->count != 0) { 566 /* Yes, so retry the read at the end of the data */ 567 argp->offset += resp->count; 568 argp->pgbase += resp->count; 569 argp->count -= resp->count; 570 rpc_restart_call(task); 571 return -EAGAIN; 572 } 573 task->tk_status = -EIO; 574 } 575 spin_lock(&data->inode->i_lock); 576 NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME; 577 spin_unlock(&data->inode->i_lock); 578 return 0; 579 } 580 581 /* 582 * Read a page over NFS. 583 * We read the page synchronously in the following case: 584 * - The error flag is set for this page. This happens only when a 585 * previous async read operation failed. 586 */ 587 int nfs_readpage(struct file *file, struct page *page) 588 { 589 struct nfs_open_context *ctx; 590 struct inode *inode = page->mapping->host; 591 int error; 592 593 dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", 594 page, PAGE_CACHE_SIZE, page->index); 595 nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); 596 nfs_add_stats(inode, NFSIOS_READPAGES, 1); 597 598 /* 599 * Try to flush any pending writes to the file.. 600 * 601 * NOTE! Because we own the page lock, there cannot 602 * be any new pending writes generated at this point 603 * for this page (other pages can be written to). 604 */ 605 error = nfs_wb_page(inode, page); 606 if (error) 607 goto out_error; 608 609 if (file == NULL) { 610 ctx = nfs_find_open_context(inode, NULL, FMODE_READ); 611 if (ctx == NULL) 612 return -EBADF; 613 } else 614 ctx = get_nfs_open_context((struct nfs_open_context *) 615 file->private_data); 616 if (!IS_SYNC(inode)) { 617 error = nfs_readpage_async(ctx, inode, page); 618 goto out; 619 } 620 621 error = nfs_readpage_sync(ctx, inode, page); 622 if (error < 0 && IS_SWAPFILE(inode)) 623 printk("Aiee.. nfs swap-in of page failed!\n"); 624 out: 625 put_nfs_open_context(ctx); 626 return error; 627 628 out_error: 629 unlock_page(page); 630 return error; 631 } 632 633 struct nfs_readdesc { 634 struct list_head *head; 635 struct nfs_open_context *ctx; 636 }; 637 638 static int 639 readpage_async_filler(void *data, struct page *page) 640 { 641 struct nfs_readdesc *desc = (struct nfs_readdesc *)data; 642 struct inode *inode = page->mapping->host; 643 struct nfs_page *new; 644 unsigned int len; 645 646 nfs_wb_page(inode, page); 647 len = nfs_page_length(inode, page); 648 if (len == 0) 649 return nfs_return_empty_page(page); 650 new = nfs_create_request(desc->ctx, inode, page, 0, len); 651 if (IS_ERR(new)) { 652 SetPageError(page); 653 unlock_page(page); 654 return PTR_ERR(new); 655 } 656 if (len < PAGE_CACHE_SIZE) 657 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); 658 nfs_list_add_request(new, desc->head); 659 return 0; 660 } 661 662 int nfs_readpages(struct file *filp, struct address_space *mapping, 663 struct list_head *pages, unsigned nr_pages) 664 { 665 LIST_HEAD(head); 666 struct nfs_readdesc desc = { 667 .head = &head, 668 }; 669 struct inode *inode = mapping->host; 670 struct nfs_server *server = NFS_SERVER(inode); 671 int ret; 672 673 dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", 674 inode->i_sb->s_id, 675 (long long)NFS_FILEID(inode), 676 nr_pages); 677 nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); 678 679 if (filp == NULL) { 680 desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); 681 if (desc.ctx == NULL) 682 return -EBADF; 683 } else 684 desc.ctx = get_nfs_open_context((struct nfs_open_context *) 685 filp->private_data); 686 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 687 if (!list_empty(&head)) { 688 int err = nfs_pagein_list(&head, server->rpages); 689 if (!ret) 690 nfs_add_stats(inode, NFSIOS_READPAGES, err); 691 ret = err; 692 } 693 put_nfs_open_context(desc.ctx); 694 return ret; 695 } 696 697 int __init nfs_init_readpagecache(void) 698 { 699 nfs_rdata_cachep = kmem_cache_create("nfs_read_data", 700 sizeof(struct nfs_read_data), 701 0, SLAB_HWCACHE_ALIGN, 702 NULL, NULL); 703 if (nfs_rdata_cachep == NULL) 704 return -ENOMEM; 705 706 nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, 707 nfs_rdata_cachep); 708 if (nfs_rdata_mempool == NULL) 709 return -ENOMEM; 710 711 return 0; 712 } 713 714 void nfs_destroy_readpagecache(void) 715 { 716 mempool_destroy(nfs_rdata_mempool); 717 if (kmem_cache_destroy(nfs_rdata_cachep)) 718 printk(KERN_INFO "nfs_read_data: not all structures were freed\n"); 719 } 720