1 /* 2 * pNFS functions to call and manage layout drivers. 3 * 4 * Copyright (c) 2002 [year of first publication] 5 * The Regents of the University of Michigan 6 * All Rights Reserved 7 * 8 * Dean Hildebrand <dhildebz@umich.edu> 9 * 10 * Permission is granted to use, copy, create derivative works, and 11 * redistribute this software and such derivative works for any purpose, 12 * so long as the name of the University of Michigan is not used in 13 * any advertising or publicity pertaining to the use or distribution 14 * of this software without specific, written prior authorization. If 15 * the above copyright notice or any other identification of the 16 * University of Michigan is included in any copy of any portion of 17 * this software, then the disclaimer below must also be included. 18 * 19 * This software is provided as is, without representation or warranty 20 * of any kind either express or implied, including without limitation 21 * the implied warranties of merchantability, fitness for a particular 22 * purpose, or noninfringement. The Regents of the University of 23 * Michigan shall not be liable for any damages, including special, 24 * indirect, incidental, or consequential damages, with respect to any 25 * claim arising out of or in connection with the use of the software, 26 * even if it has been or is hereafter advised of the possibility of 27 * such damages. 28 */ 29 30 #include <linux/nfs_fs.h> 31 #include <linux/nfs_page.h> 32 #include <linux/module.h> 33 #include "internal.h" 34 #include "pnfs.h" 35 #include "iostat.h" 36 37 #define NFSDBG_FACILITY NFSDBG_PNFS 38 #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 39 40 /* Locking: 41 * 42 * pnfs_spinlock: 43 * protects pnfs_modules_tbl. 44 */ 45 static DEFINE_SPINLOCK(pnfs_spinlock); 46 47 /* 48 * pnfs_modules_tbl holds all pnfs modules 49 */ 50 static LIST_HEAD(pnfs_modules_tbl); 51 52 /* Return the registered pnfs layout driver module matching given id */ 53 static struct pnfs_layoutdriver_type * 54 find_pnfs_driver_locked(u32 id) 55 { 56 struct pnfs_layoutdriver_type *local; 57 58 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) 59 if (local->id == id) 60 goto out; 61 local = NULL; 62 out: 63 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); 64 return local; 65 } 66 67 static struct pnfs_layoutdriver_type * 68 find_pnfs_driver(u32 id) 69 { 70 struct pnfs_layoutdriver_type *local; 71 72 spin_lock(&pnfs_spinlock); 73 local = find_pnfs_driver_locked(id); 74 if (local != NULL && !try_module_get(local->owner)) { 75 dprintk("%s: Could not grab reference on module\n", __func__); 76 local = NULL; 77 } 78 spin_unlock(&pnfs_spinlock); 79 return local; 80 } 81 82 void 83 unset_pnfs_layoutdriver(struct nfs_server *nfss) 84 { 85 if (nfss->pnfs_curr_ld) { 86 if (nfss->pnfs_curr_ld->clear_layoutdriver) 87 nfss->pnfs_curr_ld->clear_layoutdriver(nfss); 88 /* Decrement the MDS count. Purge the deviceid cache if zero */ 89 if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count)) 90 nfs4_deviceid_purge_client(nfss->nfs_client); 91 module_put(nfss->pnfs_curr_ld->owner); 92 } 93 nfss->pnfs_curr_ld = NULL; 94 } 95 96 /* 97 * Try to set the server's pnfs module to the pnfs layout type specified by id. 98 * Currently only one pNFS layout driver per filesystem is supported. 99 * 100 * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 
101 */ 102 void 103 set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, 104 u32 id) 105 { 106 struct pnfs_layoutdriver_type *ld_type = NULL; 107 108 if (id == 0) 109 goto out_no_driver; 110 if (!(server->nfs_client->cl_exchange_flags & 111 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { 112 printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", 113 __func__, id, server->nfs_client->cl_exchange_flags); 114 goto out_no_driver; 115 } 116 ld_type = find_pnfs_driver(id); 117 if (!ld_type) { 118 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); 119 ld_type = find_pnfs_driver(id); 120 if (!ld_type) { 121 dprintk("%s: No pNFS module found for %u.\n", 122 __func__, id); 123 goto out_no_driver; 124 } 125 } 126 server->pnfs_curr_ld = ld_type; 127 if (ld_type->set_layoutdriver 128 && ld_type->set_layoutdriver(server, mntfh)) { 129 printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " 130 "driver %u.\n", __func__, id); 131 module_put(ld_type->owner); 132 goto out_no_driver; 133 } 134 /* Bump the MDS count */ 135 atomic_inc(&server->nfs_client->cl_mds_count); 136 137 dprintk("%s: pNFS module for %u set\n", __func__, id); 138 return; 139 140 out_no_driver: 141 dprintk("%s: Using NFSv4 I/O\n", __func__); 142 server->pnfs_curr_ld = NULL; 143 } 144 145 int 146 pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 147 { 148 int status = -EINVAL; 149 struct pnfs_layoutdriver_type *tmp; 150 151 if (ld_type->id == 0) { 152 printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__); 153 return status; 154 } 155 if (!ld_type->alloc_lseg || !ld_type->free_lseg) { 156 printk(KERN_ERR "NFS: %s Layout driver must provide " 157 "alloc_lseg and free_lseg.\n", __func__); 158 return status; 159 } 160 161 spin_lock(&pnfs_spinlock); 162 tmp = find_pnfs_driver_locked(ld_type->id); 163 if (!tmp) { 164 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); 165 status = 0; 166 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, 167 ld_type->name); 168 } else { 169 printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n", 170 __func__, ld_type->id); 171 } 172 spin_unlock(&pnfs_spinlock); 173 174 return status; 175 } 176 EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); 177 178 void 179 pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 180 { 181 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); 182 spin_lock(&pnfs_spinlock); 183 list_del(&ld_type->pnfs_tblid); 184 spin_unlock(&pnfs_spinlock); 185 } 186 EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); 187 188 /* 189 * pNFS client layout cache 190 */ 191 192 /* Need to hold i_lock if caller does not already hold reference */ 193 void 194 pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) 195 { 196 atomic_inc(&lo->plh_refcount); 197 } 198 199 static struct pnfs_layout_hdr * 200 pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) 201 { 202 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 203 return ld->alloc_layout_hdr(ino, gfp_flags); 204 } 205 206 static void 207 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 208 { 209 struct nfs_server *server = NFS_SERVER(lo->plh_inode); 210 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; 211 212 if (!list_empty(&lo->plh_layouts)) { 213 struct nfs_client *clp = server->nfs_client; 214 215 spin_lock(&clp->cl_lock); 216 list_del_init(&lo->plh_layouts); 217 spin_unlock(&clp->cl_lock); 218 } 219 put_rpccred(lo->plh_lc_cred); 220 return ld->free_layout_hdr(lo); 221 } 222 223 static void 224 
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) 225 { 226 struct nfs_inode *nfsi = NFS_I(lo->plh_inode); 227 dprintk("%s: freeing layout cache %p\n", __func__, lo); 228 nfsi->layout = NULL; 229 /* Reset MDS Threshold I/O counters */ 230 nfsi->write_io = 0; 231 nfsi->read_io = 0; 232 } 233 234 void 235 pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) 236 { 237 struct inode *inode = lo->plh_inode; 238 239 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 240 pnfs_detach_layout_hdr(lo); 241 spin_unlock(&inode->i_lock); 242 pnfs_free_layout_hdr(lo); 243 } 244 } 245 246 static int 247 pnfs_iomode_to_fail_bit(u32 iomode) 248 { 249 return iomode == IOMODE_RW ? 250 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 251 } 252 253 static void 254 pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) 255 { 256 lo->plh_retry_timestamp = jiffies; 257 if (test_and_set_bit(fail_bit, &lo->plh_flags)) 258 atomic_inc(&lo->plh_refcount); 259 } 260 261 static void 262 pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) 263 { 264 if (test_and_clear_bit(fail_bit, &lo->plh_flags)) 265 atomic_dec(&lo->plh_refcount); 266 } 267 268 static void 269 pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) 270 { 271 struct inode *inode = lo->plh_inode; 272 struct pnfs_layout_range range = { 273 .iomode = iomode, 274 .offset = 0, 275 .length = NFS4_MAX_UINT64, 276 }; 277 LIST_HEAD(head); 278 279 spin_lock(&inode->i_lock); 280 pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 281 pnfs_mark_matching_lsegs_invalid(lo, &head, &range); 282 spin_unlock(&inode->i_lock); 283 pnfs_free_lseg_list(&head); 284 dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, 285 iomode == IOMODE_RW ? "RW" : "READ"); 286 } 287 288 static bool 289 pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) 290 { 291 unsigned long start, end; 292 int fail_bit = pnfs_iomode_to_fail_bit(iomode); 293 294 if (test_bit(fail_bit, &lo->plh_flags) == 0) 295 return false; 296 end = jiffies; 297 start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; 298 if (!time_in_range(lo->plh_retry_timestamp, start, end)) { 299 /* It is time to retry the failed layoutgets */ 300 pnfs_layout_clear_fail_bit(lo, fail_bit); 301 return false; 302 } 303 return true; 304 } 305 306 static void 307 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 308 { 309 INIT_LIST_HEAD(&lseg->pls_list); 310 INIT_LIST_HEAD(&lseg->pls_lc_list); 311 atomic_set(&lseg->pls_refcount, 1); 312 smp_mb(); 313 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 314 lseg->pls_layout = lo; 315 } 316 317 static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) 318 { 319 struct inode *ino = lseg->pls_layout->plh_inode; 320 321 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 322 } 323 324 static void 325 pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, 326 struct pnfs_layout_segment *lseg) 327 { 328 struct inode *inode = lo->plh_inode; 329 330 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 331 list_del_init(&lseg->pls_list); 332 /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ 333 atomic_dec(&lo->plh_refcount); 334 if (list_empty(&lo->plh_segs)) 335 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 336 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 337 } 338 339 void 340 pnfs_put_lseg(struct pnfs_layout_segment *lseg) 341 { 342 struct pnfs_layout_hdr *lo; 343 struct inode *inode; 344 345 if (!lseg) 346 return; 347 348 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 349 
atomic_read(&lseg->pls_refcount), 350 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 351 lo = lseg->pls_layout; 352 inode = lo->plh_inode; 353 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 354 pnfs_get_layout_hdr(lo); 355 pnfs_layout_remove_lseg(lo, lseg); 356 spin_unlock(&inode->i_lock); 357 pnfs_free_lseg(lseg); 358 pnfs_put_layout_hdr(lo); 359 } 360 } 361 EXPORT_SYMBOL_GPL(pnfs_put_lseg); 362 363 static inline u64 364 end_offset(u64 start, u64 len) 365 { 366 u64 end; 367 368 end = start + len; 369 return end >= start ? end : NFS4_MAX_UINT64; 370 } 371 372 /* last octet in a range */ 373 static inline u64 374 last_byte_offset(u64 start, u64 len) 375 { 376 u64 end; 377 378 BUG_ON(!len); 379 end = start + len; 380 return end > start ? end - 1 : NFS4_MAX_UINT64; 381 } 382 383 /* 384 * is l2 fully contained in l1? 385 * start1 end1 386 * [----------------------------------) 387 * start2 end2 388 * [----------------) 389 */ 390 static inline int 391 lo_seg_contained(struct pnfs_layout_range *l1, 392 struct pnfs_layout_range *l2) 393 { 394 u64 start1 = l1->offset; 395 u64 end1 = end_offset(start1, l1->length); 396 u64 start2 = l2->offset; 397 u64 end2 = end_offset(start2, l2->length); 398 399 return (start1 <= start2) && (end1 >= end2); 400 } 401 402 /* 403 * is l1 and l2 intersecting? 404 * start1 end1 405 * [----------------------------------) 406 * start2 end2 407 * [----------------) 408 */ 409 static inline int 410 lo_seg_intersecting(struct pnfs_layout_range *l1, 411 struct pnfs_layout_range *l2) 412 { 413 u64 start1 = l1->offset; 414 u64 end1 = end_offset(start1, l1->length); 415 u64 start2 = l2->offset; 416 u64 end2 = end_offset(start2, l2->length); 417 418 return (end1 == NFS4_MAX_UINT64 || end1 > start2) && 419 (end2 == NFS4_MAX_UINT64 || end2 > start1); 420 } 421 422 static bool 423 should_free_lseg(struct pnfs_layout_range *lseg_range, 424 struct pnfs_layout_range *recall_range) 425 { 426 return (recall_range->iomode == IOMODE_ANY || 427 lseg_range->iomode == recall_range->iomode) && 428 lo_seg_intersecting(lseg_range, recall_range); 429 } 430 431 /* Returns 1 if lseg is removed from list, 0 otherwise */ 432 static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, 433 struct list_head *tmp_list) 434 { 435 int rv = 0; 436 437 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { 438 /* Remove the reference keeping the lseg in the 439 * list. It will now be removed when all 440 * outstanding io is finished. 441 */ 442 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 443 atomic_read(&lseg->pls_refcount)); 444 if (atomic_dec_and_test(&lseg->pls_refcount)) { 445 pnfs_layout_remove_lseg(lseg->pls_layout, lseg); 446 list_add(&lseg->pls_list, tmp_list); 447 rv = 1; 448 } 449 } 450 return rv; 451 } 452 453 /* Returns count of number of matching invalid lsegs remaining in list 454 * after call. 
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
			    struct list_head *tmp_list,
			    struct pnfs_layout_range *recall_range)
{
	struct pnfs_layout_segment *lseg, *next;
	int invalid = 0, removed = 0;

	dprintk("%s:Begin lo %p\n", __func__, lo);

	if (list_empty(&lo->plh_segs))
		return 0;
	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
		if (!recall_range ||
		    should_free_lseg(&lseg->pls_range, recall_range)) {
			dprintk("%s: freeing lseg %p iomode %d "
				"offset %llu length %llu\n", __func__,
				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
				lseg->pls_range.length);
			invalid++;
			removed += mark_lseg_invalid(lseg, tmp_list);
		}
	dprintk("%s:Return %i\n", __func__, invalid - removed);
	return invalid - removed;
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
	struct pnfs_layout_segment *lseg, *tmp;

	if (list_empty(free_me))
		return;

	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
		list_del(&lseg->pls_list);
		pnfs_free_lseg(lseg);
	}
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
	struct pnfs_layout_hdr *lo;
	LIST_HEAD(tmp_list);

	spin_lock(&nfsi->vfs_inode.i_lock);
	lo = nfsi->layout;
	if (lo) {
		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
		pnfs_get_layout_hdr(lo);
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
		spin_unlock(&nfsi->vfs_inode.i_lock);
		pnfs_free_lseg_list(&tmp_list);
		pnfs_put_layout_hdr(lo);
	} else
		spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
	struct nfs_server *server;
	struct pnfs_layout_hdr *lo;
	LIST_HEAD(tmp_list);

	nfs4_deviceid_mark_client_invalid(clp);
	nfs4_deviceid_purge_client(clp);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (!list_empty(&server->layouts))
			list_splice_init(&server->layouts, &tmp_list);
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	while (!list_empty(&tmp_list)) {
		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
				plh_layouts);
		dprintk("%s freeing layout for inode %lu\n", __func__,
			lo->plh_inode->i_ino);
		list_del_init(&lo->plh_layouts);
		pnfs_destroy_layout(NFS_I(lo->plh_inode));
	}
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
555 */ 556 static bool pnfs_seqid_is_newer(u32 s1, u32 s2) 557 { 558 return (s32)s1 - (s32)s2 > 0; 559 } 560 561 /* update lo->plh_stateid with new if is more recent */ 562 void 563 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 564 bool update_barrier) 565 { 566 u32 oldseq, newseq, new_barrier; 567 int empty = list_empty(&lo->plh_segs); 568 569 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 570 newseq = be32_to_cpu(new->seqid); 571 if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { 572 nfs4_stateid_copy(&lo->plh_stateid, new); 573 if (update_barrier) { 574 new_barrier = be32_to_cpu(new->seqid); 575 } else { 576 /* Because of wraparound, we want to keep the barrier 577 * "close" to the current seqids. 578 */ 579 new_barrier = newseq - atomic_read(&lo->plh_outstanding); 580 } 581 if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) 582 lo->plh_barrier = new_barrier; 583 } 584 } 585 586 static bool 587 pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, 588 const nfs4_stateid *stateid) 589 { 590 u32 seqid = be32_to_cpu(stateid->seqid); 591 592 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 593 } 594 595 /* lget is set to 1 if called from inside send_layoutget call chain */ 596 static bool 597 pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) 598 { 599 return lo->plh_block_lgets || 600 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 601 (list_empty(&lo->plh_segs) && 602 (atomic_read(&lo->plh_outstanding) > lget)); 603 } 604 605 int 606 pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 607 struct nfs4_state *open_state) 608 { 609 int status = 0; 610 611 dprintk("--> %s\n", __func__); 612 spin_lock(&lo->plh_inode->i_lock); 613 if (pnfs_layoutgets_blocked(lo, 1)) { 614 status = -EAGAIN; 615 } else if (list_empty(&lo->plh_segs)) { 616 int seq; 617 618 do { 619 seq = read_seqbegin(&open_state->seqlock); 620 nfs4_stateid_copy(dst, &open_state->stateid); 621 } while (read_seqretry(&open_state->seqlock, seq)); 622 } else 623 nfs4_stateid_copy(dst, &lo->plh_stateid); 624 spin_unlock(&lo->plh_inode->i_lock); 625 dprintk("<-- %s\n", __func__); 626 return status; 627 } 628 629 /* 630 * Get layout from server. 631 * for now, assume that whole file layouts are requested. 632 * arg->offset: 0 633 * arg->length: all ones 634 */ 635 static struct pnfs_layout_segment * 636 send_layoutget(struct pnfs_layout_hdr *lo, 637 struct nfs_open_context *ctx, 638 struct pnfs_layout_range *range, 639 gfp_t gfp_flags) 640 { 641 struct inode *ino = lo->plh_inode; 642 struct nfs_server *server = NFS_SERVER(ino); 643 struct nfs4_layoutget *lgp; 644 struct pnfs_layout_segment *lseg; 645 646 dprintk("--> %s\n", __func__); 647 648 BUG_ON(ctx == NULL); 649 lgp = kzalloc(sizeof(*lgp), gfp_flags); 650 if (lgp == NULL) 651 return NULL; 652 653 lgp->args.minlength = PAGE_CACHE_SIZE; 654 if (lgp->args.minlength > range->length) 655 lgp->args.minlength = range->length; 656 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 657 lgp->args.range = *range; 658 lgp->args.type = server->pnfs_curr_ld->id; 659 lgp->args.inode = ino; 660 lgp->args.ctx = get_nfs_open_context(ctx); 661 lgp->gfp_flags = gfp_flags; 662 663 /* Synchronously retrieve layout information from server and 664 * store in lseg. 
665 */ 666 lseg = nfs4_proc_layoutget(lgp, gfp_flags); 667 if (IS_ERR(lseg)) { 668 switch (PTR_ERR(lseg)) { 669 case -ENOMEM: 670 case -ERESTARTSYS: 671 break; 672 default: 673 /* remember that LAYOUTGET failed and suspend trying */ 674 pnfs_layout_io_set_failed(lo, range->iomode); 675 } 676 return NULL; 677 } 678 679 return lseg; 680 } 681 682 /* 683 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 684 * when the layout segment list is empty. 685 * 686 * Note that a pnfs_layout_hdr can exist with an empty layout segment 687 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the 688 * deviceid is marked invalid. 689 */ 690 int 691 _pnfs_return_layout(struct inode *ino) 692 { 693 struct pnfs_layout_hdr *lo = NULL; 694 struct nfs_inode *nfsi = NFS_I(ino); 695 LIST_HEAD(tmp_list); 696 struct nfs4_layoutreturn *lrp; 697 nfs4_stateid stateid; 698 int status = 0, empty; 699 700 dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); 701 702 spin_lock(&ino->i_lock); 703 lo = nfsi->layout; 704 if (!lo) { 705 spin_unlock(&ino->i_lock); 706 dprintk("NFS: %s no layout to return\n", __func__); 707 goto out; 708 } 709 stateid = nfsi->layout->plh_stateid; 710 /* Reference matched in nfs4_layoutreturn_release */ 711 pnfs_get_layout_hdr(lo); 712 empty = list_empty(&lo->plh_segs); 713 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 714 /* Don't send a LAYOUTRETURN if list was initially empty */ 715 if (empty) { 716 spin_unlock(&ino->i_lock); 717 pnfs_put_layout_hdr(lo); 718 dprintk("NFS: %s no layout segments to return\n", __func__); 719 goto out; 720 } 721 lo->plh_block_lgets++; 722 spin_unlock(&ino->i_lock); 723 pnfs_free_lseg_list(&tmp_list); 724 725 WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); 726 727 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); 728 if (unlikely(lrp == NULL)) { 729 status = -ENOMEM; 730 spin_lock(&ino->i_lock); 731 lo->plh_block_lgets--; 732 spin_unlock(&ino->i_lock); 733 pnfs_put_layout_hdr(lo); 734 goto out; 735 } 736 737 lrp->args.stateid = stateid; 738 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; 739 lrp->args.inode = ino; 740 lrp->args.layout = lo; 741 lrp->clp = NFS_SERVER(ino)->nfs_client; 742 743 status = nfs4_proc_layoutreturn(lrp); 744 out: 745 dprintk("<-- %s status: %d\n", __func__, status); 746 return status; 747 } 748 EXPORT_SYMBOL_GPL(_pnfs_return_layout); 749 750 bool pnfs_roc(struct inode *ino) 751 { 752 struct pnfs_layout_hdr *lo; 753 struct pnfs_layout_segment *lseg, *tmp; 754 LIST_HEAD(tmp_list); 755 bool found = false; 756 757 spin_lock(&ino->i_lock); 758 lo = NFS_I(ino)->layout; 759 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 760 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 761 goto out_nolayout; 762 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 763 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 764 mark_lseg_invalid(lseg, &tmp_list); 765 found = true; 766 } 767 if (!found) 768 goto out_nolayout; 769 lo->plh_block_lgets++; 770 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 771 spin_unlock(&ino->i_lock); 772 pnfs_free_lseg_list(&tmp_list); 773 return true; 774 775 out_nolayout: 776 spin_unlock(&ino->i_lock); 777 return false; 778 } 779 780 void pnfs_roc_release(struct inode *ino) 781 { 782 struct pnfs_layout_hdr *lo; 783 784 spin_lock(&ino->i_lock); 785 lo = NFS_I(ino)->layout; 786 lo->plh_block_lgets--; 787 if (atomic_dec_and_test(&lo->plh_refcount)) { 788 pnfs_detach_layout_hdr(lo); 789 spin_unlock(&ino->i_lock); 790 pnfs_free_layout_hdr(lo); 791 } 
else 792 spin_unlock(&ino->i_lock); 793 } 794 795 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) 796 { 797 struct pnfs_layout_hdr *lo; 798 799 spin_lock(&ino->i_lock); 800 lo = NFS_I(ino)->layout; 801 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) 802 lo->plh_barrier = barrier; 803 spin_unlock(&ino->i_lock); 804 } 805 806 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) 807 { 808 struct nfs_inode *nfsi = NFS_I(ino); 809 struct pnfs_layout_hdr *lo; 810 struct pnfs_layout_segment *lseg; 811 u32 current_seqid; 812 bool found = false; 813 814 spin_lock(&ino->i_lock); 815 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 816 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 817 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); 818 found = true; 819 goto out; 820 } 821 lo = nfsi->layout; 822 current_seqid = be32_to_cpu(lo->plh_stateid.seqid); 823 824 /* Since close does not return a layout stateid for use as 825 * a barrier, we choose the worst-case barrier. 826 */ 827 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 828 out: 829 spin_unlock(&ino->i_lock); 830 return found; 831 } 832 833 /* 834 * Compare two layout segments for sorting into layout cache. 835 * We want to preferentially return RW over RO layouts, so ensure those 836 * are seen first. 837 */ 838 static s64 839 cmp_layout(struct pnfs_layout_range *l1, 840 struct pnfs_layout_range *l2) 841 { 842 s64 d; 843 844 /* high offset > low offset */ 845 d = l1->offset - l2->offset; 846 if (d) 847 return d; 848 849 /* short length > long length */ 850 d = l2->length - l1->length; 851 if (d) 852 return d; 853 854 /* read > read/write */ 855 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); 856 } 857 858 static void 859 pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, 860 struct pnfs_layout_segment *lseg) 861 { 862 struct pnfs_layout_segment *lp; 863 864 dprintk("%s:Begin\n", __func__); 865 866 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 867 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) 868 continue; 869 list_add_tail(&lseg->pls_list, &lp->pls_list); 870 dprintk("%s: inserted lseg %p " 871 "iomode %d offset %llu length %llu before " 872 "lp %p iomode %d offset %llu length %llu\n", 873 __func__, lseg, lseg->pls_range.iomode, 874 lseg->pls_range.offset, lseg->pls_range.length, 875 lp, lp->pls_range.iomode, lp->pls_range.offset, 876 lp->pls_range.length); 877 goto out; 878 } 879 list_add_tail(&lseg->pls_list, &lo->plh_segs); 880 dprintk("%s: inserted lseg %p " 881 "iomode %d offset %llu length %llu at tail\n", 882 __func__, lseg, lseg->pls_range.iomode, 883 lseg->pls_range.offset, lseg->pls_range.length); 884 out: 885 pnfs_get_layout_hdr(lo); 886 887 dprintk("%s:Return\n", __func__); 888 } 889 890 static struct pnfs_layout_hdr * 891 alloc_init_layout_hdr(struct inode *ino, 892 struct nfs_open_context *ctx, 893 gfp_t gfp_flags) 894 { 895 struct pnfs_layout_hdr *lo; 896 897 lo = pnfs_alloc_layout_hdr(ino, gfp_flags); 898 if (!lo) 899 return NULL; 900 atomic_set(&lo->plh_refcount, 1); 901 INIT_LIST_HEAD(&lo->plh_layouts); 902 INIT_LIST_HEAD(&lo->plh_segs); 903 INIT_LIST_HEAD(&lo->plh_bulk_recall); 904 lo->plh_inode = ino; 905 lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); 906 return lo; 907 } 908 909 static struct pnfs_layout_hdr * 910 pnfs_find_alloc_layout(struct inode *ino, 911 struct nfs_open_context *ctx, 912 gfp_t gfp_flags) 913 { 914 struct nfs_inode *nfsi = NFS_I(ino); 915 struct pnfs_layout_hdr *new = NULL; 916 917 
dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 918 919 if (nfsi->layout != NULL) 920 goto out_existing; 921 spin_unlock(&ino->i_lock); 922 new = alloc_init_layout_hdr(ino, ctx, gfp_flags); 923 spin_lock(&ino->i_lock); 924 925 if (likely(nfsi->layout == NULL)) { /* Won the race? */ 926 nfsi->layout = new; 927 return new; 928 } else if (new != NULL) 929 pnfs_free_layout_hdr(new); 930 out_existing: 931 pnfs_get_layout_hdr(nfsi->layout); 932 return nfsi->layout; 933 } 934 935 /* 936 * iomode matching rules: 937 * iomode lseg match 938 * ----- ----- ----- 939 * ANY READ true 940 * ANY RW true 941 * RW READ false 942 * RW RW true 943 * READ READ true 944 * READ RW true 945 */ 946 static int 947 is_matching_lseg(struct pnfs_layout_range *ls_range, 948 struct pnfs_layout_range *range) 949 { 950 struct pnfs_layout_range range1; 951 952 if ((range->iomode == IOMODE_RW && 953 ls_range->iomode != IOMODE_RW) || 954 !lo_seg_intersecting(ls_range, range)) 955 return 0; 956 957 /* range1 covers only the first byte in the range */ 958 range1 = *range; 959 range1.length = 1; 960 return lo_seg_contained(ls_range, &range1); 961 } 962 963 /* 964 * lookup range in layout 965 */ 966 static struct pnfs_layout_segment * 967 pnfs_find_lseg(struct pnfs_layout_hdr *lo, 968 struct pnfs_layout_range *range) 969 { 970 struct pnfs_layout_segment *lseg, *ret = NULL; 971 972 dprintk("%s:Begin\n", __func__); 973 974 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 975 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 976 is_matching_lseg(&lseg->pls_range, range)) { 977 ret = pnfs_get_lseg(lseg); 978 break; 979 } 980 if (lseg->pls_range.offset > range->offset) 981 break; 982 } 983 984 dprintk("%s:Return lseg %p ref %d\n", 985 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); 986 return ret; 987 } 988 989 /* 990 * Use mdsthreshold hints set at each OPEN to determine if I/O should go 991 * to the MDS or over pNFS 992 * 993 * The nfs_inode read_io and write_io fields are cumulative counters reset 994 * when there are no layout segments. Note that in pnfs_update_layout iomode 995 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a 996 * WRITE request. 997 * 998 * A return of true means use MDS I/O. 999 * 1000 * From rfc 5661: 1001 * If a file's size is smaller than the file size threshold, data accesses 1002 * SHOULD be sent to the metadata server. If an I/O request has a length that 1003 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata 1004 * server. If both file size and I/O size are provided, the client SHOULD 1005 * reach or exceed both thresholds before sending its read or write 1006 * requests to the data server. 
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
				     struct inode *ino, int iomode)
{
	struct nfs4_threshold *t = ctx->mdsthreshold;
	struct nfs_inode *nfsi = NFS_I(ino);
	loff_t fsize = i_size_read(ino);
	bool size = false, size_set = false, io = false, io_set = false, ret = false;

	if (t == NULL)
		return ret;

	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

	switch (iomode) {
	case IOMODE_READ:
		if (t->bm & THRESHOLD_RD) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->rd_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_RD_IO) {
			dprintk("%s nfsi->read_io %llu\n", __func__,
				nfsi->read_io);
			io_set = true;
			if (nfsi->read_io < t->rd_io_sz)
				io = true;
		}
		break;
	case IOMODE_RW:
		if (t->bm & THRESHOLD_WR) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->wr_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_WR_IO) {
			dprintk("%s nfsi->write_io %llu\n", __func__,
				nfsi->write_io);
			io_set = true;
			if (nfsi->write_io < t->wr_io_sz)
				io = true;
		}
		break;
	}
	if (size_set && io_set) {
		if (size && io)
			ret = true;
	} else if (size || io)
		ret = true;

	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
	return ret;
}

/*
 * Layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
		   struct nfs_open_context *ctx,
		   loff_t pos,
		   u64 count,
		   enum pnfs_iomode iomode,
		   gfp_t gfp_flags)
{
	struct pnfs_layout_range arg = {
		.iomode = iomode,
		.offset = pos,
		.length = count,
	};
	unsigned pg_offset;
	struct nfs_server *server = NFS_SERVER(ino);
	struct nfs_client *clp = server->nfs_client;
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg = NULL;
	bool first = false;

	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
		goto out;

	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
		goto out;

	spin_lock(&ino->i_lock);
	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
	if (lo == NULL) {
		spin_unlock(&ino->i_lock);
		goto out;
	}

	/* Do we even need to bother with this? */
	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
		dprintk("%s matches recall, use MDS\n", __func__);
		goto out_unlock;
	}

	/* if LAYOUTGET already failed once we don't try again */
	if (pnfs_layout_io_test_failed(lo, iomode))
		goto out_unlock;

	/* Check to see if the layout for the given range already exists */
	lseg = pnfs_find_lseg(lo, &arg);
	if (lseg)
		goto out_unlock;

	if (pnfs_layoutgets_blocked(lo, 0))
		goto out_unlock;
	atomic_inc(&lo->plh_outstanding);

	if (list_empty(&lo->plh_segs))
		first = true;

	spin_unlock(&ino->i_lock);
	if (first) {
		/* The lo must be on the clp list if there is any
		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1127 */ 1128 spin_lock(&clp->cl_lock); 1129 BUG_ON(!list_empty(&lo->plh_layouts)); 1130 list_add_tail(&lo->plh_layouts, &server->layouts); 1131 spin_unlock(&clp->cl_lock); 1132 } 1133 1134 pg_offset = arg.offset & ~PAGE_CACHE_MASK; 1135 if (pg_offset) { 1136 arg.offset -= pg_offset; 1137 arg.length += pg_offset; 1138 } 1139 if (arg.length != NFS4_MAX_UINT64) 1140 arg.length = PAGE_CACHE_ALIGN(arg.length); 1141 1142 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1143 atomic_dec(&lo->plh_outstanding); 1144 out_put_layout_hdr: 1145 pnfs_put_layout_hdr(lo); 1146 out: 1147 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1148 "(%s, offset: %llu, length: %llu)\n", 1149 __func__, ino->i_sb->s_id, 1150 (unsigned long long)NFS_FILEID(ino), 1151 lseg == NULL ? "not found" : "found", 1152 iomode==IOMODE_RW ? "read/write" : "read-only", 1153 (unsigned long long)pos, 1154 (unsigned long long)count); 1155 return lseg; 1156 out_unlock: 1157 spin_unlock(&ino->i_lock); 1158 goto out_put_layout_hdr; 1159 } 1160 EXPORT_SYMBOL_GPL(pnfs_update_layout); 1161 1162 struct pnfs_layout_segment * 1163 pnfs_layout_process(struct nfs4_layoutget *lgp) 1164 { 1165 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 1166 struct nfs4_layoutget_res *res = &lgp->res; 1167 struct pnfs_layout_segment *lseg; 1168 struct inode *ino = lo->plh_inode; 1169 int status = 0; 1170 1171 /* Inject layout blob into I/O device driver */ 1172 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 1173 if (!lseg || IS_ERR(lseg)) { 1174 if (!lseg) 1175 status = -ENOMEM; 1176 else 1177 status = PTR_ERR(lseg); 1178 dprintk("%s: Could not allocate layout: error %d\n", 1179 __func__, status); 1180 goto out; 1181 } 1182 1183 spin_lock(&ino->i_lock); 1184 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1185 dprintk("%s forget reply due to recall\n", __func__); 1186 goto out_forget_reply; 1187 } 1188 1189 if (pnfs_layoutgets_blocked(lo, 1) || 1190 pnfs_layout_stateid_blocked(lo, &res->stateid)) { 1191 dprintk("%s forget reply due to state\n", __func__); 1192 goto out_forget_reply; 1193 } 1194 1195 /* Done processing layoutget. 
Set the layout stateid */ 1196 pnfs_set_layout_stateid(lo, &res->stateid, false); 1197 1198 init_lseg(lo, lseg); 1199 lseg->pls_range = res->range; 1200 pnfs_get_lseg(lseg); 1201 pnfs_layout_insert_lseg(lo, lseg); 1202 1203 if (res->return_on_close) { 1204 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); 1205 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); 1206 } 1207 1208 spin_unlock(&ino->i_lock); 1209 return lseg; 1210 out: 1211 return ERR_PTR(status); 1212 1213 out_forget_reply: 1214 spin_unlock(&ino->i_lock); 1215 lseg->pls_layout = lo; 1216 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 1217 goto out; 1218 } 1219 1220 void 1221 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1222 { 1223 u64 rd_size = req->wb_bytes; 1224 1225 BUG_ON(pgio->pg_lseg != NULL); 1226 1227 if (req->wb_offset != req->wb_pgbase) { 1228 nfs_pageio_reset_read_mds(pgio); 1229 return; 1230 } 1231 1232 if (pgio->pg_dreq == NULL) 1233 rd_size = i_size_read(pgio->pg_inode) - req_offset(req); 1234 else 1235 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); 1236 1237 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1238 req->wb_context, 1239 req_offset(req), 1240 rd_size, 1241 IOMODE_READ, 1242 GFP_KERNEL); 1243 /* If no lseg, fall back to read through mds */ 1244 if (pgio->pg_lseg == NULL) 1245 nfs_pageio_reset_read_mds(pgio); 1246 1247 } 1248 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); 1249 1250 void 1251 pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1252 struct nfs_page *req, u64 wb_size) 1253 { 1254 BUG_ON(pgio->pg_lseg != NULL); 1255 1256 if (req->wb_offset != req->wb_pgbase) { 1257 nfs_pageio_reset_write_mds(pgio); 1258 return; 1259 } 1260 1261 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1262 req->wb_context, 1263 req_offset(req), 1264 wb_size, 1265 IOMODE_RW, 1266 GFP_NOFS); 1267 /* If no lseg, fall back to write through mds */ 1268 if (pgio->pg_lseg == NULL) 1269 nfs_pageio_reset_write_mds(pgio); 1270 } 1271 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1272 1273 void 1274 pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, 1275 const struct nfs_pgio_completion_ops *compl_ops) 1276 { 1277 struct nfs_server *server = NFS_SERVER(inode); 1278 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; 1279 1280 if (ld == NULL) 1281 nfs_pageio_init_read(pgio, inode, compl_ops); 1282 else 1283 nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); 1284 } 1285 1286 void 1287 pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, 1288 int ioflags, 1289 const struct nfs_pgio_completion_ops *compl_ops) 1290 { 1291 struct nfs_server *server = NFS_SERVER(inode); 1292 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; 1293 1294 if (ld == NULL) 1295 nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); 1296 else 1297 nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); 1298 } 1299 1300 bool 1301 pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1302 struct nfs_page *req) 1303 { 1304 if (pgio->pg_lseg == NULL) 1305 return nfs_generic_pg_test(pgio, prev, req); 1306 1307 /* 1308 * Test if a nfs_page is fully contained in the pnfs_layout_range. 1309 * Note that this test makes several assumptions: 1310 * - that the previous nfs_page in the struct nfs_pageio_descriptor 1311 * is known to lie within the range. 1312 * - that the nfs_page being tested is known to be contiguous with the 1313 * previous nfs_page. 
 * - Layout ranges are page aligned, so we only have to test the
 *   start offset of the request.
 *
 * Please also note that 'end_offset' is actually the offset of the
 * first byte that lies outside the pnfs_layout_range. FIXME?
 *
 */
	return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
					pgio->pg_lseg->pls_range.length);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

int pnfs_write_done_resend_to_mds(struct inode *inode,
				struct list_head *head,
				const struct nfs_pgio_completion_ops *compl_ops)
{
	struct nfs_pageio_descriptor pgio;
	LIST_HEAD(failed);

	/* Resend all requests through the MDS */
	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
	while (!list_empty(head)) {
		struct nfs_page *req = nfs_list_entry(head->next);

		nfs_list_remove_request(req);
		if (!nfs_pageio_add_request(&pgio, req))
			nfs_list_add_request(req, &failed);
	}
	nfs_pageio_complete(&pgio);

	if (!list_empty(&failed)) {
		/* For some reason our attempt to resend pages through the
		 * MDS failed. Mark the overall send request as having failed,
		 * and let nfs_writeback_release_full deal with the error.
		 */
		list_move(&failed, head);
		return -EIO;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);

static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR) {
		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
		pnfs_return_layout(hdr->inode);
	}
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
							&hdr->pages,
							hdr->completion_ops);
}

/*
 * Called by non rpc-based layout drivers
 */
void pnfs_ld_write_done(struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (!hdr->pnfs_error) {
		pnfs_set_layoutcommit(data);
		hdr->mds_ops->rpc_call_done(&data->task, data);
	} else
		pnfs_ld_handle_write_error(data);
	hdr->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);

static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
		struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		list_splice_tail_init(&hdr->pages, &desc->pg_list);
		nfs_pageio_reset_write_mds(desc);
		desc->pg_recoalesce = 1;
	}
	nfs_writedata_release(data);
}

static enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_write_data *wdata,
			const struct rpc_call_ops *call_ops,
			struct pnfs_layout_segment *lseg,
			int how)
{
	struct nfs_pgio_header *hdr = wdata->header;
	struct inode *inode = hdr->inode;
	enum pnfs_try_status trypnfs;
	struct nfs_server *nfss = NFS_SERVER(inode);

	hdr->mds_ops = call_ops;

	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
		inode->i_ino, wdata->args.count, wdata->args.offset, how);
	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
	if (trypnfs != PNFS_NOT_ATTEMPTED)
		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1421 return trypnfs; 1422 } 1423 1424 static void 1425 pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) 1426 { 1427 struct nfs_write_data *data; 1428 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1429 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1430 1431 desc->pg_lseg = NULL; 1432 while (!list_empty(head)) { 1433 enum pnfs_try_status trypnfs; 1434 1435 data = list_first_entry(head, struct nfs_write_data, list); 1436 list_del_init(&data->list); 1437 1438 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); 1439 if (trypnfs == PNFS_NOT_ATTEMPTED) 1440 pnfs_write_through_mds(desc, data); 1441 } 1442 pnfs_put_lseg(lseg); 1443 } 1444 1445 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1446 { 1447 pnfs_put_lseg(hdr->lseg); 1448 nfs_writehdr_free(hdr); 1449 } 1450 EXPORT_SYMBOL_GPL(pnfs_writehdr_free); 1451 1452 int 1453 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1454 { 1455 struct nfs_write_header *whdr; 1456 struct nfs_pgio_header *hdr; 1457 int ret; 1458 1459 whdr = nfs_writehdr_alloc(); 1460 if (!whdr) { 1461 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1462 pnfs_put_lseg(desc->pg_lseg); 1463 desc->pg_lseg = NULL; 1464 return -ENOMEM; 1465 } 1466 hdr = &whdr->header; 1467 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1468 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1469 atomic_inc(&hdr->refcnt); 1470 ret = nfs_generic_flush(desc, hdr); 1471 if (ret != 0) { 1472 pnfs_put_lseg(desc->pg_lseg); 1473 desc->pg_lseg = NULL; 1474 } else 1475 pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); 1476 if (atomic_dec_and_test(&hdr->refcnt)) 1477 hdr->completion_ops->completion(hdr); 1478 return ret; 1479 } 1480 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1481 1482 int pnfs_read_done_resend_to_mds(struct inode *inode, 1483 struct list_head *head, 1484 const struct nfs_pgio_completion_ops *compl_ops) 1485 { 1486 struct nfs_pageio_descriptor pgio; 1487 LIST_HEAD(failed); 1488 1489 /* Resend all requests through the MDS */ 1490 nfs_pageio_init_read(&pgio, inode, compl_ops); 1491 while (!list_empty(head)) { 1492 struct nfs_page *req = nfs_list_entry(head->next); 1493 1494 nfs_list_remove_request(req); 1495 if (!nfs_pageio_add_request(&pgio, req)) 1496 nfs_list_add_request(req, &failed); 1497 } 1498 nfs_pageio_complete(&pgio); 1499 1500 if (!list_empty(&failed)) { 1501 list_move(&failed, head); 1502 return -EIO; 1503 } 1504 return 0; 1505 } 1506 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); 1507 1508 static void pnfs_ld_handle_read_error(struct nfs_read_data *data) 1509 { 1510 struct nfs_pgio_header *hdr = data->header; 1511 1512 dprintk("pnfs read error = %d\n", hdr->pnfs_error); 1513 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1514 PNFS_LAYOUTRET_ON_ERROR) { 1515 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); 1516 pnfs_return_layout(hdr->inode); 1517 } 1518 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1519 data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, 1520 &hdr->pages, 1521 hdr->completion_ops); 1522 } 1523 1524 /* 1525 * Called by non rpc-based layout drivers 1526 */ 1527 void pnfs_ld_read_done(struct nfs_read_data *data) 1528 { 1529 struct nfs_pgio_header *hdr = data->header; 1530 1531 if (likely(!hdr->pnfs_error)) { 1532 __nfs4_read_done_cb(data); 1533 hdr->mds_ops->rpc_call_done(&data->task, data); 1534 } else 1535 pnfs_ld_handle_read_error(data); 1536 hdr->mds_ops->rpc_release(data); 1537 } 1538 
EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1539 1540 static void 1541 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1542 struct nfs_read_data *data) 1543 { 1544 struct nfs_pgio_header *hdr = data->header; 1545 1546 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1547 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1548 nfs_pageio_reset_read_mds(desc); 1549 desc->pg_recoalesce = 1; 1550 } 1551 nfs_readdata_release(data); 1552 } 1553 1554 /* 1555 * Call the appropriate parallel I/O subsystem read function. 1556 */ 1557 static enum pnfs_try_status 1558 pnfs_try_to_read_data(struct nfs_read_data *rdata, 1559 const struct rpc_call_ops *call_ops, 1560 struct pnfs_layout_segment *lseg) 1561 { 1562 struct nfs_pgio_header *hdr = rdata->header; 1563 struct inode *inode = hdr->inode; 1564 struct nfs_server *nfss = NFS_SERVER(inode); 1565 enum pnfs_try_status trypnfs; 1566 1567 hdr->mds_ops = call_ops; 1568 1569 dprintk("%s: Reading ino:%lu %u@%llu\n", 1570 __func__, inode->i_ino, rdata->args.count, rdata->args.offset); 1571 1572 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); 1573 if (trypnfs != PNFS_NOT_ATTEMPTED) 1574 nfs_inc_stats(inode, NFSIOS_PNFS_READ); 1575 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 1576 return trypnfs; 1577 } 1578 1579 static void 1580 pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) 1581 { 1582 struct nfs_read_data *data; 1583 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1584 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1585 1586 desc->pg_lseg = NULL; 1587 while (!list_empty(head)) { 1588 enum pnfs_try_status trypnfs; 1589 1590 data = list_first_entry(head, struct nfs_read_data, list); 1591 list_del_init(&data->list); 1592 1593 trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); 1594 if (trypnfs == PNFS_NOT_ATTEMPTED) 1595 pnfs_read_through_mds(desc, data); 1596 } 1597 pnfs_put_lseg(lseg); 1598 } 1599 1600 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 1601 { 1602 pnfs_put_lseg(hdr->lseg); 1603 nfs_readhdr_free(hdr); 1604 } 1605 EXPORT_SYMBOL_GPL(pnfs_readhdr_free); 1606 1607 int 1608 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 1609 { 1610 struct nfs_read_header *rhdr; 1611 struct nfs_pgio_header *hdr; 1612 int ret; 1613 1614 rhdr = nfs_readhdr_alloc(); 1615 if (!rhdr) { 1616 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1617 ret = -ENOMEM; 1618 pnfs_put_lseg(desc->pg_lseg); 1619 desc->pg_lseg = NULL; 1620 return ret; 1621 } 1622 hdr = &rhdr->header; 1623 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 1624 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1625 atomic_inc(&hdr->refcnt); 1626 ret = nfs_generic_pagein(desc, hdr); 1627 if (ret != 0) { 1628 pnfs_put_lseg(desc->pg_lseg); 1629 desc->pg_lseg = NULL; 1630 } else 1631 pnfs_do_multiple_reads(desc, &hdr->rpc_list); 1632 if (atomic_dec_and_test(&hdr->refcnt)) 1633 hdr->completion_ops->completion(hdr); 1634 return ret; 1635 } 1636 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); 1637 1638 /* 1639 * There can be multiple RW segments. 
1640 */ 1641 static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) 1642 { 1643 struct pnfs_layout_segment *lseg; 1644 1645 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { 1646 if (lseg->pls_range.iomode == IOMODE_RW && 1647 test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) 1648 list_add(&lseg->pls_lc_list, listp); 1649 } 1650 } 1651 1652 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) 1653 { 1654 pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); 1655 } 1656 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); 1657 1658 void 1659 pnfs_set_layoutcommit(struct nfs_write_data *wdata) 1660 { 1661 struct nfs_pgio_header *hdr = wdata->header; 1662 struct inode *inode = hdr->inode; 1663 struct nfs_inode *nfsi = NFS_I(inode); 1664 loff_t end_pos = wdata->mds_offset + wdata->res.count; 1665 bool mark_as_dirty = false; 1666 1667 spin_lock(&inode->i_lock); 1668 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1669 mark_as_dirty = true; 1670 dprintk("%s: Set layoutcommit for inode %lu ", 1671 __func__, inode->i_ino); 1672 } 1673 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { 1674 /* references matched in nfs4_layoutcommit_release */ 1675 pnfs_get_lseg(hdr->lseg); 1676 } 1677 if (end_pos > nfsi->layout->plh_lwb) 1678 nfsi->layout->plh_lwb = end_pos; 1679 spin_unlock(&inode->i_lock); 1680 dprintk("%s: lseg %p end_pos %llu\n", 1681 __func__, hdr->lseg, nfsi->layout->plh_lwb); 1682 1683 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 1684 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 1685 if (mark_as_dirty) 1686 mark_inode_dirty_sync(inode); 1687 } 1688 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1689 1690 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) 1691 { 1692 struct nfs_server *nfss = NFS_SERVER(data->args.inode); 1693 1694 if (nfss->pnfs_curr_ld->cleanup_layoutcommit) 1695 nfss->pnfs_curr_ld->cleanup_layoutcommit(data); 1696 } 1697 1698 /* 1699 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 1700 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 1701 * data to disk to allow the server to recover the data if it crashes. 1702 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag 1703 * is off, and a COMMIT is sent to a data server, or 1704 * if WRITEs to a data server return NFS_DATA_SYNC. 
1705 */ 1706 int 1707 pnfs_layoutcommit_inode(struct inode *inode, bool sync) 1708 { 1709 struct nfs4_layoutcommit_data *data; 1710 struct nfs_inode *nfsi = NFS_I(inode); 1711 loff_t end_pos; 1712 int status = 0; 1713 1714 dprintk("--> %s inode %lu\n", __func__, inode->i_ino); 1715 1716 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) 1717 return 0; 1718 1719 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1720 data = kzalloc(sizeof(*data), GFP_NOFS); 1721 if (!data) { 1722 status = -ENOMEM; 1723 goto out; 1724 } 1725 1726 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) 1727 goto out_free; 1728 1729 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { 1730 if (!sync) { 1731 status = -EAGAIN; 1732 goto out_free; 1733 } 1734 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, 1735 nfs_wait_bit_killable, TASK_KILLABLE); 1736 if (status) 1737 goto out_free; 1738 } 1739 1740 INIT_LIST_HEAD(&data->lseg_list); 1741 spin_lock(&inode->i_lock); 1742 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1743 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); 1744 spin_unlock(&inode->i_lock); 1745 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); 1746 goto out_free; 1747 } 1748 1749 pnfs_list_write_lseg(inode, &data->lseg_list); 1750 1751 end_pos = nfsi->layout->plh_lwb; 1752 nfsi->layout->plh_lwb = 0; 1753 1754 nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); 1755 spin_unlock(&inode->i_lock); 1756 1757 data->args.inode = inode; 1758 data->cred = get_rpccred(nfsi->layout->plh_lc_cred); 1759 nfs_fattr_init(&data->fattr); 1760 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 1761 data->res.fattr = &data->fattr; 1762 data->args.lastbytewritten = end_pos - 1; 1763 data->res.server = NFS_SERVER(inode); 1764 1765 status = nfs4_proc_layoutcommit(data, sync); 1766 out: 1767 if (status) 1768 mark_inode_dirty_sync(inode); 1769 dprintk("<-- %s status %d\n", __func__, status); 1770 return status; 1771 out_free: 1772 kfree(data); 1773 goto out; 1774 } 1775 1776 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 1777 { 1778 struct nfs4_threshold *thp; 1779 1780 thp = kzalloc(sizeof(*thp), GFP_NOFS); 1781 if (!thp) { 1782 dprintk("%s mdsthreshold allocation failed\n", __func__); 1783 return NULL; 1784 } 1785 return thp; 1786 } 1787
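/*
 * Illustrative sketch (not part of the original pnfs.c): how a pNFS layout
 * driver module might hook into the registration interface above.  The
 * driver fills in a struct pnfs_layoutdriver_type -- at minimum id, name,
 * owner and the alloc_lseg/free_lseg hooks that pnfs_register_layoutdriver()
 * insists on -- and registers it from its module init routine.  Every
 * "example_*" identifier below is hypothetical, and the layout type id is an
 * arbitrary placeholder; a real driver uses its assigned layout type number
 * (e.g. LAYOUT4_NFSV4_1_FILES) and declares a MODULE_ALIAS matching the
 * request_module() pattern used by set_pnfs_layoutdriver() so it can be
 * auto-loaded at mount time.
 */
#if 0	/* example only, never compiled */
static struct pnfs_layout_segment *
example_alloc_lseg(struct pnfs_layout_hdr *lo,
		   struct nfs4_layoutget_res *res,
		   gfp_t gfp_flags)
{
	/* A real driver decodes the LAYOUTGET reply into a private,
	 * driver-specific structure embedding pnfs_layout_segment. */
	return kzalloc(sizeof(struct pnfs_layout_segment), gfp_flags);
}

static void
example_free_lseg(struct pnfs_layout_segment *lseg)
{
	kfree(lseg);
}

static struct pnfs_layoutdriver_type example_layout_type = {
	.id		= 0x80000001,	/* hypothetical layout type id */
	.name		= "LAYOUT_EXAMPLE",
	.owner		= THIS_MODULE,
	.alloc_lseg	= example_alloc_lseg,
	.free_lseg	= example_free_lseg,
	/* pg_read_ops/pg_write_ops and read/write_pagelist omitted here */
};

static int __init example_layout_init(void)
{
	/* Makes the driver visible to set_pnfs_layoutdriver() at mount time */
	return pnfs_register_layoutdriver(&example_layout_type);
}

static void __exit example_layout_exit(void)
{
	pnfs_unregister_layoutdriver(&example_layout_type);
}

module_init(example_layout_init);
module_exit(example_layout_exit);
#endif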
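/*
 * Another illustrative sketch, also hypothetical: the completion contract
 * for a non-RPC-based layout driver.  pnfs_try_to_read_data() above calls
 * the driver's read_pagelist(); the driver either declines with
 * PNFS_NOT_ATTEMPTED (and the core falls back to the MDS) or returns
 * PNFS_ATTEMPTED, in which case it must eventually report the outcome by
 * setting hdr->pnfs_error on failure and calling pnfs_ld_read_done().
 * example_device_usable() and example_submit_read() stand in for whatever
 * device checks and asynchronous submission a real driver would do.
 */
#if 0	/* example only, never compiled */
static enum pnfs_try_status
example_read_pagelist(struct nfs_read_data *rdata)
{
	struct nfs_pgio_header *hdr = rdata->header;

	if (!example_device_usable(hdr->lseg))	/* hypothetical helper */
		return PNFS_NOT_ATTEMPTED;	/* let the MDS handle it */

	if (example_submit_read(rdata) < 0) {	/* hypothetical helper */
		/* Report the error; pnfs_ld_handle_read_error() will
		 * resend the pages through the MDS. */
		hdr->pnfs_error = -EIO;
		pnfs_ld_read_done(rdata);
	}
	return PNFS_ATTEMPTED;
}
#endif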