/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS

/* Locking:
 *
 * pnfs_spinlock:
 *	protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
	struct pnfs_layoutdriver_type *local;

	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
		if (local->id == id)
			goto out;
	local = NULL;
out:
	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
	return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
	struct pnfs_layoutdriver_type *local;

	spin_lock(&pnfs_spinlock);
	local = find_pnfs_driver_locked(id);
	if (local != NULL && !try_module_get(local->owner)) {
		dprintk("%s: Could not grab reference on module\n", __func__);
		local = NULL;
	}
	spin_unlock(&pnfs_spinlock);
	return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
	if (nfss->pnfs_curr_ld) {
		if (nfss->pnfs_curr_ld->clear_layoutdriver)
			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
		/* Decrement the MDS count. Purge the deviceid cache if zero */
		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
			nfs4_deviceid_purge_client(nfss->nfs_client);
		module_put(nfss->pnfs_curr_ld->owner);
	}
	nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
		      u32 id)
{
	struct pnfs_layoutdriver_type *ld_type = NULL;

	if (id == 0)
		goto out_no_driver;
	if (!(server->nfs_client->cl_exchange_flags &
		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
			__func__, id, server->nfs_client->cl_exchange_flags);
		goto out_no_driver;
	}
	ld_type = find_pnfs_driver(id);
	if (!ld_type) {
		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
		ld_type = find_pnfs_driver(id);
		if (!ld_type) {
			dprintk("%s: No pNFS module found for %u.\n",
				__func__, id);
			goto out_no_driver;
		}
	}
	server->pnfs_curr_ld = ld_type;
	if (ld_type->set_layoutdriver
	    && ld_type->set_layoutdriver(server, mntfh)) {
		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
			"driver %u.\n", __func__, id);
		module_put(ld_type->owner);
		goto out_no_driver;
	}
	/* Bump the MDS count */
	atomic_inc(&server->nfs_client->cl_mds_count);

	dprintk("%s: pNFS module for %u set\n", __func__, id);
	return;

out_no_driver:
	dprintk("%s: Using NFSv4 I/O\n", __func__);
	server->pnfs_curr_ld = NULL;
}

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
	int status = -EINVAL;
	struct pnfs_layoutdriver_type *tmp;

	if (ld_type->id == 0) {
		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
		return status;
	}
	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
		printk(KERN_ERR "NFS: %s Layout driver must provide "
		       "alloc_lseg and free_lseg.\n", __func__);
		return status;
	}

	spin_lock(&pnfs_spinlock);
	tmp = find_pnfs_driver_locked(ld_type->id);
	if (!tmp) {
		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
		status = 0;
		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
			ld_type->name);
	} else {
		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
			__func__, ld_type->id);
	}
	spin_unlock(&pnfs_spinlock);

	return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
	spin_lock(&pnfs_spinlock);
	list_del(&ld_type->pnfs_tblid);
	spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
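
/*
 * Illustrative sketch (not part of the original file): a layout driver
 * module typically fills in a pnfs_layoutdriver_type and registers it
 * from its module init hook, for example:
 *
 *	static struct pnfs_layoutdriver_type mydriver_type = {
 *		.id		= LAYOUT_NFSV4_1_FILES,
 *		.name		= "LAYOUT_NFSV4_1_FILES",
 *		.owner		= THIS_MODULE,
 *		.alloc_lseg	= mydriver_alloc_lseg,
 *		.free_lseg	= mydriver_free_lseg,
 *	};
 *	...
 *	return pnfs_register_layoutdriver(&mydriver_type);
 *
 * The mydriver_* names are hypothetical.  alloc_lseg and free_lseg are
 * mandatory (enforced in pnfs_register_layoutdriver above), and
 * set_pnfs_layoutdriver() loads the module on demand via
 * request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id).
 */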

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
get_layout_hdr(struct pnfs_layout_hdr *lo)
{
	atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
	return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
		kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
	put_rpccred(lo->plh_lc_cred);
	return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
}

static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
	dprintk("%s: freeing layout cache %p\n", __func__, lo);
	BUG_ON(!list_empty(&lo->plh_layouts));
	NFS_I(lo->plh_inode)->layout = NULL;
	pnfs_free_layout_hdr(lo);
}

static void
put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
	if (atomic_dec_and_test(&lo->plh_refcount))
		destroy_layout_hdr(lo);
}

void
put_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct inode *inode = lo->plh_inode;

	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
		destroy_layout_hdr(lo);
		spin_unlock(&inode->i_lock);
	}
}

static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
	INIT_LIST_HEAD(&lseg->pls_list);
	INIT_LIST_HEAD(&lseg->pls_lc_list);
	atomic_set(&lseg->pls_refcount, 1);
	smp_mb();
	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
	lseg->pls_layout = lo;
}

static void free_lseg(struct pnfs_layout_segment *lseg)
{
	struct inode *ino = lseg->pls_layout->plh_inode;

	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
	/* Matched by get_layout_hdr in pnfs_insert_layout */
	put_layout_hdr(NFS_I(ino)->layout);
}

static void
put_lseg_common(struct pnfs_layout_segment *lseg)
{
	struct inode *inode = lseg->pls_layout->plh_inode;

	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
	list_del_init(&lseg->pls_list);
	if (list_empty(&lseg->pls_layout->plh_segs)) {
		set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
		/* Matched by initial refcount set in alloc_init_layout_hdr */
		put_layout_hdr_locked(lseg->pls_layout);
	}
	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}

void
put_lseg(struct pnfs_layout_segment *lseg)
{
	struct inode *inode;

	if (!lseg)
		return;

	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
		atomic_read(&lseg->pls_refcount),
		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
	inode = lseg->pls_layout->plh_inode;
	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
		LIST_HEAD(free_me);

		put_lseg_common(lseg);
		list_add(&lseg->pls_list, &free_me);
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg_list(&free_me);
	}
}
EXPORT_SYMBOL_GPL(put_lseg);

static inline u64
end_offset(u64 start, u64 len)
{
	u64 end;

	end = start + len;
	return end >= start ? end : NFS4_MAX_UINT64;
}

/* last octet in a range */
static inline u64
last_byte_offset(u64 start, u64 len)
{
	u64 end;

	BUG_ON(!len);
	end = start + len;
	return end > start ? end - 1 : NFS4_MAX_UINT64;
}
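
/*
 * Note (illustrative, not from the original source): a range length of
 * NFS4_MAX_UINT64 means "from offset to end of file".  The sum start + len
 * then wraps, so end_offset() reports NFS4_MAX_UINT64 instead, e.g.
 *
 *	end_offset(4096, NFS4_MAX_UINT64) == NFS4_MAX_UINT64
 *	end_offset(4096, 8192)            == 12288
 *	last_byte_offset(4096, 8192)      == 12287
 *
 * so a whole-file range compares as containing/intersecting every other
 * range in the helpers below.
 */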

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static inline int
lo_seg_contained(struct pnfs_layout_range *l1,
		 struct pnfs_layout_range *l2)
{
	u64 start1 = l1->offset;
	u64 end1 = end_offset(start1, l1->length);
	u64 start2 = l2->offset;
	u64 end2 = end_offset(start2, l2->length);

	return (start1 <= start2) && (end1 >= end2);
}

/*
 * are l1 and l2 intersecting?
 *   start1                             end1
 *   [----------------------------------)
 *                              start2           end2
 *                              [----------------)
 */
static inline int
lo_seg_intersecting(struct pnfs_layout_range *l1,
		    struct pnfs_layout_range *l2)
{
	u64 start1 = l1->offset;
	u64 end1 = end_offset(start1, l1->length);
	u64 start2 = l2->offset;
	u64 end2 = end_offset(start2, l2->length);

	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
}

static bool
should_free_lseg(struct pnfs_layout_range *lseg_range,
		 struct pnfs_layout_range *recall_range)
{
	return (recall_range->iomode == IOMODE_ANY ||
		lseg_range->iomode == recall_range->iomode) &&
	       lo_seg_intersecting(lseg_range, recall_range);
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
			     struct list_head *tmp_list)
{
	int rv = 0;

	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
		/* Remove the reference keeping the lseg in the
		 * list.  It will now be removed when all
		 * outstanding io is finished.
		 */
		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
			atomic_read(&lseg->pls_refcount));
		if (atomic_dec_and_test(&lseg->pls_refcount)) {
			put_lseg_common(lseg);
			list_add(&lseg->pls_list, tmp_list);
			rv = 1;
		}
	}
	return rv;
}

/* Returns count of number of matching invalid lsegs remaining in list
 * after call.
 */
int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
			    struct list_head *tmp_list,
			    struct pnfs_layout_range *recall_range)
{
	struct pnfs_layout_segment *lseg, *next;
	int invalid = 0, removed = 0;

	dprintk("%s:Begin lo %p\n", __func__, lo);

	if (list_empty(&lo->plh_segs)) {
		/* Reset MDS Threshold I/O counters */
		NFS_I(lo->plh_inode)->write_io = 0;
		NFS_I(lo->plh_inode)->read_io = 0;
		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
			put_layout_hdr_locked(lo);
		return 0;
	}
	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
		if (!recall_range ||
		    should_free_lseg(&lseg->pls_range, recall_range)) {
			dprintk("%s: freeing lseg %p iomode %d "
				"offset %llu length %llu\n", __func__,
				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
				lseg->pls_range.length);
			invalid++;
			removed += mark_lseg_invalid(lseg, tmp_list);
		}
	dprintk("%s:Return %i\n", __func__, invalid - removed);
	return invalid - removed;
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
	struct pnfs_layout_segment *lseg, *tmp;
	struct pnfs_layout_hdr *lo;

	if (list_empty(free_me))
		return;

	lo = list_first_entry(free_me, struct pnfs_layout_segment,
			      pls_list)->pls_layout;

	if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
		struct nfs_client *clp;

		clp = NFS_SERVER(lo->plh_inode)->nfs_client;
		spin_lock(&clp->cl_lock);
		list_del_init(&lo->plh_layouts);
		spin_unlock(&clp->cl_lock);
	}
	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
		list_del(&lseg->pls_list);
		free_lseg(lseg);
	}
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
	struct pnfs_layout_hdr *lo;
	LIST_HEAD(tmp_list);

	spin_lock(&nfsi->vfs_inode.i_lock);
	lo = nfsi->layout;
	if (lo) {
		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
		mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
	}
	spin_unlock(&nfsi->vfs_inode.i_lock);
	pnfs_free_lseg_list(&tmp_list);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
	struct nfs_server *server;
	struct pnfs_layout_hdr *lo;
	LIST_HEAD(tmp_list);

	nfs4_deviceid_mark_client_invalid(clp);
	nfs4_deviceid_purge_client(clp);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (!list_empty(&server->layouts))
			list_splice_init(&server->layouts, &tmp_list);
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	while (!list_empty(&tmp_list)) {
		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
				plh_layouts);
		dprintk("%s freeing layout for inode %lu\n", __func__,
			lo->plh_inode->i_ino);
		list_del_init(&lo->plh_layouts);
		pnfs_destroy_layout(NFS_I(lo->plh_inode));
	}
}

/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
			bool update_barrier)
{
	u32 oldseq, newseq;

	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
	newseq = be32_to_cpu(new->seqid);
	if ((int)(newseq - oldseq) > 0) {
		nfs4_stateid_copy(&lo->plh_stateid, new);
		if (update_barrier) {
			u32 new_barrier = be32_to_cpu(new->seqid);

			if ((int)(new_barrier - lo->plh_barrier))
				lo->plh_barrier = new_barrier;
		} else {
			/* Because of wraparound, we want to keep the barrier
			 * "close" to the current seqids.  It needs to be
			 * within 2**31 to count as "behind", so if it
			 * gets too near that limit, give us a little leeway
			 * and bring it to within 2**30.
			 * NOTE - and yes, this is all unsigned arithmetic.
			 */
			if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
				lo->plh_barrier = newseq - (1 << 30);
		}
	}
}
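
/*
 * Worked example (illustrative, not from the original source): seqids are
 * compared via signed 32-bit differences, so ordering survives wraparound.
 * With oldseq = 0xfffffffe and newseq = 0x00000001,
 * (int)(newseq - oldseq) == 3 > 0, so the new stateid is accepted.  The
 * barrier is nudged in the same spirit: once newseq - plh_barrier exceeds
 * 3 * 2^29 (three quarters of the way to the 2^31 "behind" limit), the
 * barrier is pulled up to newseq - 2^30.
 */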

/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
			int lget)
{
	if ((stateid) &&
	    (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
		return true;
	return lo->plh_block_lgets ||
		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
		(list_empty(&lo->plh_segs) &&
		 (atomic_read(&lo->plh_outstanding) > lget));
}

int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
			      struct nfs4_state *open_state)
{
	int status = 0;

	dprintk("--> %s\n", __func__);
	spin_lock(&lo->plh_inode->i_lock);
	if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
		status = -EAGAIN;
	} else if (list_empty(&lo->plh_segs)) {
		int seq;

		do {
			seq = read_seqbegin(&open_state->seqlock);
			nfs4_stateid_copy(dst, &open_state->stateid);
		} while (read_seqretry(&open_state->seqlock, seq));
	} else
		nfs4_stateid_copy(dst, &lo->plh_stateid);
	spin_unlock(&lo->plh_inode->i_lock);
	dprintk("<-- %s\n", __func__);
	return status;
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
	       struct nfs_open_context *ctx,
	       struct pnfs_layout_range *range,
	       gfp_t gfp_flags)
{
	struct inode *ino = lo->plh_inode;
	struct nfs_server *server = NFS_SERVER(ino);
	struct nfs4_layoutget *lgp;
	struct pnfs_layout_segment *lseg = NULL;

	dprintk("--> %s\n", __func__);

	BUG_ON(ctx == NULL);
	lgp = kzalloc(sizeof(*lgp), gfp_flags);
	if (lgp == NULL)
		return NULL;

	lgp->args.minlength = PAGE_CACHE_SIZE;
	if (lgp->args.minlength > range->length)
		lgp->args.minlength = range->length;
	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
	lgp->args.range = *range;
	lgp->args.type = server->pnfs_curr_ld->id;
	lgp->args.inode = ino;
	lgp->args.ctx = get_nfs_open_context(ctx);
	lgp->lsegpp = &lseg;
	lgp->gfp_flags = gfp_flags;

	/* Synchronously retrieve layout information from server and
	 * store in lseg.
	 */
	nfs4_proc_layoutget(lgp, gfp_flags);
	if (!lseg) {
		/* remember that LAYOUTGET failed and suspend trying */
		set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
	}

	return lseg;
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
	struct pnfs_layout_hdr *lo = NULL;
	struct nfs_inode *nfsi = NFS_I(ino);
	LIST_HEAD(tmp_list);
	struct nfs4_layoutreturn *lrp;
	nfs4_stateid stateid;
	int status = 0, empty;

	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (!lo || pnfs_test_layout_returned(lo)) {
		spin_unlock(&ino->i_lock);
		dprintk("NFS: %s no layout to return\n", __func__);
		goto out;
	}
	stateid = nfsi->layout->plh_stateid;
	/* Reference matched in nfs4_layoutreturn_release */
	get_layout_hdr(lo);
	empty = list_empty(&lo->plh_segs);
	mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
	/* Don't send a LAYOUTRETURN if list was initially empty */
	if (empty) {
		spin_unlock(&ino->i_lock);
		put_layout_hdr(lo);
		dprintk("NFS: %s no layout segments to return\n", __func__);
		goto out;
	}
	lo->plh_block_lgets++;
	pnfs_mark_layout_returned(lo);
	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&tmp_list);

	WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));

	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
	if (unlikely(lrp == NULL)) {
		status = -ENOMEM;
		set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags);
		set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags);
		pnfs_clear_layout_returned(lo);
		put_layout_hdr(lo);
		goto out;
	}

	lrp->args.stateid = stateid;
	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
	lrp->args.inode = ino;
	lrp->args.layout = lo;
	lrp->clp = NFS_SERVER(ino)->nfs_client;

	status = nfs4_proc_layoutreturn(lrp);
out:
	dprintk("<-- %s status: %d\n", __func__, status);
	return status;
}
EXPORT_SYMBOL_GPL(_pnfs_return_layout);

bool pnfs_roc(struct inode *ino)
{
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg, *tmp;
	LIST_HEAD(tmp_list);
	bool found = false;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
		goto out_nolayout;
	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
			mark_lseg_invalid(lseg, &tmp_list);
			found = true;
		}
	if (!found)
		goto out_nolayout;
	lo->plh_block_lgets++;
	get_layout_hdr(lo); /* matched in pnfs_roc_release */
	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&tmp_list);
	return true;

out_nolayout:
	spin_unlock(&ino->i_lock);
	return false;
}

void pnfs_roc_release(struct inode *ino)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	lo->plh_block_lgets--;
	put_layout_hdr_locked(lo);
	spin_unlock(&ino->i_lock);
}

void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	if ((int)(barrier - lo->plh_barrier) > 0)
		lo->plh_barrier = barrier;
	spin_unlock(&ino->i_lock);
}

bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_segment *lseg;
	bool found = false;

	spin_lock(&ino->i_lock);
	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
			found = true;
			break;
		}
	if (!found) {
		struct pnfs_layout_hdr *lo = nfsi->layout;
		u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);

		/* Since close does not return a layout stateid for use as
		 * a barrier, we choose the worst-case barrier.
		 */
		*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
	}
	spin_unlock(&ino->i_lock);
	return found;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
 */
static s64
cmp_layout(struct pnfs_layout_range *l1,
	   struct pnfs_layout_range *l2)
{
	s64 d;

	/* high offset > low offset */
	d = l1->offset - l2->offset;
	if (d)
		return d;

	/* short length > long length */
	d = l2->length - l1->length;
	if (d)
		return d;

	/* read > read/write */
	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
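
/*
 * Example (illustrative, not from the original source): segments are kept
 * in ascending offset order, with longer segments before shorter ones at
 * the same offset, and RW before READ when offset and length are equal.
 * So for l1 = {offset 0, length 100, IOMODE_RW} and
 * l2 = {offset 0, length 100, IOMODE_READ}, cmp_layout(l1, l2) < 0 and
 * pnfs_insert_layout() below places the RW segment first in the cache.
 */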

static void
pnfs_insert_layout(struct pnfs_layout_hdr *lo,
		   struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_segment *lp;

	dprintk("%s:Begin\n", __func__);

	assert_spin_locked(&lo->plh_inode->i_lock);
	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
		if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
			continue;
		list_add_tail(&lseg->pls_list, &lp->pls_list);
		dprintk("%s: inserted lseg %p "
			"iomode %d offset %llu length %llu before "
			"lp %p iomode %d offset %llu length %llu\n",
			__func__, lseg, lseg->pls_range.iomode,
			lseg->pls_range.offset, lseg->pls_range.length,
			lp, lp->pls_range.iomode, lp->pls_range.offset,
			lp->pls_range.length);
		goto out;
	}
	list_add_tail(&lseg->pls_list, &lo->plh_segs);
	dprintk("%s: inserted lseg %p "
		"iomode %d offset %llu length %llu at tail\n",
		__func__, lseg, lseg->pls_range.iomode,
		lseg->pls_range.offset, lseg->pls_range.length);
out:
	get_layout_hdr(lo);

	dprintk("%s:Return\n", __func__);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
		      struct nfs_open_context *ctx,
		      gfp_t gfp_flags)
{
	struct pnfs_layout_hdr *lo;

	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
	if (!lo)
		return NULL;
	atomic_set(&lo->plh_refcount, 1);
	INIT_LIST_HEAD(&lo->plh_layouts);
	INIT_LIST_HEAD(&lo->plh_segs);
	INIT_LIST_HEAD(&lo->plh_bulk_recall);
	lo->plh_inode = ino;
	lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
	return lo;
}

static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
		       struct nfs_open_context *ctx,
		       gfp_t gfp_flags)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_hdr *new = NULL;

	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

	assert_spin_locked(&ino->i_lock);
	if (nfsi->layout) {
		if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
			return NULL;
		else
			return nfsi->layout;
	}
	spin_unlock(&ino->i_lock);
	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
	spin_lock(&ino->i_lock);

	if (likely(nfsi->layout == NULL))	/* Won the race? */
		nfsi->layout = new;
	else
		pnfs_free_layout_hdr(new);
	return nfsi->layout;
}

/*
 * iomode matching rules:
 * iomode	lseg	match
 * -----	-----	-----
 * ANY		READ	true
 * ANY		RW	true
 * RW		READ	false
 * RW		RW	true
 * READ		READ	true
 * READ		RW	true
 */
static int
is_matching_lseg(struct pnfs_layout_range *ls_range,
		 struct pnfs_layout_range *range)
{
	struct pnfs_layout_range range1;

	if ((range->iomode == IOMODE_RW &&
	     ls_range->iomode != IOMODE_RW) ||
	    !lo_seg_intersecting(ls_range, range))
		return 0;

	/* range1 covers only the first byte in the range */
	range1 = *range;
	range1.length = 1;
	return lo_seg_contained(ls_range, &range1);
}

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
	       struct pnfs_layout_range *range)
{
	struct pnfs_layout_segment *lseg, *ret = NULL;

	dprintk("%s:Begin\n", __func__);

	assert_spin_locked(&lo->plh_inode->i_lock);
	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
		    is_matching_lseg(&lseg->pls_range, range)) {
			ret = get_lseg(lseg);
			break;
		}
		if (lseg->pls_range.offset > range->offset)
			break;
	}

	dprintk("%s:Return lseg %p ref %d\n",
		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
	return ret;
}
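
/*
 * Example (illustrative, not from the original source) of the matching
 * used by pnfs_find_lseg() above: a READ request for
 * {offset 4096, length 8192} is satisfied by any cached lseg (READ or RW)
 * whose range covers byte 4096, since only the first byte of the request
 * must be contained; an RW request is only ever matched against RW lsegs.
 */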

/*
 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
 * to the MDS or over pNFS
 *
 * The nfs_inode read_io and write_io fields are cumulative counters reset
 * when there are no layout segments. Note that in pnfs_update_layout iomode
 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
 * WRITE request.
 *
 * A return of true means use MDS I/O.
 *
 * From rfc 5661:
 * If a file's size is smaller than the file size threshold, data accesses
 * SHOULD be sent to the metadata server.  If an I/O request has a length that
 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
 * server.  If both file size and I/O size are provided, the client SHOULD
 * reach or exceed both thresholds before sending its read or write
 * requests to the data server.
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
				     struct inode *ino, int iomode)
{
	struct nfs4_threshold *t = ctx->mdsthreshold;
	struct nfs_inode *nfsi = NFS_I(ino);
	loff_t fsize = i_size_read(ino);
	bool size = false, size_set = false, io = false, io_set = false, ret = false;

	if (t == NULL)
		return ret;

	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

	switch (iomode) {
	case IOMODE_READ:
		if (t->bm & THRESHOLD_RD) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->rd_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_RD_IO) {
			dprintk("%s nfsi->read_io %llu\n", __func__,
				nfsi->read_io);
			io_set = true;
			if (nfsi->read_io < t->rd_io_sz)
				io = true;
		}
		break;
	case IOMODE_RW:
		if (t->bm & THRESHOLD_WR) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->wr_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_WR_IO) {
			dprintk("%s nfsi->write_io %llu\n", __func__,
				nfsi->write_io);
			io_set = true;
			if (nfsi->write_io < t->wr_io_sz)
				io = true;
		}
		break;
	}
	if (size_set && io_set) {
		if (size && io)
			ret = true;
	} else if (size || io)
		ret = true;

	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
	return ret;
}

/*
 * Layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
		   struct nfs_open_context *ctx,
		   loff_t pos,
		   u64 count,
		   enum pnfs_iomode iomode,
		   gfp_t gfp_flags)
{
	struct pnfs_layout_range arg = {
		.iomode = iomode,
		.offset = pos,
		.length = count,
	};
	unsigned pg_offset;
	struct nfs_inode *nfsi = NFS_I(ino);
	struct nfs_server *server = NFS_SERVER(ino);
	struct nfs_client *clp = server->nfs_client;
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg = NULL;
	bool first = false;

	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
		return NULL;

	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
		return NULL;

	spin_lock(&ino->i_lock);
	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
	if (lo == NULL) {
		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
		goto out_unlock;
	}

	/* Do we even need to bother with this? */
	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
		dprintk("%s matches recall, use MDS\n", __func__);
		goto out_unlock;
	}

	/* if LAYOUTGET already failed once we don't try again */
	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
		goto out_unlock;

	/* Check to see if the layout for the given range already exists */
	lseg = pnfs_find_lseg(lo, &arg);
	if (lseg)
		goto out_unlock;

	if (pnfs_layoutgets_blocked(lo, NULL, 0))
		goto out_unlock;
	atomic_inc(&lo->plh_outstanding);

	get_layout_hdr(lo);
	if (list_empty(&lo->plh_segs))
		first = true;

	/* Enable LAYOUTRETURNs */
	pnfs_clear_layout_returned(lo);

	spin_unlock(&ino->i_lock);
	if (first) {
		/* The lo must be on the clp list if there is any
		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
		 */
		spin_lock(&clp->cl_lock);
		BUG_ON(!list_empty(&lo->plh_layouts));
		list_add_tail(&lo->plh_layouts, &server->layouts);
		spin_unlock(&clp->cl_lock);
	}

	pg_offset = arg.offset & ~PAGE_CACHE_MASK;
	if (pg_offset) {
		arg.offset -= pg_offset;
		arg.length += pg_offset;
	}
	if (arg.length != NFS4_MAX_UINT64)
		arg.length = PAGE_CACHE_ALIGN(arg.length);

	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
	if (!lseg && first) {
		spin_lock(&clp->cl_lock);
		list_del_init(&lo->plh_layouts);
		spin_unlock(&clp->cl_lock);
	}
	atomic_dec(&lo->plh_outstanding);
	put_layout_hdr(lo);
out:
	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
		nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
	return lseg;
out_unlock:
	spin_unlock(&ino->i_lock);
	goto out;
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);

int
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
	struct nfs4_layoutget_res *res = &lgp->res;
	struct pnfs_layout_segment *lseg;
	struct inode *ino = lo->plh_inode;
	int status = 0;

	/* Inject layout blob into I/O device driver */
	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
	if (!lseg || IS_ERR(lseg)) {
		if (!lseg)
			status = -ENOMEM;
		else
			status = PTR_ERR(lseg);
		dprintk("%s: Could not allocate layout: error %d\n",
			__func__, status);
		goto out;
	}

	spin_lock(&ino->i_lock);
	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
		dprintk("%s forget reply due to recall\n", __func__);
		goto out_forget_reply;
	}

	if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
		dprintk("%s forget reply due to state\n", __func__);
		goto out_forget_reply;
	}
	init_lseg(lo, lseg);
	lseg->pls_range = res->range;
	*lgp->lsegpp = get_lseg(lseg);
	pnfs_insert_layout(lo, lseg);

	if (res->return_on_close) {
		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
	}

	/* Done processing layoutget. Set the layout stateid */
	pnfs_set_layout_stateid(lo, &res->stateid, false);
	spin_unlock(&ino->i_lock);
out:
	return status;

out_forget_reply:
	spin_unlock(&ino->i_lock);
	lseg->pls_layout = lo;
	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
	goto out;
}

void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
	BUG_ON(pgio->pg_lseg != NULL);

	if (req->wb_offset != req->wb_pgbase) {
		nfs_pageio_reset_read_mds(pgio);
		return;
	}
	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
					   req->wb_context,
					   req_offset(req),
					   req->wb_bytes,
					   IOMODE_READ,
					   GFP_KERNEL);
	/* If no lseg, fall back to read through mds */
	if (pgio->pg_lseg == NULL)
		nfs_pageio_reset_read_mds(pgio);

}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);

void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
	BUG_ON(pgio->pg_lseg != NULL);

	if (req->wb_offset != req->wb_pgbase) {
		nfs_pageio_reset_write_mds(pgio);
		return;
	}
	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
					   req->wb_context,
					   req_offset(req),
					   req->wb_bytes,
					   IOMODE_RW,
					   GFP_NOFS);
	/* If no lseg, fall back to write through mds */
	if (pgio->pg_lseg == NULL)
		nfs_pageio_reset_write_mds(pgio);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);

void
pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
		      const struct nfs_pgio_completion_ops *compl_ops)
{
	struct nfs_server *server = NFS_SERVER(inode);
	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

	if (ld == NULL)
		nfs_pageio_init_read(pgio, inode, compl_ops);
	else
		nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0);
}

void
pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
		       int ioflags,
		       const struct nfs_pgio_completion_ops *compl_ops)
{
	struct nfs_server *server = NFS_SERVER(inode);
	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

	if (ld == NULL)
		nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
	else
		nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
}

bool
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		     struct nfs_page *req)
{
	if (pgio->pg_lseg == NULL)
		return nfs_generic_pg_test(pgio, prev, req);

	/*
	 * Test if a nfs_page is fully contained in the pnfs_layout_range.
	 * Note that this test makes several assumptions:
	 * - that the previous nfs_page in the struct nfs_pageio_descriptor
	 *   is known to lie within the range.
	 * - that the nfs_page being tested is known to be contiguous with the
	 *   previous nfs_page.
	 * - Layout ranges are page aligned, so we only have to test the
	 *   start offset of the request.
	 *
	 * Please also note that 'end_offset' is actually the offset of the
	 * first byte that lies outside the pnfs_layout_range. FIXME?
	 *
	 */
	return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
					    pgio->pg_lseg->pls_range.length);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

int pnfs_write_done_resend_to_mds(struct inode *inode,
				  struct list_head *head,
				  const struct nfs_pgio_completion_ops *compl_ops)
{
	struct nfs_pageio_descriptor pgio;
	LIST_HEAD(failed);

	/* Resend all requests through the MDS */
	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
	while (!list_empty(head)) {
		struct nfs_page *req = nfs_list_entry(head->next);

		nfs_list_remove_request(req);
		if (!nfs_pageio_add_request(&pgio, req))
			nfs_list_add_request(req, &failed);
	}
	nfs_pageio_complete(&pgio);

	if (!list_empty(&failed)) {
		/* For some reason our attempt to resend pages failed.  Mark
		 * the overall send request as having failed, and let
		 * nfs_writeback_release_full deal with the error.
		 */
		list_move(&failed, head);
		return -EIO;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);

static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR) {
		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
		pnfs_return_layout(hdr->inode);
	}
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
							&hdr->pages,
							hdr->completion_ops);
}

/*
 * Called by non rpc-based layout drivers
 */
void pnfs_ld_write_done(struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (!hdr->pnfs_error) {
		pnfs_set_layoutcommit(data);
		hdr->mds_ops->rpc_call_done(&data->task, data);
	} else
		pnfs_ld_handle_write_error(data);
	hdr->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);

static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
		struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		list_splice_tail_init(&hdr->pages, &desc->pg_list);
		nfs_pageio_reset_write_mds(desc);
		desc->pg_recoalesce = 1;
	}
	nfs_writedata_release(data);
}

static enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_write_data *wdata,
			const struct rpc_call_ops *call_ops,
			struct pnfs_layout_segment *lseg,
			int how)
{
	struct nfs_pgio_header *hdr = wdata->header;
	struct inode *inode = hdr->inode;
	enum pnfs_try_status trypnfs;
	struct nfs_server *nfss = NFS_SERVER(inode);

	hdr->mds_ops = call_ops;

	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
		inode->i_ino, wdata->args.count, wdata->args.offset, how);
	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
	if (trypnfs != PNFS_NOT_ATTEMPTED)
		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
	return trypnfs;
}

static void
pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
{
	struct nfs_write_data *data;
	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
	struct pnfs_layout_segment *lseg = desc->pg_lseg;

	desc->pg_lseg = NULL;
	while (!list_empty(head)) {
		enum pnfs_try_status trypnfs;

		data = list_first_entry(head, struct nfs_write_data, list);
		list_del_init(&data->list);

		trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
		if (trypnfs == PNFS_NOT_ATTEMPTED)
			pnfs_write_through_mds(desc, data);
	}
	put_lseg(lseg);
}

static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
{
	put_lseg(hdr->lseg);
	nfs_writehdr_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_writehdr_free);

int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
	struct nfs_write_header *whdr;
	struct nfs_pgio_header *hdr;
	int ret;

	whdr = nfs_writehdr_alloc();
	if (!whdr) {
		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
		put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
		return -ENOMEM;
	}
	hdr = &whdr->header;
	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
	hdr->lseg = get_lseg(desc->pg_lseg);
	atomic_inc(&hdr->refcnt);
	ret = nfs_generic_flush(desc, hdr);
	if (ret != 0) {
		put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
	} else
		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
	if (atomic_dec_and_test(&hdr->refcnt))
		hdr->completion_ops->completion(hdr);
	return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);

int pnfs_read_done_resend_to_mds(struct inode *inode,
				 struct list_head *head,
				 const struct nfs_pgio_completion_ops *compl_ops)
{
	struct nfs_pageio_descriptor pgio;
	LIST_HEAD(failed);

	/* Resend all requests through the MDS */
	nfs_pageio_init_read(&pgio, inode, compl_ops);
	while (!list_empty(head)) {
		struct nfs_page *req = nfs_list_entry(head->next);

		nfs_list_remove_request(req);
		if (!nfs_pageio_add_request(&pgio, req))
			nfs_list_add_request(req, &failed);
	}
	nfs_pageio_complete(&pgio);

	if (!list_empty(&failed)) {
		list_move(&failed, head);
		return -EIO;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);

static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR) {
		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
		pnfs_return_layout(hdr->inode);
	}
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
							&hdr->pages,
							hdr->completion_ops);
}

/*
 * Called by non rpc-based layout drivers
 */
void pnfs_ld_read_done(struct nfs_read_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (likely(!hdr->pnfs_error)) {
		__nfs4_read_done_cb(data);
		hdr->mds_ops->rpc_call_done(&data->task, data);
	} else
		pnfs_ld_handle_read_error(data);
	hdr->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);

static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
		struct nfs_read_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		list_splice_tail_init(&hdr->pages, &desc->pg_list);
		nfs_pageio_reset_read_mds(desc);
		desc->pg_recoalesce = 1;
	}
	nfs_readdata_release(data);
}

/*
 * Call the appropriate parallel I/O subsystem read function.
 */
static enum pnfs_try_status
pnfs_try_to_read_data(struct nfs_read_data *rdata,
		       const struct rpc_call_ops *call_ops,
		       struct pnfs_layout_segment *lseg)
{
	struct nfs_pgio_header *hdr = rdata->header;
	struct inode *inode = hdr->inode;
	struct nfs_server *nfss = NFS_SERVER(inode);
	enum pnfs_try_status trypnfs;

	hdr->mds_ops = call_ops;

	dprintk("%s: Reading ino:%lu %u@%llu\n",
		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);

	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
	if (trypnfs != PNFS_NOT_ATTEMPTED)
		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
	return trypnfs;
}

static void
pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
{
	struct nfs_read_data *data;
	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
	struct pnfs_layout_segment *lseg = desc->pg_lseg;

	desc->pg_lseg = NULL;
	while (!list_empty(head)) {
		enum pnfs_try_status trypnfs;

		data = list_first_entry(head, struct nfs_read_data, list);
		list_del_init(&data->list);

		trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
		if (trypnfs == PNFS_NOT_ATTEMPTED)
			pnfs_read_through_mds(desc, data);
	}
	put_lseg(lseg);
}

static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
	put_lseg(hdr->lseg);
	nfs_readhdr_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_readhdr_free);

int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
	struct nfs_read_header *rhdr;
	struct nfs_pgio_header *hdr;
	int ret;

	rhdr = nfs_readhdr_alloc();
	if (!rhdr) {
		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
		ret = -ENOMEM;
		put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
		return ret;
	}
	hdr = &rhdr->header;
	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
	hdr->lseg = get_lseg(desc->pg_lseg);
	atomic_inc(&hdr->refcnt);
	ret = nfs_generic_pagein(desc, hdr);
	if (ret != 0) {
		put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
	} else
		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
	if (atomic_dec_and_test(&hdr->refcnt))
		hdr->completion_ops->completion(hdr);
	return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);

/*
 * There can be multiple RW segments.
 */
static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
{
	struct pnfs_layout_segment *lseg;

	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
		if (lseg->pls_range.iomode == IOMODE_RW &&
		    test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
			list_add(&lseg->pls_lc_list, listp);
	}
}

void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
{
	if (lseg->pls_range.iomode == IOMODE_RW) {
		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
	} else {
		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
	}
}
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);

void
pnfs_set_layoutcommit(struct nfs_write_data *wdata)
{
	struct nfs_pgio_header *hdr = wdata->header;
	struct inode *inode = hdr->inode;
	struct nfs_inode *nfsi = NFS_I(inode);
	loff_t end_pos = wdata->mds_offset + wdata->res.count;
	bool mark_as_dirty = false;

	spin_lock(&inode->i_lock);
	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
		mark_as_dirty = true;
		dprintk("%s: Set layoutcommit for inode %lu ",
			__func__, inode->i_ino);
	}
	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
		/* references matched in nfs4_layoutcommit_release */
		get_lseg(hdr->lseg);
	}
	if (end_pos > nfsi->layout->plh_lwb)
		nfsi->layout->plh_lwb = end_pos;
	spin_unlock(&inode->i_lock);
	dprintk("%s: lseg %p end_pos %llu\n",
		__func__, hdr->lseg, nfsi->layout->plh_lwb);

	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
	if (mark_as_dirty)
		mark_inode_dirty_sync(inode);
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
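
/*
 * Worked example (illustrative, not from the original source): a write of
 * res.count = 4096 bytes at mds_offset = 8192 yields end_pos = 12288, so
 * plh_lwb ("last write byte") grows to 12288 unless it is already larger.
 * pnfs_layoutcommit_inode() below then sends
 * args.lastbytewritten = plh_lwb - 1 = 12287, the offset of the last byte
 * actually written through the layout.
 */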

void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
	struct nfs_server *nfss = NFS_SERVER(data->args.inode);

	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
}

/*
 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
 * data to disk to allow the server to recover the data if it crashes.
 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
 * is off, and a COMMIT is sent to a data server, or
 * if WRITEs to a data server return NFS_DATA_SYNC.
 */
int
pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
	struct nfs4_layoutcommit_data *data;
	struct nfs_inode *nfsi = NFS_I(inode);
	loff_t end_pos;
	int status = 0;

	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);

	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		return 0;

	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
	data = kzalloc(sizeof(*data), GFP_NOFS);
	if (!data) {
		status = -ENOMEM;
		goto out;
	}

	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		goto out_free;

	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
		if (!sync) {
			status = -EAGAIN;
			goto out_free;
		}
		status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
					nfs_wait_bit_killable, TASK_KILLABLE);
		if (status)
			goto out_free;
	}

	INIT_LIST_HEAD(&data->lseg_list);
	spin_lock(&inode->i_lock);
	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
		clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
		spin_unlock(&inode->i_lock);
		wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
		goto out_free;
	}

	pnfs_list_write_lseg(inode, &data->lseg_list);

	end_pos = nfsi->layout->plh_lwb;
	nfsi->layout->plh_lwb = 0;

	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
	spin_unlock(&inode->i_lock);

	data->args.inode = inode;
	data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
	nfs_fattr_init(&data->fattr);
	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
	data->res.fattr = &data->fattr;
	data->args.lastbytewritten = end_pos - 1;
	data->res.server = NFS_SERVER(inode);

	status = nfs4_proc_layoutcommit(data, sync);
out:
	if (status)
		mark_inode_dirty_sync(inode);
	dprintk("<-- %s status %d\n", __func__, status);
	return status;
out_free:
	kfree(data);
	goto out;
}

struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
	struct nfs4_threshold *thp;

	thp = kzalloc(sizeof(*thp), GFP_NOFS);
	if (!thp) {
		dprintk("%s mdsthreshold allocation failed\n", __func__);
		return NULL;
	}
	return thp;
}