1 /* 2 * pNFS functions to call and manage layout drivers. 3 * 4 * Copyright (c) 2002 [year of first publication] 5 * The Regents of the University of Michigan 6 * All Rights Reserved 7 * 8 * Dean Hildebrand <dhildebz@umich.edu> 9 * 10 * Permission is granted to use, copy, create derivative works, and 11 * redistribute this software and such derivative works for any purpose, 12 * so long as the name of the University of Michigan is not used in 13 * any advertising or publicity pertaining to the use or distribution 14 * of this software without specific, written prior authorization. If 15 * the above copyright notice or any other identification of the 16 * University of Michigan is included in any copy of any portion of 17 * this software, then the disclaimer below must also be included. 18 * 19 * This software is provided as is, without representation or warranty 20 * of any kind either express or implied, including without limitation 21 * the implied warranties of merchantability, fitness for a particular 22 * purpose, or noninfringement. The Regents of the University of 23 * Michigan shall not be liable for any damages, including special, 24 * indirect, incidental, or consequential damages, with respect to any 25 * claim arising out of or in connection with the use of the software, 26 * even if it has been or is hereafter advised of the possibility of 27 * such damages. 28 */ 29 30 #include <linux/nfs_fs.h> 31 #include <linux/nfs_page.h> 32 #include <linux/module.h> 33 #include "internal.h" 34 #include "pnfs.h" 35 #include "iostat.h" 36 #include "nfs4trace.h" 37 #include "delegation.h" 38 39 #define NFSDBG_FACILITY NFSDBG_PNFS 40 #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 41 42 /* Locking: 43 * 44 * pnfs_spinlock: 45 * protects pnfs_modules_tbl. 46 */ 47 static DEFINE_SPINLOCK(pnfs_spinlock); 48 49 /* 50 * pnfs_modules_tbl holds all pnfs modules 51 */ 52 static LIST_HEAD(pnfs_modules_tbl); 53 54 static int 55 pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, 56 enum pnfs_iomode iomode, bool sync); 57 58 /* Return the registered pnfs layout driver module matching given id */ 59 static struct pnfs_layoutdriver_type * 60 find_pnfs_driver_locked(u32 id) 61 { 62 struct pnfs_layoutdriver_type *local; 63 64 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) 65 if (local->id == id) 66 goto out; 67 local = NULL; 68 out: 69 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); 70 return local; 71 } 72 73 static struct pnfs_layoutdriver_type * 74 find_pnfs_driver(u32 id) 75 { 76 struct pnfs_layoutdriver_type *local; 77 78 spin_lock(&pnfs_spinlock); 79 local = find_pnfs_driver_locked(id); 80 if (local != NULL && !try_module_get(local->owner)) { 81 dprintk("%s: Could not grab reference on module\n", __func__); 82 local = NULL; 83 } 84 spin_unlock(&pnfs_spinlock); 85 return local; 86 } 87 88 void 89 unset_pnfs_layoutdriver(struct nfs_server *nfss) 90 { 91 if (nfss->pnfs_curr_ld) { 92 if (nfss->pnfs_curr_ld->clear_layoutdriver) 93 nfss->pnfs_curr_ld->clear_layoutdriver(nfss); 94 /* Decrement the MDS count. Purge the deviceid cache if zero */ 95 if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count)) 96 nfs4_deviceid_purge_client(nfss->nfs_client); 97 module_put(nfss->pnfs_curr_ld->owner); 98 } 99 nfss->pnfs_curr_ld = NULL; 100 } 101 102 /* 103 * Try to set the server's pnfs module to the pnfs layout type specified by id. 104 * Currently only one pNFS layout driver per filesystem is supported. 105 * 106 * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 107 */ 108 void 109 set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, 110 u32 id) 111 { 112 struct pnfs_layoutdriver_type *ld_type = NULL; 113 114 if (id == 0) 115 goto out_no_driver; 116 if (!(server->nfs_client->cl_exchange_flags & 117 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { 118 printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", 119 __func__, id, server->nfs_client->cl_exchange_flags); 120 goto out_no_driver; 121 } 122 ld_type = find_pnfs_driver(id); 123 if (!ld_type) { 124 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); 125 ld_type = find_pnfs_driver(id); 126 if (!ld_type) { 127 dprintk("%s: No pNFS module found for %u.\n", 128 __func__, id); 129 goto out_no_driver; 130 } 131 } 132 server->pnfs_curr_ld = ld_type; 133 if (ld_type->set_layoutdriver 134 && ld_type->set_layoutdriver(server, mntfh)) { 135 printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " 136 "driver %u.\n", __func__, id); 137 module_put(ld_type->owner); 138 goto out_no_driver; 139 } 140 /* Bump the MDS count */ 141 atomic_inc(&server->nfs_client->cl_mds_count); 142 143 dprintk("%s: pNFS module for %u set\n", __func__, id); 144 return; 145 146 out_no_driver: 147 dprintk("%s: Using NFSv4 I/O\n", __func__); 148 server->pnfs_curr_ld = NULL; 149 } 150 151 int 152 pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 153 { 154 int status = -EINVAL; 155 struct pnfs_layoutdriver_type *tmp; 156 157 if (ld_type->id == 0) { 158 printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__); 159 return status; 160 } 161 if (!ld_type->alloc_lseg || !ld_type->free_lseg) { 162 printk(KERN_ERR "NFS: %s Layout driver must provide " 163 "alloc_lseg and free_lseg.\n", __func__); 164 return status; 165 } 166 167 spin_lock(&pnfs_spinlock); 168 tmp = find_pnfs_driver_locked(ld_type->id); 169 if (!tmp) { 170 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); 171 status = 0; 172 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, 173 ld_type->name); 174 } else { 175 printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n", 176 __func__, ld_type->id); 177 } 178 spin_unlock(&pnfs_spinlock); 179 180 return status; 181 } 182 EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); 183 184 void 185 pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 186 { 187 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); 188 spin_lock(&pnfs_spinlock); 189 list_del(&ld_type->pnfs_tblid); 190 spin_unlock(&pnfs_spinlock); 191 } 192 EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); 193 194 /* 195 * pNFS client layout cache 196 */ 197 198 /* Need to hold i_lock if caller does not already hold reference */ 199 void 200 pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) 201 { 202 atomic_inc(&lo->plh_refcount); 203 } 204 205 static struct pnfs_layout_hdr * 206 pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) 207 { 208 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 209 return ld->alloc_layout_hdr(ino, gfp_flags); 210 } 211 212 static void 213 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 214 { 215 struct nfs_server *server = NFS_SERVER(lo->plh_inode); 216 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; 217 218 if (!list_empty(&lo->plh_layouts)) { 219 struct nfs_client *clp = server->nfs_client; 220 221 spin_lock(&clp->cl_lock); 222 list_del_init(&lo->plh_layouts); 223 spin_unlock(&clp->cl_lock); 224 } 225 put_rpccred(lo->plh_lc_cred); 226 return ld->free_layout_hdr(lo); 227 } 228 229 static void 230 pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) 231 { 232 struct nfs_inode *nfsi = NFS_I(lo->plh_inode); 233 dprintk("%s: freeing layout cache %p\n", __func__, lo); 234 nfsi->layout = NULL; 235 /* Reset MDS Threshold I/O counters */ 236 nfsi->write_io = 0; 237 nfsi->read_io = 0; 238 } 239 240 void 241 pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) 242 { 243 struct inode *inode = lo->plh_inode; 244 245 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 246 if (!list_empty(&lo->plh_segs)) 247 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); 248 pnfs_detach_layout_hdr(lo); 249 spin_unlock(&inode->i_lock); 250 pnfs_free_layout_hdr(lo); 251 } 252 } 253 254 static int 255 pnfs_iomode_to_fail_bit(u32 iomode) 256 { 257 return iomode == IOMODE_RW ? 258 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 259 } 260 261 static void 262 pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) 263 { 264 lo->plh_retry_timestamp = jiffies; 265 if (!test_and_set_bit(fail_bit, &lo->plh_flags)) 266 atomic_inc(&lo->plh_refcount); 267 } 268 269 static void 270 pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) 271 { 272 if (test_and_clear_bit(fail_bit, &lo->plh_flags)) 273 atomic_dec(&lo->plh_refcount); 274 } 275 276 static void 277 pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) 278 { 279 struct inode *inode = lo->plh_inode; 280 struct pnfs_layout_range range = { 281 .iomode = iomode, 282 .offset = 0, 283 .length = NFS4_MAX_UINT64, 284 }; 285 LIST_HEAD(head); 286 287 spin_lock(&inode->i_lock); 288 pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 289 pnfs_mark_matching_lsegs_invalid(lo, &head, &range); 290 spin_unlock(&inode->i_lock); 291 pnfs_free_lseg_list(&head); 292 dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, 293 iomode == IOMODE_RW ? "RW" : "READ"); 294 } 295 296 static bool 297 pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) 298 { 299 unsigned long start, end; 300 int fail_bit = pnfs_iomode_to_fail_bit(iomode); 301 302 if (test_bit(fail_bit, &lo->plh_flags) == 0) 303 return false; 304 end = jiffies; 305 start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; 306 if (!time_in_range(lo->plh_retry_timestamp, start, end)) { 307 /* It is time to retry the failed layoutgets */ 308 pnfs_layout_clear_fail_bit(lo, fail_bit); 309 return false; 310 } 311 return true; 312 } 313 314 static void 315 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 316 { 317 INIT_LIST_HEAD(&lseg->pls_list); 318 INIT_LIST_HEAD(&lseg->pls_lc_list); 319 atomic_set(&lseg->pls_refcount, 1); 320 smp_mb(); 321 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 322 lseg->pls_layout = lo; 323 } 324 325 static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) 326 { 327 struct inode *ino = lseg->pls_layout->plh_inode; 328 329 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 330 } 331 332 static void 333 pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, 334 struct pnfs_layout_segment *lseg) 335 { 336 struct inode *inode = lo->plh_inode; 337 338 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 339 list_del_init(&lseg->pls_list); 340 /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ 341 atomic_dec(&lo->plh_refcount); 342 if (list_empty(&lo->plh_segs)) 343 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 344 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 345 } 346 347 /* Return true if layoutreturn is needed */ 348 static bool 349 pnfs_layout_need_return(struct pnfs_layout_hdr *lo, 350 struct pnfs_layout_segment *lseg) 351 { 352 struct pnfs_layout_segment *s; 353 354 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) 355 return false; 356 357 list_for_each_entry(s, &lo->plh_segs, pls_list) 358 if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) 359 return false; 360 361 return true; 362 } 363 364 static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, 365 struct pnfs_layout_hdr *lo, struct inode *inode) 366 { 367 lo = lseg->pls_layout; 368 inode = lo->plh_inode; 369 370 spin_lock(&inode->i_lock); 371 if (pnfs_layout_need_return(lo, lseg)) { 372 nfs4_stateid stateid; 373 enum pnfs_iomode iomode; 374 375 stateid = lo->plh_stateid; 376 iomode = lo->plh_return_iomode; 377 /* decreased in pnfs_send_layoutreturn() */ 378 lo->plh_block_lgets++; 379 lo->plh_return_iomode = 0; 380 spin_unlock(&inode->i_lock); 381 pnfs_get_layout_hdr(lo); 382 383 /* Send an async layoutreturn so we dont deadlock */ 384 pnfs_send_layoutreturn(lo, stateid, iomode, false); 385 } else 386 spin_unlock(&inode->i_lock); 387 } 388 389 void 390 pnfs_put_lseg(struct pnfs_layout_segment *lseg) 391 { 392 struct pnfs_layout_hdr *lo; 393 struct inode *inode; 394 395 if (!lseg) 396 return; 397 398 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 399 atomic_read(&lseg->pls_refcount), 400 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 401 402 /* Handle the case where refcount != 1 */ 403 if (atomic_add_unless(&lseg->pls_refcount, -1, 1)) 404 return; 405 406 lo = lseg->pls_layout; 407 inode = lo->plh_inode; 408 /* Do we need a layoutreturn? */ 409 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) 410 pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); 411 412 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 413 pnfs_get_layout_hdr(lo); 414 pnfs_layout_remove_lseg(lo, lseg); 415 spin_unlock(&inode->i_lock); 416 pnfs_free_lseg(lseg); 417 pnfs_put_layout_hdr(lo); 418 } 419 } 420 EXPORT_SYMBOL_GPL(pnfs_put_lseg); 421 422 static void pnfs_free_lseg_async_work(struct work_struct *work) 423 { 424 struct pnfs_layout_segment *lseg; 425 struct pnfs_layout_hdr *lo; 426 427 lseg = container_of(work, struct pnfs_layout_segment, pls_work); 428 lo = lseg->pls_layout; 429 430 pnfs_free_lseg(lseg); 431 pnfs_put_layout_hdr(lo); 432 } 433 434 static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) 435 { 436 INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); 437 schedule_work(&lseg->pls_work); 438 } 439 440 void 441 pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) 442 { 443 if (!lseg) 444 return; 445 446 assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); 447 448 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 449 atomic_read(&lseg->pls_refcount), 450 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 451 if (atomic_dec_and_test(&lseg->pls_refcount)) { 452 struct pnfs_layout_hdr *lo = lseg->pls_layout; 453 pnfs_get_layout_hdr(lo); 454 pnfs_layout_remove_lseg(lo, lseg); 455 pnfs_free_lseg_async(lseg); 456 } 457 } 458 EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); 459 460 static u64 461 end_offset(u64 start, u64 len) 462 { 463 u64 end; 464 465 end = start + len; 466 return end >= start ? end : NFS4_MAX_UINT64; 467 } 468 469 /* 470 * is l2 fully contained in l1? 471 * start1 end1 472 * [----------------------------------) 473 * start2 end2 474 * [----------------) 475 */ 476 static bool 477 pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, 478 const struct pnfs_layout_range *l2) 479 { 480 u64 start1 = l1->offset; 481 u64 end1 = end_offset(start1, l1->length); 482 u64 start2 = l2->offset; 483 u64 end2 = end_offset(start2, l2->length); 484 485 return (start1 <= start2) && (end1 >= end2); 486 } 487 488 /* 489 * is l1 and l2 intersecting? 490 * start1 end1 491 * [----------------------------------) 492 * start2 end2 493 * [----------------) 494 */ 495 static bool 496 pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, 497 const struct pnfs_layout_range *l2) 498 { 499 u64 start1 = l1->offset; 500 u64 end1 = end_offset(start1, l1->length); 501 u64 start2 = l2->offset; 502 u64 end2 = end_offset(start2, l2->length); 503 504 return (end1 == NFS4_MAX_UINT64 || end1 > start2) && 505 (end2 == NFS4_MAX_UINT64 || end2 > start1); 506 } 507 508 static bool 509 should_free_lseg(const struct pnfs_layout_range *lseg_range, 510 const struct pnfs_layout_range *recall_range) 511 { 512 return (recall_range->iomode == IOMODE_ANY || 513 lseg_range->iomode == recall_range->iomode) && 514 pnfs_lseg_range_intersecting(lseg_range, recall_range); 515 } 516 517 static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, 518 struct list_head *tmp_list) 519 { 520 if (!atomic_dec_and_test(&lseg->pls_refcount)) 521 return false; 522 pnfs_layout_remove_lseg(lseg->pls_layout, lseg); 523 list_add(&lseg->pls_list, tmp_list); 524 return true; 525 } 526 527 /* Returns 1 if lseg is removed from list, 0 otherwise */ 528 static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, 529 struct list_head *tmp_list) 530 { 531 int rv = 0; 532 533 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { 534 /* Remove the reference keeping the lseg in the 535 * list. It will now be removed when all 536 * outstanding io is finished. 537 */ 538 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 539 atomic_read(&lseg->pls_refcount)); 540 if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) 541 rv = 1; 542 } 543 return rv; 544 } 545 546 /* Returns count of number of matching invalid lsegs remaining in list 547 * after call. 548 */ 549 int 550 pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 551 struct list_head *tmp_list, 552 struct pnfs_layout_range *recall_range) 553 { 554 struct pnfs_layout_segment *lseg, *next; 555 int invalid = 0, removed = 0; 556 557 dprintk("%s:Begin lo %p\n", __func__, lo); 558 559 if (list_empty(&lo->plh_segs)) 560 return 0; 561 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 562 if (!recall_range || 563 should_free_lseg(&lseg->pls_range, recall_range)) { 564 dprintk("%s: freeing lseg %p iomode %d " 565 "offset %llu length %llu\n", __func__, 566 lseg, lseg->pls_range.iomode, lseg->pls_range.offset, 567 lseg->pls_range.length); 568 invalid++; 569 removed += mark_lseg_invalid(lseg, tmp_list); 570 } 571 dprintk("%s:Return %i\n", __func__, invalid - removed); 572 return invalid - removed; 573 } 574 575 /* note free_me must contain lsegs from a single layout_hdr */ 576 void 577 pnfs_free_lseg_list(struct list_head *free_me) 578 { 579 struct pnfs_layout_segment *lseg, *tmp; 580 581 if (list_empty(free_me)) 582 return; 583 584 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 585 list_del(&lseg->pls_list); 586 pnfs_free_lseg(lseg); 587 } 588 } 589 590 void 591 pnfs_destroy_layout(struct nfs_inode *nfsi) 592 { 593 struct pnfs_layout_hdr *lo; 594 LIST_HEAD(tmp_list); 595 596 spin_lock(&nfsi->vfs_inode.i_lock); 597 lo = nfsi->layout; 598 if (lo) { 599 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 600 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 601 pnfs_get_layout_hdr(lo); 602 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 603 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 604 pnfs_clear_retry_layoutget(lo); 605 spin_unlock(&nfsi->vfs_inode.i_lock); 606 pnfs_free_lseg_list(&tmp_list); 607 pnfs_put_layout_hdr(lo); 608 } else 609 spin_unlock(&nfsi->vfs_inode.i_lock); 610 } 611 EXPORT_SYMBOL_GPL(pnfs_destroy_layout); 612 613 static bool 614 pnfs_layout_add_bulk_destroy_list(struct inode *inode, 615 struct list_head *layout_list) 616 { 617 struct pnfs_layout_hdr *lo; 618 bool ret = false; 619 620 spin_lock(&inode->i_lock); 621 lo = NFS_I(inode)->layout; 622 if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { 623 pnfs_get_layout_hdr(lo); 624 list_add(&lo->plh_bulk_destroy, layout_list); 625 ret = true; 626 } 627 spin_unlock(&inode->i_lock); 628 return ret; 629 } 630 631 /* Caller must hold rcu_read_lock and clp->cl_lock */ 632 static int 633 pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, 634 struct nfs_server *server, 635 struct list_head *layout_list) 636 { 637 struct pnfs_layout_hdr *lo, *next; 638 struct inode *inode; 639 640 list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { 641 inode = igrab(lo->plh_inode); 642 if (inode == NULL) 643 continue; 644 list_del_init(&lo->plh_layouts); 645 if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) 646 continue; 647 rcu_read_unlock(); 648 spin_unlock(&clp->cl_lock); 649 iput(inode); 650 spin_lock(&clp->cl_lock); 651 rcu_read_lock(); 652 return -EAGAIN; 653 } 654 return 0; 655 } 656 657 static int 658 pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, 659 bool is_bulk_recall) 660 { 661 struct pnfs_layout_hdr *lo; 662 struct inode *inode; 663 struct pnfs_layout_range range = { 664 .iomode = IOMODE_ANY, 665 .offset = 0, 666 .length = NFS4_MAX_UINT64, 667 }; 668 LIST_HEAD(lseg_list); 669 int ret = 0; 670 671 while (!list_empty(layout_list)) { 672 lo = list_entry(layout_list->next, struct pnfs_layout_hdr, 673 plh_bulk_destroy); 674 dprintk("%s freeing layout for inode %lu\n", __func__, 675 lo->plh_inode->i_ino); 676 inode = lo->plh_inode; 677 678 pnfs_layoutcommit_inode(inode, false); 679 680 spin_lock(&inode->i_lock); 681 list_del_init(&lo->plh_bulk_destroy); 682 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 683 if (is_bulk_recall) 684 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 685 if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) 686 ret = -EAGAIN; 687 spin_unlock(&inode->i_lock); 688 pnfs_free_lseg_list(&lseg_list); 689 pnfs_put_layout_hdr(lo); 690 iput(inode); 691 } 692 return ret; 693 } 694 695 int 696 pnfs_destroy_layouts_byfsid(struct nfs_client *clp, 697 struct nfs_fsid *fsid, 698 bool is_recall) 699 { 700 struct nfs_server *server; 701 LIST_HEAD(layout_list); 702 703 spin_lock(&clp->cl_lock); 704 rcu_read_lock(); 705 restart: 706 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 707 if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) 708 continue; 709 if (pnfs_layout_bulk_destroy_byserver_locked(clp, 710 server, 711 &layout_list) != 0) 712 goto restart; 713 } 714 rcu_read_unlock(); 715 spin_unlock(&clp->cl_lock); 716 717 if (list_empty(&layout_list)) 718 return 0; 719 return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); 720 } 721 722 int 723 pnfs_destroy_layouts_byclid(struct nfs_client *clp, 724 bool is_recall) 725 { 726 struct nfs_server *server; 727 LIST_HEAD(layout_list); 728 729 spin_lock(&clp->cl_lock); 730 rcu_read_lock(); 731 restart: 732 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 733 if (pnfs_layout_bulk_destroy_byserver_locked(clp, 734 server, 735 &layout_list) != 0) 736 goto restart; 737 } 738 rcu_read_unlock(); 739 spin_unlock(&clp->cl_lock); 740 741 if (list_empty(&layout_list)) 742 return 0; 743 return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); 744 } 745 746 /* 747 * Called by the state manger to remove all layouts established under an 748 * expired lease. 749 */ 750 void 751 pnfs_destroy_all_layouts(struct nfs_client *clp) 752 { 753 nfs4_deviceid_mark_client_invalid(clp); 754 nfs4_deviceid_purge_client(clp); 755 756 pnfs_destroy_layouts_byclid(clp, false); 757 } 758 759 /* 760 * Compare 2 layout stateid sequence ids, to see which is newer, 761 * taking into account wraparound issues. 762 */ 763 static bool pnfs_seqid_is_newer(u32 s1, u32 s2) 764 { 765 return (s32)(s1 - s2) > 0; 766 } 767 768 /* update lo->plh_stateid with new if is more recent */ 769 void 770 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 771 bool update_barrier) 772 { 773 u32 oldseq, newseq, new_barrier; 774 int empty = list_empty(&lo->plh_segs); 775 776 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 777 newseq = be32_to_cpu(new->seqid); 778 if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { 779 nfs4_stateid_copy(&lo->plh_stateid, new); 780 if (update_barrier) { 781 new_barrier = be32_to_cpu(new->seqid); 782 } else { 783 /* Because of wraparound, we want to keep the barrier 784 * "close" to the current seqids. 785 */ 786 new_barrier = newseq - atomic_read(&lo->plh_outstanding); 787 } 788 if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) 789 lo->plh_barrier = new_barrier; 790 } 791 } 792 793 static bool 794 pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, 795 const nfs4_stateid *stateid) 796 { 797 u32 seqid = be32_to_cpu(stateid->seqid); 798 799 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 800 } 801 802 static bool 803 pnfs_layout_returning(const struct pnfs_layout_hdr *lo, 804 struct pnfs_layout_range *range) 805 { 806 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && 807 (lo->plh_return_iomode == IOMODE_ANY || 808 lo->plh_return_iomode == range->iomode); 809 } 810 811 /* lget is set to 1 if called from inside send_layoutget call chain */ 812 static bool 813 pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, 814 struct pnfs_layout_range *range, int lget) 815 { 816 return lo->plh_block_lgets || 817 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 818 (list_empty(&lo->plh_segs) && 819 (atomic_read(&lo->plh_outstanding) > lget)) || 820 pnfs_layout_returning(lo, range); 821 } 822 823 int 824 pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 825 struct pnfs_layout_range *range, 826 struct nfs4_state *open_state) 827 { 828 int status = 0; 829 830 dprintk("--> %s\n", __func__); 831 spin_lock(&lo->plh_inode->i_lock); 832 if (pnfs_layoutgets_blocked(lo, range, 1)) { 833 status = -EAGAIN; 834 } else if (!nfs4_valid_open_stateid(open_state)) { 835 status = -EBADF; 836 } else if (list_empty(&lo->plh_segs) || 837 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { 838 int seq; 839 840 do { 841 seq = read_seqbegin(&open_state->seqlock); 842 nfs4_stateid_copy(dst, &open_state->stateid); 843 } while (read_seqretry(&open_state->seqlock, seq)); 844 } else 845 nfs4_stateid_copy(dst, &lo->plh_stateid); 846 spin_unlock(&lo->plh_inode->i_lock); 847 dprintk("<-- %s\n", __func__); 848 return status; 849 } 850 851 /* 852 * Get layout from server. 853 * for now, assume that whole file layouts are requested. 854 * arg->offset: 0 855 * arg->length: all ones 856 */ 857 static struct pnfs_layout_segment * 858 send_layoutget(struct pnfs_layout_hdr *lo, 859 struct nfs_open_context *ctx, 860 struct pnfs_layout_range *range, 861 gfp_t gfp_flags) 862 { 863 struct inode *ino = lo->plh_inode; 864 struct nfs_server *server = NFS_SERVER(ino); 865 struct nfs4_layoutget *lgp; 866 struct pnfs_layout_segment *lseg; 867 868 dprintk("--> %s\n", __func__); 869 870 lgp = kzalloc(sizeof(*lgp), gfp_flags); 871 if (lgp == NULL) 872 return NULL; 873 874 lgp->args.minlength = PAGE_CACHE_SIZE; 875 if (lgp->args.minlength > range->length) 876 lgp->args.minlength = range->length; 877 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 878 lgp->args.range = *range; 879 lgp->args.type = server->pnfs_curr_ld->id; 880 lgp->args.inode = ino; 881 lgp->args.ctx = get_nfs_open_context(ctx); 882 lgp->gfp_flags = gfp_flags; 883 lgp->cred = lo->plh_lc_cred; 884 885 /* Synchronously retrieve layout information from server and 886 * store in lseg. 887 */ 888 lseg = nfs4_proc_layoutget(lgp, gfp_flags); 889 if (IS_ERR(lseg)) { 890 switch (PTR_ERR(lseg)) { 891 case -ENOMEM: 892 case -ERESTARTSYS: 893 break; 894 default: 895 /* remember that LAYOUTGET failed and suspend trying */ 896 pnfs_layout_io_set_failed(lo, range->iomode); 897 } 898 return NULL; 899 } else 900 pnfs_layout_clear_fail_bit(lo, 901 pnfs_iomode_to_fail_bit(range->iomode)); 902 903 return lseg; 904 } 905 906 static void pnfs_clear_layoutcommit(struct inode *inode, 907 struct list_head *head) 908 { 909 struct nfs_inode *nfsi = NFS_I(inode); 910 struct pnfs_layout_segment *lseg, *tmp; 911 912 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) 913 return; 914 list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { 915 if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) 916 continue; 917 pnfs_lseg_dec_and_remove_zero(lseg, head); 918 } 919 } 920 921 void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) 922 { 923 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); 924 smp_mb__after_atomic(); 925 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); 926 } 927 928 static int 929 pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, 930 enum pnfs_iomode iomode, bool sync) 931 { 932 struct inode *ino = lo->plh_inode; 933 struct nfs4_layoutreturn *lrp; 934 int status = 0; 935 936 lrp = kzalloc(sizeof(*lrp), GFP_NOFS); 937 if (unlikely(lrp == NULL)) { 938 status = -ENOMEM; 939 spin_lock(&ino->i_lock); 940 lo->plh_block_lgets--; 941 pnfs_clear_layoutreturn_waitbit(lo); 942 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); 943 spin_unlock(&ino->i_lock); 944 pnfs_put_layout_hdr(lo); 945 goto out; 946 } 947 948 lrp->args.stateid = stateid; 949 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; 950 lrp->args.inode = ino; 951 lrp->args.range.iomode = iomode; 952 lrp->args.range.offset = 0; 953 lrp->args.range.length = NFS4_MAX_UINT64; 954 lrp->args.layout = lo; 955 lrp->clp = NFS_SERVER(ino)->nfs_client; 956 lrp->cred = lo->plh_lc_cred; 957 958 status = nfs4_proc_layoutreturn(lrp, sync); 959 out: 960 dprintk("<-- %s status: %d\n", __func__, status); 961 return status; 962 } 963 964 /* 965 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 966 * when the layout segment list is empty. 967 * 968 * Note that a pnfs_layout_hdr can exist with an empty layout segment 969 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the 970 * deviceid is marked invalid. 971 */ 972 int 973 _pnfs_return_layout(struct inode *ino) 974 { 975 struct pnfs_layout_hdr *lo = NULL; 976 struct nfs_inode *nfsi = NFS_I(ino); 977 LIST_HEAD(tmp_list); 978 nfs4_stateid stateid; 979 int status = 0, empty; 980 981 dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); 982 983 spin_lock(&ino->i_lock); 984 lo = nfsi->layout; 985 if (!lo) { 986 spin_unlock(&ino->i_lock); 987 dprintk("NFS: %s no layout to return\n", __func__); 988 goto out; 989 } 990 stateid = nfsi->layout->plh_stateid; 991 /* Reference matched in nfs4_layoutreturn_release */ 992 pnfs_get_layout_hdr(lo); 993 empty = list_empty(&lo->plh_segs); 994 pnfs_clear_layoutcommit(ino, &tmp_list); 995 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 996 997 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { 998 struct pnfs_layout_range range = { 999 .iomode = IOMODE_ANY, 1000 .offset = 0, 1001 .length = NFS4_MAX_UINT64, 1002 }; 1003 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); 1004 } 1005 1006 /* Don't send a LAYOUTRETURN if list was initially empty */ 1007 if (empty) { 1008 spin_unlock(&ino->i_lock); 1009 pnfs_put_layout_hdr(lo); 1010 dprintk("NFS: %s no layout segments to return\n", __func__); 1011 goto out; 1012 } 1013 1014 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 1015 lo->plh_block_lgets++; 1016 spin_unlock(&ino->i_lock); 1017 pnfs_free_lseg_list(&tmp_list); 1018 1019 status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); 1020 out: 1021 dprintk("<-- %s status: %d\n", __func__, status); 1022 return status; 1023 } 1024 EXPORT_SYMBOL_GPL(_pnfs_return_layout); 1025 1026 int 1027 pnfs_commit_and_return_layout(struct inode *inode) 1028 { 1029 struct pnfs_layout_hdr *lo; 1030 int ret; 1031 1032 spin_lock(&inode->i_lock); 1033 lo = NFS_I(inode)->layout; 1034 if (lo == NULL) { 1035 spin_unlock(&inode->i_lock); 1036 return 0; 1037 } 1038 pnfs_get_layout_hdr(lo); 1039 /* Block new layoutgets and read/write to ds */ 1040 lo->plh_block_lgets++; 1041 spin_unlock(&inode->i_lock); 1042 filemap_fdatawait(inode->i_mapping); 1043 ret = pnfs_layoutcommit_inode(inode, true); 1044 if (ret == 0) 1045 ret = _pnfs_return_layout(inode); 1046 spin_lock(&inode->i_lock); 1047 lo->plh_block_lgets--; 1048 spin_unlock(&inode->i_lock); 1049 pnfs_put_layout_hdr(lo); 1050 return ret; 1051 } 1052 1053 bool pnfs_roc(struct inode *ino) 1054 { 1055 struct nfs_inode *nfsi = NFS_I(ino); 1056 struct nfs_open_context *ctx; 1057 struct nfs4_state *state; 1058 struct pnfs_layout_hdr *lo; 1059 struct pnfs_layout_segment *lseg, *tmp; 1060 nfs4_stateid stateid; 1061 LIST_HEAD(tmp_list); 1062 bool found = false, layoutreturn = false; 1063 1064 spin_lock(&ino->i_lock); 1065 lo = nfsi->layout; 1066 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1067 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1068 goto out_noroc; 1069 1070 /* Don't return layout if we hold a delegation */ 1071 if (nfs4_check_delegation(ino, FMODE_READ)) 1072 goto out_noroc; 1073 1074 list_for_each_entry(ctx, &nfsi->open_files, list) { 1075 state = ctx->state; 1076 /* Don't return layout if there is open file state */ 1077 if (state != NULL && state->state != 0) 1078 goto out_noroc; 1079 } 1080 1081 pnfs_clear_retry_layoutget(lo); 1082 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1083 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1084 mark_lseg_invalid(lseg, &tmp_list); 1085 found = true; 1086 } 1087 if (!found) 1088 goto out_noroc; 1089 lo->plh_block_lgets++; 1090 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1091 spin_unlock(&ino->i_lock); 1092 pnfs_free_lseg_list(&tmp_list); 1093 return true; 1094 1095 out_noroc: 1096 if (lo) { 1097 stateid = lo->plh_stateid; 1098 layoutreturn = 1099 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, 1100 &lo->plh_flags); 1101 if (layoutreturn) { 1102 lo->plh_block_lgets++; 1103 pnfs_get_layout_hdr(lo); 1104 } 1105 } 1106 spin_unlock(&ino->i_lock); 1107 if (layoutreturn) 1108 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); 1109 return false; 1110 } 1111 1112 void pnfs_roc_release(struct inode *ino) 1113 { 1114 struct pnfs_layout_hdr *lo; 1115 1116 spin_lock(&ino->i_lock); 1117 lo = NFS_I(ino)->layout; 1118 lo->plh_block_lgets--; 1119 if (atomic_dec_and_test(&lo->plh_refcount)) { 1120 pnfs_detach_layout_hdr(lo); 1121 spin_unlock(&ino->i_lock); 1122 pnfs_free_layout_hdr(lo); 1123 } else 1124 spin_unlock(&ino->i_lock); 1125 } 1126 1127 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) 1128 { 1129 struct pnfs_layout_hdr *lo; 1130 1131 spin_lock(&ino->i_lock); 1132 lo = NFS_I(ino)->layout; 1133 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) 1134 lo->plh_barrier = barrier; 1135 spin_unlock(&ino->i_lock); 1136 } 1137 1138 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) 1139 { 1140 struct nfs_inode *nfsi = NFS_I(ino); 1141 struct pnfs_layout_hdr *lo; 1142 struct pnfs_layout_segment *lseg; 1143 nfs4_stateid stateid; 1144 u32 current_seqid; 1145 bool found = false, layoutreturn = false; 1146 1147 spin_lock(&ino->i_lock); 1148 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 1149 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1150 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); 1151 found = true; 1152 goto out; 1153 } 1154 lo = nfsi->layout; 1155 current_seqid = be32_to_cpu(lo->plh_stateid.seqid); 1156 1157 /* Since close does not return a layout stateid for use as 1158 * a barrier, we choose the worst-case barrier. 1159 */ 1160 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1161 out: 1162 if (!found) { 1163 stateid = lo->plh_stateid; 1164 layoutreturn = 1165 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, 1166 &lo->plh_flags); 1167 if (layoutreturn) { 1168 lo->plh_block_lgets++; 1169 pnfs_get_layout_hdr(lo); 1170 } 1171 } 1172 spin_unlock(&ino->i_lock); 1173 if (layoutreturn) { 1174 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); 1175 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); 1176 } 1177 return found; 1178 } 1179 1180 /* 1181 * Compare two layout segments for sorting into layout cache. 1182 * We want to preferentially return RW over RO layouts, so ensure those 1183 * are seen first. 1184 */ 1185 static s64 1186 pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, 1187 const struct pnfs_layout_range *l2) 1188 { 1189 s64 d; 1190 1191 /* high offset > low offset */ 1192 d = l1->offset - l2->offset; 1193 if (d) 1194 return d; 1195 1196 /* short length > long length */ 1197 d = l2->length - l1->length; 1198 if (d) 1199 return d; 1200 1201 /* read > read/write */ 1202 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); 1203 } 1204 1205 static void 1206 pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, 1207 struct pnfs_layout_segment *lseg) 1208 { 1209 struct pnfs_layout_segment *lp; 1210 1211 dprintk("%s:Begin\n", __func__); 1212 1213 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 1214 if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) 1215 continue; 1216 list_add_tail(&lseg->pls_list, &lp->pls_list); 1217 dprintk("%s: inserted lseg %p " 1218 "iomode %d offset %llu length %llu before " 1219 "lp %p iomode %d offset %llu length %llu\n", 1220 __func__, lseg, lseg->pls_range.iomode, 1221 lseg->pls_range.offset, lseg->pls_range.length, 1222 lp, lp->pls_range.iomode, lp->pls_range.offset, 1223 lp->pls_range.length); 1224 goto out; 1225 } 1226 list_add_tail(&lseg->pls_list, &lo->plh_segs); 1227 dprintk("%s: inserted lseg %p " 1228 "iomode %d offset %llu length %llu at tail\n", 1229 __func__, lseg, lseg->pls_range.iomode, 1230 lseg->pls_range.offset, lseg->pls_range.length); 1231 out: 1232 pnfs_get_layout_hdr(lo); 1233 1234 dprintk("%s:Return\n", __func__); 1235 } 1236 1237 static struct pnfs_layout_hdr * 1238 alloc_init_layout_hdr(struct inode *ino, 1239 struct nfs_open_context *ctx, 1240 gfp_t gfp_flags) 1241 { 1242 struct pnfs_layout_hdr *lo; 1243 1244 lo = pnfs_alloc_layout_hdr(ino, gfp_flags); 1245 if (!lo) 1246 return NULL; 1247 atomic_set(&lo->plh_refcount, 1); 1248 INIT_LIST_HEAD(&lo->plh_layouts); 1249 INIT_LIST_HEAD(&lo->plh_segs); 1250 INIT_LIST_HEAD(&lo->plh_bulk_destroy); 1251 lo->plh_inode = ino; 1252 lo->plh_lc_cred = get_rpccred(ctx->cred); 1253 return lo; 1254 } 1255 1256 static struct pnfs_layout_hdr * 1257 pnfs_find_alloc_layout(struct inode *ino, 1258 struct nfs_open_context *ctx, 1259 gfp_t gfp_flags) 1260 { 1261 struct nfs_inode *nfsi = NFS_I(ino); 1262 struct pnfs_layout_hdr *new = NULL; 1263 1264 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 1265 1266 if (nfsi->layout != NULL) 1267 goto out_existing; 1268 spin_unlock(&ino->i_lock); 1269 new = alloc_init_layout_hdr(ino, ctx, gfp_flags); 1270 spin_lock(&ino->i_lock); 1271 1272 if (likely(nfsi->layout == NULL)) { /* Won the race? */ 1273 nfsi->layout = new; 1274 return new; 1275 } else if (new != NULL) 1276 pnfs_free_layout_hdr(new); 1277 out_existing: 1278 pnfs_get_layout_hdr(nfsi->layout); 1279 return nfsi->layout; 1280 } 1281 1282 /* 1283 * iomode matching rules: 1284 * iomode lseg match 1285 * ----- ----- ----- 1286 * ANY READ true 1287 * ANY RW true 1288 * RW READ false 1289 * RW RW true 1290 * READ READ true 1291 * READ RW true 1292 */ 1293 static bool 1294 pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, 1295 const struct pnfs_layout_range *range) 1296 { 1297 struct pnfs_layout_range range1; 1298 1299 if ((range->iomode == IOMODE_RW && 1300 ls_range->iomode != IOMODE_RW) || 1301 !pnfs_lseg_range_intersecting(ls_range, range)) 1302 return 0; 1303 1304 /* range1 covers only the first byte in the range */ 1305 range1 = *range; 1306 range1.length = 1; 1307 return pnfs_lseg_range_contained(ls_range, &range1); 1308 } 1309 1310 /* 1311 * lookup range in layout 1312 */ 1313 static struct pnfs_layout_segment * 1314 pnfs_find_lseg(struct pnfs_layout_hdr *lo, 1315 struct pnfs_layout_range *range) 1316 { 1317 struct pnfs_layout_segment *lseg, *ret = NULL; 1318 1319 dprintk("%s:Begin\n", __func__); 1320 1321 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1322 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1323 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && 1324 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1325 ret = pnfs_get_lseg(lseg); 1326 break; 1327 } 1328 if (lseg->pls_range.offset > range->offset) 1329 break; 1330 } 1331 1332 dprintk("%s:Return lseg %p ref %d\n", 1333 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); 1334 return ret; 1335 } 1336 1337 /* 1338 * Use mdsthreshold hints set at each OPEN to determine if I/O should go 1339 * to the MDS or over pNFS 1340 * 1341 * The nfs_inode read_io and write_io fields are cumulative counters reset 1342 * when there are no layout segments. Note that in pnfs_update_layout iomode 1343 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a 1344 * WRITE request. 1345 * 1346 * A return of true means use MDS I/O. 1347 * 1348 * From rfc 5661: 1349 * If a file's size is smaller than the file size threshold, data accesses 1350 * SHOULD be sent to the metadata server. If an I/O request has a length that 1351 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata 1352 * server. If both file size and I/O size are provided, the client SHOULD 1353 * reach or exceed both thresholds before sending its read or write 1354 * requests to the data server. 1355 */ 1356 static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, 1357 struct inode *ino, int iomode) 1358 { 1359 struct nfs4_threshold *t = ctx->mdsthreshold; 1360 struct nfs_inode *nfsi = NFS_I(ino); 1361 loff_t fsize = i_size_read(ino); 1362 bool size = false, size_set = false, io = false, io_set = false, ret = false; 1363 1364 if (t == NULL) 1365 return ret; 1366 1367 dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", 1368 __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz); 1369 1370 switch (iomode) { 1371 case IOMODE_READ: 1372 if (t->bm & THRESHOLD_RD) { 1373 dprintk("%s fsize %llu\n", __func__, fsize); 1374 size_set = true; 1375 if (fsize < t->rd_sz) 1376 size = true; 1377 } 1378 if (t->bm & THRESHOLD_RD_IO) { 1379 dprintk("%s nfsi->read_io %llu\n", __func__, 1380 nfsi->read_io); 1381 io_set = true; 1382 if (nfsi->read_io < t->rd_io_sz) 1383 io = true; 1384 } 1385 break; 1386 case IOMODE_RW: 1387 if (t->bm & THRESHOLD_WR) { 1388 dprintk("%s fsize %llu\n", __func__, fsize); 1389 size_set = true; 1390 if (fsize < t->wr_sz) 1391 size = true; 1392 } 1393 if (t->bm & THRESHOLD_WR_IO) { 1394 dprintk("%s nfsi->write_io %llu\n", __func__, 1395 nfsi->write_io); 1396 io_set = true; 1397 if (nfsi->write_io < t->wr_io_sz) 1398 io = true; 1399 } 1400 break; 1401 } 1402 if (size_set && io_set) { 1403 if (size && io) 1404 ret = true; 1405 } else if (size || io) 1406 ret = true; 1407 1408 dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret); 1409 return ret; 1410 } 1411 1412 /* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ 1413 static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) 1414 { 1415 if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) 1416 return 1; 1417 return nfs_wait_bit_killable(key); 1418 } 1419 1420 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) 1421 { 1422 /* 1423 * send layoutcommit as it can hold up layoutreturn due to lseg 1424 * reference 1425 */ 1426 pnfs_layoutcommit_inode(lo->plh_inode, false); 1427 return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, 1428 pnfs_layoutget_retry_bit_wait, 1429 TASK_UNINTERRUPTIBLE); 1430 } 1431 1432 static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) 1433 { 1434 unsigned long *bitlock = &lo->plh_flags; 1435 1436 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock); 1437 smp_mb__after_atomic(); 1438 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); 1439 } 1440 1441 /* 1442 * Layout segment is retreived from the server if not cached. 1443 * The appropriate layout segment is referenced and returned to the caller. 1444 */ 1445 struct pnfs_layout_segment * 1446 pnfs_update_layout(struct inode *ino, 1447 struct nfs_open_context *ctx, 1448 loff_t pos, 1449 u64 count, 1450 enum pnfs_iomode iomode, 1451 gfp_t gfp_flags) 1452 { 1453 struct pnfs_layout_range arg = { 1454 .iomode = iomode, 1455 .offset = pos, 1456 .length = count, 1457 }; 1458 unsigned pg_offset; 1459 struct nfs_server *server = NFS_SERVER(ino); 1460 struct nfs_client *clp = server->nfs_client; 1461 struct pnfs_layout_hdr *lo; 1462 struct pnfs_layout_segment *lseg = NULL; 1463 bool first; 1464 1465 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 1466 goto out; 1467 1468 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1469 goto out; 1470 1471 lookup_again: 1472 first = false; 1473 spin_lock(&ino->i_lock); 1474 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1475 if (lo == NULL) { 1476 spin_unlock(&ino->i_lock); 1477 goto out; 1478 } 1479 1480 /* Do we even need to bother with this? */ 1481 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1482 dprintk("%s matches recall, use MDS\n", __func__); 1483 goto out_unlock; 1484 } 1485 1486 /* if LAYOUTGET already failed once we don't try again */ 1487 if (pnfs_layout_io_test_failed(lo, iomode) && 1488 !pnfs_should_retry_layoutget(lo)) 1489 goto out_unlock; 1490 1491 first = list_empty(&lo->plh_segs); 1492 if (first) { 1493 /* The first layoutget for the file. Need to serialize per 1494 * RFC 5661 Errata 3208. 1495 */ 1496 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, 1497 &lo->plh_flags)) { 1498 spin_unlock(&ino->i_lock); 1499 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, 1500 TASK_UNINTERRUPTIBLE); 1501 pnfs_put_layout_hdr(lo); 1502 goto lookup_again; 1503 } 1504 } else { 1505 /* Check to see if the layout for the given range 1506 * already exists 1507 */ 1508 lseg = pnfs_find_lseg(lo, &arg); 1509 if (lseg) 1510 goto out_unlock; 1511 } 1512 1513 /* 1514 * Because we free lsegs before sending LAYOUTRETURN, we need to wait 1515 * for LAYOUTRETURN even if first is true. 1516 */ 1517 if (!lseg && pnfs_should_retry_layoutget(lo) && 1518 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { 1519 spin_unlock(&ino->i_lock); 1520 dprintk("%s wait for layoutreturn\n", __func__); 1521 if (pnfs_prepare_to_retry_layoutget(lo)) { 1522 if (first) 1523 pnfs_clear_first_layoutget(lo); 1524 pnfs_put_layout_hdr(lo); 1525 dprintk("%s retrying\n", __func__); 1526 goto lookup_again; 1527 } 1528 goto out_put_layout_hdr; 1529 } 1530 1531 if (pnfs_layoutgets_blocked(lo, &arg, 0)) 1532 goto out_unlock; 1533 atomic_inc(&lo->plh_outstanding); 1534 spin_unlock(&ino->i_lock); 1535 1536 if (list_empty(&lo->plh_layouts)) { 1537 /* The lo must be on the clp list if there is any 1538 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1539 */ 1540 spin_lock(&clp->cl_lock); 1541 if (list_empty(&lo->plh_layouts)) 1542 list_add_tail(&lo->plh_layouts, &server->layouts); 1543 spin_unlock(&clp->cl_lock); 1544 } 1545 1546 pg_offset = arg.offset & ~PAGE_CACHE_MASK; 1547 if (pg_offset) { 1548 arg.offset -= pg_offset; 1549 arg.length += pg_offset; 1550 } 1551 if (arg.length != NFS4_MAX_UINT64) 1552 arg.length = PAGE_CACHE_ALIGN(arg.length); 1553 1554 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1555 pnfs_clear_retry_layoutget(lo); 1556 atomic_dec(&lo->plh_outstanding); 1557 out_put_layout_hdr: 1558 if (first) 1559 pnfs_clear_first_layoutget(lo); 1560 pnfs_put_layout_hdr(lo); 1561 out: 1562 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1563 "(%s, offset: %llu, length: %llu)\n", 1564 __func__, ino->i_sb->s_id, 1565 (unsigned long long)NFS_FILEID(ino), 1566 lseg == NULL ? "not found" : "found", 1567 iomode==IOMODE_RW ? "read/write" : "read-only", 1568 (unsigned long long)pos, 1569 (unsigned long long)count); 1570 return lseg; 1571 out_unlock: 1572 spin_unlock(&ino->i_lock); 1573 goto out_put_layout_hdr; 1574 } 1575 EXPORT_SYMBOL_GPL(pnfs_update_layout); 1576 1577 struct pnfs_layout_segment * 1578 pnfs_layout_process(struct nfs4_layoutget *lgp) 1579 { 1580 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 1581 struct nfs4_layoutget_res *res = &lgp->res; 1582 struct pnfs_layout_segment *lseg; 1583 struct inode *ino = lo->plh_inode; 1584 LIST_HEAD(free_me); 1585 int status = 0; 1586 1587 /* Inject layout blob into I/O device driver */ 1588 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 1589 if (!lseg || IS_ERR(lseg)) { 1590 if (!lseg) 1591 status = -ENOMEM; 1592 else 1593 status = PTR_ERR(lseg); 1594 dprintk("%s: Could not allocate layout: error %d\n", 1595 __func__, status); 1596 goto out; 1597 } 1598 1599 init_lseg(lo, lseg); 1600 lseg->pls_range = res->range; 1601 1602 spin_lock(&ino->i_lock); 1603 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1604 dprintk("%s forget reply due to recall\n", __func__); 1605 goto out_forget_reply; 1606 } 1607 1608 if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) { 1609 dprintk("%s forget reply due to state\n", __func__); 1610 goto out_forget_reply; 1611 } 1612 1613 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { 1614 /* existing state ID, make sure the sequence number matches. */ 1615 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { 1616 dprintk("%s forget reply due to sequence\n", __func__); 1617 goto out_forget_reply; 1618 } 1619 pnfs_set_layout_stateid(lo, &res->stateid, false); 1620 } else { 1621 /* 1622 * We got an entirely new state ID. Mark all segments for the 1623 * inode invalid, and don't bother validating the stateid 1624 * sequence number. 1625 */ 1626 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); 1627 1628 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); 1629 lo->plh_barrier = be32_to_cpu(res->stateid.seqid); 1630 } 1631 1632 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 1633 1634 pnfs_get_lseg(lseg); 1635 pnfs_layout_insert_lseg(lo, lseg); 1636 1637 if (res->return_on_close) { 1638 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); 1639 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); 1640 } 1641 1642 spin_unlock(&ino->i_lock); 1643 pnfs_free_lseg_list(&free_me); 1644 return lseg; 1645 out: 1646 return ERR_PTR(status); 1647 1648 out_forget_reply: 1649 spin_unlock(&ino->i_lock); 1650 lseg->pls_layout = lo; 1651 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 1652 goto out; 1653 } 1654 1655 static void 1656 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, 1657 struct list_head *tmp_list, 1658 struct pnfs_layout_range *return_range) 1659 { 1660 struct pnfs_layout_segment *lseg, *next; 1661 1662 dprintk("%s:Begin lo %p\n", __func__, lo); 1663 1664 if (list_empty(&lo->plh_segs)) 1665 return; 1666 1667 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 1668 if (should_free_lseg(&lseg->pls_range, return_range)) { 1669 dprintk("%s: marking lseg %p iomode %d " 1670 "offset %llu length %llu\n", __func__, 1671 lseg, lseg->pls_range.iomode, 1672 lseg->pls_range.offset, 1673 lseg->pls_range.length); 1674 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); 1675 mark_lseg_invalid(lseg, tmp_list); 1676 } 1677 } 1678 1679 void pnfs_error_mark_layout_for_return(struct inode *inode, 1680 struct pnfs_layout_segment *lseg) 1681 { 1682 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; 1683 int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); 1684 struct pnfs_layout_range range = { 1685 .iomode = lseg->pls_range.iomode, 1686 .offset = 0, 1687 .length = NFS4_MAX_UINT64, 1688 }; 1689 LIST_HEAD(free_me); 1690 1691 spin_lock(&inode->i_lock); 1692 /* set failure bit so that pnfs path will be retried later */ 1693 pnfs_layout_set_fail_bit(lo, iomode); 1694 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 1695 if (lo->plh_return_iomode == 0) 1696 lo->plh_return_iomode = range.iomode; 1697 else if (lo->plh_return_iomode != range.iomode) 1698 lo->plh_return_iomode = IOMODE_ANY; 1699 /* 1700 * mark all matching lsegs so that we are sure to have no live 1701 * segments at hand when sending layoutreturn. See pnfs_put_lseg() 1702 * for how it works. 1703 */ 1704 pnfs_mark_matching_lsegs_return(lo, &free_me, &range); 1705 spin_unlock(&inode->i_lock); 1706 pnfs_free_lseg_list(&free_me); 1707 } 1708 EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); 1709 1710 void 1711 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1712 { 1713 u64 rd_size = req->wb_bytes; 1714 1715 if (pgio->pg_lseg == NULL) { 1716 if (pgio->pg_dreq == NULL) 1717 rd_size = i_size_read(pgio->pg_inode) - req_offset(req); 1718 else 1719 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); 1720 1721 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1722 req->wb_context, 1723 req_offset(req), 1724 rd_size, 1725 IOMODE_READ, 1726 GFP_KERNEL); 1727 } 1728 /* If no lseg, fall back to read through mds */ 1729 if (pgio->pg_lseg == NULL) 1730 nfs_pageio_reset_read_mds(pgio); 1731 1732 } 1733 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); 1734 1735 void 1736 pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1737 struct nfs_page *req, u64 wb_size) 1738 { 1739 if (pgio->pg_lseg == NULL) 1740 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1741 req->wb_context, 1742 req_offset(req), 1743 wb_size, 1744 IOMODE_RW, 1745 GFP_NOFS); 1746 /* If no lseg, fall back to write through mds */ 1747 if (pgio->pg_lseg == NULL) 1748 nfs_pageio_reset_write_mds(pgio); 1749 } 1750 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1751 1752 void 1753 pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc) 1754 { 1755 if (desc->pg_lseg) { 1756 pnfs_put_lseg(desc->pg_lseg); 1757 desc->pg_lseg = NULL; 1758 } 1759 } 1760 EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup); 1761 1762 /* 1763 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 1764 * of bytes (maximum @req->wb_bytes) that can be coalesced. 1765 */ 1766 size_t 1767 pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, 1768 struct nfs_page *prev, struct nfs_page *req) 1769 { 1770 unsigned int size; 1771 u64 seg_end, req_start, seg_left; 1772 1773 size = nfs_generic_pg_test(pgio, prev, req); 1774 if (!size) 1775 return 0; 1776 1777 /* 1778 * 'size' contains the number of bytes left in the current page (up 1779 * to the original size asked for in @req->wb_bytes). 1780 * 1781 * Calculate how many bytes are left in the layout segment 1782 * and if there are less bytes than 'size', return that instead. 1783 * 1784 * Please also note that 'end_offset' is actually the offset of the 1785 * first byte that lies outside the pnfs_layout_range. FIXME? 1786 * 1787 */ 1788 if (pgio->pg_lseg) { 1789 seg_end = end_offset(pgio->pg_lseg->pls_range.offset, 1790 pgio->pg_lseg->pls_range.length); 1791 req_start = req_offset(req); 1792 WARN_ON_ONCE(req_start >= seg_end); 1793 /* start of request is past the last byte of this segment */ 1794 if (req_start >= seg_end) { 1795 /* reference the new lseg */ 1796 if (pgio->pg_ops->pg_cleanup) 1797 pgio->pg_ops->pg_cleanup(pgio); 1798 if (pgio->pg_ops->pg_init) 1799 pgio->pg_ops->pg_init(pgio, req); 1800 return 0; 1801 } 1802 1803 /* adjust 'size' iff there are fewer bytes left in the 1804 * segment than what nfs_generic_pg_test returned */ 1805 seg_left = seg_end - req_start; 1806 if (seg_left < size) 1807 size = (unsigned int)seg_left; 1808 } 1809 1810 return size; 1811 } 1812 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); 1813 1814 int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) 1815 { 1816 struct nfs_pageio_descriptor pgio; 1817 1818 /* Resend all requests through the MDS */ 1819 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, 1820 hdr->completion_ops); 1821 return nfs_pageio_resend(&pgio, hdr); 1822 } 1823 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); 1824 1825 static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr) 1826 { 1827 1828 dprintk("pnfs write error = %d\n", hdr->pnfs_error); 1829 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1830 PNFS_LAYOUTRET_ON_ERROR) { 1831 pnfs_return_layout(hdr->inode); 1832 } 1833 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1834 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr); 1835 } 1836 1837 /* 1838 * Called by non rpc-based layout drivers 1839 */ 1840 void pnfs_ld_write_done(struct nfs_pgio_header *hdr) 1841 { 1842 trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); 1843 if (!hdr->pnfs_error) { 1844 pnfs_set_layoutcommit(hdr); 1845 hdr->mds_ops->rpc_call_done(&hdr->task, hdr); 1846 } else 1847 pnfs_ld_handle_write_error(hdr); 1848 hdr->mds_ops->rpc_release(hdr); 1849 } 1850 EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1851 1852 static void 1853 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1854 struct nfs_pgio_header *hdr) 1855 { 1856 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 1857 1858 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1859 list_splice_tail_init(&hdr->pages, &mirror->pg_list); 1860 nfs_pageio_reset_write_mds(desc); 1861 mirror->pg_recoalesce = 1; 1862 } 1863 nfs_pgio_data_destroy(hdr); 1864 } 1865 1866 static enum pnfs_try_status 1867 pnfs_try_to_write_data(struct nfs_pgio_header *hdr, 1868 const struct rpc_call_ops *call_ops, 1869 struct pnfs_layout_segment *lseg, 1870 int how) 1871 { 1872 struct inode *inode = hdr->inode; 1873 enum pnfs_try_status trypnfs; 1874 struct nfs_server *nfss = NFS_SERVER(inode); 1875 1876 hdr->mds_ops = call_ops; 1877 1878 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 1879 inode->i_ino, hdr->args.count, hdr->args.offset, how); 1880 trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how); 1881 if (trypnfs != PNFS_NOT_ATTEMPTED) 1882 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); 1883 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 1884 return trypnfs; 1885 } 1886 1887 static void 1888 pnfs_do_write(struct nfs_pageio_descriptor *desc, 1889 struct nfs_pgio_header *hdr, int how) 1890 { 1891 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1892 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1893 enum pnfs_try_status trypnfs; 1894 1895 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 1896 if (trypnfs == PNFS_NOT_ATTEMPTED) 1897 pnfs_write_through_mds(desc, hdr); 1898 } 1899 1900 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1901 { 1902 pnfs_put_lseg(hdr->lseg); 1903 nfs_pgio_header_free(hdr); 1904 } 1905 EXPORT_SYMBOL_GPL(pnfs_writehdr_free); 1906 1907 int 1908 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1909 { 1910 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 1911 1912 struct nfs_pgio_header *hdr; 1913 int ret; 1914 1915 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 1916 if (!hdr) { 1917 desc->pg_completion_ops->error_cleanup(&mirror->pg_list); 1918 return -ENOMEM; 1919 } 1920 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1921 1922 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1923 ret = nfs_generic_pgio(desc, hdr); 1924 if (!ret) 1925 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1926 1927 return ret; 1928 } 1929 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1930 1931 int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr) 1932 { 1933 struct nfs_pageio_descriptor pgio; 1934 1935 /* Resend all requests through the MDS */ 1936 nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops); 1937 return nfs_pageio_resend(&pgio, hdr); 1938 } 1939 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); 1940 1941 static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) 1942 { 1943 dprintk("pnfs read error = %d\n", hdr->pnfs_error); 1944 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1945 PNFS_LAYOUTRET_ON_ERROR) { 1946 pnfs_return_layout(hdr->inode); 1947 } 1948 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1949 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr); 1950 } 1951 1952 /* 1953 * Called by non rpc-based layout drivers 1954 */ 1955 void pnfs_ld_read_done(struct nfs_pgio_header *hdr) 1956 { 1957 trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); 1958 if (likely(!hdr->pnfs_error)) { 1959 __nfs4_read_done_cb(hdr); 1960 hdr->mds_ops->rpc_call_done(&hdr->task, hdr); 1961 } else 1962 pnfs_ld_handle_read_error(hdr); 1963 hdr->mds_ops->rpc_release(hdr); 1964 } 1965 EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1966 1967 static void 1968 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1969 struct nfs_pgio_header *hdr) 1970 { 1971 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 1972 1973 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1974 list_splice_tail_init(&hdr->pages, &mirror->pg_list); 1975 nfs_pageio_reset_read_mds(desc); 1976 mirror->pg_recoalesce = 1; 1977 } 1978 nfs_pgio_data_destroy(hdr); 1979 } 1980 1981 /* 1982 * Call the appropriate parallel I/O subsystem read function. 1983 */ 1984 static enum pnfs_try_status 1985 pnfs_try_to_read_data(struct nfs_pgio_header *hdr, 1986 const struct rpc_call_ops *call_ops, 1987 struct pnfs_layout_segment *lseg) 1988 { 1989 struct inode *inode = hdr->inode; 1990 struct nfs_server *nfss = NFS_SERVER(inode); 1991 enum pnfs_try_status trypnfs; 1992 1993 hdr->mds_ops = call_ops; 1994 1995 dprintk("%s: Reading ino:%lu %u@%llu\n", 1996 __func__, inode->i_ino, hdr->args.count, hdr->args.offset); 1997 1998 trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr); 1999 if (trypnfs != PNFS_NOT_ATTEMPTED) 2000 nfs_inc_stats(inode, NFSIOS_PNFS_READ); 2001 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 2002 return trypnfs; 2003 } 2004 2005 /* Resend all requests through pnfs. */ 2006 int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) 2007 { 2008 struct nfs_pageio_descriptor pgio; 2009 2010 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); 2011 return nfs_pageio_resend(&pgio, hdr); 2012 } 2013 EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); 2014 2015 static void 2016 pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 2017 { 2018 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2019 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2020 enum pnfs_try_status trypnfs; 2021 int err = 0; 2022 2023 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2024 if (trypnfs == PNFS_TRY_AGAIN) 2025 err = pnfs_read_resend_pnfs(hdr); 2026 if (trypnfs == PNFS_NOT_ATTEMPTED || err) 2027 pnfs_read_through_mds(desc, hdr); 2028 } 2029 2030 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 2031 { 2032 pnfs_put_lseg(hdr->lseg); 2033 nfs_pgio_header_free(hdr); 2034 } 2035 EXPORT_SYMBOL_GPL(pnfs_readhdr_free); 2036 2037 int 2038 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 2039 { 2040 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 2041 2042 struct nfs_pgio_header *hdr; 2043 int ret; 2044 2045 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2046 if (!hdr) { 2047 desc->pg_completion_ops->error_cleanup(&mirror->pg_list); 2048 return -ENOMEM; 2049 } 2050 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 2051 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2052 ret = nfs_generic_pgio(desc, hdr); 2053 if (!ret) 2054 pnfs_do_read(desc, hdr); 2055 return ret; 2056 } 2057 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); 2058 2059 static void pnfs_clear_layoutcommitting(struct inode *inode) 2060 { 2061 unsigned long *bitlock = &NFS_I(inode)->flags; 2062 2063 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); 2064 smp_mb__after_atomic(); 2065 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); 2066 } 2067 2068 /* 2069 * There can be multiple RW segments. 2070 */ 2071 static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) 2072 { 2073 struct pnfs_layout_segment *lseg; 2074 2075 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { 2076 if (lseg->pls_range.iomode == IOMODE_RW && 2077 test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) 2078 list_add(&lseg->pls_lc_list, listp); 2079 } 2080 } 2081 2082 static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) 2083 { 2084 struct pnfs_layout_segment *lseg, *tmp; 2085 2086 /* Matched by references in pnfs_set_layoutcommit */ 2087 list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { 2088 list_del_init(&lseg->pls_lc_list); 2089 pnfs_put_lseg(lseg); 2090 } 2091 2092 pnfs_clear_layoutcommitting(inode); 2093 } 2094 2095 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) 2096 { 2097 pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); 2098 } 2099 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); 2100 2101 void 2102 pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) 2103 { 2104 struct inode *inode = hdr->inode; 2105 struct nfs_inode *nfsi = NFS_I(inode); 2106 loff_t end_pos = hdr->mds_offset + hdr->res.count; 2107 bool mark_as_dirty = false; 2108 2109 spin_lock(&inode->i_lock); 2110 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 2111 mark_as_dirty = true; 2112 dprintk("%s: Set layoutcommit for inode %lu ", 2113 __func__, inode->i_ino); 2114 } 2115 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { 2116 /* references matched in nfs4_layoutcommit_release */ 2117 pnfs_get_lseg(hdr->lseg); 2118 } 2119 if (end_pos > nfsi->layout->plh_lwb) 2120 nfsi->layout->plh_lwb = end_pos; 2121 spin_unlock(&inode->i_lock); 2122 dprintk("%s: lseg %p end_pos %llu\n", 2123 __func__, hdr->lseg, nfsi->layout->plh_lwb); 2124 2125 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 2126 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 2127 if (mark_as_dirty) 2128 mark_inode_dirty_sync(inode); 2129 } 2130 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 2131 2132 void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) 2133 { 2134 struct inode *inode = data->inode; 2135 struct nfs_inode *nfsi = NFS_I(inode); 2136 bool mark_as_dirty = false; 2137 2138 spin_lock(&inode->i_lock); 2139 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 2140 mark_as_dirty = true; 2141 dprintk("%s: Set layoutcommit for inode %lu ", 2142 __func__, inode->i_ino); 2143 } 2144 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { 2145 /* references matched in nfs4_layoutcommit_release */ 2146 pnfs_get_lseg(data->lseg); 2147 } 2148 if (data->lwb > nfsi->layout->plh_lwb) 2149 nfsi->layout->plh_lwb = data->lwb; 2150 spin_unlock(&inode->i_lock); 2151 dprintk("%s: lseg %p end_pos %llu\n", 2152 __func__, data->lseg, nfsi->layout->plh_lwb); 2153 2154 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 2155 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 2156 if (mark_as_dirty) 2157 mark_inode_dirty_sync(inode); 2158 } 2159 EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); 2160 2161 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) 2162 { 2163 struct nfs_server *nfss = NFS_SERVER(data->args.inode); 2164 2165 if (nfss->pnfs_curr_ld->cleanup_layoutcommit) 2166 nfss->pnfs_curr_ld->cleanup_layoutcommit(data); 2167 pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); 2168 } 2169 2170 /* 2171 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 2172 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 2173 * data to disk to allow the server to recover the data if it crashes. 2174 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag 2175 * is off, and a COMMIT is sent to a data server, or 2176 * if WRITEs to a data server return NFS_DATA_SYNC. 2177 */ 2178 int 2179 pnfs_layoutcommit_inode(struct inode *inode, bool sync) 2180 { 2181 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 2182 struct nfs4_layoutcommit_data *data; 2183 struct nfs_inode *nfsi = NFS_I(inode); 2184 loff_t end_pos; 2185 int status; 2186 2187 if (!pnfs_layoutcommit_outstanding(inode)) 2188 return 0; 2189 2190 dprintk("--> %s inode %lu\n", __func__, inode->i_ino); 2191 2192 status = -EAGAIN; 2193 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { 2194 if (!sync) 2195 goto out; 2196 status = wait_on_bit_lock_action(&nfsi->flags, 2197 NFS_INO_LAYOUTCOMMITTING, 2198 nfs_wait_bit_killable, 2199 TASK_KILLABLE); 2200 if (status) 2201 goto out; 2202 } 2203 2204 status = -ENOMEM; 2205 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 2206 data = kzalloc(sizeof(*data), GFP_NOFS); 2207 if (!data) 2208 goto clear_layoutcommitting; 2209 2210 status = 0; 2211 spin_lock(&inode->i_lock); 2212 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) 2213 goto out_unlock; 2214 2215 INIT_LIST_HEAD(&data->lseg_list); 2216 pnfs_list_write_lseg(inode, &data->lseg_list); 2217 2218 end_pos = nfsi->layout->plh_lwb; 2219 nfsi->layout->plh_lwb = 0; 2220 2221 nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); 2222 spin_unlock(&inode->i_lock); 2223 2224 data->args.inode = inode; 2225 data->cred = get_rpccred(nfsi->layout->plh_lc_cred); 2226 nfs_fattr_init(&data->fattr); 2227 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 2228 data->res.fattr = &data->fattr; 2229 data->args.lastbytewritten = end_pos - 1; 2230 data->res.server = NFS_SERVER(inode); 2231 2232 if (ld->prepare_layoutcommit) { 2233 status = ld->prepare_layoutcommit(&data->args); 2234 if (status) { 2235 spin_lock(&inode->i_lock); 2236 if (end_pos < nfsi->layout->plh_lwb) 2237 nfsi->layout->plh_lwb = end_pos; 2238 spin_unlock(&inode->i_lock); 2239 put_rpccred(data->cred); 2240 set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); 2241 goto clear_layoutcommitting; 2242 } 2243 } 2244 2245 2246 status = nfs4_proc_layoutcommit(data, sync); 2247 out: 2248 if (status) 2249 mark_inode_dirty_sync(inode); 2250 dprintk("<-- %s status %d\n", __func__, status); 2251 return status; 2252 out_unlock: 2253 spin_unlock(&inode->i_lock); 2254 kfree(data); 2255 clear_layoutcommitting: 2256 pnfs_clear_layoutcommitting(inode); 2257 goto out; 2258 } 2259 EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); 2260 2261 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 2262 { 2263 struct nfs4_threshold *thp; 2264 2265 thp = kzalloc(sizeof(*thp), GFP_NOFS); 2266 if (!thp) { 2267 dprintk("%s mdsthreshold allocation failed\n", __func__); 2268 return NULL; 2269 } 2270 return thp; 2271 } 2272