/*
 * pNFS functions to call and manage layout drivers.
 *
 * Copyright (c) 2002 [year of first publication]
 * The Regents of the University of Michigan
 * All Rights Reserved
 *
 * Dean Hildebrand <dhildebz@umich.edu>
 *
 * Permission is granted to use, copy, create derivative works, and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the University of Michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. If
 * the above copyright notice or any other identification of the
 * University of Michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * This software is provided as is, without representation or warranty
 * of any kind either express or implied, including without limitation
 * the implied warranties of merchantability, fitness for a particular
 * purpose, or noninfringement.  The Regents of the University of
 * Michigan shall not be liable for any damages, including special,
 * indirect, incidental, or consequential damages, with respect to any
 * claim arising out of or in connection with the use of the software,
 * even if it has been or is hereafter advised of the possibility of
 * such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
	struct pnfs_layoutdriver_type *local;

	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
		if (local->id == id)
			goto out;
	local = NULL;
out:
	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
	return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
	struct pnfs_layoutdriver_type *local;

	spin_lock(&pnfs_spinlock);
	local = find_pnfs_driver_locked(id);
	if (local != NULL && !try_module_get(local->owner)) {
		dprintk("%s: Could not grab reference on module\n", __func__);
		local = NULL;
	}
	spin_unlock(&pnfs_spinlock);
	return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
	if (nfss->pnfs_curr_ld) {
		if (nfss->pnfs_curr_ld->clear_layoutdriver)
			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
		/* Decrement the MDS count. Purge the deviceid cache if zero */
		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
			nfs4_deviceid_purge_client(nfss->nfs_client);
		module_put(nfss->pnfs_curr_ld->owner);
	}
	nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
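 *
 * For example (illustrative): id 1, the NFSv4.1 files layout, makes the
 * request_module() call below look for a module named
 * LAYOUT_NFSV4_1_MODULE_PREFIX "-1", i.e. typically "nfs-layouttype4-1".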
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
		      u32 id)
{
	struct pnfs_layoutdriver_type *ld_type = NULL;

	if (id == 0)
		goto out_no_driver;
	if (!(server->nfs_client->cl_exchange_flags &
		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
			__func__, id, server->nfs_client->cl_exchange_flags);
		goto out_no_driver;
	}
	ld_type = find_pnfs_driver(id);
	if (!ld_type) {
		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
		ld_type = find_pnfs_driver(id);
		if (!ld_type) {
			dprintk("%s: No pNFS module found for %u.\n",
				__func__, id);
			goto out_no_driver;
		}
	}
	server->pnfs_curr_ld = ld_type;
	if (ld_type->set_layoutdriver
	    && ld_type->set_layoutdriver(server, mntfh)) {
		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
			"driver %u.\n", __func__, id);
		module_put(ld_type->owner);
		goto out_no_driver;
	}
	/* Bump the MDS count */
	atomic_inc(&server->nfs_client->cl_mds_count);

	dprintk("%s: pNFS module for %u set\n", __func__, id);
	return;

out_no_driver:
	dprintk("%s: Using NFSv4 I/O\n", __func__);
	server->pnfs_curr_ld = NULL;
}

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
	int status = -EINVAL;
	struct pnfs_layoutdriver_type *tmp;

	if (ld_type->id == 0) {
		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
		return status;
	}
	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
		printk(KERN_ERR "NFS: %s Layout driver must provide "
		       "alloc_lseg and free_lseg.\n", __func__);
		return status;
	}

	spin_lock(&pnfs_spinlock);
	tmp = find_pnfs_driver_locked(ld_type->id);
	if (!tmp) {
		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
		status = 0;
		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
			ld_type->name);
	} else {
		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
			__func__, ld_type->id);
	}
	spin_unlock(&pnfs_spinlock);

	return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
	spin_lock(&pnfs_spinlock);
	list_del(&ld_type->pnfs_tblid);
	spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
	atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
	return ld->alloc_layout_hdr(ino, gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

	if (!list_empty(&lo->plh_layouts)) {
		struct nfs_client *clp = server->nfs_client;

		spin_lock(&clp->cl_lock);
		list_del_init(&lo->plh_layouts);
		spin_unlock(&clp->cl_lock);
	}
	put_rpccred(lo->plh_lc_cred);
	return ld->free_layout_hdr(lo);
}

static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
	dprintk("%s: freeing layout cache %p\n", __func__, lo);
	nfsi->layout = NULL;
	/* Reset MDS Threshold I/O counters */
	nfsi->write_io = 0;
	nfsi->read_io = 0;
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct inode *inode = lo->plh_inode;

	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
		pnfs_detach_layout_hdr(lo);
		spin_unlock(&inode->i_lock);
		pnfs_free_layout_hdr(lo);
	}
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
	return iomode == IOMODE_RW ?
		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
	lo->plh_retry_timestamp = jiffies;
	if (!test_and_set_bit(fail_bit, &lo->plh_flags))
		atomic_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
		atomic_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
	struct inode *inode = lo->plh_inode;
	struct pnfs_layout_range range = {
		.iomode = iomode,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	LIST_HEAD(head);

	spin_lock(&inode->i_lock);
	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
	pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
	spin_unlock(&inode->i_lock);
	pnfs_free_lseg_list(&head);
	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
			iomode == IOMODE_RW ?  "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
	unsigned long start, end;
	int fail_bit = pnfs_iomode_to_fail_bit(iomode);

	if (test_bit(fail_bit, &lo->plh_flags) == 0)
		return false;
	end = jiffies;
	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
		/* It is time to retry the failed layoutgets */
		pnfs_layout_clear_fail_bit(lo, fail_bit);
		return false;
	}
	return true;
}

static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
	INIT_LIST_HEAD(&lseg->pls_list);
	INIT_LIST_HEAD(&lseg->pls_lc_list);
	atomic_set(&lseg->pls_refcount, 1);
	smp_mb();
	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
	lseg->pls_layout = lo;
}

static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
	struct inode *ino = lseg->pls_layout->plh_inode;

	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
}

static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_segment *lseg)
{
	struct inode *inode = lo->plh_inode;

	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
	list_del_init(&lseg->pls_list);
	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
	atomic_dec(&lo->plh_refcount);
	if (list_empty(&lo->plh_segs))
		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_hdr *lo;
	struct inode *inode;

	if (!lseg)
		return;

	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
		atomic_read(&lseg->pls_refcount),
		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
	lo = lseg->pls_layout;
	inode = lo->plh_inode;
	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
		pnfs_get_layout_hdr(lo);
		pnfs_layout_remove_lseg(lo, lseg);
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg(lseg);
		pnfs_put_layout_hdr(lo);
	}
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

static u64
end_offset(u64 start, u64 len)
{
	u64 end;

	end = start + len;
	return end >= start ? end : NFS4_MAX_UINT64;
}

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
		 const struct pnfs_layout_range *l2)
{
	u64 start1 = l1->offset;
	u64 end1 = end_offset(start1, l1->length);
	u64 start2 = l2->offset;
	u64 end2 = end_offset(start2, l2->length);

	return (start1 <= start2) && (end1 >= end2);
}

/*
 * are l1 and l2 intersecting?
 *   start1                             end1
 *   [----------------------------------)
 *                              start2           end2
 *                              [----------------)
 */
static bool
pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
		    const struct pnfs_layout_range *l2)
{
	u64 start1 = l1->offset;
	u64 end1 = end_offset(start1, l1->length);
	u64 start2 = l2->offset;
	u64 end2 = end_offset(start2, l2->length);

	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
}

static bool
should_free_lseg(const struct pnfs_layout_range *lseg_range,
		 const struct pnfs_layout_range *recall_range)
{
	return (recall_range->iomode == IOMODE_ANY ||
		lseg_range->iomode == recall_range->iomode) &&
	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
}

static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
		struct list_head *tmp_list)
{
	if (!atomic_dec_and_test(&lseg->pls_refcount))
		return false;
	pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
	list_add(&lseg->pls_list, tmp_list);
	return true;
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
			     struct list_head *tmp_list)
{
	int rv = 0;

	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
		/* Remove the reference keeping the lseg in the
		 * list.  It will now be removed when all
		 * outstanding io is finished.
		 */
		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
			atomic_read(&lseg->pls_refcount));
		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
			rv = 1;
	}
	return rv;
}

/* Returns count of number of matching invalid lsegs remaining in list
 * after call.
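 * (Such segments matched the recall range and are now marked invalid, but
 * still hold outstanding references, e.g. from in-flight I/O; they are
 * freed later via pnfs_put_lseg() once those references are dropped.)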
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
			    struct list_head *tmp_list,
			    struct pnfs_layout_range *recall_range)
{
	struct pnfs_layout_segment *lseg, *next;
	int invalid = 0, removed = 0;

	dprintk("%s:Begin lo %p\n", __func__, lo);

	if (list_empty(&lo->plh_segs))
		return 0;
	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
		if (!recall_range ||
		    should_free_lseg(&lseg->pls_range, recall_range)) {
			dprintk("%s: freeing lseg %p iomode %d "
				"offset %llu length %llu\n", __func__,
				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
				lseg->pls_range.length);
			invalid++;
			removed += mark_lseg_invalid(lseg, tmp_list);
		}
	dprintk("%s:Return %i\n", __func__, invalid - removed);
	return invalid - removed;
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
	struct pnfs_layout_segment *lseg, *tmp;

	if (list_empty(free_me))
		return;

	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
		list_del(&lseg->pls_list);
		pnfs_free_lseg(lseg);
	}
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
	struct pnfs_layout_hdr *lo;
	LIST_HEAD(tmp_list);

	spin_lock(&nfsi->vfs_inode.i_lock);
	lo = nfsi->layout;
	if (lo) {
		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
		pnfs_get_layout_hdr(lo);
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
		spin_unlock(&nfsi->vfs_inode.i_lock);
		pnfs_free_lseg_list(&tmp_list);
		pnfs_put_layout_hdr(lo);
	} else
		spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
		struct list_head *layout_list)
{
	struct pnfs_layout_hdr *lo;
	bool ret = false;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
		pnfs_get_layout_hdr(lo);
		list_add(&lo->plh_bulk_destroy, layout_list);
		ret = true;
	}
	spin_unlock(&inode->i_lock);
	return ret;
}

/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
		struct nfs_server *server,
		struct list_head *layout_list)
{
	struct pnfs_layout_hdr *lo, *next;
	struct inode *inode;

	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
		inode = igrab(lo->plh_inode);
		if (inode == NULL)
			continue;
		list_del_init(&lo->plh_layouts);
		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
			continue;
		rcu_read_unlock();
		spin_unlock(&clp->cl_lock);
		iput(inode);
		spin_lock(&clp->cl_lock);
		rcu_read_lock();
		return -EAGAIN;
	}
	return 0;
}

static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
		bool is_bulk_recall)
{
	struct pnfs_layout_hdr *lo;
	struct inode *inode;
	struct pnfs_layout_range range = {
		.iomode = IOMODE_ANY,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	LIST_HEAD(lseg_list);
	int ret = 0;

	while (!list_empty(layout_list)) {
		lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
				plh_bulk_destroy);
		dprintk("%s freeing layout for inode %lu\n", __func__,
			lo->plh_inode->i_ino);
		inode = lo->plh_inode;
		spin_lock(&inode->i_lock);
		list_del_init(&lo->plh_bulk_destroy);
		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
		if (is_bulk_recall)
			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
		if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
			ret = -EAGAIN;
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg_list(&lseg_list);
		pnfs_put_layout_hdr(lo);
		iput(inode);
	}
	return ret;
}

int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
		struct nfs_fsid *fsid,
		bool is_recall)
{
	struct nfs_server *server;
	LIST_HEAD(layout_list);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
restart:
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
			continue;
		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
				server,
				&layout_list) != 0)
			goto restart;
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	if (list_empty(&layout_list))
		return 0;
	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
		bool is_recall)
{
	struct nfs_server *server;
	LIST_HEAD(layout_list);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
restart:
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
					server,
					&layout_list) != 0)
			goto restart;
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	if (list_empty(&layout_list))
		return 0;
	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
	nfs4_deviceid_mark_client_invalid(clp);
	nfs4_deviceid_purge_client(clp);

	pnfs_destroy_layouts_byclid(clp, false);
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
 */
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
	return (s32)s1 - (s32)s2 > 0;
}

/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
			bool update_barrier)
{
	u32 oldseq, newseq, new_barrier;
	int empty = list_empty(&lo->plh_segs);

	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
	newseq = be32_to_cpu(new->seqid);
	if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
		nfs4_stateid_copy(&lo->plh_stateid, new);
		if (update_barrier) {
			new_barrier = be32_to_cpu(new->seqid);
		} else {
			/* Because of wraparound, we want to keep the barrier
			 * "close" to the current seqids.
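			 * As an illustration: with newseq = 3 and two
			 * LAYOUTGETs still outstanding the barrier becomes 1,
			 * so replies carrying seqid 2 or 3 are accepted while
			 * anything at or below 1 is ignored; the signed
			 * subtraction in pnfs_seqid_is_newer() keeps this
			 * working across 32-bit seqid wraparound.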
			 */
			new_barrier = newseq - atomic_read(&lo->plh_outstanding);
		}
		if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
			lo->plh_barrier = new_barrier;
	}
}

static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
		const nfs4_stateid *stateid)
{
	u32 seqid = be32_to_cpu(stateid->seqid);

	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}

/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
{
	return lo->plh_block_lgets ||
		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
		(list_empty(&lo->plh_segs) &&
		 (atomic_read(&lo->plh_outstanding) > lget));
}

int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
			      struct nfs4_state *open_state)
{
	int status = 0;

	dprintk("--> %s\n", __func__);
	spin_lock(&lo->plh_inode->i_lock);
	if (pnfs_layoutgets_blocked(lo, 1)) {
		status = -EAGAIN;
	} else if (!nfs4_valid_open_stateid(open_state)) {
		status = -EBADF;
	} else if (list_empty(&lo->plh_segs)) {
		int seq;

		do {
			seq = read_seqbegin(&open_state->seqlock);
			nfs4_stateid_copy(dst, &open_state->stateid);
		} while (read_seqretry(&open_state->seqlock, seq));
	} else
		nfs4_stateid_copy(dst, &lo->plh_stateid);
	spin_unlock(&lo->plh_inode->i_lock);
	dprintk("<-- %s\n", __func__);
	return status;
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
	   struct nfs_open_context *ctx,
	   struct pnfs_layout_range *range,
	   gfp_t gfp_flags)
{
	struct inode *ino = lo->plh_inode;
	struct nfs_server *server = NFS_SERVER(ino);
	struct nfs4_layoutget *lgp;
	struct pnfs_layout_segment *lseg;

	dprintk("--> %s\n", __func__);

	lgp = kzalloc(sizeof(*lgp), gfp_flags);
	if (lgp == NULL)
		return NULL;

	lgp->args.minlength = PAGE_CACHE_SIZE;
	if (lgp->args.minlength > range->length)
		lgp->args.minlength = range->length;
	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
	lgp->args.range = *range;
	lgp->args.type = server->pnfs_curr_ld->id;
	lgp->args.inode = ino;
	lgp->args.ctx = get_nfs_open_context(ctx);
	lgp->gfp_flags = gfp_flags;
	lgp->cred = lo->plh_lc_cred;

	/* Synchronously retrieve layout information from server and
	 * store in lseg.
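	 * On failure this returns NULL so that the caller falls back to
	 * ordinary NFSv4 I/O through the MDS; errors other than -ENOMEM and
	 * -ERESTARTSYS also mark the iomode as failed via
	 * pnfs_layout_io_set_failed() so LAYOUTGET is not retried right away.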
	 */
	lseg = nfs4_proc_layoutget(lgp, gfp_flags);
	if (IS_ERR(lseg)) {
		switch (PTR_ERR(lseg)) {
		case -ENOMEM:
		case -ERESTARTSYS:
			break;
		default:
			/* remember that LAYOUTGET failed and suspend trying */
			pnfs_layout_io_set_failed(lo, range->iomode);
		}
		return NULL;
	}

	return lseg;
}

static void pnfs_clear_layoutcommit(struct inode *inode,
		struct list_head *head)
{
	struct nfs_inode *nfsi = NFS_I(inode);
	struct pnfs_layout_segment *lseg, *tmp;

	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		return;
	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
			continue;
		pnfs_lseg_dec_and_remove_zero(lseg, head);
	}
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
	struct pnfs_layout_hdr *lo = NULL;
	struct nfs_inode *nfsi = NFS_I(ino);
	LIST_HEAD(tmp_list);
	struct nfs4_layoutreturn *lrp;
	nfs4_stateid stateid;
	int status = 0, empty;

	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (!lo) {
		spin_unlock(&ino->i_lock);
		dprintk("NFS: %s no layout to return\n", __func__);
		goto out;
	}
	stateid = nfsi->layout->plh_stateid;
	/* Reference matched in nfs4_layoutreturn_release */
	pnfs_get_layout_hdr(lo);
	empty = list_empty(&lo->plh_segs);
	pnfs_clear_layoutcommit(ino, &tmp_list);
	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
	/* Don't send a LAYOUTRETURN if list was initially empty */
	if (empty) {
		spin_unlock(&ino->i_lock);
		pnfs_put_layout_hdr(lo);
		dprintk("NFS: %s no layout segments to return\n", __func__);
		goto out;
	}
	lo->plh_block_lgets++;
	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&tmp_list);

	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
	if (unlikely(lrp == NULL)) {
		status = -ENOMEM;
		spin_lock(&ino->i_lock);
		lo->plh_block_lgets--;
		spin_unlock(&ino->i_lock);
		pnfs_put_layout_hdr(lo);
		goto out;
	}

	lrp->args.stateid = stateid;
	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
	lrp->args.inode = ino;
	lrp->args.layout = lo;
	lrp->clp = NFS_SERVER(ino)->nfs_client;
	lrp->cred = lo->plh_lc_cred;

	status = nfs4_proc_layoutreturn(lrp);
out:
	dprintk("<-- %s status: %d\n", __func__, status);
	return status;
}
EXPORT_SYMBOL_GPL(_pnfs_return_layout);

int
pnfs_commit_and_return_layout(struct inode *inode)
{
	struct pnfs_layout_hdr *lo;
	int ret;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo == NULL) {
		spin_unlock(&inode->i_lock);
		return 0;
	}
	pnfs_get_layout_hdr(lo);
	/* Block new layoutgets and read/write to ds */
	lo->plh_block_lgets++;
	spin_unlock(&inode->i_lock);
	filemap_fdatawait(inode->i_mapping);
	ret = pnfs_layoutcommit_inode(inode, true);
	if (ret == 0)
		ret = _pnfs_return_layout(inode);
	spin_lock(&inode->i_lock);
	lo->plh_block_lgets--;
	spin_unlock(&inode->i_lock);
	pnfs_put_layout_hdr(lo);
	return ret;
}

bool pnfs_roc(struct inode *ino)
{
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg, *tmp;
	LIST_HEAD(tmp_list);
	bool found = false;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
		goto out_nolayout;
	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
			mark_lseg_invalid(lseg, &tmp_list);
			found = true;
		}
	if (!found)
		goto out_nolayout;
	lo->plh_block_lgets++;
	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&tmp_list);
	return true;

out_nolayout:
	spin_unlock(&ino->i_lock);
	return false;
}

void pnfs_roc_release(struct inode *ino)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	lo->plh_block_lgets--;
	if (atomic_dec_and_test(&lo->plh_refcount)) {
		pnfs_detach_layout_hdr(lo);
		spin_unlock(&ino->i_lock);
		pnfs_free_layout_hdr(lo);
	} else
		spin_unlock(&ino->i_lock);
}

void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
		lo->plh_barrier = barrier;
	spin_unlock(&ino->i_lock);
}

bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg;
	u32 current_seqid;
	bool found = false;

	spin_lock(&ino->i_lock);
	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
			rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
			found = true;
			goto out;
		}
	lo = nfsi->layout;
	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);

	/* Since close does not return a layout stateid for use as
	 * a barrier, we choose the worst-case barrier.
	 */
	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
out:
	spin_unlock(&ino->i_lock);
	return found;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
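 *
 * For example: two whole-file segments with the same offset and length
 * compare equal on the first two keys, and the IOMODE_RW one then sorts
 * ahead of the IOMODE_READ one, so pnfs_find_lseg() sees it first.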
 */
static s64
pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
		 const struct pnfs_layout_range *l2)
{
	s64 d;

	/* high offset > low offset */
	d = l1->offset - l2->offset;
	if (d)
		return d;

	/* short length > long length */
	d = l2->length - l1->length;
	if (d)
		return d;

	/* read > read/write */
	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}

static void
pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
		   struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_segment *lp;

	dprintk("%s:Begin\n", __func__);

	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
		if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
			continue;
		list_add_tail(&lseg->pls_list, &lp->pls_list);
		dprintk("%s: inserted lseg %p "
			"iomode %d offset %llu length %llu before "
			"lp %p iomode %d offset %llu length %llu\n",
			__func__, lseg, lseg->pls_range.iomode,
			lseg->pls_range.offset, lseg->pls_range.length,
			lp, lp->pls_range.iomode, lp->pls_range.offset,
			lp->pls_range.length);
		goto out;
	}
	list_add_tail(&lseg->pls_list, &lo->plh_segs);
	dprintk("%s: inserted lseg %p "
		"iomode %d offset %llu length %llu at tail\n",
		__func__, lseg, lseg->pls_range.iomode,
		lseg->pls_range.offset, lseg->pls_range.length);
out:
	pnfs_get_layout_hdr(lo);

	dprintk("%s:Return\n", __func__);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
		      struct nfs_open_context *ctx,
		      gfp_t gfp_flags)
{
	struct pnfs_layout_hdr *lo;

	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
	if (!lo)
		return NULL;
	atomic_set(&lo->plh_refcount, 1);
	INIT_LIST_HEAD(&lo->plh_layouts);
	INIT_LIST_HEAD(&lo->plh_segs);
	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
	lo->plh_inode = ino;
	lo->plh_lc_cred = get_rpccred(ctx->cred);
	return lo;
}

static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
		       struct nfs_open_context *ctx,
		       gfp_t gfp_flags)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_hdr *new = NULL;

	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

	if (nfsi->layout != NULL)
		goto out_existing;
	spin_unlock(&ino->i_lock);
	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
	spin_lock(&ino->i_lock);

	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
		nfsi->layout = new;
		return new;
	} else if (new != NULL)
		pnfs_free_layout_hdr(new);
out_existing:
	pnfs_get_layout_hdr(nfsi->layout);
	return nfsi->layout;
}

/*
 * iomode matching rules:
 * iomode	lseg	match
 * -----	-----	-----
 * ANY		READ	true
 * ANY		RW	true
 * RW		READ	false
 * RW		RW	true
 * READ		READ	true
 * READ		RW	true
 */
static bool
pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
		 const struct pnfs_layout_range *range)
{
	struct pnfs_layout_range range1;

	if ((range->iomode == IOMODE_RW &&
	     ls_range->iomode != IOMODE_RW) ||
	    !pnfs_lseg_range_intersecting(ls_range, range))
		return 0;

	/* range1 covers only the first byte in the range */
	range1 = *range;
	range1.length = 1;
	return pnfs_lseg_range_contained(ls_range, &range1);
}

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_range *range)
{
	struct pnfs_layout_segment *lseg, *ret = NULL;

	dprintk("%s:Begin\n", __func__);

	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
		    pnfs_lseg_range_match(&lseg->pls_range, range)) {
			ret = pnfs_get_lseg(lseg);
			break;
		}
		if (lseg->pls_range.offset > range->offset)
			break;
	}

	dprintk("%s:Return lseg %p ref %d\n",
		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
	return ret;
}

/*
 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
 * to the MDS or over pNFS
 *
 * The nfs_inode read_io and write_io fields are cumulative counters reset
 * when there are no layout segments. Note that in pnfs_update_layout iomode
 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
 * WRITE request.
 *
 * A return of true means use MDS I/O.
 *
 * From rfc 5661:
 * If a file's size is smaller than the file size threshold, data accesses
 * SHOULD be sent to the metadata server.  If an I/O request has a length that
 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
 * server.  If both file size and I/O size are provided, the client SHOULD
 * reach or exceed both thresholds before sending its read or write
 * requests to the data server.
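 *
 * Worked example (illustrative values): with rd_sz = 1 MB and
 * rd_io_sz = 64 kB both set, a 100 kB file with only 4 kB of accumulated
 * read I/O is below both thresholds, so reads go to the MDS; a 10 MB file
 * with 128 kB of accumulated read I/O meets both, so reads may use pNFS.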
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
				     struct inode *ino, int iomode)
{
	struct nfs4_threshold *t = ctx->mdsthreshold;
	struct nfs_inode *nfsi = NFS_I(ino);
	loff_t fsize = i_size_read(ino);
	bool size = false, size_set = false, io = false, io_set = false, ret = false;

	if (t == NULL)
		return ret;

	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

	switch (iomode) {
	case IOMODE_READ:
		if (t->bm & THRESHOLD_RD) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->rd_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_RD_IO) {
			dprintk("%s nfsi->read_io %llu\n", __func__,
				nfsi->read_io);
			io_set = true;
			if (nfsi->read_io < t->rd_io_sz)
				io = true;
		}
		break;
	case IOMODE_RW:
		if (t->bm & THRESHOLD_WR) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->wr_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_WR_IO) {
			dprintk("%s nfsi->write_io %llu\n", __func__,
				nfsi->write_io);
			io_set = true;
			if (nfsi->write_io < t->wr_io_sz)
				io = true;
		}
		break;
	}
	if (size_set && io_set) {
		if (size && io)
			ret = true;
	} else if (size || io)
		ret = true;

	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
	return ret;
}

/*
 * Layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
		   struct nfs_open_context *ctx,
		   loff_t pos,
		   u64 count,
		   enum pnfs_iomode iomode,
		   gfp_t gfp_flags)
{
	struct pnfs_layout_range arg = {
		.iomode = iomode,
		.offset = pos,
		.length = count,
	};
	unsigned pg_offset;
	struct nfs_server *server = NFS_SERVER(ino);
	struct nfs_client *clp = server->nfs_client;
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg = NULL;
	bool first;

	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
		goto out;

	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
		goto out;

	spin_lock(&ino->i_lock);
	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
	if (lo == NULL) {
		spin_unlock(&ino->i_lock);
		goto out;
	}

	/* Do we even need to bother with this? */
	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
		dprintk("%s matches recall, use MDS\n", __func__);
		goto out_unlock;
	}

	/* if LAYOUTGET already failed once we don't try again */
	if (pnfs_layout_io_test_failed(lo, iomode))
		goto out_unlock;

	/* Check to see if the layout for the given range already exists */
	lseg = pnfs_find_lseg(lo, &arg);
	if (lseg)
		goto out_unlock;

	if (pnfs_layoutgets_blocked(lo, 0))
		goto out_unlock;
	atomic_inc(&lo->plh_outstanding);

	first = list_empty(&lo->plh_layouts) ? true : false;
	spin_unlock(&ino->i_lock);

	if (first) {
		/* The lo must be on the clp list if there is any
		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
		 */
		spin_lock(&clp->cl_lock);
		list_add_tail(&lo->plh_layouts, &server->layouts);
		spin_unlock(&clp->cl_lock);
	}

	pg_offset = arg.offset & ~PAGE_CACHE_MASK;
	if (pg_offset) {
		arg.offset -= pg_offset;
		arg.length += pg_offset;
	}
	if (arg.length != NFS4_MAX_UINT64)
		arg.length = PAGE_CACHE_ALIGN(arg.length);

	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
	atomic_dec(&lo->plh_outstanding);
out_put_layout_hdr:
	pnfs_put_layout_hdr(lo);
out:
	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
			"(%s, offset: %llu, length: %llu)\n",
			__func__, ino->i_sb->s_id,
			(unsigned long long)NFS_FILEID(ino),
			lseg == NULL ? "not found" : "found",
			iomode==IOMODE_RW ?  "read/write" : "read-only",
			(unsigned long long)pos,
			(unsigned long long)count);
	return lseg;
out_unlock:
	spin_unlock(&ino->i_lock);
	goto out_put_layout_hdr;
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);

struct pnfs_layout_segment *
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
	struct nfs4_layoutget_res *res = &lgp->res;
	struct pnfs_layout_segment *lseg;
	struct inode *ino = lo->plh_inode;
	int status = 0;

	/* Inject layout blob into I/O device driver */
	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
	if (!lseg || IS_ERR(lseg)) {
		if (!lseg)
			status = -ENOMEM;
		else
			status = PTR_ERR(lseg);
		dprintk("%s: Could not allocate layout: error %d\n",
		       __func__, status);
		goto out;
	}

	spin_lock(&ino->i_lock);
	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
		dprintk("%s forget reply due to recall\n", __func__);
		goto out_forget_reply;
	}

	if (pnfs_layoutgets_blocked(lo, 1) ||
	    pnfs_layout_stateid_blocked(lo, &res->stateid)) {
		dprintk("%s forget reply due to state\n", __func__);
		goto out_forget_reply;
	}

	/* Done processing layoutget. Set the layout stateid */
	pnfs_set_layout_stateid(lo, &res->stateid, false);

	init_lseg(lo, lseg);
	lseg->pls_range = res->range;
	pnfs_get_lseg(lseg);
	pnfs_layout_insert_lseg(lo, lseg);

	if (res->return_on_close) {
		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
	}

	spin_unlock(&ino->i_lock);
	return lseg;
out:
	return ERR_PTR(status);

out_forget_reply:
	spin_unlock(&ino->i_lock);
	lseg->pls_layout = lo;
	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
	goto out;
}

void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
	u64 rd_size = req->wb_bytes;

	WARN_ON_ONCE(pgio->pg_lseg != NULL);

	if (req->wb_offset != req->wb_pgbase) {
		nfs_pageio_reset_read_mds(pgio);
		return;
	}

	if (pgio->pg_dreq == NULL)
		rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
	else
		rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);

	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
					   req->wb_context,
					   req_offset(req),
					   rd_size,
					   IOMODE_READ,
					   GFP_KERNEL);
	/* If no lseg, fall back to read through mds */
	if (pgio->pg_lseg == NULL)
		nfs_pageio_reset_read_mds(pgio);

}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);

void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
			   struct nfs_page *req, u64 wb_size)
{
	WARN_ON_ONCE(pgio->pg_lseg != NULL);

	if (req->wb_offset != req->wb_pgbase) {
		nfs_pageio_reset_write_mds(pgio);
		return;
	}

	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
					   req->wb_context,
					   req_offset(req),
					   wb_size,
					   IOMODE_RW,
					   GFP_NOFS);
	/* If no lseg, fall back to write through mds */
	if (pgio->pg_lseg == NULL)
		nfs_pageio_reset_write_mds(pgio);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);

void
pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
		      const struct nfs_pgio_completion_ops *compl_ops)
{
	struct nfs_server *server = NFS_SERVER(inode);
	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

	if (ld == NULL)
		nfs_pageio_init_read(pgio, inode, compl_ops);
	else
		nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0);
}

void
pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
		       int ioflags,
		       const struct nfs_pgio_completion_ops *compl_ops)
{
	struct nfs_server *server = NFS_SERVER(inode);
	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

	if (ld == NULL)
		nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
	else
		nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
}

bool
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		     struct nfs_page *req)
{
	if (pgio->pg_lseg == NULL)
		return nfs_generic_pg_test(pgio, prev, req);

	/*
	 * Test if a nfs_page is fully contained in the pnfs_layout_range.
	 * Note that this test makes several assumptions:
	 * - that the previous nfs_page in the struct nfs_pageio_descriptor
	 *   is known to lie within the range.
	 * - that the nfs_page being tested is known to be contiguous with the
	 *   previous nfs_page.
	 * - Layout ranges are page aligned, so we only have to test the
	 *   start offset of the request.
	 *
	 * Please also note that 'end_offset' is actually the offset of the
	 * first byte that lies outside the pnfs_layout_range. FIXME?
	 *
	 */
	return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
					 pgio->pg_lseg->pls_range.length);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

int pnfs_write_done_resend_to_mds(struct inode *inode,
				struct list_head *head,
				const struct nfs_pgio_completion_ops *compl_ops,
				struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor pgio;
	LIST_HEAD(failed);

	/* Resend all requests through the MDS */
	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
	pgio.pg_dreq = dreq;
	while (!list_empty(head)) {
		struct nfs_page *req = nfs_list_entry(head->next);

		nfs_list_remove_request(req);
		if (!nfs_pageio_add_request(&pgio, req))
			nfs_list_add_request(req, &failed);
	}
	nfs_pageio_complete(&pgio);

	if (!list_empty(&failed)) {
		/* For some reason our attempt to resend pages failed. Mark
		 * the overall send request as having failed, and let
		 * nfs_writeback_release_full deal with the error.
		 */
		list_move(&failed, head);
		return -EIO;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);

static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR) {
		pnfs_return_layout(hdr->inode);
	}
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
							&hdr->pages,
							hdr->completion_ops,
							hdr->dreq);
}

/*
 * Called by non rpc-based layout drivers
 */
void pnfs_ld_write_done(struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	trace_nfs4_pnfs_write(data, hdr->pnfs_error);
	if (!hdr->pnfs_error) {
		pnfs_set_layoutcommit(data);
		hdr->mds_ops->rpc_call_done(&data->task, data);
	} else
		pnfs_ld_handle_write_error(data);
	hdr->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);

static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
		struct nfs_write_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		list_splice_tail_init(&hdr->pages, &desc->pg_list);
		nfs_pageio_reset_write_mds(desc);
		desc->pg_recoalesce = 1;
	}
	nfs_writedata_release(data);
}

static enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_write_data *wdata,
			const struct rpc_call_ops *call_ops,
			struct pnfs_layout_segment *lseg,
			int how)
{
	struct nfs_pgio_header *hdr = wdata->header;
	struct inode *inode = hdr->inode;
	enum pnfs_try_status trypnfs;
	struct nfs_server *nfss = NFS_SERVER(inode);

	hdr->mds_ops = call_ops;

	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
		inode->i_ino, wdata->args.count, wdata->args.offset, how);
	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
	if (trypnfs != PNFS_NOT_ATTEMPTED)
		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
	return trypnfs;
}

static void
pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
{
	struct nfs_write_data *data;
	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
	struct pnfs_layout_segment *lseg = desc->pg_lseg;

	desc->pg_lseg = NULL;
	while (!list_empty(head)) {
		enum pnfs_try_status trypnfs;

		data = list_first_entry(head, struct nfs_write_data, list);
		list_del_init(&data->list);

		trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
		if (trypnfs == PNFS_NOT_ATTEMPTED)
			pnfs_write_through_mds(desc, data);
	}
	pnfs_put_lseg(lseg);
}

static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
{
	pnfs_put_lseg(hdr->lseg);
	nfs_writehdr_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_writehdr_free);

int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
	struct nfs_write_header *whdr;
	struct nfs_pgio_header *hdr;
	int ret;

	whdr = nfs_writehdr_alloc();
	if (!whdr) {
		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
		pnfs_put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
		return -ENOMEM;
	}
	hdr = &whdr->header;
	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
	atomic_inc(&hdr->refcnt);
	ret = nfs_generic_flush(desc, hdr);
	if (ret != 0) {
		pnfs_put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
	} else
		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
	if (atomic_dec_and_test(&hdr->refcnt))
		hdr->completion_ops->completion(hdr);
	return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);

int pnfs_read_done_resend_to_mds(struct inode *inode,
				struct list_head *head,
				const struct nfs_pgio_completion_ops *compl_ops,
				struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor pgio;
	LIST_HEAD(failed);

	/* Resend all requests through the MDS */
	nfs_pageio_init_read(&pgio, inode, compl_ops);
	pgio.pg_dreq = dreq;
	while (!list_empty(head)) {
		struct nfs_page *req = nfs_list_entry(head->next);

		nfs_list_remove_request(req);
		if (!nfs_pageio_add_request(&pgio, req))
			nfs_list_add_request(req, &failed);
	}
	nfs_pageio_complete(&pgio);

	if (!list_empty(&failed)) {
		list_move(&failed, head);
		return -EIO;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);

static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR) {
		pnfs_return_layout(hdr->inode);
	}
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
							&hdr->pages,
							hdr->completion_ops,
							hdr->dreq);
}

/*
 * Called by non rpc-based layout drivers
 */
void pnfs_ld_read_done(struct nfs_read_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	trace_nfs4_pnfs_read(data, hdr->pnfs_error);
	if (likely(!hdr->pnfs_error)) {
		__nfs4_read_done_cb(data);
		hdr->mds_ops->rpc_call_done(&data->task, data);
	} else
		pnfs_ld_handle_read_error(data);
	hdr->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);

static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
		struct nfs_read_data *data)
{
	struct nfs_pgio_header *hdr = data->header;

	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		list_splice_tail_init(&hdr->pages, &desc->pg_list);
		nfs_pageio_reset_read_mds(desc);
		desc->pg_recoalesce = 1;
	}
	nfs_readdata_release(data);
}

/*
 * Call the appropriate parallel I/O subsystem read function.
 */
static enum pnfs_try_status
pnfs_try_to_read_data(struct nfs_read_data *rdata,
		       const struct rpc_call_ops *call_ops,
		       struct pnfs_layout_segment *lseg)
{
	struct nfs_pgio_header *hdr = rdata->header;
	struct inode *inode = hdr->inode;
	struct nfs_server *nfss = NFS_SERVER(inode);
	enum pnfs_try_status trypnfs;

	hdr->mds_ops = call_ops;

	dprintk("%s: Reading ino:%lu %u@%llu\n",
		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);

	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
	if (trypnfs != PNFS_NOT_ATTEMPTED)
		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
	return trypnfs;
}

static void
pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
{
	struct nfs_read_data *data;
	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
	struct pnfs_layout_segment *lseg = desc->pg_lseg;

	desc->pg_lseg = NULL;
	while (!list_empty(head)) {
		enum pnfs_try_status trypnfs;

		data = list_first_entry(head, struct nfs_read_data, list);
		list_del_init(&data->list);

		trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
		if (trypnfs == PNFS_NOT_ATTEMPTED)
			pnfs_read_through_mds(desc, data);
	}
	pnfs_put_lseg(lseg);
}

static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
	pnfs_put_lseg(hdr->lseg);
	nfs_readhdr_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_readhdr_free);

int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
	struct nfs_read_header *rhdr;
	struct nfs_pgio_header *hdr;
	int ret;

	rhdr = nfs_readhdr_alloc();
	if (!rhdr) {
		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
		ret = -ENOMEM;
		pnfs_put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
		return ret;
	}
	hdr = &rhdr->header;
	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
	atomic_inc(&hdr->refcnt);
	ret = nfs_generic_pagein(desc, hdr);
	if (ret != 0) {
		pnfs_put_lseg(desc->pg_lseg);
		desc->pg_lseg = NULL;
	} else
		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
	if (atomic_dec_and_test(&hdr->refcnt))
		hdr->completion_ops->completion(hdr);
	return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);

/*
 * There can be multiple RW segments.
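 * Every IOMODE_RW segment whose NFS_LSEG_LAYOUTCOMMIT bit is set (i.e. it
 * has been written through since the last LAYOUTCOMMIT) is collected onto
 * listp so that the pending LAYOUTCOMMIT covers all of them.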
 */
static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
{
	struct pnfs_layout_segment *lseg;

	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
		if (lseg->pls_range.iomode == IOMODE_RW &&
		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
			list_add(&lseg->pls_lc_list, listp);
	}
}

static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
{
	struct pnfs_layout_segment *lseg, *tmp;
	unsigned long *bitlock = &NFS_I(inode)->flags;

	/* Matched by references in pnfs_set_layoutcommit */
	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
		list_del_init(&lseg->pls_lc_list);
		pnfs_put_lseg(lseg);
	}

	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
	smp_mb__after_clear_bit();
	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
}

void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
{
	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
}
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);

void
pnfs_set_layoutcommit(struct nfs_write_data *wdata)
{
	struct nfs_pgio_header *hdr = wdata->header;
	struct inode *inode = hdr->inode;
	struct nfs_inode *nfsi = NFS_I(inode);
	loff_t end_pos = wdata->mds_offset + wdata->res.count;
	bool mark_as_dirty = false;

	spin_lock(&inode->i_lock);
	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
		mark_as_dirty = true;
		dprintk("%s: Set layoutcommit for inode %lu ",
			__func__, inode->i_ino);
	}
	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
		/* references matched in nfs4_layoutcommit_release */
		pnfs_get_lseg(hdr->lseg);
	}
	if (end_pos > nfsi->layout->plh_lwb)
		nfsi->layout->plh_lwb = end_pos;
	spin_unlock(&inode->i_lock);
	dprintk("%s: lseg %p end_pos %llu\n",
		__func__, hdr->lseg, nfsi->layout->plh_lwb);

	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
	if (mark_as_dirty)
		mark_inode_dirty_sync(inode);
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);

void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
	struct nfs_server *nfss = NFS_SERVER(data->args.inode);

	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
}

/*
 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
 * data to disk to allow the server to recover the data if it crashes.
 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
 * is off, and a COMMIT is sent to a data server, or
 * if WRITEs to a data server return NFS_DATA_SYNC.
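 *
 * In short: whenever data reaches stable storage on the data servers
 * without passing through the MDS, a LAYOUTCOMMIT is needed so the MDS
 * learns the new size and change attributes.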
 */
int
pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
	struct nfs4_layoutcommit_data *data;
	struct nfs_inode *nfsi = NFS_I(inode);
	loff_t end_pos;
	int status = 0;

	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);

	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		return 0;

	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
	data = kzalloc(sizeof(*data), GFP_NOFS);
	if (!data) {
		status = -ENOMEM;
		goto out;
	}

	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		goto out_free;

	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
		if (!sync) {
			status = -EAGAIN;
			goto out_free;
		}
		status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
					nfs_wait_bit_killable, TASK_KILLABLE);
		if (status)
			goto out_free;
	}

	INIT_LIST_HEAD(&data->lseg_list);
	spin_lock(&inode->i_lock);
	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
		clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
		spin_unlock(&inode->i_lock);
		wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
		goto out_free;
	}

	pnfs_list_write_lseg(inode, &data->lseg_list);

	end_pos = nfsi->layout->plh_lwb;
	nfsi->layout->plh_lwb = 0;

	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
	spin_unlock(&inode->i_lock);

	data->args.inode = inode;
	data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
	nfs_fattr_init(&data->fattr);
	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
	data->res.fattr = &data->fattr;
	data->args.lastbytewritten = end_pos - 1;
	data->res.server = NFS_SERVER(inode);

	status = nfs4_proc_layoutcommit(data, sync);
out:
	if (status)
		mark_inode_dirty_sync(inode);
	dprintk("<-- %s status %d\n", __func__, status);
	return status;
out_free:
	kfree(data);
	goto out;
}

struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
	struct nfs4_threshold *thp;

	thp = kzalloc(sizeof(*thp), GFP_NOFS);
	if (!thp) {
		dprintk("%s mdsthreshold allocation failed\n", __func__);
		return NULL;
	}
	return thp;
}