1 /* 2 * Device operations for the pnfs nfs4 file layout driver. 3 * 4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved. 5 * 6 * Tao Peng <bergwolf@primarydata.com> 7 */ 8 9 #include <linux/nfs_fs.h> 10 #include <linux/vmalloc.h> 11 #include <linux/module.h> 12 #include <linux/sunrpc/addr.h> 13 14 #include "../internal.h" 15 #include "../nfs4session.h" 16 #include "flexfilelayout.h" 17 18 #define NFSDBG_FACILITY NFSDBG_PNFS_LD 19 20 static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; 21 static unsigned int dataserver_retrans; 22 23 static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); 24 25 void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) 26 { 27 if (!IS_ERR_OR_NULL(mirror_ds)) 28 nfs4_put_deviceid_node(&mirror_ds->id_node); 29 } 30 31 void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) 32 { 33 nfs4_print_deviceid(&mirror_ds->id_node.deviceid); 34 nfs4_pnfs_ds_put(mirror_ds->ds); 35 kfree_rcu(mirror_ds, id_node.rcu); 36 } 37 38 /* Decode opaque device data and construct new_ds using it */ 39 struct nfs4_ff_layout_ds * 40 nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, 41 gfp_t gfp_flags) 42 { 43 struct xdr_stream stream; 44 struct xdr_buf buf; 45 struct page *scratch; 46 struct list_head dsaddrs; 47 struct nfs4_pnfs_ds_addr *da; 48 struct nfs4_ff_layout_ds *new_ds = NULL; 49 struct nfs4_ff_ds_version *ds_versions = NULL; 50 u32 mp_count; 51 u32 version_count; 52 __be32 *p; 53 int i, ret = -ENOMEM; 54 55 /* set up xdr stream */ 56 scratch = alloc_page(gfp_flags); 57 if (!scratch) 58 goto out_err; 59 60 new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags); 61 if (!new_ds) 62 goto out_scratch; 63 64 nfs4_init_deviceid_node(&new_ds->id_node, 65 server, 66 &pdev->dev_id); 67 INIT_LIST_HEAD(&dsaddrs); 68 69 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); 70 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 71 72 /* multipath count */ 73 p = xdr_inline_decode(&stream, 4); 74 if (unlikely(!p)) 75 goto out_err_drain_dsaddrs; 76 mp_count = be32_to_cpup(p); 77 dprintk("%s: multipath ds count %d\n", __func__, mp_count); 78 79 for (i = 0; i < mp_count; i++) { 80 /* multipath ds */ 81 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, 82 &stream, gfp_flags); 83 if (da) 84 list_add_tail(&da->da_node, &dsaddrs); 85 } 86 if (list_empty(&dsaddrs)) { 87 dprintk("%s: no suitable DS addresses found\n", 88 __func__); 89 ret = -ENOMEDIUM; 90 goto out_err_drain_dsaddrs; 91 } 92 93 /* version count */ 94 p = xdr_inline_decode(&stream, 4); 95 if (unlikely(!p)) 96 goto out_err_drain_dsaddrs; 97 version_count = be32_to_cpup(p); 98 dprintk("%s: version count %d\n", __func__, version_count); 99 100 ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version), 101 gfp_flags); 102 if (!ds_versions) 103 goto out_scratch; 104 105 for (i = 0; i < version_count; i++) { 106 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) + 107 * tightly_coupled(4) */ 108 p = xdr_inline_decode(&stream, 20); 109 if (unlikely(!p)) 110 goto out_err_drain_dsaddrs; 111 ds_versions[i].version = be32_to_cpup(p++); 112 ds_versions[i].minor_version = be32_to_cpup(p++); 113 ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); 114 ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); 115 ds_versions[i].tightly_coupled = be32_to_cpup(p); 116 117 if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) 118 ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE; 119 if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) 120 ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; 121 122 if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { 123 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, 124 i, ds_versions[i].version, 125 ds_versions[i].minor_version); 126 ret = -EPROTONOSUPPORT; 127 goto out_err_drain_dsaddrs; 128 } 129 130 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n", 131 __func__, i, ds_versions[i].version, 132 ds_versions[i].minor_version, 133 ds_versions[i].rsize, 134 ds_versions[i].wsize, 135 ds_versions[i].tightly_coupled); 136 } 137 138 new_ds->ds_versions = ds_versions; 139 new_ds->ds_versions_cnt = version_count; 140 141 new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); 142 if (!new_ds->ds) 143 goto out_err_drain_dsaddrs; 144 145 /* If DS was already in cache, free ds addrs */ 146 while (!list_empty(&dsaddrs)) { 147 da = list_first_entry(&dsaddrs, 148 struct nfs4_pnfs_ds_addr, 149 da_node); 150 list_del_init(&da->da_node); 151 kfree(da->da_remotestr); 152 kfree(da); 153 } 154 155 __free_page(scratch); 156 return new_ds; 157 158 out_err_drain_dsaddrs: 159 while (!list_empty(&dsaddrs)) { 160 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, 161 da_node); 162 list_del_init(&da->da_node); 163 kfree(da->da_remotestr); 164 kfree(da); 165 } 166 167 kfree(ds_versions); 168 out_scratch: 169 __free_page(scratch); 170 out_err: 171 kfree(new_ds); 172 173 dprintk("%s ERROR: returning %d\n", __func__, ret); 174 return NULL; 175 } 176 177 static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, 178 struct nfs4_deviceid_node *devid) 179 { 180 nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); 181 if (!ff_layout_has_available_ds(lseg)) 182 pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, 183 lseg); 184 } 185 186 static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, 187 struct nfs4_ff_layout_mirror *mirror, 188 bool create) 189 { 190 if (mirror == NULL || IS_ERR(mirror->mirror_ds)) 191 goto outerr; 192 if (mirror->mirror_ds == NULL) { 193 if (create) { 194 struct nfs4_deviceid_node *node; 195 struct pnfs_layout_hdr *lh = lseg->pls_layout; 196 struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); 197 198 node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), 199 &mirror->devid, lh->plh_lc_cred, 200 GFP_KERNEL); 201 if (node) 202 mirror_ds = FF_LAYOUT_MIRROR_DS(node); 203 204 /* check for race with another call to this function */ 205 if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && 206 mirror_ds != ERR_PTR(-ENODEV)) 207 nfs4_put_deviceid_node(node); 208 } else 209 goto outerr; 210 } 211 if (mirror->mirror_ds->ds == NULL) { 212 struct nfs4_deviceid_node *devid; 213 devid = &mirror->mirror_ds->id_node; 214 ff_layout_mark_devid_invalid(lseg, devid); 215 return false; 216 } 217 return true; 218 outerr: 219 pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); 220 return false; 221 } 222 223 static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, 224 u64 offset, u64 length) 225 { 226 u64 end; 227 228 end = max_t(u64, pnfs_end_offset(err->offset, err->length), 229 pnfs_end_offset(offset, length)); 230 err->offset = min_t(u64, err->offset, offset); 231 err->length = end - err->offset; 232 } 233 234 static int 235 ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, 236 const struct nfs4_ff_layout_ds_err *e2) 237 { 238 int ret; 239 240 if (e1->opnum != e2->opnum) 241 return e1->opnum < e2->opnum ? -1 : 1; 242 if (e1->status != e2->status) 243 return e1->status < e2->status ? -1 : 1; 244 ret = memcmp(e1->stateid.data, e2->stateid.data, 245 sizeof(e1->stateid.data)); 246 if (ret != 0) 247 return ret; 248 ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); 249 if (ret != 0) 250 return ret; 251 if (pnfs_end_offset(e1->offset, e1->length) < e2->offset) 252 return -1; 253 if (e1->offset > pnfs_end_offset(e2->offset, e2->length)) 254 return 1; 255 /* If ranges overlap or are contiguous, they are the same */ 256 return 0; 257 } 258 259 static void 260 ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, 261 struct nfs4_ff_layout_ds_err *dserr) 262 { 263 struct nfs4_ff_layout_ds_err *err, *tmp; 264 struct list_head *head = &flo->error_list; 265 int match; 266 267 /* Do insertion sort w/ merges */ 268 list_for_each_entry_safe(err, tmp, &flo->error_list, list) { 269 match = ff_ds_error_match(err, dserr); 270 if (match < 0) 271 continue; 272 if (match > 0) { 273 /* Add entry "dserr" _before_ entry "err" */ 274 head = &err->list; 275 break; 276 } 277 /* Entries match, so merge "err" into "dserr" */ 278 extend_ds_error(dserr, err->offset, err->length); 279 list_replace(&err->list, &dserr->list); 280 kfree(err); 281 return; 282 } 283 284 list_add_tail(&dserr->list, head); 285 } 286 287 int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, 288 struct nfs4_ff_layout_mirror *mirror, u64 offset, 289 u64 length, int status, enum nfs_opnum4 opnum, 290 gfp_t gfp_flags) 291 { 292 struct nfs4_ff_layout_ds_err *dserr; 293 294 if (status == 0) 295 return 0; 296 297 if (mirror->mirror_ds == NULL) 298 return -EINVAL; 299 300 dserr = kmalloc(sizeof(*dserr), gfp_flags); 301 if (!dserr) 302 return -ENOMEM; 303 304 INIT_LIST_HEAD(&dserr->list); 305 dserr->offset = offset; 306 dserr->length = length; 307 dserr->status = status; 308 dserr->opnum = opnum; 309 nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); 310 memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, 311 NFS4_DEVICEID4_SIZE); 312 313 spin_lock(&flo->generic_hdr.plh_inode->i_lock); 314 ff_layout_add_ds_error_locked(flo, dserr); 315 spin_unlock(&flo->generic_hdr.plh_inode->i_lock); 316 317 return 0; 318 } 319 320 static struct rpc_cred * 321 ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) 322 { 323 struct rpc_cred *cred, __rcu **pcred; 324 325 if (iomode == IOMODE_READ) 326 pcred = &mirror->ro_cred; 327 else 328 pcred = &mirror->rw_cred; 329 330 rcu_read_lock(); 331 do { 332 cred = rcu_dereference(*pcred); 333 if (!cred) 334 break; 335 336 cred = get_rpccred_rcu(cred); 337 } while(!cred); 338 rcu_read_unlock(); 339 return cred; 340 } 341 342 struct nfs_fh * 343 nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) 344 { 345 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); 346 struct nfs_fh *fh = NULL; 347 348 if (!ff_layout_mirror_valid(lseg, mirror, false)) { 349 pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", 350 __func__, mirror_idx); 351 goto out; 352 } 353 354 /* FIXME: For now assume there is only 1 version available for the DS */ 355 fh = &mirror->fh_versions[0]; 356 out: 357 return fh; 358 } 359 360 /** 361 * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call 362 * @lseg: the layout segment we're operating on 363 * @ds_idx: index of the DS to use 364 * @fail_return: return layout on connect failure? 365 * 366 * Try to prepare a DS connection to accept an RPC call. This involves 367 * selecting a mirror to use and connecting the client to it if it's not 368 * already connected. 369 * 370 * Since we only need a single functioning mirror to satisfy a read, we don't 371 * want to return the layout if there is one. For writes though, any down 372 * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish 373 * between the two cases. 374 * 375 * Returns a pointer to a connected DS object on success or NULL on failure. 376 */ 377 struct nfs4_pnfs_ds * 378 nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, 379 bool fail_return) 380 { 381 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); 382 struct nfs4_pnfs_ds *ds = NULL; 383 struct nfs4_deviceid_node *devid; 384 struct inode *ino = lseg->pls_layout->plh_inode; 385 struct nfs_server *s = NFS_SERVER(ino); 386 unsigned int max_payload; 387 int status; 388 389 if (!ff_layout_mirror_valid(lseg, mirror, true)) { 390 pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", 391 __func__, ds_idx); 392 goto out; 393 } 394 395 devid = &mirror->mirror_ds->id_node; 396 if (ff_layout_test_devid_unavailable(devid)) 397 goto out_fail; 398 399 ds = mirror->mirror_ds->ds; 400 /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ 401 smp_rmb(); 402 if (ds->ds_clp) 403 goto out; 404 405 /* FIXME: For now we assume the server sent only one version of NFS 406 * to use for the DS. 407 */ 408 status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, 409 dataserver_retrans, 410 mirror->mirror_ds->ds_versions[0].version, 411 mirror->mirror_ds->ds_versions[0].minor_version); 412 413 /* connect success, check rsize/wsize limit */ 414 if (ds->ds_clp) { 415 max_payload = 416 nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), 417 NULL); 418 if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) 419 mirror->mirror_ds->ds_versions[0].rsize = max_payload; 420 if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) 421 mirror->mirror_ds->ds_versions[0].wsize = max_payload; 422 goto out; 423 } 424 out_fail: 425 ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), 426 mirror, lseg->pls_range.offset, 427 lseg->pls_range.length, NFS4ERR_NXIO, 428 OP_ILLEGAL, GFP_NOIO); 429 if (fail_return || !ff_layout_has_available_ds(lseg)) 430 pnfs_error_mark_layout_for_return(ino, lseg); 431 ds = NULL; 432 out: 433 return ds; 434 } 435 436 struct rpc_cred * 437 ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, 438 struct rpc_cred *mdscred) 439 { 440 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); 441 struct rpc_cred *cred; 442 443 if (mirror) { 444 cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); 445 if (!cred) 446 cred = get_rpccred(mdscred); 447 } else { 448 cred = get_rpccred(mdscred); 449 } 450 return cred; 451 } 452 453 /** 454 * Find or create a DS rpc client with th MDS server rpc client auth flavor 455 * in the nfs_client cl_ds_clients list. 456 */ 457 struct rpc_clnt * 458 nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, 459 struct nfs_client *ds_clp, struct inode *inode) 460 { 461 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); 462 463 switch (mirror->mirror_ds->ds_versions[0].version) { 464 case 3: 465 /* For NFSv3 DS, flavor is set when creating DS connections */ 466 return ds_clp->cl_rpcclient; 467 case 4: 468 return nfs4_find_or_create_ds_client(ds_clp, inode); 469 default: 470 BUG(); 471 } 472 } 473 474 void ff_layout_free_ds_ioerr(struct list_head *head) 475 { 476 struct nfs4_ff_layout_ds_err *err; 477 478 while (!list_empty(head)) { 479 err = list_first_entry(head, 480 struct nfs4_ff_layout_ds_err, 481 list); 482 list_del(&err->list); 483 kfree(err); 484 } 485 } 486 487 /* called with inode i_lock held */ 488 int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head) 489 { 490 struct nfs4_ff_layout_ds_err *err; 491 __be32 *p; 492 493 list_for_each_entry(err, head, list) { 494 /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) 495 * + array length + deviceid(NFS4_DEVICEID4_SIZE) 496 * + status(4) + opnum(4) 497 */ 498 p = xdr_reserve_space(xdr, 499 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); 500 if (unlikely(!p)) 501 return -ENOBUFS; 502 p = xdr_encode_hyper(p, err->offset); 503 p = xdr_encode_hyper(p, err->length); 504 p = xdr_encode_opaque_fixed(p, &err->stateid, 505 NFS4_STATEID_SIZE); 506 /* Encode 1 error */ 507 *p++ = cpu_to_be32(1); 508 p = xdr_encode_opaque_fixed(p, &err->deviceid, 509 NFS4_DEVICEID4_SIZE); 510 *p++ = cpu_to_be32(err->status); 511 *p++ = cpu_to_be32(err->opnum); 512 dprintk("%s: offset %llu length %llu status %d op %d\n", 513 __func__, err->offset, err->length, err->status, 514 err->opnum); 515 } 516 517 return 0; 518 } 519 520 static 521 unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, 522 const struct pnfs_layout_range *range, 523 struct list_head *head, 524 unsigned int maxnum) 525 { 526 struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); 527 struct inode *inode = lo->plh_inode; 528 struct nfs4_ff_layout_ds_err *err, *n; 529 unsigned int ret = 0; 530 531 spin_lock(&inode->i_lock); 532 list_for_each_entry_safe(err, n, &flo->error_list, list) { 533 if (!pnfs_is_range_intersecting(err->offset, 534 pnfs_end_offset(err->offset, err->length), 535 range->offset, 536 pnfs_end_offset(range->offset, range->length))) 537 continue; 538 if (!maxnum) 539 break; 540 list_move(&err->list, head); 541 maxnum--; 542 ret++; 543 } 544 spin_unlock(&inode->i_lock); 545 return ret; 546 } 547 548 unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, 549 const struct pnfs_layout_range *range, 550 struct list_head *head, 551 unsigned int maxnum) 552 { 553 unsigned int ret; 554 555 ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum); 556 /* If we're over the max, discard all remaining entries */ 557 if (ret == maxnum) { 558 LIST_HEAD(discard); 559 do_layout_fetch_ds_ioerr(lo, range, &discard, -1); 560 ff_layout_free_ds_ioerr(&discard); 561 } 562 return ret; 563 } 564 565 static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) 566 { 567 struct nfs4_ff_layout_mirror *mirror; 568 struct nfs4_deviceid_node *devid; 569 u32 idx; 570 571 for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { 572 mirror = FF_LAYOUT_COMP(lseg, idx); 573 if (mirror) { 574 if (!mirror->mirror_ds) 575 return true; 576 if (IS_ERR(mirror->mirror_ds)) 577 continue; 578 devid = &mirror->mirror_ds->id_node; 579 if (!ff_layout_test_devid_unavailable(devid)) 580 return true; 581 } 582 } 583 584 return false; 585 } 586 587 static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) 588 { 589 struct nfs4_ff_layout_mirror *mirror; 590 struct nfs4_deviceid_node *devid; 591 u32 idx; 592 593 for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { 594 mirror = FF_LAYOUT_COMP(lseg, idx); 595 if (!mirror || IS_ERR(mirror->mirror_ds)) 596 return false; 597 if (!mirror->mirror_ds) 598 continue; 599 devid = &mirror->mirror_ds->id_node; 600 if (ff_layout_test_devid_unavailable(devid)) 601 return false; 602 } 603 604 return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; 605 } 606 607 static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) 608 { 609 if (lseg->pls_range.iomode == IOMODE_READ) 610 return ff_read_layout_has_available_ds(lseg); 611 /* Note: RW layout needs all mirrors available */ 612 return ff_rw_layout_has_available_ds(lseg); 613 } 614 615 bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg) 616 { 617 return ff_layout_no_fallback_to_mds(lseg) || 618 ff_layout_has_available_ds(lseg); 619 } 620 621 bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg) 622 { 623 return lseg->pls_range.iomode == IOMODE_RW && 624 ff_layout_no_read_on_rw(lseg); 625 } 626 627 module_param(dataserver_retrans, uint, 0644); 628 MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " 629 "retries a request before it attempts further " 630 " recovery action."); 631 module_param(dataserver_timeo, uint, 0644); 632 MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " 633 "NFSv4.1 client waits for a response from a " 634 " data server before it retries an NFS request."); 635