1 /* Handle fileserver selection and rotation. 2 * 3 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. 4 * Written by David Howells (dhowells@redhat.com) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public Licence 8 * as published by the Free Software Foundation; either version 9 * 2 of the Licence, or (at your option) any later version. 10 */ 11 12 #include <linux/kernel.h> 13 #include <linux/slab.h> 14 #include <linux/fs.h> 15 #include <linux/sched.h> 16 #include <linux/delay.h> 17 #include <linux/sched/signal.h> 18 #include "internal.h" 19 #include "afs_fs.h" 20 21 /* 22 * Initialise a filesystem server cursor for iterating over FS servers. 23 */ 24 void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) 25 { 26 memset(fc, 0, sizeof(*fc)); 27 } 28 29 /* 30 * Begin an operation on the fileserver. 31 * 32 * Fileserver operations are serialised on the server by vnode, so we serialise 33 * them here also using the io_lock. 34 */ 35 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, 36 struct key *key) 37 { 38 afs_init_fs_cursor(fc, vnode); 39 fc->vnode = vnode; 40 fc->key = key; 41 fc->ac.error = SHRT_MAX; 42 43 if (mutex_lock_interruptible(&vnode->io_lock) < 0) { 44 fc->ac.error = -EINTR; 45 fc->flags |= AFS_FS_CURSOR_STOP; 46 return false; 47 } 48 49 if (test_bit(AFS_VNODE_READLOCKED, &vnode->flags) || 50 test_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) 51 fc->flags |= AFS_FS_CURSOR_CUR_ONLY; 52 return true; 53 } 54 55 /* 56 * Begin iteration through a server list, starting with the vnode's last used 57 * server if possible, or the last recorded good server if not. 58 */ 59 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, 60 struct afs_vnode *vnode) 61 { 62 struct afs_cb_interest *cbi; 63 int i; 64 65 read_lock(&vnode->volume->servers_lock); 66 fc->server_list = afs_get_serverlist(vnode->volume->servers); 67 read_unlock(&vnode->volume->servers_lock); 68 69 cbi = vnode->cb_interest; 70 if (cbi) { 71 /* See if the vnode's preferred record is still available */ 72 for (i = 0; i < fc->server_list->nr_servers; i++) { 73 if (fc->server_list->servers[i].cb_interest == cbi) { 74 fc->start = i; 75 goto found_interest; 76 } 77 } 78 79 /* If we have a lock outstanding on a server that's no longer 80 * serving this vnode, then we can't switch to another server 81 * and have to return an error. 82 */ 83 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 84 fc->ac.error = -ESTALE; 85 return false; 86 } 87 88 /* Note that the callback promise is effectively broken */ 89 write_seqlock(&vnode->cb_lock); 90 ASSERTCMP(cbi, ==, vnode->cb_interest); 91 vnode->cb_interest = NULL; 92 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) 93 vnode->cb_break++; 94 write_sequnlock(&vnode->cb_lock); 95 96 afs_put_cb_interest(afs_v2net(vnode), cbi); 97 cbi = NULL; 98 } else { 99 fc->start = READ_ONCE(fc->server_list->index); 100 } 101 102 found_interest: 103 fc->index = fc->start; 104 return true; 105 } 106 107 /* 108 * Post volume busy note. 109 */ 110 static void afs_busy(struct afs_volume *volume, u32 abort_code) 111 { 112 const char *m; 113 114 switch (abort_code) { 115 case VOFFLINE: m = "offline"; break; 116 case VRESTARTING: m = "restarting"; break; 117 case VSALVAGING: m = "being salvaged"; break; 118 default: m = "busy"; break; 119 } 120 121 pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); 122 } 123 124 /* 125 * Sleep and retry the operation to the same fileserver. 126 */ 127 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) 128 { 129 msleep_interruptible(1000); 130 if (signal_pending(current)) { 131 fc->ac.error = -ERESTARTSYS; 132 return false; 133 } 134 135 return true; 136 } 137 138 /* 139 * Select the fileserver to use. May be called multiple times to rotate 140 * through the fileservers. 141 */ 142 bool afs_select_fileserver(struct afs_fs_cursor *fc) 143 { 144 struct afs_addr_list *alist; 145 struct afs_server *server; 146 struct afs_vnode *vnode = fc->vnode; 147 148 _enter("%u/%u,%u/%u,%d,%d", 149 fc->index, fc->start, 150 fc->ac.index, fc->ac.start, 151 fc->ac.error, fc->ac.abort_code); 152 153 if (fc->flags & AFS_FS_CURSOR_STOP) { 154 _leave(" = f [stopped]"); 155 return false; 156 } 157 158 /* Evaluate the result of the previous operation, if there was one. */ 159 switch (fc->ac.error) { 160 case SHRT_MAX: 161 goto start; 162 163 case 0: 164 default: 165 /* Success or local failure. Stop. */ 166 fc->flags |= AFS_FS_CURSOR_STOP; 167 _leave(" = f [okay/local %d]", fc->ac.error); 168 return false; 169 170 case -ECONNABORTED: 171 /* The far side rejected the operation on some grounds. This 172 * might involve the server being busy or the volume having been moved. 173 */ 174 switch (fc->ac.abort_code) { 175 case VNOVOL: 176 /* This fileserver doesn't know about the volume. 177 * - May indicate that the VL is wrong - retry once and compare 178 * the results. 179 * - May indicate that the fileserver couldn't attach to the vol. 180 */ 181 if (fc->flags & AFS_FS_CURSOR_VNOVOL) { 182 fc->ac.error = -EREMOTEIO; 183 goto failed; 184 } 185 186 write_lock(&vnode->volume->servers_lock); 187 fc->server_list->vnovol_mask |= 1 << fc->index; 188 write_unlock(&vnode->volume->servers_lock); 189 190 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); 191 fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); 192 if (fc->ac.error < 0) 193 goto failed; 194 195 if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { 196 fc->ac.error = -ENOMEDIUM; 197 goto failed; 198 } 199 200 /* If the server list didn't change, then assume that 201 * it's the fileserver having trouble. 202 */ 203 if (vnode->volume->servers == fc->server_list) { 204 fc->ac.error = -EREMOTEIO; 205 goto failed; 206 } 207 208 /* Try again */ 209 fc->flags |= AFS_FS_CURSOR_VNOVOL; 210 _leave(" = t [vnovol]"); 211 return true; 212 213 case VSALVAGE: /* TODO: Should this return an error or iterate? */ 214 case VVOLEXISTS: 215 case VNOSERVICE: 216 case VONLINE: 217 case VDISKFULL: 218 case VOVERQUOTA: 219 fc->ac.error = afs_abort_to_error(fc->ac.abort_code); 220 goto next_server; 221 222 case VOFFLINE: 223 if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { 224 afs_busy(vnode->volume, fc->ac.abort_code); 225 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); 226 } 227 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { 228 fc->ac.error = -EADV; 229 goto failed; 230 } 231 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 232 fc->ac.error = -ESTALE; 233 goto failed; 234 } 235 goto busy; 236 237 case VSALVAGING: 238 case VRESTARTING: 239 case VBUSY: 240 /* Retry after going round all the servers unless we 241 * have a file lock we need to maintain. 242 */ 243 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { 244 fc->ac.error = -EBUSY; 245 goto failed; 246 } 247 if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { 248 afs_busy(vnode->volume, fc->ac.abort_code); 249 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); 250 } 251 busy: 252 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 253 if (!afs_sleep_and_retry(fc)) 254 goto failed; 255 256 /* Retry with same server & address */ 257 _leave(" = t [vbusy]"); 258 return true; 259 } 260 261 fc->flags |= AFS_FS_CURSOR_VBUSY; 262 goto next_server; 263 264 case VMOVED: 265 /* The volume migrated to another server. We consider 266 * consider all locks and callbacks broken and request 267 * an update from the VLDB. 268 * 269 * We also limit the number of VMOVED hops we will 270 * honour, just in case someone sets up a loop. 271 */ 272 if (fc->flags & AFS_FS_CURSOR_VMOVED) { 273 fc->ac.error = -EREMOTEIO; 274 goto failed; 275 } 276 fc->flags |= AFS_FS_CURSOR_VMOVED; 277 278 set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); 279 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); 280 fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); 281 if (fc->ac.error < 0) 282 goto failed; 283 284 /* If the server list didn't change, then the VLDB is 285 * out of sync with the fileservers. This is hopefully 286 * a temporary condition, however, so we don't want to 287 * permanently block access to the file. 288 * 289 * TODO: Try other fileservers if we can. 290 * 291 * TODO: Retry a few times with sleeps. 292 */ 293 if (vnode->volume->servers == fc->server_list) { 294 fc->ac.error = -ENOMEDIUM; 295 goto failed; 296 } 297 298 goto restart_from_beginning; 299 300 default: 301 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); 302 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); 303 fc->ac.error = afs_abort_to_error(fc->ac.abort_code); 304 goto failed; 305 } 306 307 case -ENETUNREACH: 308 case -EHOSTUNREACH: 309 case -ECONNREFUSED: 310 case -ETIMEDOUT: 311 case -ETIME: 312 _debug("no conn"); 313 goto iterate_address; 314 } 315 316 restart_from_beginning: 317 _debug("restart"); 318 afs_end_cursor(&fc->ac); 319 afs_put_cb_interest(afs_v2net(vnode), fc->cbi); 320 fc->cbi = NULL; 321 afs_put_serverlist(afs_v2net(vnode), fc->server_list); 322 fc->server_list = NULL; 323 start: 324 _debug("start"); 325 /* See if we need to do an update of the volume record. Note that the 326 * volume may have moved or even have been deleted. 327 */ 328 fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); 329 if (fc->ac.error < 0) 330 goto failed; 331 332 if (!afs_start_fs_iteration(fc, vnode)) 333 goto failed; 334 goto use_server; 335 336 next_server: 337 _debug("next"); 338 afs_put_cb_interest(afs_v2net(vnode), fc->cbi); 339 fc->cbi = NULL; 340 fc->index++; 341 if (fc->index >= fc->server_list->nr_servers) 342 fc->index = 0; 343 if (fc->index != fc->start) 344 goto use_server; 345 346 /* That's all the servers poked to no good effect. Try again if some 347 * of them were busy. 348 */ 349 if (fc->flags & AFS_FS_CURSOR_VBUSY) 350 goto restart_from_beginning; 351 352 fc->ac.error = -EDESTADDRREQ; 353 goto failed; 354 355 use_server: 356 _debug("use"); 357 /* We're starting on a different fileserver from the list. We need to 358 * check it, create a callback intercept, find its address list and 359 * probe its capabilities before we use it. 360 */ 361 ASSERTCMP(fc->ac.alist, ==, NULL); 362 server = fc->server_list->servers[fc->index].server; 363 364 if (!afs_check_server_record(fc, server)) 365 goto failed; 366 367 _debug("USING SERVER: %pU", &server->uuid); 368 369 /* Make sure we've got a callback interest record for this server. We 370 * have to link it in before we send the request as we can be sent a 371 * break request before we've finished decoding the reply and 372 * installing the vnode. 373 */ 374 fc->ac.error = afs_register_server_cb_interest( 375 vnode, &fc->server_list->servers[fc->index]); 376 if (fc->ac.error < 0) 377 goto failed; 378 379 fc->cbi = afs_get_cb_interest(vnode->cb_interest); 380 381 read_lock(&server->fs_lock); 382 alist = rcu_dereference_protected(server->addresses, 383 lockdep_is_held(&server->fs_lock)); 384 afs_get_addrlist(alist); 385 read_unlock(&server->fs_lock); 386 387 388 /* Probe the current fileserver if we haven't done so yet. */ 389 if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { 390 fc->ac.alist = afs_get_addrlist(alist); 391 392 if (!afs_probe_fileserver(fc)) 393 goto failed; 394 } 395 396 if (!fc->ac.alist) 397 fc->ac.alist = alist; 398 else 399 afs_put_addrlist(alist); 400 401 fc->ac.addr = NULL; 402 fc->ac.start = READ_ONCE(alist->index); 403 fc->ac.index = fc->ac.start; 404 fc->ac.error = 0; 405 fc->ac.begun = false; 406 goto iterate_address; 407 408 iterate_address: 409 ASSERT(fc->ac.alist); 410 _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs); 411 /* Iterate over the current server's address list to try and find an 412 * address on which it will respond to us. 413 */ 414 if (afs_iterate_addresses(&fc->ac)) { 415 _leave(" = t"); 416 return true; 417 } 418 419 afs_end_cursor(&fc->ac); 420 goto next_server; 421 422 failed: 423 fc->flags |= AFS_FS_CURSOR_STOP; 424 _leave(" = f [failed %d]", fc->ac.error); 425 return false; 426 } 427 428 /* 429 * Select the same fileserver we used for a vnode before and only that 430 * fileserver. We use this when we have a lock on that file, which is backed 431 * only by the fileserver we obtained it from. 432 */ 433 bool afs_select_current_fileserver(struct afs_fs_cursor *fc) 434 { 435 struct afs_vnode *vnode = fc->vnode; 436 struct afs_cb_interest *cbi = vnode->cb_interest; 437 struct afs_addr_list *alist; 438 439 _enter(""); 440 441 if (!cbi) { 442 fc->ac.error = -ESTALE; 443 fc->flags |= AFS_FS_CURSOR_STOP; 444 return false; 445 } 446 447 read_lock(&cbi->server->fs_lock); 448 alist = afs_get_addrlist(cbi->server->addresses); 449 read_unlock(&cbi->server->fs_lock); 450 if (!alist) { 451 fc->ac.error = -ESTALE; 452 fc->flags |= AFS_FS_CURSOR_STOP; 453 return false; 454 } 455 456 fc->ac.alist = alist; 457 fc->ac.error = 0; 458 return true; 459 } 460 461 /* 462 * Tidy up a filesystem cursor and unlock the vnode. 463 */ 464 int afs_end_vnode_operation(struct afs_fs_cursor *fc) 465 { 466 struct afs_net *net = afs_v2net(fc->vnode); 467 int ret; 468 469 mutex_unlock(&fc->vnode->io_lock); 470 471 afs_end_cursor(&fc->ac); 472 afs_put_cb_interest(net, fc->cbi); 473 afs_put_serverlist(net, fc->server_list); 474 475 ret = fc->ac.error; 476 if (ret == -ECONNABORTED) 477 afs_abort_to_error(fc->ac.abort_code); 478 479 return fc->ac.error; 480 } 481 482 #if 0 483 /* 484 * Set a filesystem server cursor for using a specific FS server. 485 */ 486 int afs_set_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) 487 { 488 afs_init_fs_cursor(fc, vnode); 489 490 read_seqlock_excl(&vnode->cb_lock); 491 if (vnode->cb_interest) { 492 if (vnode->cb_interest->server->fs_state == 0) 493 fc->server = afs_get_server(vnode->cb_interest->server); 494 else 495 fc->ac.error = vnode->cb_interest->server->fs_state; 496 } else { 497 fc->ac.error = -ESTALE; 498 } 499 read_sequnlock_excl(&vnode->cb_lock); 500 501 return fc->ac.error; 502 } 503 504 /* 505 * pick a server to use to try accessing this volume 506 * - returns with an elevated usage count on the server chosen 507 */ 508 bool afs_volume_pick_fileserver(struct afs_fs_cursor *fc, struct afs_vnode *vnode) 509 { 510 struct afs_volume *volume = vnode->volume; 511 struct afs_server *server; 512 int ret, state, loop; 513 514 _enter("%s", volume->vlocation->vldb.name); 515 516 /* stick with the server we're already using if we can */ 517 if (vnode->cb_interest && vnode->cb_interest->server->fs_state == 0) { 518 fc->server = afs_get_server(vnode->cb_interest->server); 519 goto set_server; 520 } 521 522 down_read(&volume->server_sem); 523 524 /* handle the no-server case */ 525 if (volume->nservers == 0) { 526 fc->ac.error = volume->rjservers ? -ENOMEDIUM : -ESTALE; 527 up_read(&volume->server_sem); 528 _leave(" = f [no servers %d]", fc->ac.error); 529 return false; 530 } 531 532 /* basically, just search the list for the first live server and use 533 * that */ 534 ret = 0; 535 for (loop = 0; loop < volume->nservers; loop++) { 536 server = volume->servers[loop]; 537 state = server->fs_state; 538 539 _debug("consider %d [%d]", loop, state); 540 541 switch (state) { 542 case 0: 543 goto picked_server; 544 545 case -ENETUNREACH: 546 if (ret == 0) 547 ret = state; 548 break; 549 550 case -EHOSTUNREACH: 551 if (ret == 0 || 552 ret == -ENETUNREACH) 553 ret = state; 554 break; 555 556 case -ECONNREFUSED: 557 if (ret == 0 || 558 ret == -ENETUNREACH || 559 ret == -EHOSTUNREACH) 560 ret = state; 561 break; 562 563 default: 564 case -EREMOTEIO: 565 if (ret == 0 || 566 ret == -ENETUNREACH || 567 ret == -EHOSTUNREACH || 568 ret == -ECONNREFUSED) 569 ret = state; 570 break; 571 } 572 } 573 574 error: 575 fc->ac.error = ret; 576 577 /* no available servers 578 * - TODO: handle the no active servers case better 579 */ 580 up_read(&volume->server_sem); 581 _leave(" = f [%d]", fc->ac.error); 582 return false; 583 584 picked_server: 585 /* Found an apparently healthy server. We need to register an interest 586 * in receiving callbacks before we talk to it. 587 */ 588 ret = afs_register_server_cb_interest(vnode, 589 &volume->cb_interests[loop], server); 590 if (ret < 0) 591 goto error; 592 593 fc->server = afs_get_server(server); 594 up_read(&volume->server_sem); 595 set_server: 596 fc->ac.alist = afs_get_addrlist(fc->server->addrs); 597 fc->ac.addr = &fc->ac.alist->addrs[0]; 598 _debug("USING SERVER: %pIS\n", &fc->ac.addr->transport); 599 _leave(" = t (picked %pIS)", &fc->ac.addr->transport); 600 return true; 601 } 602 603 /* 604 * release a server after use 605 * - releases the ref on the server struct that was acquired by picking 606 * - records result of using a particular server to access a volume 607 * - return true to try again, false if okay or to issue error 608 * - the caller must release the server struct if result was false 609 */ 610 bool afs_iterate_fs_cursor(struct afs_fs_cursor *fc, 611 struct afs_vnode *vnode) 612 { 613 struct afs_volume *volume = vnode->volume; 614 struct afs_server *server = fc->server; 615 unsigned loop; 616 617 _enter("%s,%pIS,%d", 618 volume->vlocation->vldb.name, &fc->ac.addr->transport, 619 fc->ac.error); 620 621 switch (fc->ac.error) { 622 /* success */ 623 case 0: 624 server->fs_state = 0; 625 _leave(" = f"); 626 return false; 627 628 /* the fileserver denied all knowledge of the volume */ 629 case -ENOMEDIUM: 630 down_write(&volume->server_sem); 631 632 /* firstly, find where the server is in the active list (if it 633 * is) */ 634 for (loop = 0; loop < volume->nservers; loop++) 635 if (volume->servers[loop] == server) 636 goto present; 637 638 /* no longer there - may have been discarded by another op */ 639 goto try_next_server_upw; 640 641 present: 642 volume->nservers--; 643 memmove(&volume->servers[loop], 644 &volume->servers[loop + 1], 645 sizeof(volume->servers[loop]) * 646 (volume->nservers - loop)); 647 volume->servers[volume->nservers] = NULL; 648 afs_put_server(afs_v2net(vnode), server); 649 volume->rjservers++; 650 651 if (volume->nservers > 0) 652 /* another server might acknowledge its existence */ 653 goto try_next_server_upw; 654 655 /* handle the case where all the fileservers have rejected the 656 * volume 657 * - TODO: try asking the fileservers for volume information 658 * - TODO: contact the VL server again to see if the volume is 659 * no longer registered 660 */ 661 up_write(&volume->server_sem); 662 afs_put_server(afs_v2net(vnode), server); 663 fc->server = NULL; 664 _leave(" = f [completely rejected]"); 665 return false; 666 667 /* problem reaching the server */ 668 case -ENETUNREACH: 669 case -EHOSTUNREACH: 670 case -ECONNREFUSED: 671 case -ETIME: 672 case -ETIMEDOUT: 673 case -EREMOTEIO: 674 /* mark the server as dead 675 * TODO: vary dead timeout depending on error 676 */ 677 spin_lock(&server->fs_lock); 678 if (!server->fs_state) { 679 server->fs_state = fc->ac.error; 680 printk("kAFS: SERVER DEAD state=%d\n", fc->ac.error); 681 } 682 spin_unlock(&server->fs_lock); 683 goto try_next_server; 684 685 /* miscellaneous error */ 686 default: 687 case -ENOMEM: 688 case -ENONET: 689 /* tell the caller to accept the result */ 690 afs_put_server(afs_v2net(vnode), server); 691 fc->server = NULL; 692 _leave(" = f [local failure]"); 693 return false; 694 } 695 696 /* tell the caller to loop around and try the next server */ 697 try_next_server_upw: 698 up_write(&volume->server_sem); 699 try_next_server: 700 afs_put_server(afs_v2net(vnode), server); 701 _leave(" = t [try next server]"); 702 return true; 703 } 704 705 /* 706 * Clean up a fileserver cursor. 707 */ 708 int afs_end_fs_cursor(struct afs_fs_cursor *fc, struct afs_net *net) 709 { 710 afs_end_cursor(&fc->ac); 711 afs_put_server(net, fc->server); 712 return fc->ac.error; 713 } 714 715 #endif 716