1 /* Handle fileserver selection and rotation. 2 * 3 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. 4 * Written by David Howells (dhowells@redhat.com) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public Licence 8 * as published by the Free Software Foundation; either version 9 * 2 of the Licence, or (at your option) any later version. 10 */ 11 12 #include <linux/kernel.h> 13 #include <linux/slab.h> 14 #include <linux/fs.h> 15 #include <linux/sched.h> 16 #include <linux/delay.h> 17 #include <linux/sched/signal.h> 18 #include "internal.h" 19 #include "afs_fs.h" 20 21 /* 22 * Begin an operation on the fileserver. 23 * 24 * Fileserver operations are serialised on the server by vnode, so we serialise 25 * them here also using the io_lock. 26 */ 27 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, 28 struct key *key) 29 { 30 memset(fc, 0, sizeof(*fc)); 31 fc->vnode = vnode; 32 fc->key = key; 33 fc->ac.error = SHRT_MAX; 34 fc->error = -EDESTADDRREQ; 35 36 if (mutex_lock_interruptible(&vnode->io_lock) < 0) { 37 fc->error = -EINTR; 38 fc->flags |= AFS_FS_CURSOR_STOP; 39 return false; 40 } 41 42 if (vnode->lock_state != AFS_VNODE_LOCK_NONE) 43 fc->flags |= AFS_FS_CURSOR_CUR_ONLY; 44 return true; 45 } 46 47 /* 48 * Begin iteration through a server list, starting with the vnode's last used 49 * server if possible, or the last recorded good server if not. 50 */ 51 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, 52 struct afs_vnode *vnode) 53 { 54 struct afs_cb_interest *cbi; 55 int i; 56 57 read_lock(&vnode->volume->servers_lock); 58 fc->server_list = afs_get_serverlist(vnode->volume->servers); 59 read_unlock(&vnode->volume->servers_lock); 60 61 fc->untried = (1UL << fc->server_list->nr_servers) - 1; 62 fc->index = READ_ONCE(fc->server_list->preferred); 63 64 cbi = vnode->cb_interest; 65 if (cbi) { 66 /* See if the vnode's preferred record is still available */ 67 for (i = 0; i < fc->server_list->nr_servers; i++) { 68 if (fc->server_list->servers[i].cb_interest == cbi) { 69 fc->index = i; 70 goto found_interest; 71 } 72 } 73 74 /* If we have a lock outstanding on a server that's no longer 75 * serving this vnode, then we can't switch to another server 76 * and have to return an error. 77 */ 78 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 79 fc->error = -ESTALE; 80 return false; 81 } 82 83 /* Note that the callback promise is effectively broken */ 84 write_seqlock(&vnode->cb_lock); 85 ASSERTCMP(cbi, ==, vnode->cb_interest); 86 vnode->cb_interest = NULL; 87 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) 88 vnode->cb_break++; 89 write_sequnlock(&vnode->cb_lock); 90 91 afs_put_cb_interest(afs_v2net(vnode), cbi); 92 cbi = NULL; 93 } 94 95 found_interest: 96 return true; 97 } 98 99 /* 100 * Post volume busy note. 101 */ 102 static void afs_busy(struct afs_volume *volume, u32 abort_code) 103 { 104 const char *m; 105 106 switch (abort_code) { 107 case VOFFLINE: m = "offline"; break; 108 case VRESTARTING: m = "restarting"; break; 109 case VSALVAGING: m = "being salvaged"; break; 110 default: m = "busy"; break; 111 } 112 113 pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m); 114 } 115 116 /* 117 * Sleep and retry the operation to the same fileserver. 118 */ 119 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) 120 { 121 msleep_interruptible(1000); 122 if (signal_pending(current)) { 123 fc->error = -ERESTARTSYS; 124 return false; 125 } 126 127 return true; 128 } 129 130 /* 131 * Select the fileserver to use. May be called multiple times to rotate 132 * through the fileservers. 133 */ 134 bool afs_select_fileserver(struct afs_fs_cursor *fc) 135 { 136 struct afs_addr_list *alist; 137 struct afs_server *server; 138 struct afs_vnode *vnode = fc->vnode; 139 u32 rtt, abort_code; 140 int error = fc->ac.error, i; 141 142 _enter("%lx[%d],%lx[%d],%d,%d", 143 fc->untried, fc->index, 144 fc->ac.tried, fc->ac.index, 145 error, fc->ac.abort_code); 146 147 if (fc->flags & AFS_FS_CURSOR_STOP) { 148 _leave(" = f [stopped]"); 149 return false; 150 } 151 152 fc->nr_iterations++; 153 154 /* Evaluate the result of the previous operation, if there was one. */ 155 switch (error) { 156 case SHRT_MAX: 157 goto start; 158 159 case 0: 160 default: 161 /* Success or local failure. Stop. */ 162 fc->error = error; 163 fc->flags |= AFS_FS_CURSOR_STOP; 164 _leave(" = f [okay/local %d]", error); 165 return false; 166 167 case -ECONNABORTED: 168 /* The far side rejected the operation on some grounds. This 169 * might involve the server being busy or the volume having been moved. 170 */ 171 switch (fc->ac.abort_code) { 172 case VNOVOL: 173 /* This fileserver doesn't know about the volume. 174 * - May indicate that the VL is wrong - retry once and compare 175 * the results. 176 * - May indicate that the fileserver couldn't attach to the vol. 177 */ 178 if (fc->flags & AFS_FS_CURSOR_VNOVOL) { 179 fc->error = -EREMOTEIO; 180 goto next_server; 181 } 182 183 write_lock(&vnode->volume->servers_lock); 184 fc->server_list->vnovol_mask |= 1 << fc->index; 185 write_unlock(&vnode->volume->servers_lock); 186 187 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); 188 error = afs_check_volume_status(vnode->volume, fc->key); 189 if (error < 0) 190 goto failed_set_error; 191 192 if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { 193 fc->error = -ENOMEDIUM; 194 goto failed; 195 } 196 197 /* If the server list didn't change, then assume that 198 * it's the fileserver having trouble. 199 */ 200 if (vnode->volume->servers == fc->server_list) { 201 fc->error = -EREMOTEIO; 202 goto next_server; 203 } 204 205 /* Try again */ 206 fc->flags |= AFS_FS_CURSOR_VNOVOL; 207 _leave(" = t [vnovol]"); 208 return true; 209 210 case VSALVAGE: /* TODO: Should this return an error or iterate? */ 211 case VVOLEXISTS: 212 case VNOSERVICE: 213 case VONLINE: 214 case VDISKFULL: 215 case VOVERQUOTA: 216 fc->error = afs_abort_to_error(fc->ac.abort_code); 217 goto next_server; 218 219 case VOFFLINE: 220 if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { 221 afs_busy(vnode->volume, fc->ac.abort_code); 222 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); 223 } 224 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { 225 fc->error = -EADV; 226 goto failed; 227 } 228 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 229 fc->error = -ESTALE; 230 goto failed; 231 } 232 goto busy; 233 234 case VSALVAGING: 235 case VRESTARTING: 236 case VBUSY: 237 /* Retry after going round all the servers unless we 238 * have a file lock we need to maintain. 239 */ 240 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { 241 fc->error = -EBUSY; 242 goto failed; 243 } 244 if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { 245 afs_busy(vnode->volume, fc->ac.abort_code); 246 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); 247 } 248 busy: 249 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 250 if (!afs_sleep_and_retry(fc)) 251 goto failed; 252 253 /* Retry with same server & address */ 254 _leave(" = t [vbusy]"); 255 return true; 256 } 257 258 fc->flags |= AFS_FS_CURSOR_VBUSY; 259 goto next_server; 260 261 case VMOVED: 262 /* The volume migrated to another server. We consider 263 * consider all locks and callbacks broken and request 264 * an update from the VLDB. 265 * 266 * We also limit the number of VMOVED hops we will 267 * honour, just in case someone sets up a loop. 268 */ 269 if (fc->flags & AFS_FS_CURSOR_VMOVED) { 270 fc->error = -EREMOTEIO; 271 goto failed; 272 } 273 fc->flags |= AFS_FS_CURSOR_VMOVED; 274 275 set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); 276 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); 277 error = afs_check_volume_status(vnode->volume, fc->key); 278 if (error < 0) 279 goto failed_set_error; 280 281 /* If the server list didn't change, then the VLDB is 282 * out of sync with the fileservers. This is hopefully 283 * a temporary condition, however, so we don't want to 284 * permanently block access to the file. 285 * 286 * TODO: Try other fileservers if we can. 287 * 288 * TODO: Retry a few times with sleeps. 289 */ 290 if (vnode->volume->servers == fc->server_list) { 291 fc->error = -ENOMEDIUM; 292 goto failed; 293 } 294 295 goto restart_from_beginning; 296 297 default: 298 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); 299 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); 300 fc->error = afs_abort_to_error(fc->ac.abort_code); 301 goto failed; 302 } 303 304 case -ETIMEDOUT: 305 case -ETIME: 306 if (fc->error != -EDESTADDRREQ) 307 goto iterate_address; 308 /* Fall through */ 309 case -ENETUNREACH: 310 case -EHOSTUNREACH: 311 case -ECONNREFUSED: 312 _debug("no conn"); 313 fc->error = error; 314 goto iterate_address; 315 316 case -ECONNRESET: 317 _debug("call reset"); 318 fc->error = error; 319 goto failed; 320 } 321 322 restart_from_beginning: 323 _debug("restart"); 324 afs_end_cursor(&fc->ac); 325 afs_put_cb_interest(afs_v2net(vnode), fc->cbi); 326 fc->cbi = NULL; 327 afs_put_serverlist(afs_v2net(vnode), fc->server_list); 328 fc->server_list = NULL; 329 start: 330 _debug("start"); 331 /* See if we need to do an update of the volume record. Note that the 332 * volume may have moved or even have been deleted. 333 */ 334 error = afs_check_volume_status(vnode->volume, fc->key); 335 if (error < 0) 336 goto failed_set_error; 337 338 if (!afs_start_fs_iteration(fc, vnode)) 339 goto failed; 340 341 _debug("__ VOL %llx __", vnode->volume->vid); 342 error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list); 343 if (error < 0) 344 goto failed_set_error; 345 346 pick_server: 347 _debug("pick [%lx]", fc->untried); 348 349 error = afs_wait_for_fs_probes(fc->server_list, fc->untried); 350 if (error < 0) 351 goto failed_set_error; 352 353 /* Pick the untried server with the lowest RTT. If we have outstanding 354 * callbacks, we stick with the server we're already using if we can. 355 */ 356 if (fc->cbi) { 357 _debug("cbi %u", fc->index); 358 if (test_bit(fc->index, &fc->untried)) 359 goto selected_server; 360 afs_put_cb_interest(afs_v2net(vnode), fc->cbi); 361 fc->cbi = NULL; 362 _debug("nocbi"); 363 } 364 365 fc->index = -1; 366 rtt = U32_MAX; 367 for (i = 0; i < fc->server_list->nr_servers; i++) { 368 struct afs_server *s = fc->server_list->servers[i].server; 369 370 if (!test_bit(i, &fc->untried) || !s->probe.responded) 371 continue; 372 if (s->probe.rtt < rtt) { 373 fc->index = i; 374 rtt = s->probe.rtt; 375 } 376 } 377 378 if (fc->index == -1) 379 goto no_more_servers; 380 381 selected_server: 382 _debug("use %d", fc->index); 383 __clear_bit(fc->index, &fc->untried); 384 385 /* We're starting on a different fileserver from the list. We need to 386 * check it, create a callback intercept, find its address list and 387 * probe its capabilities before we use it. 388 */ 389 ASSERTCMP(fc->ac.alist, ==, NULL); 390 server = fc->server_list->servers[fc->index].server; 391 392 if (!afs_check_server_record(fc, server)) 393 goto failed; 394 395 _debug("USING SERVER: %pU", &server->uuid); 396 397 /* Make sure we've got a callback interest record for this server. We 398 * have to link it in before we send the request as we can be sent a 399 * break request before we've finished decoding the reply and 400 * installing the vnode. 401 */ 402 error = afs_register_server_cb_interest(vnode, fc->server_list, 403 fc->index); 404 if (error < 0) 405 goto failed_set_error; 406 407 fc->cbi = afs_get_cb_interest(vnode->cb_interest); 408 409 read_lock(&server->fs_lock); 410 alist = rcu_dereference_protected(server->addresses, 411 lockdep_is_held(&server->fs_lock)); 412 afs_get_addrlist(alist); 413 read_unlock(&server->fs_lock); 414 415 memset(&fc->ac, 0, sizeof(fc->ac)); 416 417 if (!fc->ac.alist) 418 fc->ac.alist = alist; 419 else 420 afs_put_addrlist(alist); 421 422 fc->ac.index = -1; 423 424 iterate_address: 425 ASSERT(fc->ac.alist); 426 /* Iterate over the current server's address list to try and find an 427 * address on which it will respond to us. 428 */ 429 if (!afs_iterate_addresses(&fc->ac)) 430 goto next_server; 431 432 _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs); 433 434 _leave(" = t"); 435 return true; 436 437 next_server: 438 _debug("next"); 439 afs_end_cursor(&fc->ac); 440 goto pick_server; 441 442 no_more_servers: 443 /* That's all the servers poked to no good effect. Try again if some 444 * of them were busy. 445 */ 446 if (fc->flags & AFS_FS_CURSOR_VBUSY) 447 goto restart_from_beginning; 448 449 abort_code = 0; 450 error = -EDESTADDRREQ; 451 for (i = 0; i < fc->server_list->nr_servers; i++) { 452 struct afs_server *s = fc->server_list->servers[i].server; 453 int probe_error = READ_ONCE(s->probe.error); 454 455 switch (probe_error) { 456 case 0: 457 continue; 458 default: 459 if (error == -ETIMEDOUT || 460 error == -ETIME) 461 continue; 462 case -ETIMEDOUT: 463 case -ETIME: 464 if (error == -ENOMEM || 465 error == -ENONET) 466 continue; 467 case -ENOMEM: 468 case -ENONET: 469 if (error == -ENETUNREACH) 470 continue; 471 case -ENETUNREACH: 472 if (error == -EHOSTUNREACH) 473 continue; 474 case -EHOSTUNREACH: 475 if (error == -ECONNREFUSED) 476 continue; 477 case -ECONNREFUSED: 478 if (error == -ECONNRESET) 479 continue; 480 case -ECONNRESET: /* Responded, but call expired. */ 481 if (error == -ECONNABORTED) 482 continue; 483 case -ECONNABORTED: 484 abort_code = s->probe.abort_code; 485 error = probe_error; 486 continue; 487 } 488 } 489 490 if (error == -ECONNABORTED) 491 error = afs_abort_to_error(abort_code); 492 493 failed_set_error: 494 fc->error = error; 495 failed: 496 fc->flags |= AFS_FS_CURSOR_STOP; 497 afs_end_cursor(&fc->ac); 498 _leave(" = f [failed %d]", fc->error); 499 return false; 500 } 501 502 /* 503 * Select the same fileserver we used for a vnode before and only that 504 * fileserver. We use this when we have a lock on that file, which is backed 505 * only by the fileserver we obtained it from. 506 */ 507 bool afs_select_current_fileserver(struct afs_fs_cursor *fc) 508 { 509 struct afs_vnode *vnode = fc->vnode; 510 struct afs_cb_interest *cbi = vnode->cb_interest; 511 struct afs_addr_list *alist; 512 int error = fc->ac.error; 513 514 _enter(""); 515 516 switch (error) { 517 case SHRT_MAX: 518 if (!cbi) { 519 fc->error = -ESTALE; 520 fc->flags |= AFS_FS_CURSOR_STOP; 521 return false; 522 } 523 524 fc->cbi = afs_get_cb_interest(vnode->cb_interest); 525 526 read_lock(&cbi->server->fs_lock); 527 alist = rcu_dereference_protected(cbi->server->addresses, 528 lockdep_is_held(&cbi->server->fs_lock)); 529 afs_get_addrlist(alist); 530 read_unlock(&cbi->server->fs_lock); 531 if (!alist) { 532 fc->error = -ESTALE; 533 fc->flags |= AFS_FS_CURSOR_STOP; 534 return false; 535 } 536 537 memset(&fc->ac, 0, sizeof(fc->ac)); 538 fc->ac.alist = alist; 539 fc->ac.index = -1; 540 goto iterate_address; 541 542 case 0: 543 default: 544 /* Success or local failure. Stop. */ 545 fc->error = error; 546 fc->flags |= AFS_FS_CURSOR_STOP; 547 _leave(" = f [okay/local %d]", error); 548 return false; 549 550 case -ECONNABORTED: 551 fc->error = afs_abort_to_error(fc->ac.abort_code); 552 fc->flags |= AFS_FS_CURSOR_STOP; 553 _leave(" = f [abort]"); 554 return false; 555 556 case -ENETUNREACH: 557 case -EHOSTUNREACH: 558 case -ECONNREFUSED: 559 case -ETIMEDOUT: 560 case -ETIME: 561 _debug("no conn"); 562 fc->error = error; 563 goto iterate_address; 564 } 565 566 iterate_address: 567 /* Iterate over the current server's address list to try and find an 568 * address on which it will respond to us. 569 */ 570 if (afs_iterate_addresses(&fc->ac)) { 571 _leave(" = t"); 572 return true; 573 } 574 575 afs_end_cursor(&fc->ac); 576 return false; 577 } 578 579 /* 580 * Dump cursor state in the case of the error being EDESTADDRREQ. 581 */ 582 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) 583 { 584 static int count; 585 int i; 586 587 if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) 588 return; 589 count++; 590 591 rcu_read_lock(); 592 593 pr_notice("EDESTADDR occurred\n"); 594 pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", 595 fc->cb_break, fc->cb_break_2, fc->flags, fc->error); 596 pr_notice("FC: ut=%lx ix=%d ni=%u\n", 597 fc->untried, fc->index, fc->nr_iterations); 598 599 if (fc->server_list) { 600 const struct afs_server_list *sl = fc->server_list; 601 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", 602 sl->nr_servers, sl->preferred, sl->vnovol_mask); 603 for (i = 0; i < sl->nr_servers; i++) { 604 const struct afs_server *s = sl->servers[i].server; 605 pr_notice("FC: server fl=%lx av=%u %pU\n", 606 s->flags, s->addr_version, &s->uuid); 607 if (s->addresses) { 608 const struct afs_addr_list *a = 609 rcu_dereference(s->addresses); 610 pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", 611 a->version, 612 a->nr_ipv4, a->nr_addrs, a->max_addrs, 613 a->preferred); 614 pr_notice("FC: - pr=%lx R=%lx F=%lx\n", 615 a->probed, a->responded, a->failed); 616 if (a == fc->ac.alist) 617 pr_notice("FC: - current\n"); 618 } 619 } 620 } 621 622 pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", 623 fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error, 624 fc->ac.responded, fc->ac.nr_iterations); 625 rcu_read_unlock(); 626 } 627 628 /* 629 * Tidy up a filesystem cursor and unlock the vnode. 630 */ 631 int afs_end_vnode_operation(struct afs_fs_cursor *fc) 632 { 633 struct afs_net *net = afs_v2net(fc->vnode); 634 635 if (fc->error == -EDESTADDRREQ || 636 fc->error == -ENETUNREACH || 637 fc->error == -EHOSTUNREACH) 638 afs_dump_edestaddrreq(fc); 639 640 mutex_unlock(&fc->vnode->io_lock); 641 642 afs_end_cursor(&fc->ac); 643 afs_put_cb_interest(net, fc->cbi); 644 afs_put_serverlist(net, fc->server_list); 645 646 if (fc->error == -ECONNABORTED) 647 fc->error = afs_abort_to_error(fc->ac.abort_code); 648 649 return fc->error; 650 } 651