1 /* Handle fileserver selection and rotation. 2 * 3 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. 4 * Written by David Howells (dhowells@redhat.com) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public Licence 8 * as published by the Free Software Foundation; either version 9 * 2 of the Licence, or (at your option) any later version. 10 */ 11 12 #include <linux/kernel.h> 13 #include <linux/slab.h> 14 #include <linux/fs.h> 15 #include <linux/sched.h> 16 #include <linux/delay.h> 17 #include <linux/sched/signal.h> 18 #include "internal.h" 19 #include "afs_fs.h" 20 21 /* 22 * Begin an operation on the fileserver. 23 * 24 * Fileserver operations are serialised on the server by vnode, so we serialise 25 * them here also using the io_lock. 26 */ 27 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, 28 struct key *key) 29 { 30 memset(fc, 0, sizeof(*fc)); 31 fc->vnode = vnode; 32 fc->key = key; 33 fc->ac.error = SHRT_MAX; 34 fc->error = -EDESTADDRREQ; 35 36 if (mutex_lock_interruptible(&vnode->io_lock) < 0) { 37 fc->error = -EINTR; 38 fc->flags |= AFS_FS_CURSOR_STOP; 39 return false; 40 } 41 42 if (vnode->lock_state != AFS_VNODE_LOCK_NONE) 43 fc->flags |= AFS_FS_CURSOR_CUR_ONLY; 44 return true; 45 } 46 47 /* 48 * Begin iteration through a server list, starting with the vnode's last used 49 * server if possible, or the last recorded good server if not. 50 */ 51 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, 52 struct afs_vnode *vnode) 53 { 54 struct afs_cb_interest *cbi; 55 int i; 56 57 read_lock(&vnode->volume->servers_lock); 58 fc->server_list = afs_get_serverlist(vnode->volume->servers); 59 read_unlock(&vnode->volume->servers_lock); 60 61 fc->untried = (1UL << fc->server_list->nr_servers) - 1; 62 fc->index = READ_ONCE(fc->server_list->preferred); 63 64 cbi = vnode->cb_interest; 65 if (cbi) { 66 /* See if the vnode's preferred record is still available */ 67 for (i = 0; i < fc->server_list->nr_servers; i++) { 68 if (fc->server_list->servers[i].cb_interest == cbi) { 69 fc->index = i; 70 goto found_interest; 71 } 72 } 73 74 /* If we have a lock outstanding on a server that's no longer 75 * serving this vnode, then we can't switch to another server 76 * and have to return an error. 77 */ 78 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 79 fc->error = -ESTALE; 80 return false; 81 } 82 83 /* Note that the callback promise is effectively broken */ 84 write_seqlock(&vnode->cb_lock); 85 ASSERTCMP(cbi, ==, vnode->cb_interest); 86 vnode->cb_interest = NULL; 87 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) 88 vnode->cb_break++; 89 write_sequnlock(&vnode->cb_lock); 90 91 afs_put_cb_interest(afs_v2net(vnode), cbi); 92 cbi = NULL; 93 } 94 95 found_interest: 96 return true; 97 } 98 99 /* 100 * Post volume busy note. 101 */ 102 static void afs_busy(struct afs_volume *volume, u32 abort_code) 103 { 104 const char *m; 105 106 switch (abort_code) { 107 case VOFFLINE: m = "offline"; break; 108 case VRESTARTING: m = "restarting"; break; 109 case VSALVAGING: m = "being salvaged"; break; 110 default: m = "busy"; break; 111 } 112 113 pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m); 114 } 115 116 /* 117 * Sleep and retry the operation to the same fileserver. 118 */ 119 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) 120 { 121 msleep_interruptible(1000); 122 if (signal_pending(current)) { 123 fc->error = -ERESTARTSYS; 124 return false; 125 } 126 127 return true; 128 } 129 130 /* 131 * Select the fileserver to use. May be called multiple times to rotate 132 * through the fileservers. 133 */ 134 bool afs_select_fileserver(struct afs_fs_cursor *fc) 135 { 136 struct afs_addr_list *alist; 137 struct afs_server *server; 138 struct afs_vnode *vnode = fc->vnode; 139 struct afs_error e; 140 u32 rtt; 141 int error = fc->ac.error, i; 142 143 _enter("%lx[%d],%lx[%d],%d,%d", 144 fc->untried, fc->index, 145 fc->ac.tried, fc->ac.index, 146 error, fc->ac.abort_code); 147 148 if (fc->flags & AFS_FS_CURSOR_STOP) { 149 _leave(" = f [stopped]"); 150 return false; 151 } 152 153 fc->nr_iterations++; 154 155 /* Evaluate the result of the previous operation, if there was one. */ 156 switch (error) { 157 case SHRT_MAX: 158 goto start; 159 160 case 0: 161 default: 162 /* Success or local failure. Stop. */ 163 fc->error = error; 164 fc->flags |= AFS_FS_CURSOR_STOP; 165 _leave(" = f [okay/local %d]", error); 166 return false; 167 168 case -ECONNABORTED: 169 /* The far side rejected the operation on some grounds. This 170 * might involve the server being busy or the volume having been moved. 171 */ 172 switch (fc->ac.abort_code) { 173 case VNOVOL: 174 /* This fileserver doesn't know about the volume. 175 * - May indicate that the VL is wrong - retry once and compare 176 * the results. 177 * - May indicate that the fileserver couldn't attach to the vol. 178 */ 179 if (fc->flags & AFS_FS_CURSOR_VNOVOL) { 180 fc->error = -EREMOTEIO; 181 goto next_server; 182 } 183 184 write_lock(&vnode->volume->servers_lock); 185 fc->server_list->vnovol_mask |= 1 << fc->index; 186 write_unlock(&vnode->volume->servers_lock); 187 188 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); 189 error = afs_check_volume_status(vnode->volume, fc->key); 190 if (error < 0) 191 goto failed_set_error; 192 193 if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { 194 fc->error = -ENOMEDIUM; 195 goto failed; 196 } 197 198 /* If the server list didn't change, then assume that 199 * it's the fileserver having trouble. 200 */ 201 if (vnode->volume->servers == fc->server_list) { 202 fc->error = -EREMOTEIO; 203 goto next_server; 204 } 205 206 /* Try again */ 207 fc->flags |= AFS_FS_CURSOR_VNOVOL; 208 _leave(" = t [vnovol]"); 209 return true; 210 211 case VSALVAGE: /* TODO: Should this return an error or iterate? */ 212 case VVOLEXISTS: 213 case VNOSERVICE: 214 case VONLINE: 215 case VDISKFULL: 216 case VOVERQUOTA: 217 fc->error = afs_abort_to_error(fc->ac.abort_code); 218 goto next_server; 219 220 case VOFFLINE: 221 if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { 222 afs_busy(vnode->volume, fc->ac.abort_code); 223 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); 224 } 225 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { 226 fc->error = -EADV; 227 goto failed; 228 } 229 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 230 fc->error = -ESTALE; 231 goto failed; 232 } 233 goto busy; 234 235 case VSALVAGING: 236 case VRESTARTING: 237 case VBUSY: 238 /* Retry after going round all the servers unless we 239 * have a file lock we need to maintain. 240 */ 241 if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { 242 fc->error = -EBUSY; 243 goto failed; 244 } 245 if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { 246 afs_busy(vnode->volume, fc->ac.abort_code); 247 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); 248 } 249 busy: 250 if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { 251 if (!afs_sleep_and_retry(fc)) 252 goto failed; 253 254 /* Retry with same server & address */ 255 _leave(" = t [vbusy]"); 256 return true; 257 } 258 259 fc->flags |= AFS_FS_CURSOR_VBUSY; 260 goto next_server; 261 262 case VMOVED: 263 /* The volume migrated to another server. We consider 264 * consider all locks and callbacks broken and request 265 * an update from the VLDB. 266 * 267 * We also limit the number of VMOVED hops we will 268 * honour, just in case someone sets up a loop. 269 */ 270 if (fc->flags & AFS_FS_CURSOR_VMOVED) { 271 fc->error = -EREMOTEIO; 272 goto failed; 273 } 274 fc->flags |= AFS_FS_CURSOR_VMOVED; 275 276 set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); 277 set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); 278 error = afs_check_volume_status(vnode->volume, fc->key); 279 if (error < 0) 280 goto failed_set_error; 281 282 /* If the server list didn't change, then the VLDB is 283 * out of sync with the fileservers. This is hopefully 284 * a temporary condition, however, so we don't want to 285 * permanently block access to the file. 286 * 287 * TODO: Try other fileservers if we can. 288 * 289 * TODO: Retry a few times with sleeps. 290 */ 291 if (vnode->volume->servers == fc->server_list) { 292 fc->error = -ENOMEDIUM; 293 goto failed; 294 } 295 296 goto restart_from_beginning; 297 298 default: 299 clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); 300 clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); 301 fc->error = afs_abort_to_error(fc->ac.abort_code); 302 goto failed; 303 } 304 305 case -ETIMEDOUT: 306 case -ETIME: 307 if (fc->error != -EDESTADDRREQ) 308 goto iterate_address; 309 /* Fall through */ 310 case -ERFKILL: 311 case -EADDRNOTAVAIL: 312 case -ENETUNREACH: 313 case -EHOSTUNREACH: 314 case -EHOSTDOWN: 315 case -ECONNREFUSED: 316 _debug("no conn"); 317 fc->error = error; 318 goto iterate_address; 319 320 case -ECONNRESET: 321 _debug("call reset"); 322 fc->error = error; 323 goto failed; 324 } 325 326 restart_from_beginning: 327 _debug("restart"); 328 afs_end_cursor(&fc->ac); 329 afs_put_cb_interest(afs_v2net(vnode), fc->cbi); 330 fc->cbi = NULL; 331 afs_put_serverlist(afs_v2net(vnode), fc->server_list); 332 fc->server_list = NULL; 333 start: 334 _debug("start"); 335 /* See if we need to do an update of the volume record. Note that the 336 * volume may have moved or even have been deleted. 337 */ 338 error = afs_check_volume_status(vnode->volume, fc->key); 339 if (error < 0) 340 goto failed_set_error; 341 342 if (!afs_start_fs_iteration(fc, vnode)) 343 goto failed; 344 345 _debug("__ VOL %llx __", vnode->volume->vid); 346 error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list); 347 if (error < 0) 348 goto failed_set_error; 349 350 pick_server: 351 _debug("pick [%lx]", fc->untried); 352 353 error = afs_wait_for_fs_probes(fc->server_list, fc->untried); 354 if (error < 0) 355 goto failed_set_error; 356 357 /* Pick the untried server with the lowest RTT. If we have outstanding 358 * callbacks, we stick with the server we're already using if we can. 359 */ 360 if (fc->cbi) { 361 _debug("cbi %u", fc->index); 362 if (test_bit(fc->index, &fc->untried)) 363 goto selected_server; 364 afs_put_cb_interest(afs_v2net(vnode), fc->cbi); 365 fc->cbi = NULL; 366 _debug("nocbi"); 367 } 368 369 fc->index = -1; 370 rtt = U32_MAX; 371 for (i = 0; i < fc->server_list->nr_servers; i++) { 372 struct afs_server *s = fc->server_list->servers[i].server; 373 374 if (!test_bit(i, &fc->untried) || !s->probe.responded) 375 continue; 376 if (s->probe.rtt < rtt) { 377 fc->index = i; 378 rtt = s->probe.rtt; 379 } 380 } 381 382 if (fc->index == -1) 383 goto no_more_servers; 384 385 selected_server: 386 _debug("use %d", fc->index); 387 __clear_bit(fc->index, &fc->untried); 388 389 /* We're starting on a different fileserver from the list. We need to 390 * check it, create a callback intercept, find its address list and 391 * probe its capabilities before we use it. 392 */ 393 ASSERTCMP(fc->ac.alist, ==, NULL); 394 server = fc->server_list->servers[fc->index].server; 395 396 if (!afs_check_server_record(fc, server)) 397 goto failed; 398 399 _debug("USING SERVER: %pU", &server->uuid); 400 401 /* Make sure we've got a callback interest record for this server. We 402 * have to link it in before we send the request as we can be sent a 403 * break request before we've finished decoding the reply and 404 * installing the vnode. 405 */ 406 error = afs_register_server_cb_interest(vnode, fc->server_list, 407 fc->index); 408 if (error < 0) 409 goto failed_set_error; 410 411 fc->cbi = afs_get_cb_interest(vnode->cb_interest); 412 413 read_lock(&server->fs_lock); 414 alist = rcu_dereference_protected(server->addresses, 415 lockdep_is_held(&server->fs_lock)); 416 afs_get_addrlist(alist); 417 read_unlock(&server->fs_lock); 418 419 memset(&fc->ac, 0, sizeof(fc->ac)); 420 421 if (!fc->ac.alist) 422 fc->ac.alist = alist; 423 else 424 afs_put_addrlist(alist); 425 426 fc->ac.index = -1; 427 428 iterate_address: 429 ASSERT(fc->ac.alist); 430 /* Iterate over the current server's address list to try and find an 431 * address on which it will respond to us. 432 */ 433 if (!afs_iterate_addresses(&fc->ac)) 434 goto next_server; 435 436 _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs); 437 438 _leave(" = t"); 439 return true; 440 441 next_server: 442 _debug("next"); 443 afs_end_cursor(&fc->ac); 444 goto pick_server; 445 446 no_more_servers: 447 /* That's all the servers poked to no good effect. Try again if some 448 * of them were busy. 449 */ 450 if (fc->flags & AFS_FS_CURSOR_VBUSY) 451 goto restart_from_beginning; 452 453 e.error = -EDESTADDRREQ; 454 e.responded = false; 455 for (i = 0; i < fc->server_list->nr_servers; i++) { 456 struct afs_server *s = fc->server_list->servers[i].server; 457 458 afs_prioritise_error(&e, READ_ONCE(s->probe.error), 459 s->probe.abort_code); 460 } 461 462 failed_set_error: 463 fc->error = error; 464 failed: 465 fc->flags |= AFS_FS_CURSOR_STOP; 466 afs_end_cursor(&fc->ac); 467 _leave(" = f [failed %d]", fc->error); 468 return false; 469 } 470 471 /* 472 * Select the same fileserver we used for a vnode before and only that 473 * fileserver. We use this when we have a lock on that file, which is backed 474 * only by the fileserver we obtained it from. 475 */ 476 bool afs_select_current_fileserver(struct afs_fs_cursor *fc) 477 { 478 struct afs_vnode *vnode = fc->vnode; 479 struct afs_cb_interest *cbi = vnode->cb_interest; 480 struct afs_addr_list *alist; 481 int error = fc->ac.error; 482 483 _enter(""); 484 485 switch (error) { 486 case SHRT_MAX: 487 if (!cbi) { 488 fc->error = -ESTALE; 489 fc->flags |= AFS_FS_CURSOR_STOP; 490 return false; 491 } 492 493 fc->cbi = afs_get_cb_interest(vnode->cb_interest); 494 495 read_lock(&cbi->server->fs_lock); 496 alist = rcu_dereference_protected(cbi->server->addresses, 497 lockdep_is_held(&cbi->server->fs_lock)); 498 afs_get_addrlist(alist); 499 read_unlock(&cbi->server->fs_lock); 500 if (!alist) { 501 fc->error = -ESTALE; 502 fc->flags |= AFS_FS_CURSOR_STOP; 503 return false; 504 } 505 506 memset(&fc->ac, 0, sizeof(fc->ac)); 507 fc->ac.alist = alist; 508 fc->ac.index = -1; 509 goto iterate_address; 510 511 case 0: 512 default: 513 /* Success or local failure. Stop. */ 514 fc->error = error; 515 fc->flags |= AFS_FS_CURSOR_STOP; 516 _leave(" = f [okay/local %d]", error); 517 return false; 518 519 case -ECONNABORTED: 520 fc->error = afs_abort_to_error(fc->ac.abort_code); 521 fc->flags |= AFS_FS_CURSOR_STOP; 522 _leave(" = f [abort]"); 523 return false; 524 525 case -ERFKILL: 526 case -EADDRNOTAVAIL: 527 case -ENETUNREACH: 528 case -EHOSTUNREACH: 529 case -EHOSTDOWN: 530 case -ECONNREFUSED: 531 case -ETIMEDOUT: 532 case -ETIME: 533 _debug("no conn"); 534 fc->error = error; 535 goto iterate_address; 536 } 537 538 iterate_address: 539 /* Iterate over the current server's address list to try and find an 540 * address on which it will respond to us. 541 */ 542 if (afs_iterate_addresses(&fc->ac)) { 543 _leave(" = t"); 544 return true; 545 } 546 547 afs_end_cursor(&fc->ac); 548 return false; 549 } 550 551 /* 552 * Dump cursor state in the case of the error being EDESTADDRREQ. 553 */ 554 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) 555 { 556 static int count; 557 int i; 558 559 if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) 560 return; 561 count++; 562 563 rcu_read_lock(); 564 565 pr_notice("EDESTADDR occurred\n"); 566 pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", 567 fc->cb_break, fc->cb_break_2, fc->flags, fc->error); 568 pr_notice("FC: ut=%lx ix=%d ni=%u\n", 569 fc->untried, fc->index, fc->nr_iterations); 570 571 if (fc->server_list) { 572 const struct afs_server_list *sl = fc->server_list; 573 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", 574 sl->nr_servers, sl->preferred, sl->vnovol_mask); 575 for (i = 0; i < sl->nr_servers; i++) { 576 const struct afs_server *s = sl->servers[i].server; 577 pr_notice("FC: server fl=%lx av=%u %pU\n", 578 s->flags, s->addr_version, &s->uuid); 579 if (s->addresses) { 580 const struct afs_addr_list *a = 581 rcu_dereference(s->addresses); 582 pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", 583 a->version, 584 a->nr_ipv4, a->nr_addrs, a->max_addrs, 585 a->preferred); 586 pr_notice("FC: - pr=%lx R=%lx F=%lx\n", 587 a->probed, a->responded, a->failed); 588 if (a == fc->ac.alist) 589 pr_notice("FC: - current\n"); 590 } 591 } 592 } 593 594 pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", 595 fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error, 596 fc->ac.responded, fc->ac.nr_iterations); 597 rcu_read_unlock(); 598 } 599 600 /* 601 * Tidy up a filesystem cursor and unlock the vnode. 602 */ 603 int afs_end_vnode_operation(struct afs_fs_cursor *fc) 604 { 605 struct afs_net *net = afs_v2net(fc->vnode); 606 607 if (fc->error == -EDESTADDRREQ || 608 fc->error == -EADDRNOTAVAIL || 609 fc->error == -ENETUNREACH || 610 fc->error == -EHOSTUNREACH) 611 afs_dump_edestaddrreq(fc); 612 613 mutex_unlock(&fc->vnode->io_lock); 614 615 afs_end_cursor(&fc->ac); 616 afs_put_cb_interest(net, fc->cbi); 617 afs_put_serverlist(net, fc->server_list); 618 619 if (fc->error == -ECONNABORTED) 620 fc->error = afs_abort_to_error(fc->ac.abort_code); 621 622 return fc->error; 623 } 624