xref: /openbmc/linux/fs/afs/rotate.c (revision 83268fa6b43cefb60ee188fd53ed49120d3ae4f4)
1 /* Handle fileserver selection and rotation.
2  *
3  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
4  * Written by David Howells (dhowells@redhat.com)
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public Licence
8  * as published by the Free Software Foundation; either version
9  * 2 of the Licence, or (at your option) any later version.
10  */
11 
12 #include <linux/kernel.h>
13 #include <linux/slab.h>
14 #include <linux/fs.h>
15 #include <linux/sched.h>
16 #include <linux/delay.h>
17 #include <linux/sched/signal.h>
18 #include "internal.h"
19 #include "afs_fs.h"
20 
21 /*
22  * Begin an operation on the fileserver.
23  *
24  * Fileserver operations are serialised on the server by vnode, so we serialise
25  * them here also using the io_lock.
26  */
27 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
28 			       struct key *key)
29 {
30 	memset(fc, 0, sizeof(*fc));
31 	fc->vnode = vnode;
32 	fc->key = key;
33 	fc->ac.error = SHRT_MAX;
34 	fc->error = -EDESTADDRREQ;
35 
36 	if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
37 		fc->error = -EINTR;
38 		fc->flags |= AFS_FS_CURSOR_STOP;
39 		return false;
40 	}
41 
42 	if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
43 		fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
44 	return true;
45 }
46 
47 /*
48  * Begin iteration through a server list, starting with the vnode's last used
49  * server if possible, or the last recorded good server if not.
50  */
51 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
52 				   struct afs_vnode *vnode)
53 {
54 	struct afs_cb_interest *cbi;
55 	int i;
56 
57 	read_lock(&vnode->volume->servers_lock);
58 	fc->server_list = afs_get_serverlist(vnode->volume->servers);
59 	read_unlock(&vnode->volume->servers_lock);
60 
61 	fc->untried = (1UL << fc->server_list->nr_servers) - 1;
62 	fc->index = READ_ONCE(fc->server_list->preferred);
63 
64 	cbi = vnode->cb_interest;
65 	if (cbi) {
66 		/* See if the vnode's preferred record is still available */
67 		for (i = 0; i < fc->server_list->nr_servers; i++) {
68 			if (fc->server_list->servers[i].cb_interest == cbi) {
69 				fc->index = i;
70 				goto found_interest;
71 			}
72 		}
73 
74 		/* If we have a lock outstanding on a server that's no longer
75 		 * serving this vnode, then we can't switch to another server
76 		 * and have to return an error.
77 		 */
78 		if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
79 			fc->error = -ESTALE;
80 			return false;
81 		}
82 
83 		/* Note that the callback promise is effectively broken */
84 		write_seqlock(&vnode->cb_lock);
85 		ASSERTCMP(cbi, ==, vnode->cb_interest);
86 		vnode->cb_interest = NULL;
87 		if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
88 			vnode->cb_break++;
89 		write_sequnlock(&vnode->cb_lock);
90 
91 		afs_put_cb_interest(afs_v2net(vnode), cbi);
92 		cbi = NULL;
93 	}
94 
95 found_interest:
96 	return true;
97 }
98 
99 /*
100  * Post volume busy note.
101  */
102 static void afs_busy(struct afs_volume *volume, u32 abort_code)
103 {
104 	const char *m;
105 
106 	switch (abort_code) {
107 	case VOFFLINE:		m = "offline";		break;
108 	case VRESTARTING:	m = "restarting";	break;
109 	case VSALVAGING:	m = "being salvaged";	break;
110 	default:		m = "busy";		break;
111 	}
112 
113 	pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
114 }
115 
116 /*
117  * Sleep and retry the operation to the same fileserver.
118  */
119 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
120 {
121 	msleep_interruptible(1000);
122 	if (signal_pending(current)) {
123 		fc->error = -ERESTARTSYS;
124 		return false;
125 	}
126 
127 	return true;
128 }
129 
130 /*
131  * Select the fileserver to use.  May be called multiple times to rotate
132  * through the fileservers.
133  */
134 bool afs_select_fileserver(struct afs_fs_cursor *fc)
135 {
136 	struct afs_addr_list *alist;
137 	struct afs_server *server;
138 	struct afs_vnode *vnode = fc->vnode;
139 	u32 rtt, abort_code;
140 	int error = fc->ac.error, i;
141 
142 	_enter("%lx[%d],%lx[%d],%d,%d",
143 	       fc->untried, fc->index,
144 	       fc->ac.tried, fc->ac.index,
145 	       error, fc->ac.abort_code);
146 
147 	if (fc->flags & AFS_FS_CURSOR_STOP) {
148 		_leave(" = f [stopped]");
149 		return false;
150 	}
151 
152 	fc->nr_iterations++;
153 
154 	/* Evaluate the result of the previous operation, if there was one. */
155 	switch (error) {
156 	case SHRT_MAX:
157 		goto start;
158 
159 	case 0:
160 	default:
161 		/* Success or local failure.  Stop. */
162 		fc->error = error;
163 		fc->flags |= AFS_FS_CURSOR_STOP;
164 		_leave(" = f [okay/local %d]", error);
165 		return false;
166 
167 	case -ECONNABORTED:
168 		/* The far side rejected the operation on some grounds.  This
169 		 * might involve the server being busy or the volume having been moved.
170 		 */
171 		switch (fc->ac.abort_code) {
172 		case VNOVOL:
173 			/* This fileserver doesn't know about the volume.
174 			 * - May indicate that the VL is wrong - retry once and compare
175 			 *   the results.
176 			 * - May indicate that the fileserver couldn't attach to the vol.
177 			 */
178 			if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
179 				fc->error = -EREMOTEIO;
180 				goto next_server;
181 			}
182 
183 			write_lock(&vnode->volume->servers_lock);
184 			fc->server_list->vnovol_mask |= 1 << fc->index;
185 			write_unlock(&vnode->volume->servers_lock);
186 
187 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
188 			error = afs_check_volume_status(vnode->volume, fc->key);
189 			if (error < 0)
190 				goto failed_set_error;
191 
192 			if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
193 				fc->error = -ENOMEDIUM;
194 				goto failed;
195 			}
196 
197 			/* If the server list didn't change, then assume that
198 			 * it's the fileserver having trouble.
199 			 */
200 			if (vnode->volume->servers == fc->server_list) {
201 				fc->error = -EREMOTEIO;
202 				goto next_server;
203 			}
204 
205 			/* Try again */
206 			fc->flags |= AFS_FS_CURSOR_VNOVOL;
207 			_leave(" = t [vnovol]");
208 			return true;
209 
210 		case VSALVAGE: /* TODO: Should this return an error or iterate? */
211 		case VVOLEXISTS:
212 		case VNOSERVICE:
213 		case VONLINE:
214 		case VDISKFULL:
215 		case VOVERQUOTA:
216 			fc->error = afs_abort_to_error(fc->ac.abort_code);
217 			goto next_server;
218 
219 		case VOFFLINE:
220 			if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
221 				afs_busy(vnode->volume, fc->ac.abort_code);
222 				clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
223 			}
224 			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
225 				fc->error = -EADV;
226 				goto failed;
227 			}
228 			if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
229 				fc->error = -ESTALE;
230 				goto failed;
231 			}
232 			goto busy;
233 
234 		case VSALVAGING:
235 		case VRESTARTING:
236 		case VBUSY:
237 			/* Retry after going round all the servers unless we
238 			 * have a file lock we need to maintain.
239 			 */
240 			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
241 				fc->error = -EBUSY;
242 				goto failed;
243 			}
244 			if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
245 				afs_busy(vnode->volume, fc->ac.abort_code);
246 				clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
247 			}
248 		busy:
249 			if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
250 				if (!afs_sleep_and_retry(fc))
251 					goto failed;
252 
253 				 /* Retry with same server & address */
254 				_leave(" = t [vbusy]");
255 				return true;
256 			}
257 
258 			fc->flags |= AFS_FS_CURSOR_VBUSY;
259 			goto next_server;
260 
261 		case VMOVED:
262 			/* The volume migrated to another server.  We consider
263 			 * consider all locks and callbacks broken and request
264 			 * an update from the VLDB.
265 			 *
266 			 * We also limit the number of VMOVED hops we will
267 			 * honour, just in case someone sets up a loop.
268 			 */
269 			if (fc->flags & AFS_FS_CURSOR_VMOVED) {
270 				fc->error = -EREMOTEIO;
271 				goto failed;
272 			}
273 			fc->flags |= AFS_FS_CURSOR_VMOVED;
274 
275 			set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
276 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
277 			error = afs_check_volume_status(vnode->volume, fc->key);
278 			if (error < 0)
279 				goto failed_set_error;
280 
281 			/* If the server list didn't change, then the VLDB is
282 			 * out of sync with the fileservers.  This is hopefully
283 			 * a temporary condition, however, so we don't want to
284 			 * permanently block access to the file.
285 			 *
286 			 * TODO: Try other fileservers if we can.
287 			 *
288 			 * TODO: Retry a few times with sleeps.
289 			 */
290 			if (vnode->volume->servers == fc->server_list) {
291 				fc->error = -ENOMEDIUM;
292 				goto failed;
293 			}
294 
295 			goto restart_from_beginning;
296 
297 		default:
298 			clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
299 			clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
300 			fc->error = afs_abort_to_error(fc->ac.abort_code);
301 			goto failed;
302 		}
303 
304 	case -ETIMEDOUT:
305 	case -ETIME:
306 		if (fc->error != -EDESTADDRREQ)
307 			goto iterate_address;
308 		/* Fall through */
309 	case -ENETUNREACH:
310 	case -EHOSTUNREACH:
311 	case -ECONNREFUSED:
312 		_debug("no conn");
313 		fc->error = error;
314 		goto iterate_address;
315 
316 	case -ECONNRESET:
317 		_debug("call reset");
318 		fc->error = error;
319 		goto failed;
320 	}
321 
322 restart_from_beginning:
323 	_debug("restart");
324 	afs_end_cursor(&fc->ac);
325 	afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
326 	fc->cbi = NULL;
327 	afs_put_serverlist(afs_v2net(vnode), fc->server_list);
328 	fc->server_list = NULL;
329 start:
330 	_debug("start");
331 	/* See if we need to do an update of the volume record.  Note that the
332 	 * volume may have moved or even have been deleted.
333 	 */
334 	error = afs_check_volume_status(vnode->volume, fc->key);
335 	if (error < 0)
336 		goto failed_set_error;
337 
338 	if (!afs_start_fs_iteration(fc, vnode))
339 		goto failed;
340 
341 	_debug("__ VOL %llx __", vnode->volume->vid);
342 	error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
343 	if (error < 0)
344 		goto failed_set_error;
345 
346 pick_server:
347 	_debug("pick [%lx]", fc->untried);
348 
349 	error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
350 	if (error < 0)
351 		goto failed_set_error;
352 
353 	/* Pick the untried server with the lowest RTT.  If we have outstanding
354 	 * callbacks, we stick with the server we're already using if we can.
355 	 */
356 	if (fc->cbi) {
357 		_debug("cbi %u", fc->index);
358 		if (test_bit(fc->index, &fc->untried))
359 			goto selected_server;
360 		afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
361 		fc->cbi = NULL;
362 		_debug("nocbi");
363 	}
364 
365 	fc->index = -1;
366 	rtt = U32_MAX;
367 	for (i = 0; i < fc->server_list->nr_servers; i++) {
368 		struct afs_server *s = fc->server_list->servers[i].server;
369 
370 		if (!test_bit(i, &fc->untried) || !s->probe.responded)
371 			continue;
372 		if (s->probe.rtt < rtt) {
373 			fc->index = i;
374 			rtt = s->probe.rtt;
375 		}
376 	}
377 
378 	if (fc->index == -1)
379 		goto no_more_servers;
380 
381 selected_server:
382 	_debug("use %d", fc->index);
383 	__clear_bit(fc->index, &fc->untried);
384 
385 	/* We're starting on a different fileserver from the list.  We need to
386 	 * check it, create a callback intercept, find its address list and
387 	 * probe its capabilities before we use it.
388 	 */
389 	ASSERTCMP(fc->ac.alist, ==, NULL);
390 	server = fc->server_list->servers[fc->index].server;
391 
392 	if (!afs_check_server_record(fc, server))
393 		goto failed;
394 
395 	_debug("USING SERVER: %pU", &server->uuid);
396 
397 	/* Make sure we've got a callback interest record for this server.  We
398 	 * have to link it in before we send the request as we can be sent a
399 	 * break request before we've finished decoding the reply and
400 	 * installing the vnode.
401 	 */
402 	error = afs_register_server_cb_interest(vnode, fc->server_list,
403 						fc->index);
404 	if (error < 0)
405 		goto failed_set_error;
406 
407 	fc->cbi = afs_get_cb_interest(vnode->cb_interest);
408 
409 	read_lock(&server->fs_lock);
410 	alist = rcu_dereference_protected(server->addresses,
411 					  lockdep_is_held(&server->fs_lock));
412 	afs_get_addrlist(alist);
413 	read_unlock(&server->fs_lock);
414 
415 	memset(&fc->ac, 0, sizeof(fc->ac));
416 
417 	if (!fc->ac.alist)
418 		fc->ac.alist = alist;
419 	else
420 		afs_put_addrlist(alist);
421 
422 	fc->ac.index = -1;
423 
424 iterate_address:
425 	ASSERT(fc->ac.alist);
426 	/* Iterate over the current server's address list to try and find an
427 	 * address on which it will respond to us.
428 	 */
429 	if (!afs_iterate_addresses(&fc->ac))
430 		goto next_server;
431 
432 	_debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
433 
434 	_leave(" = t");
435 	return true;
436 
437 next_server:
438 	_debug("next");
439 	afs_end_cursor(&fc->ac);
440 	goto pick_server;
441 
442 no_more_servers:
443 	/* That's all the servers poked to no good effect.  Try again if some
444 	 * of them were busy.
445 	 */
446 	if (fc->flags & AFS_FS_CURSOR_VBUSY)
447 		goto restart_from_beginning;
448 
449 	abort_code = 0;
450 	error = -EDESTADDRREQ;
451 	for (i = 0; i < fc->server_list->nr_servers; i++) {
452 		struct afs_server *s = fc->server_list->servers[i].server;
453 		int probe_error = READ_ONCE(s->probe.error);
454 
455 		switch (probe_error) {
456 		case 0:
457 			continue;
458 		default:
459 			if (error == -ETIMEDOUT ||
460 			    error == -ETIME)
461 				continue;
462 		case -ETIMEDOUT:
463 		case -ETIME:
464 			if (error == -ENOMEM ||
465 			    error == -ENONET)
466 				continue;
467 		case -ENOMEM:
468 		case -ENONET:
469 			if (error == -ENETUNREACH)
470 				continue;
471 		case -ENETUNREACH:
472 			if (error == -EHOSTUNREACH)
473 				continue;
474 		case -EHOSTUNREACH:
475 			if (error == -ECONNREFUSED)
476 				continue;
477 		case -ECONNREFUSED:
478 			if (error == -ECONNRESET)
479 				continue;
480 		case -ECONNRESET: /* Responded, but call expired. */
481 			if (error == -ECONNABORTED)
482 				continue;
483 		case -ECONNABORTED:
484 			abort_code = s->probe.abort_code;
485 			error = probe_error;
486 			continue;
487 		}
488 	}
489 
490 	if (error == -ECONNABORTED)
491 		error = afs_abort_to_error(abort_code);
492 
493 failed_set_error:
494 	fc->error = error;
495 failed:
496 	fc->flags |= AFS_FS_CURSOR_STOP;
497 	afs_end_cursor(&fc->ac);
498 	_leave(" = f [failed %d]", fc->error);
499 	return false;
500 }
501 
502 /*
503  * Select the same fileserver we used for a vnode before and only that
504  * fileserver.  We use this when we have a lock on that file, which is backed
505  * only by the fileserver we obtained it from.
506  */
507 bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
508 {
509 	struct afs_vnode *vnode = fc->vnode;
510 	struct afs_cb_interest *cbi = vnode->cb_interest;
511 	struct afs_addr_list *alist;
512 	int error = fc->ac.error;
513 
514 	_enter("");
515 
516 	switch (error) {
517 	case SHRT_MAX:
518 		if (!cbi) {
519 			fc->error = -ESTALE;
520 			fc->flags |= AFS_FS_CURSOR_STOP;
521 			return false;
522 		}
523 
524 		fc->cbi = afs_get_cb_interest(vnode->cb_interest);
525 
526 		read_lock(&cbi->server->fs_lock);
527 		alist = rcu_dereference_protected(cbi->server->addresses,
528 						  lockdep_is_held(&cbi->server->fs_lock));
529 		afs_get_addrlist(alist);
530 		read_unlock(&cbi->server->fs_lock);
531 		if (!alist) {
532 			fc->error = -ESTALE;
533 			fc->flags |= AFS_FS_CURSOR_STOP;
534 			return false;
535 		}
536 
537 		memset(&fc->ac, 0, sizeof(fc->ac));
538 		fc->ac.alist = alist;
539 		fc->ac.index = -1;
540 		goto iterate_address;
541 
542 	case 0:
543 	default:
544 		/* Success or local failure.  Stop. */
545 		fc->error = error;
546 		fc->flags |= AFS_FS_CURSOR_STOP;
547 		_leave(" = f [okay/local %d]", error);
548 		return false;
549 
550 	case -ECONNABORTED:
551 		fc->error = afs_abort_to_error(fc->ac.abort_code);
552 		fc->flags |= AFS_FS_CURSOR_STOP;
553 		_leave(" = f [abort]");
554 		return false;
555 
556 	case -ENETUNREACH:
557 	case -EHOSTUNREACH:
558 	case -ECONNREFUSED:
559 	case -ETIMEDOUT:
560 	case -ETIME:
561 		_debug("no conn");
562 		fc->error = error;
563 		goto iterate_address;
564 	}
565 
566 iterate_address:
567 	/* Iterate over the current server's address list to try and find an
568 	 * address on which it will respond to us.
569 	 */
570 	if (afs_iterate_addresses(&fc->ac)) {
571 		_leave(" = t");
572 		return true;
573 	}
574 
575 	afs_end_cursor(&fc->ac);
576 	return false;
577 }
578 
579 /*
580  * Dump cursor state in the case of the error being EDESTADDRREQ.
581  */
582 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
583 {
584 	static int count;
585 	int i;
586 
587 	if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
588 		return;
589 	count++;
590 
591 	rcu_read_lock();
592 
593 	pr_notice("EDESTADDR occurred\n");
594 	pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
595 		  fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
596 	pr_notice("FC: ut=%lx ix=%d ni=%u\n",
597 		  fc->untried, fc->index, fc->nr_iterations);
598 
599 	if (fc->server_list) {
600 		const struct afs_server_list *sl = fc->server_list;
601 		pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
602 			  sl->nr_servers, sl->preferred, sl->vnovol_mask);
603 		for (i = 0; i < sl->nr_servers; i++) {
604 			const struct afs_server *s = sl->servers[i].server;
605 			pr_notice("FC: server fl=%lx av=%u %pU\n",
606 				  s->flags, s->addr_version, &s->uuid);
607 			if (s->addresses) {
608 				const struct afs_addr_list *a =
609 					rcu_dereference(s->addresses);
610 				pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
611 					  a->version,
612 					  a->nr_ipv4, a->nr_addrs, a->max_addrs,
613 					  a->preferred);
614 				pr_notice("FC:  - pr=%lx R=%lx F=%lx\n",
615 					  a->probed, a->responded, a->failed);
616 				if (a == fc->ac.alist)
617 					pr_notice("FC:  - current\n");
618 			}
619 		}
620 	}
621 
622 	pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
623 		  fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
624 		  fc->ac.responded, fc->ac.nr_iterations);
625 	rcu_read_unlock();
626 }
627 
628 /*
629  * Tidy up a filesystem cursor and unlock the vnode.
630  */
631 int afs_end_vnode_operation(struct afs_fs_cursor *fc)
632 {
633 	struct afs_net *net = afs_v2net(fc->vnode);
634 
635 	if (fc->error == -EDESTADDRREQ ||
636 	    fc->error == -ENETUNREACH ||
637 	    fc->error == -EHOSTUNREACH)
638 		afs_dump_edestaddrreq(fc);
639 
640 	mutex_unlock(&fc->vnode->io_lock);
641 
642 	afs_end_cursor(&fc->ac);
643 	afs_put_cb_interest(net, fc->cbi);
644 	afs_put_serverlist(net, fc->server_list);
645 
646 	if (fc->error == -ECONNABORTED)
647 		fc->error = afs_abort_to_error(fc->ac.abort_code);
648 
649 	return fc->error;
650 }
651