xref: /openbmc/linux/fs/afs/rotate.c (revision ed1666f6)
1 /* Handle fileserver selection and rotation.
2  *
3  * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
4  * Written by David Howells (dhowells@redhat.com)
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public Licence
8  * as published by the Free Software Foundation; either version
9  * 2 of the Licence, or (at your option) any later version.
10  */
11 
12 #include <linux/kernel.h>
13 #include <linux/slab.h>
14 #include <linux/fs.h>
15 #include <linux/sched.h>
16 #include <linux/delay.h>
17 #include <linux/sched/signal.h>
18 #include "internal.h"
19 #include "afs_fs.h"
20 
21 /*
22  * Begin an operation on the fileserver.
23  *
24  * Fileserver operations are serialised on the server by vnode, so we serialise
25  * them here also using the io_lock.
26  */
27 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
28 			       struct key *key)
29 {
30 	memset(fc, 0, sizeof(*fc));
31 	fc->vnode = vnode;
32 	fc->key = key;
33 	fc->ac.error = SHRT_MAX;
34 	fc->error = -EDESTADDRREQ;
35 
36 	if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
37 		fc->error = -EINTR;
38 		fc->flags |= AFS_FS_CURSOR_STOP;
39 		return false;
40 	}
41 
42 	if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
43 		fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
44 	return true;
45 }
46 
47 /*
48  * Begin iteration through a server list, starting with the vnode's last used
49  * server if possible, or the last recorded good server if not.
50  */
51 static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
52 				   struct afs_vnode *vnode)
53 {
54 	struct afs_cb_interest *cbi;
55 	int i;
56 
57 	read_lock(&vnode->volume->servers_lock);
58 	fc->server_list = afs_get_serverlist(vnode->volume->servers);
59 	read_unlock(&vnode->volume->servers_lock);
60 
61 	fc->untried = (1UL << fc->server_list->nr_servers) - 1;
62 	fc->index = READ_ONCE(fc->server_list->preferred);
63 
64 	cbi = vnode->cb_interest;
65 	if (cbi) {
66 		/* See if the vnode's preferred record is still available */
67 		for (i = 0; i < fc->server_list->nr_servers; i++) {
68 			if (fc->server_list->servers[i].cb_interest == cbi) {
69 				fc->index = i;
70 				goto found_interest;
71 			}
72 		}
73 
74 		/* If we have a lock outstanding on a server that's no longer
75 		 * serving this vnode, then we can't switch to another server
76 		 * and have to return an error.
77 		 */
78 		if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
79 			fc->error = -ESTALE;
80 			return false;
81 		}
82 
83 		/* Note that the callback promise is effectively broken */
84 		write_seqlock(&vnode->cb_lock);
85 		ASSERTCMP(cbi, ==, vnode->cb_interest);
86 		vnode->cb_interest = NULL;
87 		if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
88 			vnode->cb_break++;
89 		write_sequnlock(&vnode->cb_lock);
90 
91 		afs_put_cb_interest(afs_v2net(vnode), cbi);
92 		cbi = NULL;
93 	}
94 
95 found_interest:
96 	return true;
97 }
98 
99 /*
100  * Post volume busy note.
101  */
102 static void afs_busy(struct afs_volume *volume, u32 abort_code)
103 {
104 	const char *m;
105 
106 	switch (abort_code) {
107 	case VOFFLINE:		m = "offline";		break;
108 	case VRESTARTING:	m = "restarting";	break;
109 	case VSALVAGING:	m = "being salvaged";	break;
110 	default:		m = "busy";		break;
111 	}
112 
113 	pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
114 }
115 
116 /*
117  * Sleep and retry the operation to the same fileserver.
118  */
119 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
120 {
121 	msleep_interruptible(1000);
122 	if (signal_pending(current)) {
123 		fc->error = -ERESTARTSYS;
124 		return false;
125 	}
126 
127 	return true;
128 }
129 
130 /*
131  * Select the fileserver to use.  May be called multiple times to rotate
132  * through the fileservers.
133  */
134 bool afs_select_fileserver(struct afs_fs_cursor *fc)
135 {
136 	struct afs_addr_list *alist;
137 	struct afs_server *server;
138 	struct afs_vnode *vnode = fc->vnode;
139 	struct afs_error e;
140 	u32 rtt;
141 	int error = fc->ac.error, i;
142 
143 	_enter("%lx[%d],%lx[%d],%d,%d",
144 	       fc->untried, fc->index,
145 	       fc->ac.tried, fc->ac.index,
146 	       error, fc->ac.abort_code);
147 
148 	if (fc->flags & AFS_FS_CURSOR_STOP) {
149 		_leave(" = f [stopped]");
150 		return false;
151 	}
152 
153 	fc->nr_iterations++;
154 
155 	/* Evaluate the result of the previous operation, if there was one. */
156 	switch (error) {
157 	case SHRT_MAX:
158 		goto start;
159 
160 	case 0:
161 	default:
162 		/* Success or local failure.  Stop. */
163 		fc->error = error;
164 		fc->flags |= AFS_FS_CURSOR_STOP;
165 		_leave(" = f [okay/local %d]", error);
166 		return false;
167 
168 	case -ECONNABORTED:
169 		/* The far side rejected the operation on some grounds.  This
170 		 * might involve the server being busy or the volume having been moved.
171 		 */
172 		switch (fc->ac.abort_code) {
173 		case VNOVOL:
174 			/* This fileserver doesn't know about the volume.
175 			 * - May indicate that the VL is wrong - retry once and compare
176 			 *   the results.
177 			 * - May indicate that the fileserver couldn't attach to the vol.
178 			 */
179 			if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
180 				fc->error = -EREMOTEIO;
181 				goto next_server;
182 			}
183 
184 			write_lock(&vnode->volume->servers_lock);
185 			fc->server_list->vnovol_mask |= 1 << fc->index;
186 			write_unlock(&vnode->volume->servers_lock);
187 
188 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
189 			error = afs_check_volume_status(vnode->volume, fc->key);
190 			if (error < 0)
191 				goto failed_set_error;
192 
193 			if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
194 				fc->error = -ENOMEDIUM;
195 				goto failed;
196 			}
197 
198 			/* If the server list didn't change, then assume that
199 			 * it's the fileserver having trouble.
200 			 */
201 			if (vnode->volume->servers == fc->server_list) {
202 				fc->error = -EREMOTEIO;
203 				goto next_server;
204 			}
205 
206 			/* Try again */
207 			fc->flags |= AFS_FS_CURSOR_VNOVOL;
208 			_leave(" = t [vnovol]");
209 			return true;
210 
211 		case VSALVAGE: /* TODO: Should this return an error or iterate? */
212 		case VVOLEXISTS:
213 		case VNOSERVICE:
214 		case VONLINE:
215 		case VDISKFULL:
216 		case VOVERQUOTA:
217 			fc->error = afs_abort_to_error(fc->ac.abort_code);
218 			goto next_server;
219 
220 		case VOFFLINE:
221 			if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
222 				afs_busy(vnode->volume, fc->ac.abort_code);
223 				clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
224 			}
225 			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
226 				fc->error = -EADV;
227 				goto failed;
228 			}
229 			if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
230 				fc->error = -ESTALE;
231 				goto failed;
232 			}
233 			goto busy;
234 
235 		case VSALVAGING:
236 		case VRESTARTING:
237 		case VBUSY:
238 			/* Retry after going round all the servers unless we
239 			 * have a file lock we need to maintain.
240 			 */
241 			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
242 				fc->error = -EBUSY;
243 				goto failed;
244 			}
245 			if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
246 				afs_busy(vnode->volume, fc->ac.abort_code);
247 				clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
248 			}
249 		busy:
250 			if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
251 				if (!afs_sleep_and_retry(fc))
252 					goto failed;
253 
254 				 /* Retry with same server & address */
255 				_leave(" = t [vbusy]");
256 				return true;
257 			}
258 
259 			fc->flags |= AFS_FS_CURSOR_VBUSY;
260 			goto next_server;
261 
262 		case VMOVED:
263 			/* The volume migrated to another server.  We consider
264 			 * consider all locks and callbacks broken and request
265 			 * an update from the VLDB.
266 			 *
267 			 * We also limit the number of VMOVED hops we will
268 			 * honour, just in case someone sets up a loop.
269 			 */
270 			if (fc->flags & AFS_FS_CURSOR_VMOVED) {
271 				fc->error = -EREMOTEIO;
272 				goto failed;
273 			}
274 			fc->flags |= AFS_FS_CURSOR_VMOVED;
275 
276 			set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
277 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
278 			error = afs_check_volume_status(vnode->volume, fc->key);
279 			if (error < 0)
280 				goto failed_set_error;
281 
282 			/* If the server list didn't change, then the VLDB is
283 			 * out of sync with the fileservers.  This is hopefully
284 			 * a temporary condition, however, so we don't want to
285 			 * permanently block access to the file.
286 			 *
287 			 * TODO: Try other fileservers if we can.
288 			 *
289 			 * TODO: Retry a few times with sleeps.
290 			 */
291 			if (vnode->volume->servers == fc->server_list) {
292 				fc->error = -ENOMEDIUM;
293 				goto failed;
294 			}
295 
296 			goto restart_from_beginning;
297 
298 		default:
299 			clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
300 			clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
301 			fc->error = afs_abort_to_error(fc->ac.abort_code);
302 			goto failed;
303 		}
304 
305 	case -ETIMEDOUT:
306 	case -ETIME:
307 		if (fc->error != -EDESTADDRREQ)
308 			goto iterate_address;
309 		/* Fall through */
310 	case -ERFKILL:
311 	case -EADDRNOTAVAIL:
312 	case -ENETUNREACH:
313 	case -EHOSTUNREACH:
314 	case -EHOSTDOWN:
315 	case -ECONNREFUSED:
316 		_debug("no conn");
317 		fc->error = error;
318 		goto iterate_address;
319 
320 	case -ECONNRESET:
321 		_debug("call reset");
322 		fc->error = error;
323 		goto failed;
324 	}
325 
326 restart_from_beginning:
327 	_debug("restart");
328 	afs_end_cursor(&fc->ac);
329 	afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
330 	fc->cbi = NULL;
331 	afs_put_serverlist(afs_v2net(vnode), fc->server_list);
332 	fc->server_list = NULL;
333 start:
334 	_debug("start");
335 	/* See if we need to do an update of the volume record.  Note that the
336 	 * volume may have moved or even have been deleted.
337 	 */
338 	error = afs_check_volume_status(vnode->volume, fc->key);
339 	if (error < 0)
340 		goto failed_set_error;
341 
342 	if (!afs_start_fs_iteration(fc, vnode))
343 		goto failed;
344 
345 	_debug("__ VOL %llx __", vnode->volume->vid);
346 	error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
347 	if (error < 0)
348 		goto failed_set_error;
349 
350 pick_server:
351 	_debug("pick [%lx]", fc->untried);
352 
353 	error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
354 	if (error < 0)
355 		goto failed_set_error;
356 
357 	/* Pick the untried server with the lowest RTT.  If we have outstanding
358 	 * callbacks, we stick with the server we're already using if we can.
359 	 */
360 	if (fc->cbi) {
361 		_debug("cbi %u", fc->index);
362 		if (test_bit(fc->index, &fc->untried))
363 			goto selected_server;
364 		afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
365 		fc->cbi = NULL;
366 		_debug("nocbi");
367 	}
368 
369 	fc->index = -1;
370 	rtt = U32_MAX;
371 	for (i = 0; i < fc->server_list->nr_servers; i++) {
372 		struct afs_server *s = fc->server_list->servers[i].server;
373 
374 		if (!test_bit(i, &fc->untried) || !s->probe.responded)
375 			continue;
376 		if (s->probe.rtt < rtt) {
377 			fc->index = i;
378 			rtt = s->probe.rtt;
379 		}
380 	}
381 
382 	if (fc->index == -1)
383 		goto no_more_servers;
384 
385 selected_server:
386 	_debug("use %d", fc->index);
387 	__clear_bit(fc->index, &fc->untried);
388 
389 	/* We're starting on a different fileserver from the list.  We need to
390 	 * check it, create a callback intercept, find its address list and
391 	 * probe its capabilities before we use it.
392 	 */
393 	ASSERTCMP(fc->ac.alist, ==, NULL);
394 	server = fc->server_list->servers[fc->index].server;
395 
396 	if (!afs_check_server_record(fc, server))
397 		goto failed;
398 
399 	_debug("USING SERVER: %pU", &server->uuid);
400 
401 	/* Make sure we've got a callback interest record for this server.  We
402 	 * have to link it in before we send the request as we can be sent a
403 	 * break request before we've finished decoding the reply and
404 	 * installing the vnode.
405 	 */
406 	error = afs_register_server_cb_interest(vnode, fc->server_list,
407 						fc->index);
408 	if (error < 0)
409 		goto failed_set_error;
410 
411 	fc->cbi = afs_get_cb_interest(vnode->cb_interest);
412 
413 	read_lock(&server->fs_lock);
414 	alist = rcu_dereference_protected(server->addresses,
415 					  lockdep_is_held(&server->fs_lock));
416 	afs_get_addrlist(alist);
417 	read_unlock(&server->fs_lock);
418 
419 	memset(&fc->ac, 0, sizeof(fc->ac));
420 
421 	if (!fc->ac.alist)
422 		fc->ac.alist = alist;
423 	else
424 		afs_put_addrlist(alist);
425 
426 	fc->ac.index = -1;
427 
428 iterate_address:
429 	ASSERT(fc->ac.alist);
430 	/* Iterate over the current server's address list to try and find an
431 	 * address on which it will respond to us.
432 	 */
433 	if (!afs_iterate_addresses(&fc->ac))
434 		goto next_server;
435 
436 	_debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
437 
438 	_leave(" = t");
439 	return true;
440 
441 next_server:
442 	_debug("next");
443 	afs_end_cursor(&fc->ac);
444 	goto pick_server;
445 
446 no_more_servers:
447 	/* That's all the servers poked to no good effect.  Try again if some
448 	 * of them were busy.
449 	 */
450 	if (fc->flags & AFS_FS_CURSOR_VBUSY)
451 		goto restart_from_beginning;
452 
453 	e.error = -EDESTADDRREQ;
454 	e.responded = false;
455 	for (i = 0; i < fc->server_list->nr_servers; i++) {
456 		struct afs_server *s = fc->server_list->servers[i].server;
457 
458 		afs_prioritise_error(&e, READ_ONCE(s->probe.error),
459 				     s->probe.abort_code);
460 	}
461 
462 failed_set_error:
463 	fc->error = error;
464 failed:
465 	fc->flags |= AFS_FS_CURSOR_STOP;
466 	afs_end_cursor(&fc->ac);
467 	_leave(" = f [failed %d]", fc->error);
468 	return false;
469 }
470 
471 /*
472  * Select the same fileserver we used for a vnode before and only that
473  * fileserver.  We use this when we have a lock on that file, which is backed
474  * only by the fileserver we obtained it from.
475  */
476 bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
477 {
478 	struct afs_vnode *vnode = fc->vnode;
479 	struct afs_cb_interest *cbi = vnode->cb_interest;
480 	struct afs_addr_list *alist;
481 	int error = fc->ac.error;
482 
483 	_enter("");
484 
485 	switch (error) {
486 	case SHRT_MAX:
487 		if (!cbi) {
488 			fc->error = -ESTALE;
489 			fc->flags |= AFS_FS_CURSOR_STOP;
490 			return false;
491 		}
492 
493 		fc->cbi = afs_get_cb_interest(vnode->cb_interest);
494 
495 		read_lock(&cbi->server->fs_lock);
496 		alist = rcu_dereference_protected(cbi->server->addresses,
497 						  lockdep_is_held(&cbi->server->fs_lock));
498 		afs_get_addrlist(alist);
499 		read_unlock(&cbi->server->fs_lock);
500 		if (!alist) {
501 			fc->error = -ESTALE;
502 			fc->flags |= AFS_FS_CURSOR_STOP;
503 			return false;
504 		}
505 
506 		memset(&fc->ac, 0, sizeof(fc->ac));
507 		fc->ac.alist = alist;
508 		fc->ac.index = -1;
509 		goto iterate_address;
510 
511 	case 0:
512 	default:
513 		/* Success or local failure.  Stop. */
514 		fc->error = error;
515 		fc->flags |= AFS_FS_CURSOR_STOP;
516 		_leave(" = f [okay/local %d]", error);
517 		return false;
518 
519 	case -ECONNABORTED:
520 		fc->error = afs_abort_to_error(fc->ac.abort_code);
521 		fc->flags |= AFS_FS_CURSOR_STOP;
522 		_leave(" = f [abort]");
523 		return false;
524 
525 	case -ERFKILL:
526 	case -EADDRNOTAVAIL:
527 	case -ENETUNREACH:
528 	case -EHOSTUNREACH:
529 	case -EHOSTDOWN:
530 	case -ECONNREFUSED:
531 	case -ETIMEDOUT:
532 	case -ETIME:
533 		_debug("no conn");
534 		fc->error = error;
535 		goto iterate_address;
536 	}
537 
538 iterate_address:
539 	/* Iterate over the current server's address list to try and find an
540 	 * address on which it will respond to us.
541 	 */
542 	if (afs_iterate_addresses(&fc->ac)) {
543 		_leave(" = t");
544 		return true;
545 	}
546 
547 	afs_end_cursor(&fc->ac);
548 	return false;
549 }
550 
551 /*
552  * Dump cursor state in the case of the error being EDESTADDRREQ.
553  */
554 static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
555 {
556 	static int count;
557 	int i;
558 
559 	if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
560 		return;
561 	count++;
562 
563 	rcu_read_lock();
564 
565 	pr_notice("EDESTADDR occurred\n");
566 	pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
567 		  fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
568 	pr_notice("FC: ut=%lx ix=%d ni=%u\n",
569 		  fc->untried, fc->index, fc->nr_iterations);
570 
571 	if (fc->server_list) {
572 		const struct afs_server_list *sl = fc->server_list;
573 		pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
574 			  sl->nr_servers, sl->preferred, sl->vnovol_mask);
575 		for (i = 0; i < sl->nr_servers; i++) {
576 			const struct afs_server *s = sl->servers[i].server;
577 			pr_notice("FC: server fl=%lx av=%u %pU\n",
578 				  s->flags, s->addr_version, &s->uuid);
579 			if (s->addresses) {
580 				const struct afs_addr_list *a =
581 					rcu_dereference(s->addresses);
582 				pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
583 					  a->version,
584 					  a->nr_ipv4, a->nr_addrs, a->max_addrs,
585 					  a->preferred);
586 				pr_notice("FC:  - pr=%lx R=%lx F=%lx\n",
587 					  a->probed, a->responded, a->failed);
588 				if (a == fc->ac.alist)
589 					pr_notice("FC:  - current\n");
590 			}
591 		}
592 	}
593 
594 	pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
595 		  fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
596 		  fc->ac.responded, fc->ac.nr_iterations);
597 	rcu_read_unlock();
598 }
599 
600 /*
601  * Tidy up a filesystem cursor and unlock the vnode.
602  */
603 int afs_end_vnode_operation(struct afs_fs_cursor *fc)
604 {
605 	struct afs_net *net = afs_v2net(fc->vnode);
606 
607 	if (fc->error == -EDESTADDRREQ ||
608 	    fc->error == -EADDRNOTAVAIL ||
609 	    fc->error == -ENETUNREACH ||
610 	    fc->error == -EHOSTUNREACH)
611 		afs_dump_edestaddrreq(fc);
612 
613 	mutex_unlock(&fc->vnode->io_lock);
614 
615 	afs_end_cursor(&fc->ac);
616 	afs_put_cb_interest(net, fc->cbi);
617 	afs_put_serverlist(net, fc->server_list);
618 
619 	if (fc->error == -ECONNABORTED)
620 		fc->error = afs_abort_to_error(fc->ac.abort_code);
621 
622 	return fc->error;
623 }
624