xref: /openbmc/linux/drivers/md/dm-mpath.c (revision f15cbe6f1a4b4d9df59142fc8e4abb973302cf44)
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm.h"
9 #include "dm-path-selector.h"
10 #include "dm-bio-list.h"
11 #include "dm-bio-record.h"
12 #include "dm-uevent.h"
13 
14 #include <linux/ctype.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/pagemap.h>
19 #include <linux/slab.h>
20 #include <linux/time.h>
21 #include <linux/workqueue.h>
22 #include <scsi/scsi_dh.h>
23 #include <asm/atomic.h>
24 
25 #define DM_MSG_PREFIX "multipath"
26 #define MESG_STR(x) x, sizeof(x)
27 
28 /* Path properties */
29 struct pgpath {
30 	struct list_head list;
31 
32 	struct priority_group *pg;	/* Owning PG */
33 	unsigned fail_count;		/* Cumulative failure count */
34 
35 	struct dm_path path;
36 };
37 
38 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
39 
40 /*
41  * Paths are grouped into Priority Groups and numbered from 1 upwards.
42  * Each has a path selector which controls which path gets used.
43  */
44 struct priority_group {
45 	struct list_head list;
46 
47 	struct multipath *m;		/* Owning multipath instance */
48 	struct path_selector ps;
49 
50 	unsigned pg_num;		/* Reference number */
51 	unsigned bypassed;		/* Temporarily bypass this PG? */
52 
53 	unsigned nr_pgpaths;		/* Number of paths in PG */
54 	struct list_head pgpaths;
55 };
56 
57 /* Multipath context */
58 struct multipath {
59 	struct list_head list;
60 	struct dm_target *ti;
61 
62 	spinlock_t lock;
63 
64 	const char *hw_handler_name;
65 	struct work_struct activate_path;
66 	unsigned nr_priority_groups;
67 	struct list_head priority_groups;
68 	unsigned pg_init_required;	/* pg_init needs calling? */
69 	unsigned pg_init_in_progress;	/* Only one pg_init allowed at once */
70 
71 	unsigned nr_valid_paths;	/* Total number of usable paths */
72 	struct pgpath *current_pgpath;
73 	struct priority_group *current_pg;
74 	struct priority_group *next_pg;	/* Switch to this PG if set */
75 	unsigned repeat_count;		/* I/Os left before calling PS again */
76 
77 	unsigned queue_io;		/* Must we queue all I/O? */
78 	unsigned queue_if_no_path;	/* Queue I/O if last path fails? */
79 	unsigned saved_queue_if_no_path;/* Saved state during suspension */
80 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
81 	unsigned pg_init_count;		/* Number of times pg_init called */
82 
83 	struct work_struct process_queued_ios;
84 	struct bio_list queued_ios;
85 	unsigned queue_size;
86 
87 	struct work_struct trigger_event;
88 
89 	/*
90 	 * We must use a mempool of dm_mpath_io structs so that we
91 	 * can resubmit bios on error.
92 	 */
93 	mempool_t *mpio_pool;
94 };
95 
96 /*
97  * Context information attached to each bio we process.
98  */
99 struct dm_mpath_io {
100 	struct pgpath *pgpath;
101 	struct dm_bio_details details;
102 };
103 
/* Per-path action applied by action_dev(). */
typedef int (*action_fn) (struct pgpath *pgpath);

#define MIN_IOS 256	/* Mempool size */

/* Slab cache backing every multipath instance's mpio_pool. */
static struct kmem_cache *_mpio_cache;

/* Workqueue for the process_queued_ios and trigger_event work items. */
static struct workqueue_struct *kmultipathd;
/* Workqueue for the activate_path work items. */
static struct workqueue_struct *kmpath_handlerd;

/* Work functions, defined further down. */
static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
114 
115 
116 /*-----------------------------------------------
117  * Allocation routines
118  *-----------------------------------------------*/
119 
120 static struct pgpath *alloc_pgpath(void)
121 {
122 	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
123 
124 	if (pgpath)
125 		pgpath->path.is_active = 1;
126 
127 	return pgpath;
128 }
129 
/* Release the memory of a pgpath; the device reference is not touched. */
static void free_pgpath(struct pgpath *pgpath)
{
	kfree(pgpath);
}
134 
135 static struct priority_group *alloc_priority_group(void)
136 {
137 	struct priority_group *pg;
138 
139 	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
140 
141 	if (pg)
142 		INIT_LIST_HEAD(&pg->pgpaths);
143 
144 	return pg;
145 }
146 
147 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
148 {
149 	struct pgpath *pgpath, *tmp;
150 	struct multipath *m = ti->private;
151 
152 	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
153 		list_del(&pgpath->list);
154 		if (m->hw_handler_name)
155 			scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
156 		dm_put_device(ti, pgpath->path.dev);
157 		free_pgpath(pgpath);
158 	}
159 }
160 
161 static void free_priority_group(struct priority_group *pg,
162 				struct dm_target *ti)
163 {
164 	struct path_selector *ps = &pg->ps;
165 
166 	if (ps->type) {
167 		ps->type->destroy(ps);
168 		dm_put_path_selector(ps->type);
169 	}
170 
171 	free_pgpaths(&pg->pgpaths, ti);
172 	kfree(pg);
173 }
174 
175 static struct multipath *alloc_multipath(struct dm_target *ti)
176 {
177 	struct multipath *m;
178 
179 	m = kzalloc(sizeof(*m), GFP_KERNEL);
180 	if (m) {
181 		INIT_LIST_HEAD(&m->priority_groups);
182 		spin_lock_init(&m->lock);
183 		m->queue_io = 1;
184 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
185 		INIT_WORK(&m->trigger_event, trigger_event);
186 		INIT_WORK(&m->activate_path, activate_path);
187 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
188 		if (!m->mpio_pool) {
189 			kfree(m);
190 			return NULL;
191 		}
192 		m->ti = ti;
193 		ti->private = m;
194 	}
195 
196 	return m;
197 }
198 
199 static void free_multipath(struct multipath *m)
200 {
201 	struct priority_group *pg, *tmp;
202 
203 	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
204 		list_del(&pg->list);
205 		free_priority_group(pg, m->ti);
206 	}
207 
208 	kfree(m->hw_handler_name);
209 	mempool_destroy(m->mpio_pool);
210 	kfree(m);
211 }
212 
213 
214 /*-----------------------------------------------
215  * Path selection
216  *-----------------------------------------------*/
217 
218 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
219 {
220 	m->current_pg = pgpath->pg;
221 
222 	/* Must we initialise the PG first, and queue I/O till it's ready? */
223 	if (m->hw_handler_name) {
224 		m->pg_init_required = 1;
225 		m->queue_io = 1;
226 	} else {
227 		m->pg_init_required = 0;
228 		m->queue_io = 0;
229 	}
230 
231 	m->pg_init_count = 0;
232 }
233 
234 static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
235 {
236 	struct dm_path *path;
237 
238 	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
239 	if (!path)
240 		return -ENXIO;
241 
242 	m->current_pgpath = path_to_pgpath(path);
243 
244 	if (m->current_pg != pg)
245 		__switch_pg(m, m->current_pgpath);
246 
247 	return 0;
248 }
249 
250 static void __choose_pgpath(struct multipath *m)
251 {
252 	struct priority_group *pg;
253 	unsigned bypassed = 1;
254 
255 	if (!m->nr_valid_paths)
256 		goto failed;
257 
258 	/* Were we instructed to switch PG? */
259 	if (m->next_pg) {
260 		pg = m->next_pg;
261 		m->next_pg = NULL;
262 		if (!__choose_path_in_pg(m, pg))
263 			return;
264 	}
265 
266 	/* Don't change PG until it has no remaining paths */
267 	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
268 		return;
269 
270 	/*
271 	 * Loop through priority groups until we find a valid path.
272 	 * First time we skip PGs marked 'bypassed'.
273 	 * Second time we only try the ones we skipped.
274 	 */
275 	do {
276 		list_for_each_entry(pg, &m->priority_groups, list) {
277 			if (pg->bypassed == bypassed)
278 				continue;
279 			if (!__choose_path_in_pg(m, pg))
280 				return;
281 		}
282 	} while (bypassed--);
283 
284 failed:
285 	m->current_pgpath = NULL;
286 	m->current_pg = NULL;
287 }
288 
289 /*
290  * Check whether bios must be queued in the device-mapper core rather
291  * than here in the target.
292  *
293  * m->lock must be held on entry.
294  *
295  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
296  * same value then we are not between multipath_presuspend()
297  * and multipath_resume() calls and we have no need to check
298  * for the DMF_NOFLUSH_SUSPENDING flag.
299  */
300 static int __must_push_back(struct multipath *m)
301 {
302 	return (m->queue_if_no_path != m->saved_queue_if_no_path &&
303 		dm_noflush_suspending(m->ti));
304 }
305 
306 static int map_io(struct multipath *m, struct bio *bio,
307 		  struct dm_mpath_io *mpio, unsigned was_queued)
308 {
309 	int r = DM_MAPIO_REMAPPED;
310 	unsigned long flags;
311 	struct pgpath *pgpath;
312 
313 	spin_lock_irqsave(&m->lock, flags);
314 
315 	/* Do we need to select a new pgpath? */
316 	if (!m->current_pgpath ||
317 	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
318 		__choose_pgpath(m);
319 
320 	pgpath = m->current_pgpath;
321 
322 	if (was_queued)
323 		m->queue_size--;
324 
325 	if ((pgpath && m->queue_io) ||
326 	    (!pgpath && m->queue_if_no_path)) {
327 		/* Queue for the daemon to resubmit */
328 		bio_list_add(&m->queued_ios, bio);
329 		m->queue_size++;
330 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
331 		    !m->queue_io)
332 			queue_work(kmultipathd, &m->process_queued_ios);
333 		pgpath = NULL;
334 		r = DM_MAPIO_SUBMITTED;
335 	} else if (pgpath)
336 		bio->bi_bdev = pgpath->path.dev->bdev;
337 	else if (__must_push_back(m))
338 		r = DM_MAPIO_REQUEUE;
339 	else
340 		r = -EIO;	/* Failed */
341 
342 	mpio->pgpath = pgpath;
343 
344 	spin_unlock_irqrestore(&m->lock, flags);
345 
346 	return r;
347 }
348 
349 /*
350  * If we run out of usable paths, should we queue I/O or error it?
351  */
352 static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
353 			    unsigned save_old_value)
354 {
355 	unsigned long flags;
356 
357 	spin_lock_irqsave(&m->lock, flags);
358 
359 	if (save_old_value)
360 		m->saved_queue_if_no_path = m->queue_if_no_path;
361 	else
362 		m->saved_queue_if_no_path = queue_if_no_path;
363 	m->queue_if_no_path = queue_if_no_path;
364 	if (!m->queue_if_no_path && m->queue_size)
365 		queue_work(kmultipathd, &m->process_queued_ios);
366 
367 	spin_unlock_irqrestore(&m->lock, flags);
368 
369 	return 0;
370 }
371 
372 /*-----------------------------------------------------------------
373  * The multipath daemon is responsible for resubmitting queued ios.
374  *---------------------------------------------------------------*/
375 
376 static void dispatch_queued_ios(struct multipath *m)
377 {
378 	int r;
379 	unsigned long flags;
380 	struct bio *bio = NULL, *next;
381 	struct dm_mpath_io *mpio;
382 	union map_info *info;
383 
384 	spin_lock_irqsave(&m->lock, flags);
385 	bio = bio_list_get(&m->queued_ios);
386 	spin_unlock_irqrestore(&m->lock, flags);
387 
388 	while (bio) {
389 		next = bio->bi_next;
390 		bio->bi_next = NULL;
391 
392 		info = dm_get_mapinfo(bio);
393 		mpio = info->ptr;
394 
395 		r = map_io(m, bio, mpio, 1);
396 		if (r < 0)
397 			bio_endio(bio, r);
398 		else if (r == DM_MAPIO_REMAPPED)
399 			generic_make_request(bio);
400 		else if (r == DM_MAPIO_REQUEUE)
401 			bio_endio(bio, -EIO);
402 
403 		bio = next;
404 	}
405 }
406 
407 static void process_queued_ios(struct work_struct *work)
408 {
409 	struct multipath *m =
410 		container_of(work, struct multipath, process_queued_ios);
411 	struct pgpath *pgpath = NULL;
412 	unsigned init_required = 0, must_queue = 1;
413 	unsigned long flags;
414 
415 	spin_lock_irqsave(&m->lock, flags);
416 
417 	if (!m->queue_size)
418 		goto out;
419 
420 	if (!m->current_pgpath)
421 		__choose_pgpath(m);
422 
423 	pgpath = m->current_pgpath;
424 
425 	if ((pgpath && !m->queue_io) ||
426 	    (!pgpath && !m->queue_if_no_path))
427 		must_queue = 0;
428 
429 	if (m->pg_init_required && !m->pg_init_in_progress) {
430 		m->pg_init_count++;
431 		m->pg_init_required = 0;
432 		m->pg_init_in_progress = 1;
433 		init_required = 1;
434 	}
435 
436 out:
437 	spin_unlock_irqrestore(&m->lock, flags);
438 
439 	if (init_required)
440 		queue_work(kmpath_handlerd, &m->activate_path);
441 
442 	if (!must_queue)
443 		dispatch_queued_ios(m);
444 }
445 
446 /*
447  * An event is triggered whenever a path is taken out of use.
448  * Includes path failure and PG bypass.
449  */
450 static void trigger_event(struct work_struct *work)
451 {
452 	struct multipath *m =
453 		container_of(work, struct multipath, trigger_event);
454 
455 	dm_table_event(m->ti->table);
456 }
457 
458 /*-----------------------------------------------------------------
459  * Constructor/argument parsing:
460  * <#multipath feature args> [<arg>]*
461  * <#hw_handler args> [hw_handler [<arg>]*]
462  * <#priority groups>
463  * <initial priority group>
464  *     [<selector> <#selector args> [<arg>]*
465  *      <#paths> <#per-path selector args>
466  *         [<path> [<arg>]* ]+ ]+
467  *---------------------------------------------------------------*/
/* Valid range for one numeric constructor parameter. */
struct param {
	unsigned min;	/* Smallest accepted value (inclusive) */
	unsigned max;	/* Largest accepted value (inclusive) */
	char *error;	/* Message reported when validation fails */
};

/*
 * Parse @str as an unsigned decimal number and check it against @param.
 *
 * The whole string has to be a number: input with trailing characters
 * (e.g. "5abc") is rejected instead of being silently truncated.
 *
 * On success the value is stored in *@v and 0 is returned.  On failure
 * (NULL string, not a number, trailing garbage, out of range) *@error
 * is set to param->error and -EINVAL is returned; *@v may have been
 * overwritten.
 */
static int read_param(struct param *param, char *str, unsigned *v, char **error)
{
	char trailing;

	/* "%u%c" converts exactly one item only if nothing follows the number */
	if (!str ||
	    (sscanf(str, "%u%c", v, &trailing) != 1) ||
	    (*v < param->min) ||
	    (*v > param->max)) {
		*error = param->error;
		return -EINVAL;
	}

	return 0;
}
486 
/* Cursor over the constructor arguments that have not been consumed yet. */
struct arg_set {
	unsigned argc;	/* Number of arguments remaining */
	char **argv;	/* First remaining argument */
};

/*
 * Remove and return the next argument, or NULL if none are left.
 */
static char *shift(struct arg_set *as)
{
	if (!as->argc)
		return NULL;

	as->argc--;

	return *as->argv++;
}
505 
506 static void consume(struct arg_set *as, unsigned n)
507 {
508 	BUG_ON (as->argc < n);
509 	as->argc -= n;
510 	as->argv += n;
511 }
512 
513 static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
514 			       struct dm_target *ti)
515 {
516 	int r;
517 	struct path_selector_type *pst;
518 	unsigned ps_argc;
519 
520 	static struct param _params[] = {
521 		{0, 1024, "invalid number of path selector args"},
522 	};
523 
524 	pst = dm_get_path_selector(shift(as));
525 	if (!pst) {
526 		ti->error = "unknown path selector type";
527 		return -EINVAL;
528 	}
529 
530 	r = read_param(_params, shift(as), &ps_argc, &ti->error);
531 	if (r) {
532 		dm_put_path_selector(pst);
533 		return -EINVAL;
534 	}
535 
536 	r = pst->create(&pg->ps, ps_argc, as->argv);
537 	if (r) {
538 		dm_put_path_selector(pst);
539 		ti->error = "path selector constructor failed";
540 		return r;
541 	}
542 
543 	pg->ps.type = pst;
544 	consume(as, ps_argc);
545 
546 	return 0;
547 }
548 
549 static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
550 			       struct dm_target *ti)
551 {
552 	int r;
553 	struct pgpath *p;
554 	struct multipath *m = ti->private;
555 
556 	/* we need at least a path arg */
557 	if (as->argc < 1) {
558 		ti->error = "no device given";
559 		return NULL;
560 	}
561 
562 	p = alloc_pgpath();
563 	if (!p)
564 		return NULL;
565 
566 	r = dm_get_device(ti, shift(as), ti->begin, ti->len,
567 			  dm_table_get_mode(ti->table), &p->path.dev);
568 	if (r) {
569 		ti->error = "error getting device";
570 		goto bad;
571 	}
572 
573 	if (m->hw_handler_name) {
574 		r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev),
575 				   m->hw_handler_name);
576 		if (r < 0) {
577 			dm_put_device(ti, p->path.dev);
578 			goto bad;
579 		}
580 	}
581 
582 	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
583 	if (r) {
584 		dm_put_device(ti, p->path.dev);
585 		goto bad;
586 	}
587 
588 	return p;
589 
590  bad:
591 	free_pgpath(p);
592 	return NULL;
593 }
594 
595 static struct priority_group *parse_priority_group(struct arg_set *as,
596 						   struct multipath *m)
597 {
598 	static struct param _params[] = {
599 		{1, 1024, "invalid number of paths"},
600 		{0, 1024, "invalid number of selector args"}
601 	};
602 
603 	int r;
604 	unsigned i, nr_selector_args, nr_params;
605 	struct priority_group *pg;
606 	struct dm_target *ti = m->ti;
607 
608 	if (as->argc < 2) {
609 		as->argc = 0;
610 		ti->error = "not enough priority group aruments";
611 		return NULL;
612 	}
613 
614 	pg = alloc_priority_group();
615 	if (!pg) {
616 		ti->error = "couldn't allocate priority group";
617 		return NULL;
618 	}
619 	pg->m = m;
620 
621 	r = parse_path_selector(as, pg, ti);
622 	if (r)
623 		goto bad;
624 
625 	/*
626 	 * read the paths
627 	 */
628 	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
629 	if (r)
630 		goto bad;
631 
632 	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
633 	if (r)
634 		goto bad;
635 
636 	nr_params = 1 + nr_selector_args;
637 	for (i = 0; i < pg->nr_pgpaths; i++) {
638 		struct pgpath *pgpath;
639 		struct arg_set path_args;
640 
641 		if (as->argc < nr_params) {
642 			ti->error = "not enough path parameters";
643 			goto bad;
644 		}
645 
646 		path_args.argc = nr_params;
647 		path_args.argv = as->argv;
648 
649 		pgpath = parse_path(&path_args, &pg->ps, ti);
650 		if (!pgpath)
651 			goto bad;
652 
653 		pgpath->pg = pg;
654 		list_add_tail(&pgpath->list, &pg->pgpaths);
655 		consume(as, nr_params);
656 	}
657 
658 	return pg;
659 
660  bad:
661 	free_priority_group(pg, ti);
662 	return NULL;
663 }
664 
665 static int parse_hw_handler(struct arg_set *as, struct multipath *m)
666 {
667 	unsigned hw_argc;
668 	struct dm_target *ti = m->ti;
669 
670 	static struct param _params[] = {
671 		{0, 1024, "invalid number of hardware handler args"},
672 	};
673 
674 	if (read_param(_params, shift(as), &hw_argc, &ti->error))
675 		return -EINVAL;
676 
677 	if (!hw_argc)
678 		return 0;
679 
680 	m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
681 	request_module("scsi_dh_%s", m->hw_handler_name);
682 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
683 		ti->error = "unknown hardware handler type";
684 		kfree(m->hw_handler_name);
685 		m->hw_handler_name = NULL;
686 		return -EINVAL;
687 	}
688 	consume(as, hw_argc - 1);
689 
690 	return 0;
691 }
692 
693 static int parse_features(struct arg_set *as, struct multipath *m)
694 {
695 	int r;
696 	unsigned argc;
697 	struct dm_target *ti = m->ti;
698 	const char *param_name;
699 
700 	static struct param _params[] = {
701 		{0, 3, "invalid number of feature args"},
702 		{1, 50, "pg_init_retries must be between 1 and 50"},
703 	};
704 
705 	r = read_param(_params, shift(as), &argc, &ti->error);
706 	if (r)
707 		return -EINVAL;
708 
709 	if (!argc)
710 		return 0;
711 
712 	do {
713 		param_name = shift(as);
714 		argc--;
715 
716 		if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
717 			r = queue_if_no_path(m, 1, 0);
718 			continue;
719 		}
720 
721 		if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
722 		    (argc >= 1)) {
723 			r = read_param(_params + 1, shift(as),
724 				       &m->pg_init_retries, &ti->error);
725 			argc--;
726 			continue;
727 		}
728 
729 		ti->error = "Unrecognised multipath feature request";
730 		r = -EINVAL;
731 	} while (argc && !r);
732 
733 	return r;
734 }
735 
736 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
737 			 char **argv)
738 {
739 	/* target parameters */
740 	static struct param _params[] = {
741 		{1, 1024, "invalid number of priority groups"},
742 		{1, 1024, "invalid initial priority group number"},
743 	};
744 
745 	int r;
746 	struct multipath *m;
747 	struct arg_set as;
748 	unsigned pg_count = 0;
749 	unsigned next_pg_num;
750 
751 	as.argc = argc;
752 	as.argv = argv;
753 
754 	m = alloc_multipath(ti);
755 	if (!m) {
756 		ti->error = "can't allocate multipath";
757 		return -EINVAL;
758 	}
759 
760 	r = parse_features(&as, m);
761 	if (r)
762 		goto bad;
763 
764 	r = parse_hw_handler(&as, m);
765 	if (r)
766 		goto bad;
767 
768 	r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
769 	if (r)
770 		goto bad;
771 
772 	r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
773 	if (r)
774 		goto bad;
775 
776 	/* parse the priority groups */
777 	while (as.argc) {
778 		struct priority_group *pg;
779 
780 		pg = parse_priority_group(&as, m);
781 		if (!pg) {
782 			r = -EINVAL;
783 			goto bad;
784 		}
785 
786 		m->nr_valid_paths += pg->nr_pgpaths;
787 		list_add_tail(&pg->list, &m->priority_groups);
788 		pg_count++;
789 		pg->pg_num = pg_count;
790 		if (!--next_pg_num)
791 			m->next_pg = pg;
792 	}
793 
794 	if (pg_count != m->nr_priority_groups) {
795 		ti->error = "priority group count mismatch";
796 		r = -EINVAL;
797 		goto bad;
798 	}
799 
800 	return 0;
801 
802  bad:
803 	free_multipath(m);
804 	return r;
805 }
806 
807 static void multipath_dtr(struct dm_target *ti)
808 {
809 	struct multipath *m = (struct multipath *) ti->private;
810 
811 	flush_workqueue(kmpath_handlerd);
812 	flush_workqueue(kmultipathd);
813 	free_multipath(m);
814 }
815 
816 /*
817  * Map bios, recording original fields for later in case we have to resubmit
818  */
819 static int multipath_map(struct dm_target *ti, struct bio *bio,
820 			 union map_info *map_context)
821 {
822 	int r;
823 	struct dm_mpath_io *mpio;
824 	struct multipath *m = (struct multipath *) ti->private;
825 
826 	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
827 	dm_bio_record(&mpio->details, bio);
828 
829 	map_context->ptr = mpio;
830 	bio->bi_rw |= (1 << BIO_RW_FAILFAST);
831 	r = map_io(m, bio, mpio, 0);
832 	if (r < 0 || r == DM_MAPIO_REQUEUE)
833 		mempool_free(mpio, m->mpio_pool);
834 
835 	return r;
836 }
837 
838 /*
839  * Take a path out of use.
840  */
841 static int fail_path(struct pgpath *pgpath)
842 {
843 	unsigned long flags;
844 	struct multipath *m = pgpath->pg->m;
845 
846 	spin_lock_irqsave(&m->lock, flags);
847 
848 	if (!pgpath->path.is_active)
849 		goto out;
850 
851 	DMWARN("Failing path %s.", pgpath->path.dev->name);
852 
853 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
854 	pgpath->path.is_active = 0;
855 	pgpath->fail_count++;
856 
857 	m->nr_valid_paths--;
858 
859 	if (pgpath == m->current_pgpath)
860 		m->current_pgpath = NULL;
861 
862 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
863 		      pgpath->path.dev->name, m->nr_valid_paths);
864 
865 	queue_work(kmultipathd, &m->trigger_event);
866 
867 out:
868 	spin_unlock_irqrestore(&m->lock, flags);
869 
870 	return 0;
871 }
872 
873 /*
874  * Reinstate a previously-failed path
875  */
876 static int reinstate_path(struct pgpath *pgpath)
877 {
878 	int r = 0;
879 	unsigned long flags;
880 	struct multipath *m = pgpath->pg->m;
881 
882 	spin_lock_irqsave(&m->lock, flags);
883 
884 	if (pgpath->path.is_active)
885 		goto out;
886 
887 	if (!pgpath->pg->ps.type->reinstate_path) {
888 		DMWARN("Reinstate path not supported by path selector %s",
889 		       pgpath->pg->ps.type->name);
890 		r = -EINVAL;
891 		goto out;
892 	}
893 
894 	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
895 	if (r)
896 		goto out;
897 
898 	pgpath->path.is_active = 1;
899 
900 	m->current_pgpath = NULL;
901 	if (!m->nr_valid_paths++ && m->queue_size)
902 		queue_work(kmultipathd, &m->process_queued_ios);
903 
904 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
905 		      pgpath->path.dev->name, m->nr_valid_paths);
906 
907 	queue_work(kmultipathd, &m->trigger_event);
908 
909 out:
910 	spin_unlock_irqrestore(&m->lock, flags);
911 
912 	return r;
913 }
914 
915 /*
916  * Fail or reinstate all paths that match the provided struct dm_dev.
917  */
918 static int action_dev(struct multipath *m, struct dm_dev *dev,
919 		      action_fn action)
920 {
921 	int r = 0;
922 	struct pgpath *pgpath;
923 	struct priority_group *pg;
924 
925 	list_for_each_entry(pg, &m->priority_groups, list) {
926 		list_for_each_entry(pgpath, &pg->pgpaths, list) {
927 			if (pgpath->path.dev == dev)
928 				r = action(pgpath);
929 		}
930 	}
931 
932 	return r;
933 }
934 
935 /*
936  * Temporarily try to avoid having to use the specified PG
937  */
938 static void bypass_pg(struct multipath *m, struct priority_group *pg,
939 		      int bypassed)
940 {
941 	unsigned long flags;
942 
943 	spin_lock_irqsave(&m->lock, flags);
944 
945 	pg->bypassed = bypassed;
946 	m->current_pgpath = NULL;
947 	m->current_pg = NULL;
948 
949 	spin_unlock_irqrestore(&m->lock, flags);
950 
951 	queue_work(kmultipathd, &m->trigger_event);
952 }
953 
954 /*
955  * Switch to using the specified PG from the next I/O that gets mapped
956  */
957 static int switch_pg_num(struct multipath *m, const char *pgstr)
958 {
959 	struct priority_group *pg;
960 	unsigned pgnum;
961 	unsigned long flags;
962 
963 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
964 	    (pgnum > m->nr_priority_groups)) {
965 		DMWARN("invalid PG number supplied to switch_pg_num");
966 		return -EINVAL;
967 	}
968 
969 	spin_lock_irqsave(&m->lock, flags);
970 	list_for_each_entry(pg, &m->priority_groups, list) {
971 		pg->bypassed = 0;
972 		if (--pgnum)
973 			continue;
974 
975 		m->current_pgpath = NULL;
976 		m->current_pg = NULL;
977 		m->next_pg = pg;
978 	}
979 	spin_unlock_irqrestore(&m->lock, flags);
980 
981 	queue_work(kmultipathd, &m->trigger_event);
982 	return 0;
983 }
984 
985 /*
986  * Set/clear bypassed status of a PG.
987  * PGs are numbered upwards from 1 in the order they were declared.
988  */
989 static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
990 {
991 	struct priority_group *pg;
992 	unsigned pgnum;
993 
994 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
995 	    (pgnum > m->nr_priority_groups)) {
996 		DMWARN("invalid PG number supplied to bypass_pg");
997 		return -EINVAL;
998 	}
999 
1000 	list_for_each_entry(pg, &m->priority_groups, list) {
1001 		if (!--pgnum)
1002 			break;
1003 	}
1004 
1005 	bypass_pg(m, pg, bypassed);
1006 	return 0;
1007 }
1008 
1009 /*
1010  * Should we retry pg_init immediately?
1011  */
1012 static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1013 {
1014 	unsigned long flags;
1015 	int limit_reached = 0;
1016 
1017 	spin_lock_irqsave(&m->lock, flags);
1018 
1019 	if (m->pg_init_count <= m->pg_init_retries)
1020 		m->pg_init_required = 1;
1021 	else
1022 		limit_reached = 1;
1023 
1024 	spin_unlock_irqrestore(&m->lock, flags);
1025 
1026 	return limit_reached;
1027 }
1028 
1029 static void pg_init_done(struct dm_path *path, int errors)
1030 {
1031 	struct pgpath *pgpath = path_to_pgpath(path);
1032 	struct priority_group *pg = pgpath->pg;
1033 	struct multipath *m = pg->m;
1034 	unsigned long flags;
1035 
1036 	/* device or driver problems */
1037 	switch (errors) {
1038 	case SCSI_DH_OK:
1039 		break;
1040 	case SCSI_DH_NOSYS:
1041 		if (!m->hw_handler_name) {
1042 			errors = 0;
1043 			break;
1044 		}
1045 		DMERR("Cannot failover device because scsi_dh_%s was not "
1046 		      "loaded.", m->hw_handler_name);
1047 		/*
1048 		 * Fail path for now, so we do not ping pong
1049 		 */
1050 		fail_path(pgpath);
1051 		break;
1052 	case SCSI_DH_DEV_TEMP_BUSY:
1053 		/*
1054 		 * Probably doing something like FW upgrade on the
1055 		 * controller so try the other pg.
1056 		 */
1057 		bypass_pg(m, pg, 1);
1058 		break;
1059 	/* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1060 	case SCSI_DH_RETRY:
1061 	case SCSI_DH_IMM_RETRY:
1062 	case SCSI_DH_RES_TEMP_UNAVAIL:
1063 		if (pg_init_limit_reached(m, pgpath))
1064 			fail_path(pgpath);
1065 		errors = 0;
1066 		break;
1067 	default:
1068 		/*
1069 		 * We probably do not want to fail the path for a device
1070 		 * error, but this is what the old dm did. In future
1071 		 * patches we can do more advanced handling.
1072 		 */
1073 		fail_path(pgpath);
1074 	}
1075 
1076 	spin_lock_irqsave(&m->lock, flags);
1077 	if (errors) {
1078 		DMERR("Could not failover device. Error %d.", errors);
1079 		m->current_pgpath = NULL;
1080 		m->current_pg = NULL;
1081 	} else if (!m->pg_init_required) {
1082 		m->queue_io = 0;
1083 		pg->bypassed = 0;
1084 	}
1085 
1086 	m->pg_init_in_progress = 0;
1087 	queue_work(kmultipathd, &m->process_queued_ios);
1088 	spin_unlock_irqrestore(&m->lock, flags);
1089 }
1090 
1091 static void activate_path(struct work_struct *work)
1092 {
1093 	int ret;
1094 	struct multipath *m =
1095 		container_of(work, struct multipath, activate_path);
1096 	struct dm_path *path = &m->current_pgpath->path;
1097 
1098 	ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
1099 	pg_init_done(path, ret);
1100 }
1101 
1102 /*
1103  * end_io handling
1104  */
1105 static int do_end_io(struct multipath *m, struct bio *bio,
1106 		     int error, struct dm_mpath_io *mpio)
1107 {
1108 	unsigned long flags;
1109 
1110 	if (!error)
1111 		return 0;	/* I/O complete */
1112 
1113 	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
1114 		return error;
1115 
1116 	if (error == -EOPNOTSUPP)
1117 		return error;
1118 
1119 	spin_lock_irqsave(&m->lock, flags);
1120 	if (!m->nr_valid_paths) {
1121 		if (__must_push_back(m)) {
1122 			spin_unlock_irqrestore(&m->lock, flags);
1123 			return DM_ENDIO_REQUEUE;
1124 		} else if (!m->queue_if_no_path) {
1125 			spin_unlock_irqrestore(&m->lock, flags);
1126 			return -EIO;
1127 		} else {
1128 			spin_unlock_irqrestore(&m->lock, flags);
1129 			goto requeue;
1130 		}
1131 	}
1132 	spin_unlock_irqrestore(&m->lock, flags);
1133 
1134 	if (mpio->pgpath)
1135 		fail_path(mpio->pgpath);
1136 
1137       requeue:
1138 	dm_bio_restore(&mpio->details, bio);
1139 
1140 	/* queue for the daemon to resubmit or fail */
1141 	spin_lock_irqsave(&m->lock, flags);
1142 	bio_list_add(&m->queued_ios, bio);
1143 	m->queue_size++;
1144 	if (!m->queue_io)
1145 		queue_work(kmultipathd, &m->process_queued_ios);
1146 	spin_unlock_irqrestore(&m->lock, flags);
1147 
1148 	return DM_ENDIO_INCOMPLETE;	/* io not complete */
1149 }
1150 
1151 static int multipath_end_io(struct dm_target *ti, struct bio *bio,
1152 			    int error, union map_info *map_context)
1153 {
1154 	struct multipath *m = ti->private;
1155 	struct dm_mpath_io *mpio = map_context->ptr;
1156 	struct pgpath *pgpath = mpio->pgpath;
1157 	struct path_selector *ps;
1158 	int r;
1159 
1160 	r  = do_end_io(m, bio, error, mpio);
1161 	if (pgpath) {
1162 		ps = &pgpath->pg->ps;
1163 		if (ps->type->end_io)
1164 			ps->type->end_io(ps, &pgpath->path);
1165 	}
1166 	if (r != DM_ENDIO_INCOMPLETE)
1167 		mempool_free(mpio, m->mpio_pool);
1168 
1169 	return r;
1170 }
1171 
1172 /*
1173  * Suspend can't complete until all the I/O is processed so if
1174  * the last path fails we must error any remaining I/O.
1175  * Note that if the freeze_bdev fails while suspending, the
1176  * queue_if_no_path state is lost - userspace should reset it.
1177  */
1178 static void multipath_presuspend(struct dm_target *ti)
1179 {
1180 	struct multipath *m = (struct multipath *) ti->private;
1181 
1182 	queue_if_no_path(m, 0, 1);
1183 }
1184 
1185 /*
1186  * Restore the queue_if_no_path setting.
1187  */
1188 static void multipath_resume(struct dm_target *ti)
1189 {
1190 	struct multipath *m = (struct multipath *) ti->private;
1191 	unsigned long flags;
1192 
1193 	spin_lock_irqsave(&m->lock, flags);
1194 	m->queue_if_no_path = m->saved_queue_if_no_path;
1195 	spin_unlock_irqrestore(&m->lock, flags);
1196 }
1197 
1198 /*
1199  * Info output has the following format:
1200  * num_multipath_feature_args [multipath_feature_args]*
1201  * num_handler_status_args [handler_status_args]*
1202  * num_groups init_group_number
1203  *            [A|D|E num_ps_status_args [ps_status_args]*
1204  *             num_paths num_selector_args
1205  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1206  *
1207  * Table output has the following format (identical to the constructor string):
1208  * num_feature_args [features_args]*
1209  * num_handler_args hw_handler [hw_handler_args]*
1210  * num_groups init_group_number
1211  *     [priority selector-name num_ps_args [ps_args]*
1212  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1213  */
1214 static int multipath_status(struct dm_target *ti, status_type_t type,
1215 			    char *result, unsigned int maxlen)
1216 {
1217 	int sz = 0;
1218 	unsigned long flags;
1219 	struct multipath *m = (struct multipath *) ti->private;
1220 	struct priority_group *pg;
1221 	struct pgpath *p;
1222 	unsigned pg_num;
1223 	char state;
1224 
1225 	spin_lock_irqsave(&m->lock, flags);
1226 
1227 	/* Features */
1228 	if (type == STATUSTYPE_INFO)
1229 		DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1230 	else {
1231 		DMEMIT("%u ", m->queue_if_no_path +
1232 			      (m->pg_init_retries > 0) * 2);
1233 		if (m->queue_if_no_path)
1234 			DMEMIT("queue_if_no_path ");
1235 		if (m->pg_init_retries)
1236 			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1237 	}
1238 
1239 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1240 		DMEMIT("0 ");
1241 	else
1242 		DMEMIT("1 %s ", m->hw_handler_name);
1243 
1244 	DMEMIT("%u ", m->nr_priority_groups);
1245 
1246 	if (m->next_pg)
1247 		pg_num = m->next_pg->pg_num;
1248 	else if (m->current_pg)
1249 		pg_num = m->current_pg->pg_num;
1250 	else
1251 			pg_num = 1;
1252 
1253 	DMEMIT("%u ", pg_num);
1254 
1255 	switch (type) {
1256 	case STATUSTYPE_INFO:
1257 		list_for_each_entry(pg, &m->priority_groups, list) {
1258 			if (pg->bypassed)
1259 				state = 'D';	/* Disabled */
1260 			else if (pg == m->current_pg)
1261 				state = 'A';	/* Currently Active */
1262 			else
1263 				state = 'E';	/* Enabled */
1264 
1265 			DMEMIT("%c ", state);
1266 
1267 			if (pg->ps.type->status)
1268 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1269 							  result + sz,
1270 							  maxlen - sz);
1271 			else
1272 				DMEMIT("0 ");
1273 
1274 			DMEMIT("%u %u ", pg->nr_pgpaths,
1275 			       pg->ps.type->info_args);
1276 
1277 			list_for_each_entry(p, &pg->pgpaths, list) {
1278 				DMEMIT("%s %s %u ", p->path.dev->name,
1279 				       p->path.is_active ? "A" : "F",
1280 				       p->fail_count);
1281 				if (pg->ps.type->status)
1282 					sz += pg->ps.type->status(&pg->ps,
1283 					      &p->path, type, result + sz,
1284 					      maxlen - sz);
1285 			}
1286 		}
1287 		break;
1288 
1289 	case STATUSTYPE_TABLE:
1290 		list_for_each_entry(pg, &m->priority_groups, list) {
1291 			DMEMIT("%s ", pg->ps.type->name);
1292 
1293 			if (pg->ps.type->status)
1294 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1295 							  result + sz,
1296 							  maxlen - sz);
1297 			else
1298 				DMEMIT("0 ");
1299 
1300 			DMEMIT("%u %u ", pg->nr_pgpaths,
1301 			       pg->ps.type->table_args);
1302 
1303 			list_for_each_entry(p, &pg->pgpaths, list) {
1304 				DMEMIT("%s ", p->path.dev->name);
1305 				if (pg->ps.type->status)
1306 					sz += pg->ps.type->status(&pg->ps,
1307 					      &p->path, type, result + sz,
1308 					      maxlen - sz);
1309 			}
1310 		}
1311 		break;
1312 	}
1313 
1314 	spin_unlock_irqrestore(&m->lock, flags);
1315 
1316 	return 0;
1317 }
1318 
1319 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1320 {
1321 	int r;
1322 	struct dm_dev *dev;
1323 	struct multipath *m = (struct multipath *) ti->private;
1324 	action_fn action;
1325 
1326 	if (argc == 1) {
1327 		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
1328 			return queue_if_no_path(m, 1, 0);
1329 		else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
1330 			return queue_if_no_path(m, 0, 0);
1331 	}
1332 
1333 	if (argc != 2)
1334 		goto error;
1335 
1336 	if (!strnicmp(argv[0], MESG_STR("disable_group")))
1337 		return bypass_pg_num(m, argv[1], 1);
1338 	else if (!strnicmp(argv[0], MESG_STR("enable_group")))
1339 		return bypass_pg_num(m, argv[1], 0);
1340 	else if (!strnicmp(argv[0], MESG_STR("switch_group")))
1341 		return switch_pg_num(m, argv[1]);
1342 	else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1343 		action = reinstate_path;
1344 	else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1345 		action = fail_path;
1346 	else
1347 		goto error;
1348 
1349 	r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1350 			  dm_table_get_mode(ti->table), &dev);
1351 	if (r) {
1352 		DMWARN("message: error getting device %s",
1353 		       argv[1]);
1354 		return -EINVAL;
1355 	}
1356 
1357 	r = action_dev(m, dev, action);
1358 
1359 	dm_put_device(ti, dev);
1360 
1361 	return r;
1362 
1363 error:
1364 	DMWARN("Unrecognised multipath message received.");
1365 	return -EINVAL;
1366 }
1367 
1368 static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
1369 			   struct file *filp, unsigned int cmd,
1370 			   unsigned long arg)
1371 {
1372 	struct multipath *m = (struct multipath *) ti->private;
1373 	struct block_device *bdev = NULL;
1374 	unsigned long flags;
1375 	struct file fake_file = {};
1376 	struct dentry fake_dentry = {};
1377 	int r = 0;
1378 
1379 	fake_file.f_path.dentry = &fake_dentry;
1380 
1381 	spin_lock_irqsave(&m->lock, flags);
1382 
1383 	if (!m->current_pgpath)
1384 		__choose_pgpath(m);
1385 
1386 	if (m->current_pgpath) {
1387 		bdev = m->current_pgpath->path.dev->bdev;
1388 		fake_dentry.d_inode = bdev->bd_inode;
1389 		fake_file.f_mode = m->current_pgpath->path.dev->mode;
1390 	}
1391 
1392 	if (m->queue_io)
1393 		r = -EAGAIN;
1394 	else if (!bdev)
1395 		r = -EIO;
1396 
1397 	spin_unlock_irqrestore(&m->lock, flags);
1398 
1399 	return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file,
1400 					 bdev->bd_disk, cmd, arg);
1401 }
1402 
1403 /*-----------------------------------------------------------------
1404  * Module setup
1405  *---------------------------------------------------------------*/
1406 static struct target_type multipath_target = {
1407 	.name = "multipath",
1408 	.version = {1, 0, 5},
1409 	.module = THIS_MODULE,
1410 	.ctr = multipath_ctr,
1411 	.dtr = multipath_dtr,
1412 	.map = multipath_map,
1413 	.end_io = multipath_end_io,
1414 	.presuspend = multipath_presuspend,
1415 	.resume = multipath_resume,
1416 	.status = multipath_status,
1417 	.message = multipath_message,
1418 	.ioctl  = multipath_ioctl,
1419 };
1420 
1421 static int __init dm_multipath_init(void)
1422 {
1423 	int r;
1424 
1425 	/* allocate a slab for the dm_ios */
1426 	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1427 	if (!_mpio_cache)
1428 		return -ENOMEM;
1429 
1430 	r = dm_register_target(&multipath_target);
1431 	if (r < 0) {
1432 		DMERR("register failed %d", r);
1433 		kmem_cache_destroy(_mpio_cache);
1434 		return -EINVAL;
1435 	}
1436 
1437 	kmultipathd = create_workqueue("kmpathd");
1438 	if (!kmultipathd) {
1439 		DMERR("failed to create workqueue kmpathd");
1440 		dm_unregister_target(&multipath_target);
1441 		kmem_cache_destroy(_mpio_cache);
1442 		return -ENOMEM;
1443 	}
1444 
1445 	/*
1446 	 * A separate workqueue is used to handle the device handlers
1447 	 * to avoid overloading existing workqueue. Overloading the
1448 	 * old workqueue would also create a bottleneck in the
1449 	 * path of the storage hardware device activation.
1450 	 */
1451 	kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd");
1452 	if (!kmpath_handlerd) {
1453 		DMERR("failed to create workqueue kmpath_handlerd");
1454 		destroy_workqueue(kmultipathd);
1455 		dm_unregister_target(&multipath_target);
1456 		kmem_cache_destroy(_mpio_cache);
1457 		return -ENOMEM;
1458 	}
1459 
1460 	DMINFO("version %u.%u.%u loaded",
1461 	       multipath_target.version[0], multipath_target.version[1],
1462 	       multipath_target.version[2]);
1463 
1464 	return r;
1465 }
1466 
1467 static void __exit dm_multipath_exit(void)
1468 {
1469 	int r;
1470 
1471 	destroy_workqueue(kmpath_handlerd);
1472 	destroy_workqueue(kmultipathd);
1473 
1474 	r = dm_unregister_target(&multipath_target);
1475 	if (r < 0)
1476 		DMERR("target unregister failed %d", r);
1477 	kmem_cache_destroy(_mpio_cache);
1478 }
1479 
1480 module_init(dm_multipath_init);
1481 module_exit(dm_multipath_exit);
1482 
1483 MODULE_DESCRIPTION(DM_NAME " multipath target");
1484 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1485 MODULE_LICENSE("GPL");
1486