xref: /openbmc/linux/drivers/md/dm-mpath.c (revision 545e4006)
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm.h"
9 #include "dm-path-selector.h"
10 #include "dm-bio-list.h"
11 #include "dm-bio-record.h"
12 #include "dm-uevent.h"
13 
14 #include <linux/ctype.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/pagemap.h>
19 #include <linux/slab.h>
20 #include <linux/time.h>
21 #include <linux/workqueue.h>
22 #include <scsi/scsi_dh.h>
23 #include <asm/atomic.h>
24 
/*
 * Message prefix for this target; presumably consumed by the DMWARN()/
 * DMERR() logging macros, which are defined outside this file.
 */
#define DM_MSG_PREFIX "multipath"

/*
 * Expand a string literal into the pair "literal, sizeof(literal)".
 * Used as the last two arguments of strnicmp(); since sizeof counts the
 * terminating NUL, the comparison covers the whole keyword.
 */
#define MESG_STR(x) (x), sizeof(x)
27 
28 /* Path properties */
29 struct pgpath {
30 	struct list_head list;
31 
32 	struct priority_group *pg;	/* Owning PG */
33 	unsigned fail_count;		/* Cumulative failure count */
34 
35 	struct dm_path path;
36 };
37 
38 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
39 
40 /*
41  * Paths are grouped into Priority Groups and numbered from 1 upwards.
42  * Each has a path selector which controls which path gets used.
43  */
44 struct priority_group {
45 	struct list_head list;
46 
47 	struct multipath *m;		/* Owning multipath instance */
48 	struct path_selector ps;
49 
50 	unsigned pg_num;		/* Reference number */
51 	unsigned bypassed;		/* Temporarily bypass this PG? */
52 
53 	unsigned nr_pgpaths;		/* Number of paths in PG */
54 	struct list_head pgpaths;
55 };
56 
57 /* Multipath context */
58 struct multipath {
59 	struct list_head list;
60 	struct dm_target *ti;
61 
62 	spinlock_t lock;
63 
64 	const char *hw_handler_name;
65 	struct work_struct activate_path;
66 	unsigned nr_priority_groups;
67 	struct list_head priority_groups;
68 	unsigned pg_init_required;	/* pg_init needs calling? */
69 	unsigned pg_init_in_progress;	/* Only one pg_init allowed at once */
70 
71 	unsigned nr_valid_paths;	/* Total number of usable paths */
72 	struct pgpath *current_pgpath;
73 	struct priority_group *current_pg;
74 	struct priority_group *next_pg;	/* Switch to this PG if set */
75 	unsigned repeat_count;		/* I/Os left before calling PS again */
76 
77 	unsigned queue_io;		/* Must we queue all I/O? */
78 	unsigned queue_if_no_path;	/* Queue I/O if last path fails? */
79 	unsigned saved_queue_if_no_path;/* Saved state during suspension */
80 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
81 	unsigned pg_init_count;		/* Number of times pg_init called */
82 
83 	struct work_struct process_queued_ios;
84 	struct bio_list queued_ios;
85 	unsigned queue_size;
86 
87 	struct work_struct trigger_event;
88 
89 	/*
90 	 * We must use a mempool of dm_mpath_io structs so that we
91 	 * can resubmit bios on error.
92 	 */
93 	mempool_t *mpio_pool;
94 };
95 
96 /*
97  * Context information attached to each bio we process.
98  */
99 struct dm_mpath_io {
100 	struct pgpath *pgpath;
101 	struct dm_bio_details details;
102 };
103 
/* Per-path operation applied by action_dev() (e.g. fail or reinstate). */
typedef int (*action_fn) (struct pgpath *pgpath);

/* Minimum number of dm_mpath_io objects reserved in each mempool. */
#define MIN_IOS 256

/* Slab cache that the per-target mpio mempools are created from. */
static struct kmem_cache *_mpio_cache;

/* Workqueue for process_queued_ios and trigger_event work items. */
static struct workqueue_struct *kmultipathd;
/* Workqueue for activate_path work items. */
static struct workqueue_struct *kmpath_handlerd;

/* Work functions, defined further down. */
static void activate_path(struct work_struct *work);
static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work);
114 
115 
116 /*-----------------------------------------------
117  * Allocation routines
118  *-----------------------------------------------*/
119 
120 static struct pgpath *alloc_pgpath(void)
121 {
122 	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
123 
124 	if (pgpath)
125 		pgpath->path.is_active = 1;
126 
127 	return pgpath;
128 }
129 
/*
 * Release the memory of a pgpath obtained from alloc_pgpath().
 * The device reference is not touched here; callers drop it themselves.
 */
static void free_pgpath(struct pgpath *pgpath)
{
	kfree(pgpath);
}
134 
135 static struct priority_group *alloc_priority_group(void)
136 {
137 	struct priority_group *pg;
138 
139 	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
140 
141 	if (pg)
142 		INIT_LIST_HEAD(&pg->pgpaths);
143 
144 	return pg;
145 }
146 
147 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
148 {
149 	struct pgpath *pgpath, *tmp;
150 
151 	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
152 		list_del(&pgpath->list);
153 		dm_put_device(ti, pgpath->path.dev);
154 		free_pgpath(pgpath);
155 	}
156 }
157 
158 static void free_priority_group(struct priority_group *pg,
159 				struct dm_target *ti)
160 {
161 	struct path_selector *ps = &pg->ps;
162 
163 	if (ps->type) {
164 		ps->type->destroy(ps);
165 		dm_put_path_selector(ps->type);
166 	}
167 
168 	free_pgpaths(&pg->pgpaths, ti);
169 	kfree(pg);
170 }
171 
172 static struct multipath *alloc_multipath(struct dm_target *ti)
173 {
174 	struct multipath *m;
175 
176 	m = kzalloc(sizeof(*m), GFP_KERNEL);
177 	if (m) {
178 		INIT_LIST_HEAD(&m->priority_groups);
179 		spin_lock_init(&m->lock);
180 		m->queue_io = 1;
181 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
182 		INIT_WORK(&m->trigger_event, trigger_event);
183 		INIT_WORK(&m->activate_path, activate_path);
184 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
185 		if (!m->mpio_pool) {
186 			kfree(m);
187 			return NULL;
188 		}
189 		m->ti = ti;
190 		ti->private = m;
191 	}
192 
193 	return m;
194 }
195 
196 static void free_multipath(struct multipath *m)
197 {
198 	struct priority_group *pg, *tmp;
199 
200 	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
201 		list_del(&pg->list);
202 		free_priority_group(pg, m->ti);
203 	}
204 
205 	kfree(m->hw_handler_name);
206 	mempool_destroy(m->mpio_pool);
207 	kfree(m);
208 }
209 
210 
211 /*-----------------------------------------------
212  * Path selection
213  *-----------------------------------------------*/
214 
215 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
216 {
217 	m->current_pg = pgpath->pg;
218 
219 	/* Must we initialise the PG first, and queue I/O till it's ready? */
220 	if (m->hw_handler_name) {
221 		m->pg_init_required = 1;
222 		m->queue_io = 1;
223 	} else {
224 		m->pg_init_required = 0;
225 		m->queue_io = 0;
226 	}
227 
228 	m->pg_init_count = 0;
229 }
230 
231 static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
232 {
233 	struct dm_path *path;
234 
235 	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
236 	if (!path)
237 		return -ENXIO;
238 
239 	m->current_pgpath = path_to_pgpath(path);
240 
241 	if (m->current_pg != pg)
242 		__switch_pg(m, m->current_pgpath);
243 
244 	return 0;
245 }
246 
247 static void __choose_pgpath(struct multipath *m)
248 {
249 	struct priority_group *pg;
250 	unsigned bypassed = 1;
251 
252 	if (!m->nr_valid_paths)
253 		goto failed;
254 
255 	/* Were we instructed to switch PG? */
256 	if (m->next_pg) {
257 		pg = m->next_pg;
258 		m->next_pg = NULL;
259 		if (!__choose_path_in_pg(m, pg))
260 			return;
261 	}
262 
263 	/* Don't change PG until it has no remaining paths */
264 	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
265 		return;
266 
267 	/*
268 	 * Loop through priority groups until we find a valid path.
269 	 * First time we skip PGs marked 'bypassed'.
270 	 * Second time we only try the ones we skipped.
271 	 */
272 	do {
273 		list_for_each_entry(pg, &m->priority_groups, list) {
274 			if (pg->bypassed == bypassed)
275 				continue;
276 			if (!__choose_path_in_pg(m, pg))
277 				return;
278 		}
279 	} while (bypassed--);
280 
281 failed:
282 	m->current_pgpath = NULL;
283 	m->current_pg = NULL;
284 }
285 
286 /*
287  * Check whether bios must be queued in the device-mapper core rather
288  * than here in the target.
289  *
290  * m->lock must be held on entry.
291  *
292  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
293  * same value then we are not between multipath_presuspend()
294  * and multipath_resume() calls and we have no need to check
295  * for the DMF_NOFLUSH_SUSPENDING flag.
296  */
297 static int __must_push_back(struct multipath *m)
298 {
299 	return (m->queue_if_no_path != m->saved_queue_if_no_path &&
300 		dm_noflush_suspending(m->ti));
301 }
302 
303 static int map_io(struct multipath *m, struct bio *bio,
304 		  struct dm_mpath_io *mpio, unsigned was_queued)
305 {
306 	int r = DM_MAPIO_REMAPPED;
307 	unsigned long flags;
308 	struct pgpath *pgpath;
309 
310 	spin_lock_irqsave(&m->lock, flags);
311 
312 	/* Do we need to select a new pgpath? */
313 	if (!m->current_pgpath ||
314 	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
315 		__choose_pgpath(m);
316 
317 	pgpath = m->current_pgpath;
318 
319 	if (was_queued)
320 		m->queue_size--;
321 
322 	if ((pgpath && m->queue_io) ||
323 	    (!pgpath && m->queue_if_no_path)) {
324 		/* Queue for the daemon to resubmit */
325 		bio_list_add(&m->queued_ios, bio);
326 		m->queue_size++;
327 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
328 		    !m->queue_io)
329 			queue_work(kmultipathd, &m->process_queued_ios);
330 		pgpath = NULL;
331 		r = DM_MAPIO_SUBMITTED;
332 	} else if (pgpath)
333 		bio->bi_bdev = pgpath->path.dev->bdev;
334 	else if (__must_push_back(m))
335 		r = DM_MAPIO_REQUEUE;
336 	else
337 		r = -EIO;	/* Failed */
338 
339 	mpio->pgpath = pgpath;
340 
341 	spin_unlock_irqrestore(&m->lock, flags);
342 
343 	return r;
344 }
345 
346 /*
347  * If we run out of usable paths, should we queue I/O or error it?
348  */
349 static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
350 			    unsigned save_old_value)
351 {
352 	unsigned long flags;
353 
354 	spin_lock_irqsave(&m->lock, flags);
355 
356 	if (save_old_value)
357 		m->saved_queue_if_no_path = m->queue_if_no_path;
358 	else
359 		m->saved_queue_if_no_path = queue_if_no_path;
360 	m->queue_if_no_path = queue_if_no_path;
361 	if (!m->queue_if_no_path && m->queue_size)
362 		queue_work(kmultipathd, &m->process_queued_ios);
363 
364 	spin_unlock_irqrestore(&m->lock, flags);
365 
366 	return 0;
367 }
368 
369 /*-----------------------------------------------------------------
370  * The multipath daemon is responsible for resubmitting queued ios.
371  *---------------------------------------------------------------*/
372 
373 static void dispatch_queued_ios(struct multipath *m)
374 {
375 	int r;
376 	unsigned long flags;
377 	struct bio *bio = NULL, *next;
378 	struct dm_mpath_io *mpio;
379 	union map_info *info;
380 
381 	spin_lock_irqsave(&m->lock, flags);
382 	bio = bio_list_get(&m->queued_ios);
383 	spin_unlock_irqrestore(&m->lock, flags);
384 
385 	while (bio) {
386 		next = bio->bi_next;
387 		bio->bi_next = NULL;
388 
389 		info = dm_get_mapinfo(bio);
390 		mpio = info->ptr;
391 
392 		r = map_io(m, bio, mpio, 1);
393 		if (r < 0)
394 			bio_endio(bio, r);
395 		else if (r == DM_MAPIO_REMAPPED)
396 			generic_make_request(bio);
397 		else if (r == DM_MAPIO_REQUEUE)
398 			bio_endio(bio, -EIO);
399 
400 		bio = next;
401 	}
402 }
403 
404 static void process_queued_ios(struct work_struct *work)
405 {
406 	struct multipath *m =
407 		container_of(work, struct multipath, process_queued_ios);
408 	struct pgpath *pgpath = NULL;
409 	unsigned init_required = 0, must_queue = 1;
410 	unsigned long flags;
411 
412 	spin_lock_irqsave(&m->lock, flags);
413 
414 	if (!m->queue_size)
415 		goto out;
416 
417 	if (!m->current_pgpath)
418 		__choose_pgpath(m);
419 
420 	pgpath = m->current_pgpath;
421 
422 	if ((pgpath && !m->queue_io) ||
423 	    (!pgpath && !m->queue_if_no_path))
424 		must_queue = 0;
425 
426 	if (m->pg_init_required && !m->pg_init_in_progress) {
427 		m->pg_init_count++;
428 		m->pg_init_required = 0;
429 		m->pg_init_in_progress = 1;
430 		init_required = 1;
431 	}
432 
433 out:
434 	spin_unlock_irqrestore(&m->lock, flags);
435 
436 	if (init_required)
437 		queue_work(kmpath_handlerd, &m->activate_path);
438 
439 	if (!must_queue)
440 		dispatch_queued_ios(m);
441 }
442 
443 /*
444  * An event is triggered whenever a path is taken out of use.
445  * Includes path failure and PG bypass.
446  */
447 static void trigger_event(struct work_struct *work)
448 {
449 	struct multipath *m =
450 		container_of(work, struct multipath, trigger_event);
451 
452 	dm_table_event(m->ti->table);
453 }
454 
455 /*-----------------------------------------------------------------
456  * Constructor/argument parsing:
457  * <#multipath feature args> [<arg>]*
458  * <#hw_handler args> [hw_handler [<arg>]*]
459  * <#priority groups>
460  * <initial priority group>
461  *     [<selector> <#selector args> [<arg>]*
462  *      <#paths> <#per-path selector args>
463  *         [<path> [<arg>]* ]+ ]+
464  *---------------------------------------------------------------*/
/* Inclusive bounds for one numeric table argument, plus its error text. */
struct param {
	unsigned min;
	unsigned max;
	char *error;
};

/*
 * Parse 'str' as an unsigned decimal number into *v and check that it
 * lies within [param->min, param->max].
 *
 * The whole argument has to be numeric: anything left over after the
 * digits (e.g. "12abc") is rejected instead of being silently ignored.
 *
 * Returns 0 on success.  On a NULL, malformed or out-of-range argument
 * returns -EINVAL and points *error at param->error; *v may already
 * have been overwritten in that case.
 */
static int read_param(struct param *param, char *str, unsigned *v, char **error)
{
	char trailing;

	/* "%u%c" converts two items only if something follows the number */
	if (!str ||
	    (sscanf(str, "%u%c", v, &trailing) != 1) ||
	    (*v < param->min) ||
	    (*v > param->max)) {
		*error = param->error;
		return -EINVAL;
	}

	return 0;
}
483 
/* Cursor over the not-yet-consumed part of the constructor arguments. */
struct arg_set {
	unsigned argc;		/* Arguments remaining */
	char **argv;		/* First remaining argument */
};

/*
 * Take the next argument off the front of the set.
 * Returns NULL, leaving the set untouched, when it is already empty.
 */
static char *shift(struct arg_set *as)
{
	if (!as->argc)
		return NULL;

	as->argc--;
	return *as->argv++;
}
502 
503 static void consume(struct arg_set *as, unsigned n)
504 {
505 	BUG_ON (as->argc < n);
506 	as->argc -= n;
507 	as->argv += n;
508 }
509 
510 static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
511 			       struct dm_target *ti)
512 {
513 	int r;
514 	struct path_selector_type *pst;
515 	unsigned ps_argc;
516 
517 	static struct param _params[] = {
518 		{0, 1024, "invalid number of path selector args"},
519 	};
520 
521 	pst = dm_get_path_selector(shift(as));
522 	if (!pst) {
523 		ti->error = "unknown path selector type";
524 		return -EINVAL;
525 	}
526 
527 	r = read_param(_params, shift(as), &ps_argc, &ti->error);
528 	if (r) {
529 		dm_put_path_selector(pst);
530 		return -EINVAL;
531 	}
532 
533 	r = pst->create(&pg->ps, ps_argc, as->argv);
534 	if (r) {
535 		dm_put_path_selector(pst);
536 		ti->error = "path selector constructor failed";
537 		return r;
538 	}
539 
540 	pg->ps.type = pst;
541 	consume(as, ps_argc);
542 
543 	return 0;
544 }
545 
546 static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
547 			       struct dm_target *ti)
548 {
549 	int r;
550 	struct pgpath *p;
551 
552 	/* we need at least a path arg */
553 	if (as->argc < 1) {
554 		ti->error = "no device given";
555 		return NULL;
556 	}
557 
558 	p = alloc_pgpath();
559 	if (!p)
560 		return NULL;
561 
562 	r = dm_get_device(ti, shift(as), ti->begin, ti->len,
563 			  dm_table_get_mode(ti->table), &p->path.dev);
564 	if (r) {
565 		ti->error = "error getting device";
566 		goto bad;
567 	}
568 
569 	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
570 	if (r) {
571 		dm_put_device(ti, p->path.dev);
572 		goto bad;
573 	}
574 
575 	return p;
576 
577  bad:
578 	free_pgpath(p);
579 	return NULL;
580 }
581 
582 static struct priority_group *parse_priority_group(struct arg_set *as,
583 						   struct multipath *m)
584 {
585 	static struct param _params[] = {
586 		{1, 1024, "invalid number of paths"},
587 		{0, 1024, "invalid number of selector args"}
588 	};
589 
590 	int r;
591 	unsigned i, nr_selector_args, nr_params;
592 	struct priority_group *pg;
593 	struct dm_target *ti = m->ti;
594 
595 	if (as->argc < 2) {
596 		as->argc = 0;
597 		ti->error = "not enough priority group aruments";
598 		return NULL;
599 	}
600 
601 	pg = alloc_priority_group();
602 	if (!pg) {
603 		ti->error = "couldn't allocate priority group";
604 		return NULL;
605 	}
606 	pg->m = m;
607 
608 	r = parse_path_selector(as, pg, ti);
609 	if (r)
610 		goto bad;
611 
612 	/*
613 	 * read the paths
614 	 */
615 	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
616 	if (r)
617 		goto bad;
618 
619 	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
620 	if (r)
621 		goto bad;
622 
623 	nr_params = 1 + nr_selector_args;
624 	for (i = 0; i < pg->nr_pgpaths; i++) {
625 		struct pgpath *pgpath;
626 		struct arg_set path_args;
627 
628 		if (as->argc < nr_params) {
629 			ti->error = "not enough path parameters";
630 			goto bad;
631 		}
632 
633 		path_args.argc = nr_params;
634 		path_args.argv = as->argv;
635 
636 		pgpath = parse_path(&path_args, &pg->ps, ti);
637 		if (!pgpath)
638 			goto bad;
639 
640 		pgpath->pg = pg;
641 		list_add_tail(&pgpath->list, &pg->pgpaths);
642 		consume(as, nr_params);
643 	}
644 
645 	return pg;
646 
647  bad:
648 	free_priority_group(pg, ti);
649 	return NULL;
650 }
651 
652 static int parse_hw_handler(struct arg_set *as, struct multipath *m)
653 {
654 	unsigned hw_argc;
655 	struct dm_target *ti = m->ti;
656 
657 	static struct param _params[] = {
658 		{0, 1024, "invalid number of hardware handler args"},
659 	};
660 
661 	if (read_param(_params, shift(as), &hw_argc, &ti->error))
662 		return -EINVAL;
663 
664 	if (!hw_argc)
665 		return 0;
666 
667 	m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
668 	request_module("scsi_dh_%s", m->hw_handler_name);
669 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
670 		ti->error = "unknown hardware handler type";
671 		kfree(m->hw_handler_name);
672 		m->hw_handler_name = NULL;
673 		return -EINVAL;
674 	}
675 	consume(as, hw_argc - 1);
676 
677 	return 0;
678 }
679 
680 static int parse_features(struct arg_set *as, struct multipath *m)
681 {
682 	int r;
683 	unsigned argc;
684 	struct dm_target *ti = m->ti;
685 	const char *param_name;
686 
687 	static struct param _params[] = {
688 		{0, 3, "invalid number of feature args"},
689 		{1, 50, "pg_init_retries must be between 1 and 50"},
690 	};
691 
692 	r = read_param(_params, shift(as), &argc, &ti->error);
693 	if (r)
694 		return -EINVAL;
695 
696 	if (!argc)
697 		return 0;
698 
699 	do {
700 		param_name = shift(as);
701 		argc--;
702 
703 		if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
704 			r = queue_if_no_path(m, 1, 0);
705 			continue;
706 		}
707 
708 		if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
709 		    (argc >= 1)) {
710 			r = read_param(_params + 1, shift(as),
711 				       &m->pg_init_retries, &ti->error);
712 			argc--;
713 			continue;
714 		}
715 
716 		ti->error = "Unrecognised multipath feature request";
717 		r = -EINVAL;
718 	} while (argc && !r);
719 
720 	return r;
721 }
722 
723 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
724 			 char **argv)
725 {
726 	/* target parameters */
727 	static struct param _params[] = {
728 		{1, 1024, "invalid number of priority groups"},
729 		{1, 1024, "invalid initial priority group number"},
730 	};
731 
732 	int r;
733 	struct multipath *m;
734 	struct arg_set as;
735 	unsigned pg_count = 0;
736 	unsigned next_pg_num;
737 
738 	as.argc = argc;
739 	as.argv = argv;
740 
741 	m = alloc_multipath(ti);
742 	if (!m) {
743 		ti->error = "can't allocate multipath";
744 		return -EINVAL;
745 	}
746 
747 	r = parse_features(&as, m);
748 	if (r)
749 		goto bad;
750 
751 	r = parse_hw_handler(&as, m);
752 	if (r)
753 		goto bad;
754 
755 	r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
756 	if (r)
757 		goto bad;
758 
759 	r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
760 	if (r)
761 		goto bad;
762 
763 	/* parse the priority groups */
764 	while (as.argc) {
765 		struct priority_group *pg;
766 
767 		pg = parse_priority_group(&as, m);
768 		if (!pg) {
769 			r = -EINVAL;
770 			goto bad;
771 		}
772 
773 		m->nr_valid_paths += pg->nr_pgpaths;
774 		list_add_tail(&pg->list, &m->priority_groups);
775 		pg_count++;
776 		pg->pg_num = pg_count;
777 		if (!--next_pg_num)
778 			m->next_pg = pg;
779 	}
780 
781 	if (pg_count != m->nr_priority_groups) {
782 		ti->error = "priority group count mismatch";
783 		r = -EINVAL;
784 		goto bad;
785 	}
786 
787 	return 0;
788 
789  bad:
790 	free_multipath(m);
791 	return r;
792 }
793 
794 static void multipath_dtr(struct dm_target *ti)
795 {
796 	struct multipath *m = (struct multipath *) ti->private;
797 
798 	flush_workqueue(kmpath_handlerd);
799 	flush_workqueue(kmultipathd);
800 	free_multipath(m);
801 }
802 
803 /*
804  * Map bios, recording original fields for later in case we have to resubmit
805  */
806 static int multipath_map(struct dm_target *ti, struct bio *bio,
807 			 union map_info *map_context)
808 {
809 	int r;
810 	struct dm_mpath_io *mpio;
811 	struct multipath *m = (struct multipath *) ti->private;
812 
813 	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
814 	dm_bio_record(&mpio->details, bio);
815 
816 	map_context->ptr = mpio;
817 	bio->bi_rw |= (1 << BIO_RW_FAILFAST);
818 	r = map_io(m, bio, mpio, 0);
819 	if (r < 0 || r == DM_MAPIO_REQUEUE)
820 		mempool_free(mpio, m->mpio_pool);
821 
822 	return r;
823 }
824 
825 /*
826  * Take a path out of use.
827  */
828 static int fail_path(struct pgpath *pgpath)
829 {
830 	unsigned long flags;
831 	struct multipath *m = pgpath->pg->m;
832 
833 	spin_lock_irqsave(&m->lock, flags);
834 
835 	if (!pgpath->path.is_active)
836 		goto out;
837 
838 	DMWARN("Failing path %s.", pgpath->path.dev->name);
839 
840 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
841 	pgpath->path.is_active = 0;
842 	pgpath->fail_count++;
843 
844 	m->nr_valid_paths--;
845 
846 	if (pgpath == m->current_pgpath)
847 		m->current_pgpath = NULL;
848 
849 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
850 		      pgpath->path.dev->name, m->nr_valid_paths);
851 
852 	queue_work(kmultipathd, &m->trigger_event);
853 
854 out:
855 	spin_unlock_irqrestore(&m->lock, flags);
856 
857 	return 0;
858 }
859 
860 /*
861  * Reinstate a previously-failed path
862  */
863 static int reinstate_path(struct pgpath *pgpath)
864 {
865 	int r = 0;
866 	unsigned long flags;
867 	struct multipath *m = pgpath->pg->m;
868 
869 	spin_lock_irqsave(&m->lock, flags);
870 
871 	if (pgpath->path.is_active)
872 		goto out;
873 
874 	if (!pgpath->pg->ps.type->reinstate_path) {
875 		DMWARN("Reinstate path not supported by path selector %s",
876 		       pgpath->pg->ps.type->name);
877 		r = -EINVAL;
878 		goto out;
879 	}
880 
881 	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
882 	if (r)
883 		goto out;
884 
885 	pgpath->path.is_active = 1;
886 
887 	m->current_pgpath = NULL;
888 	if (!m->nr_valid_paths++ && m->queue_size)
889 		queue_work(kmultipathd, &m->process_queued_ios);
890 
891 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
892 		      pgpath->path.dev->name, m->nr_valid_paths);
893 
894 	queue_work(kmultipathd, &m->trigger_event);
895 
896 out:
897 	spin_unlock_irqrestore(&m->lock, flags);
898 
899 	return r;
900 }
901 
902 /*
903  * Fail or reinstate all paths that match the provided struct dm_dev.
904  */
905 static int action_dev(struct multipath *m, struct dm_dev *dev,
906 		      action_fn action)
907 {
908 	int r = 0;
909 	struct pgpath *pgpath;
910 	struct priority_group *pg;
911 
912 	list_for_each_entry(pg, &m->priority_groups, list) {
913 		list_for_each_entry(pgpath, &pg->pgpaths, list) {
914 			if (pgpath->path.dev == dev)
915 				r = action(pgpath);
916 		}
917 	}
918 
919 	return r;
920 }
921 
922 /*
923  * Temporarily try to avoid having to use the specified PG
924  */
925 static void bypass_pg(struct multipath *m, struct priority_group *pg,
926 		      int bypassed)
927 {
928 	unsigned long flags;
929 
930 	spin_lock_irqsave(&m->lock, flags);
931 
932 	pg->bypassed = bypassed;
933 	m->current_pgpath = NULL;
934 	m->current_pg = NULL;
935 
936 	spin_unlock_irqrestore(&m->lock, flags);
937 
938 	queue_work(kmultipathd, &m->trigger_event);
939 }
940 
941 /*
942  * Switch to using the specified PG from the next I/O that gets mapped
943  */
944 static int switch_pg_num(struct multipath *m, const char *pgstr)
945 {
946 	struct priority_group *pg;
947 	unsigned pgnum;
948 	unsigned long flags;
949 
950 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
951 	    (pgnum > m->nr_priority_groups)) {
952 		DMWARN("invalid PG number supplied to switch_pg_num");
953 		return -EINVAL;
954 	}
955 
956 	spin_lock_irqsave(&m->lock, flags);
957 	list_for_each_entry(pg, &m->priority_groups, list) {
958 		pg->bypassed = 0;
959 		if (--pgnum)
960 			continue;
961 
962 		m->current_pgpath = NULL;
963 		m->current_pg = NULL;
964 		m->next_pg = pg;
965 	}
966 	spin_unlock_irqrestore(&m->lock, flags);
967 
968 	queue_work(kmultipathd, &m->trigger_event);
969 	return 0;
970 }
971 
972 /*
973  * Set/clear bypassed status of a PG.
974  * PGs are numbered upwards from 1 in the order they were declared.
975  */
976 static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
977 {
978 	struct priority_group *pg;
979 	unsigned pgnum;
980 
981 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
982 	    (pgnum > m->nr_priority_groups)) {
983 		DMWARN("invalid PG number supplied to bypass_pg");
984 		return -EINVAL;
985 	}
986 
987 	list_for_each_entry(pg, &m->priority_groups, list) {
988 		if (!--pgnum)
989 			break;
990 	}
991 
992 	bypass_pg(m, pg, bypassed);
993 	return 0;
994 }
995 
996 /*
997  * Should we retry pg_init immediately?
998  */
999 static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1000 {
1001 	unsigned long flags;
1002 	int limit_reached = 0;
1003 
1004 	spin_lock_irqsave(&m->lock, flags);
1005 
1006 	if (m->pg_init_count <= m->pg_init_retries)
1007 		m->pg_init_required = 1;
1008 	else
1009 		limit_reached = 1;
1010 
1011 	spin_unlock_irqrestore(&m->lock, flags);
1012 
1013 	return limit_reached;
1014 }
1015 
1016 static void pg_init_done(struct dm_path *path, int errors)
1017 {
1018 	struct pgpath *pgpath = path_to_pgpath(path);
1019 	struct priority_group *pg = pgpath->pg;
1020 	struct multipath *m = pg->m;
1021 	unsigned long flags;
1022 
1023 	/* device or driver problems */
1024 	switch (errors) {
1025 	case SCSI_DH_OK:
1026 		break;
1027 	case SCSI_DH_NOSYS:
1028 		if (!m->hw_handler_name) {
1029 			errors = 0;
1030 			break;
1031 		}
1032 		DMERR("Cannot failover device because scsi_dh_%s was not "
1033 		      "loaded.", m->hw_handler_name);
1034 		/*
1035 		 * Fail path for now, so we do not ping pong
1036 		 */
1037 		fail_path(pgpath);
1038 		break;
1039 	case SCSI_DH_DEV_TEMP_BUSY:
1040 		/*
1041 		 * Probably doing something like FW upgrade on the
1042 		 * controller so try the other pg.
1043 		 */
1044 		bypass_pg(m, pg, 1);
1045 		break;
1046 	/* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1047 	case SCSI_DH_RETRY:
1048 	case SCSI_DH_IMM_RETRY:
1049 	case SCSI_DH_RES_TEMP_UNAVAIL:
1050 		if (pg_init_limit_reached(m, pgpath))
1051 			fail_path(pgpath);
1052 		errors = 0;
1053 		break;
1054 	default:
1055 		/*
1056 		 * We probably do not want to fail the path for a device
1057 		 * error, but this is what the old dm did. In future
1058 		 * patches we can do more advanced handling.
1059 		 */
1060 		fail_path(pgpath);
1061 	}
1062 
1063 	spin_lock_irqsave(&m->lock, flags);
1064 	if (errors) {
1065 		DMERR("Could not failover device. Error %d.", errors);
1066 		m->current_pgpath = NULL;
1067 		m->current_pg = NULL;
1068 	} else if (!m->pg_init_required) {
1069 		m->queue_io = 0;
1070 		pg->bypassed = 0;
1071 	}
1072 
1073 	m->pg_init_in_progress = 0;
1074 	queue_work(kmultipathd, &m->process_queued_ios);
1075 	spin_unlock_irqrestore(&m->lock, flags);
1076 }
1077 
1078 static void activate_path(struct work_struct *work)
1079 {
1080 	int ret;
1081 	struct multipath *m =
1082 		container_of(work, struct multipath, activate_path);
1083 	struct dm_path *path = &m->current_pgpath->path;
1084 
1085 	ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
1086 	pg_init_done(path, ret);
1087 }
1088 
1089 /*
1090  * end_io handling
1091  */
1092 static int do_end_io(struct multipath *m, struct bio *bio,
1093 		     int error, struct dm_mpath_io *mpio)
1094 {
1095 	unsigned long flags;
1096 
1097 	if (!error)
1098 		return 0;	/* I/O complete */
1099 
1100 	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
1101 		return error;
1102 
1103 	if (error == -EOPNOTSUPP)
1104 		return error;
1105 
1106 	spin_lock_irqsave(&m->lock, flags);
1107 	if (!m->nr_valid_paths) {
1108 		if (__must_push_back(m)) {
1109 			spin_unlock_irqrestore(&m->lock, flags);
1110 			return DM_ENDIO_REQUEUE;
1111 		} else if (!m->queue_if_no_path) {
1112 			spin_unlock_irqrestore(&m->lock, flags);
1113 			return -EIO;
1114 		} else {
1115 			spin_unlock_irqrestore(&m->lock, flags);
1116 			goto requeue;
1117 		}
1118 	}
1119 	spin_unlock_irqrestore(&m->lock, flags);
1120 
1121 	if (mpio->pgpath)
1122 		fail_path(mpio->pgpath);
1123 
1124       requeue:
1125 	dm_bio_restore(&mpio->details, bio);
1126 
1127 	/* queue for the daemon to resubmit or fail */
1128 	spin_lock_irqsave(&m->lock, flags);
1129 	bio_list_add(&m->queued_ios, bio);
1130 	m->queue_size++;
1131 	if (!m->queue_io)
1132 		queue_work(kmultipathd, &m->process_queued_ios);
1133 	spin_unlock_irqrestore(&m->lock, flags);
1134 
1135 	return DM_ENDIO_INCOMPLETE;	/* io not complete */
1136 }
1137 
1138 static int multipath_end_io(struct dm_target *ti, struct bio *bio,
1139 			    int error, union map_info *map_context)
1140 {
1141 	struct multipath *m = ti->private;
1142 	struct dm_mpath_io *mpio = map_context->ptr;
1143 	struct pgpath *pgpath = mpio->pgpath;
1144 	struct path_selector *ps;
1145 	int r;
1146 
1147 	r  = do_end_io(m, bio, error, mpio);
1148 	if (pgpath) {
1149 		ps = &pgpath->pg->ps;
1150 		if (ps->type->end_io)
1151 			ps->type->end_io(ps, &pgpath->path);
1152 	}
1153 	if (r != DM_ENDIO_INCOMPLETE)
1154 		mempool_free(mpio, m->mpio_pool);
1155 
1156 	return r;
1157 }
1158 
1159 /*
1160  * Suspend can't complete until all the I/O is processed so if
1161  * the last path fails we must error any remaining I/O.
1162  * Note that if the freeze_bdev fails while suspending, the
1163  * queue_if_no_path state is lost - userspace should reset it.
1164  */
1165 static void multipath_presuspend(struct dm_target *ti)
1166 {
1167 	struct multipath *m = (struct multipath *) ti->private;
1168 
1169 	queue_if_no_path(m, 0, 1);
1170 }
1171 
1172 /*
1173  * Restore the queue_if_no_path setting.
1174  */
1175 static void multipath_resume(struct dm_target *ti)
1176 {
1177 	struct multipath *m = (struct multipath *) ti->private;
1178 	unsigned long flags;
1179 
1180 	spin_lock_irqsave(&m->lock, flags);
1181 	m->queue_if_no_path = m->saved_queue_if_no_path;
1182 	spin_unlock_irqrestore(&m->lock, flags);
1183 }
1184 
1185 /*
1186  * Info output has the following format:
1187  * num_multipath_feature_args [multipath_feature_args]*
1188  * num_handler_status_args [handler_status_args]*
1189  * num_groups init_group_number
1190  *            [A|D|E num_ps_status_args [ps_status_args]*
1191  *             num_paths num_selector_args
1192  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1193  *
1194  * Table output has the following format (identical to the constructor string):
1195  * num_feature_args [features_args]*
1196  * num_handler_args hw_handler [hw_handler_args]*
1197  * num_groups init_group_number
1198  *     [priority selector-name num_ps_args [ps_args]*
1199  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1200  */
1201 static int multipath_status(struct dm_target *ti, status_type_t type,
1202 			    char *result, unsigned int maxlen)
1203 {
1204 	int sz = 0;
1205 	unsigned long flags;
1206 	struct multipath *m = (struct multipath *) ti->private;
1207 	struct priority_group *pg;
1208 	struct pgpath *p;
1209 	unsigned pg_num;
1210 	char state;
1211 
1212 	spin_lock_irqsave(&m->lock, flags);
1213 
1214 	/* Features */
1215 	if (type == STATUSTYPE_INFO)
1216 		DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1217 	else {
1218 		DMEMIT("%u ", m->queue_if_no_path +
1219 			      (m->pg_init_retries > 0) * 2);
1220 		if (m->queue_if_no_path)
1221 			DMEMIT("queue_if_no_path ");
1222 		if (m->pg_init_retries)
1223 			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1224 	}
1225 
1226 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1227 		DMEMIT("0 ");
1228 	else
1229 		DMEMIT("1 %s ", m->hw_handler_name);
1230 
1231 	DMEMIT("%u ", m->nr_priority_groups);
1232 
1233 	if (m->next_pg)
1234 		pg_num = m->next_pg->pg_num;
1235 	else if (m->current_pg)
1236 		pg_num = m->current_pg->pg_num;
1237 	else
1238 			pg_num = 1;
1239 
1240 	DMEMIT("%u ", pg_num);
1241 
1242 	switch (type) {
1243 	case STATUSTYPE_INFO:
1244 		list_for_each_entry(pg, &m->priority_groups, list) {
1245 			if (pg->bypassed)
1246 				state = 'D';	/* Disabled */
1247 			else if (pg == m->current_pg)
1248 				state = 'A';	/* Currently Active */
1249 			else
1250 				state = 'E';	/* Enabled */
1251 
1252 			DMEMIT("%c ", state);
1253 
1254 			if (pg->ps.type->status)
1255 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1256 							  result + sz,
1257 							  maxlen - sz);
1258 			else
1259 				DMEMIT("0 ");
1260 
1261 			DMEMIT("%u %u ", pg->nr_pgpaths,
1262 			       pg->ps.type->info_args);
1263 
1264 			list_for_each_entry(p, &pg->pgpaths, list) {
1265 				DMEMIT("%s %s %u ", p->path.dev->name,
1266 				       p->path.is_active ? "A" : "F",
1267 				       p->fail_count);
1268 				if (pg->ps.type->status)
1269 					sz += pg->ps.type->status(&pg->ps,
1270 					      &p->path, type, result + sz,
1271 					      maxlen - sz);
1272 			}
1273 		}
1274 		break;
1275 
1276 	case STATUSTYPE_TABLE:
1277 		list_for_each_entry(pg, &m->priority_groups, list) {
1278 			DMEMIT("%s ", pg->ps.type->name);
1279 
1280 			if (pg->ps.type->status)
1281 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1282 							  result + sz,
1283 							  maxlen - sz);
1284 			else
1285 				DMEMIT("0 ");
1286 
1287 			DMEMIT("%u %u ", pg->nr_pgpaths,
1288 			       pg->ps.type->table_args);
1289 
1290 			list_for_each_entry(p, &pg->pgpaths, list) {
1291 				DMEMIT("%s ", p->path.dev->name);
1292 				if (pg->ps.type->status)
1293 					sz += pg->ps.type->status(&pg->ps,
1294 					      &p->path, type, result + sz,
1295 					      maxlen - sz);
1296 			}
1297 		}
1298 		break;
1299 	}
1300 
1301 	spin_unlock_irqrestore(&m->lock, flags);
1302 
1303 	return 0;
1304 }
1305 
1306 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1307 {
1308 	int r;
1309 	struct dm_dev *dev;
1310 	struct multipath *m = (struct multipath *) ti->private;
1311 	action_fn action;
1312 
1313 	if (argc == 1) {
1314 		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
1315 			return queue_if_no_path(m, 1, 0);
1316 		else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
1317 			return queue_if_no_path(m, 0, 0);
1318 	}
1319 
1320 	if (argc != 2)
1321 		goto error;
1322 
1323 	if (!strnicmp(argv[0], MESG_STR("disable_group")))
1324 		return bypass_pg_num(m, argv[1], 1);
1325 	else if (!strnicmp(argv[0], MESG_STR("enable_group")))
1326 		return bypass_pg_num(m, argv[1], 0);
1327 	else if (!strnicmp(argv[0], MESG_STR("switch_group")))
1328 		return switch_pg_num(m, argv[1]);
1329 	else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1330 		action = reinstate_path;
1331 	else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1332 		action = fail_path;
1333 	else
1334 		goto error;
1335 
1336 	r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1337 			  dm_table_get_mode(ti->table), &dev);
1338 	if (r) {
1339 		DMWARN("message: error getting device %s",
1340 		       argv[1]);
1341 		return -EINVAL;
1342 	}
1343 
1344 	r = action_dev(m, dev, action);
1345 
1346 	dm_put_device(ti, dev);
1347 
1348 	return r;
1349 
1350 error:
1351 	DMWARN("Unrecognised multipath message received.");
1352 	return -EINVAL;
1353 }
1354 
1355 static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
1356 			   struct file *filp, unsigned int cmd,
1357 			   unsigned long arg)
1358 {
1359 	struct multipath *m = (struct multipath *) ti->private;
1360 	struct block_device *bdev = NULL;
1361 	unsigned long flags;
1362 	struct file fake_file = {};
1363 	struct dentry fake_dentry = {};
1364 	int r = 0;
1365 
1366 	fake_file.f_path.dentry = &fake_dentry;
1367 
1368 	spin_lock_irqsave(&m->lock, flags);
1369 
1370 	if (!m->current_pgpath)
1371 		__choose_pgpath(m);
1372 
1373 	if (m->current_pgpath) {
1374 		bdev = m->current_pgpath->path.dev->bdev;
1375 		fake_dentry.d_inode = bdev->bd_inode;
1376 		fake_file.f_mode = m->current_pgpath->path.dev->mode;
1377 	}
1378 
1379 	if (m->queue_io)
1380 		r = -EAGAIN;
1381 	else if (!bdev)
1382 		r = -EIO;
1383 
1384 	spin_unlock_irqrestore(&m->lock, flags);
1385 
1386 	return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file,
1387 					 bdev->bd_disk, cmd, arg);
1388 }
1389 
1390 /*-----------------------------------------------------------------
1391  * Module setup
1392  *---------------------------------------------------------------*/
1393 static struct target_type multipath_target = {
1394 	.name = "multipath",
1395 	.version = {1, 0, 5},
1396 	.module = THIS_MODULE,
1397 	.ctr = multipath_ctr,
1398 	.dtr = multipath_dtr,
1399 	.map = multipath_map,
1400 	.end_io = multipath_end_io,
1401 	.presuspend = multipath_presuspend,
1402 	.resume = multipath_resume,
1403 	.status = multipath_status,
1404 	.message = multipath_message,
1405 	.ioctl  = multipath_ioctl,
1406 };
1407 
1408 static int __init dm_multipath_init(void)
1409 {
1410 	int r;
1411 
1412 	/* allocate a slab for the dm_ios */
1413 	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1414 	if (!_mpio_cache)
1415 		return -ENOMEM;
1416 
1417 	r = dm_register_target(&multipath_target);
1418 	if (r < 0) {
1419 		DMERR("register failed %d", r);
1420 		kmem_cache_destroy(_mpio_cache);
1421 		return -EINVAL;
1422 	}
1423 
1424 	kmultipathd = create_workqueue("kmpathd");
1425 	if (!kmultipathd) {
1426 		DMERR("failed to create workqueue kmpathd");
1427 		dm_unregister_target(&multipath_target);
1428 		kmem_cache_destroy(_mpio_cache);
1429 		return -ENOMEM;
1430 	}
1431 
1432 	/*
1433 	 * A separate workqueue is used to handle the device handlers
1434 	 * to avoid overloading existing workqueue. Overloading the
1435 	 * old workqueue would also create a bottleneck in the
1436 	 * path of the storage hardware device activation.
1437 	 */
1438 	kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd");
1439 	if (!kmpath_handlerd) {
1440 		DMERR("failed to create workqueue kmpath_handlerd");
1441 		destroy_workqueue(kmultipathd);
1442 		dm_unregister_target(&multipath_target);
1443 		kmem_cache_destroy(_mpio_cache);
1444 		return -ENOMEM;
1445 	}
1446 
1447 	DMINFO("version %u.%u.%u loaded",
1448 	       multipath_target.version[0], multipath_target.version[1],
1449 	       multipath_target.version[2]);
1450 
1451 	return r;
1452 }
1453 
1454 static void __exit dm_multipath_exit(void)
1455 {
1456 	int r;
1457 
1458 	destroy_workqueue(kmpath_handlerd);
1459 	destroy_workqueue(kmultipathd);
1460 
1461 	r = dm_unregister_target(&multipath_target);
1462 	if (r < 0)
1463 		DMERR("target unregister failed %d", r);
1464 	kmem_cache_destroy(_mpio_cache);
1465 }
1466 
1467 module_init(dm_multipath_init);
1468 module_exit(dm_multipath_exit);
1469 
1470 MODULE_DESCRIPTION(DM_NAME " multipath target");
1471 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1472 MODULE_LICENSE("GPL");
1473