xref: /openbmc/linux/drivers/md/dm-mpath.c (revision cfae5c9bb66325cd32d5f2ee41f14749f062a53c)
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm.h"
9 #include "dm-path-selector.h"
10 #include "dm-hw-handler.h"
11 #include "dm-bio-list.h"
12 #include "dm-bio-record.h"
13 #include "dm-uevent.h"
14 
15 #include <linux/ctype.h>
16 #include <linux/init.h>
17 #include <linux/mempool.h>
18 #include <linux/module.h>
19 #include <linux/pagemap.h>
20 #include <linux/slab.h>
21 #include <linux/time.h>
22 #include <linux/workqueue.h>
23 #include <scsi/scsi_dh.h>
24 #include <asm/atomic.h>
25 
/* Message prefix; presumably picked up by the DMWARN/DMERR logging macros. */
#define DM_MSG_PREFIX "multipath"

/*
 * Expand a string literal into "string, size" (size includes the
 * terminating NUL), the argument pair passed to strnicmp() below.
 */
#define MESG_STR(x) (x), sizeof(x)
28 
29 /* Path properties */
30 struct pgpath {
31 	struct list_head list;
32 
33 	struct priority_group *pg;	/* Owning PG */
34 	unsigned fail_count;		/* Cumulative failure count */
35 
36 	struct dm_path path;
37 };
38 
/* Recover the enclosing pgpath from a pointer to its embedded dm_path. */
#define path_to_pgpath(__pgp) \
	container_of((__pgp), struct pgpath, path)
40 
41 /*
42  * Paths are grouped into Priority Groups and numbered from 1 upwards.
43  * Each has a path selector which controls which path gets used.
44  */
45 struct priority_group {
46 	struct list_head list;
47 
48 	struct multipath *m;		/* Owning multipath instance */
49 	struct path_selector ps;
50 
51 	unsigned pg_num;		/* Reference number */
52 	unsigned bypassed;		/* Temporarily bypass this PG? */
53 
54 	unsigned nr_pgpaths;		/* Number of paths in PG */
55 	struct list_head pgpaths;
56 };
57 
58 /* Multipath context */
59 struct multipath {
60 	struct list_head list;
61 	struct dm_target *ti;
62 
63 	spinlock_t lock;
64 
65 	const char *hw_handler_name;
66 	unsigned nr_priority_groups;
67 	struct list_head priority_groups;
68 	unsigned pg_init_required;	/* pg_init needs calling? */
69 	unsigned pg_init_in_progress;	/* Only one pg_init allowed at once */
70 
71 	unsigned nr_valid_paths;	/* Total number of usable paths */
72 	struct pgpath *current_pgpath;
73 	struct priority_group *current_pg;
74 	struct priority_group *next_pg;	/* Switch to this PG if set */
75 	unsigned repeat_count;		/* I/Os left before calling PS again */
76 
77 	unsigned queue_io;		/* Must we queue all I/O? */
78 	unsigned queue_if_no_path;	/* Queue I/O if last path fails? */
79 	unsigned saved_queue_if_no_path;/* Saved state during suspension */
80 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
81 	unsigned pg_init_count;		/* Number of times pg_init called */
82 
83 	struct work_struct process_queued_ios;
84 	struct bio_list queued_ios;
85 	unsigned queue_size;
86 
87 	struct work_struct trigger_event;
88 
89 	/*
90 	 * We must use a mempool of dm_mpath_io structs so that we
91 	 * can resubmit bios on error.
92 	 */
93 	mempool_t *mpio_pool;
94 };
95 
96 /*
97  * Context information attached to each bio we process.
98  */
99 struct dm_mpath_io {
100 	struct pgpath *pgpath;
101 	struct dm_bio_details details;
102 };
103 
/* Per-path operation applied by action_dev(), e.g. fail or reinstate. */
typedef int (*action_fn) (struct pgpath *pgpath);

#define MIN_IOS 256	/* Mempool size */

/* Slab cache backing each multipath's mpio_pool */
static struct kmem_cache *_mpio_cache;

/* Workqueue on which process_queued_ios and trigger_event work is queued */
static struct workqueue_struct *kmultipathd;

static void trigger_event(struct work_struct *work);
static void process_queued_ios(struct work_struct *work);
static void pg_init_done(struct dm_path *, int);
114 
115 
116 /*-----------------------------------------------
117  * Allocation routines
118  *-----------------------------------------------*/
119 
120 static struct pgpath *alloc_pgpath(void)
121 {
122 	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
123 
124 	if (pgpath)
125 		pgpath->path.is_active = 1;
126 
127 	return pgpath;
128 }
129 
/* Release the memory of a pgpath; the device reference is not touched. */
static void free_pgpath(struct pgpath *pgpath)
{
	kfree(pgpath);
}
134 
135 static struct priority_group *alloc_priority_group(void)
136 {
137 	struct priority_group *pg;
138 
139 	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
140 
141 	if (pg)
142 		INIT_LIST_HEAD(&pg->pgpaths);
143 
144 	return pg;
145 }
146 
147 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
148 {
149 	struct pgpath *pgpath, *tmp;
150 
151 	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
152 		list_del(&pgpath->list);
153 		dm_put_device(ti, pgpath->path.dev);
154 		free_pgpath(pgpath);
155 	}
156 }
157 
158 static void free_priority_group(struct priority_group *pg,
159 				struct dm_target *ti)
160 {
161 	struct path_selector *ps = &pg->ps;
162 
163 	if (ps->type) {
164 		ps->type->destroy(ps);
165 		dm_put_path_selector(ps->type);
166 	}
167 
168 	free_pgpaths(&pg->pgpaths, ti);
169 	kfree(pg);
170 }
171 
172 static struct multipath *alloc_multipath(struct dm_target *ti)
173 {
174 	struct multipath *m;
175 
176 	m = kzalloc(sizeof(*m), GFP_KERNEL);
177 	if (m) {
178 		INIT_LIST_HEAD(&m->priority_groups);
179 		spin_lock_init(&m->lock);
180 		m->queue_io = 1;
181 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
182 		INIT_WORK(&m->trigger_event, trigger_event);
183 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
184 		if (!m->mpio_pool) {
185 			kfree(m);
186 			return NULL;
187 		}
188 		m->ti = ti;
189 		ti->private = m;
190 	}
191 
192 	return m;
193 }
194 
195 static void free_multipath(struct multipath *m)
196 {
197 	struct priority_group *pg, *tmp;
198 
199 	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
200 		list_del(&pg->list);
201 		free_priority_group(pg, m->ti);
202 	}
203 
204 	kfree(m->hw_handler_name);
205 	mempool_destroy(m->mpio_pool);
206 	kfree(m);
207 }
208 
209 
210 /*-----------------------------------------------
211  * Path selection
212  *-----------------------------------------------*/
213 
214 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
215 {
216 	m->current_pg = pgpath->pg;
217 
218 	/* Must we initialise the PG first, and queue I/O till it's ready? */
219 	if (m->hw_handler_name) {
220 		m->pg_init_required = 1;
221 		m->queue_io = 1;
222 	} else {
223 		m->pg_init_required = 0;
224 		m->queue_io = 0;
225 	}
226 
227 	m->pg_init_count = 0;
228 }
229 
230 static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
231 {
232 	struct dm_path *path;
233 
234 	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
235 	if (!path)
236 		return -ENXIO;
237 
238 	m->current_pgpath = path_to_pgpath(path);
239 
240 	if (m->current_pg != pg)
241 		__switch_pg(m, m->current_pgpath);
242 
243 	return 0;
244 }
245 
246 static void __choose_pgpath(struct multipath *m)
247 {
248 	struct priority_group *pg;
249 	unsigned bypassed = 1;
250 
251 	if (!m->nr_valid_paths)
252 		goto failed;
253 
254 	/* Were we instructed to switch PG? */
255 	if (m->next_pg) {
256 		pg = m->next_pg;
257 		m->next_pg = NULL;
258 		if (!__choose_path_in_pg(m, pg))
259 			return;
260 	}
261 
262 	/* Don't change PG until it has no remaining paths */
263 	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
264 		return;
265 
266 	/*
267 	 * Loop through priority groups until we find a valid path.
268 	 * First time we skip PGs marked 'bypassed'.
269 	 * Second time we only try the ones we skipped.
270 	 */
271 	do {
272 		list_for_each_entry(pg, &m->priority_groups, list) {
273 			if (pg->bypassed == bypassed)
274 				continue;
275 			if (!__choose_path_in_pg(m, pg))
276 				return;
277 		}
278 	} while (bypassed--);
279 
280 failed:
281 	m->current_pgpath = NULL;
282 	m->current_pg = NULL;
283 }
284 
285 /*
286  * Check whether bios must be queued in the device-mapper core rather
287  * than here in the target.
288  *
289  * m->lock must be held on entry.
290  *
291  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
292  * same value then we are not between multipath_presuspend()
293  * and multipath_resume() calls and we have no need to check
294  * for the DMF_NOFLUSH_SUSPENDING flag.
295  */
296 static int __must_push_back(struct multipath *m)
297 {
298 	return (m->queue_if_no_path != m->saved_queue_if_no_path &&
299 		dm_noflush_suspending(m->ti));
300 }
301 
302 static int map_io(struct multipath *m, struct bio *bio,
303 		  struct dm_mpath_io *mpio, unsigned was_queued)
304 {
305 	int r = DM_MAPIO_REMAPPED;
306 	unsigned long flags;
307 	struct pgpath *pgpath;
308 
309 	spin_lock_irqsave(&m->lock, flags);
310 
311 	/* Do we need to select a new pgpath? */
312 	if (!m->current_pgpath ||
313 	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
314 		__choose_pgpath(m);
315 
316 	pgpath = m->current_pgpath;
317 
318 	if (was_queued)
319 		m->queue_size--;
320 
321 	if ((pgpath && m->queue_io) ||
322 	    (!pgpath && m->queue_if_no_path)) {
323 		/* Queue for the daemon to resubmit */
324 		bio_list_add(&m->queued_ios, bio);
325 		m->queue_size++;
326 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
327 		    !m->queue_io)
328 			queue_work(kmultipathd, &m->process_queued_ios);
329 		pgpath = NULL;
330 		r = DM_MAPIO_SUBMITTED;
331 	} else if (pgpath)
332 		bio->bi_bdev = pgpath->path.dev->bdev;
333 	else if (__must_push_back(m))
334 		r = DM_MAPIO_REQUEUE;
335 	else
336 		r = -EIO;	/* Failed */
337 
338 	mpio->pgpath = pgpath;
339 
340 	spin_unlock_irqrestore(&m->lock, flags);
341 
342 	return r;
343 }
344 
345 /*
346  * If we run out of usable paths, should we queue I/O or error it?
347  */
348 static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
349 			    unsigned save_old_value)
350 {
351 	unsigned long flags;
352 
353 	spin_lock_irqsave(&m->lock, flags);
354 
355 	if (save_old_value)
356 		m->saved_queue_if_no_path = m->queue_if_no_path;
357 	else
358 		m->saved_queue_if_no_path = queue_if_no_path;
359 	m->queue_if_no_path = queue_if_no_path;
360 	if (!m->queue_if_no_path && m->queue_size)
361 		queue_work(kmultipathd, &m->process_queued_ios);
362 
363 	spin_unlock_irqrestore(&m->lock, flags);
364 
365 	return 0;
366 }
367 
368 /*-----------------------------------------------------------------
369  * The multipath daemon is responsible for resubmitting queued ios.
370  *---------------------------------------------------------------*/
371 
372 static void dispatch_queued_ios(struct multipath *m)
373 {
374 	int r;
375 	unsigned long flags;
376 	struct bio *bio = NULL, *next;
377 	struct dm_mpath_io *mpio;
378 	union map_info *info;
379 
380 	spin_lock_irqsave(&m->lock, flags);
381 	bio = bio_list_get(&m->queued_ios);
382 	spin_unlock_irqrestore(&m->lock, flags);
383 
384 	while (bio) {
385 		next = bio->bi_next;
386 		bio->bi_next = NULL;
387 
388 		info = dm_get_mapinfo(bio);
389 		mpio = info->ptr;
390 
391 		r = map_io(m, bio, mpio, 1);
392 		if (r < 0)
393 			bio_endio(bio, r);
394 		else if (r == DM_MAPIO_REMAPPED)
395 			generic_make_request(bio);
396 		else if (r == DM_MAPIO_REQUEUE)
397 			bio_endio(bio, -EIO);
398 
399 		bio = next;
400 	}
401 }
402 
403 static void process_queued_ios(struct work_struct *work)
404 {
405 	struct multipath *m =
406 		container_of(work, struct multipath, process_queued_ios);
407 	struct pgpath *pgpath = NULL;
408 	unsigned init_required = 0, must_queue = 1;
409 	unsigned long flags;
410 
411 	spin_lock_irqsave(&m->lock, flags);
412 
413 	if (!m->queue_size)
414 		goto out;
415 
416 	if (!m->current_pgpath)
417 		__choose_pgpath(m);
418 
419 	pgpath = m->current_pgpath;
420 
421 	if ((pgpath && !m->queue_io) ||
422 	    (!pgpath && !m->queue_if_no_path))
423 		must_queue = 0;
424 
425 	if (m->pg_init_required && !m->pg_init_in_progress) {
426 		m->pg_init_count++;
427 		m->pg_init_required = 0;
428 		m->pg_init_in_progress = 1;
429 		init_required = 1;
430 	}
431 
432 out:
433 	spin_unlock_irqrestore(&m->lock, flags);
434 
435 	if (init_required) {
436 		struct dm_path *path = &pgpath->path;
437 		int ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
438 		pg_init_done(path, ret);
439 	}
440 
441 	if (!must_queue)
442 		dispatch_queued_ios(m);
443 }
444 
445 /*
446  * An event is triggered whenever a path is taken out of use.
447  * Includes path failure and PG bypass.
448  */
449 static void trigger_event(struct work_struct *work)
450 {
451 	struct multipath *m =
452 		container_of(work, struct multipath, trigger_event);
453 
454 	dm_table_event(m->ti->table);
455 }
456 
457 /*-----------------------------------------------------------------
458  * Constructor/argument parsing:
459  * <#multipath feature args> [<arg>]*
460  * <#hw_handler args> [hw_handler [<arg>]*]
461  * <#priority groups>
462  * <initial priority group>
463  *     [<selector> <#selector args> [<arg>]*
464  *      <#paths> <#per-path selector args>
465  *         [<path> [<arg>]* ]+ ]+
466  *---------------------------------------------------------------*/
/* Allowed range for one numeric argument, plus the message used on error */
struct param {
	unsigned min;
	unsigned max;
	char *error;
};

/*
 * Parse @str as an unsigned decimal into *@v and check it against
 * [param->min, param->max]. Returns 0 on success. On a missing string,
 * a parse failure or an out-of-range value, *@error is set to
 * param->error and -EINVAL is returned.
 */
static int read_param(struct param *param, char *str, unsigned *v, char **error)
{
	int ok = 0;

	if (str && sscanf(str, "%u", v) == 1)
		ok = (*v >= param->min) && (*v <= param->max);

	if (ok)
		return 0;

	*error = param->error;
	return -EINVAL;
}
485 
/* Cursor over the not-yet-consumed constructor arguments */
struct arg_set {
	unsigned argc;
	char **argv;
};

/*
 * Pop and return the next argument, advancing the cursor.
 * Returns NULL (and leaves the cursor alone) when none are left.
 */
static char *shift(struct arg_set *as)
{
	if (!as->argc)
		return NULL;

	as->argc--;
	return *as->argv++;
}
504 
505 static void consume(struct arg_set *as, unsigned n)
506 {
507 	BUG_ON (as->argc < n);
508 	as->argc -= n;
509 	as->argv += n;
510 }
511 
512 static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
513 			       struct dm_target *ti)
514 {
515 	int r;
516 	struct path_selector_type *pst;
517 	unsigned ps_argc;
518 
519 	static struct param _params[] = {
520 		{0, 1024, "invalid number of path selector args"},
521 	};
522 
523 	pst = dm_get_path_selector(shift(as));
524 	if (!pst) {
525 		ti->error = "unknown path selector type";
526 		return -EINVAL;
527 	}
528 
529 	r = read_param(_params, shift(as), &ps_argc, &ti->error);
530 	if (r)
531 		return -EINVAL;
532 
533 	r = pst->create(&pg->ps, ps_argc, as->argv);
534 	if (r) {
535 		dm_put_path_selector(pst);
536 		ti->error = "path selector constructor failed";
537 		return r;
538 	}
539 
540 	pg->ps.type = pst;
541 	consume(as, ps_argc);
542 
543 	return 0;
544 }
545 
546 static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
547 			       struct dm_target *ti)
548 {
549 	int r;
550 	struct pgpath *p;
551 
552 	/* we need at least a path arg */
553 	if (as->argc < 1) {
554 		ti->error = "no device given";
555 		return NULL;
556 	}
557 
558 	p = alloc_pgpath();
559 	if (!p)
560 		return NULL;
561 
562 	r = dm_get_device(ti, shift(as), ti->begin, ti->len,
563 			  dm_table_get_mode(ti->table), &p->path.dev);
564 	if (r) {
565 		ti->error = "error getting device";
566 		goto bad;
567 	}
568 
569 	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
570 	if (r) {
571 		dm_put_device(ti, p->path.dev);
572 		goto bad;
573 	}
574 
575 	return p;
576 
577  bad:
578 	free_pgpath(p);
579 	return NULL;
580 }
581 
582 static struct priority_group *parse_priority_group(struct arg_set *as,
583 						   struct multipath *m)
584 {
585 	static struct param _params[] = {
586 		{1, 1024, "invalid number of paths"},
587 		{0, 1024, "invalid number of selector args"}
588 	};
589 
590 	int r;
591 	unsigned i, nr_selector_args, nr_params;
592 	struct priority_group *pg;
593 	struct dm_target *ti = m->ti;
594 
595 	if (as->argc < 2) {
596 		as->argc = 0;
597 		ti->error = "not enough priority group aruments";
598 		return NULL;
599 	}
600 
601 	pg = alloc_priority_group();
602 	if (!pg) {
603 		ti->error = "couldn't allocate priority group";
604 		return NULL;
605 	}
606 	pg->m = m;
607 
608 	r = parse_path_selector(as, pg, ti);
609 	if (r)
610 		goto bad;
611 
612 	/*
613 	 * read the paths
614 	 */
615 	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
616 	if (r)
617 		goto bad;
618 
619 	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
620 	if (r)
621 		goto bad;
622 
623 	nr_params = 1 + nr_selector_args;
624 	for (i = 0; i < pg->nr_pgpaths; i++) {
625 		struct pgpath *pgpath;
626 		struct arg_set path_args;
627 
628 		if (as->argc < nr_params)
629 			goto bad;
630 
631 		path_args.argc = nr_params;
632 		path_args.argv = as->argv;
633 
634 		pgpath = parse_path(&path_args, &pg->ps, ti);
635 		if (!pgpath)
636 			goto bad;
637 
638 		pgpath->pg = pg;
639 		list_add_tail(&pgpath->list, &pg->pgpaths);
640 		consume(as, nr_params);
641 	}
642 
643 	return pg;
644 
645  bad:
646 	free_priority_group(pg, ti);
647 	return NULL;
648 }
649 
650 static int parse_hw_handler(struct arg_set *as, struct multipath *m)
651 {
652 	unsigned hw_argc;
653 	struct dm_target *ti = m->ti;
654 
655 	static struct param _params[] = {
656 		{0, 1024, "invalid number of hardware handler args"},
657 	};
658 
659 	if (read_param(_params, shift(as), &hw_argc, &ti->error))
660 		return -EINVAL;
661 
662 	if (!hw_argc)
663 		return 0;
664 
665 	m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
666 	request_module("scsi_dh_%s", m->hw_handler_name);
667 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
668 		ti->error = "unknown hardware handler type";
669 		return -EINVAL;
670 	}
671 	consume(as, hw_argc - 1);
672 
673 	return 0;
674 }
675 
676 static int parse_features(struct arg_set *as, struct multipath *m)
677 {
678 	int r;
679 	unsigned argc;
680 	struct dm_target *ti = m->ti;
681 	const char *param_name;
682 
683 	static struct param _params[] = {
684 		{0, 3, "invalid number of feature args"},
685 		{1, 50, "pg_init_retries must be between 1 and 50"},
686 	};
687 
688 	r = read_param(_params, shift(as), &argc, &ti->error);
689 	if (r)
690 		return -EINVAL;
691 
692 	if (!argc)
693 		return 0;
694 
695 	do {
696 		param_name = shift(as);
697 		argc--;
698 
699 		if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
700 			r = queue_if_no_path(m, 1, 0);
701 			continue;
702 		}
703 
704 		if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
705 		    (argc >= 1)) {
706 			r = read_param(_params + 1, shift(as),
707 				       &m->pg_init_retries, &ti->error);
708 			argc--;
709 			continue;
710 		}
711 
712 		ti->error = "Unrecognised multipath feature request";
713 		r = -EINVAL;
714 	} while (argc && !r);
715 
716 	return r;
717 }
718 
719 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
720 			 char **argv)
721 {
722 	/* target parameters */
723 	static struct param _params[] = {
724 		{1, 1024, "invalid number of priority groups"},
725 		{1, 1024, "invalid initial priority group number"},
726 	};
727 
728 	int r;
729 	struct multipath *m;
730 	struct arg_set as;
731 	unsigned pg_count = 0;
732 	unsigned next_pg_num;
733 
734 	as.argc = argc;
735 	as.argv = argv;
736 
737 	m = alloc_multipath(ti);
738 	if (!m) {
739 		ti->error = "can't allocate multipath";
740 		return -EINVAL;
741 	}
742 
743 	r = parse_features(&as, m);
744 	if (r)
745 		goto bad;
746 
747 	r = parse_hw_handler(&as, m);
748 	if (r)
749 		goto bad;
750 
751 	r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
752 	if (r)
753 		goto bad;
754 
755 	r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
756 	if (r)
757 		goto bad;
758 
759 	/* parse the priority groups */
760 	while (as.argc) {
761 		struct priority_group *pg;
762 
763 		pg = parse_priority_group(&as, m);
764 		if (!pg) {
765 			r = -EINVAL;
766 			goto bad;
767 		}
768 
769 		m->nr_valid_paths += pg->nr_pgpaths;
770 		list_add_tail(&pg->list, &m->priority_groups);
771 		pg_count++;
772 		pg->pg_num = pg_count;
773 		if (!--next_pg_num)
774 			m->next_pg = pg;
775 	}
776 
777 	if (pg_count != m->nr_priority_groups) {
778 		ti->error = "priority group count mismatch";
779 		r = -EINVAL;
780 		goto bad;
781 	}
782 
783 	return 0;
784 
785  bad:
786 	free_multipath(m);
787 	return r;
788 }
789 
790 static void multipath_dtr(struct dm_target *ti)
791 {
792 	struct multipath *m = (struct multipath *) ti->private;
793 
794 	flush_workqueue(kmultipathd);
795 	free_multipath(m);
796 }
797 
798 /*
799  * Map bios, recording original fields for later in case we have to resubmit
800  */
801 static int multipath_map(struct dm_target *ti, struct bio *bio,
802 			 union map_info *map_context)
803 {
804 	int r;
805 	struct dm_mpath_io *mpio;
806 	struct multipath *m = (struct multipath *) ti->private;
807 
808 	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
809 	dm_bio_record(&mpio->details, bio);
810 
811 	map_context->ptr = mpio;
812 	bio->bi_rw |= (1 << BIO_RW_FAILFAST);
813 	r = map_io(m, bio, mpio, 0);
814 	if (r < 0 || r == DM_MAPIO_REQUEUE)
815 		mempool_free(mpio, m->mpio_pool);
816 
817 	return r;
818 }
819 
820 /*
821  * Take a path out of use.
822  */
823 static int fail_path(struct pgpath *pgpath)
824 {
825 	unsigned long flags;
826 	struct multipath *m = pgpath->pg->m;
827 
828 	spin_lock_irqsave(&m->lock, flags);
829 
830 	if (!pgpath->path.is_active)
831 		goto out;
832 
833 	DMWARN("Failing path %s.", pgpath->path.dev->name);
834 
835 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
836 	pgpath->path.is_active = 0;
837 	pgpath->fail_count++;
838 
839 	m->nr_valid_paths--;
840 
841 	if (pgpath == m->current_pgpath)
842 		m->current_pgpath = NULL;
843 
844 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
845 		      pgpath->path.dev->name, m->nr_valid_paths);
846 
847 	queue_work(kmultipathd, &m->trigger_event);
848 
849 out:
850 	spin_unlock_irqrestore(&m->lock, flags);
851 
852 	return 0;
853 }
854 
855 /*
856  * Reinstate a previously-failed path
857  */
858 static int reinstate_path(struct pgpath *pgpath)
859 {
860 	int r = 0;
861 	unsigned long flags;
862 	struct multipath *m = pgpath->pg->m;
863 
864 	spin_lock_irqsave(&m->lock, flags);
865 
866 	if (pgpath->path.is_active)
867 		goto out;
868 
869 	if (!pgpath->pg->ps.type) {
870 		DMWARN("Reinstate path not supported by path selector %s",
871 		       pgpath->pg->ps.type->name);
872 		r = -EINVAL;
873 		goto out;
874 	}
875 
876 	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
877 	if (r)
878 		goto out;
879 
880 	pgpath->path.is_active = 1;
881 
882 	m->current_pgpath = NULL;
883 	if (!m->nr_valid_paths++ && m->queue_size)
884 		queue_work(kmultipathd, &m->process_queued_ios);
885 
886 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
887 		      pgpath->path.dev->name, m->nr_valid_paths);
888 
889 	queue_work(kmultipathd, &m->trigger_event);
890 
891 out:
892 	spin_unlock_irqrestore(&m->lock, flags);
893 
894 	return r;
895 }
896 
897 /*
898  * Fail or reinstate all paths that match the provided struct dm_dev.
899  */
900 static int action_dev(struct multipath *m, struct dm_dev *dev,
901 		      action_fn action)
902 {
903 	int r = 0;
904 	struct pgpath *pgpath;
905 	struct priority_group *pg;
906 
907 	list_for_each_entry(pg, &m->priority_groups, list) {
908 		list_for_each_entry(pgpath, &pg->pgpaths, list) {
909 			if (pgpath->path.dev == dev)
910 				r = action(pgpath);
911 		}
912 	}
913 
914 	return r;
915 }
916 
917 /*
918  * Temporarily try to avoid having to use the specified PG
919  */
920 static void bypass_pg(struct multipath *m, struct priority_group *pg,
921 		      int bypassed)
922 {
923 	unsigned long flags;
924 
925 	spin_lock_irqsave(&m->lock, flags);
926 
927 	pg->bypassed = bypassed;
928 	m->current_pgpath = NULL;
929 	m->current_pg = NULL;
930 
931 	spin_unlock_irqrestore(&m->lock, flags);
932 
933 	queue_work(kmultipathd, &m->trigger_event);
934 }
935 
936 /*
937  * Switch to using the specified PG from the next I/O that gets mapped
938  */
939 static int switch_pg_num(struct multipath *m, const char *pgstr)
940 {
941 	struct priority_group *pg;
942 	unsigned pgnum;
943 	unsigned long flags;
944 
945 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
946 	    (pgnum > m->nr_priority_groups)) {
947 		DMWARN("invalid PG number supplied to switch_pg_num");
948 		return -EINVAL;
949 	}
950 
951 	spin_lock_irqsave(&m->lock, flags);
952 	list_for_each_entry(pg, &m->priority_groups, list) {
953 		pg->bypassed = 0;
954 		if (--pgnum)
955 			continue;
956 
957 		m->current_pgpath = NULL;
958 		m->current_pg = NULL;
959 		m->next_pg = pg;
960 	}
961 	spin_unlock_irqrestore(&m->lock, flags);
962 
963 	queue_work(kmultipathd, &m->trigger_event);
964 	return 0;
965 }
966 
967 /*
968  * Set/clear bypassed status of a PG.
969  * PGs are numbered upwards from 1 in the order they were declared.
970  */
971 static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
972 {
973 	struct priority_group *pg;
974 	unsigned pgnum;
975 
976 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
977 	    (pgnum > m->nr_priority_groups)) {
978 		DMWARN("invalid PG number supplied to bypass_pg");
979 		return -EINVAL;
980 	}
981 
982 	list_for_each_entry(pg, &m->priority_groups, list) {
983 		if (!--pgnum)
984 			break;
985 	}
986 
987 	bypass_pg(m, pg, bypassed);
988 	return 0;
989 }
990 
991 /*
992  * Should we retry pg_init immediately?
993  */
994 static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
995 {
996 	unsigned long flags;
997 	int limit_reached = 0;
998 
999 	spin_lock_irqsave(&m->lock, flags);
1000 
1001 	if (m->pg_init_count <= m->pg_init_retries)
1002 		m->pg_init_required = 1;
1003 	else
1004 		limit_reached = 1;
1005 
1006 	spin_unlock_irqrestore(&m->lock, flags);
1007 
1008 	return limit_reached;
1009 }
1010 
1011 /*
1012  * pg_init must call this when it has completed its initialisation
1013  */
1014 void dm_pg_init_complete(struct dm_path *path, unsigned err_flags)
1015 {
1016 	struct pgpath *pgpath = path_to_pgpath(path);
1017 	struct priority_group *pg = pgpath->pg;
1018 	struct multipath *m = pg->m;
1019 	unsigned long flags;
1020 
1021 	/*
1022 	 * If requested, retry pg_init until maximum number of retries exceeded.
1023 	 * If retry not requested and PG already bypassed, always fail the path.
1024 	 */
1025 	if (err_flags & MP_RETRY) {
1026 		if (pg_init_limit_reached(m, pgpath))
1027 			err_flags |= MP_FAIL_PATH;
1028 	} else if (err_flags && pg->bypassed)
1029 		err_flags |= MP_FAIL_PATH;
1030 
1031 	if (err_flags & MP_FAIL_PATH)
1032 		fail_path(pgpath);
1033 
1034 	if (err_flags & MP_BYPASS_PG)
1035 		bypass_pg(m, pg, 1);
1036 
1037 	spin_lock_irqsave(&m->lock, flags);
1038 	if (err_flags & ~MP_RETRY) {
1039 		m->current_pgpath = NULL;
1040 		m->current_pg = NULL;
1041 	} else if (!m->pg_init_required)
1042 		m->queue_io = 0;
1043 
1044 	m->pg_init_in_progress = 0;
1045 	queue_work(kmultipathd, &m->process_queued_ios);
1046 	spin_unlock_irqrestore(&m->lock, flags);
1047 }
1048 
1049 static void pg_init_done(struct dm_path *path, int errors)
1050 {
1051 	struct pgpath *pgpath = path_to_pgpath(path);
1052 	struct priority_group *pg = pgpath->pg;
1053 	struct multipath *m = pg->m;
1054 	unsigned long flags;
1055 
1056 	/* device or driver problems */
1057 	switch (errors) {
1058 	case SCSI_DH_OK:
1059 		break;
1060 	case SCSI_DH_NOSYS:
1061 		if (!m->hw_handler_name) {
1062 			errors = 0;
1063 			break;
1064 		}
1065 		DMERR("Cannot failover device because scsi_dh_%s was not "
1066 		      "loaded.", m->hw_handler_name);
1067 		/*
1068 		 * Fail path for now, so we do not ping pong
1069 		 */
1070 		fail_path(pgpath);
1071 		break;
1072 	case SCSI_DH_DEV_TEMP_BUSY:
1073 		/*
1074 		 * Probably doing something like FW upgrade on the
1075 		 * controller so try the other pg.
1076 		 */
1077 		bypass_pg(m, pg, 1);
1078 		break;
1079 	/* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1080 	case SCSI_DH_RETRY:
1081 	case SCSI_DH_IMM_RETRY:
1082 	case SCSI_DH_RES_TEMP_UNAVAIL:
1083 		if (pg_init_limit_reached(m, pgpath))
1084 			fail_path(pgpath);
1085 		errors = 0;
1086 		break;
1087 	default:
1088 		/*
1089 		 * We probably do not want to fail the path for a device
1090 		 * error, but this is what the old dm did. In future
1091 		 * patches we can do more advanced handling.
1092 		 */
1093 		fail_path(pgpath);
1094 	}
1095 
1096 	spin_lock_irqsave(&m->lock, flags);
1097 	if (errors) {
1098 		DMERR("Could not failover device. Error %d.", errors);
1099 		m->current_pgpath = NULL;
1100 		m->current_pg = NULL;
1101 	} else if (!m->pg_init_required) {
1102 		m->queue_io = 0;
1103 		pg->bypassed = 0;
1104 	}
1105 
1106 	m->pg_init_in_progress = 0;
1107 	queue_work(kmultipathd, &m->process_queued_ios);
1108 	spin_unlock_irqrestore(&m->lock, flags);
1109 }
1110 
1111 /*
1112  * end_io handling
1113  */
1114 static int do_end_io(struct multipath *m, struct bio *bio,
1115 		     int error, struct dm_mpath_io *mpio)
1116 {
1117 	unsigned long flags;
1118 
1119 	if (!error)
1120 		return 0;	/* I/O complete */
1121 
1122 	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
1123 		return error;
1124 
1125 	if (error == -EOPNOTSUPP)
1126 		return error;
1127 
1128 	spin_lock_irqsave(&m->lock, flags);
1129 	if (!m->nr_valid_paths) {
1130 		if (__must_push_back(m)) {
1131 			spin_unlock_irqrestore(&m->lock, flags);
1132 			return DM_ENDIO_REQUEUE;
1133 		} else if (!m->queue_if_no_path) {
1134 			spin_unlock_irqrestore(&m->lock, flags);
1135 			return -EIO;
1136 		} else {
1137 			spin_unlock_irqrestore(&m->lock, flags);
1138 			goto requeue;
1139 		}
1140 	}
1141 	spin_unlock_irqrestore(&m->lock, flags);
1142 
1143 	if (mpio->pgpath)
1144 		fail_path(mpio->pgpath);
1145 
1146       requeue:
1147 	dm_bio_restore(&mpio->details, bio);
1148 
1149 	/* queue for the daemon to resubmit or fail */
1150 	spin_lock_irqsave(&m->lock, flags);
1151 	bio_list_add(&m->queued_ios, bio);
1152 	m->queue_size++;
1153 	if (!m->queue_io)
1154 		queue_work(kmultipathd, &m->process_queued_ios);
1155 	spin_unlock_irqrestore(&m->lock, flags);
1156 
1157 	return DM_ENDIO_INCOMPLETE;	/* io not complete */
1158 }
1159 
1160 static int multipath_end_io(struct dm_target *ti, struct bio *bio,
1161 			    int error, union map_info *map_context)
1162 {
1163 	struct multipath *m = ti->private;
1164 	struct dm_mpath_io *mpio = map_context->ptr;
1165 	struct pgpath *pgpath = mpio->pgpath;
1166 	struct path_selector *ps;
1167 	int r;
1168 
1169 	r  = do_end_io(m, bio, error, mpio);
1170 	if (pgpath) {
1171 		ps = &pgpath->pg->ps;
1172 		if (ps->type->end_io)
1173 			ps->type->end_io(ps, &pgpath->path);
1174 	}
1175 	if (r != DM_ENDIO_INCOMPLETE)
1176 		mempool_free(mpio, m->mpio_pool);
1177 
1178 	return r;
1179 }
1180 
1181 /*
1182  * Suspend can't complete until all the I/O is processed so if
1183  * the last path fails we must error any remaining I/O.
1184  * Note that if the freeze_bdev fails while suspending, the
1185  * queue_if_no_path state is lost - userspace should reset it.
1186  */
1187 static void multipath_presuspend(struct dm_target *ti)
1188 {
1189 	struct multipath *m = (struct multipath *) ti->private;
1190 
1191 	queue_if_no_path(m, 0, 1);
1192 }
1193 
1194 /*
1195  * Restore the queue_if_no_path setting.
1196  */
1197 static void multipath_resume(struct dm_target *ti)
1198 {
1199 	struct multipath *m = (struct multipath *) ti->private;
1200 	unsigned long flags;
1201 
1202 	spin_lock_irqsave(&m->lock, flags);
1203 	m->queue_if_no_path = m->saved_queue_if_no_path;
1204 	spin_unlock_irqrestore(&m->lock, flags);
1205 }
1206 
1207 /*
1208  * Info output has the following format:
1209  * num_multipath_feature_args [multipath_feature_args]*
1210  * num_handler_status_args [handler_status_args]*
1211  * num_groups init_group_number
1212  *            [A|D|E num_ps_status_args [ps_status_args]*
1213  *             num_paths num_selector_args
1214  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1215  *
1216  * Table output has the following format (identical to the constructor string):
1217  * num_feature_args [features_args]*
1218  * num_handler_args hw_handler [hw_handler_args]*
1219  * num_groups init_group_number
1220  *     [priority selector-name num_ps_args [ps_args]*
1221  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1222  */
1223 static int multipath_status(struct dm_target *ti, status_type_t type,
1224 			    char *result, unsigned int maxlen)
1225 {
1226 	int sz = 0;
1227 	unsigned long flags;
1228 	struct multipath *m = (struct multipath *) ti->private;
1229 	struct priority_group *pg;
1230 	struct pgpath *p;
1231 	unsigned pg_num;
1232 	char state;
1233 
1234 	spin_lock_irqsave(&m->lock, flags);
1235 
1236 	/* Features */
1237 	if (type == STATUSTYPE_INFO)
1238 		DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1239 	else {
1240 		DMEMIT("%u ", m->queue_if_no_path +
1241 			      (m->pg_init_retries > 0) * 2);
1242 		if (m->queue_if_no_path)
1243 			DMEMIT("queue_if_no_path ");
1244 		if (m->pg_init_retries)
1245 			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1246 	}
1247 
1248 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1249 		DMEMIT("0 ");
1250 	else
1251 		DMEMIT("1 %s ", m->hw_handler_name);
1252 
1253 	DMEMIT("%u ", m->nr_priority_groups);
1254 
1255 	if (m->next_pg)
1256 		pg_num = m->next_pg->pg_num;
1257 	else if (m->current_pg)
1258 		pg_num = m->current_pg->pg_num;
1259 	else
1260 			pg_num = 1;
1261 
1262 	DMEMIT("%u ", pg_num);
1263 
1264 	switch (type) {
1265 	case STATUSTYPE_INFO:
1266 		list_for_each_entry(pg, &m->priority_groups, list) {
1267 			if (pg->bypassed)
1268 				state = 'D';	/* Disabled */
1269 			else if (pg == m->current_pg)
1270 				state = 'A';	/* Currently Active */
1271 			else
1272 				state = 'E';	/* Enabled */
1273 
1274 			DMEMIT("%c ", state);
1275 
1276 			if (pg->ps.type->status)
1277 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1278 							  result + sz,
1279 							  maxlen - sz);
1280 			else
1281 				DMEMIT("0 ");
1282 
1283 			DMEMIT("%u %u ", pg->nr_pgpaths,
1284 			       pg->ps.type->info_args);
1285 
1286 			list_for_each_entry(p, &pg->pgpaths, list) {
1287 				DMEMIT("%s %s %u ", p->path.dev->name,
1288 				       p->path.is_active ? "A" : "F",
1289 				       p->fail_count);
1290 				if (pg->ps.type->status)
1291 					sz += pg->ps.type->status(&pg->ps,
1292 					      &p->path, type, result + sz,
1293 					      maxlen - sz);
1294 			}
1295 		}
1296 		break;
1297 
1298 	case STATUSTYPE_TABLE:
1299 		list_for_each_entry(pg, &m->priority_groups, list) {
1300 			DMEMIT("%s ", pg->ps.type->name);
1301 
1302 			if (pg->ps.type->status)
1303 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1304 							  result + sz,
1305 							  maxlen - sz);
1306 			else
1307 				DMEMIT("0 ");
1308 
1309 			DMEMIT("%u %u ", pg->nr_pgpaths,
1310 			       pg->ps.type->table_args);
1311 
1312 			list_for_each_entry(p, &pg->pgpaths, list) {
1313 				DMEMIT("%s ", p->path.dev->name);
1314 				if (pg->ps.type->status)
1315 					sz += pg->ps.type->status(&pg->ps,
1316 					      &p->path, type, result + sz,
1317 					      maxlen - sz);
1318 			}
1319 		}
1320 		break;
1321 	}
1322 
1323 	spin_unlock_irqrestore(&m->lock, flags);
1324 
1325 	return 0;
1326 }
1327 
1328 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1329 {
1330 	int r;
1331 	struct dm_dev *dev;
1332 	struct multipath *m = (struct multipath *) ti->private;
1333 	action_fn action;
1334 
1335 	if (argc == 1) {
1336 		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
1337 			return queue_if_no_path(m, 1, 0);
1338 		else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
1339 			return queue_if_no_path(m, 0, 0);
1340 	}
1341 
1342 	if (argc != 2)
1343 		goto error;
1344 
1345 	if (!strnicmp(argv[0], MESG_STR("disable_group")))
1346 		return bypass_pg_num(m, argv[1], 1);
1347 	else if (!strnicmp(argv[0], MESG_STR("enable_group")))
1348 		return bypass_pg_num(m, argv[1], 0);
1349 	else if (!strnicmp(argv[0], MESG_STR("switch_group")))
1350 		return switch_pg_num(m, argv[1]);
1351 	else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1352 		action = reinstate_path;
1353 	else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1354 		action = fail_path;
1355 	else
1356 		goto error;
1357 
1358 	r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1359 			  dm_table_get_mode(ti->table), &dev);
1360 	if (r) {
1361 		DMWARN("message: error getting device %s",
1362 		       argv[1]);
1363 		return -EINVAL;
1364 	}
1365 
1366 	r = action_dev(m, dev, action);
1367 
1368 	dm_put_device(ti, dev);
1369 
1370 	return r;
1371 
1372 error:
1373 	DMWARN("Unrecognised multipath message received.");
1374 	return -EINVAL;
1375 }
1376 
1377 static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
1378 			   struct file *filp, unsigned int cmd,
1379 			   unsigned long arg)
1380 {
1381 	struct multipath *m = (struct multipath *) ti->private;
1382 	struct block_device *bdev = NULL;
1383 	unsigned long flags;
1384 	struct file fake_file = {};
1385 	struct dentry fake_dentry = {};
1386 	int r = 0;
1387 
1388 	fake_file.f_path.dentry = &fake_dentry;
1389 
1390 	spin_lock_irqsave(&m->lock, flags);
1391 
1392 	if (!m->current_pgpath)
1393 		__choose_pgpath(m);
1394 
1395 	if (m->current_pgpath) {
1396 		bdev = m->current_pgpath->path.dev->bdev;
1397 		fake_dentry.d_inode = bdev->bd_inode;
1398 		fake_file.f_mode = m->current_pgpath->path.dev->mode;
1399 	}
1400 
1401 	if (m->queue_io)
1402 		r = -EAGAIN;
1403 	else if (!bdev)
1404 		r = -EIO;
1405 
1406 	spin_unlock_irqrestore(&m->lock, flags);
1407 
1408 	return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file,
1409 					 bdev->bd_disk, cmd, arg);
1410 }
1411 
1412 /*-----------------------------------------------------------------
1413  * Module setup
1414  *---------------------------------------------------------------*/
1415 static struct target_type multipath_target = {
1416 	.name = "multipath",
1417 	.version = {1, 0, 5},
1418 	.module = THIS_MODULE,
1419 	.ctr = multipath_ctr,
1420 	.dtr = multipath_dtr,
1421 	.map = multipath_map,
1422 	.end_io = multipath_end_io,
1423 	.presuspend = multipath_presuspend,
1424 	.resume = multipath_resume,
1425 	.status = multipath_status,
1426 	.message = multipath_message,
1427 	.ioctl  = multipath_ioctl,
1428 };
1429 
1430 static int __init dm_multipath_init(void)
1431 {
1432 	int r;
1433 
1434 	/* allocate a slab for the dm_ios */
1435 	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1436 	if (!_mpio_cache)
1437 		return -ENOMEM;
1438 
1439 	r = dm_register_target(&multipath_target);
1440 	if (r < 0) {
1441 		DMERR("register failed %d", r);
1442 		kmem_cache_destroy(_mpio_cache);
1443 		return -EINVAL;
1444 	}
1445 
1446 	kmultipathd = create_workqueue("kmpathd");
1447 	if (!kmultipathd) {
1448 		DMERR("failed to create workqueue kmpathd");
1449 		dm_unregister_target(&multipath_target);
1450 		kmem_cache_destroy(_mpio_cache);
1451 		return -ENOMEM;
1452 	}
1453 
1454 	DMINFO("version %u.%u.%u loaded",
1455 	       multipath_target.version[0], multipath_target.version[1],
1456 	       multipath_target.version[2]);
1457 
1458 	return r;
1459 }
1460 
1461 static void __exit dm_multipath_exit(void)
1462 {
1463 	int r;
1464 
1465 	destroy_workqueue(kmultipathd);
1466 
1467 	r = dm_unregister_target(&multipath_target);
1468 	if (r < 0)
1469 		DMERR("target unregister failed %d", r);
1470 	kmem_cache_destroy(_mpio_cache);
1471 }
1472 
1473 EXPORT_SYMBOL_GPL(dm_pg_init_complete);
1474 
1475 module_init(dm_multipath_init);
1476 module_exit(dm_multipath_exit);
1477 
1478 MODULE_DESCRIPTION(DM_NAME " multipath target");
1479 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1480 MODULE_LICENSE("GPL");
1481