xref: /openbmc/linux/drivers/md/dm-mpath.c (revision 643d1f7f)
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm.h"
9 #include "dm-path-selector.h"
10 #include "dm-hw-handler.h"
11 #include "dm-bio-list.h"
12 #include "dm-bio-record.h"
13 #include "dm-uevent.h"
14 
15 #include <linux/ctype.h>
16 #include <linux/init.h>
17 #include <linux/mempool.h>
18 #include <linux/module.h>
19 #include <linux/pagemap.h>
20 #include <linux/slab.h>
21 #include <linux/time.h>
22 #include <linux/workqueue.h>
23 #include <asm/atomic.h>
24 
25 #define DM_MSG_PREFIX "multipath"
26 #define MESG_STR(x) x, sizeof(x)
27 
28 /* Path properties */
29 struct pgpath {
30 	struct list_head list;
31 
32 	struct priority_group *pg;	/* Owning PG */
33 	unsigned fail_count;		/* Cumulative failure count */
34 
35 	struct dm_path path;
36 };
37 
38 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
39 
40 /*
41  * Paths are grouped into Priority Groups and numbered from 1 upwards.
42  * Each has a path selector which controls which path gets used.
43  */
44 struct priority_group {
45 	struct list_head list;
46 
47 	struct multipath *m;		/* Owning multipath instance */
48 	struct path_selector ps;
49 
50 	unsigned pg_num;		/* Reference number */
51 	unsigned bypassed;		/* Temporarily bypass this PG? */
52 
53 	unsigned nr_pgpaths;		/* Number of paths in PG */
54 	struct list_head pgpaths;
55 };
56 
57 /* Multipath context */
58 struct multipath {
59 	struct list_head list;
60 	struct dm_target *ti;
61 
62 	spinlock_t lock;
63 
64 	struct hw_handler hw_handler;
65 	unsigned nr_priority_groups;
66 	struct list_head priority_groups;
67 	unsigned pg_init_required;	/* pg_init needs calling? */
68 	unsigned pg_init_in_progress;	/* Only one pg_init allowed at once */
69 
70 	unsigned nr_valid_paths;	/* Total number of usable paths */
71 	struct pgpath *current_pgpath;
72 	struct priority_group *current_pg;
73 	struct priority_group *next_pg;	/* Switch to this PG if set */
74 	unsigned repeat_count;		/* I/Os left before calling PS again */
75 
76 	unsigned queue_io;		/* Must we queue all I/O? */
77 	unsigned queue_if_no_path;	/* Queue I/O if last path fails? */
78 	unsigned saved_queue_if_no_path;/* Saved state during suspension */
79 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
80 	unsigned pg_init_count;		/* Number of times pg_init called */
81 
82 	struct work_struct process_queued_ios;
83 	struct bio_list queued_ios;
84 	unsigned queue_size;
85 
86 	struct work_struct trigger_event;
87 
88 	/*
89 	 * We must use a mempool of dm_mpath_io structs so that we
90 	 * can resubmit bios on error.
91 	 */
92 	mempool_t *mpio_pool;
93 };
94 
95 /*
96  * Context information attached to each bio we process.
97  */
98 struct dm_mpath_io {
99 	struct pgpath *pgpath;
100 	struct dm_bio_details details;
101 };
102 
103 typedef int (*action_fn) (struct pgpath *pgpath);
104 
105 #define MIN_IOS 256	/* Mempool size */
106 
107 static struct kmem_cache *_mpio_cache;
108 
109 static struct workqueue_struct *kmultipathd;
110 static void process_queued_ios(struct work_struct *work);
111 static void trigger_event(struct work_struct *work);
112 
113 
114 /*-----------------------------------------------
115  * Allocation routines
116  *-----------------------------------------------*/
117 
118 static struct pgpath *alloc_pgpath(void)
119 {
120 	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
121 
122 	if (pgpath)
123 		pgpath->path.is_active = 1;
124 
125 	return pgpath;
126 }
127 
128 static void free_pgpath(struct pgpath *pgpath)
129 {
130 	kfree(pgpath);
131 }
132 
133 static struct priority_group *alloc_priority_group(void)
134 {
135 	struct priority_group *pg;
136 
137 	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
138 
139 	if (pg)
140 		INIT_LIST_HEAD(&pg->pgpaths);
141 
142 	return pg;
143 }
144 
145 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
146 {
147 	struct pgpath *pgpath, *tmp;
148 
149 	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
150 		list_del(&pgpath->list);
151 		dm_put_device(ti, pgpath->path.dev);
152 		free_pgpath(pgpath);
153 	}
154 }
155 
156 static void free_priority_group(struct priority_group *pg,
157 				struct dm_target *ti)
158 {
159 	struct path_selector *ps = &pg->ps;
160 
161 	if (ps->type) {
162 		ps->type->destroy(ps);
163 		dm_put_path_selector(ps->type);
164 	}
165 
166 	free_pgpaths(&pg->pgpaths, ti);
167 	kfree(pg);
168 }
169 
170 static struct multipath *alloc_multipath(struct dm_target *ti)
171 {
172 	struct multipath *m;
173 
174 	m = kzalloc(sizeof(*m), GFP_KERNEL);
175 	if (m) {
176 		INIT_LIST_HEAD(&m->priority_groups);
177 		spin_lock_init(&m->lock);
178 		m->queue_io = 1;
179 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
180 		INIT_WORK(&m->trigger_event, trigger_event);
181 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
182 		if (!m->mpio_pool) {
183 			kfree(m);
184 			return NULL;
185 		}
186 		m->ti = ti;
187 		ti->private = m;
188 	}
189 
190 	return m;
191 }
192 
193 static void free_multipath(struct multipath *m)
194 {
195 	struct priority_group *pg, *tmp;
196 	struct hw_handler *hwh = &m->hw_handler;
197 
198 	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
199 		list_del(&pg->list);
200 		free_priority_group(pg, m->ti);
201 	}
202 
203 	if (hwh->type) {
204 		hwh->type->destroy(hwh);
205 		dm_put_hw_handler(hwh->type);
206 	}
207 
208 	mempool_destroy(m->mpio_pool);
209 	kfree(m);
210 }
211 
212 
213 /*-----------------------------------------------
214  * Path selection
215  *-----------------------------------------------*/
216 
217 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
218 {
219 	struct hw_handler *hwh = &m->hw_handler;
220 
221 	m->current_pg = pgpath->pg;
222 
223 	/* Must we initialise the PG first, and queue I/O till it's ready? */
224 	if (hwh->type && hwh->type->pg_init) {
225 		m->pg_init_required = 1;
226 		m->queue_io = 1;
227 	} else {
228 		m->pg_init_required = 0;
229 		m->queue_io = 0;
230 	}
231 
232 	m->pg_init_count = 0;
233 }
234 
235 static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
236 {
237 	struct dm_path *path;
238 
239 	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
240 	if (!path)
241 		return -ENXIO;
242 
243 	m->current_pgpath = path_to_pgpath(path);
244 
245 	if (m->current_pg != pg)
246 		__switch_pg(m, m->current_pgpath);
247 
248 	return 0;
249 }
250 
251 static void __choose_pgpath(struct multipath *m)
252 {
253 	struct priority_group *pg;
254 	unsigned bypassed = 1;
255 
256 	if (!m->nr_valid_paths)
257 		goto failed;
258 
259 	/* Were we instructed to switch PG? */
260 	if (m->next_pg) {
261 		pg = m->next_pg;
262 		m->next_pg = NULL;
263 		if (!__choose_path_in_pg(m, pg))
264 			return;
265 	}
266 
267 	/* Don't change PG until it has no remaining paths */
268 	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
269 		return;
270 
271 	/*
272 	 * Loop through priority groups until we find a valid path.
273 	 * First time we skip PGs marked 'bypassed'.
274 	 * Second time we only try the ones we skipped.
275 	 */
276 	do {
277 		list_for_each_entry(pg, &m->priority_groups, list) {
278 			if (pg->bypassed == bypassed)
279 				continue;
280 			if (!__choose_path_in_pg(m, pg))
281 				return;
282 		}
283 	} while (bypassed--);
284 
285 failed:
286 	m->current_pgpath = NULL;
287 	m->current_pg = NULL;
288 }
289 
290 /*
291  * Check whether bios must be queued in the device-mapper core rather
292  * than here in the target.
293  *
294  * m->lock must be held on entry.
295  *
296  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
297  * same value then we are not between multipath_presuspend()
298  * and multipath_resume() calls and we have no need to check
299  * for the DMF_NOFLUSH_SUSPENDING flag.
300  */
301 static int __must_push_back(struct multipath *m)
302 {
303 	return (m->queue_if_no_path != m->saved_queue_if_no_path &&
304 		dm_noflush_suspending(m->ti));
305 }
306 
307 static int map_io(struct multipath *m, struct bio *bio,
308 		  struct dm_mpath_io *mpio, unsigned was_queued)
309 {
310 	int r = DM_MAPIO_REMAPPED;
311 	unsigned long flags;
312 	struct pgpath *pgpath;
313 
314 	spin_lock_irqsave(&m->lock, flags);
315 
316 	/* Do we need to select a new pgpath? */
317 	if (!m->current_pgpath ||
318 	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
319 		__choose_pgpath(m);
320 
321 	pgpath = m->current_pgpath;
322 
323 	if (was_queued)
324 		m->queue_size--;
325 
326 	if ((pgpath && m->queue_io) ||
327 	    (!pgpath && m->queue_if_no_path)) {
328 		/* Queue for the daemon to resubmit */
329 		bio_list_add(&m->queued_ios, bio);
330 		m->queue_size++;
331 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
332 		    !m->queue_io)
333 			queue_work(kmultipathd, &m->process_queued_ios);
334 		pgpath = NULL;
335 		r = DM_MAPIO_SUBMITTED;
336 	} else if (pgpath)
337 		bio->bi_bdev = pgpath->path.dev->bdev;
338 	else if (__must_push_back(m))
339 		r = DM_MAPIO_REQUEUE;
340 	else
341 		r = -EIO;	/* Failed */
342 
343 	mpio->pgpath = pgpath;
344 
345 	spin_unlock_irqrestore(&m->lock, flags);
346 
347 	return r;
348 }
349 
350 /*
351  * If we run out of usable paths, should we queue I/O or error it?
352  */
353 static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
354 			    unsigned save_old_value)
355 {
356 	unsigned long flags;
357 
358 	spin_lock_irqsave(&m->lock, flags);
359 
360 	if (save_old_value)
361 		m->saved_queue_if_no_path = m->queue_if_no_path;
362 	else
363 		m->saved_queue_if_no_path = queue_if_no_path;
364 	m->queue_if_no_path = queue_if_no_path;
365 	if (!m->queue_if_no_path && m->queue_size)
366 		queue_work(kmultipathd, &m->process_queued_ios);
367 
368 	spin_unlock_irqrestore(&m->lock, flags);
369 
370 	return 0;
371 }
372 
373 /*-----------------------------------------------------------------
374  * The multipath daemon is responsible for resubmitting queued ios.
375  *---------------------------------------------------------------*/
376 
377 static void dispatch_queued_ios(struct multipath *m)
378 {
379 	int r;
380 	unsigned long flags;
381 	struct bio *bio = NULL, *next;
382 	struct dm_mpath_io *mpio;
383 	union map_info *info;
384 
385 	spin_lock_irqsave(&m->lock, flags);
386 	bio = bio_list_get(&m->queued_ios);
387 	spin_unlock_irqrestore(&m->lock, flags);
388 
389 	while (bio) {
390 		next = bio->bi_next;
391 		bio->bi_next = NULL;
392 
393 		info = dm_get_mapinfo(bio);
394 		mpio = info->ptr;
395 
396 		r = map_io(m, bio, mpio, 1);
397 		if (r < 0)
398 			bio_endio(bio, r);
399 		else if (r == DM_MAPIO_REMAPPED)
400 			generic_make_request(bio);
401 		else if (r == DM_MAPIO_REQUEUE)
402 			bio_endio(bio, -EIO);
403 
404 		bio = next;
405 	}
406 }
407 
408 static void process_queued_ios(struct work_struct *work)
409 {
410 	struct multipath *m =
411 		container_of(work, struct multipath, process_queued_ios);
412 	struct hw_handler *hwh = &m->hw_handler;
413 	struct pgpath *pgpath = NULL;
414 	unsigned init_required = 0, must_queue = 1;
415 	unsigned long flags;
416 
417 	spin_lock_irqsave(&m->lock, flags);
418 
419 	if (!m->queue_size)
420 		goto out;
421 
422 	if (!m->current_pgpath)
423 		__choose_pgpath(m);
424 
425 	pgpath = m->current_pgpath;
426 
427 	if ((pgpath && !m->queue_io) ||
428 	    (!pgpath && !m->queue_if_no_path))
429 		must_queue = 0;
430 
431 	if (m->pg_init_required && !m->pg_init_in_progress) {
432 		m->pg_init_count++;
433 		m->pg_init_required = 0;
434 		m->pg_init_in_progress = 1;
435 		init_required = 1;
436 	}
437 
438 out:
439 	spin_unlock_irqrestore(&m->lock, flags);
440 
441 	if (init_required)
442 		hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path);
443 
444 	if (!must_queue)
445 		dispatch_queued_ios(m);
446 }
447 
448 /*
449  * An event is triggered whenever a path is taken out of use.
450  * Includes path failure and PG bypass.
451  */
452 static void trigger_event(struct work_struct *work)
453 {
454 	struct multipath *m =
455 		container_of(work, struct multipath, trigger_event);
456 
457 	dm_table_event(m->ti->table);
458 }
459 
460 /*-----------------------------------------------------------------
461  * Constructor/argument parsing:
462  * <#multipath feature args> [<arg>]*
463  * <#hw_handler args> [hw_handler [<arg>]*]
464  * <#priority groups>
465  * <initial priority group>
466  *     [<selector> <#selector args> [<arg>]*
467  *      <#paths> <#per-path selector args>
468  *         [<path> [<arg>]* ]+ ]+
469  *---------------------------------------------------------------*/
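/*
 * Illustrative (hypothetical) dmsetup table line matching the format above:
 * no feature args, no hardware handler, two priority groups with group 1
 * initial, using the "round-robin" selector (no selector args, one per-path
 * arg).  Device numbers and the leading start/length are placeholders and
 * the "0 1024" prefix is the generic dm table start sector and length, not
 * part of the multipath arguments parsed here:
 *
 *	0 1024 multipath 0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000 \
 *	                         round-robin 0 1 1 8:48 1000
 */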
470 struct param {
471 	unsigned min;
472 	unsigned max;
473 	char *error;
474 };
475 
476 static int read_param(struct param *param, char *str, unsigned *v, char **error)
477 {
478 	if (!str ||
479 	    (sscanf(str, "%u", v) != 1) ||
480 	    (*v < param->min) ||
481 	    (*v > param->max)) {
482 		*error = param->error;
483 		return -EINVAL;
484 	}
485 
486 	return 0;
487 }
488 
489 struct arg_set {
490 	unsigned argc;
491 	char **argv;
492 };
493 
494 static char *shift(struct arg_set *as)
495 {
496 	char *r;
497 
498 	if (as->argc) {
499 		as->argc--;
500 		r = *as->argv;
501 		as->argv++;
502 		return r;
503 	}
504 
505 	return NULL;
506 }
507 
508 static void consume(struct arg_set *as, unsigned n)
509 {
510 	BUG_ON(as->argc < n);
511 	as->argc -= n;
512 	as->argv += n;
513 }
514 
515 static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
516 			       struct dm_target *ti)
517 {
518 	int r;
519 	struct path_selector_type *pst;
520 	unsigned ps_argc;
521 
522 	static struct param _params[] = {
523 		{0, 1024, "invalid number of path selector args"},
524 	};
525 
526 	pst = dm_get_path_selector(shift(as));
527 	if (!pst) {
528 		ti->error = "unknown path selector type";
529 		return -EINVAL;
530 	}
531 
532 	r = read_param(_params, shift(as), &ps_argc, &ti->error);
533 	if (r)
534 		return -EINVAL;
535 
536 	r = pst->create(&pg->ps, ps_argc, as->argv);
537 	if (r) {
538 		dm_put_path_selector(pst);
539 		ti->error = "path selector constructor failed";
540 		return r;
541 	}
542 
543 	pg->ps.type = pst;
544 	consume(as, ps_argc);
545 
546 	return 0;
547 }
548 
549 static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
550 			       struct dm_target *ti)
551 {
552 	int r;
553 	struct pgpath *p;
554 
555 	/* we need at least a path arg */
556 	if (as->argc < 1) {
557 		ti->error = "no device given";
558 		return NULL;
559 	}
560 
561 	p = alloc_pgpath();
562 	if (!p)
563 		return NULL;
564 
565 	r = dm_get_device(ti, shift(as), ti->begin, ti->len,
566 			  dm_table_get_mode(ti->table), &p->path.dev);
567 	if (r) {
568 		ti->error = "error getting device";
569 		goto bad;
570 	}
571 
572 	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
573 	if (r) {
574 		dm_put_device(ti, p->path.dev);
575 		goto bad;
576 	}
577 
578 	return p;
579 
580  bad:
581 	free_pgpath(p);
582 	return NULL;
583 }
584 
585 static struct priority_group *parse_priority_group(struct arg_set *as,
586 						   struct multipath *m)
587 {
588 	static struct param _params[] = {
589 		{1, 1024, "invalid number of paths"},
590 		{0, 1024, "invalid number of selector args"}
591 	};
592 
593 	int r;
594 	unsigned i, nr_selector_args, nr_params;
595 	struct priority_group *pg;
596 	struct dm_target *ti = m->ti;
597 
598 	if (as->argc < 2) {
599 		as->argc = 0;
600 		ti->error = "not enough priority group arguments";
601 		return NULL;
602 	}
603 
604 	pg = alloc_priority_group();
605 	if (!pg) {
606 		ti->error = "couldn't allocate priority group";
607 		return NULL;
608 	}
609 	pg->m = m;
610 
611 	r = parse_path_selector(as, pg, ti);
612 	if (r)
613 		goto bad;
614 
615 	/*
616 	 * read the paths
617 	 */
618 	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
619 	if (r)
620 		goto bad;
621 
622 	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
623 	if (r)
624 		goto bad;
625 
626 	nr_params = 1 + nr_selector_args;
627 	for (i = 0; i < pg->nr_pgpaths; i++) {
628 		struct pgpath *pgpath;
629 		struct arg_set path_args;
630 
631 		if (as->argc < nr_params)
632 			goto bad;
633 
634 		path_args.argc = nr_params;
635 		path_args.argv = as->argv;
636 
637 		pgpath = parse_path(&path_args, &pg->ps, ti);
638 		if (!pgpath)
639 			goto bad;
640 
641 		pgpath->pg = pg;
642 		list_add_tail(&pgpath->list, &pg->pgpaths);
643 		consume(as, nr_params);
644 	}
645 
646 	return pg;
647 
648  bad:
649 	free_priority_group(pg, ti);
650 	return NULL;
651 }
652 
653 static int parse_hw_handler(struct arg_set *as, struct multipath *m)
654 {
655 	int r;
656 	struct hw_handler_type *hwht;
657 	unsigned hw_argc;
658 	struct dm_target *ti = m->ti;
659 
660 	static struct param _params[] = {
661 		{0, 1024, "invalid number of hardware handler args"},
662 	};
663 
664 	r = read_param(_params, shift(as), &hw_argc, &ti->error);
665 	if (r)
666 		return -EINVAL;
667 
668 	if (!hw_argc)
669 		return 0;
670 
671 	hwht = dm_get_hw_handler(shift(as));
672 	if (!hwht) {
673 		ti->error = "unknown hardware handler type";
674 		return -EINVAL;
675 	}
676 
677 	m->hw_handler.md = dm_table_get_md(ti->table);
678 	dm_put(m->hw_handler.md);
679 
680 	r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
681 	if (r) {
682 		dm_put_hw_handler(hwht);
683 		ti->error = "hardware handler constructor failed";
684 		return r;
685 	}
686 
687 	m->hw_handler.type = hwht;
688 	consume(as, hw_argc - 1);
689 
690 	return 0;
691 }
692 
693 static int parse_features(struct arg_set *as, struct multipath *m)
694 {
695 	int r;
696 	unsigned argc;
697 	struct dm_target *ti = m->ti;
698 	const char *param_name;
699 
700 	static struct param _params[] = {
701 		{0, 3, "invalid number of feature args"},
702 		{1, 50, "pg_init_retries must be between 1 and 50"},
703 	};
704 
705 	r = read_param(_params, shift(as), &argc, &ti->error);
706 	if (r)
707 		return -EINVAL;
708 
709 	if (!argc)
710 		return 0;
711 
712 	do {
713 		param_name = shift(as);
714 		argc--;
715 
716 		if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
717 			r = queue_if_no_path(m, 1, 0);
718 			continue;
719 		}
720 
721 		if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
722 		    (argc >= 1)) {
723 			r = read_param(_params + 1, shift(as),
724 				       &m->pg_init_retries, &ti->error);
725 			argc--;
726 			continue;
727 		}
728 
729 		ti->error = "Unrecognised multipath feature request";
730 		r = -EINVAL;
731 	} while (argc && !r);
732 
733 	return r;
734 }
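/*
 * Illustrative feature strings accepted by parse_features() above (the
 * argument count comes first, per read_param()):
 *
 *	"0"					no features
 *	"1 queue_if_no_path"			queue I/O if all paths are lost
 *	"3 queue_if_no_path pg_init_retries 5"	both features together
 */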
735 
736 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
737 			 char **argv)
738 {
739 	/* target parameters */
740 	static struct param _params[] = {
741 		{1, 1024, "invalid number of priority groups"},
742 		{1, 1024, "invalid initial priority group number"},
743 	};
744 
745 	int r;
746 	struct multipath *m;
747 	struct arg_set as;
748 	unsigned pg_count = 0;
749 	unsigned next_pg_num;
750 
751 	as.argc = argc;
752 	as.argv = argv;
753 
754 	m = alloc_multipath(ti);
755 	if (!m) {
756 		ti->error = "can't allocate multipath";
757 		return -EINVAL;
758 	}
759 
760 	r = parse_features(&as, m);
761 	if (r)
762 		goto bad;
763 
764 	r = parse_hw_handler(&as, m);
765 	if (r)
766 		goto bad;
767 
768 	r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
769 	if (r)
770 		goto bad;
771 
772 	r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
773 	if (r)
774 		goto bad;
775 
776 	/* parse the priority groups */
777 	while (as.argc) {
778 		struct priority_group *pg;
779 
780 		pg = parse_priority_group(&as, m);
781 		if (!pg) {
782 			r = -EINVAL;
783 			goto bad;
784 		}
785 
786 		m->nr_valid_paths += pg->nr_pgpaths;
787 		list_add_tail(&pg->list, &m->priority_groups);
788 		pg_count++;
789 		pg->pg_num = pg_count;
790 		if (!--next_pg_num)
791 			m->next_pg = pg;
792 	}
793 
794 	if (pg_count != m->nr_priority_groups) {
795 		ti->error = "priority group count mismatch";
796 		r = -EINVAL;
797 		goto bad;
798 	}
799 
800 	return 0;
801 
802  bad:
803 	free_multipath(m);
804 	return r;
805 }
806 
807 static void multipath_dtr(struct dm_target *ti)
808 {
809 	struct multipath *m = (struct multipath *) ti->private;
810 
811 	flush_workqueue(kmultipathd);
812 	free_multipath(m);
813 }
814 
815 /*
816  * Map bios, recording original fields for later in case we have to resubmit
817  */
818 static int multipath_map(struct dm_target *ti, struct bio *bio,
819 			 union map_info *map_context)
820 {
821 	int r;
822 	struct dm_mpath_io *mpio;
823 	struct multipath *m = (struct multipath *) ti->private;
824 
825 	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
826 	dm_bio_record(&mpio->details, bio);
827 
828 	map_context->ptr = mpio;
829 	bio->bi_rw |= (1 << BIO_RW_FAILFAST);
830 	r = map_io(m, bio, mpio, 0);
831 	if (r < 0 || r == DM_MAPIO_REQUEUE)
832 		mempool_free(mpio, m->mpio_pool);
833 
834 	return r;
835 }
836 
837 /*
838  * Take a path out of use.
839  */
840 static int fail_path(struct pgpath *pgpath)
841 {
842 	unsigned long flags;
843 	struct multipath *m = pgpath->pg->m;
844 
845 	spin_lock_irqsave(&m->lock, flags);
846 
847 	if (!pgpath->path.is_active)
848 		goto out;
849 
850 	DMWARN("Failing path %s.", pgpath->path.dev->name);
851 
852 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
853 	pgpath->path.is_active = 0;
854 	pgpath->fail_count++;
855 
856 	m->nr_valid_paths--;
857 
858 	if (pgpath == m->current_pgpath)
859 		m->current_pgpath = NULL;
860 
861 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
862 		      pgpath->path.dev->name, m->nr_valid_paths);
863 
864 	queue_work(kmultipathd, &m->trigger_event);
865 
866 out:
867 	spin_unlock_irqrestore(&m->lock, flags);
868 
869 	return 0;
870 }
871 
872 /*
873  * Reinstate a previously-failed path
874  */
875 static int reinstate_path(struct pgpath *pgpath)
876 {
877 	int r = 0;
878 	unsigned long flags;
879 	struct multipath *m = pgpath->pg->m;
880 
881 	spin_lock_irqsave(&m->lock, flags);
882 
883 	if (pgpath->path.is_active)
884 		goto out;
885 
886 	if (!pgpath->pg->ps.type) {
887 		/* ps.type is NULL here, so don't dereference it for a name */
888 		DMWARN("Reinstate path not supported by path selector");
889 		r = -EINVAL;
890 		goto out;
891 	}
892 
893 	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
894 	if (r)
895 		goto out;
896 
897 	pgpath->path.is_active = 1;
898 
899 	m->current_pgpath = NULL;
900 	if (!m->nr_valid_paths++ && m->queue_size)
901 		queue_work(kmultipathd, &m->process_queued_ios);
902 
903 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
904 		      pgpath->path.dev->name, m->nr_valid_paths);
905 
906 	queue_work(kmultipathd, &m->trigger_event);
907 
908 out:
909 	spin_unlock_irqrestore(&m->lock, flags);
910 
911 	return r;
912 }
913 
914 /*
915  * Fail or reinstate all paths that match the provided struct dm_dev.
916  */
917 static int action_dev(struct multipath *m, struct dm_dev *dev,
918 		      action_fn action)
919 {
920 	int r = 0;
921 	struct pgpath *pgpath;
922 	struct priority_group *pg;
923 
924 	list_for_each_entry(pg, &m->priority_groups, list) {
925 		list_for_each_entry(pgpath, &pg->pgpaths, list) {
926 			if (pgpath->path.dev == dev)
927 				r = action(pgpath);
928 		}
929 	}
930 
931 	return r;
932 }
933 
934 /*
935  * Temporarily try to avoid having to use the specified PG
936  */
937 static void bypass_pg(struct multipath *m, struct priority_group *pg,
938 		      int bypassed)
939 {
940 	unsigned long flags;
941 
942 	spin_lock_irqsave(&m->lock, flags);
943 
944 	pg->bypassed = bypassed;
945 	m->current_pgpath = NULL;
946 	m->current_pg = NULL;
947 
948 	spin_unlock_irqrestore(&m->lock, flags);
949 
950 	queue_work(kmultipathd, &m->trigger_event);
951 }
952 
953 /*
954  * Switch to using the specified PG from the next I/O that gets mapped
955  */
956 static int switch_pg_num(struct multipath *m, const char *pgstr)
957 {
958 	struct priority_group *pg;
959 	unsigned pgnum;
960 	unsigned long flags;
961 
962 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
963 	    (pgnum > m->nr_priority_groups)) {
964 		DMWARN("invalid PG number supplied to switch_pg_num");
965 		return -EINVAL;
966 	}
967 
968 	spin_lock_irqsave(&m->lock, flags);
969 	list_for_each_entry(pg, &m->priority_groups, list) {
970 		pg->bypassed = 0;
971 		if (--pgnum)
972 			continue;
973 
974 		m->current_pgpath = NULL;
975 		m->current_pg = NULL;
976 		m->next_pg = pg;
977 	}
978 	spin_unlock_irqrestore(&m->lock, flags);
979 
980 	queue_work(kmultipathd, &m->trigger_event);
981 	return 0;
982 }
983 
984 /*
985  * Set/clear bypassed status of a PG.
986  * PGs are numbered upwards from 1 in the order they were declared.
987  */
988 static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
989 {
990 	struct priority_group *pg;
991 	unsigned pgnum;
992 
993 	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
994 	    (pgnum > m->nr_priority_groups)) {
995 		DMWARN("invalid PG number supplied to bypass_pg");
996 		return -EINVAL;
997 	}
998 
999 	list_for_each_entry(pg, &m->priority_groups, list) {
1000 		if (!--pgnum)
1001 			break;
1002 	}
1003 
1004 	bypass_pg(m, pg, bypassed);
1005 	return 0;
1006 }
1007 
1008 /*
1009  * Should we retry pg_init immediately?
1010  */
1011 static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1012 {
1013 	unsigned long flags;
1014 	int limit_reached = 0;
1015 
1016 	spin_lock_irqsave(&m->lock, flags);
1017 
1018 	if (m->pg_init_count <= m->pg_init_retries)
1019 		m->pg_init_required = 1;
1020 	else
1021 		limit_reached = 1;
1022 
1023 	spin_unlock_irqrestore(&m->lock, flags);
1024 
1025 	return limit_reached;
1026 }
1027 
1028 /*
1029  * pg_init must call this when it has completed its initialisation
1030  */
1031 void dm_pg_init_complete(struct dm_path *path, unsigned err_flags)
1032 {
1033 	struct pgpath *pgpath = path_to_pgpath(path);
1034 	struct priority_group *pg = pgpath->pg;
1035 	struct multipath *m = pg->m;
1036 	unsigned long flags;
1037 
1038 	/*
1039 	 * If requested, retry pg_init until maximum number of retries exceeded.
1040 	 * If retry not requested and PG already bypassed, always fail the path.
1041 	 */
1042 	if (err_flags & MP_RETRY) {
1043 		if (pg_init_limit_reached(m, pgpath))
1044 			err_flags |= MP_FAIL_PATH;
1045 	} else if (err_flags && pg->bypassed)
1046 		err_flags |= MP_FAIL_PATH;
1047 
1048 	if (err_flags & MP_FAIL_PATH)
1049 		fail_path(pgpath);
1050 
1051 	if (err_flags & MP_BYPASS_PG)
1052 		bypass_pg(m, pg, 1);
1053 
1054 	spin_lock_irqsave(&m->lock, flags);
1055 	if (err_flags & ~MP_RETRY) {
1056 		m->current_pgpath = NULL;
1057 		m->current_pg = NULL;
1058 	} else if (!m->pg_init_required)
1059 		m->queue_io = 0;
1060 
1061 	m->pg_init_in_progress = 0;
1062 	queue_work(kmultipathd, &m->process_queued_ios);
1063 	spin_unlock_irqrestore(&m->lock, flags);
1064 }
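/*
 * Sketch (not from this file) of the contract dm_pg_init_complete()
 * completes: a hardware handler's pg_init method activates the path and
 * then reports the outcome back.  For a hypothetical handler that succeeds
 * immediately:
 *
 *	static void example_pg_init(struct hw_handler *hwh, unsigned bypassed,
 *				    struct dm_path *path)
 *	{
 *		// vendor-specific activation of 'path' would go here
 *		dm_pg_init_complete(path, 0);	// 0: success, no MP_* err_flags
 *	}
 *
 * On failure the handler would pass a combination of MP_FAIL_PATH,
 * MP_BYPASS_PG, MP_ERROR_IO and/or MP_RETRY instead of 0.
 */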
1065 
1066 /*
1067  * end_io handling
1068  */
1069 static int do_end_io(struct multipath *m, struct bio *bio,
1070 		     int error, struct dm_mpath_io *mpio)
1071 {
1072 	struct hw_handler *hwh = &m->hw_handler;
1073 	unsigned err_flags = MP_FAIL_PATH;	/* Default behavior */
1074 	unsigned long flags;
1075 
1076 	if (!error)
1077 		return 0;	/* I/O complete */
1078 
1079 	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
1080 		return error;
1081 
1082 	if (error == -EOPNOTSUPP)
1083 		return error;
1084 
1085 	spin_lock_irqsave(&m->lock, flags);
1086 	if (!m->nr_valid_paths) {
1087 		if (__must_push_back(m)) {
1088 			spin_unlock_irqrestore(&m->lock, flags);
1089 			return DM_ENDIO_REQUEUE;
1090 		} else if (!m->queue_if_no_path) {
1091 			spin_unlock_irqrestore(&m->lock, flags);
1092 			return -EIO;
1093 		} else {
1094 			spin_unlock_irqrestore(&m->lock, flags);
1095 			goto requeue;
1096 		}
1097 	}
1098 	spin_unlock_irqrestore(&m->lock, flags);
1099 
1100 	if (hwh->type && hwh->type->error)
1101 		err_flags = hwh->type->error(hwh, bio);
1102 
1103 	if (mpio->pgpath) {
1104 		if (err_flags & MP_FAIL_PATH)
1105 			fail_path(mpio->pgpath);
1106 
1107 		if (err_flags & MP_BYPASS_PG)
1108 			bypass_pg(m, mpio->pgpath->pg, 1);
1109 	}
1110 
1111 	if (err_flags & MP_ERROR_IO)
1112 		return -EIO;
1113 
1114       requeue:
1115 	dm_bio_restore(&mpio->details, bio);
1116 
1117 	/* queue for the daemon to resubmit or fail */
1118 	spin_lock_irqsave(&m->lock, flags);
1119 	bio_list_add(&m->queued_ios, bio);
1120 	m->queue_size++;
1121 	if (!m->queue_io)
1122 		queue_work(kmultipathd, &m->process_queued_ios);
1123 	spin_unlock_irqrestore(&m->lock, flags);
1124 
1125 	return DM_ENDIO_INCOMPLETE;	/* io not complete */
1126 }
1127 
1128 static int multipath_end_io(struct dm_target *ti, struct bio *bio,
1129 			    int error, union map_info *map_context)
1130 {
1131 	struct multipath *m = ti->private;
1132 	struct dm_mpath_io *mpio = map_context->ptr;
1133 	struct pgpath *pgpath = mpio->pgpath;
1134 	struct path_selector *ps;
1135 	int r;
1136 
1137 	r  = do_end_io(m, bio, error, mpio);
1138 	if (pgpath) {
1139 		ps = &pgpath->pg->ps;
1140 		if (ps->type->end_io)
1141 			ps->type->end_io(ps, &pgpath->path);
1142 	}
1143 	if (r != DM_ENDIO_INCOMPLETE)
1144 		mempool_free(mpio, m->mpio_pool);
1145 
1146 	return r;
1147 }
1148 
1149 /*
1150  * Suspend can't complete until all the I/O is processed so if
1151  * the last path fails we must error any remaining I/O.
1152  * Note that if the freeze_bdev fails while suspending, the
1153  * queue_if_no_path state is lost - userspace should reset it.
1154  */
1155 static void multipath_presuspend(struct dm_target *ti)
1156 {
1157 	struct multipath *m = (struct multipath *) ti->private;
1158 
1159 	queue_if_no_path(m, 0, 1);
1160 }
1161 
1162 /*
1163  * Restore the queue_if_no_path setting.
1164  */
1165 static void multipath_resume(struct dm_target *ti)
1166 {
1167 	struct multipath *m = (struct multipath *) ti->private;
1168 	unsigned long flags;
1169 
1170 	spin_lock_irqsave(&m->lock, flags);
1171 	m->queue_if_no_path = m->saved_queue_if_no_path;
1172 	spin_unlock_irqrestore(&m->lock, flags);
1173 }
1174 
1175 /*
1176  * Info output has the following format:
1177  * num_multipath_feature_args [multipath_feature_args]*
1178  * num_handler_status_args [handler_status_args]*
1179  * num_groups init_group_number
1180  *            [A|D|E num_ps_status_args [ps_status_args]*
1181  *             num_paths num_selector_args
1182  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1183  *
1184  * Table output has the following format (identical to the constructor string):
1185  * num_feature_args [features_args]*
1186  * num_handler_args hw_handler [hw_handler_args]*
1187  * num_groups init_group_number
1188  *     [priority selector-name num_ps_args [ps_args]*
1189  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1190  */
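/*
 * For example (purely illustrative, selector-specific fields shown as
 * placeholders), a map with no hardware handler and two groups might report
 * for STATUSTYPE_INFO:
 *
 *	2 0 0 0 2 1 A <ps status> 2 <n> 8:16 A 0 8:32 A 0 E <ps status> 1 <n> 8:48 A 0
 *
 * i.e. two feature words (queue_size and pg_init_count), "0" for the absent
 * hardware handler, two groups with group 1 active, then per-group state and
 * per-path "<dev> A|F <fail_count>" triples.
 */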
1191 static int multipath_status(struct dm_target *ti, status_type_t type,
1192 			    char *result, unsigned int maxlen)
1193 {
1194 	int sz = 0;
1195 	unsigned long flags;
1196 	struct multipath *m = (struct multipath *) ti->private;
1197 	struct hw_handler *hwh = &m->hw_handler;
1198 	struct priority_group *pg;
1199 	struct pgpath *p;
1200 	unsigned pg_num;
1201 	char state;
1202 
1203 	spin_lock_irqsave(&m->lock, flags);
1204 
1205 	/* Features */
1206 	if (type == STATUSTYPE_INFO)
1207 		DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1208 	else {
1209 		DMEMIT("%u ", m->queue_if_no_path +
1210 			      (m->pg_init_retries > 0) * 2);
1211 		if (m->queue_if_no_path)
1212 			DMEMIT("queue_if_no_path ");
1213 		if (m->pg_init_retries)
1214 			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1215 	}
1216 
1217 	if (hwh->type && hwh->type->status)
1218 		sz += hwh->type->status(hwh, type, result + sz, maxlen - sz);
1219 	else if (!hwh->type || type == STATUSTYPE_INFO)
1220 		DMEMIT("0 ");
1221 	else
1222 		DMEMIT("1 %s ", hwh->type->name);
1223 
1224 	DMEMIT("%u ", m->nr_priority_groups);
1225 
1226 	if (m->next_pg)
1227 		pg_num = m->next_pg->pg_num;
1228 	else if (m->current_pg)
1229 		pg_num = m->current_pg->pg_num;
1230 	else
1231 		pg_num = 1;
1232 
1233 	DMEMIT("%u ", pg_num);
1234 
1235 	switch (type) {
1236 	case STATUSTYPE_INFO:
1237 		list_for_each_entry(pg, &m->priority_groups, list) {
1238 			if (pg->bypassed)
1239 				state = 'D';	/* Disabled */
1240 			else if (pg == m->current_pg)
1241 				state = 'A';	/* Currently Active */
1242 			else
1243 				state = 'E';	/* Enabled */
1244 
1245 			DMEMIT("%c ", state);
1246 
1247 			if (pg->ps.type->status)
1248 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1249 							  result + sz,
1250 							  maxlen - sz);
1251 			else
1252 				DMEMIT("0 ");
1253 
1254 			DMEMIT("%u %u ", pg->nr_pgpaths,
1255 			       pg->ps.type->info_args);
1256 
1257 			list_for_each_entry(p, &pg->pgpaths, list) {
1258 				DMEMIT("%s %s %u ", p->path.dev->name,
1259 				       p->path.is_active ? "A" : "F",
1260 				       p->fail_count);
1261 				if (pg->ps.type->status)
1262 					sz += pg->ps.type->status(&pg->ps,
1263 					      &p->path, type, result + sz,
1264 					      maxlen - sz);
1265 			}
1266 		}
1267 		break;
1268 
1269 	case STATUSTYPE_TABLE:
1270 		list_for_each_entry(pg, &m->priority_groups, list) {
1271 			DMEMIT("%s ", pg->ps.type->name);
1272 
1273 			if (pg->ps.type->status)
1274 				sz += pg->ps.type->status(&pg->ps, NULL, type,
1275 							  result + sz,
1276 							  maxlen - sz);
1277 			else
1278 				DMEMIT("0 ");
1279 
1280 			DMEMIT("%u %u ", pg->nr_pgpaths,
1281 			       pg->ps.type->table_args);
1282 
1283 			list_for_each_entry(p, &pg->pgpaths, list) {
1284 				DMEMIT("%s ", p->path.dev->name);
1285 				if (pg->ps.type->status)
1286 					sz += pg->ps.type->status(&pg->ps,
1287 					      &p->path, type, result + sz,
1288 					      maxlen - sz);
1289 			}
1290 		}
1291 		break;
1292 	}
1293 
1294 	spin_unlock_irqrestore(&m->lock, flags);
1295 
1296 	return 0;
1297 }
1298 
1299 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1300 {
1301 	int r;
1302 	struct dm_dev *dev;
1303 	struct multipath *m = (struct multipath *) ti->private;
1304 	action_fn action;
1305 
1306 	if (argc == 1) {
1307 		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
1308 			return queue_if_no_path(m, 1, 0);
1309 		else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
1310 			return queue_if_no_path(m, 0, 0);
1311 	}
1312 
1313 	if (argc != 2)
1314 		goto error;
1315 
1316 	if (!strnicmp(argv[0], MESG_STR("disable_group")))
1317 		return bypass_pg_num(m, argv[1], 1);
1318 	else if (!strnicmp(argv[0], MESG_STR("enable_group")))
1319 		return bypass_pg_num(m, argv[1], 0);
1320 	else if (!strnicmp(argv[0], MESG_STR("switch_group")))
1321 		return switch_pg_num(m, argv[1]);
1322 	else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1323 		action = reinstate_path;
1324 	else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1325 		action = fail_path;
1326 	else
1327 		goto error;
1328 
1329 	r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1330 			  dm_table_get_mode(ti->table), &dev);
1331 	if (r) {
1332 		DMWARN("message: error getting device %s",
1333 		       argv[1]);
1334 		return -EINVAL;
1335 	}
1336 
1337 	r = action_dev(m, dev, action);
1338 
1339 	dm_put_device(ti, dev);
1340 
1341 	return r;
1342 
1343 error:
1344 	DMWARN("Unrecognised multipath message received.");
1345 	return -EINVAL;
1346 }
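/*
 * The messages handled above correspond to userspace commands such as
 * (device and path names purely illustrative):
 *
 *	dmsetup message mpath0 0 fail_path /dev/sdb
 *	dmsetup message mpath0 0 reinstate_path /dev/sdb
 *	dmsetup message mpath0 0 disable_group 2
 *	dmsetup message mpath0 0 switch_group 1
 *	dmsetup message mpath0 0 queue_if_no_path
 */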
1347 
1348 static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
1349 			   struct file *filp, unsigned int cmd,
1350 			   unsigned long arg)
1351 {
1352 	struct multipath *m = (struct multipath *) ti->private;
1353 	struct block_device *bdev = NULL;
1354 	unsigned long flags;
1355 	struct file fake_file = {};
1356 	struct dentry fake_dentry = {};
1357 	int r = 0;
1358 
1359 	fake_file.f_path.dentry = &fake_dentry;
1360 
1361 	spin_lock_irqsave(&m->lock, flags);
1362 
1363 	if (!m->current_pgpath)
1364 		__choose_pgpath(m);
1365 
1366 	if (m->current_pgpath) {
1367 		bdev = m->current_pgpath->path.dev->bdev;
1368 		fake_dentry.d_inode = bdev->bd_inode;
1369 		fake_file.f_mode = m->current_pgpath->path.dev->mode;
1370 	}
1371 
1372 	if (m->queue_io)
1373 		r = -EAGAIN;
1374 	else if (!bdev)
1375 		r = -EIO;
1376 
1377 	spin_unlock_irqrestore(&m->lock, flags);
1378 
1379 	return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file,
1380 					 bdev->bd_disk, cmd, arg);
1381 }
1382 
1383 /*-----------------------------------------------------------------
1384  * Module setup
1385  *---------------------------------------------------------------*/
1386 static struct target_type multipath_target = {
1387 	.name = "multipath",
1388 	.version = {1, 0, 5},
1389 	.module = THIS_MODULE,
1390 	.ctr = multipath_ctr,
1391 	.dtr = multipath_dtr,
1392 	.map = multipath_map,
1393 	.end_io = multipath_end_io,
1394 	.presuspend = multipath_presuspend,
1395 	.resume = multipath_resume,
1396 	.status = multipath_status,
1397 	.message = multipath_message,
1398 	.ioctl  = multipath_ioctl,
1399 };
1400 
1401 static int __init dm_multipath_init(void)
1402 {
1403 	int r;
1404 
1405 	/* allocate a slab for the dm_mpath_io structs */
1406 	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1407 	if (!_mpio_cache)
1408 		return -ENOMEM;
1409 
1410 	r = dm_register_target(&multipath_target);
1411 	if (r < 0) {
1412 		DMERR("register failed %d", r);
1413 		kmem_cache_destroy(_mpio_cache);
1414 		return -EINVAL;
1415 	}
1416 
1417 	kmultipathd = create_workqueue("kmpathd");
1418 	if (!kmultipathd) {
1419 		DMERR("failed to create workqueue kmpathd");
1420 		dm_unregister_target(&multipath_target);
1421 		kmem_cache_destroy(_mpio_cache);
1422 		return -ENOMEM;
1423 	}
1424 
1425 	DMINFO("version %u.%u.%u loaded",
1426 	       multipath_target.version[0], multipath_target.version[1],
1427 	       multipath_target.version[2]);
1428 
1429 	return r;
1430 }
1431 
1432 static void __exit dm_multipath_exit(void)
1433 {
1434 	int r;
1435 
1436 	destroy_workqueue(kmultipathd);
1437 
1438 	r = dm_unregister_target(&multipath_target);
1439 	if (r < 0)
1440 		DMERR("target unregister failed %d", r);
1441 	kmem_cache_destroy(_mpio_cache);
1442 }
1443 
1444 EXPORT_SYMBOL_GPL(dm_pg_init_complete);
1445 
1446 module_init(dm_multipath_init);
1447 module_exit(dm_multipath_exit);
1448 
1449 MODULE_DESCRIPTION(DM_NAME " multipath target");
1450 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1451 MODULE_LICENSE("GPL");
1452