xref: /openbmc/linux/drivers/md/md-multipath.c (revision 022dacdd)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * multipath.c : Multiple Devices driver for Linux
4  *
5  * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
6  *
7  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8  *
9  * MULTIPATH management functions.
10  *
11  * derived from raid1.c.
12  */
13 
14 #include <linux/blkdev.h>
15 #include <linux/module.h>
16 #include <linux/raid/md_u.h>
17 #include <linux/seq_file.h>
18 #include <linux/slab.h>
19 #include "md.h"
20 #include "md-multipath.h"
21 
/* NOTE(review): not referenced anywhere in the visible part of this file. */
#define MAX_WORK_PER_DISK 128

/* Minimum number of struct multipath_bh objects kept in conf->pool. */
#define	NR_RESERVED_BUFS	32
25 
26 static int multipath_map (struct mpconf *conf)
27 {
28 	int i, disks = conf->raid_disks;
29 
30 	/*
31 	 * Later we do read balancing on the read side
32 	 * now we use the first available disk.
33 	 */
34 
35 	rcu_read_lock();
36 	for (i = 0; i < disks; i++) {
37 		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
38 		if (rdev && test_bit(In_sync, &rdev->flags) &&
39 		    !test_bit(Faulty, &rdev->flags)) {
40 			atomic_inc(&rdev->nr_pending);
41 			rcu_read_unlock();
42 			return i;
43 		}
44 	}
45 	rcu_read_unlock();
46 
47 	pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
48 	return (-1);
49 }
50 
51 static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
52 {
53 	unsigned long flags;
54 	struct mddev *mddev = mp_bh->mddev;
55 	struct mpconf *conf = mddev->private;
56 
57 	spin_lock_irqsave(&conf->device_lock, flags);
58 	list_add(&mp_bh->retry_list, &conf->retry_list);
59 	spin_unlock_irqrestore(&conf->device_lock, flags);
60 	md_wakeup_thread(mddev->thread);
61 }
62 
63 /*
64  * multipath_end_bh_io() is called when we have finished servicing a multipathed
65  * operation and are ready to return a success/failure code to the buffer
66  * cache layer.
67  */
68 static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
69 {
70 	struct bio *bio = mp_bh->master_bio;
71 	struct mpconf *conf = mp_bh->mddev->private;
72 
73 	bio->bi_status = status;
74 	bio_endio(bio);
75 	mempool_free(mp_bh, &conf->pool);
76 }
77 
78 static void multipath_end_request(struct bio *bio)
79 {
80 	struct multipath_bh *mp_bh = bio->bi_private;
81 	struct mpconf *conf = mp_bh->mddev->private;
82 	struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
83 
84 	if (!bio->bi_status)
85 		multipath_end_bh_io(mp_bh, 0);
86 	else if (!(bio->bi_opf & REQ_RAHEAD)) {
87 		/*
88 		 * oops, IO error:
89 		 */
90 		char b[BDEVNAME_SIZE];
91 		md_error (mp_bh->mddev, rdev);
92 		pr_info("multipath: %s: rescheduling sector %llu\n",
93 			bdevname(rdev->bdev,b),
94 			(unsigned long long)bio->bi_iter.bi_sector);
95 		multipath_reschedule_retry(mp_bh);
96 	} else
97 		multipath_end_bh_io(mp_bh, bio->bi_status);
98 	rdev_dec_pending(rdev, conf->mddev);
99 }
100 
101 static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
102 {
103 	struct mpconf *conf = mddev->private;
104 	struct multipath_bh * mp_bh;
105 	struct multipath_info *multipath;
106 
107 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
108 	    && md_flush_request(mddev, bio))
109 		return true;
110 
111 	mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
112 
113 	mp_bh->master_bio = bio;
114 	mp_bh->mddev = mddev;
115 
116 	mp_bh->path = multipath_map(conf);
117 	if (mp_bh->path < 0) {
118 		bio_io_error(bio);
119 		mempool_free(mp_bh, &conf->pool);
120 		return true;
121 	}
122 	multipath = conf->multipaths + mp_bh->path;
123 
124 	bio_init(&mp_bh->bio, NULL, 0);
125 	__bio_clone_fast(&mp_bh->bio, bio);
126 
127 	mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
128 	bio_set_dev(&mp_bh->bio, multipath->rdev->bdev);
129 	mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
130 	mp_bh->bio.bi_end_io = multipath_end_request;
131 	mp_bh->bio.bi_private = mp_bh;
132 	mddev_check_writesame(mddev, &mp_bh->bio);
133 	mddev_check_write_zeroes(mddev, &mp_bh->bio);
134 	generic_make_request(&mp_bh->bio);
135 	return true;
136 }
137 
138 static void multipath_status(struct seq_file *seq, struct mddev *mddev)
139 {
140 	struct mpconf *conf = mddev->private;
141 	int i;
142 
143 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
144 		    conf->raid_disks - mddev->degraded);
145 	rcu_read_lock();
146 	for (i = 0; i < conf->raid_disks; i++) {
147 		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
148 		seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
149 	}
150 	rcu_read_unlock();
151 	seq_putc(seq, ']');
152 }
153 
154 static int multipath_congested(struct mddev *mddev, int bits)
155 {
156 	struct mpconf *conf = mddev->private;
157 	int i, ret = 0;
158 
159 	rcu_read_lock();
160 	for (i = 0; i < mddev->raid_disks ; i++) {
161 		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
162 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
163 			struct request_queue *q = bdev_get_queue(rdev->bdev);
164 
165 			ret |= bdi_congested(q->backing_dev_info, bits);
166 			/* Just like multipath_map, we just check the
167 			 * first available device
168 			 */
169 			break;
170 		}
171 	}
172 	rcu_read_unlock();
173 	return ret;
174 }
175 
176 /*
177  * Careful, this can execute in IRQ contexts as well!
178  */
179 static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
180 {
181 	struct mpconf *conf = mddev->private;
182 	char b[BDEVNAME_SIZE];
183 
184 	if (conf->raid_disks - mddev->degraded <= 1) {
185 		/*
186 		 * Uh oh, we can do nothing if this is our last path, but
187 		 * first check if this is a queued request for a device
188 		 * which has just failed.
189 		 */
190 		pr_warn("multipath: only one IO path left and IO error.\n");
191 		/* leave it active... it's all we have */
192 		return;
193 	}
194 	/*
195 	 * Mark disk as unusable
196 	 */
197 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
198 		unsigned long flags;
199 		spin_lock_irqsave(&conf->device_lock, flags);
200 		mddev->degraded++;
201 		spin_unlock_irqrestore(&conf->device_lock, flags);
202 	}
203 	set_bit(Faulty, &rdev->flags);
204 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
205 	pr_err("multipath: IO failure on %s, disabling IO path.\n"
206 	       "multipath: Operation continuing on %d IO paths.\n",
207 	       bdevname(rdev->bdev, b),
208 	       conf->raid_disks - mddev->degraded);
209 }
210 
211 static void print_multipath_conf (struct mpconf *conf)
212 {
213 	int i;
214 	struct multipath_info *tmp;
215 
216 	pr_debug("MULTIPATH conf printout:\n");
217 	if (!conf) {
218 		pr_debug("(conf==NULL)\n");
219 		return;
220 	}
221 	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
222 		 conf->raid_disks);
223 
224 	for (i = 0; i < conf->raid_disks; i++) {
225 		char b[BDEVNAME_SIZE];
226 		tmp = conf->multipaths + i;
227 		if (tmp->rdev)
228 			pr_debug(" disk%d, o:%d, dev:%s\n",
229 				 i,!test_bit(Faulty, &tmp->rdev->flags),
230 				 bdevname(tmp->rdev->bdev,b));
231 	}
232 }
233 
234 static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
235 {
236 	struct mpconf *conf = mddev->private;
237 	int err = -EEXIST;
238 	int path;
239 	struct multipath_info *p;
240 	int first = 0;
241 	int last = mddev->raid_disks - 1;
242 
243 	if (rdev->raid_disk >= 0)
244 		first = last = rdev->raid_disk;
245 
246 	print_multipath_conf(conf);
247 
248 	for (path = first; path <= last; path++)
249 		if ((p=conf->multipaths+path)->rdev == NULL) {
250 			disk_stack_limits(mddev->gendisk, rdev->bdev,
251 					  rdev->data_offset << 9);
252 
253 			err = md_integrity_add_rdev(rdev, mddev);
254 			if (err)
255 				break;
256 			spin_lock_irq(&conf->device_lock);
257 			mddev->degraded--;
258 			rdev->raid_disk = path;
259 			set_bit(In_sync, &rdev->flags);
260 			spin_unlock_irq(&conf->device_lock);
261 			rcu_assign_pointer(p->rdev, rdev);
262 			err = 0;
263 			break;
264 		}
265 
266 	print_multipath_conf(conf);
267 
268 	return err;
269 }
270 
271 static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
272 {
273 	struct mpconf *conf = mddev->private;
274 	int err = 0;
275 	int number = rdev->raid_disk;
276 	struct multipath_info *p = conf->multipaths + number;
277 
278 	print_multipath_conf(conf);
279 
280 	if (rdev == p->rdev) {
281 		if (test_bit(In_sync, &rdev->flags) ||
282 		    atomic_read(&rdev->nr_pending)) {
283 			pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
284 			err = -EBUSY;
285 			goto abort;
286 		}
287 		p->rdev = NULL;
288 		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
289 			synchronize_rcu();
290 			if (atomic_read(&rdev->nr_pending)) {
291 				/* lost the race, try later */
292 				err = -EBUSY;
293 				p->rdev = rdev;
294 				goto abort;
295 			}
296 		}
297 		err = md_integrity_register(mddev);
298 	}
299 abort:
300 
301 	print_multipath_conf(conf);
302 	return err;
303 }
304 
305 /*
306  * This is a kernel thread which:
307  *
308  *	1.	Retries failed read operations on working multipaths.
309  *	2.	Updates the raid superblock when problems encounter.
310  *	3.	Performs writes following reads for array syncronising.
311  */
312 
313 static void multipathd(struct md_thread *thread)
314 {
315 	struct mddev *mddev = thread->mddev;
316 	struct multipath_bh *mp_bh;
317 	struct bio *bio;
318 	unsigned long flags;
319 	struct mpconf *conf = mddev->private;
320 	struct list_head *head = &conf->retry_list;
321 
322 	md_check_recovery(mddev);
323 	for (;;) {
324 		char b[BDEVNAME_SIZE];
325 		spin_lock_irqsave(&conf->device_lock, flags);
326 		if (list_empty(head))
327 			break;
328 		mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
329 		list_del(head->prev);
330 		spin_unlock_irqrestore(&conf->device_lock, flags);
331 
332 		bio = &mp_bh->bio;
333 		bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
334 
335 		if ((mp_bh->path = multipath_map (conf))<0) {
336 			pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
337 			       bio_devname(bio, b),
338 			       (unsigned long long)bio->bi_iter.bi_sector);
339 			multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
340 		} else {
341 			pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
342 			       bio_devname(bio, b),
343 			       (unsigned long long)bio->bi_iter.bi_sector);
344 			*bio = *(mp_bh->master_bio);
345 			bio->bi_iter.bi_sector +=
346 				conf->multipaths[mp_bh->path].rdev->data_offset;
347 			bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev);
348 			bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
349 			bio->bi_end_io = multipath_end_request;
350 			bio->bi_private = mp_bh;
351 			generic_make_request(bio);
352 		}
353 	}
354 	spin_unlock_irqrestore(&conf->device_lock, flags);
355 }
356 
357 static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
358 {
359 	WARN_ONCE(sectors || raid_disks,
360 		  "%s does not support generic reshape\n", __func__);
361 
362 	return mddev->dev_sectors;
363 }
364 
365 static int multipath_run (struct mddev *mddev)
366 {
367 	struct mpconf *conf;
368 	int disk_idx;
369 	struct multipath_info *disk;
370 	struct md_rdev *rdev;
371 	int working_disks;
372 	int ret;
373 
374 	if (md_check_no_bitmap(mddev))
375 		return -EINVAL;
376 
377 	if (mddev->level != LEVEL_MULTIPATH) {
378 		pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
379 			mdname(mddev), mddev->level);
380 		goto out;
381 	}
382 	/*
383 	 * copy the already verified devices into our private MULTIPATH
384 	 * bookkeeping area. [whatever we allocate in multipath_run(),
385 	 * should be freed in multipath_free()]
386 	 */
387 
388 	conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
389 	mddev->private = conf;
390 	if (!conf)
391 		goto out;
392 
393 	conf->multipaths = kcalloc(mddev->raid_disks,
394 				   sizeof(struct multipath_info),
395 				   GFP_KERNEL);
396 	if (!conf->multipaths)
397 		goto out_free_conf;
398 
399 	working_disks = 0;
400 	rdev_for_each(rdev, mddev) {
401 		disk_idx = rdev->raid_disk;
402 		if (disk_idx < 0 ||
403 		    disk_idx >= mddev->raid_disks)
404 			continue;
405 
406 		disk = conf->multipaths + disk_idx;
407 		disk->rdev = rdev;
408 		disk_stack_limits(mddev->gendisk, rdev->bdev,
409 				  rdev->data_offset << 9);
410 
411 		if (!test_bit(Faulty, &rdev->flags))
412 			working_disks++;
413 	}
414 
415 	conf->raid_disks = mddev->raid_disks;
416 	conf->mddev = mddev;
417 	spin_lock_init(&conf->device_lock);
418 	INIT_LIST_HEAD(&conf->retry_list);
419 
420 	if (!working_disks) {
421 		pr_warn("multipath: no operational IO paths for %s\n",
422 			mdname(mddev));
423 		goto out_free_conf;
424 	}
425 	mddev->degraded = conf->raid_disks - working_disks;
426 
427 	ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
428 					sizeof(struct multipath_bh));
429 	if (ret)
430 		goto out_free_conf;
431 
432 	mddev->thread = md_register_thread(multipathd, mddev,
433 					   "multipath");
434 	if (!mddev->thread)
435 		goto out_free_conf;
436 
437 	pr_info("multipath: array %s active with %d out of %d IO paths\n",
438 		mdname(mddev), conf->raid_disks - mddev->degraded,
439 		mddev->raid_disks);
440 	/*
441 	 * Ok, everything is just fine now
442 	 */
443 	md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
444 
445 	if (md_integrity_register(mddev))
446 		goto out_free_conf;
447 
448 	return 0;
449 
450 out_free_conf:
451 	mempool_exit(&conf->pool);
452 	kfree(conf->multipaths);
453 	kfree(conf);
454 	mddev->private = NULL;
455 out:
456 	return -EIO;
457 }
458 
459 static void multipath_free(struct mddev *mddev, void *priv)
460 {
461 	struct mpconf *conf = priv;
462 
463 	mempool_exit(&conf->pool);
464 	kfree(conf->multipaths);
465 	kfree(conf);
466 }
467 
468 static struct md_personality multipath_personality =
469 {
470 	.name		= "multipath",
471 	.level		= LEVEL_MULTIPATH,
472 	.owner		= THIS_MODULE,
473 	.make_request	= multipath_make_request,
474 	.run		= multipath_run,
475 	.free		= multipath_free,
476 	.status		= multipath_status,
477 	.error_handler	= multipath_error,
478 	.hot_add_disk	= multipath_add_disk,
479 	.hot_remove_disk= multipath_remove_disk,
480 	.size		= multipath_size,
481 	.congested	= multipath_congested,
482 };
483 
484 static int __init multipath_init (void)
485 {
486 	return register_md_personality (&multipath_personality);
487 }
488 
489 static void __exit multipath_exit (void)
490 {
491 	unregister_md_personality (&multipath_personality);
492 }
493 
/* Module entry and exit points. */
module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("simple multi-path personality for MD");
/* Aliases under which this module can be requested. */
MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
MODULE_ALIAS("md-multipath");
/* NOTE(review): presumably matches LEVEL_MULTIPATH == -4 - confirm. */
MODULE_ALIAS("md-level--4");
501