1 /* 2 * Copyright (C) 2003 Sistina Software Limited. 3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include <linux/device-mapper.h> 9 10 #include "dm-rq.h" 11 #include "dm-bio-record.h" 12 #include "dm-path-selector.h" 13 #include "dm-uevent.h" 14 15 #include <linux/blkdev.h> 16 #include <linux/ctype.h> 17 #include <linux/init.h> 18 #include <linux/mempool.h> 19 #include <linux/module.h> 20 #include <linux/pagemap.h> 21 #include <linux/slab.h> 22 #include <linux/time.h> 23 #include <linux/workqueue.h> 24 #include <linux/delay.h> 25 #include <scsi/scsi_dh.h> 26 #include <linux/atomic.h> 27 #include <linux/blk-mq.h> 28 29 #define DM_MSG_PREFIX "multipath" 30 #define DM_PG_INIT_DELAY_MSECS 2000 31 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) 32 33 /* Path properties */ 34 struct pgpath { 35 struct list_head list; 36 37 struct priority_group *pg; /* Owning PG */ 38 unsigned fail_count; /* Cumulative failure count */ 39 40 struct dm_path path; 41 struct delayed_work activate_path; 42 43 bool is_active:1; /* Path status */ 44 }; 45 46 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 47 48 /* 49 * Paths are grouped into Priority Groups and numbered from 1 upwards. 50 * Each has a path selector which controls which path gets used. 51 */ 52 struct priority_group { 53 struct list_head list; 54 55 struct multipath *m; /* Owning multipath instance */ 56 struct path_selector ps; 57 58 unsigned pg_num; /* Reference number */ 59 unsigned nr_pgpaths; /* Number of paths in PG */ 60 struct list_head pgpaths; 61 62 bool bypassed:1; /* Temporarily bypass this PG? 
*/ 63 }; 64 65 /* Multipath context */ 66 struct multipath { 67 unsigned long flags; /* Multipath state flags */ 68 69 spinlock_t lock; 70 enum dm_queue_mode queue_mode; 71 72 struct pgpath *current_pgpath; 73 struct priority_group *current_pg; 74 struct priority_group *next_pg; /* Switch to this PG if set */ 75 76 atomic_t nr_valid_paths; /* Total number of usable paths */ 77 unsigned nr_priority_groups; 78 struct list_head priority_groups; 79 80 const char *hw_handler_name; 81 char *hw_handler_params; 82 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 83 unsigned pg_init_retries; /* Number of times to retry pg_init */ 84 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ 85 atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ 86 atomic_t pg_init_count; /* Number of times pg_init called */ 87 88 struct mutex work_mutex; 89 struct work_struct trigger_event; 90 struct dm_target *ti; 91 92 struct work_struct process_queued_bios; 93 struct bio_list queued_bios; 94 }; 95 96 /* 97 * Context information attached to each io we process. 98 */ 99 struct dm_mpath_io { 100 struct pgpath *pgpath; 101 size_t nr_bytes; 102 }; 103 104 typedef int (*action_fn) (struct pgpath *pgpath); 105 106 static struct workqueue_struct *kmultipathd, *kmpath_handlerd; 107 static void trigger_event(struct work_struct *work); 108 static void activate_or_offline_path(struct pgpath *pgpath); 109 static void activate_path_work(struct work_struct *work); 110 static void process_queued_bios(struct work_struct *work); 111 112 /*----------------------------------------------- 113 * Multipath state flags. 114 *-----------------------------------------------*/ 115 116 #define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */ 117 #define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? 
*/ 118 #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ 119 #define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ 120 #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ 121 #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ 122 #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ 123 124 /*----------------------------------------------- 125 * Allocation routines 126 *-----------------------------------------------*/ 127 128 static struct pgpath *alloc_pgpath(void) 129 { 130 struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); 131 132 if (!pgpath) 133 return NULL; 134 135 pgpath->is_active = true; 136 137 return pgpath; 138 } 139 140 static void free_pgpath(struct pgpath *pgpath) 141 { 142 kfree(pgpath); 143 } 144 145 static struct priority_group *alloc_priority_group(void) 146 { 147 struct priority_group *pg; 148 149 pg = kzalloc(sizeof(*pg), GFP_KERNEL); 150 151 if (pg) 152 INIT_LIST_HEAD(&pg->pgpaths); 153 154 return pg; 155 } 156 157 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) 158 { 159 struct pgpath *pgpath, *tmp; 160 161 list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { 162 list_del(&pgpath->list); 163 dm_put_device(ti, pgpath->path.dev); 164 free_pgpath(pgpath); 165 } 166 } 167 168 static void free_priority_group(struct priority_group *pg, 169 struct dm_target *ti) 170 { 171 struct path_selector *ps = &pg->ps; 172 173 if (ps->type) { 174 ps->type->destroy(ps); 175 dm_put_path_selector(ps->type); 176 } 177 178 free_pgpaths(&pg->pgpaths, ti); 179 kfree(pg); 180 } 181 182 static struct multipath *alloc_multipath(struct dm_target *ti) 183 { 184 struct multipath *m; 185 186 m = kzalloc(sizeof(*m), GFP_KERNEL); 187 if (m) { 188 INIT_LIST_HEAD(&m->priority_groups); 189 spin_lock_init(&m->lock); 190 atomic_set(&m->nr_valid_paths, 0); 191 INIT_WORK(&m->trigger_event, trigger_event); 192 
mutex_init(&m->work_mutex); 193 194 m->queue_mode = DM_TYPE_NONE; 195 196 m->ti = ti; 197 ti->private = m; 198 } 199 200 return m; 201 } 202 203 static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) 204 { 205 if (m->queue_mode == DM_TYPE_NONE) { 206 m->queue_mode = DM_TYPE_REQUEST_BASED; 207 } else if (m->queue_mode == DM_TYPE_BIO_BASED) { 208 INIT_WORK(&m->process_queued_bios, process_queued_bios); 209 /* 210 * bio-based doesn't support any direct scsi_dh management; 211 * it just discovers if a scsi_dh is attached. 212 */ 213 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); 214 } 215 216 dm_table_set_type(ti->table, m->queue_mode); 217 218 /* 219 * Init fields that are only used when a scsi_dh is attached 220 * - must do this unconditionally (really doesn't hurt non-SCSI uses) 221 */ 222 set_bit(MPATHF_QUEUE_IO, &m->flags); 223 atomic_set(&m->pg_init_in_progress, 0); 224 atomic_set(&m->pg_init_count, 0); 225 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; 226 init_waitqueue_head(&m->pg_init_wait); 227 228 return 0; 229 } 230 231 static void free_multipath(struct multipath *m) 232 { 233 struct priority_group *pg, *tmp; 234 235 list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { 236 list_del(&pg->list); 237 free_priority_group(pg, m->ti); 238 } 239 240 kfree(m->hw_handler_name); 241 kfree(m->hw_handler_params); 242 mutex_destroy(&m->work_mutex); 243 kfree(m); 244 } 245 246 static struct dm_mpath_io *get_mpio(union map_info *info) 247 { 248 return info->ptr; 249 } 250 251 static size_t multipath_per_bio_data_size(void) 252 { 253 return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); 254 } 255 256 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) 257 { 258 return dm_per_bio_data(bio, multipath_per_bio_data_size()); 259 } 260 261 static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio) 262 { 263 /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ 264 
void *bio_details = mpio + 1; 265 return bio_details; 266 } 267 268 static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p) 269 { 270 struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 271 struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio); 272 273 mpio->nr_bytes = bio->bi_iter.bi_size; 274 mpio->pgpath = NULL; 275 *mpio_p = mpio; 276 277 dm_bio_record(bio_details, bio); 278 } 279 280 /*----------------------------------------------- 281 * Path selection 282 *-----------------------------------------------*/ 283 284 static int __pg_init_all_paths(struct multipath *m) 285 { 286 struct pgpath *pgpath; 287 unsigned long pg_init_delay = 0; 288 289 lockdep_assert_held(&m->lock); 290 291 if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) 292 return 0; 293 294 atomic_inc(&m->pg_init_count); 295 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 296 297 /* Check here to reset pg_init_required */ 298 if (!m->current_pg) 299 return 0; 300 301 if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags)) 302 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? 303 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); 304 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 305 /* Skip failed paths */ 306 if (!pgpath->is_active) 307 continue; 308 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, 309 pg_init_delay)) 310 atomic_inc(&m->pg_init_in_progress); 311 } 312 return atomic_read(&m->pg_init_in_progress); 313 } 314 315 static int pg_init_all_paths(struct multipath *m) 316 { 317 int ret; 318 unsigned long flags; 319 320 spin_lock_irqsave(&m->lock, flags); 321 ret = __pg_init_all_paths(m); 322 spin_unlock_irqrestore(&m->lock, flags); 323 324 return ret; 325 } 326 327 static void __switch_pg(struct multipath *m, struct priority_group *pg) 328 { 329 m->current_pg = pg; 330 331 /* Must we initialise the PG first, and queue I/O till it's ready? 
*/ 332 if (m->hw_handler_name) { 333 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 334 set_bit(MPATHF_QUEUE_IO, &m->flags); 335 } else { 336 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 337 clear_bit(MPATHF_QUEUE_IO, &m->flags); 338 } 339 340 atomic_set(&m->pg_init_count, 0); 341 } 342 343 static struct pgpath *choose_path_in_pg(struct multipath *m, 344 struct priority_group *pg, 345 size_t nr_bytes) 346 { 347 unsigned long flags; 348 struct dm_path *path; 349 struct pgpath *pgpath; 350 351 path = pg->ps.type->select_path(&pg->ps, nr_bytes); 352 if (!path) 353 return ERR_PTR(-ENXIO); 354 355 pgpath = path_to_pgpath(path); 356 357 if (unlikely(READ_ONCE(m->current_pg) != pg)) { 358 /* Only update current_pgpath if pg changed */ 359 spin_lock_irqsave(&m->lock, flags); 360 m->current_pgpath = pgpath; 361 __switch_pg(m, pg); 362 spin_unlock_irqrestore(&m->lock, flags); 363 } 364 365 return pgpath; 366 } 367 368 static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) 369 { 370 unsigned long flags; 371 struct priority_group *pg; 372 struct pgpath *pgpath; 373 unsigned bypassed = 1; 374 375 if (!atomic_read(&m->nr_valid_paths)) { 376 clear_bit(MPATHF_QUEUE_IO, &m->flags); 377 goto failed; 378 } 379 380 /* Were we instructed to switch PG? */ 381 if (READ_ONCE(m->next_pg)) { 382 spin_lock_irqsave(&m->lock, flags); 383 pg = m->next_pg; 384 if (!pg) { 385 spin_unlock_irqrestore(&m->lock, flags); 386 goto check_current_pg; 387 } 388 m->next_pg = NULL; 389 spin_unlock_irqrestore(&m->lock, flags); 390 pgpath = choose_path_in_pg(m, pg, nr_bytes); 391 if (!IS_ERR_OR_NULL(pgpath)) 392 return pgpath; 393 } 394 395 /* Don't change PG until it has no remaining paths */ 396 check_current_pg: 397 pg = READ_ONCE(m->current_pg); 398 if (pg) { 399 pgpath = choose_path_in_pg(m, pg, nr_bytes); 400 if (!IS_ERR_OR_NULL(pgpath)) 401 return pgpath; 402 } 403 404 /* 405 * Loop through priority groups until we find a valid path. 406 * First time we skip PGs marked 'bypassed'. 
407 * Second time we only try the ones we skipped, but set 408 * pg_init_delay_retry so we do not hammer controllers. 409 */ 410 do { 411 list_for_each_entry(pg, &m->priority_groups, list) { 412 if (pg->bypassed == !!bypassed) 413 continue; 414 pgpath = choose_path_in_pg(m, pg, nr_bytes); 415 if (!IS_ERR_OR_NULL(pgpath)) { 416 if (!bypassed) 417 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 418 return pgpath; 419 } 420 } 421 } while (bypassed--); 422 423 failed: 424 spin_lock_irqsave(&m->lock, flags); 425 m->current_pgpath = NULL; 426 m->current_pg = NULL; 427 spin_unlock_irqrestore(&m->lock, flags); 428 429 return NULL; 430 } 431 432 /* 433 * dm_report_EIO() is a macro instead of a function to make pr_debug() 434 * report the function name and line number of the function from which 435 * it has been invoked. 436 */ 437 #define dm_report_EIO(m) \ 438 do { \ 439 struct mapped_device *md = dm_table_get_md((m)->ti->table); \ 440 \ 441 pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ 442 dm_device_name(md), \ 443 test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ 444 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ 445 dm_noflush_suspending((m)->ti)); \ 446 } while (0) 447 448 /* 449 * Check whether bios must be queued in the device-mapper core rather 450 * than here in the target. 451 * 452 * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold 453 * the same value then we are not between multipath_presuspend() 454 * and multipath_resume() calls and we have no need to check 455 * for the DMF_NOFLUSH_SUSPENDING flag. 
456 */ 457 static bool __must_push_back(struct multipath *m, unsigned long flags) 458 { 459 return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) != 460 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) && 461 dm_noflush_suspending(m->ti)); 462 } 463 464 /* 465 * Following functions use READ_ONCE to get atomic access to 466 * all m->flags to avoid taking spinlock 467 */ 468 static bool must_push_back_rq(struct multipath *m) 469 { 470 unsigned long flags = READ_ONCE(m->flags); 471 return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags); 472 } 473 474 static bool must_push_back_bio(struct multipath *m) 475 { 476 unsigned long flags = READ_ONCE(m->flags); 477 return __must_push_back(m, flags); 478 } 479 480 /* 481 * Map cloned requests (request-based multipath) 482 */ 483 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, 484 union map_info *map_context, 485 struct request **__clone) 486 { 487 struct multipath *m = ti->private; 488 size_t nr_bytes = blk_rq_bytes(rq); 489 struct pgpath *pgpath; 490 struct block_device *bdev; 491 struct dm_mpath_io *mpio = get_mpio(map_context); 492 struct request_queue *q; 493 struct request *clone; 494 495 /* Do we need to select a new pgpath? 
*/ 496 pgpath = READ_ONCE(m->current_pgpath); 497 if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) 498 pgpath = choose_pgpath(m, nr_bytes); 499 500 if (!pgpath) { 501 if (must_push_back_rq(m)) 502 return DM_MAPIO_DELAY_REQUEUE; 503 dm_report_EIO(m); /* Failed */ 504 return DM_MAPIO_KILL; 505 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || 506 test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 507 pg_init_all_paths(m); 508 return DM_MAPIO_DELAY_REQUEUE; 509 } 510 511 mpio->pgpath = pgpath; 512 mpio->nr_bytes = nr_bytes; 513 514 bdev = pgpath->path.dev->bdev; 515 q = bdev_get_queue(bdev); 516 clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, 517 BLK_MQ_REQ_NOWAIT); 518 if (IS_ERR(clone)) { 519 /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ 520 if (blk_queue_dying(q)) { 521 atomic_inc(&m->pg_init_in_progress); 522 activate_or_offline_path(pgpath); 523 return DM_MAPIO_DELAY_REQUEUE; 524 } 525 526 /* 527 * blk-mq's SCHED_RESTART can cover this requeue, so we 528 * needn't deal with it by DELAY_REQUEUE. More importantly, 529 * we have to return DM_MAPIO_REQUEUE so that blk-mq can 530 * get the queue busy feedback (via BLK_STS_RESOURCE), 531 * otherwise I/O merging can suffer. 
532 */ 533 return DM_MAPIO_REQUEUE; 534 } 535 clone->bio = clone->biotail = NULL; 536 clone->rq_disk = bdev->bd_disk; 537 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 538 *__clone = clone; 539 540 if (pgpath->pg->ps.type->start_io) 541 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, 542 &pgpath->path, 543 nr_bytes); 544 return DM_MAPIO_REMAPPED; 545 } 546 547 static void multipath_release_clone(struct request *clone, 548 union map_info *map_context) 549 { 550 if (unlikely(map_context)) { 551 /* 552 * non-NULL map_context means caller is still map 553 * method; must undo multipath_clone_and_map() 554 */ 555 struct dm_mpath_io *mpio = get_mpio(map_context); 556 struct pgpath *pgpath = mpio->pgpath; 557 558 if (pgpath && pgpath->pg->ps.type->end_io) 559 pgpath->pg->ps.type->end_io(&pgpath->pg->ps, 560 &pgpath->path, 561 mpio->nr_bytes); 562 } 563 564 blk_put_request(clone); 565 } 566 567 /* 568 * Map cloned bios (bio-based multipath) 569 */ 570 571 static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) 572 { 573 struct pgpath *pgpath; 574 unsigned long flags; 575 bool queue_io; 576 577 /* Do we need to select a new pgpath? 
*/ 578 pgpath = READ_ONCE(m->current_pgpath); 579 queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); 580 if (!pgpath || !queue_io) 581 pgpath = choose_pgpath(m, bio->bi_iter.bi_size); 582 583 if ((pgpath && queue_io) || 584 (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { 585 /* Queue for the daemon to resubmit */ 586 spin_lock_irqsave(&m->lock, flags); 587 bio_list_add(&m->queued_bios, bio); 588 spin_unlock_irqrestore(&m->lock, flags); 589 590 /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ 591 if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 592 pg_init_all_paths(m); 593 else if (!queue_io) 594 queue_work(kmultipathd, &m->process_queued_bios); 595 596 return ERR_PTR(-EAGAIN); 597 } 598 599 return pgpath; 600 } 601 602 static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio) 603 { 604 struct pgpath *pgpath; 605 unsigned long flags; 606 607 /* Do we need to select a new pgpath? */ 608 /* 609 * FIXME: currently only switching path if no path (due to failure, etc) 610 * - which negates the point of using a path selector 611 */ 612 pgpath = READ_ONCE(m->current_pgpath); 613 if (!pgpath) 614 pgpath = choose_pgpath(m, bio->bi_iter.bi_size); 615 616 if (!pgpath) { 617 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 618 /* Queue for the daemon to resubmit */ 619 spin_lock_irqsave(&m->lock, flags); 620 bio_list_add(&m->queued_bios, bio); 621 spin_unlock_irqrestore(&m->lock, flags); 622 queue_work(kmultipathd, &m->process_queued_bios); 623 624 return ERR_PTR(-EAGAIN); 625 } 626 return NULL; 627 } 628 629 return pgpath; 630 } 631 632 static int __multipath_map_bio(struct multipath *m, struct bio *bio, 633 struct dm_mpath_io *mpio) 634 { 635 struct pgpath *pgpath; 636 637 if (!m->hw_handler_name) 638 pgpath = __map_bio_fast(m, bio); 639 else 640 pgpath = __map_bio(m, bio); 641 642 if (IS_ERR(pgpath)) 643 return DM_MAPIO_SUBMITTED; 644 645 if (!pgpath) { 646 if (must_push_back_bio(m)) 647 return DM_MAPIO_REQUEUE; 648 
dm_report_EIO(m); 649 return DM_MAPIO_KILL; 650 } 651 652 mpio->pgpath = pgpath; 653 654 bio->bi_status = 0; 655 bio_set_dev(bio, pgpath->path.dev->bdev); 656 bio->bi_opf |= REQ_FAILFAST_TRANSPORT; 657 658 if (pgpath->pg->ps.type->start_io) 659 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, 660 &pgpath->path, 661 mpio->nr_bytes); 662 return DM_MAPIO_REMAPPED; 663 } 664 665 static int multipath_map_bio(struct dm_target *ti, struct bio *bio) 666 { 667 struct multipath *m = ti->private; 668 struct dm_mpath_io *mpio = NULL; 669 670 multipath_init_per_bio_data(bio, &mpio); 671 return __multipath_map_bio(m, bio, mpio); 672 } 673 674 static void process_queued_io_list(struct multipath *m) 675 { 676 if (m->queue_mode == DM_TYPE_REQUEST_BASED) 677 dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); 678 else if (m->queue_mode == DM_TYPE_BIO_BASED) 679 queue_work(kmultipathd, &m->process_queued_bios); 680 } 681 682 static void process_queued_bios(struct work_struct *work) 683 { 684 int r; 685 unsigned long flags; 686 struct bio *bio; 687 struct bio_list bios; 688 struct blk_plug plug; 689 struct multipath *m = 690 container_of(work, struct multipath, process_queued_bios); 691 692 bio_list_init(&bios); 693 694 spin_lock_irqsave(&m->lock, flags); 695 696 if (bio_list_empty(&m->queued_bios)) { 697 spin_unlock_irqrestore(&m->lock, flags); 698 return; 699 } 700 701 bio_list_merge(&bios, &m->queued_bios); 702 bio_list_init(&m->queued_bios); 703 704 spin_unlock_irqrestore(&m->lock, flags); 705 706 blk_start_plug(&plug); 707 while ((bio = bio_list_pop(&bios))) { 708 struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 709 dm_bio_restore(get_bio_details_from_mpio(mpio), bio); 710 r = __multipath_map_bio(m, bio, mpio); 711 switch (r) { 712 case DM_MAPIO_KILL: 713 bio->bi_status = BLK_STS_IOERR; 714 bio_endio(bio); 715 break; 716 case DM_MAPIO_REQUEUE: 717 bio->bi_status = BLK_STS_DM_REQUEUE; 718 bio_endio(bio); 719 break; 720 case DM_MAPIO_REMAPPED: 721 generic_make_request(bio); 722 
break; 723 case DM_MAPIO_SUBMITTED: 724 break; 725 default: 726 WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r); 727 } 728 } 729 blk_finish_plug(&plug); 730 } 731 732 /* 733 * If we run out of usable paths, should we queue I/O or error it? 734 */ 735 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, 736 bool save_old_value) 737 { 738 unsigned long flags; 739 740 spin_lock_irqsave(&m->lock, flags); 741 assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, 742 (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || 743 (!save_old_value && queue_if_no_path)); 744 assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); 745 spin_unlock_irqrestore(&m->lock, flags); 746 747 if (!queue_if_no_path) { 748 dm_table_run_md_queue_async(m->ti->table); 749 process_queued_io_list(m); 750 } 751 752 return 0; 753 } 754 755 /* 756 * An event is triggered whenever a path is taken out of use. 757 * Includes path failure and PG bypass. 758 */ 759 static void trigger_event(struct work_struct *work) 760 { 761 struct multipath *m = 762 container_of(work, struct multipath, trigger_event); 763 764 dm_table_event(m->ti->table); 765 } 766 767 /*----------------------------------------------------------------- 768 * Constructor/argument parsing: 769 * <#multipath feature args> [<arg>]* 770 * <#hw_handler args> [hw_handler [<arg>]*] 771 * <#priority groups> 772 * <initial priority group> 773 * [<selector> <#selector args> [<arg>]* 774 * <#paths> <#per-path selector args> 775 * [<path> [<arg>]* ]+ ]+ 776 *---------------------------------------------------------------*/ 777 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, 778 struct dm_target *ti) 779 { 780 int r; 781 struct path_selector_type *pst; 782 unsigned ps_argc; 783 784 static const struct dm_arg _args[] = { 785 {0, 1024, "invalid number of path selector args"}, 786 }; 787 788 pst = dm_get_path_selector(dm_shift_arg(as)); 789 if (!pst) { 790 ti->error = 
"unknown path selector type"; 791 return -EINVAL; 792 } 793 794 r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); 795 if (r) { 796 dm_put_path_selector(pst); 797 return -EINVAL; 798 } 799 800 r = pst->create(&pg->ps, ps_argc, as->argv); 801 if (r) { 802 dm_put_path_selector(pst); 803 ti->error = "path selector constructor failed"; 804 return r; 805 } 806 807 pg->ps.type = pst; 808 dm_consume_args(as, ps_argc); 809 810 return 0; 811 } 812 813 static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, 814 const char **attached_handler_name, char **error) 815 { 816 struct request_queue *q = bdev_get_queue(bdev); 817 int r; 818 819 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { 820 retain: 821 if (*attached_handler_name) { 822 /* 823 * Clear any hw_handler_params associated with a 824 * handler that isn't already attached. 825 */ 826 if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) { 827 kfree(m->hw_handler_params); 828 m->hw_handler_params = NULL; 829 } 830 831 /* 832 * Reset hw_handler_name to match the attached handler 833 * 834 * NB. This modifies the table line to show the actual 835 * handler instead of the original table passed in. 
836 */ 837 kfree(m->hw_handler_name); 838 m->hw_handler_name = *attached_handler_name; 839 *attached_handler_name = NULL; 840 } 841 } 842 843 if (m->hw_handler_name) { 844 r = scsi_dh_attach(q, m->hw_handler_name); 845 if (r == -EBUSY) { 846 char b[BDEVNAME_SIZE]; 847 848 printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", 849 bdevname(bdev, b)); 850 goto retain; 851 } 852 if (r < 0) { 853 *error = "error attaching hardware handler"; 854 return r; 855 } 856 857 if (m->hw_handler_params) { 858 r = scsi_dh_set_params(q, m->hw_handler_params); 859 if (r < 0) { 860 *error = "unable to set hardware handler parameters"; 861 return r; 862 } 863 } 864 } 865 866 return 0; 867 } 868 869 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, 870 struct dm_target *ti) 871 { 872 int r; 873 struct pgpath *p; 874 struct multipath *m = ti->private; 875 struct request_queue *q; 876 const char *attached_handler_name = NULL; 877 878 /* we need at least a path arg */ 879 if (as->argc < 1) { 880 ti->error = "no device given"; 881 return ERR_PTR(-EINVAL); 882 } 883 884 p = alloc_pgpath(); 885 if (!p) 886 return ERR_PTR(-ENOMEM); 887 888 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 889 &p->path.dev); 890 if (r) { 891 ti->error = "error getting device"; 892 goto bad; 893 } 894 895 q = bdev_get_queue(p->path.dev->bdev); 896 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); 897 if (attached_handler_name || m->hw_handler_name) { 898 INIT_DELAYED_WORK(&p->activate_path, activate_path_work); 899 r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); 900 kfree(attached_handler_name); 901 if (r) { 902 dm_put_device(ti, p->path.dev); 903 goto bad; 904 } 905 } 906 907 r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); 908 if (r) { 909 dm_put_device(ti, p->path.dev); 910 goto bad; 911 } 912 913 return p; 914 bad: 915 free_pgpath(p); 916 return ERR_PTR(r); 917 } 918 919 static 
struct priority_group *parse_priority_group(struct dm_arg_set *as, 920 struct multipath *m) 921 { 922 static const struct dm_arg _args[] = { 923 {1, 1024, "invalid number of paths"}, 924 {0, 1024, "invalid number of selector args"} 925 }; 926 927 int r; 928 unsigned i, nr_selector_args, nr_args; 929 struct priority_group *pg; 930 struct dm_target *ti = m->ti; 931 932 if (as->argc < 2) { 933 as->argc = 0; 934 ti->error = "not enough priority group arguments"; 935 return ERR_PTR(-EINVAL); 936 } 937 938 pg = alloc_priority_group(); 939 if (!pg) { 940 ti->error = "couldn't allocate priority group"; 941 return ERR_PTR(-ENOMEM); 942 } 943 pg->m = m; 944 945 r = parse_path_selector(as, pg, ti); 946 if (r) 947 goto bad; 948 949 /* 950 * read the paths 951 */ 952 r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); 953 if (r) 954 goto bad; 955 956 r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); 957 if (r) 958 goto bad; 959 960 nr_args = 1 + nr_selector_args; 961 for (i = 0; i < pg->nr_pgpaths; i++) { 962 struct pgpath *pgpath; 963 struct dm_arg_set path_args; 964 965 if (as->argc < nr_args) { 966 ti->error = "not enough path parameters"; 967 r = -EINVAL; 968 goto bad; 969 } 970 971 path_args.argc = nr_args; 972 path_args.argv = as->argv; 973 974 pgpath = parse_path(&path_args, &pg->ps, ti); 975 if (IS_ERR(pgpath)) { 976 r = PTR_ERR(pgpath); 977 goto bad; 978 } 979 980 pgpath->pg = pg; 981 list_add_tail(&pgpath->list, &pg->pgpaths); 982 dm_consume_args(as, nr_args); 983 } 984 985 return pg; 986 987 bad: 988 free_priority_group(pg, ti); 989 return ERR_PTR(r); 990 } 991 992 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) 993 { 994 unsigned hw_argc; 995 int ret; 996 struct dm_target *ti = m->ti; 997 998 static const struct dm_arg _args[] = { 999 {0, 1024, "invalid number of hardware handler args"}, 1000 }; 1001 1002 if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) 1003 return -EINVAL; 1004 1005 if (!hw_argc) 1006 return 0; 1007 
1008 if (m->queue_mode == DM_TYPE_BIO_BASED) { 1009 dm_consume_args(as, hw_argc); 1010 DMERR("bio-based multipath doesn't allow hardware handler args"); 1011 return 0; 1012 } 1013 1014 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); 1015 if (!m->hw_handler_name) 1016 return -EINVAL; 1017 1018 if (hw_argc > 1) { 1019 char *p; 1020 int i, j, len = 4; 1021 1022 for (i = 0; i <= hw_argc - 2; i++) 1023 len += strlen(as->argv[i]) + 1; 1024 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL); 1025 if (!p) { 1026 ti->error = "memory allocation failed"; 1027 ret = -ENOMEM; 1028 goto fail; 1029 } 1030 j = sprintf(p, "%d", hw_argc - 1); 1031 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) 1032 j = sprintf(p, "%s", as->argv[i]); 1033 } 1034 dm_consume_args(as, hw_argc - 1); 1035 1036 return 0; 1037 fail: 1038 kfree(m->hw_handler_name); 1039 m->hw_handler_name = NULL; 1040 return ret; 1041 } 1042 1043 static int parse_features(struct dm_arg_set *as, struct multipath *m) 1044 { 1045 int r; 1046 unsigned argc; 1047 struct dm_target *ti = m->ti; 1048 const char *arg_name; 1049 1050 static const struct dm_arg _args[] = { 1051 {0, 8, "invalid number of feature args"}, 1052 {1, 50, "pg_init_retries must be between 1 and 50"}, 1053 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 1054 }; 1055 1056 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1057 if (r) 1058 return -EINVAL; 1059 1060 if (!argc) 1061 return 0; 1062 1063 do { 1064 arg_name = dm_shift_arg(as); 1065 argc--; 1066 1067 if (!strcasecmp(arg_name, "queue_if_no_path")) { 1068 r = queue_if_no_path(m, true, false); 1069 continue; 1070 } 1071 1072 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { 1073 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); 1074 continue; 1075 } 1076 1077 if (!strcasecmp(arg_name, "pg_init_retries") && 1078 (argc >= 1)) { 1079 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); 1080 argc--; 1081 continue; 1082 } 1083 1084 if 
(!strcasecmp(arg_name, "pg_init_delay_msecs") &&
		    (argc >= 1)) {
			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
			argc--;
			continue;
		}

		if (!strcasecmp(arg_name, "queue_mode") &&
		    (argc >= 1)) {
			const char *queue_mode_name = dm_shift_arg(as);

			/* "rq" and "mq" both select the request-based queue mode. */
			if (!strcasecmp(queue_mode_name, "bio"))
				m->queue_mode = DM_TYPE_BIO_BASED;
			else if (!strcasecmp(queue_mode_name, "rq") ||
				 !strcasecmp(queue_mode_name, "mq"))
				m->queue_mode = DM_TYPE_REQUEST_BASED;
			else {
				ti->error = "Unknown 'queue_mode' requested";
				r = -EINVAL;
			}
			argc--;
			continue;
		}

		ti->error = "Unrecognised multipath feature request";
		r = -EINVAL;
	} while (argc && !r);

	return r;
}

/*
 * Target constructor.
 *
 * Consumes the table arguments in this order: feature args, hardware
 * handler args, the number of priority groups, the (1-based) number of the
 * initial priority group, and then one description per priority group.
 *
 * Returns 0 on success.  On any failure the partially built multipath is
 * released with free_multipath(), ti->error describes the problem and a
 * negative errno is returned (-EINVAL when the multipath itself could not
 * be allocated).
 */
static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	/* target arguments */
	static const struct dm_arg _args[] = {
		{0, 1024, "invalid number of priority groups"},
		{0, 1024, "invalid initial priority group number"},
	};

	int r;
	struct multipath *m;
	struct dm_arg_set as;
	unsigned pg_count = 0;
	unsigned next_pg_num;

	as.argc = argc;
	as.argv = argv;

	m = alloc_multipath(ti);
	if (!m) {
		ti->error = "can't allocate multipath";
		return -EINVAL;
	}

	r = parse_features(&as, m);
	if (r)
		goto bad;

	r = alloc_multipath_stage2(ti, m);
	if (r)
		goto bad;

	r = parse_hw_handler(&as, m);
	if (r)
		goto bad;

	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
	if (r)
		goto bad;

	/*
	 * The group count and the initial group number must be either both
	 * zero or both non-zero.
	 */
	if ((!m->nr_priority_groups && next_pg_num) ||
	    (m->nr_priority_groups && !next_pg_num)) {
		ti->error = "invalid initial priority group";
		r = -EINVAL;
		goto bad;
	}

	/* parse the priority groups */
	while (as.argc) {
		struct priority_group *pg;
		unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);

		pg = parse_priority_group(&as, m);
		if (IS_ERR(pg)) {
			r = PTR_ERR(pg);
			goto bad;
		}

		/* Every path of a freshly parsed group is counted as valid. */
		nr_valid_paths += pg->nr_pgpaths;
		atomic_set(&m->nr_valid_paths, nr_valid_paths);

		list_add_tail(&pg->list, &m->priority_groups);
		pg_count++;
		pg->pg_num = pg_count;
		/* The countdown hits zero on the requested initial group. */
		if (!--next_pg_num)
			m->next_pg = pg;
	}

	if (pg_count != m->nr_priority_groups) {
		ti->error = "priority group count mismatch";
		r = -EINVAL;
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
	ti->num_write_zeroes_bios = 1;
	/* Per-io data size depends on whether we run bio- or request-based. */
	if (m->queue_mode == DM_TYPE_BIO_BASED)
		ti->per_io_data_size = multipath_per_bio_data_size();
	else
		ti->per_io_data_size = sizeof(struct dm_mpath_io);

	return 0;

bad:
	free_multipath(m);
	return r;
}

/*
 * Sleep (uninterruptibly) on m->pg_init_wait until pg_init_in_progress
 * reads as zero.
 */
static void multipath_wait_for_pg_init_completion(struct multipath *m)
{
	DEFINE_WAIT(wait);

	while (1) {
		/* Queue ourselves before re-checking so a wake-up is not missed. */
		prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&m->pg_init_in_progress))
			break;

		io_schedule();
	}
	finish_wait(&m->pg_init_wait, &wait);
}

/*
 * Wait for all deferred work belonging to this multipath to finish.
 *
 * When a hardware handler is configured, MPATHF_PG_INIT_DISABLED is set
 * for the duration of the wait and cleared again afterwards.
 */
static void flush_multipath_work(struct multipath *m)
{
	if (m->hw_handler_name) {
		set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
		smp_mb__after_atomic();

		/* Only flush the handler workqueue when a pg_init is pending. */
		if (atomic_read(&m->pg_init_in_progress))
			flush_workqueue(kmpath_handlerd);
		multipath_wait_for_pg_init_completion(m);

		clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
		smp_mb__after_atomic();
	}

	/* process_queued_bios is only flushed in bio-based mode. */
	if (m->queue_mode == DM_TYPE_BIO_BASED)
		flush_work(&m->process_queued_bios);
	flush_work(&m->trigger_event);
}

/*
 * Target destructor: flush outstanding work, then free the multipath.
 */
static void multipath_dtr(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	flush_multipath_work(m);
	free_multipath(m);
}

/*
 * Take a path out of use.
 *
 * Does nothing if the path is already inactive.  Otherwise the path
 * selector is told about the failure, the path is marked inactive, its
 * fail_count and the multipath's nr_valid_paths are updated, a
 * DM_UEVENT_PATH_FAILED uevent is emitted and trigger_event is scheduled.
 * Always returns 0.
 */
static int fail_path(struct pgpath *pgpath)
{
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	if (!pgpath->is_active)
		goto out;

	DMWARN("Failing path %s.", pgpath->path.dev->name);

	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
	pgpath->is_active = false;
	pgpath->fail_count++;

	atomic_dec(&m->nr_valid_paths);

	/* Drop the cached path if it is the one that just failed. */
	if (pgpath == m->current_pgpath)
		m->current_pgpath = NULL;

	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
		       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);

	return 0;
}

/*
 * Reinstate a previously-failed path
 *
 * Returns 0 if the path was already active, otherwise the result of the
 * path selector's reinstate_path() hook (non-zero leaves the path failed).
 */
static int reinstate_path(struct pgpath *pgpath)
{
	int r = 0, run_queue = 0;
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;
	unsigned nr_valid_paths;

	spin_lock_irqsave(&m->lock, flags);

	if (pgpath->is_active)
		goto out;

	DMWARN("Reinstating path %s.", pgpath->path.dev->name);

	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
	if (r)
		goto out;

	pgpath->is_active = true;

	nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
	if (nr_valid_paths == 1) {
		/* First usable path: forget the cached path and rerun queues. */
		m->current_pgpath = NULL;
		run_queue = 1;
	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
		/*
		 * Path belongs to the current group and a hardware handler is
		 * in use: queue its activation, accounting for it only when
		 * queue_work() reports that it queued the work.
		 */
		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
			atomic_inc(&m->pg_init_in_progress);
	}

	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
		       pgpath->path.dev->name, nr_valid_paths);

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);
	/* Queue kicking is done after m->lock has been dropped. */
	if (run_queue) {
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_io_list(m);
	}

	return r;
}

/*
 * Fail or reinstate all paths that match the provided struct dm_dev.
 *
 * Returns -EINVAL if no path uses @dev, otherwise the return value of the
 * last @action call that was made.
 */
static int action_dev(struct multipath *m, struct dm_dev *dev,
		      action_fn action)
{
	int r = -EINVAL;
	struct pgpath *pgpath;
	struct priority_group *pg;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(pgpath, &pg->pgpaths, list) {
			if (pgpath->path.dev == dev)
				r = action(pgpath);
		}
	}

	return r;
}

/*
 * Temporarily try to avoid having to use the specified PG
 *
 * Sets or clears pg->bypassed, clears the cached current path and group
 * under m->lock, then schedules trigger_event.
 */
static void bypass_pg(struct multipath *m, struct priority_group *pg,
		      bool bypassed)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	pg->bypassed = bypassed;
	m->current_pgpath = NULL;
	m->current_pg = NULL;

	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
}

/*
 * Switch to using the specified PG from the next I/O that gets mapped
 *
 * @pgstr must be a decimal group number in 1..nr_priority_groups with no
 * trailing characters; otherwise -EINVAL is returned.  Returns 0 on
 * success.
 */
static int switch_pg_num(struct multipath *m, const char *pgstr)
{
	struct priority_group *pg;
	unsigned pgnum;
	unsigned long flags;
	char dummy;

	/* sscanf() must match the number only: a trailing char makes it 2. */
	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to switch_pg_num");
		return -EINVAL;
	}

	spin_lock_irqsave(&m->lock, flags);
	list_for_each_entry(pg, &m->priority_groups, list) {
		/*
		 * The loop deliberately visits every group: bypassed is
		 * cleared on each one, and only the group on which the
		 * countdown reaches zero is installed as next_pg.
		 */
		pg->bypassed = false;
		if (--pgnum)
			continue;

		m->current_pgpath = NULL;
		m->current_pg = NULL;
		m->next_pg = pg;
	}
	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
	return 0;
}
1402 1403 /* 1404 * Set/clear bypassed status of a PG. 1405 * PGs are numbered upwards from 1 in the order they were declared. 1406 */ 1407 static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) 1408 { 1409 struct priority_group *pg; 1410 unsigned pgnum; 1411 char dummy; 1412 1413 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1414 !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { 1415 DMWARN("invalid PG number supplied to bypass_pg"); 1416 return -EINVAL; 1417 } 1418 1419 list_for_each_entry(pg, &m->priority_groups, list) { 1420 if (!--pgnum) 1421 break; 1422 } 1423 1424 bypass_pg(m, pg, bypassed); 1425 return 0; 1426 } 1427 1428 /* 1429 * Should we retry pg_init immediately? 1430 */ 1431 static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) 1432 { 1433 unsigned long flags; 1434 bool limit_reached = false; 1435 1436 spin_lock_irqsave(&m->lock, flags); 1437 1438 if (atomic_read(&m->pg_init_count) <= m->pg_init_retries && 1439 !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) 1440 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 1441 else 1442 limit_reached = true; 1443 1444 spin_unlock_irqrestore(&m->lock, flags); 1445 1446 return limit_reached; 1447 } 1448 1449 static void pg_init_done(void *data, int errors) 1450 { 1451 struct pgpath *pgpath = data; 1452 struct priority_group *pg = pgpath->pg; 1453 struct multipath *m = pg->m; 1454 unsigned long flags; 1455 bool delay_retry = false; 1456 1457 /* device or driver problems */ 1458 switch (errors) { 1459 case SCSI_DH_OK: 1460 break; 1461 case SCSI_DH_NOSYS: 1462 if (!m->hw_handler_name) { 1463 errors = 0; 1464 break; 1465 } 1466 DMERR("Could not failover the device: Handler scsi_dh_%s " 1467 "Error %d.", m->hw_handler_name, errors); 1468 /* 1469 * Fail path for now, so we do not ping pong 1470 */ 1471 fail_path(pgpath); 1472 break; 1473 case SCSI_DH_DEV_TEMP_BUSY: 1474 /* 1475 * Probably doing something like FW upgrade on the 1476 * 
controller so try the other pg. 1477 */ 1478 bypass_pg(m, pg, true); 1479 break; 1480 case SCSI_DH_RETRY: 1481 /* Wait before retrying. */ 1482 delay_retry = 1; 1483 /* fall through */ 1484 case SCSI_DH_IMM_RETRY: 1485 case SCSI_DH_RES_TEMP_UNAVAIL: 1486 if (pg_init_limit_reached(m, pgpath)) 1487 fail_path(pgpath); 1488 errors = 0; 1489 break; 1490 case SCSI_DH_DEV_OFFLINED: 1491 default: 1492 /* 1493 * We probably do not want to fail the path for a device 1494 * error, but this is what the old dm did. In future 1495 * patches we can do more advanced handling. 1496 */ 1497 fail_path(pgpath); 1498 } 1499 1500 spin_lock_irqsave(&m->lock, flags); 1501 if (errors) { 1502 if (pgpath == m->current_pgpath) { 1503 DMERR("Could not failover device. Error %d.", errors); 1504 m->current_pgpath = NULL; 1505 m->current_pg = NULL; 1506 } 1507 } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 1508 pg->bypassed = false; 1509 1510 if (atomic_dec_return(&m->pg_init_in_progress) > 0) 1511 /* Activations of other paths are still on going */ 1512 goto out; 1513 1514 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 1515 if (delay_retry) 1516 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 1517 else 1518 clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 1519 1520 if (__pg_init_all_paths(m)) 1521 goto out; 1522 } 1523 clear_bit(MPATHF_QUEUE_IO, &m->flags); 1524 1525 process_queued_io_list(m); 1526 1527 /* 1528 * Wake up any thread waiting to suspend. 
1529 */ 1530 wake_up(&m->pg_init_wait); 1531 1532 out: 1533 spin_unlock_irqrestore(&m->lock, flags); 1534 } 1535 1536 static void activate_or_offline_path(struct pgpath *pgpath) 1537 { 1538 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1539 1540 if (pgpath->is_active && !blk_queue_dying(q)) 1541 scsi_dh_activate(q, pg_init_done, pgpath); 1542 else 1543 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); 1544 } 1545 1546 static void activate_path_work(struct work_struct *work) 1547 { 1548 struct pgpath *pgpath = 1549 container_of(work, struct pgpath, activate_path.work); 1550 1551 activate_or_offline_path(pgpath); 1552 } 1553 1554 static int multipath_end_io(struct dm_target *ti, struct request *clone, 1555 blk_status_t error, union map_info *map_context) 1556 { 1557 struct dm_mpath_io *mpio = get_mpio(map_context); 1558 struct pgpath *pgpath = mpio->pgpath; 1559 int r = DM_ENDIO_DONE; 1560 1561 /* 1562 * We don't queue any clone request inside the multipath target 1563 * during end I/O handling, since those clone requests don't have 1564 * bio clones. If we queue them inside the multipath target, 1565 * we need to make bio clones, that requires memory allocation. 1566 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests 1567 * don't have bio clones.) 1568 * Instead of queueing the clone request here, we queue the original 1569 * request into dm core, which will remake a clone request and 1570 * clone bios for it and resubmit it later. 
1571 */ 1572 if (error && blk_path_error(error)) { 1573 struct multipath *m = ti->private; 1574 1575 if (error == BLK_STS_RESOURCE) 1576 r = DM_ENDIO_DELAY_REQUEUE; 1577 else 1578 r = DM_ENDIO_REQUEUE; 1579 1580 if (pgpath) 1581 fail_path(pgpath); 1582 1583 if (atomic_read(&m->nr_valid_paths) == 0 && 1584 !must_push_back_rq(m)) { 1585 if (error == BLK_STS_IOERR) 1586 dm_report_EIO(m); 1587 /* complete with the original error */ 1588 r = DM_ENDIO_DONE; 1589 } 1590 } 1591 1592 if (pgpath) { 1593 struct path_selector *ps = &pgpath->pg->ps; 1594 1595 if (ps->type->end_io) 1596 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1597 } 1598 1599 return r; 1600 } 1601 1602 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, 1603 blk_status_t *error) 1604 { 1605 struct multipath *m = ti->private; 1606 struct dm_mpath_io *mpio = get_mpio_from_bio(clone); 1607 struct pgpath *pgpath = mpio->pgpath; 1608 unsigned long flags; 1609 int r = DM_ENDIO_DONE; 1610 1611 if (!*error || !blk_path_error(*error)) 1612 goto done; 1613 1614 if (pgpath) 1615 fail_path(pgpath); 1616 1617 if (atomic_read(&m->nr_valid_paths) == 0 && 1618 !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 1619 if (must_push_back_bio(m)) { 1620 r = DM_ENDIO_REQUEUE; 1621 } else { 1622 dm_report_EIO(m); 1623 *error = BLK_STS_IOERR; 1624 } 1625 goto done; 1626 } 1627 1628 spin_lock_irqsave(&m->lock, flags); 1629 bio_list_add(&m->queued_bios, clone); 1630 spin_unlock_irqrestore(&m->lock, flags); 1631 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) 1632 queue_work(kmultipathd, &m->process_queued_bios); 1633 1634 r = DM_ENDIO_INCOMPLETE; 1635 done: 1636 if (pgpath) { 1637 struct path_selector *ps = &pgpath->pg->ps; 1638 1639 if (ps->type->end_io) 1640 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1641 } 1642 1643 return r; 1644 } 1645 1646 /* 1647 * Suspend can't complete until all the I/O is processed so if 1648 * the last path fails we must error any remaining I/O. 
1649 * Note that if the freeze_bdev fails while suspending, the 1650 * queue_if_no_path state is lost - userspace should reset it. 1651 */ 1652 static void multipath_presuspend(struct dm_target *ti) 1653 { 1654 struct multipath *m = ti->private; 1655 1656 queue_if_no_path(m, false, true); 1657 } 1658 1659 static void multipath_postsuspend(struct dm_target *ti) 1660 { 1661 struct multipath *m = ti->private; 1662 1663 mutex_lock(&m->work_mutex); 1664 flush_multipath_work(m); 1665 mutex_unlock(&m->work_mutex); 1666 } 1667 1668 /* 1669 * Restore the queue_if_no_path setting. 1670 */ 1671 static void multipath_resume(struct dm_target *ti) 1672 { 1673 struct multipath *m = ti->private; 1674 unsigned long flags; 1675 1676 spin_lock_irqsave(&m->lock, flags); 1677 assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, 1678 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); 1679 spin_unlock_irqrestore(&m->lock, flags); 1680 } 1681 1682 /* 1683 * Info output has the following format: 1684 * num_multipath_feature_args [multipath_feature_args]* 1685 * num_handler_status_args [handler_status_args]* 1686 * num_groups init_group_number 1687 * [A|D|E num_ps_status_args [ps_status_args]* 1688 * num_paths num_selector_args 1689 * [path_dev A|F fail_count [selector_args]* ]+ ]+ 1690 * 1691 * Table output has the following format (identical to the constructor string): 1692 * num_feature_args [features_args]* 1693 * num_handler_args hw_handler [hw_handler_args]* 1694 * num_groups init_group_number 1695 * [priority selector-name num_ps_args [ps_args]* 1696 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ 1697 */ 1698 static void multipath_status(struct dm_target *ti, status_type_t type, 1699 unsigned status_flags, char *result, unsigned maxlen) 1700 { 1701 int sz = 0; 1702 unsigned long flags; 1703 struct multipath *m = ti->private; 1704 struct priority_group *pg; 1705 struct pgpath *p; 1706 unsigned pg_num; 1707 char state; 1708 1709 spin_lock_irqsave(&m->lock, flags); 1710 1711 
/* Features */ 1712 if (type == STATUSTYPE_INFO) 1713 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), 1714 atomic_read(&m->pg_init_count)); 1715 else { 1716 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + 1717 (m->pg_init_retries > 0) * 2 + 1718 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + 1719 test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) + 1720 (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2); 1721 1722 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1723 DMEMIT("queue_if_no_path "); 1724 if (m->pg_init_retries) 1725 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1726 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) 1727 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); 1728 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) 1729 DMEMIT("retain_attached_hw_handler "); 1730 if (m->queue_mode != DM_TYPE_REQUEST_BASED) { 1731 switch(m->queue_mode) { 1732 case DM_TYPE_BIO_BASED: 1733 DMEMIT("queue_mode bio "); 1734 break; 1735 default: 1736 WARN_ON_ONCE(true); 1737 break; 1738 } 1739 } 1740 } 1741 1742 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1743 DMEMIT("0 "); 1744 else 1745 DMEMIT("1 %s ", m->hw_handler_name); 1746 1747 DMEMIT("%u ", m->nr_priority_groups); 1748 1749 if (m->next_pg) 1750 pg_num = m->next_pg->pg_num; 1751 else if (m->current_pg) 1752 pg_num = m->current_pg->pg_num; 1753 else 1754 pg_num = (m->nr_priority_groups ? 
1 : 0); 1755 1756 DMEMIT("%u ", pg_num); 1757 1758 switch (type) { 1759 case STATUSTYPE_INFO: 1760 list_for_each_entry(pg, &m->priority_groups, list) { 1761 if (pg->bypassed) 1762 state = 'D'; /* Disabled */ 1763 else if (pg == m->current_pg) 1764 state = 'A'; /* Currently Active */ 1765 else 1766 state = 'E'; /* Enabled */ 1767 1768 DMEMIT("%c ", state); 1769 1770 if (pg->ps.type->status) 1771 sz += pg->ps.type->status(&pg->ps, NULL, type, 1772 result + sz, 1773 maxlen - sz); 1774 else 1775 DMEMIT("0 "); 1776 1777 DMEMIT("%u %u ", pg->nr_pgpaths, 1778 pg->ps.type->info_args); 1779 1780 list_for_each_entry(p, &pg->pgpaths, list) { 1781 DMEMIT("%s %s %u ", p->path.dev->name, 1782 p->is_active ? "A" : "F", 1783 p->fail_count); 1784 if (pg->ps.type->status) 1785 sz += pg->ps.type->status(&pg->ps, 1786 &p->path, type, result + sz, 1787 maxlen - sz); 1788 } 1789 } 1790 break; 1791 1792 case STATUSTYPE_TABLE: 1793 list_for_each_entry(pg, &m->priority_groups, list) { 1794 DMEMIT("%s ", pg->ps.type->name); 1795 1796 if (pg->ps.type->status) 1797 sz += pg->ps.type->status(&pg->ps, NULL, type, 1798 result + sz, 1799 maxlen - sz); 1800 else 1801 DMEMIT("0 "); 1802 1803 DMEMIT("%u %u ", pg->nr_pgpaths, 1804 pg->ps.type->table_args); 1805 1806 list_for_each_entry(p, &pg->pgpaths, list) { 1807 DMEMIT("%s ", p->path.dev->name); 1808 if (pg->ps.type->status) 1809 sz += pg->ps.type->status(&pg->ps, 1810 &p->path, type, result + sz, 1811 maxlen - sz); 1812 } 1813 } 1814 break; 1815 } 1816 1817 spin_unlock_irqrestore(&m->lock, flags); 1818 } 1819 1820 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, 1821 char *result, unsigned maxlen) 1822 { 1823 int r = -EINVAL; 1824 struct dm_dev *dev; 1825 struct multipath *m = ti->private; 1826 action_fn action; 1827 1828 mutex_lock(&m->work_mutex); 1829 1830 if (dm_suspended(ti)) { 1831 r = -EBUSY; 1832 goto out; 1833 } 1834 1835 if (argc == 1) { 1836 if (!strcasecmp(argv[0], "queue_if_no_path")) { 1837 r = 
queue_if_no_path(m, true, false); 1838 goto out; 1839 } else if (!strcasecmp(argv[0], "fail_if_no_path")) { 1840 r = queue_if_no_path(m, false, false); 1841 goto out; 1842 } 1843 } 1844 1845 if (argc != 2) { 1846 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc); 1847 goto out; 1848 } 1849 1850 if (!strcasecmp(argv[0], "disable_group")) { 1851 r = bypass_pg_num(m, argv[1], true); 1852 goto out; 1853 } else if (!strcasecmp(argv[0], "enable_group")) { 1854 r = bypass_pg_num(m, argv[1], false); 1855 goto out; 1856 } else if (!strcasecmp(argv[0], "switch_group")) { 1857 r = switch_pg_num(m, argv[1]); 1858 goto out; 1859 } else if (!strcasecmp(argv[0], "reinstate_path")) 1860 action = reinstate_path; 1861 else if (!strcasecmp(argv[0], "fail_path")) 1862 action = fail_path; 1863 else { 1864 DMWARN("Unrecognised multipath message received: %s", argv[0]); 1865 goto out; 1866 } 1867 1868 r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); 1869 if (r) { 1870 DMWARN("message: error getting device %s", 1871 argv[1]); 1872 goto out; 1873 } 1874 1875 r = action_dev(m, dev, action); 1876 1877 dm_put_device(ti, dev); 1878 1879 out: 1880 mutex_unlock(&m->work_mutex); 1881 return r; 1882 } 1883 1884 static int multipath_prepare_ioctl(struct dm_target *ti, 1885 struct block_device **bdev) 1886 { 1887 struct multipath *m = ti->private; 1888 struct pgpath *current_pgpath; 1889 int r; 1890 1891 current_pgpath = READ_ONCE(m->current_pgpath); 1892 if (!current_pgpath) 1893 current_pgpath = choose_pgpath(m, 0); 1894 1895 if (current_pgpath) { 1896 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { 1897 *bdev = current_pgpath->path.dev->bdev; 1898 r = 0; 1899 } else { 1900 /* pg_init has not started or completed */ 1901 r = -ENOTCONN; 1902 } 1903 } else { 1904 /* No path is available */ 1905 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1906 r = -ENOTCONN; 1907 else 1908 r = -EIO; 1909 } 1910 1911 if (r == -ENOTCONN) { 1912 if 
(!READ_ONCE(m->current_pg)) { 1913 /* Path status changed, redo selection */ 1914 (void) choose_pgpath(m, 0); 1915 } 1916 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 1917 pg_init_all_paths(m); 1918 dm_table_run_md_queue_async(m->ti->table); 1919 process_queued_io_list(m); 1920 } 1921 1922 /* 1923 * Only pass ioctls through if the device sizes match exactly. 1924 */ 1925 if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) 1926 return 1; 1927 return r; 1928 } 1929 1930 static int multipath_iterate_devices(struct dm_target *ti, 1931 iterate_devices_callout_fn fn, void *data) 1932 { 1933 struct multipath *m = ti->private; 1934 struct priority_group *pg; 1935 struct pgpath *p; 1936 int ret = 0; 1937 1938 list_for_each_entry(pg, &m->priority_groups, list) { 1939 list_for_each_entry(p, &pg->pgpaths, list) { 1940 ret = fn(ti, p->path.dev, ti->begin, ti->len, data); 1941 if (ret) 1942 goto out; 1943 } 1944 } 1945 1946 out: 1947 return ret; 1948 } 1949 1950 static int pgpath_busy(struct pgpath *pgpath) 1951 { 1952 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1953 1954 return blk_lld_busy(q); 1955 } 1956 1957 /* 1958 * We return "busy", only when we can map I/Os but underlying devices 1959 * are busy (so even if we map I/Os now, the I/Os will wait on 1960 * the underlying queue). 1961 * In other words, if we want to kill I/Os or queue them inside us 1962 * due to map unavailability, we don't return "busy". Otherwise, 1963 * dm core won't give us the I/Os and we can't do what we want. 
1964 */ 1965 static int multipath_busy(struct dm_target *ti) 1966 { 1967 bool busy = false, has_active = false; 1968 struct multipath *m = ti->private; 1969 struct priority_group *pg, *next_pg; 1970 struct pgpath *pgpath; 1971 1972 /* pg_init in progress */ 1973 if (atomic_read(&m->pg_init_in_progress)) 1974 return true; 1975 1976 /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ 1977 if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1978 return (m->queue_mode != DM_TYPE_REQUEST_BASED); 1979 1980 /* Guess which priority_group will be used at next mapping time */ 1981 pg = READ_ONCE(m->current_pg); 1982 next_pg = READ_ONCE(m->next_pg); 1983 if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) 1984 pg = next_pg; 1985 1986 if (!pg) { 1987 /* 1988 * We don't know which pg will be used at next mapping time. 1989 * We don't call choose_pgpath() here to avoid to trigger 1990 * pg_init just by busy checking. 1991 * So we don't know whether underlying devices we will be using 1992 * at next mapping time are busy or not. Just try mapping. 1993 */ 1994 return busy; 1995 } 1996 1997 /* 1998 * If there is one non-busy active path at least, the path selector 1999 * will be able to select it. So we consider such a pg as not busy. 2000 */ 2001 busy = true; 2002 list_for_each_entry(pgpath, &pg->pgpaths, list) { 2003 if (pgpath->is_active) { 2004 has_active = true; 2005 if (!pgpath_busy(pgpath)) { 2006 busy = false; 2007 break; 2008 } 2009 } 2010 } 2011 2012 if (!has_active) { 2013 /* 2014 * No active path in this pg, so this pg won't be used and 2015 * the current_pg will be changed at next mapping time. 2016 * We need to try mapping to determine it. 
2017 */ 2018 busy = false; 2019 } 2020 2021 return busy; 2022 } 2023 2024 /*----------------------------------------------------------------- 2025 * Module setup 2026 *---------------------------------------------------------------*/ 2027 static struct target_type multipath_target = { 2028 .name = "multipath", 2029 .version = {1, 13, 0}, 2030 .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | 2031 DM_TARGET_PASSES_INTEGRITY, 2032 .module = THIS_MODULE, 2033 .ctr = multipath_ctr, 2034 .dtr = multipath_dtr, 2035 .clone_and_map_rq = multipath_clone_and_map, 2036 .release_clone_rq = multipath_release_clone, 2037 .rq_end_io = multipath_end_io, 2038 .map = multipath_map_bio, 2039 .end_io = multipath_end_io_bio, 2040 .presuspend = multipath_presuspend, 2041 .postsuspend = multipath_postsuspend, 2042 .resume = multipath_resume, 2043 .status = multipath_status, 2044 .message = multipath_message, 2045 .prepare_ioctl = multipath_prepare_ioctl, 2046 .iterate_devices = multipath_iterate_devices, 2047 .busy = multipath_busy, 2048 }; 2049 2050 static int __init dm_multipath_init(void) 2051 { 2052 int r; 2053 2054 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); 2055 if (!kmultipathd) { 2056 DMERR("failed to create workqueue kmpathd"); 2057 r = -ENOMEM; 2058 goto bad_alloc_kmultipathd; 2059 } 2060 2061 /* 2062 * A separate workqueue is used to handle the device handlers 2063 * to avoid overloading existing workqueue. Overloading the 2064 * old workqueue would also create a bottleneck in the 2065 * path of the storage hardware device activation. 
2066 */ 2067 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", 2068 WQ_MEM_RECLAIM); 2069 if (!kmpath_handlerd) { 2070 DMERR("failed to create workqueue kmpath_handlerd"); 2071 r = -ENOMEM; 2072 goto bad_alloc_kmpath_handlerd; 2073 } 2074 2075 r = dm_register_target(&multipath_target); 2076 if (r < 0) { 2077 DMERR("request-based register failed %d", r); 2078 r = -EINVAL; 2079 goto bad_register_target; 2080 } 2081 2082 return 0; 2083 2084 bad_register_target: 2085 destroy_workqueue(kmpath_handlerd); 2086 bad_alloc_kmpath_handlerd: 2087 destroy_workqueue(kmultipathd); 2088 bad_alloc_kmultipathd: 2089 return r; 2090 } 2091 2092 static void __exit dm_multipath_exit(void) 2093 { 2094 destroy_workqueue(kmpath_handlerd); 2095 destroy_workqueue(kmultipathd); 2096 2097 dm_unregister_target(&multipath_target); 2098 } 2099 2100 module_init(dm_multipath_init); 2101 module_exit(dm_multipath_exit); 2102 2103 MODULE_DESCRIPTION(DM_NAME " multipath target"); 2104 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); 2105 MODULE_LICENSE("GPL"); 2106