1 /* 2 * Copyright (C) 2003 Sistina Software Limited. 3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include <linux/device-mapper.h> 9 10 #include "dm-rq.h" 11 #include "dm-bio-record.h" 12 #include "dm-path-selector.h" 13 #include "dm-uevent.h" 14 15 #include <linux/blkdev.h> 16 #include <linux/ctype.h> 17 #include <linux/init.h> 18 #include <linux/mempool.h> 19 #include <linux/module.h> 20 #include <linux/pagemap.h> 21 #include <linux/slab.h> 22 #include <linux/time.h> 23 #include <linux/workqueue.h> 24 #include <linux/delay.h> 25 #include <scsi/scsi_dh.h> 26 #include <linux/atomic.h> 27 #include <linux/blk-mq.h> 28 29 #define DM_MSG_PREFIX "multipath" 30 #define DM_PG_INIT_DELAY_MSECS 2000 31 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) 32 33 /* Path properties */ 34 struct pgpath { 35 struct list_head list; 36 37 struct priority_group *pg; /* Owning PG */ 38 unsigned fail_count; /* Cumulative failure count */ 39 40 struct dm_path path; 41 struct delayed_work activate_path; 42 43 bool is_active:1; /* Path status */ 44 }; 45 46 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 47 48 /* 49 * Paths are grouped into Priority Groups and numbered from 1 upwards. 50 * Each has a path selector which controls which path gets used. 51 */ 52 struct priority_group { 53 struct list_head list; 54 55 struct multipath *m; /* Owning multipath instance */ 56 struct path_selector ps; 57 58 unsigned pg_num; /* Reference number */ 59 unsigned nr_pgpaths; /* Number of paths in PG */ 60 struct list_head pgpaths; 61 62 bool bypassed:1; /* Temporarily bypass this PG? */ 63 }; 64 65 /* Multipath context */ 66 struct multipath { 67 struct list_head list; 68 struct dm_target *ti; 69 70 const char *hw_handler_name; 71 char *hw_handler_params; 72 73 spinlock_t lock; 74 75 unsigned nr_priority_groups; 76 struct list_head priority_groups; 77 78 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 79 80 struct pgpath *current_pgpath; 81 struct priority_group *current_pg; 82 struct priority_group *next_pg; /* Switch to this PG if set */ 83 84 unsigned long flags; /* Multipath state flags */ 85 86 unsigned pg_init_retries; /* Number of times to retry pg_init */ 87 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ 88 89 atomic_t nr_valid_paths; /* Total number of usable paths */ 90 atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ 91 atomic_t pg_init_count; /* Number of times pg_init called */ 92 93 unsigned queue_mode; 94 95 /* 96 * We must use a mempool of dm_mpath_io structs so that we 97 * can resubmit bios on error. 98 */ 99 mempool_t *mpio_pool; 100 101 struct mutex work_mutex; 102 struct work_struct trigger_event; 103 104 struct work_struct process_queued_bios; 105 struct bio_list queued_bios; 106 }; 107 108 /* 109 * Context information attached to each io we process. 110 */ 111 struct dm_mpath_io { 112 struct pgpath *pgpath; 113 size_t nr_bytes; 114 }; 115 116 typedef int (*action_fn) (struct pgpath *pgpath); 117 118 static struct kmem_cache *_mpio_cache; 119 120 static struct workqueue_struct *kmultipathd, *kmpath_handlerd; 121 static void trigger_event(struct work_struct *work); 122 static void activate_path(struct work_struct *work); 123 static void process_queued_bios(struct work_struct *work); 124 125 /*----------------------------------------------- 126 * Multipath state flags. 127 *-----------------------------------------------*/ 128 129 #define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */ 130 #define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? */ 131 #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ 132 #define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ 133 #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ 134 #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ 135 #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ 136 137 /*----------------------------------------------- 138 * Allocation routines 139 *-----------------------------------------------*/ 140 141 static struct pgpath *alloc_pgpath(void) 142 { 143 struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); 144 145 if (pgpath) { 146 pgpath->is_active = true; 147 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); 148 } 149 150 return pgpath; 151 } 152 153 static void free_pgpath(struct pgpath *pgpath) 154 { 155 kfree(pgpath); 156 } 157 158 static struct priority_group *alloc_priority_group(void) 159 { 160 struct priority_group *pg; 161 162 pg = kzalloc(sizeof(*pg), GFP_KERNEL); 163 164 if (pg) 165 INIT_LIST_HEAD(&pg->pgpaths); 166 167 return pg; 168 } 169 170 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) 171 { 172 struct pgpath *pgpath, *tmp; 173 174 list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { 175 list_del(&pgpath->list); 176 dm_put_device(ti, pgpath->path.dev); 177 free_pgpath(pgpath); 178 } 179 } 180 181 static void free_priority_group(struct priority_group *pg, 182 struct dm_target *ti) 183 { 184 struct path_selector *ps = &pg->ps; 185 186 if (ps->type) { 187 ps->type->destroy(ps); 188 dm_put_path_selector(ps->type); 189 } 190 191 free_pgpaths(&pg->pgpaths, ti); 192 kfree(pg); 193 } 194 195 static struct multipath *alloc_multipath(struct dm_target *ti) 196 { 197 struct multipath *m; 198 199 m = kzalloc(sizeof(*m), GFP_KERNEL); 200 if (m) { 201 INIT_LIST_HEAD(&m->priority_groups); 202 spin_lock_init(&m->lock); 203 set_bit(MPATHF_QUEUE_IO, &m->flags); 204 atomic_set(&m->nr_valid_paths, 0); 205 atomic_set(&m->pg_init_in_progress, 0); 206 atomic_set(&m->pg_init_count, 0); 207 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; 208 INIT_WORK(&m->trigger_event, trigger_event); 209 init_waitqueue_head(&m->pg_init_wait); 210 mutex_init(&m->work_mutex); 211 212 m->mpio_pool = NULL; 213 m->queue_mode = DM_TYPE_NONE; 214 215 m->ti = ti; 216 ti->private = m; 217 } 218 219 return m; 220 } 221 222 static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) 223 { 224 if (m->queue_mode == DM_TYPE_NONE) { 225 /* 226 * Default to request-based. 227 */ 228 if (dm_use_blk_mq(dm_table_get_md(ti->table))) 229 m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; 230 else 231 m->queue_mode = DM_TYPE_REQUEST_BASED; 232 } 233 234 if (m->queue_mode == DM_TYPE_REQUEST_BASED) { 235 unsigned min_ios = dm_get_reserved_rq_based_ios(); 236 237 m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); 238 if (!m->mpio_pool) 239 return -ENOMEM; 240 } 241 else if (m->queue_mode == DM_TYPE_BIO_BASED) { 242 INIT_WORK(&m->process_queued_bios, process_queued_bios); 243 /* 244 * bio-based doesn't support any direct scsi_dh management; 245 * it just discovers if a scsi_dh is attached. 246 */ 247 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); 248 } 249 250 dm_table_set_type(ti->table, m->queue_mode); 251 252 return 0; 253 } 254 255 static void free_multipath(struct multipath *m) 256 { 257 struct priority_group *pg, *tmp; 258 259 list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { 260 list_del(&pg->list); 261 free_priority_group(pg, m->ti); 262 } 263 264 kfree(m->hw_handler_name); 265 kfree(m->hw_handler_params); 266 mempool_destroy(m->mpio_pool); 267 kfree(m); 268 } 269 270 static struct dm_mpath_io *get_mpio(union map_info *info) 271 { 272 return info->ptr; 273 } 274 275 static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info) 276 { 277 struct dm_mpath_io *mpio; 278 279 if (!m->mpio_pool) { 280 /* Use blk-mq pdu memory requested via per_io_data_size */ 281 mpio = get_mpio(info); 282 memset(mpio, 0, sizeof(*mpio)); 283 return mpio; 284 } 285 286 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); 287 if (!mpio) 288 return NULL; 289 290 memset(mpio, 0, sizeof(*mpio)); 291 info->ptr = mpio; 292 293 return mpio; 294 } 295 296 static void clear_request_fn_mpio(struct multipath *m, union map_info *info) 297 { 298 /* Only needed for non blk-mq (.request_fn) multipath */ 299 if (m->mpio_pool) { 300 struct dm_mpath_io *mpio = info->ptr; 301 302 info->ptr = NULL; 303 mempool_free(mpio, m->mpio_pool); 304 } 305 } 306 307 static size_t multipath_per_bio_data_size(void) 308 { 309 return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); 310 } 311 312 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) 313 { 314 return dm_per_bio_data(bio, multipath_per_bio_data_size()); 315 } 316 317 static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) 318 { 319 /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ 320 struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 321 void *bio_details = mpio + 1; 322 323 return bio_details; 324 } 325 326 static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p, 327 struct dm_bio_details **bio_details_p) 328 { 329 struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 330 struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); 331 332 memset(mpio, 0, sizeof(*mpio)); 333 memset(bio_details, 0, sizeof(*bio_details)); 334 dm_bio_record(bio_details, bio); 335 336 if (mpio_p) 337 *mpio_p = mpio; 338 if (bio_details_p) 339 *bio_details_p = bio_details; 340 } 341 342 /*----------------------------------------------- 343 * Path selection 344 *-----------------------------------------------*/ 345 346 static int __pg_init_all_paths(struct multipath *m) 347 { 348 struct pgpath *pgpath; 349 unsigned long pg_init_delay = 0; 350 351 if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) 352 return 0; 353 354 atomic_inc(&m->pg_init_count); 355 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 356 357 /* Check here to reset pg_init_required */ 358 if (!m->current_pg) 359 return 0; 360 361 if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags)) 362 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? 363 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); 364 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 365 /* Skip failed paths */ 366 if (!pgpath->is_active) 367 continue; 368 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, 369 pg_init_delay)) 370 atomic_inc(&m->pg_init_in_progress); 371 } 372 return atomic_read(&m->pg_init_in_progress); 373 } 374 375 static int pg_init_all_paths(struct multipath *m) 376 { 377 int r; 378 unsigned long flags; 379 380 spin_lock_irqsave(&m->lock, flags); 381 r = __pg_init_all_paths(m); 382 spin_unlock_irqrestore(&m->lock, flags); 383 384 return r; 385 } 386 387 static void __switch_pg(struct multipath *m, struct priority_group *pg) 388 { 389 m->current_pg = pg; 390 391 /* Must we initialise the PG first, and queue I/O till it's ready? */ 392 if (m->hw_handler_name) { 393 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 394 set_bit(MPATHF_QUEUE_IO, &m->flags); 395 } else { 396 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 397 clear_bit(MPATHF_QUEUE_IO, &m->flags); 398 } 399 400 atomic_set(&m->pg_init_count, 0); 401 } 402 403 static struct pgpath *choose_path_in_pg(struct multipath *m, 404 struct priority_group *pg, 405 size_t nr_bytes) 406 { 407 unsigned long flags; 408 struct dm_path *path; 409 struct pgpath *pgpath; 410 411 path = pg->ps.type->select_path(&pg->ps, nr_bytes); 412 if (!path) 413 return ERR_PTR(-ENXIO); 414 415 pgpath = path_to_pgpath(path); 416 417 if (unlikely(lockless_dereference(m->current_pg) != pg)) { 418 /* Only update current_pgpath if pg changed */ 419 spin_lock_irqsave(&m->lock, flags); 420 m->current_pgpath = pgpath; 421 __switch_pg(m, pg); 422 spin_unlock_irqrestore(&m->lock, flags); 423 } 424 425 return pgpath; 426 } 427 428 static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) 429 { 430 unsigned long flags; 431 struct priority_group *pg; 432 struct pgpath *pgpath; 433 bool bypassed = true; 434 435 if (!atomic_read(&m->nr_valid_paths)) { 436 clear_bit(MPATHF_QUEUE_IO, &m->flags); 437 goto failed; 438 } 439 440 /* Were we instructed to switch PG? */ 441 if (lockless_dereference(m->next_pg)) { 442 spin_lock_irqsave(&m->lock, flags); 443 pg = m->next_pg; 444 if (!pg) { 445 spin_unlock_irqrestore(&m->lock, flags); 446 goto check_current_pg; 447 } 448 m->next_pg = NULL; 449 spin_unlock_irqrestore(&m->lock, flags); 450 pgpath = choose_path_in_pg(m, pg, nr_bytes); 451 if (!IS_ERR_OR_NULL(pgpath)) 452 return pgpath; 453 } 454 455 /* Don't change PG until it has no remaining paths */ 456 check_current_pg: 457 pg = lockless_dereference(m->current_pg); 458 if (pg) { 459 pgpath = choose_path_in_pg(m, pg, nr_bytes); 460 if (!IS_ERR_OR_NULL(pgpath)) 461 return pgpath; 462 } 463 464 /* 465 * Loop through priority groups until we find a valid path. 466 * First time we skip PGs marked 'bypassed'. 467 * Second time we only try the ones we skipped, but set 468 * pg_init_delay_retry so we do not hammer controllers. 469 */ 470 do { 471 list_for_each_entry(pg, &m->priority_groups, list) { 472 if (pg->bypassed == bypassed) 473 continue; 474 pgpath = choose_path_in_pg(m, pg, nr_bytes); 475 if (!IS_ERR_OR_NULL(pgpath)) { 476 if (!bypassed) 477 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 478 return pgpath; 479 } 480 } 481 } while (bypassed--); 482 483 failed: 484 spin_lock_irqsave(&m->lock, flags); 485 m->current_pgpath = NULL; 486 m->current_pg = NULL; 487 spin_unlock_irqrestore(&m->lock, flags); 488 489 return NULL; 490 } 491 492 /* 493 * Check whether bios must be queued in the device-mapper core rather 494 * than here in the target. 495 * 496 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the 497 * same value then we are not between multipath_presuspend() 498 * and multipath_resume() calls and we have no need to check 499 * for the DMF_NOFLUSH_SUSPENDING flag. 500 */ 501 static bool __must_push_back(struct multipath *m) 502 { 503 return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != 504 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && 505 dm_noflush_suspending(m->ti)); 506 } 507 508 static bool must_push_back_rq(struct multipath *m) 509 { 510 return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || 511 __must_push_back(m)); 512 } 513 514 static bool must_push_back_bio(struct multipath *m) 515 { 516 return __must_push_back(m); 517 } 518 519 /* 520 * Map cloned requests (request-based multipath) 521 */ 522 static int __multipath_map(struct dm_target *ti, struct request *clone, 523 union map_info *map_context, 524 struct request *rq, struct request **__clone) 525 { 526 struct multipath *m = ti->private; 527 int r = DM_MAPIO_REQUEUE; 528 size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); 529 struct pgpath *pgpath; 530 struct block_device *bdev; 531 struct dm_mpath_io *mpio; 532 533 /* Do we need to select a new pgpath? */ 534 pgpath = lockless_dereference(m->current_pgpath); 535 if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) 536 pgpath = choose_pgpath(m, nr_bytes); 537 538 if (!pgpath) { 539 if (!must_push_back_rq(m)) 540 r = -EIO; /* Failed */ 541 return r; 542 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || 543 test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 544 pg_init_all_paths(m); 545 return r; 546 } 547 548 mpio = set_mpio(m, map_context); 549 if (!mpio) 550 /* ENOMEM, requeue */ 551 return r; 552 553 mpio->pgpath = pgpath; 554 mpio->nr_bytes = nr_bytes; 555 556 bdev = pgpath->path.dev->bdev; 557 558 if (clone) { 559 /* 560 * Old request-based interface: allocated clone is passed in. 561 * Used by: .request_fn stacked on .request_fn path(s). 562 */ 563 clone->q = bdev_get_queue(bdev); 564 clone->rq_disk = bdev->bd_disk; 565 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 566 } else { 567 /* 568 * blk-mq request-based interface; used by both: 569 * .request_fn stacked on blk-mq path(s) and 570 * blk-mq stacked on blk-mq path(s). 571 */ 572 *__clone = blk_mq_alloc_request(bdev_get_queue(bdev), 573 rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); 574 if (IS_ERR(*__clone)) { 575 /* ENOMEM, requeue */ 576 clear_request_fn_mpio(m, map_context); 577 return r; 578 } 579 (*__clone)->bio = (*__clone)->biotail = NULL; 580 (*__clone)->rq_disk = bdev->bd_disk; 581 (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; 582 } 583 584 if (pgpath->pg->ps.type->start_io) 585 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, 586 &pgpath->path, 587 nr_bytes); 588 return DM_MAPIO_REMAPPED; 589 } 590 591 static int multipath_map(struct dm_target *ti, struct request *clone, 592 union map_info *map_context) 593 { 594 return __multipath_map(ti, clone, map_context, NULL, NULL); 595 } 596 597 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, 598 union map_info *map_context, 599 struct request **clone) 600 { 601 return __multipath_map(ti, NULL, map_context, rq, clone); 602 } 603 604 static void multipath_release_clone(struct request *clone) 605 { 606 blk_mq_free_request(clone); 607 } 608 609 /* 610 * Map cloned bios (bio-based multipath) 611 */ 612 static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) 613 { 614 size_t nr_bytes = bio->bi_iter.bi_size; 615 struct pgpath *pgpath; 616 unsigned long flags; 617 bool queue_io; 618 619 /* Do we need to select a new pgpath? */ 620 pgpath = lockless_dereference(m->current_pgpath); 621 queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); 622 if (!pgpath || !queue_io) 623 pgpath = choose_pgpath(m, nr_bytes); 624 625 if ((pgpath && queue_io) || 626 (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { 627 /* Queue for the daemon to resubmit */ 628 spin_lock_irqsave(&m->lock, flags); 629 bio_list_add(&m->queued_bios, bio); 630 spin_unlock_irqrestore(&m->lock, flags); 631 /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ 632 if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 633 pg_init_all_paths(m); 634 else if (!queue_io) 635 queue_work(kmultipathd, &m->process_queued_bios); 636 return DM_MAPIO_SUBMITTED; 637 } 638 639 if (!pgpath) { 640 if (!must_push_back_bio(m)) 641 return -EIO; 642 return DM_MAPIO_REQUEUE; 643 } 644 645 mpio->pgpath = pgpath; 646 mpio->nr_bytes = nr_bytes; 647 648 bio->bi_error = 0; 649 bio->bi_bdev = pgpath->path.dev->bdev; 650 bio->bi_rw |= REQ_FAILFAST_TRANSPORT; 651 652 if (pgpath->pg->ps.type->start_io) 653 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, 654 &pgpath->path, 655 nr_bytes); 656 return DM_MAPIO_REMAPPED; 657 } 658 659 static int multipath_map_bio(struct dm_target *ti, struct bio *bio) 660 { 661 struct multipath *m = ti->private; 662 struct dm_mpath_io *mpio = NULL; 663 664 multipath_init_per_bio_data(bio, &mpio, NULL); 665 666 return __multipath_map_bio(m, bio, mpio); 667 } 668 669 static void process_queued_bios_list(struct multipath *m) 670 { 671 if (m->queue_mode == DM_TYPE_BIO_BASED) 672 queue_work(kmultipathd, &m->process_queued_bios); 673 } 674 675 static void process_queued_bios(struct work_struct *work) 676 { 677 int r; 678 unsigned long flags; 679 struct bio *bio; 680 struct bio_list bios; 681 struct blk_plug plug; 682 struct multipath *m = 683 container_of(work, struct multipath, process_queued_bios); 684 685 bio_list_init(&bios); 686 687 spin_lock_irqsave(&m->lock, flags); 688 689 if (bio_list_empty(&m->queued_bios)) { 690 spin_unlock_irqrestore(&m->lock, flags); 691 return; 692 } 693 694 bio_list_merge(&bios, &m->queued_bios); 695 bio_list_init(&m->queued_bios); 696 697 spin_unlock_irqrestore(&m->lock, flags); 698 699 blk_start_plug(&plug); 700 while ((bio = bio_list_pop(&bios))) { 701 r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); 702 if (r < 0 || r == DM_MAPIO_REQUEUE) { 703 bio->bi_error = r; 704 bio_endio(bio); 705 } else if (r == DM_MAPIO_REMAPPED) 706 generic_make_request(bio); 707 } 708 blk_finish_plug(&plug); 709 } 710 711 /* 712 * If we run out of usable paths, should we queue I/O or error it? 713 */ 714 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, 715 bool save_old_value) 716 { 717 unsigned long flags; 718 719 spin_lock_irqsave(&m->lock, flags); 720 721 if (save_old_value) { 722 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 723 set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); 724 else 725 clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); 726 } else { 727 if (queue_if_no_path) 728 set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); 729 else 730 clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); 731 } 732 if (queue_if_no_path) 733 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); 734 else 735 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); 736 737 spin_unlock_irqrestore(&m->lock, flags); 738 739 if (!queue_if_no_path) { 740 dm_table_run_md_queue_async(m->ti->table); 741 process_queued_bios_list(m); 742 } 743 744 return 0; 745 } 746 747 /* 748 * An event is triggered whenever a path is taken out of use. 749 * Includes path failure and PG bypass. 750 */ 751 static void trigger_event(struct work_struct *work) 752 { 753 struct multipath *m = 754 container_of(work, struct multipath, trigger_event); 755 756 dm_table_event(m->ti->table); 757 } 758 759 /*----------------------------------------------------------------- 760 * Constructor/argument parsing: 761 * <#multipath feature args> [<arg>]* 762 * <#hw_handler args> [hw_handler [<arg>]*] 763 * <#priority groups> 764 * <initial priority group> 765 * [<selector> <#selector args> [<arg>]* 766 * <#paths> <#per-path selector args> 767 * [<path> [<arg>]* ]+ ]+ 768 *---------------------------------------------------------------*/ 769 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, 770 struct dm_target *ti) 771 { 772 int r; 773 struct path_selector_type *pst; 774 unsigned ps_argc; 775 776 static struct dm_arg _args[] = { 777 {0, 1024, "invalid number of path selector args"}, 778 }; 779 780 pst = dm_get_path_selector(dm_shift_arg(as)); 781 if (!pst) { 782 ti->error = "unknown path selector type"; 783 return -EINVAL; 784 } 785 786 r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); 787 if (r) { 788 dm_put_path_selector(pst); 789 return -EINVAL; 790 } 791 792 r = pst->create(&pg->ps, ps_argc, as->argv); 793 if (r) { 794 dm_put_path_selector(pst); 795 ti->error = "path selector constructor failed"; 796 return r; 797 } 798 799 pg->ps.type = pst; 800 dm_consume_args(as, ps_argc); 801 802 return 0; 803 } 804 805 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, 806 struct dm_target *ti) 807 { 808 int r; 809 struct pgpath *p; 810 struct multipath *m = ti->private; 811 struct request_queue *q = NULL; 812 const char *attached_handler_name; 813 814 /* we need at least a path arg */ 815 if (as->argc < 1) { 816 ti->error = "no device given"; 817 return ERR_PTR(-EINVAL); 818 } 819 820 p = alloc_pgpath(); 821 if (!p) 822 return ERR_PTR(-ENOMEM); 823 824 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 825 &p->path.dev); 826 if (r) { 827 ti->error = "error getting device"; 828 goto bad; 829 } 830 831 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name) 832 q = bdev_get_queue(p->path.dev->bdev); 833 834 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { 835 retain: 836 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); 837 if (attached_handler_name) { 838 /* 839 * Reset hw_handler_name to match the attached handler 840 * and clear any hw_handler_params associated with the 841 * ignored handler. 842 * 843 * NB. This modifies the table line to show the actual 844 * handler instead of the original table passed in. 845 */ 846 kfree(m->hw_handler_name); 847 m->hw_handler_name = attached_handler_name; 848 849 kfree(m->hw_handler_params); 850 m->hw_handler_params = NULL; 851 } 852 } 853 854 if (m->hw_handler_name) { 855 r = scsi_dh_attach(q, m->hw_handler_name); 856 if (r == -EBUSY) { 857 char b[BDEVNAME_SIZE]; 858 859 printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", 860 bdevname(p->path.dev->bdev, b)); 861 goto retain; 862 } 863 if (r < 0) { 864 ti->error = "error attaching hardware handler"; 865 dm_put_device(ti, p->path.dev); 866 goto bad; 867 } 868 869 if (m->hw_handler_params) { 870 r = scsi_dh_set_params(q, m->hw_handler_params); 871 if (r < 0) { 872 ti->error = "unable to set hardware " 873 "handler parameters"; 874 dm_put_device(ti, p->path.dev); 875 goto bad; 876 } 877 } 878 } 879 880 r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); 881 if (r) { 882 dm_put_device(ti, p->path.dev); 883 goto bad; 884 } 885 886 return p; 887 888 bad: 889 free_pgpath(p); 890 return ERR_PTR(r); 891 } 892 893 static struct priority_group *parse_priority_group(struct dm_arg_set *as, 894 struct multipath *m) 895 { 896 static struct dm_arg _args[] = { 897 {1, 1024, "invalid number of paths"}, 898 {0, 1024, "invalid number of selector args"} 899 }; 900 901 int r; 902 unsigned i, nr_selector_args, nr_args; 903 struct priority_group *pg; 904 struct dm_target *ti = m->ti; 905 906 if (as->argc < 2) { 907 as->argc = 0; 908 ti->error = "not enough priority group arguments"; 909 return ERR_PTR(-EINVAL); 910 } 911 912 pg = alloc_priority_group(); 913 if (!pg) { 914 ti->error = "couldn't allocate priority group"; 915 return ERR_PTR(-ENOMEM); 916 } 917 pg->m = m; 918 919 r = parse_path_selector(as, pg, ti); 920 if (r) 921 goto bad; 922 923 /* 924 * read the paths 925 */ 926 r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); 927 if (r) 928 goto bad; 929 930 r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); 931 if (r) 932 goto bad; 933 934 nr_args = 1 + nr_selector_args; 935 for (i = 0; i < pg->nr_pgpaths; i++) { 936 struct pgpath *pgpath; 937 struct dm_arg_set path_args; 938 939 if (as->argc < nr_args) { 940 ti->error = "not enough path parameters"; 941 r = -EINVAL; 942 goto bad; 943 } 944 945 path_args.argc = nr_args; 946 path_args.argv = as->argv; 947 948 pgpath = parse_path(&path_args, &pg->ps, ti); 949 if (IS_ERR(pgpath)) { 950 r = PTR_ERR(pgpath); 951 goto bad; 952 } 953 954 pgpath->pg = pg; 955 list_add_tail(&pgpath->list, &pg->pgpaths); 956 dm_consume_args(as, nr_args); 957 } 958 959 return pg; 960 961 bad: 962 free_priority_group(pg, ti); 963 return ERR_PTR(r); 964 } 965 966 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) 967 { 968 unsigned hw_argc; 969 int ret; 970 struct dm_target *ti = m->ti; 971 972 static struct dm_arg _args[] = { 973 {0, 1024, "invalid number of hardware handler args"}, 974 }; 975 976 if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) 977 return -EINVAL; 978 979 if (!hw_argc) 980 return 0; 981 982 if (m->queue_mode == DM_TYPE_BIO_BASED) { 983 dm_consume_args(as, hw_argc); 984 DMERR("bio-based multipath doesn't allow hardware handler args"); 985 return 0; 986 } 987 988 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); 989 990 if (hw_argc > 1) { 991 char *p; 992 int i, j, len = 4; 993 994 for (i = 0; i <= hw_argc - 2; i++) 995 len += strlen(as->argv[i]) + 1; 996 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL); 997 if (!p) { 998 ti->error = "memory allocation failed"; 999 ret = -ENOMEM; 1000 goto fail; 1001 } 1002 j = sprintf(p, "%d", hw_argc - 1); 1003 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) 1004 j = sprintf(p, "%s", as->argv[i]); 1005 } 1006 dm_consume_args(as, hw_argc - 1); 1007 1008 return 0; 1009 fail: 1010 kfree(m->hw_handler_name); 1011 m->hw_handler_name = NULL; 1012 return ret; 1013 } 1014 1015 static int parse_features(struct dm_arg_set *as, struct multipath *m) 1016 { 1017 int r; 1018 unsigned argc; 1019 struct dm_target *ti = m->ti; 1020 const char *arg_name; 1021 1022 static struct dm_arg _args[] = { 1023 {0, 8, "invalid number of feature args"}, 1024 {1, 50, "pg_init_retries must be between 1 and 50"}, 1025 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 1026 }; 1027 1028 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1029 if (r) 1030 return -EINVAL; 1031 1032 if (!argc) 1033 return 0; 1034 1035 do { 1036 arg_name = dm_shift_arg(as); 1037 argc--; 1038 1039 if (!strcasecmp(arg_name, "queue_if_no_path")) { 1040 r = queue_if_no_path(m, true, false); 1041 continue; 1042 } 1043 1044 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { 1045 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); 1046 continue; 1047 } 1048 1049 if (!strcasecmp(arg_name, "pg_init_retries") && 1050 (argc >= 1)) { 1051 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); 1052 argc--; 1053 continue; 1054 } 1055 1056 if (!strcasecmp(arg_name, "pg_init_delay_msecs") && 1057 (argc >= 1)) { 1058 r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); 1059 argc--; 1060 continue; 1061 } 1062 1063 if (!strcasecmp(arg_name, "queue_mode") && 1064 (argc >= 1)) { 1065 const char *queue_mode_name = dm_shift_arg(as); 1066 1067 if (!strcasecmp(queue_mode_name, "bio")) 1068 m->queue_mode = DM_TYPE_BIO_BASED; 1069 else if (!strcasecmp(queue_mode_name, "rq")) 1070 m->queue_mode = DM_TYPE_REQUEST_BASED; 1071 else if (!strcasecmp(queue_mode_name, "mq")) 1072 m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; 1073 else { 1074 ti->error = "Unknown 'queue_mode' requested"; 1075 r = -EINVAL; 1076 } 1077 argc--; 1078 continue; 1079 } 1080 1081 ti->error = "Unrecognised multipath feature request"; 1082 r = -EINVAL; 1083 } while (argc && !r); 1084 1085 return r; 1086 } 1087 1088 static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) 1089 { 1090 /* target arguments */ 1091 static struct dm_arg _args[] = { 1092 {0, 1024, "invalid number of priority groups"}, 1093 {0, 1024, "invalid initial priority group number"}, 1094 }; 1095 1096 int r; 1097 struct multipath *m; 1098 struct dm_arg_set as; 1099 unsigned pg_count = 0; 1100 unsigned next_pg_num; 1101 1102 as.argc = argc; 1103 as.argv = argv; 1104 1105 m = alloc_multipath(ti); 1106 if (!m) { 1107 ti->error = "can't allocate multipath"; 1108 return -EINVAL; 1109 } 1110 1111 r = parse_features(&as, m); 1112 if (r) 1113 goto bad; 1114 1115 r = alloc_multipath_stage2(ti, m); 1116 if (r) 1117 goto bad; 1118 1119 r = parse_hw_handler(&as, m); 1120 if (r) 1121 goto bad; 1122 1123 r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); 1124 if (r) 1125 goto bad; 1126 1127 r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); 1128 if (r) 1129 goto bad; 1130 1131 if ((!m->nr_priority_groups && next_pg_num) || 1132 (m->nr_priority_groups && !next_pg_num)) { 1133 ti->error = "invalid initial priority group"; 1134 r = -EINVAL; 1135 goto bad; 1136 } 1137 1138 /* parse the priority groups */ 1139 while (as.argc) { 1140 struct priority_group *pg; 1141 unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths); 1142 1143 pg = parse_priority_group(&as, m); 1144 if (IS_ERR(pg)) { 1145 r = PTR_ERR(pg); 1146 goto bad; 1147 } 1148 1149 nr_valid_paths += pg->nr_pgpaths; 1150 atomic_set(&m->nr_valid_paths, nr_valid_paths); 1151 1152 list_add_tail(&pg->list, &m->priority_groups); 1153 pg_count++; 1154 pg->pg_num = pg_count; 1155 if (!--next_pg_num) 1156 m->next_pg = pg; 1157 } 1158 1159 if (pg_count != m->nr_priority_groups) { 1160 ti->error = "priority group count mismatch"; 1161 r = -EINVAL; 1162 goto bad; 1163 } 1164 1165 ti->num_flush_bios = 1; 1166 ti->num_discard_bios = 1; 1167 ti->num_write_same_bios = 1; 1168 if (m->queue_mode == DM_TYPE_BIO_BASED) 1169 ti->per_io_data_size = multipath_per_bio_data_size(); 1170 else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) 1171 ti->per_io_data_size = sizeof(struct dm_mpath_io); 1172 1173 return 0; 1174 1175 bad: 1176 free_multipath(m); 1177 return r; 1178 } 1179 1180 static void multipath_wait_for_pg_init_completion(struct multipath *m) 1181 { 1182 DECLARE_WAITQUEUE(wait, current); 1183 1184 add_wait_queue(&m->pg_init_wait, &wait); 1185 1186 while (1) { 1187 set_current_state(TASK_UNINTERRUPTIBLE); 1188 1189 if (!atomic_read(&m->pg_init_in_progress)) 1190 break; 1191 1192 io_schedule(); 1193 } 1194 set_current_state(TASK_RUNNING); 1195 1196 remove_wait_queue(&m->pg_init_wait, &wait); 1197 } 1198 1199 static void flush_multipath_work(struct multipath *m) 1200 { 1201 set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); 1202 smp_mb__after_atomic(); 1203 1204 flush_workqueue(kmpath_handlerd); 1205 multipath_wait_for_pg_init_completion(m); 1206 flush_workqueue(kmultipathd); 1207 flush_work(&m->trigger_event); 1208 1209 clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); 1210 smp_mb__after_atomic(); 1211 } 1212 1213 static void multipath_dtr(struct dm_target *ti) 1214 { 1215 struct multipath *m = ti->private; 1216 1217 flush_multipath_work(m); 1218 free_multipath(m); 1219 } 1220 1221 /* 1222 * Take a path out of use. 1223 */ 1224 static int fail_path(struct pgpath *pgpath) 1225 { 1226 unsigned long flags; 1227 struct multipath *m = pgpath->pg->m; 1228 1229 spin_lock_irqsave(&m->lock, flags); 1230 1231 if (!pgpath->is_active) 1232 goto out; 1233 1234 DMWARN("Failing path %s.", pgpath->path.dev->name); 1235 1236 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); 1237 pgpath->is_active = false; 1238 pgpath->fail_count++; 1239 1240 atomic_dec(&m->nr_valid_paths); 1241 1242 if (pgpath == m->current_pgpath) 1243 m->current_pgpath = NULL; 1244 1245 dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, 1246 pgpath->path.dev->name, atomic_read(&m->nr_valid_paths)); 1247 1248 schedule_work(&m->trigger_event); 1249 1250 out: 1251 spin_unlock_irqrestore(&m->lock, flags); 1252 1253 return 0; 1254 } 1255 1256 /* 1257 * Reinstate a previously-failed path 1258 */ 1259 static int reinstate_path(struct pgpath *pgpath) 1260 { 1261 int r = 0, run_queue = 0; 1262 unsigned long flags; 1263 struct multipath *m = pgpath->pg->m; 1264 unsigned nr_valid_paths; 1265 1266 spin_lock_irqsave(&m->lock, flags); 1267 1268 if (pgpath->is_active) 1269 goto out; 1270 1271 DMWARN("Reinstating path %s.", pgpath->path.dev->name); 1272 1273 r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); 1274 if (r) 1275 goto out; 1276 1277 pgpath->is_active = true; 1278 1279 nr_valid_paths = atomic_inc_return(&m->nr_valid_paths); 1280 if (nr_valid_paths == 1) { 1281 m->current_pgpath = NULL; 1282 run_queue = 1; 1283 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 1284 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) 1285 atomic_inc(&m->pg_init_in_progress); 1286 } 1287 1288 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, 1289 pgpath->path.dev->name, nr_valid_paths); 1290 1291 schedule_work(&m->trigger_event); 1292 1293 out: 1294 spin_unlock_irqrestore(&m->lock, flags); 1295 if (run_queue) { 1296 dm_table_run_md_queue_async(m->ti->table); 1297 process_queued_bios_list(m); 1298 } 1299 1300 return r; 1301 } 1302 1303 /* 1304 * Fail or reinstate all paths that match the provided struct dm_dev. 1305 */ 1306 static int action_dev(struct multipath *m, struct dm_dev *dev, 1307 action_fn action) 1308 { 1309 int r = -EINVAL; 1310 struct pgpath *pgpath; 1311 struct priority_group *pg; 1312 1313 list_for_each_entry(pg, &m->priority_groups, list) { 1314 list_for_each_entry(pgpath, &pg->pgpaths, list) { 1315 if (pgpath->path.dev == dev) 1316 r = action(pgpath); 1317 } 1318 } 1319 1320 return r; 1321 } 1322 1323 /* 1324 * Temporarily try to avoid having to use the specified PG 1325 */ 1326 static void bypass_pg(struct multipath *m, struct priority_group *pg, 1327 bool bypassed) 1328 { 1329 unsigned long flags; 1330 1331 spin_lock_irqsave(&m->lock, flags); 1332 1333 pg->bypassed = bypassed; 1334 m->current_pgpath = NULL; 1335 m->current_pg = NULL; 1336 1337 spin_unlock_irqrestore(&m->lock, flags); 1338 1339 schedule_work(&m->trigger_event); 1340 } 1341 1342 /* 1343 * Switch to using the specified PG from the next I/O that gets mapped 1344 */ 1345 static int switch_pg_num(struct multipath *m, const char *pgstr) 1346 { 1347 struct priority_group *pg; 1348 unsigned pgnum; 1349 unsigned long flags; 1350 char dummy; 1351 1352 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1353 (pgnum > m->nr_priority_groups)) { 1354 DMWARN("invalid PG number supplied to switch_pg_num"); 1355 return -EINVAL; 1356 } 1357 1358 spin_lock_irqsave(&m->lock, flags); 1359 list_for_each_entry(pg, &m->priority_groups, list) { 1360 pg->bypassed = false; 1361 if (--pgnum) 1362 continue; 1363 1364 m->current_pgpath = NULL; 1365 m->current_pg = NULL; 1366 m->next_pg = pg; 1367 } 1368 spin_unlock_irqrestore(&m->lock, flags); 1369 1370 schedule_work(&m->trigger_event); 1371 return 0; 1372 } 1373 1374 /* 1375 * Set/clear bypassed status of a PG. 1376 * PGs are numbered upwards from 1 in the order they were declared. 1377 */ 1378 static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) 1379 { 1380 struct priority_group *pg; 1381 unsigned pgnum; 1382 char dummy; 1383 1384 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1385 (pgnum > m->nr_priority_groups)) { 1386 DMWARN("invalid PG number supplied to bypass_pg"); 1387 return -EINVAL; 1388 } 1389 1390 list_for_each_entry(pg, &m->priority_groups, list) { 1391 if (!--pgnum) 1392 break; 1393 } 1394 1395 bypass_pg(m, pg, bypassed); 1396 return 0; 1397 } 1398 1399 /* 1400 * Should we retry pg_init immediately? 1401 */ 1402 static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) 1403 { 1404 unsigned long flags; 1405 bool limit_reached = false; 1406 1407 spin_lock_irqsave(&m->lock, flags); 1408 1409 if (atomic_read(&m->pg_init_count) <= m->pg_init_retries && 1410 !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) 1411 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 1412 else 1413 limit_reached = true; 1414 1415 spin_unlock_irqrestore(&m->lock, flags); 1416 1417 return limit_reached; 1418 } 1419 1420 static void pg_init_done(void *data, int errors) 1421 { 1422 struct pgpath *pgpath = data; 1423 struct priority_group *pg = pgpath->pg; 1424 struct multipath *m = pg->m; 1425 unsigned long flags; 1426 bool delay_retry = false; 1427 1428 /* device or driver problems */ 1429 switch (errors) { 1430 case SCSI_DH_OK: 1431 break; 1432 case SCSI_DH_NOSYS: 1433 if (!m->hw_handler_name) { 1434 errors = 0; 1435 break; 1436 } 1437 DMERR("Could not failover the device: Handler scsi_dh_%s " 1438 "Error %d.", m->hw_handler_name, errors); 1439 /* 1440 * Fail path for now, so we do not ping pong 1441 */ 1442 fail_path(pgpath); 1443 break; 1444 case SCSI_DH_DEV_TEMP_BUSY: 1445 /* 1446 * Probably doing something like FW upgrade on the 1447 * controller so try the other pg. 1448 */ 1449 bypass_pg(m, pg, true); 1450 break; 1451 case SCSI_DH_RETRY: 1452 /* Wait before retrying. */ 1453 delay_retry = 1; 1454 case SCSI_DH_IMM_RETRY: 1455 case SCSI_DH_RES_TEMP_UNAVAIL: 1456 if (pg_init_limit_reached(m, pgpath)) 1457 fail_path(pgpath); 1458 errors = 0; 1459 break; 1460 case SCSI_DH_DEV_OFFLINED: 1461 default: 1462 /* 1463 * We probably do not want to fail the path for a device 1464 * error, but this is what the old dm did. In future 1465 * patches we can do more advanced handling. 1466 */ 1467 fail_path(pgpath); 1468 } 1469 1470 spin_lock_irqsave(&m->lock, flags); 1471 if (errors) { 1472 if (pgpath == m->current_pgpath) { 1473 DMERR("Could not failover device. Error %d.", errors); 1474 m->current_pgpath = NULL; 1475 m->current_pg = NULL; 1476 } 1477 } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 1478 pg->bypassed = false; 1479 1480 if (atomic_dec_return(&m->pg_init_in_progress) > 0) 1481 /* Activations of other paths are still on going */ 1482 goto out; 1483 1484 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 1485 if (delay_retry) 1486 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 1487 else 1488 clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 1489 1490 if (__pg_init_all_paths(m)) 1491 goto out; 1492 } 1493 clear_bit(MPATHF_QUEUE_IO, &m->flags); 1494 1495 process_queued_bios_list(m); 1496 1497 /* 1498 * Wake up any thread waiting to suspend. 1499 */ 1500 wake_up(&m->pg_init_wait); 1501 1502 out: 1503 spin_unlock_irqrestore(&m->lock, flags); 1504 } 1505 1506 static void activate_path(struct work_struct *work) 1507 { 1508 struct pgpath *pgpath = 1509 container_of(work, struct pgpath, activate_path.work); 1510 1511 if (pgpath->is_active) 1512 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), 1513 pg_init_done, pgpath); 1514 else 1515 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); 1516 } 1517 1518 static int noretry_error(int error) 1519 { 1520 switch (error) { 1521 case -EOPNOTSUPP: 1522 case -EREMOTEIO: 1523 case -EILSEQ: 1524 case -ENODATA: 1525 case -ENOSPC: 1526 return 1; 1527 } 1528 1529 /* Anything else could be a path failure, so should be retried */ 1530 return 0; 1531 } 1532 1533 /* 1534 * end_io handling 1535 */ 1536 static int do_end_io(struct multipath *m, struct request *clone, 1537 int error, struct dm_mpath_io *mpio) 1538 { 1539 /* 1540 * We don't queue any clone request inside the multipath target 1541 * during end I/O handling, since those clone requests don't have 1542 * bio clones. If we queue them inside the multipath target, 1543 * we need to make bio clones, that requires memory allocation. 1544 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests 1545 * don't have bio clones.) 1546 * Instead of queueing the clone request here, we queue the original 1547 * request into dm core, which will remake a clone request and 1548 * clone bios for it and resubmit it later. 1549 */ 1550 int r = DM_ENDIO_REQUEUE; 1551 1552 if (!error && !clone->errors) 1553 return 0; /* I/O complete */ 1554 1555 if (noretry_error(error)) 1556 return error; 1557 1558 if (mpio->pgpath) 1559 fail_path(mpio->pgpath); 1560 1561 if (!atomic_read(&m->nr_valid_paths)) { 1562 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 1563 if (!must_push_back_rq(m)) 1564 r = -EIO; 1565 } else { 1566 if (error == -EBADE) 1567 r = error; 1568 } 1569 } 1570 1571 return r; 1572 } 1573 1574 static int multipath_end_io(struct dm_target *ti, struct request *clone, 1575 int error, union map_info *map_context) 1576 { 1577 struct multipath *m = ti->private; 1578 struct dm_mpath_io *mpio = get_mpio(map_context); 1579 struct pgpath *pgpath; 1580 struct path_selector *ps; 1581 int r; 1582 1583 BUG_ON(!mpio); 1584 1585 r = do_end_io(m, clone, error, mpio); 1586 pgpath = mpio->pgpath; 1587 if (pgpath) { 1588 ps = &pgpath->pg->ps; 1589 if (ps->type->end_io) 1590 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1591 } 1592 clear_request_fn_mpio(m, map_context); 1593 1594 return r; 1595 } 1596 1597 static int do_end_io_bio(struct multipath *m, struct bio *clone, 1598 int error, struct dm_mpath_io *mpio) 1599 { 1600 unsigned long flags; 1601 1602 if (!error) 1603 return 0; /* I/O complete */ 1604 1605 if (noretry_error(error)) 1606 return error; 1607 1608 if (mpio->pgpath) 1609 fail_path(mpio->pgpath); 1610 1611 if (!atomic_read(&m->nr_valid_paths)) { 1612 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 1613 if (!must_push_back_bio(m)) 1614 return -EIO; 1615 return DM_ENDIO_REQUEUE; 1616 } else { 1617 if (error == -EBADE) 1618 return error; 1619 } 1620 } 1621 1622 /* Queue for the daemon to resubmit */ 1623 dm_bio_restore(get_bio_details_from_bio(clone), clone); 1624 1625 spin_lock_irqsave(&m->lock, flags); 1626 bio_list_add(&m->queued_bios, clone); 1627 spin_unlock_irqrestore(&m->lock, flags); 1628 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) 1629 queue_work(kmultipathd, &m->process_queued_bios); 1630 1631 return DM_ENDIO_INCOMPLETE; 1632 } 1633 1634 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error) 1635 { 1636 struct multipath *m = ti->private; 1637 struct dm_mpath_io *mpio = get_mpio_from_bio(clone); 1638 struct pgpath *pgpath; 1639 struct path_selector *ps; 1640 int r; 1641 1642 BUG_ON(!mpio); 1643 1644 r = do_end_io_bio(m, clone, error, mpio); 1645 pgpath = mpio->pgpath; 1646 if (pgpath) { 1647 ps = &pgpath->pg->ps; 1648 if (ps->type->end_io) 1649 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1650 } 1651 1652 return r; 1653 } 1654 1655 /* 1656 * Suspend can't complete until all the I/O is processed so if 1657 * the last path fails we must error any remaining I/O. 1658 * Note that if the freeze_bdev fails while suspending, the 1659 * queue_if_no_path state is lost - userspace should reset it. 1660 */ 1661 static void multipath_presuspend(struct dm_target *ti) 1662 { 1663 struct multipath *m = ti->private; 1664 1665 queue_if_no_path(m, false, true); 1666 } 1667 1668 static void multipath_postsuspend(struct dm_target *ti) 1669 { 1670 struct multipath *m = ti->private; 1671 1672 mutex_lock(&m->work_mutex); 1673 flush_multipath_work(m); 1674 mutex_unlock(&m->work_mutex); 1675 } 1676 1677 /* 1678 * Restore the queue_if_no_path setting. 1679 */ 1680 static void multipath_resume(struct dm_target *ti) 1681 { 1682 struct multipath *m = ti->private; 1683 1684 if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) 1685 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); 1686 else 1687 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); 1688 smp_mb__after_atomic(); 1689 } 1690 1691 /* 1692 * Info output has the following format: 1693 * num_multipath_feature_args [multipath_feature_args]* 1694 * num_handler_status_args [handler_status_args]* 1695 * num_groups init_group_number 1696 * [A|D|E num_ps_status_args [ps_status_args]* 1697 * num_paths num_selector_args 1698 * [path_dev A|F fail_count [selector_args]* ]+ ]+ 1699 * 1700 * Table output has the following format (identical to the constructor string): 1701 * num_feature_args [features_args]* 1702 * num_handler_args hw_handler [hw_handler_args]* 1703 * num_groups init_group_number 1704 * [priority selector-name num_ps_args [ps_args]* 1705 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ 1706 */ 1707 static void multipath_status(struct dm_target *ti, status_type_t type, 1708 unsigned status_flags, char *result, unsigned maxlen) 1709 { 1710 int sz = 0; 1711 unsigned long flags; 1712 struct multipath *m = ti->private; 1713 struct priority_group *pg; 1714 struct pgpath *p; 1715 unsigned pg_num; 1716 char state; 1717 1718 spin_lock_irqsave(&m->lock, flags); 1719 1720 /* Features */ 1721 if (type == STATUSTYPE_INFO) 1722 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), 1723 atomic_read(&m->pg_init_count)); 1724 else { 1725 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + 1726 (m->pg_init_retries > 0) * 2 + 1727 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + 1728 test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) + 1729 (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2); 1730 1731 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1732 DMEMIT("queue_if_no_path "); 1733 if (m->pg_init_retries) 1734 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1735 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) 1736 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); 1737 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) 1738 DMEMIT("retain_attached_hw_handler "); 1739 if (m->queue_mode != DM_TYPE_REQUEST_BASED) { 1740 switch(m->queue_mode) { 1741 case DM_TYPE_BIO_BASED: 1742 DMEMIT("queue_mode bio "); 1743 break; 1744 case DM_TYPE_MQ_REQUEST_BASED: 1745 DMEMIT("queue_mode mq "); 1746 break; 1747 } 1748 } 1749 } 1750 1751 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1752 DMEMIT("0 "); 1753 else 1754 DMEMIT("1 %s ", m->hw_handler_name); 1755 1756 DMEMIT("%u ", m->nr_priority_groups); 1757 1758 if (m->next_pg) 1759 pg_num = m->next_pg->pg_num; 1760 else if (m->current_pg) 1761 pg_num = m->current_pg->pg_num; 1762 else 1763 pg_num = (m->nr_priority_groups ? 1 : 0); 1764 1765 DMEMIT("%u ", pg_num); 1766 1767 switch (type) { 1768 case STATUSTYPE_INFO: 1769 list_for_each_entry(pg, &m->priority_groups, list) { 1770 if (pg->bypassed) 1771 state = 'D'; /* Disabled */ 1772 else if (pg == m->current_pg) 1773 state = 'A'; /* Currently Active */ 1774 else 1775 state = 'E'; /* Enabled */ 1776 1777 DMEMIT("%c ", state); 1778 1779 if (pg->ps.type->status) 1780 sz += pg->ps.type->status(&pg->ps, NULL, type, 1781 result + sz, 1782 maxlen - sz); 1783 else 1784 DMEMIT("0 "); 1785 1786 DMEMIT("%u %u ", pg->nr_pgpaths, 1787 pg->ps.type->info_args); 1788 1789 list_for_each_entry(p, &pg->pgpaths, list) { 1790 DMEMIT("%s %s %u ", p->path.dev->name, 1791 p->is_active ? "A" : "F", 1792 p->fail_count); 1793 if (pg->ps.type->status) 1794 sz += pg->ps.type->status(&pg->ps, 1795 &p->path, type, result + sz, 1796 maxlen - sz); 1797 } 1798 } 1799 break; 1800 1801 case STATUSTYPE_TABLE: 1802 list_for_each_entry(pg, &m->priority_groups, list) { 1803 DMEMIT("%s ", pg->ps.type->name); 1804 1805 if (pg->ps.type->status) 1806 sz += pg->ps.type->status(&pg->ps, NULL, type, 1807 result + sz, 1808 maxlen - sz); 1809 else 1810 DMEMIT("0 "); 1811 1812 DMEMIT("%u %u ", pg->nr_pgpaths, 1813 pg->ps.type->table_args); 1814 1815 list_for_each_entry(p, &pg->pgpaths, list) { 1816 DMEMIT("%s ", p->path.dev->name); 1817 if (pg->ps.type->status) 1818 sz += pg->ps.type->status(&pg->ps, 1819 &p->path, type, result + sz, 1820 maxlen - sz); 1821 } 1822 } 1823 break; 1824 } 1825 1826 spin_unlock_irqrestore(&m->lock, flags); 1827 } 1828 1829 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1830 { 1831 int r = -EINVAL; 1832 struct dm_dev *dev; 1833 struct multipath *m = ti->private; 1834 action_fn action; 1835 1836 mutex_lock(&m->work_mutex); 1837 1838 if (dm_suspended(ti)) { 1839 r = -EBUSY; 1840 goto out; 1841 } 1842 1843 if (argc == 1) { 1844 if (!strcasecmp(argv[0], "queue_if_no_path")) { 1845 r = queue_if_no_path(m, true, false); 1846 goto out; 1847 } else if (!strcasecmp(argv[0], "fail_if_no_path")) { 1848 r = queue_if_no_path(m, false, false); 1849 goto out; 1850 } 1851 } 1852 1853 if (argc != 2) { 1854 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc); 1855 goto out; 1856 } 1857 1858 if (!strcasecmp(argv[0], "disable_group")) { 1859 r = bypass_pg_num(m, argv[1], true); 1860 goto out; 1861 } else if (!strcasecmp(argv[0], "enable_group")) { 1862 r = bypass_pg_num(m, argv[1], false); 1863 goto out; 1864 } else if (!strcasecmp(argv[0], "switch_group")) { 1865 r = switch_pg_num(m, argv[1]); 1866 goto out; 1867 } else if (!strcasecmp(argv[0], "reinstate_path")) 1868 action = reinstate_path; 1869 else if (!strcasecmp(argv[0], "fail_path")) 1870 action = fail_path; 1871 else { 1872 DMWARN("Unrecognised multipath message received: %s", argv[0]); 1873 goto out; 1874 } 1875 1876 r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); 1877 if (r) { 1878 DMWARN("message: error getting device %s", 1879 argv[1]); 1880 goto out; 1881 } 1882 1883 r = action_dev(m, dev, action); 1884 1885 dm_put_device(ti, dev); 1886 1887 out: 1888 mutex_unlock(&m->work_mutex); 1889 return r; 1890 } 1891 1892 static int multipath_prepare_ioctl(struct dm_target *ti, 1893 struct block_device **bdev, fmode_t *mode) 1894 { 1895 struct multipath *m = ti->private; 1896 struct pgpath *current_pgpath; 1897 int r; 1898 1899 current_pgpath = lockless_dereference(m->current_pgpath); 1900 if (!current_pgpath) 1901 current_pgpath = choose_pgpath(m, 0); 1902 1903 if (current_pgpath) { 1904 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { 1905 *bdev = current_pgpath->path.dev->bdev; 1906 *mode = current_pgpath->path.dev->mode; 1907 r = 0; 1908 } else { 1909 /* pg_init has not started or completed */ 1910 r = -ENOTCONN; 1911 } 1912 } else { 1913 /* No path is available */ 1914 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1915 r = -ENOTCONN; 1916 else 1917 r = -EIO; 1918 } 1919 1920 if (r == -ENOTCONN) { 1921 if (!lockless_dereference(m->current_pg)) { 1922 /* Path status changed, redo selection */ 1923 (void) choose_pgpath(m, 0); 1924 } 1925 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 1926 pg_init_all_paths(m); 1927 dm_table_run_md_queue_async(m->ti->table); 1928 process_queued_bios_list(m); 1929 } 1930 1931 /* 1932 * Only pass ioctls through if the device sizes match exactly. 1933 */ 1934 if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) 1935 return 1; 1936 return r; 1937 } 1938 1939 static int multipath_iterate_devices(struct dm_target *ti, 1940 iterate_devices_callout_fn fn, void *data) 1941 { 1942 struct multipath *m = ti->private; 1943 struct priority_group *pg; 1944 struct pgpath *p; 1945 int ret = 0; 1946 1947 list_for_each_entry(pg, &m->priority_groups, list) { 1948 list_for_each_entry(p, &pg->pgpaths, list) { 1949 ret = fn(ti, p->path.dev, ti->begin, ti->len, data); 1950 if (ret) 1951 goto out; 1952 } 1953 } 1954 1955 out: 1956 return ret; 1957 } 1958 1959 static int pgpath_busy(struct pgpath *pgpath) 1960 { 1961 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1962 1963 return blk_lld_busy(q); 1964 } 1965 1966 /* 1967 * We return "busy", only when we can map I/Os but underlying devices 1968 * are busy (so even if we map I/Os now, the I/Os will wait on 1969 * the underlying queue). 1970 * In other words, if we want to kill I/Os or queue them inside us 1971 * due to map unavailability, we don't return "busy". Otherwise, 1972 * dm core won't give us the I/Os and we can't do what we want. 1973 */ 1974 static int multipath_busy(struct dm_target *ti) 1975 { 1976 bool busy = false, has_active = false; 1977 struct multipath *m = ti->private; 1978 struct priority_group *pg, *next_pg; 1979 struct pgpath *pgpath; 1980 1981 /* pg_init in progress or no paths available */ 1982 if (atomic_read(&m->pg_init_in_progress) || 1983 (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) 1984 return true; 1985 1986 /* Guess which priority_group will be used at next mapping time */ 1987 pg = lockless_dereference(m->current_pg); 1988 next_pg = lockless_dereference(m->next_pg); 1989 if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) 1990 pg = next_pg; 1991 1992 if (!pg) { 1993 /* 1994 * We don't know which pg will be used at next mapping time. 1995 * We don't call choose_pgpath() here to avoid to trigger 1996 * pg_init just by busy checking. 1997 * So we don't know whether underlying devices we will be using 1998 * at next mapping time are busy or not. Just try mapping. 1999 */ 2000 return busy; 2001 } 2002 2003 /* 2004 * If there is one non-busy active path at least, the path selector 2005 * will be able to select it. So we consider such a pg as not busy. 2006 */ 2007 busy = true; 2008 list_for_each_entry(pgpath, &pg->pgpaths, list) { 2009 if (pgpath->is_active) { 2010 has_active = true; 2011 if (!pgpath_busy(pgpath)) { 2012 busy = false; 2013 break; 2014 } 2015 } 2016 } 2017 2018 if (!has_active) { 2019 /* 2020 * No active path in this pg, so this pg won't be used and 2021 * the current_pg will be changed at next mapping time. 2022 * We need to try mapping to determine it. 2023 */ 2024 busy = false; 2025 } 2026 2027 return busy; 2028 } 2029 2030 /*----------------------------------------------------------------- 2031 * Module setup 2032 *---------------------------------------------------------------*/ 2033 static struct target_type multipath_target = { 2034 .name = "multipath", 2035 .version = {1, 12, 0}, 2036 .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, 2037 .module = THIS_MODULE, 2038 .ctr = multipath_ctr, 2039 .dtr = multipath_dtr, 2040 .map_rq = multipath_map, 2041 .clone_and_map_rq = multipath_clone_and_map, 2042 .release_clone_rq = multipath_release_clone, 2043 .rq_end_io = multipath_end_io, 2044 .map = multipath_map_bio, 2045 .end_io = multipath_end_io_bio, 2046 .presuspend = multipath_presuspend, 2047 .postsuspend = multipath_postsuspend, 2048 .resume = multipath_resume, 2049 .status = multipath_status, 2050 .message = multipath_message, 2051 .prepare_ioctl = multipath_prepare_ioctl, 2052 .iterate_devices = multipath_iterate_devices, 2053 .busy = multipath_busy, 2054 }; 2055 2056 static int __init dm_multipath_init(void) 2057 { 2058 int r; 2059 2060 /* allocate a slab for the dm_mpath_ios */ 2061 _mpio_cache = KMEM_CACHE(dm_mpath_io, 0); 2062 if (!_mpio_cache) 2063 return -ENOMEM; 2064 2065 r = dm_register_target(&multipath_target); 2066 if (r < 0) { 2067 DMERR("request-based register failed %d", r); 2068 r = -EINVAL; 2069 goto bad_register_target; 2070 } 2071 2072 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); 2073 if (!kmultipathd) { 2074 DMERR("failed to create workqueue kmpathd"); 2075 r = -ENOMEM; 2076 goto bad_alloc_kmultipathd; 2077 } 2078 2079 /* 2080 * A separate workqueue is used to handle the device handlers 2081 * to avoid overloading existing workqueue. Overloading the 2082 * old workqueue would also create a bottleneck in the 2083 * path of the storage hardware device activation. 2084 */ 2085 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", 2086 WQ_MEM_RECLAIM); 2087 if (!kmpath_handlerd) { 2088 DMERR("failed to create workqueue kmpath_handlerd"); 2089 r = -ENOMEM; 2090 goto bad_alloc_kmpath_handlerd; 2091 } 2092 2093 return 0; 2094 2095 bad_alloc_kmpath_handlerd: 2096 destroy_workqueue(kmultipathd); 2097 bad_alloc_kmultipathd: 2098 dm_unregister_target(&multipath_target); 2099 bad_register_target: 2100 kmem_cache_destroy(_mpio_cache); 2101 2102 return r; 2103 } 2104 2105 static void __exit dm_multipath_exit(void) 2106 { 2107 destroy_workqueue(kmpath_handlerd); 2108 destroy_workqueue(kmultipathd); 2109 2110 dm_unregister_target(&multipath_target); 2111 kmem_cache_destroy(_mpio_cache); 2112 } 2113 2114 module_init(dm_multipath_init); 2115 module_exit(dm_multipath_exit); 2116 2117 MODULE_DESCRIPTION(DM_NAME " multipath target"); 2118 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); 2119 MODULE_LICENSE("GPL"); 2120