/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include "dm-rq.h"
#include "dm-bio-record.h"
#include "dm-path-selector.h"
#include "dm-uevent.h"

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>

#define DM_MSG_PREFIX "multipath"
/* Delay (msecs) applied to a pg_init retry when no explicit delay was configured */
#define DM_PG_INIT_DELAY_MSECS 2000
/* Sentinel held in multipath.pg_init_delay_msecs meaning "no explicit delay configured" */
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

/* Path properties */
struct pgpath {
	struct list_head list;		/* Entry in the owning PG's pgpaths list */

	struct priority_group *pg;	/* Owning PG */
	unsigned fail_count;		/* Cumulative failure count */

	struct dm_path path;
	struct delayed_work activate_path;	/* Runs activate_path_work() on kmpath_handlerd */

	bool is_active:1;		/* Path status */
};

/* Map an embedded struct dm_path back to the pgpath that contains it */
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)

/*
 * Paths are grouped into Priority Groups and numbered from 1 upwards.
 * Each has a path selector which controls which path gets used.
 */
struct priority_group {
	struct list_head list;		/* Entry in multipath.priority_groups */

	struct multipath *m;		/* Owning multipath instance */
	struct path_selector ps;

	unsigned pg_num;		/* Reference number */
	unsigned nr_pgpaths;		/* Number of paths in PG */
	struct list_head pgpaths;

	bool bypassed:1;		/* Temporarily bypass this PG? */
};

/* Multipath context */
struct multipath {
	struct list_head list;
	struct dm_target *ti;

	const char *hw_handler_name;	/* Freed with kfree() in free_multipath() */
	char *hw_handler_params;	/* Freed with kfree() in free_multipath() */

	/* Held while updating current_pg/current_pgpath/next_pg and queued_bios */
	spinlock_t lock;

	unsigned nr_priority_groups;
	struct list_head priority_groups;

	wait_queue_head_t pg_init_wait;	/* Wait for pg_init completion */

	struct pgpath *current_pgpath;
	struct priority_group *current_pg;
	struct priority_group *next_pg;	/* Switch to this PG if set */

	unsigned long flags;		/* Multipath state flags */

	unsigned pg_init_retries;	/* Number of times to retry pg_init */
	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */

	atomic_t nr_valid_paths;	/* Total number of usable paths */
	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
	atomic_t pg_init_count;		/* Number of times pg_init called */

	enum dm_queue_mode queue_mode;	/* DM_TYPE_NONE until chosen by the constructor */

	struct mutex work_mutex;
	struct work_struct trigger_event;	/* Runs trigger_event() */

	/* bio-based mode only: bios parked here are resubmitted by process_queued_bios() */
	struct work_struct process_queued_bios;
	struct bio_list queued_bios;
};

/*
 * Context information attached to each io we process.
 */
struct dm_mpath_io {
	struct pgpath *pgpath;		/* Path the io was mapped to */
	size_t nr_bytes;		/* Size of the io, as passed to the path selector */
};

/* Per-path action applied by action_dev(), e.g. fail_path() or reinstate_path() */
typedef int (*action_fn) (struct pgpath *pgpath);

/*
 * kmultipathd runs process_queued_bios work; kmpath_handlerd runs the
 * per-path activate_path work.
 */
static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);

/*-----------------------------------------------
 * Multipath state flags.
 *-----------------------------------------------*/

#define MPATHF_QUEUE_IO 0			/* Must we queue all I/O? */
#define MPATHF_QUEUE_IF_NO_PATH 1		/* Queue I/O if last path fails? 
*/ 124 #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ 125 #define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ 126 #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ 127 #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ 128 #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ 129 130 /*----------------------------------------------- 131 * Allocation routines 132 *-----------------------------------------------*/ 133 134 static struct pgpath *alloc_pgpath(void) 135 { 136 struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); 137 138 if (pgpath) { 139 pgpath->is_active = true; 140 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); 141 } 142 143 return pgpath; 144 } 145 146 static void free_pgpath(struct pgpath *pgpath) 147 { 148 kfree(pgpath); 149 } 150 151 static struct priority_group *alloc_priority_group(void) 152 { 153 struct priority_group *pg; 154 155 pg = kzalloc(sizeof(*pg), GFP_KERNEL); 156 157 if (pg) 158 INIT_LIST_HEAD(&pg->pgpaths); 159 160 return pg; 161 } 162 163 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) 164 { 165 struct pgpath *pgpath, *tmp; 166 167 list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { 168 list_del(&pgpath->list); 169 dm_put_device(ti, pgpath->path.dev); 170 free_pgpath(pgpath); 171 } 172 } 173 174 static void free_priority_group(struct priority_group *pg, 175 struct dm_target *ti) 176 { 177 struct path_selector *ps = &pg->ps; 178 179 if (ps->type) { 180 ps->type->destroy(ps); 181 dm_put_path_selector(ps->type); 182 } 183 184 free_pgpaths(&pg->pgpaths, ti); 185 kfree(pg); 186 } 187 188 static struct multipath *alloc_multipath(struct dm_target *ti) 189 { 190 struct multipath *m; 191 192 m = kzalloc(sizeof(*m), GFP_KERNEL); 193 if (m) { 194 INIT_LIST_HEAD(&m->priority_groups); 195 spin_lock_init(&m->lock); 196 set_bit(MPATHF_QUEUE_IO, &m->flags); 197 
atomic_set(&m->nr_valid_paths, 0); 198 atomic_set(&m->pg_init_in_progress, 0); 199 atomic_set(&m->pg_init_count, 0); 200 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; 201 INIT_WORK(&m->trigger_event, trigger_event); 202 init_waitqueue_head(&m->pg_init_wait); 203 mutex_init(&m->work_mutex); 204 205 m->queue_mode = DM_TYPE_NONE; 206 207 m->ti = ti; 208 ti->private = m; 209 } 210 211 return m; 212 } 213 214 static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) 215 { 216 if (m->queue_mode == DM_TYPE_NONE) { 217 /* 218 * Default to request-based. 219 */ 220 if (dm_use_blk_mq(dm_table_get_md(ti->table))) 221 m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; 222 else 223 m->queue_mode = DM_TYPE_REQUEST_BASED; 224 } else if (m->queue_mode == DM_TYPE_BIO_BASED) { 225 INIT_WORK(&m->process_queued_bios, process_queued_bios); 226 /* 227 * bio-based doesn't support any direct scsi_dh management; 228 * it just discovers if a scsi_dh is attached. 229 */ 230 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); 231 } 232 233 dm_table_set_type(ti->table, m->queue_mode); 234 235 return 0; 236 } 237 238 static void free_multipath(struct multipath *m) 239 { 240 struct priority_group *pg, *tmp; 241 242 list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { 243 list_del(&pg->list); 244 free_priority_group(pg, m->ti); 245 } 246 247 kfree(m->hw_handler_name); 248 kfree(m->hw_handler_params); 249 kfree(m); 250 } 251 252 static struct dm_mpath_io *get_mpio(union map_info *info) 253 { 254 return info->ptr; 255 } 256 257 static size_t multipath_per_bio_data_size(void) 258 { 259 return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); 260 } 261 262 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) 263 { 264 return dm_per_bio_data(bio, multipath_per_bio_data_size()); 265 } 266 267 static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) 268 { 269 /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ 270 
struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 271 void *bio_details = mpio + 1; 272 273 return bio_details; 274 } 275 276 static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p) 277 { 278 struct dm_mpath_io *mpio = get_mpio_from_bio(bio); 279 struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); 280 281 memset(mpio, 0, sizeof(*mpio)); 282 memset(bio_details, 0, sizeof(*bio_details)); 283 dm_bio_record(bio_details, bio); 284 285 if (mpio_p) 286 *mpio_p = mpio; 287 } 288 289 /*----------------------------------------------- 290 * Path selection 291 *-----------------------------------------------*/ 292 293 static int __pg_init_all_paths(struct multipath *m) 294 { 295 struct pgpath *pgpath; 296 unsigned long pg_init_delay = 0; 297 298 lockdep_assert_held(&m->lock); 299 300 if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) 301 return 0; 302 303 atomic_inc(&m->pg_init_count); 304 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 305 306 /* Check here to reset pg_init_required */ 307 if (!m->current_pg) 308 return 0; 309 310 if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags)) 311 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? 
312 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); 313 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 314 /* Skip failed paths */ 315 if (!pgpath->is_active) 316 continue; 317 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, 318 pg_init_delay)) 319 atomic_inc(&m->pg_init_in_progress); 320 } 321 return atomic_read(&m->pg_init_in_progress); 322 } 323 324 static int pg_init_all_paths(struct multipath *m) 325 { 326 int ret; 327 unsigned long flags; 328 329 spin_lock_irqsave(&m->lock, flags); 330 ret = __pg_init_all_paths(m); 331 spin_unlock_irqrestore(&m->lock, flags); 332 333 return ret; 334 } 335 336 static void __switch_pg(struct multipath *m, struct priority_group *pg) 337 { 338 m->current_pg = pg; 339 340 /* Must we initialise the PG first, and queue I/O till it's ready? */ 341 if (m->hw_handler_name) { 342 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 343 set_bit(MPATHF_QUEUE_IO, &m->flags); 344 } else { 345 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 346 clear_bit(MPATHF_QUEUE_IO, &m->flags); 347 } 348 349 atomic_set(&m->pg_init_count, 0); 350 } 351 352 static struct pgpath *choose_path_in_pg(struct multipath *m, 353 struct priority_group *pg, 354 size_t nr_bytes) 355 { 356 unsigned long flags; 357 struct dm_path *path; 358 struct pgpath *pgpath; 359 360 path = pg->ps.type->select_path(&pg->ps, nr_bytes); 361 if (!path) 362 return ERR_PTR(-ENXIO); 363 364 pgpath = path_to_pgpath(path); 365 366 if (unlikely(READ_ONCE(m->current_pg) != pg)) { 367 /* Only update current_pgpath if pg changed */ 368 spin_lock_irqsave(&m->lock, flags); 369 m->current_pgpath = pgpath; 370 __switch_pg(m, pg); 371 spin_unlock_irqrestore(&m->lock, flags); 372 } 373 374 return pgpath; 375 } 376 377 static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) 378 { 379 unsigned long flags; 380 struct priority_group *pg; 381 struct pgpath *pgpath; 382 unsigned bypassed = 1; 383 384 if (!atomic_read(&m->nr_valid_paths)) { 385 
clear_bit(MPATHF_QUEUE_IO, &m->flags); 386 goto failed; 387 } 388 389 /* Were we instructed to switch PG? */ 390 if (READ_ONCE(m->next_pg)) { 391 spin_lock_irqsave(&m->lock, flags); 392 pg = m->next_pg; 393 if (!pg) { 394 spin_unlock_irqrestore(&m->lock, flags); 395 goto check_current_pg; 396 } 397 m->next_pg = NULL; 398 spin_unlock_irqrestore(&m->lock, flags); 399 pgpath = choose_path_in_pg(m, pg, nr_bytes); 400 if (!IS_ERR_OR_NULL(pgpath)) 401 return pgpath; 402 } 403 404 /* Don't change PG until it has no remaining paths */ 405 check_current_pg: 406 pg = READ_ONCE(m->current_pg); 407 if (pg) { 408 pgpath = choose_path_in_pg(m, pg, nr_bytes); 409 if (!IS_ERR_OR_NULL(pgpath)) 410 return pgpath; 411 } 412 413 /* 414 * Loop through priority groups until we find a valid path. 415 * First time we skip PGs marked 'bypassed'. 416 * Second time we only try the ones we skipped, but set 417 * pg_init_delay_retry so we do not hammer controllers. 418 */ 419 do { 420 list_for_each_entry(pg, &m->priority_groups, list) { 421 if (pg->bypassed == !!bypassed) 422 continue; 423 pgpath = choose_path_in_pg(m, pg, nr_bytes); 424 if (!IS_ERR_OR_NULL(pgpath)) { 425 if (!bypassed) 426 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 427 return pgpath; 428 } 429 } 430 } while (bypassed--); 431 432 failed: 433 spin_lock_irqsave(&m->lock, flags); 434 m->current_pgpath = NULL; 435 m->current_pg = NULL; 436 spin_unlock_irqrestore(&m->lock, flags); 437 438 return NULL; 439 } 440 441 /* 442 * dm_report_EIO() is a macro instead of a function to make pr_debug() 443 * report the function name and line number of the function from which 444 * it has been invoked. 
445 */ 446 #define dm_report_EIO(m) \ 447 do { \ 448 struct mapped_device *md = dm_table_get_md((m)->ti->table); \ 449 \ 450 pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ 451 dm_device_name(md), \ 452 test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ 453 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ 454 dm_noflush_suspending((m)->ti)); \ 455 } while (0) 456 457 /* 458 * Check whether bios must be queued in the device-mapper core rather 459 * than here in the target. 460 * 461 * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold 462 * the same value then we are not between multipath_presuspend() 463 * and multipath_resume() calls and we have no need to check 464 * for the DMF_NOFLUSH_SUSPENDING flag. 465 */ 466 static bool __must_push_back(struct multipath *m, unsigned long flags) 467 { 468 return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) != 469 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) && 470 dm_noflush_suspending(m->ti)); 471 } 472 473 /* 474 * Following functions use READ_ONCE to get atomic access to 475 * all m->flags to avoid taking spinlock 476 */ 477 static bool must_push_back_rq(struct multipath *m) 478 { 479 unsigned long flags = READ_ONCE(m->flags); 480 return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags); 481 } 482 483 static bool must_push_back_bio(struct multipath *m) 484 { 485 unsigned long flags = READ_ONCE(m->flags); 486 return __must_push_back(m, flags); 487 } 488 489 /* 490 * Map cloned requests (request-based multipath) 491 */ 492 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, 493 union map_info *map_context, 494 struct request **__clone) 495 { 496 struct multipath *m = ti->private; 497 size_t nr_bytes = blk_rq_bytes(rq); 498 struct pgpath *pgpath; 499 struct block_device *bdev; 500 struct dm_mpath_io *mpio = get_mpio(map_context); 501 struct request_queue *q; 502 struct request *clone; 503 504 /* Do we need to select a new pgpath? 
*/ 505 pgpath = READ_ONCE(m->current_pgpath); 506 if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) 507 pgpath = choose_pgpath(m, nr_bytes); 508 509 if (!pgpath) { 510 if (must_push_back_rq(m)) 511 return DM_MAPIO_DELAY_REQUEUE; 512 dm_report_EIO(m); /* Failed */ 513 return DM_MAPIO_KILL; 514 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || 515 test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 516 if (pg_init_all_paths(m)) 517 return DM_MAPIO_DELAY_REQUEUE; 518 return DM_MAPIO_REQUEUE; 519 } 520 521 memset(mpio, 0, sizeof(*mpio)); 522 mpio->pgpath = pgpath; 523 mpio->nr_bytes = nr_bytes; 524 525 bdev = pgpath->path.dev->bdev; 526 q = bdev_get_queue(bdev); 527 clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); 528 if (IS_ERR(clone)) { 529 /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ 530 bool queue_dying = blk_queue_dying(q); 531 if (queue_dying) { 532 atomic_inc(&m->pg_init_in_progress); 533 activate_or_offline_path(pgpath); 534 } 535 return DM_MAPIO_DELAY_REQUEUE; 536 } 537 clone->bio = clone->biotail = NULL; 538 clone->rq_disk = bdev->bd_disk; 539 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 540 *__clone = clone; 541 542 if (pgpath->pg->ps.type->start_io) 543 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, 544 &pgpath->path, 545 nr_bytes); 546 return DM_MAPIO_REMAPPED; 547 } 548 549 static void multipath_release_clone(struct request *clone) 550 { 551 blk_put_request(clone); 552 } 553 554 /* 555 * Map cloned bios (bio-based multipath) 556 */ 557 static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) 558 { 559 size_t nr_bytes = bio->bi_iter.bi_size; 560 struct pgpath *pgpath; 561 unsigned long flags; 562 bool queue_io; 563 564 /* Do we need to select a new pgpath? 
*/ 565 pgpath = READ_ONCE(m->current_pgpath); 566 queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); 567 if (!pgpath || !queue_io) 568 pgpath = choose_pgpath(m, nr_bytes); 569 570 if ((pgpath && queue_io) || 571 (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { 572 /* Queue for the daemon to resubmit */ 573 spin_lock_irqsave(&m->lock, flags); 574 bio_list_add(&m->queued_bios, bio); 575 spin_unlock_irqrestore(&m->lock, flags); 576 /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ 577 if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 578 pg_init_all_paths(m); 579 else if (!queue_io) 580 queue_work(kmultipathd, &m->process_queued_bios); 581 return DM_MAPIO_SUBMITTED; 582 } 583 584 if (!pgpath) { 585 if (must_push_back_bio(m)) 586 return DM_MAPIO_REQUEUE; 587 dm_report_EIO(m); 588 return DM_MAPIO_KILL; 589 } 590 591 mpio->pgpath = pgpath; 592 mpio->nr_bytes = nr_bytes; 593 594 bio->bi_status = 0; 595 bio_set_dev(bio, pgpath->path.dev->bdev); 596 bio->bi_opf |= REQ_FAILFAST_TRANSPORT; 597 598 if (pgpath->pg->ps.type->start_io) 599 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, 600 &pgpath->path, 601 nr_bytes); 602 return DM_MAPIO_REMAPPED; 603 } 604 605 static int multipath_map_bio(struct dm_target *ti, struct bio *bio) 606 { 607 struct multipath *m = ti->private; 608 struct dm_mpath_io *mpio = NULL; 609 610 multipath_init_per_bio_data(bio, &mpio); 611 return __multipath_map_bio(m, bio, mpio); 612 } 613 614 static void process_queued_io_list(struct multipath *m) 615 { 616 if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) 617 dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); 618 else if (m->queue_mode == DM_TYPE_BIO_BASED) 619 queue_work(kmultipathd, &m->process_queued_bios); 620 } 621 622 static void process_queued_bios(struct work_struct *work) 623 { 624 int r; 625 unsigned long flags; 626 struct bio *bio; 627 struct bio_list bios; 628 struct blk_plug plug; 629 struct multipath *m = 630 container_of(work, struct multipath, 
process_queued_bios); 631 632 bio_list_init(&bios); 633 634 spin_lock_irqsave(&m->lock, flags); 635 636 if (bio_list_empty(&m->queued_bios)) { 637 spin_unlock_irqrestore(&m->lock, flags); 638 return; 639 } 640 641 bio_list_merge(&bios, &m->queued_bios); 642 bio_list_init(&m->queued_bios); 643 644 spin_unlock_irqrestore(&m->lock, flags); 645 646 blk_start_plug(&plug); 647 while ((bio = bio_list_pop(&bios))) { 648 r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); 649 switch (r) { 650 case DM_MAPIO_KILL: 651 bio->bi_status = BLK_STS_IOERR; 652 bio_endio(bio); 653 break; 654 case DM_MAPIO_REQUEUE: 655 bio->bi_status = BLK_STS_DM_REQUEUE; 656 bio_endio(bio); 657 break; 658 case DM_MAPIO_REMAPPED: 659 generic_make_request(bio); 660 break; 661 case 0: 662 break; 663 default: 664 WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r); 665 } 666 } 667 blk_finish_plug(&plug); 668 } 669 670 /* 671 * If we run out of usable paths, should we queue I/O or error it? 672 */ 673 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, 674 bool save_old_value) 675 { 676 unsigned long flags; 677 678 spin_lock_irqsave(&m->lock, flags); 679 assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, 680 (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || 681 (!save_old_value && queue_if_no_path)); 682 assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); 683 spin_unlock_irqrestore(&m->lock, flags); 684 685 if (!queue_if_no_path) { 686 dm_table_run_md_queue_async(m->ti->table); 687 process_queued_io_list(m); 688 } 689 690 return 0; 691 } 692 693 /* 694 * An event is triggered whenever a path is taken out of use. 695 * Includes path failure and PG bypass. 
696 */ 697 static void trigger_event(struct work_struct *work) 698 { 699 struct multipath *m = 700 container_of(work, struct multipath, trigger_event); 701 702 dm_table_event(m->ti->table); 703 } 704 705 /*----------------------------------------------------------------- 706 * Constructor/argument parsing: 707 * <#multipath feature args> [<arg>]* 708 * <#hw_handler args> [hw_handler [<arg>]*] 709 * <#priority groups> 710 * <initial priority group> 711 * [<selector> <#selector args> [<arg>]* 712 * <#paths> <#per-path selector args> 713 * [<path> [<arg>]* ]+ ]+ 714 *---------------------------------------------------------------*/ 715 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, 716 struct dm_target *ti) 717 { 718 int r; 719 struct path_selector_type *pst; 720 unsigned ps_argc; 721 722 static const struct dm_arg _args[] = { 723 {0, 1024, "invalid number of path selector args"}, 724 }; 725 726 pst = dm_get_path_selector(dm_shift_arg(as)); 727 if (!pst) { 728 ti->error = "unknown path selector type"; 729 return -EINVAL; 730 } 731 732 r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); 733 if (r) { 734 dm_put_path_selector(pst); 735 return -EINVAL; 736 } 737 738 r = pst->create(&pg->ps, ps_argc, as->argv); 739 if (r) { 740 dm_put_path_selector(pst); 741 ti->error = "path selector constructor failed"; 742 return r; 743 } 744 745 pg->ps.type = pst; 746 dm_consume_args(as, ps_argc); 747 748 return 0; 749 } 750 751 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, 752 struct dm_target *ti) 753 { 754 int r; 755 struct pgpath *p; 756 struct multipath *m = ti->private; 757 struct request_queue *q = NULL; 758 const char *attached_handler_name; 759 760 /* we need at least a path arg */ 761 if (as->argc < 1) { 762 ti->error = "no device given"; 763 return ERR_PTR(-EINVAL); 764 } 765 766 p = alloc_pgpath(); 767 if (!p) 768 return ERR_PTR(-ENOMEM); 769 770 r = dm_get_device(ti, dm_shift_arg(as), 
dm_table_get_mode(ti->table), 771 &p->path.dev); 772 if (r) { 773 ti->error = "error getting device"; 774 goto bad; 775 } 776 777 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name) 778 q = bdev_get_queue(p->path.dev->bdev); 779 780 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { 781 retain: 782 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); 783 if (attached_handler_name) { 784 /* 785 * Clear any hw_handler_params associated with a 786 * handler that isn't already attached. 787 */ 788 if (m->hw_handler_name && strcmp(attached_handler_name, m->hw_handler_name)) { 789 kfree(m->hw_handler_params); 790 m->hw_handler_params = NULL; 791 } 792 793 /* 794 * Reset hw_handler_name to match the attached handler 795 * 796 * NB. This modifies the table line to show the actual 797 * handler instead of the original table passed in. 798 */ 799 kfree(m->hw_handler_name); 800 m->hw_handler_name = attached_handler_name; 801 } 802 } 803 804 if (m->hw_handler_name) { 805 r = scsi_dh_attach(q, m->hw_handler_name); 806 if (r == -EBUSY) { 807 char b[BDEVNAME_SIZE]; 808 809 printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", 810 bdevname(p->path.dev->bdev, b)); 811 goto retain; 812 } 813 if (r < 0) { 814 ti->error = "error attaching hardware handler"; 815 dm_put_device(ti, p->path.dev); 816 goto bad; 817 } 818 819 if (m->hw_handler_params) { 820 r = scsi_dh_set_params(q, m->hw_handler_params); 821 if (r < 0) { 822 ti->error = "unable to set hardware " 823 "handler parameters"; 824 dm_put_device(ti, p->path.dev); 825 goto bad; 826 } 827 } 828 } 829 830 r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); 831 if (r) { 832 dm_put_device(ti, p->path.dev); 833 goto bad; 834 } 835 836 return p; 837 838 bad: 839 free_pgpath(p); 840 return ERR_PTR(r); 841 } 842 843 static struct priority_group *parse_priority_group(struct dm_arg_set *as, 844 struct multipath *m) 845 { 846 static const struct dm_arg 
_args[] = { 847 {1, 1024, "invalid number of paths"}, 848 {0, 1024, "invalid number of selector args"} 849 }; 850 851 int r; 852 unsigned i, nr_selector_args, nr_args; 853 struct priority_group *pg; 854 struct dm_target *ti = m->ti; 855 856 if (as->argc < 2) { 857 as->argc = 0; 858 ti->error = "not enough priority group arguments"; 859 return ERR_PTR(-EINVAL); 860 } 861 862 pg = alloc_priority_group(); 863 if (!pg) { 864 ti->error = "couldn't allocate priority group"; 865 return ERR_PTR(-ENOMEM); 866 } 867 pg->m = m; 868 869 r = parse_path_selector(as, pg, ti); 870 if (r) 871 goto bad; 872 873 /* 874 * read the paths 875 */ 876 r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); 877 if (r) 878 goto bad; 879 880 r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); 881 if (r) 882 goto bad; 883 884 nr_args = 1 + nr_selector_args; 885 for (i = 0; i < pg->nr_pgpaths; i++) { 886 struct pgpath *pgpath; 887 struct dm_arg_set path_args; 888 889 if (as->argc < nr_args) { 890 ti->error = "not enough path parameters"; 891 r = -EINVAL; 892 goto bad; 893 } 894 895 path_args.argc = nr_args; 896 path_args.argv = as->argv; 897 898 pgpath = parse_path(&path_args, &pg->ps, ti); 899 if (IS_ERR(pgpath)) { 900 r = PTR_ERR(pgpath); 901 goto bad; 902 } 903 904 pgpath->pg = pg; 905 list_add_tail(&pgpath->list, &pg->pgpaths); 906 dm_consume_args(as, nr_args); 907 } 908 909 return pg; 910 911 bad: 912 free_priority_group(pg, ti); 913 return ERR_PTR(r); 914 } 915 916 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) 917 { 918 unsigned hw_argc; 919 int ret; 920 struct dm_target *ti = m->ti; 921 922 static const struct dm_arg _args[] = { 923 {0, 1024, "invalid number of hardware handler args"}, 924 }; 925 926 if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) 927 return -EINVAL; 928 929 if (!hw_argc) 930 return 0; 931 932 if (m->queue_mode == DM_TYPE_BIO_BASED) { 933 dm_consume_args(as, hw_argc); 934 DMERR("bio-based multipath doesn't allow hardware 
handler args"); 935 return 0; 936 } 937 938 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); 939 if (!m->hw_handler_name) 940 return -EINVAL; 941 942 if (hw_argc > 1) { 943 char *p; 944 int i, j, len = 4; 945 946 for (i = 0; i <= hw_argc - 2; i++) 947 len += strlen(as->argv[i]) + 1; 948 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL); 949 if (!p) { 950 ti->error = "memory allocation failed"; 951 ret = -ENOMEM; 952 goto fail; 953 } 954 j = sprintf(p, "%d", hw_argc - 1); 955 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) 956 j = sprintf(p, "%s", as->argv[i]); 957 } 958 dm_consume_args(as, hw_argc - 1); 959 960 return 0; 961 fail: 962 kfree(m->hw_handler_name); 963 m->hw_handler_name = NULL; 964 return ret; 965 } 966 967 static int parse_features(struct dm_arg_set *as, struct multipath *m) 968 { 969 int r; 970 unsigned argc; 971 struct dm_target *ti = m->ti; 972 const char *arg_name; 973 974 static const struct dm_arg _args[] = { 975 {0, 8, "invalid number of feature args"}, 976 {1, 50, "pg_init_retries must be between 1 and 50"}, 977 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 978 }; 979 980 r = dm_read_arg_group(_args, as, &argc, &ti->error); 981 if (r) 982 return -EINVAL; 983 984 if (!argc) 985 return 0; 986 987 do { 988 arg_name = dm_shift_arg(as); 989 argc--; 990 991 if (!strcasecmp(arg_name, "queue_if_no_path")) { 992 r = queue_if_no_path(m, true, false); 993 continue; 994 } 995 996 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { 997 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); 998 continue; 999 } 1000 1001 if (!strcasecmp(arg_name, "pg_init_retries") && 1002 (argc >= 1)) { 1003 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); 1004 argc--; 1005 continue; 1006 } 1007 1008 if (!strcasecmp(arg_name, "pg_init_delay_msecs") && 1009 (argc >= 1)) { 1010 r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); 1011 argc--; 1012 continue; 1013 } 1014 1015 if (!strcasecmp(arg_name, 
"queue_mode") && 1016 (argc >= 1)) { 1017 const char *queue_mode_name = dm_shift_arg(as); 1018 1019 if (!strcasecmp(queue_mode_name, "bio")) 1020 m->queue_mode = DM_TYPE_BIO_BASED; 1021 else if (!strcasecmp(queue_mode_name, "rq")) 1022 m->queue_mode = DM_TYPE_REQUEST_BASED; 1023 else if (!strcasecmp(queue_mode_name, "mq")) 1024 m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; 1025 else { 1026 ti->error = "Unknown 'queue_mode' requested"; 1027 r = -EINVAL; 1028 } 1029 argc--; 1030 continue; 1031 } 1032 1033 ti->error = "Unrecognised multipath feature request"; 1034 r = -EINVAL; 1035 } while (argc && !r); 1036 1037 return r; 1038 } 1039 1040 static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) 1041 { 1042 /* target arguments */ 1043 static const struct dm_arg _args[] = { 1044 {0, 1024, "invalid number of priority groups"}, 1045 {0, 1024, "invalid initial priority group number"}, 1046 }; 1047 1048 int r; 1049 struct multipath *m; 1050 struct dm_arg_set as; 1051 unsigned pg_count = 0; 1052 unsigned next_pg_num; 1053 1054 as.argc = argc; 1055 as.argv = argv; 1056 1057 m = alloc_multipath(ti); 1058 if (!m) { 1059 ti->error = "can't allocate multipath"; 1060 return -EINVAL; 1061 } 1062 1063 r = parse_features(&as, m); 1064 if (r) 1065 goto bad; 1066 1067 r = alloc_multipath_stage2(ti, m); 1068 if (r) 1069 goto bad; 1070 1071 r = parse_hw_handler(&as, m); 1072 if (r) 1073 goto bad; 1074 1075 r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); 1076 if (r) 1077 goto bad; 1078 1079 r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); 1080 if (r) 1081 goto bad; 1082 1083 if ((!m->nr_priority_groups && next_pg_num) || 1084 (m->nr_priority_groups && !next_pg_num)) { 1085 ti->error = "invalid initial priority group"; 1086 r = -EINVAL; 1087 goto bad; 1088 } 1089 1090 /* parse the priority groups */ 1091 while (as.argc) { 1092 struct priority_group *pg; 1093 unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths); 1094 1095 pg = 
parse_priority_group(&as, m); 1096 if (IS_ERR(pg)) { 1097 r = PTR_ERR(pg); 1098 goto bad; 1099 } 1100 1101 nr_valid_paths += pg->nr_pgpaths; 1102 atomic_set(&m->nr_valid_paths, nr_valid_paths); 1103 1104 list_add_tail(&pg->list, &m->priority_groups); 1105 pg_count++; 1106 pg->pg_num = pg_count; 1107 if (!--next_pg_num) 1108 m->next_pg = pg; 1109 } 1110 1111 if (pg_count != m->nr_priority_groups) { 1112 ti->error = "priority group count mismatch"; 1113 r = -EINVAL; 1114 goto bad; 1115 } 1116 1117 ti->num_flush_bios = 1; 1118 ti->num_discard_bios = 1; 1119 ti->num_write_same_bios = 1; 1120 ti->num_write_zeroes_bios = 1; 1121 if (m->queue_mode == DM_TYPE_BIO_BASED) 1122 ti->per_io_data_size = multipath_per_bio_data_size(); 1123 else 1124 ti->per_io_data_size = sizeof(struct dm_mpath_io); 1125 1126 return 0; 1127 1128 bad: 1129 free_multipath(m); 1130 return r; 1131 } 1132 1133 static void multipath_wait_for_pg_init_completion(struct multipath *m) 1134 { 1135 DEFINE_WAIT(wait); 1136 1137 while (1) { 1138 prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE); 1139 1140 if (!atomic_read(&m->pg_init_in_progress)) 1141 break; 1142 1143 io_schedule(); 1144 } 1145 finish_wait(&m->pg_init_wait, &wait); 1146 } 1147 1148 static void flush_multipath_work(struct multipath *m) 1149 { 1150 set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); 1151 smp_mb__after_atomic(); 1152 1153 flush_workqueue(kmpath_handlerd); 1154 multipath_wait_for_pg_init_completion(m); 1155 flush_workqueue(kmultipathd); 1156 flush_work(&m->trigger_event); 1157 1158 clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); 1159 smp_mb__after_atomic(); 1160 } 1161 1162 static void multipath_dtr(struct dm_target *ti) 1163 { 1164 struct multipath *m = ti->private; 1165 1166 flush_multipath_work(m); 1167 free_multipath(m); 1168 } 1169 1170 /* 1171 * Take a path out of use. 
*/
static int fail_path(struct pgpath *pgpath)
{
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	/* Already failed: nothing to do */
	if (!pgpath->is_active)
		goto out;

	DMWARN("Failing path %s.", pgpath->path.dev->name);

	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
	pgpath->is_active = false;
	pgpath->fail_count++;

	atomic_dec(&m->nr_valid_paths);

	/* Force a fresh path selection if the failed path was the one in use */
	if (pgpath == m->current_pgpath)
		m->current_pgpath = NULL;

	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
		       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);

	return 0;
}

/*
 * Reinstate a previously-failed path
 *
 * Returns 0 if the path is already active or was reinstated, otherwise
 * the error returned by the path selector's reinstate_path().
 */
static int reinstate_path(struct pgpath *pgpath)
{
	int r = 0, run_queue = 0;
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;
	unsigned nr_valid_paths;

	spin_lock_irqsave(&m->lock, flags);

	if (pgpath->is_active)
		goto out;

	DMWARN("Reinstating path %s.", pgpath->path.dev->name);

	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
	if (r)
		goto out;

	pgpath->is_active = true;

	nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
	if (nr_valid_paths == 1) {
		/* First usable path again: reselect and restart held-back I/O */
		m->current_pgpath = NULL;
		run_queue = 1;
	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
		/* Path belongs to the PG in use: activate it via the hw handler */
		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
			atomic_inc(&m->pg_init_in_progress);
	}

	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
		       pgpath->path.dev->name, nr_valid_paths);

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);
	/* Restart the queues only after m->lock has been dropped */
	if (run_queue) {
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_io_list(m);
	}

	return r;
}

/*
 * Fail or reinstate all paths that match the provided struct dm_dev.
 *
 * Returns -EINVAL if no path uses @dev, otherwise the result of @action
 * on the last matching path.
 */
static int action_dev(struct multipath *m, struct dm_dev *dev,
		      action_fn action)
{
	int r = -EINVAL;
	struct pgpath *pgpath;
	struct priority_group *pg;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(pgpath, &pg->pgpaths, list) {
			if (pgpath->path.dev == dev)
				r = action(pgpath);
		}
	}

	return r;
}

/*
 * Temporarily try to avoid having to use the specified PG
 *
 * Sets or clears the PG's bypassed flag and drops the current PG and path
 * so that the next I/O selects again.
 */
static void bypass_pg(struct multipath *m, struct priority_group *pg,
		      bool bypassed)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	pg->bypassed = bypassed;
	m->current_pgpath = NULL;
	m->current_pg = NULL;

	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
}

/*
 * Switch to using the specified PG from the next I/O that gets mapped
 *
 * @pgstr must hold a single decimal PG number between 1 and
 * m->nr_priority_groups; otherwise -EINVAL is returned.
 */
static int switch_pg_num(struct multipath *m, const char *pgstr)
{
	struct priority_group *pg;
	unsigned pgnum;
	unsigned long flags;
	char dummy;

	/* The trailing %c makes sscanf() return 2 if anything follows the number */
	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to switch_pg_num");
		return -EINVAL;
	}

	spin_lock_irqsave(&m->lock, flags);
	list_for_each_entry(pg, &m->priority_groups, list) {
		/* Every PG has its bypassed flag cleared, not just the target */
		pg->bypassed = false;
		if (--pgnum)
			continue;

		m->current_pgpath = NULL;
		m->current_pg = NULL;
		m->next_pg = pg;
	}
	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
	return 0;
}

/*
 * Set/clear bypassed status of a PG.
 * PGs are numbered upwards from 1 in the order they were declared.
1326 */ 1327 static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) 1328 { 1329 struct priority_group *pg; 1330 unsigned pgnum; 1331 char dummy; 1332 1333 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || 1334 !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { 1335 DMWARN("invalid PG number supplied to bypass_pg"); 1336 return -EINVAL; 1337 } 1338 1339 list_for_each_entry(pg, &m->priority_groups, list) { 1340 if (!--pgnum) 1341 break; 1342 } 1343 1344 bypass_pg(m, pg, bypassed); 1345 return 0; 1346 } 1347 1348 /* 1349 * Should we retry pg_init immediately? 1350 */ 1351 static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) 1352 { 1353 unsigned long flags; 1354 bool limit_reached = false; 1355 1356 spin_lock_irqsave(&m->lock, flags); 1357 1358 if (atomic_read(&m->pg_init_count) <= m->pg_init_retries && 1359 !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) 1360 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); 1361 else 1362 limit_reached = true; 1363 1364 spin_unlock_irqrestore(&m->lock, flags); 1365 1366 return limit_reached; 1367 } 1368 1369 static void pg_init_done(void *data, int errors) 1370 { 1371 struct pgpath *pgpath = data; 1372 struct priority_group *pg = pgpath->pg; 1373 struct multipath *m = pg->m; 1374 unsigned long flags; 1375 bool delay_retry = false; 1376 1377 /* device or driver problems */ 1378 switch (errors) { 1379 case SCSI_DH_OK: 1380 break; 1381 case SCSI_DH_NOSYS: 1382 if (!m->hw_handler_name) { 1383 errors = 0; 1384 break; 1385 } 1386 DMERR("Could not failover the device: Handler scsi_dh_%s " 1387 "Error %d.", m->hw_handler_name, errors); 1388 /* 1389 * Fail path for now, so we do not ping pong 1390 */ 1391 fail_path(pgpath); 1392 break; 1393 case SCSI_DH_DEV_TEMP_BUSY: 1394 /* 1395 * Probably doing something like FW upgrade on the 1396 * controller so try the other pg. 
1397 */ 1398 bypass_pg(m, pg, true); 1399 break; 1400 case SCSI_DH_RETRY: 1401 /* Wait before retrying. */ 1402 delay_retry = 1; 1403 /* fall through */ 1404 case SCSI_DH_IMM_RETRY: 1405 case SCSI_DH_RES_TEMP_UNAVAIL: 1406 if (pg_init_limit_reached(m, pgpath)) 1407 fail_path(pgpath); 1408 errors = 0; 1409 break; 1410 case SCSI_DH_DEV_OFFLINED: 1411 default: 1412 /* 1413 * We probably do not want to fail the path for a device 1414 * error, but this is what the old dm did. In future 1415 * patches we can do more advanced handling. 1416 */ 1417 fail_path(pgpath); 1418 } 1419 1420 spin_lock_irqsave(&m->lock, flags); 1421 if (errors) { 1422 if (pgpath == m->current_pgpath) { 1423 DMERR("Could not failover device. Error %d.", errors); 1424 m->current_pgpath = NULL; 1425 m->current_pg = NULL; 1426 } 1427 } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 1428 pg->bypassed = false; 1429 1430 if (atomic_dec_return(&m->pg_init_in_progress) > 0) 1431 /* Activations of other paths are still on going */ 1432 goto out; 1433 1434 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 1435 if (delay_retry) 1436 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 1437 else 1438 clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 1439 1440 if (__pg_init_all_paths(m)) 1441 goto out; 1442 } 1443 clear_bit(MPATHF_QUEUE_IO, &m->flags); 1444 1445 process_queued_io_list(m); 1446 1447 /* 1448 * Wake up any thread waiting to suspend. 
1449 */ 1450 wake_up(&m->pg_init_wait); 1451 1452 out: 1453 spin_unlock_irqrestore(&m->lock, flags); 1454 } 1455 1456 static void activate_or_offline_path(struct pgpath *pgpath) 1457 { 1458 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1459 1460 if (pgpath->is_active && !blk_queue_dying(q)) 1461 scsi_dh_activate(q, pg_init_done, pgpath); 1462 else 1463 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); 1464 } 1465 1466 static void activate_path_work(struct work_struct *work) 1467 { 1468 struct pgpath *pgpath = 1469 container_of(work, struct pgpath, activate_path.work); 1470 1471 activate_or_offline_path(pgpath); 1472 } 1473 1474 static int noretry_error(blk_status_t error) 1475 { 1476 switch (error) { 1477 case BLK_STS_NOTSUPP: 1478 case BLK_STS_NOSPC: 1479 case BLK_STS_TARGET: 1480 case BLK_STS_NEXUS: 1481 case BLK_STS_MEDIUM: 1482 return 1; 1483 } 1484 1485 /* Anything else could be a path failure, so should be retried */ 1486 return 0; 1487 } 1488 1489 static int multipath_end_io(struct dm_target *ti, struct request *clone, 1490 blk_status_t error, union map_info *map_context) 1491 { 1492 struct dm_mpath_io *mpio = get_mpio(map_context); 1493 struct pgpath *pgpath = mpio->pgpath; 1494 int r = DM_ENDIO_DONE; 1495 1496 /* 1497 * We don't queue any clone request inside the multipath target 1498 * during end I/O handling, since those clone requests don't have 1499 * bio clones. If we queue them inside the multipath target, 1500 * we need to make bio clones, that requires memory allocation. 1501 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests 1502 * don't have bio clones.) 1503 * Instead of queueing the clone request here, we queue the original 1504 * request into dm core, which will remake a clone request and 1505 * clone bios for it and resubmit it later. 
1506 */ 1507 if (error && !noretry_error(error)) { 1508 struct multipath *m = ti->private; 1509 1510 r = DM_ENDIO_REQUEUE; 1511 1512 if (pgpath) 1513 fail_path(pgpath); 1514 1515 if (atomic_read(&m->nr_valid_paths) == 0 && 1516 !must_push_back_rq(m)) { 1517 if (error == BLK_STS_IOERR) 1518 dm_report_EIO(m); 1519 /* complete with the original error */ 1520 r = DM_ENDIO_DONE; 1521 } 1522 } 1523 1524 if (pgpath) { 1525 struct path_selector *ps = &pgpath->pg->ps; 1526 1527 if (ps->type->end_io) 1528 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1529 } 1530 1531 return r; 1532 } 1533 1534 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, 1535 blk_status_t *error) 1536 { 1537 struct multipath *m = ti->private; 1538 struct dm_mpath_io *mpio = get_mpio_from_bio(clone); 1539 struct pgpath *pgpath = mpio->pgpath; 1540 unsigned long flags; 1541 int r = DM_ENDIO_DONE; 1542 1543 if (!*error || noretry_error(*error)) 1544 goto done; 1545 1546 if (pgpath) 1547 fail_path(pgpath); 1548 1549 if (atomic_read(&m->nr_valid_paths) == 0 && 1550 !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 1551 if (must_push_back_bio(m)) { 1552 r = DM_ENDIO_REQUEUE; 1553 } else { 1554 dm_report_EIO(m); 1555 *error = BLK_STS_IOERR; 1556 } 1557 goto done; 1558 } 1559 1560 /* Queue for the daemon to resubmit */ 1561 dm_bio_restore(get_bio_details_from_bio(clone), clone); 1562 1563 spin_lock_irqsave(&m->lock, flags); 1564 bio_list_add(&m->queued_bios, clone); 1565 spin_unlock_irqrestore(&m->lock, flags); 1566 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) 1567 queue_work(kmultipathd, &m->process_queued_bios); 1568 1569 r = DM_ENDIO_INCOMPLETE; 1570 done: 1571 if (pgpath) { 1572 struct path_selector *ps = &pgpath->pg->ps; 1573 1574 if (ps->type->end_io) 1575 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1576 } 1577 1578 return r; 1579 } 1580 1581 /* 1582 * Suspend can't complete until all the I/O is processed so if 1583 * the last path fails we must error any remaining I/O. 
1584 * Note that if the freeze_bdev fails while suspending, the 1585 * queue_if_no_path state is lost - userspace should reset it. 1586 */ 1587 static void multipath_presuspend(struct dm_target *ti) 1588 { 1589 struct multipath *m = ti->private; 1590 1591 queue_if_no_path(m, false, true); 1592 } 1593 1594 static void multipath_postsuspend(struct dm_target *ti) 1595 { 1596 struct multipath *m = ti->private; 1597 1598 mutex_lock(&m->work_mutex); 1599 flush_multipath_work(m); 1600 mutex_unlock(&m->work_mutex); 1601 } 1602 1603 /* 1604 * Restore the queue_if_no_path setting. 1605 */ 1606 static void multipath_resume(struct dm_target *ti) 1607 { 1608 struct multipath *m = ti->private; 1609 unsigned long flags; 1610 1611 spin_lock_irqsave(&m->lock, flags); 1612 assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, 1613 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); 1614 spin_unlock_irqrestore(&m->lock, flags); 1615 } 1616 1617 /* 1618 * Info output has the following format: 1619 * num_multipath_feature_args [multipath_feature_args]* 1620 * num_handler_status_args [handler_status_args]* 1621 * num_groups init_group_number 1622 * [A|D|E num_ps_status_args [ps_status_args]* 1623 * num_paths num_selector_args 1624 * [path_dev A|F fail_count [selector_args]* ]+ ]+ 1625 * 1626 * Table output has the following format (identical to the constructor string): 1627 * num_feature_args [features_args]* 1628 * num_handler_args hw_handler [hw_handler_args]* 1629 * num_groups init_group_number 1630 * [priority selector-name num_ps_args [ps_args]* 1631 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ 1632 */ 1633 static void multipath_status(struct dm_target *ti, status_type_t type, 1634 unsigned status_flags, char *result, unsigned maxlen) 1635 { 1636 int sz = 0; 1637 unsigned long flags; 1638 struct multipath *m = ti->private; 1639 struct priority_group *pg; 1640 struct pgpath *p; 1641 unsigned pg_num; 1642 char state; 1643 1644 spin_lock_irqsave(&m->lock, flags); 1645 1646 
/* Features */ 1647 if (type == STATUSTYPE_INFO) 1648 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), 1649 atomic_read(&m->pg_init_count)); 1650 else { 1651 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + 1652 (m->pg_init_retries > 0) * 2 + 1653 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + 1654 test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) + 1655 (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2); 1656 1657 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1658 DMEMIT("queue_if_no_path "); 1659 if (m->pg_init_retries) 1660 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1661 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) 1662 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); 1663 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) 1664 DMEMIT("retain_attached_hw_handler "); 1665 if (m->queue_mode != DM_TYPE_REQUEST_BASED) { 1666 switch(m->queue_mode) { 1667 case DM_TYPE_BIO_BASED: 1668 DMEMIT("queue_mode bio "); 1669 break; 1670 case DM_TYPE_MQ_REQUEST_BASED: 1671 DMEMIT("queue_mode mq "); 1672 break; 1673 default: 1674 WARN_ON_ONCE(true); 1675 break; 1676 } 1677 } 1678 } 1679 1680 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1681 DMEMIT("0 "); 1682 else 1683 DMEMIT("1 %s ", m->hw_handler_name); 1684 1685 DMEMIT("%u ", m->nr_priority_groups); 1686 1687 if (m->next_pg) 1688 pg_num = m->next_pg->pg_num; 1689 else if (m->current_pg) 1690 pg_num = m->current_pg->pg_num; 1691 else 1692 pg_num = (m->nr_priority_groups ? 
1 : 0); 1693 1694 DMEMIT("%u ", pg_num); 1695 1696 switch (type) { 1697 case STATUSTYPE_INFO: 1698 list_for_each_entry(pg, &m->priority_groups, list) { 1699 if (pg->bypassed) 1700 state = 'D'; /* Disabled */ 1701 else if (pg == m->current_pg) 1702 state = 'A'; /* Currently Active */ 1703 else 1704 state = 'E'; /* Enabled */ 1705 1706 DMEMIT("%c ", state); 1707 1708 if (pg->ps.type->status) 1709 sz += pg->ps.type->status(&pg->ps, NULL, type, 1710 result + sz, 1711 maxlen - sz); 1712 else 1713 DMEMIT("0 "); 1714 1715 DMEMIT("%u %u ", pg->nr_pgpaths, 1716 pg->ps.type->info_args); 1717 1718 list_for_each_entry(p, &pg->pgpaths, list) { 1719 DMEMIT("%s %s %u ", p->path.dev->name, 1720 p->is_active ? "A" : "F", 1721 p->fail_count); 1722 if (pg->ps.type->status) 1723 sz += pg->ps.type->status(&pg->ps, 1724 &p->path, type, result + sz, 1725 maxlen - sz); 1726 } 1727 } 1728 break; 1729 1730 case STATUSTYPE_TABLE: 1731 list_for_each_entry(pg, &m->priority_groups, list) { 1732 DMEMIT("%s ", pg->ps.type->name); 1733 1734 if (pg->ps.type->status) 1735 sz += pg->ps.type->status(&pg->ps, NULL, type, 1736 result + sz, 1737 maxlen - sz); 1738 else 1739 DMEMIT("0 "); 1740 1741 DMEMIT("%u %u ", pg->nr_pgpaths, 1742 pg->ps.type->table_args); 1743 1744 list_for_each_entry(p, &pg->pgpaths, list) { 1745 DMEMIT("%s ", p->path.dev->name); 1746 if (pg->ps.type->status) 1747 sz += pg->ps.type->status(&pg->ps, 1748 &p->path, type, result + sz, 1749 maxlen - sz); 1750 } 1751 } 1752 break; 1753 } 1754 1755 spin_unlock_irqrestore(&m->lock, flags); 1756 } 1757 1758 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1759 { 1760 int r = -EINVAL; 1761 struct dm_dev *dev; 1762 struct multipath *m = ti->private; 1763 action_fn action; 1764 1765 mutex_lock(&m->work_mutex); 1766 1767 if (dm_suspended(ti)) { 1768 r = -EBUSY; 1769 goto out; 1770 } 1771 1772 if (argc == 1) { 1773 if (!strcasecmp(argv[0], "queue_if_no_path")) { 1774 r = queue_if_no_path(m, true, false); 1775 goto 
out; 1776 } else if (!strcasecmp(argv[0], "fail_if_no_path")) { 1777 r = queue_if_no_path(m, false, false); 1778 goto out; 1779 } 1780 } 1781 1782 if (argc != 2) { 1783 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc); 1784 goto out; 1785 } 1786 1787 if (!strcasecmp(argv[0], "disable_group")) { 1788 r = bypass_pg_num(m, argv[1], true); 1789 goto out; 1790 } else if (!strcasecmp(argv[0], "enable_group")) { 1791 r = bypass_pg_num(m, argv[1], false); 1792 goto out; 1793 } else if (!strcasecmp(argv[0], "switch_group")) { 1794 r = switch_pg_num(m, argv[1]); 1795 goto out; 1796 } else if (!strcasecmp(argv[0], "reinstate_path")) 1797 action = reinstate_path; 1798 else if (!strcasecmp(argv[0], "fail_path")) 1799 action = fail_path; 1800 else { 1801 DMWARN("Unrecognised multipath message received: %s", argv[0]); 1802 goto out; 1803 } 1804 1805 r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); 1806 if (r) { 1807 DMWARN("message: error getting device %s", 1808 argv[1]); 1809 goto out; 1810 } 1811 1812 r = action_dev(m, dev, action); 1813 1814 dm_put_device(ti, dev); 1815 1816 out: 1817 mutex_unlock(&m->work_mutex); 1818 return r; 1819 } 1820 1821 static int multipath_prepare_ioctl(struct dm_target *ti, 1822 struct block_device **bdev, fmode_t *mode) 1823 { 1824 struct multipath *m = ti->private; 1825 struct pgpath *current_pgpath; 1826 int r; 1827 1828 current_pgpath = READ_ONCE(m->current_pgpath); 1829 if (!current_pgpath) 1830 current_pgpath = choose_pgpath(m, 0); 1831 1832 if (current_pgpath) { 1833 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { 1834 *bdev = current_pgpath->path.dev->bdev; 1835 *mode = current_pgpath->path.dev->mode; 1836 r = 0; 1837 } else { 1838 /* pg_init has not started or completed */ 1839 r = -ENOTCONN; 1840 } 1841 } else { 1842 /* No path is available */ 1843 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1844 r = -ENOTCONN; 1845 else 1846 r = -EIO; 1847 } 1848 1849 if (r == -ENOTCONN) { 1850 if 
(!READ_ONCE(m->current_pg)) { 1851 /* Path status changed, redo selection */ 1852 (void) choose_pgpath(m, 0); 1853 } 1854 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 1855 pg_init_all_paths(m); 1856 dm_table_run_md_queue_async(m->ti->table); 1857 process_queued_io_list(m); 1858 } 1859 1860 /* 1861 * Only pass ioctls through if the device sizes match exactly. 1862 */ 1863 if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT) 1864 return 1; 1865 return r; 1866 } 1867 1868 static int multipath_iterate_devices(struct dm_target *ti, 1869 iterate_devices_callout_fn fn, void *data) 1870 { 1871 struct multipath *m = ti->private; 1872 struct priority_group *pg; 1873 struct pgpath *p; 1874 int ret = 0; 1875 1876 list_for_each_entry(pg, &m->priority_groups, list) { 1877 list_for_each_entry(p, &pg->pgpaths, list) { 1878 ret = fn(ti, p->path.dev, ti->begin, ti->len, data); 1879 if (ret) 1880 goto out; 1881 } 1882 } 1883 1884 out: 1885 return ret; 1886 } 1887 1888 static int pgpath_busy(struct pgpath *pgpath) 1889 { 1890 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1891 1892 return blk_lld_busy(q); 1893 } 1894 1895 /* 1896 * We return "busy", only when we can map I/Os but underlying devices 1897 * are busy (so even if we map I/Os now, the I/Os will wait on 1898 * the underlying queue). 1899 * In other words, if we want to kill I/Os or queue them inside us 1900 * due to map unavailability, we don't return "busy". Otherwise, 1901 * dm core won't give us the I/Os and we can't do what we want. 
1902 */ 1903 static int multipath_busy(struct dm_target *ti) 1904 { 1905 bool busy = false, has_active = false; 1906 struct multipath *m = ti->private; 1907 struct priority_group *pg, *next_pg; 1908 struct pgpath *pgpath; 1909 1910 /* pg_init in progress */ 1911 if (atomic_read(&m->pg_init_in_progress)) 1912 return true; 1913 1914 /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ 1915 if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1916 return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); 1917 1918 /* Guess which priority_group will be used at next mapping time */ 1919 pg = READ_ONCE(m->current_pg); 1920 next_pg = READ_ONCE(m->next_pg); 1921 if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) 1922 pg = next_pg; 1923 1924 if (!pg) { 1925 /* 1926 * We don't know which pg will be used at next mapping time. 1927 * We don't call choose_pgpath() here to avoid to trigger 1928 * pg_init just by busy checking. 1929 * So we don't know whether underlying devices we will be using 1930 * at next mapping time are busy or not. Just try mapping. 1931 */ 1932 return busy; 1933 } 1934 1935 /* 1936 * If there is one non-busy active path at least, the path selector 1937 * will be able to select it. So we consider such a pg as not busy. 1938 */ 1939 busy = true; 1940 list_for_each_entry(pgpath, &pg->pgpaths, list) { 1941 if (pgpath->is_active) { 1942 has_active = true; 1943 if (!pgpath_busy(pgpath)) { 1944 busy = false; 1945 break; 1946 } 1947 } 1948 } 1949 1950 if (!has_active) { 1951 /* 1952 * No active path in this pg, so this pg won't be used and 1953 * the current_pg will be changed at next mapping time. 1954 * We need to try mapping to determine it. 
1955 */ 1956 busy = false; 1957 } 1958 1959 return busy; 1960 } 1961 1962 /*----------------------------------------------------------------- 1963 * Module setup 1964 *---------------------------------------------------------------*/ 1965 static struct target_type multipath_target = { 1966 .name = "multipath", 1967 .version = {1, 12, 0}, 1968 .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, 1969 .module = THIS_MODULE, 1970 .ctr = multipath_ctr, 1971 .dtr = multipath_dtr, 1972 .clone_and_map_rq = multipath_clone_and_map, 1973 .release_clone_rq = multipath_release_clone, 1974 .rq_end_io = multipath_end_io, 1975 .map = multipath_map_bio, 1976 .end_io = multipath_end_io_bio, 1977 .presuspend = multipath_presuspend, 1978 .postsuspend = multipath_postsuspend, 1979 .resume = multipath_resume, 1980 .status = multipath_status, 1981 .message = multipath_message, 1982 .prepare_ioctl = multipath_prepare_ioctl, 1983 .iterate_devices = multipath_iterate_devices, 1984 .busy = multipath_busy, 1985 }; 1986 1987 static int __init dm_multipath_init(void) 1988 { 1989 int r; 1990 1991 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); 1992 if (!kmultipathd) { 1993 DMERR("failed to create workqueue kmpathd"); 1994 r = -ENOMEM; 1995 goto bad_alloc_kmultipathd; 1996 } 1997 1998 /* 1999 * A separate workqueue is used to handle the device handlers 2000 * to avoid overloading existing workqueue. Overloading the 2001 * old workqueue would also create a bottleneck in the 2002 * path of the storage hardware device activation. 
2003 */ 2004 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", 2005 WQ_MEM_RECLAIM); 2006 if (!kmpath_handlerd) { 2007 DMERR("failed to create workqueue kmpath_handlerd"); 2008 r = -ENOMEM; 2009 goto bad_alloc_kmpath_handlerd; 2010 } 2011 2012 r = dm_register_target(&multipath_target); 2013 if (r < 0) { 2014 DMERR("request-based register failed %d", r); 2015 r = -EINVAL; 2016 goto bad_register_target; 2017 } 2018 2019 return 0; 2020 2021 bad_register_target: 2022 destroy_workqueue(kmpath_handlerd); 2023 bad_alloc_kmpath_handlerd: 2024 destroy_workqueue(kmultipathd); 2025 bad_alloc_kmultipathd: 2026 return r; 2027 } 2028 2029 static void __exit dm_multipath_exit(void) 2030 { 2031 destroy_workqueue(kmpath_handlerd); 2032 destroy_workqueue(kmultipathd); 2033 2034 dm_unregister_target(&multipath_target); 2035 } 2036 2037 module_init(dm_multipath_init); 2038 module_exit(dm_multipath_exit); 2039 2040 MODULE_DESCRIPTION(DM_NAME " multipath target"); 2041 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); 2042 MODULE_LICENSE("GPL"); 2043