/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include "dm.h"
#include "dm-path-selector.h"
#include "dm-uevent.h"

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>

#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

/* Path properties */
struct pgpath {
	struct list_head list;

	struct priority_group *pg;	/* Owning PG */
	unsigned fail_count;		/* Cumulative failure count */

	struct dm_path path;
	struct delayed_work activate_path;

	bool is_active:1;		/* Path status */
};

#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)

/*
 * Paths are grouped into Priority Groups and numbered from 1 upwards.
 * Each has a path selector which controls which path gets used.
 */
struct priority_group {
	struct list_head list;

	struct multipath *m;		/* Owning multipath instance */
	struct path_selector ps;

	unsigned pg_num;		/* Reference number */
	unsigned nr_pgpaths;		/* Number of paths in PG */
	struct list_head pgpaths;

	bool bypassed:1;		/* Temporarily bypass this PG? */
};

/* Multipath context */
struct multipath {
	struct list_head list;
	struct dm_target *ti;

	const char *hw_handler_name;
	char *hw_handler_params;

	spinlock_t lock;

	unsigned nr_priority_groups;
	struct list_head priority_groups;

	wait_queue_head_t pg_init_wait;	/* Wait for pg_init completion */

	struct pgpath *current_pgpath;
	struct priority_group *current_pg;
	struct priority_group *next_pg;	/* Switch to this PG if set */

	unsigned long flags;		/* Multipath state flags */

	unsigned pg_init_retries;	/* Number of times to retry pg_init */
	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */

	atomic_t nr_valid_paths;	/* Total number of usable paths */
	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
	atomic_t pg_init_count;		/* Number of times pg_init called */

	/*
	 * We must use a mempool of dm_mpath_io structs so that we
	 * can resubmit bios on error.
	 */
	mempool_t *mpio_pool;

	struct mutex work_mutex;
	struct work_struct trigger_event;
};

/*
 * Context information attached to each bio we process.
 */
struct dm_mpath_io {
	struct pgpath *pgpath;
	size_t nr_bytes;
};

typedef int (*action_fn) (struct pgpath *pgpath);

static struct kmem_cache *_mpio_cache;

static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);

/*-----------------------------------------------
 * Multipath state flags.
 *-----------------------------------------------*/

#define MPATHF_QUEUE_IO 0			/* Must we queue all I/O? */
#define MPATHF_QUEUE_IF_NO_PATH 1		/* Queue I/O if last path fails? */
#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2		/* Saved state during suspension */
#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3	/* If there's already a hw_handler present, don't change it. */
#define MPATHF_PG_INIT_DISABLED 4		/* pg_init is not currently allowed */
#define MPATHF_PG_INIT_REQUIRED 5		/* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6		/* Delay pg_init retry? */

/*-----------------------------------------------
 * Allocation routines
 *-----------------------------------------------*/

static struct pgpath *alloc_pgpath(void)
{
	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);

	if (pgpath) {
		pgpath->is_active = true;
		INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
	}

	return pgpath;
}

static void free_pgpath(struct pgpath *pgpath)
{
	kfree(pgpath);
}

static struct priority_group *alloc_priority_group(void)
{
	struct priority_group *pg;

	pg = kzalloc(sizeof(*pg), GFP_KERNEL);

	if (pg)
		INIT_LIST_HEAD(&pg->pgpaths);

	return pg;
}

static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
	struct pgpath *pgpath, *tmp;

	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
		list_del(&pgpath->list);
		dm_put_device(ti, pgpath->path.dev);
		free_pgpath(pgpath);
	}
}

static void free_priority_group(struct priority_group *pg,
				struct dm_target *ti)
{
	struct path_selector *ps = &pg->ps;

	if (ps->type) {
		ps->type->destroy(ps);
		dm_put_path_selector(ps->type);
	}

	free_pgpaths(&pg->pgpaths, ti);
	kfree(pg);
}

static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
{
	struct multipath *m;

	m = kzalloc(sizeof(*m), GFP_KERNEL);
	if (m) {
		INIT_LIST_HEAD(&m->priority_groups);
		spin_lock_init(&m->lock);
		set_bit(MPATHF_QUEUE_IO, &m->flags);
		atomic_set(&m->nr_valid_paths, 0);
		atomic_set(&m->pg_init_in_progress, 0);
		atomic_set(&m->pg_init_count, 0);
		m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
		INIT_WORK(&m->trigger_event, trigger_event);
		init_waitqueue_head(&m->pg_init_wait);
		mutex_init(&m->work_mutex);

		m->mpio_pool = NULL;
		if (!use_blk_mq) {
			unsigned min_ios = dm_get_reserved_rq_based_ios();

			m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
			if (!m->mpio_pool) {
				kfree(m);
				return NULL;
			}
		}

		m->ti = ti;
		ti->private = m;
	}

	return m;
}

static void free_multipath(struct multipath *m)
{
	struct priority_group *pg, *tmp;

	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
		list_del(&pg->list);
		free_priority_group(pg, m->ti);
	}

	kfree(m->hw_handler_name);
	kfree(m->hw_handler_params);
	mempool_destroy(m->mpio_pool);
	kfree(m);
}

static struct dm_mpath_io *get_mpio(union map_info *info)
{
	return info->ptr;
}

static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
{
	struct dm_mpath_io *mpio;

	if (!m->mpio_pool) {
		/* Use blk-mq pdu memory requested via per_io_data_size */
		mpio = get_mpio(info);
		memset(mpio, 0, sizeof(*mpio));
		return mpio;
	}

	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
	if (!mpio)
		return NULL;

	memset(mpio, 0, sizeof(*mpio));
	info->ptr = mpio;

	return mpio;
}

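/*
 * Descriptive note (added comment, not in the original): only the old
 * .request_fn path allocates the per-request dm_mpath_io from the mempool;
 * for blk-mq the context lives in the request's per_io_data_size area and
 * needs no explicit free.
 */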
static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
{
	/* Only needed for non blk-mq (.request_fn) multipath */
	if (m->mpio_pool) {
		struct dm_mpath_io *mpio = info->ptr;

		info->ptr = NULL;
		mempool_free(mpio, m->mpio_pool);
	}
}

/*-----------------------------------------------
 * Path selection
 *-----------------------------------------------*/

static int __pg_init_all_paths(struct multipath *m)
{
	struct pgpath *pgpath;
	unsigned long pg_init_delay = 0;

	if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
		return 0;

	atomic_inc(&m->pg_init_count);
	clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);

	/* Check here to reset pg_init_required */
	if (!m->current_pg)
		return 0;

	if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
		pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
						 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
	list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
		/* Skip failed paths */
		if (!pgpath->is_active)
			continue;
		if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
				       pg_init_delay))
			atomic_inc(&m->pg_init_in_progress);
	}
	return atomic_read(&m->pg_init_in_progress);
}

static int pg_init_all_paths(struct multipath *m)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);
	r = __pg_init_all_paths(m);
	spin_unlock_irqrestore(&m->lock, flags);

	return r;
}

static void __switch_pg(struct multipath *m, struct priority_group *pg)
{
	m->current_pg = pg;

	/* Must we initialise the PG first, and queue I/O till it's ready? */
	if (m->hw_handler_name) {
		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
		set_bit(MPATHF_QUEUE_IO, &m->flags);
	} else {
		clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
		clear_bit(MPATHF_QUEUE_IO, &m->flags);
	}

	atomic_set(&m->pg_init_count, 0);
}

static struct pgpath *choose_path_in_pg(struct multipath *m,
					struct priority_group *pg,
					size_t nr_bytes)
{
	unsigned long flags;
	struct dm_path *path;
	struct pgpath *pgpath;

	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
	if (!path)
		return ERR_PTR(-ENXIO);

	pgpath = path_to_pgpath(path);

	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
		/* Only update current_pgpath if pg changed */
		spin_lock_irqsave(&m->lock, flags);
		m->current_pgpath = pgpath;
		__switch_pg(m, pg);
		spin_unlock_irqrestore(&m->lock, flags);
	}

	return pgpath;
}

static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
{
	unsigned long flags;
	struct priority_group *pg;
	struct pgpath *pgpath;
	bool bypassed = true;

	if (!atomic_read(&m->nr_valid_paths)) {
		clear_bit(MPATHF_QUEUE_IO, &m->flags);
		goto failed;
	}

	/* Were we instructed to switch PG? */
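	/*
	 * (Added note: next_pg is set by the "switch_group" message; it is
	 * read locklessly here and then re-checked and consumed under
	 * m->lock, so a racing switch cannot leave a stale pointer.)
	 */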
	if (lockless_dereference(m->next_pg)) {
		spin_lock_irqsave(&m->lock, flags);
		pg = m->next_pg;
		if (!pg) {
			spin_unlock_irqrestore(&m->lock, flags);
			goto check_current_pg;
		}
		m->next_pg = NULL;
		spin_unlock_irqrestore(&m->lock, flags);
		pgpath = choose_path_in_pg(m, pg, nr_bytes);
		if (!IS_ERR_OR_NULL(pgpath))
			return pgpath;
	}

	/* Don't change PG until it has no remaining paths */
check_current_pg:
	pg = lockless_dereference(m->current_pg);
	if (pg) {
		pgpath = choose_path_in_pg(m, pg, nr_bytes);
		if (!IS_ERR_OR_NULL(pgpath))
			return pgpath;
	}

	/*
	 * Loop through priority groups until we find a valid path.
	 * First time we skip PGs marked 'bypassed'.
	 * Second time we only try the ones we skipped, but set
	 * pg_init_delay_retry so we do not hammer controllers.
	 */
	do {
		list_for_each_entry(pg, &m->priority_groups, list) {
			if (pg->bypassed == bypassed)
				continue;
			pgpath = choose_path_in_pg(m, pg, nr_bytes);
			if (!IS_ERR_OR_NULL(pgpath)) {
				if (!bypassed)
					set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
				return pgpath;
			}
		}
	} while (bypassed--);

failed:
	spin_lock_irqsave(&m->lock, flags);
	m->current_pgpath = NULL;
	m->current_pg = NULL;
	spin_unlock_irqrestore(&m->lock, flags);

	return NULL;
}

/*
 * Check whether bios must be queued in the device-mapper core rather
 * than here in the target.
 *
 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
 * same value then we are not between multipath_presuspend()
 * and multipath_resume() calls and we have no need to check
 * for the DMF_NOFLUSH_SUSPENDING flag.
 */
static int must_push_back(struct multipath *m)
{
	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
		((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
		  test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
		 dm_noflush_suspending(m->ti)));
}

/*
 * Map cloned requests
 */
static int __multipath_map(struct dm_target *ti, struct request *clone,
			   union map_info *map_context,
			   struct request *rq, struct request **__clone)
{
	struct multipath *m = ti->private;
	int r = DM_MAPIO_REQUEUE;
	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
	struct pgpath *pgpath;
	struct block_device *bdev;
	struct dm_mpath_io *mpio;

	/* Do we need to select a new pgpath? */
	pgpath = lockless_dereference(m->current_pgpath);
	if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
		pgpath = choose_pgpath(m, nr_bytes);

	if (!pgpath) {
		if (!must_push_back(m))
			r = -EIO;	/* Failed */
		return r;
	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
		   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
		pg_init_all_paths(m);
		return r;
	}

	mpio = set_mpio(m, map_context);
	if (!mpio)
		/* ENOMEM, requeue */
		return r;

	mpio->pgpath = pgpath;
	mpio->nr_bytes = nr_bytes;

	bdev = pgpath->path.dev->bdev;

	if (clone) {
		/*
		 * Old request-based interface: allocated clone is passed in.
		 * Used by: .request_fn stacked on .request_fn path(s).
		 */
		clone->q = bdev_get_queue(bdev);
		clone->rq_disk = bdev->bd_disk;
		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	} else {
		/*
		 * blk-mq request-based interface; used by both:
		 * .request_fn stacked on blk-mq path(s) and
		 * blk-mq stacked on blk-mq path(s).
		 */
		*__clone = blk_mq_alloc_request(bdev_get_queue(bdev),
						rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
		if (IS_ERR(*__clone)) {
			/* ENOMEM, requeue */
			clear_request_fn_mpio(m, map_context);
			return r;
		}
		(*__clone)->bio = (*__clone)->biotail = NULL;
		(*__clone)->rq_disk = bdev->bd_disk;
		(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	}

	if (pgpath->pg->ps.type->start_io)
		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
					      &pgpath->path,
					      nr_bytes);
	return DM_MAPIO_REMAPPED;
}

static int multipath_map(struct dm_target *ti, struct request *clone,
			 union map_info *map_context)
{
	return __multipath_map(ti, clone, map_context, NULL, NULL);
}

static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
				   union map_info *map_context,
				   struct request **clone)
{
	return __multipath_map(ti, NULL, map_context, rq, clone);
}

static void multipath_release_clone(struct request *clone)
{
	blk_mq_free_request(clone);
}

/*
 * If we run out of usable paths, should we queue I/O or error it?
 */
static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
			    bool save_old_value)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	if (save_old_value) {
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
			set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
		else
			clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
	} else {
		if (queue_if_no_path)
			set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
		else
			clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
	}
	if (queue_if_no_path)
		set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
	else
		clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);

	spin_unlock_irqrestore(&m->lock, flags);

	if (!queue_if_no_path)
		dm_table_run_md_queue_async(m->ti->table);

	return 0;
}

/*
 * An event is triggered whenever a path is taken out of use.
 * Includes path failure and PG bypass.
 */
static void trigger_event(struct work_struct *work)
{
	struct multipath *m =
		container_of(work, struct multipath, trigger_event);

	dm_table_event(m->ti->table);
}

/*-----------------------------------------------------------------
 * Constructor/argument parsing:
 * <#multipath feature args> [<arg>]*
 * <#hw_handler args> [hw_handler [<arg>]*]
 * <#priority groups>
 * <initial priority group>
 *     [<selector> <#selector args> [<arg>]*
 *      <#paths> <#per-path selector args>
 *         [<path> [<arg>]* ]+ ]+
 *---------------------------------------------------------------*/
static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
			       struct dm_target *ti)
{
	int r;
	struct path_selector_type *pst;
	unsigned ps_argc;

	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of path selector args"},
	};

	pst = dm_get_path_selector(dm_shift_arg(as));
	if (!pst) {
		ti->error = "unknown path selector type";
		return -EINVAL;
	}

	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
	if (r) {
		dm_put_path_selector(pst);
		return -EINVAL;
	}

	r = pst->create(&pg->ps, ps_argc, as->argv);
	if (r) {
		dm_put_path_selector(pst);
		ti->error = "path selector constructor failed";
		return r;
	}

	pg->ps.type = pst;
	dm_consume_args(as, ps_argc);

	return 0;
}

static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
				 struct dm_target *ti)
{
	int r;
	struct pgpath *p;
	struct multipath *m = ti->private;
	struct request_queue *q = NULL;
	const char *attached_handler_name;

	/* we need at least a path arg */
	if (as->argc < 1) {
		ti->error = "no device given";
		return ERR_PTR(-EINVAL);
	}

	p = alloc_pgpath();
	if (!p)
		return ERR_PTR(-ENOMEM);

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &p->path.dev);
	if (r) {
		ti->error = "error getting device";
		goto bad;
	}

	if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name)
		q = bdev_get_queue(p->path.dev->bdev);

	if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
retain:
		attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
		if (attached_handler_name) {
			/*
			 * Reset hw_handler_name to match the attached handler
			 * and clear any hw_handler_params associated with the
			 * ignored handler.
			 *
			 * NB. This modifies the table line to show the actual
			 * handler instead of the original table passed in.
			 */
			kfree(m->hw_handler_name);
			m->hw_handler_name = attached_handler_name;

			kfree(m->hw_handler_params);
			m->hw_handler_params = NULL;
		}
	}

	if (m->hw_handler_name) {
		r = scsi_dh_attach(q, m->hw_handler_name);
		if (r == -EBUSY) {
			char b[BDEVNAME_SIZE];

			printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
			       bdevname(p->path.dev->bdev, b));
			goto retain;
		}
		if (r < 0) {
			ti->error = "error attaching hardware handler";
			dm_put_device(ti, p->path.dev);
			goto bad;
		}

		if (m->hw_handler_params) {
			r = scsi_dh_set_params(q, m->hw_handler_params);
			if (r < 0) {
				ti->error = "unable to set hardware "
						"handler parameters";
				dm_put_device(ti, p->path.dev);
				goto bad;
			}
		}
	}

	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
	if (r) {
		dm_put_device(ti, p->path.dev);
		goto bad;
	}

	return p;

 bad:
	free_pgpath(p);
	return ERR_PTR(r);
}

static struct priority_group *parse_priority_group(struct dm_arg_set *as,
						   struct multipath *m)
{
	static struct dm_arg _args[] = {
		{1, 1024, "invalid number of paths"},
		{0, 1024, "invalid number of selector args"}
	};

	int r;
	unsigned i, nr_selector_args, nr_args;
	struct priority_group *pg;
	struct dm_target *ti = m->ti;

	if (as->argc < 2) {
		as->argc = 0;
		ti->error = "not enough priority group arguments";
		return ERR_PTR(-EINVAL);
	}

	pg = alloc_priority_group();
	if (!pg) {
		ti->error = "couldn't allocate priority group";
		return ERR_PTR(-ENOMEM);
	}
	pg->m = m;

	r = parse_path_selector(as, pg, ti);
	if (r)
		goto bad;

	/*
	 * read the paths
	 */
	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
	if (r)
		goto bad;

	nr_args = 1 + nr_selector_args;
	for (i = 0; i < pg->nr_pgpaths; i++) {
		struct pgpath *pgpath;
		struct dm_arg_set path_args;

		if (as->argc < nr_args) {
			ti->error = "not enough path parameters";
			r = -EINVAL;
			goto bad;
		}

		path_args.argc = nr_args;
		path_args.argv = as->argv;

		pgpath = parse_path(&path_args, &pg->ps, ti);
		if (IS_ERR(pgpath)) {
			r = PTR_ERR(pgpath);
			goto bad;
		}

		pgpath->pg = pg;
		list_add_tail(&pgpath->list, &pg->pgpaths);
		dm_consume_args(as, nr_args);
	}

	return pg;

 bad:
	free_priority_group(pg, ti);
	return ERR_PTR(r);
}

static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
{
	unsigned hw_argc;
	int ret;
	struct dm_target *ti = m->ti;

	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of hardware handler args"},
	};

	if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
		return -EINVAL;

	if (!hw_argc)
		return 0;

	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);

	if (hw_argc > 1) {
		char *p;
		int i, j, len = 4;

		for (i = 0; i <= hw_argc - 2; i++)
			len += strlen(as->argv[i]) + 1;
		p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
		if (!p) {
			ti->error = "memory allocation failed";
			ret = -ENOMEM;
			goto fail;
		}
		j = sprintf(p, "%d", hw_argc - 1);
		for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
			j = sprintf(p, "%s", as->argv[i]);
	}
	dm_consume_args(as, hw_argc - 1);

	return 0;
fail:
	kfree(m->hw_handler_name);
	m->hw_handler_name = NULL;
	return ret;
}

static int parse_features(struct dm_arg_set *as, struct multipath *m)
{
	int r;
	unsigned argc;
	struct dm_target *ti = m->ti;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 6, "invalid number of feature args"},
		{1, 50, "pg_init_retries must be between 1 and 50"},
		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
	};

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	if (!argc)
		return 0;

	do {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "queue_if_no_path")) {
			r = queue_if_no_path(m, true, false);
			continue;
		}

		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
			set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
			continue;
		}

		if (!strcasecmp(arg_name, "pg_init_retries") &&
		    (argc >= 1)) {
			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
			argc--;
			continue;
		}

		if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
		    (argc >= 1)) {
			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
			argc--;
			continue;
		}

		ti->error = "Unrecognised multipath feature request";
		r = -EINVAL;
	} while (argc && !r);

	return r;
}

static int multipath_ctr(struct dm_target *ti, unsigned int argc,
			 char **argv)
{
	/* target arguments */
	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of priority groups"},
		{0, 1024, "invalid initial priority group number"},
	};

	int r;
	struct multipath *m;
	struct dm_arg_set as;
	unsigned pg_count = 0;
	unsigned next_pg_num;
	bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table));

	as.argc = argc;
	as.argv = argv;

	m = alloc_multipath(ti, use_blk_mq);
	if (!m) {
		ti->error = "can't allocate multipath";
		return -EINVAL;
	}

	r = parse_features(&as, m);
	if (r)
		goto bad;

	r = parse_hw_handler(&as, m);
	if (r)
		goto bad;

	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
	if (r)
		goto bad;

	if ((!m->nr_priority_groups && next_pg_num) ||
	    (m->nr_priority_groups && !next_pg_num)) {
		ti->error = "invalid initial priority group";
		r = -EINVAL;
		goto bad;
	}

	/* parse the priority groups */
	while (as.argc) {
		struct priority_group *pg;
		unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);

		pg = parse_priority_group(&as, m);
		if (IS_ERR(pg)) {
			r = PTR_ERR(pg);
			goto bad;
		}

		nr_valid_paths += pg->nr_pgpaths;
		atomic_set(&m->nr_valid_paths, nr_valid_paths);

		list_add_tail(&pg->list, &m->priority_groups);
		pg_count++;
		pg->pg_num = pg_count;
		if (!--next_pg_num)
			m->next_pg = pg;
	}

	if (pg_count != m->nr_priority_groups) {
		ti->error = "priority group count mismatch";
		r = -EINVAL;
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
	if (use_blk_mq)
		ti->per_io_data_size = sizeof(struct dm_mpath_io);

	return 0;

 bad:
	free_multipath(m);
	return r;
}

static void multipath_wait_for_pg_init_completion(struct multipath *m)
{
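	/*
	 * (Added note: open-coded wait loop; sleep uninterruptibly on
	 * pg_init_wait until pg_init_in_progress drops to zero.
	 * pg_init_done() issues the wake_up() when the last outstanding
	 * path activation finishes.)
	 */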
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&m->pg_init_wait, &wait);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&m->pg_init_in_progress))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&m->pg_init_wait, &wait);
}

static void flush_multipath_work(struct multipath *m)
{
	set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
	smp_mb__after_atomic();

	flush_workqueue(kmpath_handlerd);
	multipath_wait_for_pg_init_completion(m);
	flush_workqueue(kmultipathd);
	flush_work(&m->trigger_event);

	clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
	smp_mb__after_atomic();
}

static void multipath_dtr(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	flush_multipath_work(m);
	free_multipath(m);
}

/*
 * Take a path out of use.
 */
static int fail_path(struct pgpath *pgpath)
{
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	if (!pgpath->is_active)
		goto out;

	DMWARN("Failing path %s.", pgpath->path.dev->name);

	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
	pgpath->is_active = false;
	pgpath->fail_count++;

	atomic_dec(&m->nr_valid_paths);

	if (pgpath == m->current_pgpath)
		m->current_pgpath = NULL;

	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
		       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);

	return 0;
}

/*
 * Reinstate a previously-failed path
 */
static int reinstate_path(struct pgpath *pgpath)
{
	int r = 0, run_queue = 0;
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;
	unsigned nr_valid_paths;

	spin_lock_irqsave(&m->lock, flags);

	if (pgpath->is_active)
		goto out;

	DMWARN("Reinstating path %s.", pgpath->path.dev->name);

	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
	if (r)
		goto out;

	pgpath->is_active = true;

	nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
	if (nr_valid_paths == 1) {
		m->current_pgpath = NULL;
		run_queue = 1;
	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
			atomic_inc(&m->pg_init_in_progress);
	}

	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
		       pgpath->path.dev->name, nr_valid_paths);

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);
	if (run_queue)
		dm_table_run_md_queue_async(m->ti->table);

	return r;
}

/*
 * Fail or reinstate all paths that match the provided struct dm_dev.
 */
static int action_dev(struct multipath *m, struct dm_dev *dev,
		      action_fn action)
{
	int r = -EINVAL;
	struct pgpath *pgpath;
	struct priority_group *pg;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(pgpath, &pg->pgpaths, list) {
			if (pgpath->path.dev == dev)
				r = action(pgpath);
		}
	}

	return r;
}

/*
 * Temporarily try to avoid having to use the specified PG
 */
static void bypass_pg(struct multipath *m, struct priority_group *pg,
		      bool bypassed)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	pg->bypassed = bypassed;
	m->current_pgpath = NULL;
	m->current_pg = NULL;

	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
}

/*
 * Switch to using the specified PG from the next I/O that gets mapped
 */
static int switch_pg_num(struct multipath *m, const char *pgstr)
{
	struct priority_group *pg;
	unsigned pgnum;
	unsigned long flags;
	char dummy;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to switch_pg_num");
		return -EINVAL;
	}

	spin_lock_irqsave(&m->lock, flags);
	list_for_each_entry(pg, &m->priority_groups, list) {
		pg->bypassed = false;
		if (--pgnum)
			continue;

		m->current_pgpath = NULL;
		m->current_pg = NULL;
		m->next_pg = pg;
	}
	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
	return 0;
}

/*
 * Set/clear bypassed status of a PG.
 * PGs are numbered upwards from 1 in the order they were declared.
 */
static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
{
	struct priority_group *pg;
	unsigned pgnum;
	char dummy;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to bypass_pg");
		return -EINVAL;
	}

	list_for_each_entry(pg, &m->priority_groups, list) {
		if (!--pgnum)
			break;
	}

	bypass_pg(m, pg, bypassed);
	return 0;
}

/*
 * Should we retry pg_init immediately?
 */
static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
{
	unsigned long flags;
	bool limit_reached = false;

	spin_lock_irqsave(&m->lock, flags);

	if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
	    !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
	else
		limit_reached = true;

	spin_unlock_irqrestore(&m->lock, flags);

	return limit_reached;
}

static void pg_init_done(void *data, int errors)
{
	struct pgpath *pgpath = data;
	struct priority_group *pg = pgpath->pg;
	struct multipath *m = pg->m;
	unsigned long flags;
	bool delay_retry = false;

	/* device or driver problems */
	switch (errors) {
	case SCSI_DH_OK:
		break;
	case SCSI_DH_NOSYS:
		if (!m->hw_handler_name) {
			errors = 0;
			break;
		}
		DMERR("Could not failover the device: Handler scsi_dh_%s "
		      "Error %d.", m->hw_handler_name, errors);
		/*
		 * Fail path for now, so we do not ping pong
		 */
		fail_path(pgpath);
		break;
	case SCSI_DH_DEV_TEMP_BUSY:
		/*
		 * Probably doing something like FW upgrade on the
		 * controller so try the other pg.
		 */
		bypass_pg(m, pg, true);
		break;
	case SCSI_DH_RETRY:
		/* Wait before retrying. */
		delay_retry = 1;
	case SCSI_DH_IMM_RETRY:
	case SCSI_DH_RES_TEMP_UNAVAIL:
		if (pg_init_limit_reached(m, pgpath))
			fail_path(pgpath);
		errors = 0;
		break;
	case SCSI_DH_DEV_OFFLINED:
	default:
		/*
		 * We probably do not want to fail the path for a device
		 * error, but this is what the old dm did. In future
		 * patches we can do more advanced handling.
		 */
		fail_path(pgpath);
	}

	spin_lock_irqsave(&m->lock, flags);
	if (errors) {
		if (pgpath == m->current_pgpath) {
			DMERR("Could not failover device. Error %d.", errors);
			m->current_pgpath = NULL;
			m->current_pg = NULL;
		}
	} else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
		pg->bypassed = false;

	if (atomic_dec_return(&m->pg_init_in_progress) > 0)
		/* Activations of other paths are still on going */
		goto out;

	if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
		if (delay_retry)
			set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
		else
			clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);

		if (__pg_init_all_paths(m))
			goto out;
	}
	clear_bit(MPATHF_QUEUE_IO, &m->flags);

	/*
	 * Wake up any thread waiting to suspend.
	 */
	wake_up(&m->pg_init_wait);

out:
	spin_unlock_irqrestore(&m->lock, flags);
}

static void activate_path(struct work_struct *work)
{
	struct pgpath *pgpath =
		container_of(work, struct pgpath, activate_path.work);

	if (pgpath->is_active)
		scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
				 pg_init_done, pgpath);
	else
		pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}

static int noretry_error(int error)
{
	switch (error) {
	case -EOPNOTSUPP:
	case -EREMOTEIO:
	case -EILSEQ:
	case -ENODATA:
	case -ENOSPC:
		return 1;
	}

	/* Anything else could be a path failure, so should be retried */
	return 0;
}

/*
 * end_io handling
 */
static int do_end_io(struct multipath *m, struct request *clone,
		     int error, struct dm_mpath_io *mpio)
{
	/*
	 * We don't queue any clone request inside the multipath target
	 * during end I/O handling, since those clone requests don't have
	 * bio clones.  If we queue them inside the multipath target,
	 * we need to make bio clones, that requires memory allocation.
	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
	 *  don't have bio clones.)
	 * Instead of queueing the clone request here, we queue the original
	 * request into dm core, which will remake a clone request and
	 * clone bios for it and resubmit it later.
	 */
	int r = DM_ENDIO_REQUEUE;

	if (!error && !clone->errors)
		return 0;	/* I/O complete */

	if (noretry_error(error))
		return error;

	if (mpio->pgpath)
		fail_path(mpio->pgpath);

	if (!atomic_read(&m->nr_valid_paths)) {
		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
			if (!must_push_back(m))
				r = -EIO;
		} else {
			if (error == -EBADE)
				r = error;
		}
	}

	return r;
}

static int multipath_end_io(struct dm_target *ti, struct request *clone,
			    int error, union map_info *map_context)
{
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = get_mpio(map_context);
	struct pgpath *pgpath;
	struct path_selector *ps;
	int r;

	BUG_ON(!mpio);

	r = do_end_io(m, clone, error, mpio);
	pgpath = mpio->pgpath;
	if (pgpath) {
		ps = &pgpath->pg->ps;
		if (ps->type->end_io)
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
	}
	clear_request_fn_mpio(m, map_context);

	return r;
}

/*
 * Suspend can't complete until all the I/O is processed so if
 * the last path fails we must error any remaining I/O.
 * Note that if the freeze_bdev fails while suspending, the
 * queue_if_no_path state is lost - userspace should reset it.
 */
static void multipath_presuspend(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	queue_if_no_path(m, false, true);
}

static void multipath_postsuspend(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	mutex_lock(&m->work_mutex);
	flush_multipath_work(m);
	mutex_unlock(&m->work_mutex);
}

/*
 * Restore the queue_if_no_path setting.
 */
static void multipath_resume(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
		set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
	else
		clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
	smp_mb__after_atomic();
}

/*
 * Info output has the following format:
 * num_multipath_feature_args [multipath_feature_args]*
 * num_handler_status_args [handler_status_args]*
 * num_groups init_group_number
 *            [A|D|E num_ps_status_args [ps_status_args]*
 *             num_paths num_selector_args
 *             [path_dev A|F fail_count [selector_args]* ]+ ]+
 *
 * Table output has the following format (identical to the constructor string):
 * num_feature_args [features_args]*
 * num_handler_args hw_handler [hw_handler_args]*
 * num_groups init_group_number
 *     [priority selector-name num_ps_args [ps_args]*
 *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
 */
static void multipath_status(struct dm_target *ti, status_type_t type,
			     unsigned status_flags, char *result, unsigned maxlen)
{
	int sz = 0;
	unsigned long flags;
	struct multipath *m = ti->private;
	struct priority_group *pg;
	struct pgpath *p;
	unsigned pg_num;
	char state;

	spin_lock_irqsave(&m->lock, flags);

	/* Features */
	if (type == STATUSTYPE_INFO)
		DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
		       atomic_read(&m->pg_init_count));
	else {
		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
			      (m->pg_init_retries > 0) * 2 +
			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags));
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
			DMEMIT("queue_if_no_path ");
		if (m->pg_init_retries)
			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
		if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
			DMEMIT("retain_attached_hw_handler ");
	}

	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
		DMEMIT("0 ");
	else
		DMEMIT("1 %s ", m->hw_handler_name);

	DMEMIT("%u ", m->nr_priority_groups);

	if (m->next_pg)
		pg_num = m->next_pg->pg_num;
	else if (m->current_pg)
		pg_num = m->current_pg->pg_num;
	else
		pg_num = (m->nr_priority_groups ? 1 : 0);

	DMEMIT("%u ", pg_num);

	switch (type) {
	case STATUSTYPE_INFO:
		list_for_each_entry(pg, &m->priority_groups, list) {
			if (pg->bypassed)
				state = 'D';	/* Disabled */
			else if (pg == m->current_pg)
				state = 'A';	/* Currently Active */
			else
				state = 'E';	/* Enabled */

			DMEMIT("%c ", state);

			if (pg->ps.type->status)
				sz += pg->ps.type->status(&pg->ps, NULL, type,
							  result + sz,
							  maxlen - sz);
			else
				DMEMIT("0 ");

			DMEMIT("%u %u ", pg->nr_pgpaths,
			       pg->ps.type->info_args);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT("%s %s %u ", p->path.dev->name,
				       p->is_active ? "A" : "F",
				       p->fail_count);
"A" : "F", 1509 p->fail_count); 1510 if (pg->ps.type->status) 1511 sz += pg->ps.type->status(&pg->ps, 1512 &p->path, type, result + sz, 1513 maxlen - sz); 1514 } 1515 } 1516 break; 1517 1518 case STATUSTYPE_TABLE: 1519 list_for_each_entry(pg, &m->priority_groups, list) { 1520 DMEMIT("%s ", pg->ps.type->name); 1521 1522 if (pg->ps.type->status) 1523 sz += pg->ps.type->status(&pg->ps, NULL, type, 1524 result + sz, 1525 maxlen - sz); 1526 else 1527 DMEMIT("0 "); 1528 1529 DMEMIT("%u %u ", pg->nr_pgpaths, 1530 pg->ps.type->table_args); 1531 1532 list_for_each_entry(p, &pg->pgpaths, list) { 1533 DMEMIT("%s ", p->path.dev->name); 1534 if (pg->ps.type->status) 1535 sz += pg->ps.type->status(&pg->ps, 1536 &p->path, type, result + sz, 1537 maxlen - sz); 1538 } 1539 } 1540 break; 1541 } 1542 1543 spin_unlock_irqrestore(&m->lock, flags); 1544 } 1545 1546 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1547 { 1548 int r = -EINVAL; 1549 struct dm_dev *dev; 1550 struct multipath *m = ti->private; 1551 action_fn action; 1552 1553 mutex_lock(&m->work_mutex); 1554 1555 if (dm_suspended(ti)) { 1556 r = -EBUSY; 1557 goto out; 1558 } 1559 1560 if (argc == 1) { 1561 if (!strcasecmp(argv[0], "queue_if_no_path")) { 1562 r = queue_if_no_path(m, true, false); 1563 goto out; 1564 } else if (!strcasecmp(argv[0], "fail_if_no_path")) { 1565 r = queue_if_no_path(m, false, false); 1566 goto out; 1567 } 1568 } 1569 1570 if (argc != 2) { 1571 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc); 1572 goto out; 1573 } 1574 1575 if (!strcasecmp(argv[0], "disable_group")) { 1576 r = bypass_pg_num(m, argv[1], true); 1577 goto out; 1578 } else if (!strcasecmp(argv[0], "enable_group")) { 1579 r = bypass_pg_num(m, argv[1], false); 1580 goto out; 1581 } else if (!strcasecmp(argv[0], "switch_group")) { 1582 r = switch_pg_num(m, argv[1]); 1583 goto out; 1584 } else if (!strcasecmp(argv[0], "reinstate_path")) 1585 action = reinstate_path; 1586 else if (!strcasecmp(argv[0], "fail_path")) 1587 action = fail_path; 1588 else { 1589 DMWARN("Unrecognised multipath message received: %s", argv[0]); 1590 goto out; 1591 } 1592 1593 r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); 1594 if (r) { 1595 DMWARN("message: error getting device %s", 1596 argv[1]); 1597 goto out; 1598 } 1599 1600 r = action_dev(m, dev, action); 1601 1602 dm_put_device(ti, dev); 1603 1604 out: 1605 mutex_unlock(&m->work_mutex); 1606 return r; 1607 } 1608 1609 static int multipath_prepare_ioctl(struct dm_target *ti, 1610 struct block_device **bdev, fmode_t *mode) 1611 { 1612 struct multipath *m = ti->private; 1613 struct pgpath *current_pgpath; 1614 int r; 1615 1616 current_pgpath = lockless_dereference(m->current_pgpath); 1617 if (!current_pgpath) 1618 current_pgpath = choose_pgpath(m, 0); 1619 1620 if (current_pgpath) { 1621 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { 1622 *bdev = current_pgpath->path.dev->bdev; 1623 *mode = current_pgpath->path.dev->mode; 1624 r = 0; 1625 } else { 1626 /* pg_init has not started or completed */ 1627 r = -ENOTCONN; 1628 } 1629 } else { 1630 /* No path is available */ 1631 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 1632 r = -ENOTCONN; 1633 else 1634 r = -EIO; 1635 } 1636 1637 if (r == -ENOTCONN) { 1638 if (!lockless_dereference(m->current_pg)) { 1639 /* Path status changed, redo selection */ 1640 (void) choose_pgpath(m, 0); 1641 } 1642 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 1643 pg_init_all_paths(m); 1644 
		dm_table_run_md_queue_async(m->ti->table);
	}

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return r;
}

static int multipath_iterate_devices(struct dm_target *ti,
				     iterate_devices_callout_fn fn, void *data)
{
	struct multipath *m = ti->private;
	struct priority_group *pg;
	struct pgpath *p;
	int ret = 0;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(p, &pg->pgpaths, list) {
			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
			if (ret)
				goto out;
		}
	}

out:
	return ret;
}

static int pgpath_busy(struct pgpath *pgpath)
{
	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);

	return blk_lld_busy(q);
}

/*
 * We return "busy", only when we can map I/Os but underlying devices
 * are busy (so even if we map I/Os now, the I/Os will wait on
 * the underlying queue).
 * In other words, if we want to kill I/Os or queue them inside us
 * due to map unavailability, we don't return "busy".  Otherwise,
 * dm core won't give us the I/Os and we can't do what we want.
 */
static int multipath_busy(struct dm_target *ti)
{
	bool busy = false, has_active = false;
	struct multipath *m = ti->private;
	struct priority_group *pg, *next_pg;
	struct pgpath *pgpath;

	/* pg_init in progress or no paths available */
	if (atomic_read(&m->pg_init_in_progress) ||
	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
		return true;

	/* Guess which priority_group will be used at next mapping time */
	pg = lockless_dereference(m->current_pg);
	next_pg = lockless_dereference(m->next_pg);
	if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
		pg = next_pg;

	if (!pg) {
		/*
		 * We don't know which pg will be used at next mapping time.
		 * We don't call choose_pgpath() here to avoid to trigger
		 * pg_init just by busy checking.
		 * So we don't know whether underlying devices we will be using
		 * at next mapping time are busy or not. Just try mapping.
		 */
		return busy;
	}

	/*
	 * If there is one non-busy active path at least, the path selector
	 * will be able to select it. So we consider such a pg as not busy.
	 */
	busy = true;
	list_for_each_entry(pgpath, &pg->pgpaths, list) {
		if (pgpath->is_active) {
			has_active = true;
			if (!pgpath_busy(pgpath)) {
				busy = false;
				break;
			}
		}
	}

	if (!has_active) {
		/*
		 * No active path in this pg, so this pg won't be used and
		 * the current_pg will be changed at next mapping time.
		 * We need to try mapping to determine it.
		 */
		busy = false;
	}

	return busy;
}

/*-----------------------------------------------------------------
 * Module setup
 *---------------------------------------------------------------*/
static struct target_type multipath_target = {
	.name = "multipath",
	.version = {1, 11, 0},
	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
	.module = THIS_MODULE,
	.ctr = multipath_ctr,
	.dtr = multipath_dtr,
	.map_rq = multipath_map,
	.clone_and_map_rq = multipath_clone_and_map,
	.release_clone_rq = multipath_release_clone,
	.rq_end_io = multipath_end_io,
	.presuspend = multipath_presuspend,
	.postsuspend = multipath_postsuspend,
	.resume = multipath_resume,
	.status = multipath_status,
	.message = multipath_message,
	.prepare_ioctl = multipath_prepare_ioctl,
	.iterate_devices = multipath_iterate_devices,
	.busy = multipath_busy,
};

static int __init dm_multipath_init(void)
{
	int r;

	/* allocate a slab for the dm_ios */
	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
	if (!_mpio_cache)
		return -ENOMEM;

	r = dm_register_target(&multipath_target);
	if (r < 0) {
		DMERR("register failed %d", r);
		r = -EINVAL;
		goto bad_register_target;
	}

	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
	if (!kmultipathd) {
		DMERR("failed to create workqueue kmpathd");
		r = -ENOMEM;
		goto bad_alloc_kmultipathd;
	}

	/*
	 * A separate workqueue is used to handle the device handlers
	 * to avoid overloading existing workqueue. Overloading the
	 * old workqueue would also create a bottleneck in the
	 * path of the storage hardware device activation.
	 */
	kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
						  WQ_MEM_RECLAIM);
	if (!kmpath_handlerd) {
		DMERR("failed to create workqueue kmpath_handlerd");
		r = -ENOMEM;
		goto bad_alloc_kmpath_handlerd;
	}

	DMINFO("version %u.%u.%u loaded",
	       multipath_target.version[0], multipath_target.version[1],
	       multipath_target.version[2]);

	return 0;

bad_alloc_kmpath_handlerd:
	destroy_workqueue(kmultipathd);
bad_alloc_kmultipathd:
	dm_unregister_target(&multipath_target);
bad_register_target:
	kmem_cache_destroy(_mpio_cache);

	return r;
}

static void __exit dm_multipath_exit(void)
{
	destroy_workqueue(kmpath_handlerd);
	destroy_workqueue(kmultipathd);

	dm_unregister_target(&multipath_target);
	kmem_cache_destroy(_mpio_cache);
}

module_init(dm_multipath_init);
module_exit(dm_multipath_exit);

MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");