/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include "dm.h"
#include "dm-path-selector.h"
#include "dm-uevent.h"

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>

#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

/* Path properties */
struct pgpath {
	struct list_head list;

	struct priority_group *pg;	/* Owning PG */
	unsigned is_active;		/* Path status */
	unsigned fail_count;		/* Cumulative failure count */

	struct dm_path path;
	struct delayed_work activate_path;
};

#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)

/*
 * Paths are grouped into Priority Groups and numbered from 1 upwards.
 * Each has a path selector which controls which path gets used.
 */
struct priority_group {
	struct list_head list;

	struct multipath *m;		/* Owning multipath instance */
	struct path_selector ps;

	unsigned pg_num;		/* Reference number */
	unsigned bypassed;		/* Temporarily bypass this PG? */

	unsigned nr_pgpaths;		/* Number of paths in PG */
	struct list_head pgpaths;
};

/* Multipath context */
struct multipath {
	struct list_head list;
	struct dm_target *ti;

	const char *hw_handler_name;
	char *hw_handler_params;

	spinlock_t lock;

	unsigned nr_priority_groups;
	struct list_head priority_groups;

	wait_queue_head_t pg_init_wait;	/* Wait for pg_init completion */

	unsigned pg_init_required;	/* pg_init needs calling? */
	unsigned pg_init_in_progress;	/* Only one pg_init allowed at once */
	unsigned pg_init_delay_retry;	/* Delay pg_init retry? */

	unsigned nr_valid_paths;	/* Total number of usable paths */
	struct pgpath *current_pgpath;
	struct priority_group *current_pg;
	struct priority_group *next_pg;	/* Switch to this PG if set */
	unsigned repeat_count;		/* I/Os left before calling PS again */

	unsigned queue_io:1;		/* Must we queue all I/O? */
	unsigned queue_if_no_path:1;	/* Queue I/O if last path fails? */
	unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
	unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
	unsigned pg_init_disabled:1;	/* pg_init is not currently allowed */

	unsigned pg_init_retries;	/* Number of times to retry pg_init */
	unsigned pg_init_count;		/* Number of times pg_init called */
	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */

	struct work_struct trigger_event;

	/*
	 * We must use a mempool of dm_mpath_io structs so that we
	 * can resubmit bios on error.
	 */
	mempool_t *mpio_pool;

	struct mutex work_mutex;
};

/*
 * Context information attached to each bio we process.
 */
struct dm_mpath_io {
	struct pgpath *pgpath;
	size_t nr_bytes;
};

typedef int (*action_fn) (struct pgpath *pgpath);

static struct kmem_cache *_mpio_cache;

static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
static int __pgpath_busy(struct pgpath *pgpath);


/*-----------------------------------------------
 * Allocation routines
 *-----------------------------------------------*/

static struct pgpath *alloc_pgpath(void)
{
	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);

	if (pgpath) {
		pgpath->is_active = 1;
		INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
	}

	return pgpath;
}

static void free_pgpath(struct pgpath *pgpath)
{
	kfree(pgpath);
}

static struct priority_group *alloc_priority_group(void)
{
	struct priority_group *pg;

	pg = kzalloc(sizeof(*pg), GFP_KERNEL);

	if (pg)
		INIT_LIST_HEAD(&pg->pgpaths);

	return pg;
}

static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
	struct pgpath *pgpath, *tmp;

	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
		list_del(&pgpath->list);
		dm_put_device(ti, pgpath->path.dev);
		free_pgpath(pgpath);
	}
}

static void free_priority_group(struct priority_group *pg,
				struct dm_target *ti)
{
	struct path_selector *ps = &pg->ps;

	if (ps->type) {
		ps->type->destroy(ps);
		dm_put_path_selector(ps->type);
	}

	free_pgpaths(&pg->pgpaths, ti);
	kfree(pg);
}

static struct multipath *alloc_multipath(struct dm_target *ti)
{
	struct multipath *m;
	unsigned min_ios = dm_get_reserved_rq_based_ios();

	m = kzalloc(sizeof(*m), GFP_KERNEL);
	if (m) {
		INIT_LIST_HEAD(&m->priority_groups);
		spin_lock_init(&m->lock);
		m->queue_io = 1;
		m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
		INIT_WORK(&m->trigger_event, trigger_event);
		init_waitqueue_head(&m->pg_init_wait);
		mutex_init(&m->work_mutex);
		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
		if (!m->mpio_pool) {
			kfree(m);
			return NULL;
		}
		m->ti = ti;
		ti->private = m;
	}

	return m;
}

static void free_multipath(struct multipath *m)
{
	struct priority_group *pg, *tmp;

	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
		list_del(&pg->list);
		free_priority_group(pg, m->ti);
	}

	kfree(m->hw_handler_name);
	kfree(m->hw_handler_params);
	mempool_destroy(m->mpio_pool);
	kfree(m);
}

static int set_mapinfo(struct multipath *m, union map_info *info)
{
	struct dm_mpath_io *mpio;

	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
	if (!mpio)
		return -ENOMEM;

	memset(mpio, 0, sizeof(*mpio));
	info->ptr = mpio;

	return 0;
}

static void clear_mapinfo(struct multipath *m, union map_info *info)
{
	struct dm_mpath_io *mpio = info->ptr;

	info->ptr = NULL;
	mempool_free(mpio, m->mpio_pool);
}

/*-----------------------------------------------
 * Path selection
 *-----------------------------------------------*/

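/*
 * Queue path activation (pg_init) work for every active path in the
 * current priority group.  Returns the number of activations queued,
 * or 0 if pg_init is already in progress, disabled, or there is no
 * current PG.  Called with m->lock held.
 */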
static int __pg_init_all_paths(struct multipath *m)
{
	struct pgpath *pgpath;
	unsigned long pg_init_delay = 0;

	if (m->pg_init_in_progress || m->pg_init_disabled)
		return 0;

	m->pg_init_count++;
	m->pg_init_required = 0;

	/* Check here to reset pg_init_required */
	if (!m->current_pg)
		return 0;

	if (m->pg_init_delay_retry)
		pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
						 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
	list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
		/* Skip failed paths */
		if (!pgpath->is_active)
			continue;
		if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
				       pg_init_delay))
			m->pg_init_in_progress++;
	}
	return m->pg_init_in_progress;
}

static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
{
	m->current_pg = pgpath->pg;

	/* Must we initialise the PG first, and queue I/O till it's ready? */
	if (m->hw_handler_name) {
		m->pg_init_required = 1;
		m->queue_io = 1;
	} else {
		m->pg_init_required = 0;
		m->queue_io = 0;
	}

	m->pg_init_count = 0;
}

static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
			       size_t nr_bytes)
{
	struct dm_path *path;

	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
	if (!path)
		return -ENXIO;

	m->current_pgpath = path_to_pgpath(path);

	if (m->current_pg != pg)
		__switch_pg(m, m->current_pgpath);

	return 0;
}

static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
{
	struct priority_group *pg;
	unsigned bypassed = 1;

	if (!m->nr_valid_paths) {
		m->queue_io = 0;
		goto failed;
	}

	/* Were we instructed to switch PG? */
	if (m->next_pg) {
		pg = m->next_pg;
		m->next_pg = NULL;
		if (!__choose_path_in_pg(m, pg, nr_bytes))
			return;
	}

	/* Don't change PG until it has no remaining paths */
	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
		return;

	/*
	 * Loop through priority groups until we find a valid path.
	 * First time we skip PGs marked 'bypassed'.
	 * Second time we only try the ones we skipped, but set
	 * pg_init_delay_retry so we do not hammer controllers.
	 */
	do {
		list_for_each_entry(pg, &m->priority_groups, list) {
			if (pg->bypassed == bypassed)
				continue;
			if (!__choose_path_in_pg(m, pg, nr_bytes)) {
				if (!bypassed)
					m->pg_init_delay_retry = 1;
				return;
			}
		}
	} while (bypassed--);

failed:
	m->current_pgpath = NULL;
	m->current_pg = NULL;
}

/*
 * Check whether bios must be queued in the device-mapper core rather
 * than here in the target.
 *
 * m->lock must be held on entry.
 *
 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
 * same value then we are not between multipath_presuspend()
 * and multipath_resume() calls and we have no need to check
 * for the DMF_NOFLUSH_SUSPENDING flag.
 */
static int __must_push_back(struct multipath *m)
{
	return (m->queue_if_no_path ||
		(m->queue_if_no_path != m->saved_queue_if_no_path &&
		 dm_noflush_suspending(m->ti)));
}

/*
 * Map cloned requests
 */
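/*
 * Note that __multipath_map() backs both request-based interfaces:
 * multipath_map() (.map_rq) is handed a clone already allocated by the
 * dm core, while multipath_clone_and_map() (.clone_and_map_rq, blk-mq)
 * passes the original request and expects the clone to be allocated
 * here and returned via __clone.
 */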
static int __multipath_map(struct dm_target *ti, struct request *clone,
			   union map_info *map_context,
			   struct request *rq, struct request **__clone)
{
	struct multipath *m = (struct multipath *) ti->private;
	int r = DM_MAPIO_REQUEUE;
	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
	struct pgpath *pgpath;
	struct block_device *bdev;
	struct dm_mpath_io *mpio;

	spin_lock_irq(&m->lock);

	/* Do we need to select a new pgpath? */
	if (!m->current_pgpath ||
	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
		__choose_pgpath(m, nr_bytes);

	pgpath = m->current_pgpath;

	if (!pgpath) {
		if (!__must_push_back(m))
			r = -EIO;	/* Failed */
		goto out_unlock;
	} else if (m->queue_io || m->pg_init_required) {
		__pg_init_all_paths(m);
		goto out_unlock;
	}

	if (set_mapinfo(m, map_context) < 0)
		/* ENOMEM, requeue */
		goto out_unlock;

	mpio = map_context->ptr;
	mpio->pgpath = pgpath;
	mpio->nr_bytes = nr_bytes;

	bdev = pgpath->path.dev->bdev;

	spin_unlock_irq(&m->lock);

	if (clone) {
		/* Old request-based interface: allocated clone is passed in */
		clone->q = bdev_get_queue(bdev);
		clone->rq_disk = bdev->bd_disk;
		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	} else {
		/* blk-mq request-based interface */
		*__clone = blk_get_request(bdev_get_queue(bdev),
					   rq_data_dir(rq), GFP_ATOMIC);
		if (IS_ERR(*__clone)) {
			/* ENOMEM, requeue */
			clear_mapinfo(m, map_context);
			return r;
		}
		(*__clone)->bio = (*__clone)->biotail = NULL;
		(*__clone)->rq_disk = bdev->bd_disk;
		(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	}

	if (pgpath->pg->ps.type->start_io)
		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
					      &pgpath->path,
					      nr_bytes);
	return DM_MAPIO_REMAPPED;

out_unlock:
	spin_unlock_irq(&m->lock);

	return r;
}

static int multipath_map(struct dm_target *ti, struct request *clone,
			 union map_info *map_context)
{
	return __multipath_map(ti, clone, map_context, NULL, NULL);
}

static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
				   union map_info *map_context,
				   struct request **clone)
{
	return __multipath_map(ti, NULL, map_context, rq, clone);
}

static void multipath_release_clone(struct request *clone)
{
	blk_put_request(clone);
}

/*
 * If we run out of usable paths, should we queue I/O or error it?
 */
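/*
 * Besides the "queue_if_no_path" feature arg, this also backs the
 * "queue_if_no_path" and "fail_if_no_path" messages.  save_old_value is
 * set by multipath_presuspend() so that multipath_resume() can restore
 * the administrator's setting after a suspend/resume cycle.
 */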
static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
			    unsigned save_old_value)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	if (save_old_value)
		m->saved_queue_if_no_path = m->queue_if_no_path;
	else
		m->saved_queue_if_no_path = queue_if_no_path;
	m->queue_if_no_path = queue_if_no_path;
	spin_unlock_irqrestore(&m->lock, flags);

	if (!queue_if_no_path)
		dm_table_run_md_queue_async(m->ti->table);

	return 0;
}

/*
 * An event is triggered whenever a path is taken out of use.
 * Includes path failure and PG bypass.
 */
static void trigger_event(struct work_struct *work)
{
	struct multipath *m =
		container_of(work, struct multipath, trigger_event);

	dm_table_event(m->ti->table);
}

/*-----------------------------------------------------------------
 * Constructor/argument parsing:
 * <#multipath feature args> [<arg>]*
 * <#hw_handler args> [hw_handler [<arg>]*]
 * <#priority groups>
 * <initial priority group>
 *     [<selector> <#selector args> [<arg>]*
 *      <#paths> <#per-path selector args>
 *         [<path> [<arg>]* ]+ ]+
 *---------------------------------------------------------------*/
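/*
 * Purely illustrative example of a target parameter string in the above
 * format (device numbers and repeat counts are made up): two priority
 * groups, no feature args, no hardware handler, round-robin selector
 * with one per-path selector arg (the repeat count):
 *
 *   0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000
 *           round-robin 0 2 1 8:48 1000 8:64 1000
 */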
static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
			       struct dm_target *ti)
{
	int r;
	struct path_selector_type *pst;
	unsigned ps_argc;

	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of path selector args"},
	};

	pst = dm_get_path_selector(dm_shift_arg(as));
	if (!pst) {
		ti->error = "unknown path selector type";
		return -EINVAL;
	}

	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
	if (r) {
		dm_put_path_selector(pst);
		return -EINVAL;
	}

	r = pst->create(&pg->ps, ps_argc, as->argv);
	if (r) {
		dm_put_path_selector(pst);
		ti->error = "path selector constructor failed";
		return r;
	}

	pg->ps.type = pst;
	dm_consume_args(as, ps_argc);

	return 0;
}

static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
				 struct dm_target *ti)
{
	int r;
	struct pgpath *p;
	struct multipath *m = ti->private;
	struct request_queue *q = NULL;
	const char *attached_handler_name;

	/* we need at least a path arg */
	if (as->argc < 1) {
		ti->error = "no device given";
		return ERR_PTR(-EINVAL);
	}

	p = alloc_pgpath();
	if (!p)
		return ERR_PTR(-ENOMEM);

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &p->path.dev);
	if (r) {
		ti->error = "error getting device";
		goto bad;
	}

	if (m->retain_attached_hw_handler || m->hw_handler_name)
		q = bdev_get_queue(p->path.dev->bdev);

	if (m->retain_attached_hw_handler) {
retain:
		attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
		if (attached_handler_name) {
			/*
			 * Reset hw_handler_name to match the attached handler
			 * and clear any hw_handler_params associated with the
			 * ignored handler.
			 *
			 * NB. This modifies the table line to show the actual
			 * handler instead of the original table passed in.
			 */
			kfree(m->hw_handler_name);
			m->hw_handler_name = attached_handler_name;

			kfree(m->hw_handler_params);
			m->hw_handler_params = NULL;
		}
	}

	if (m->hw_handler_name) {
		r = scsi_dh_attach(q, m->hw_handler_name);
		if (r == -EBUSY) {
			char b[BDEVNAME_SIZE];

			printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
			       bdevname(p->path.dev->bdev, b));
			goto retain;
		}
		if (r < 0) {
			ti->error = "error attaching hardware handler";
			dm_put_device(ti, p->path.dev);
			goto bad;
		}

		if (m->hw_handler_params) {
			r = scsi_dh_set_params(q, m->hw_handler_params);
			if (r < 0) {
				ti->error = "unable to set hardware "
							"handler parameters";
				dm_put_device(ti, p->path.dev);
				goto bad;
			}
		}
	}

	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
	if (r) {
		dm_put_device(ti, p->path.dev);
		goto bad;
	}

	return p;

 bad:
	free_pgpath(p);
	return ERR_PTR(r);
}

static struct priority_group *parse_priority_group(struct dm_arg_set *as,
						   struct multipath *m)
{
	static struct dm_arg _args[] = {
		{1, 1024, "invalid number of paths"},
		{0, 1024, "invalid number of selector args"}
	};

	int r;
	unsigned i, nr_selector_args, nr_args;
	struct priority_group *pg;
	struct dm_target *ti = m->ti;

	if (as->argc < 2) {
		as->argc = 0;
		ti->error = "not enough priority group arguments";
		return ERR_PTR(-EINVAL);
	}

	pg = alloc_priority_group();
	if (!pg) {
		ti->error = "couldn't allocate priority group";
		return ERR_PTR(-ENOMEM);
	}
	pg->m = m;

	r = parse_path_selector(as, pg, ti);
	if (r)
		goto bad;

	/*
	 * read the paths
	 */
	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
	if (r)
		goto bad;

	nr_args = 1 + nr_selector_args;
	for (i = 0; i < pg->nr_pgpaths; i++) {
		struct pgpath *pgpath;
		struct dm_arg_set path_args;

		if (as->argc < nr_args) {
			ti->error = "not enough path parameters";
			r = -EINVAL;
			goto bad;
		}

		path_args.argc = nr_args;
		path_args.argv = as->argv;

		pgpath = parse_path(&path_args, &pg->ps, ti);
		if (IS_ERR(pgpath)) {
			r = PTR_ERR(pgpath);
			goto bad;
		}

		pgpath->pg = pg;
		list_add_tail(&pgpath->list, &pg->pgpaths);
		dm_consume_args(as, nr_args);
	}

	return pg;

 bad:
	free_priority_group(pg, ti);
	return ERR_PTR(r);
}

static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
{
	unsigned hw_argc;
	int ret;
	struct dm_target *ti = m->ti;

	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of hardware handler args"},
	};

	if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
		return -EINVAL;

	if (!hw_argc)
		return 0;

	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);

	if (hw_argc > 1) {
		char *p;
		int i, j, len = 4;

		for (i = 0; i <= hw_argc - 2; i++)
			len += strlen(as->argv[i]) + 1;
		p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
		if (!p) {
			ti->error = "memory allocation failed";
			ret = -ENOMEM;
			goto fail;
		}
		j = sprintf(p, "%d", hw_argc - 1);
		for (i = 0, p += j + 1; i <= hw_argc - 2; i++, p += j + 1)
			j = sprintf(p, "%s", as->argv[i]);
	}
	dm_consume_args(as, hw_argc - 1);

	return 0;
fail:
	kfree(m->hw_handler_name);
	m->hw_handler_name = NULL;
	return ret;
}

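/*
 * Optional feature arguments recognised by parse_features() below:
 *
 *   queue_if_no_path            queue I/O when the last path fails
 *   retain_attached_hw_handler  keep an already-attached scsi_dh handler
 *   pg_init_retries <n>         retry pg_init up to <n> times (1-50)
 *   pg_init_delay_msecs <n>     delay before a pg_init retry (0-60000)
 */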
static int parse_features(struct dm_arg_set *as, struct multipath *m)
{
	int r;
	unsigned argc;
	struct dm_target *ti = m->ti;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 6, "invalid number of feature args"},
		{1, 50, "pg_init_retries must be between 1 and 50"},
		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
	};

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	if (!argc)
		return 0;

	do {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "queue_if_no_path")) {
			r = queue_if_no_path(m, 1, 0);
			continue;
		}

		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
			m->retain_attached_hw_handler = 1;
			continue;
		}

		if (!strcasecmp(arg_name, "pg_init_retries") &&
		    (argc >= 1)) {
			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
			argc--;
			continue;
		}

		if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
		    (argc >= 1)) {
			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
			argc--;
			continue;
		}

		ti->error = "Unrecognised multipath feature request";
		r = -EINVAL;
	} while (argc && !r);

	return r;
}

static int multipath_ctr(struct dm_target *ti, unsigned int argc,
			 char **argv)
{
	/* target arguments */
	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of priority groups"},
		{0, 1024, "invalid initial priority group number"},
	};

	int r;
	struct multipath *m;
	struct dm_arg_set as;
	unsigned pg_count = 0;
	unsigned next_pg_num;

	as.argc = argc;
	as.argv = argv;

	m = alloc_multipath(ti);
	if (!m) {
		ti->error = "can't allocate multipath";
		return -EINVAL;
	}

	r = parse_features(&as, m);
	if (r)
		goto bad;

	r = parse_hw_handler(&as, m);
	if (r)
		goto bad;

	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
	if (r)
		goto bad;

	if ((!m->nr_priority_groups && next_pg_num) ||
	    (m->nr_priority_groups && !next_pg_num)) {
		ti->error = "invalid initial priority group";
		r = -EINVAL;
		goto bad;
	}

	/* parse the priority groups */
	while (as.argc) {
		struct priority_group *pg;

		pg = parse_priority_group(&as, m);
		if (IS_ERR(pg)) {
			r = PTR_ERR(pg);
			goto bad;
		}

		m->nr_valid_paths += pg->nr_pgpaths;
		list_add_tail(&pg->list, &m->priority_groups);
		pg_count++;
		pg->pg_num = pg_count;
		if (!--next_pg_num)
			m->next_pg = pg;
	}

	if (pg_count != m->nr_priority_groups) {
		ti->error = "priority group count mismatch";
		r = -EINVAL;
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;

	return 0;

 bad:
	free_multipath(m);
	return r;
}

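/*
 * Block until any in-flight pg_init has completed, i.e. until
 * m->pg_init_in_progress drops back to zero.  The counter is
 * re-checked under m->lock each time we are woken on m->pg_init_wait.
 */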
static void multipath_wait_for_pg_init_completion(struct multipath *m)
{
	DECLARE_WAITQUEUE(wait, current);
	unsigned long flags;

	add_wait_queue(&m->pg_init_wait, &wait);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		spin_lock_irqsave(&m->lock, flags);
		if (!m->pg_init_in_progress) {
			spin_unlock_irqrestore(&m->lock, flags);
			break;
		}
		spin_unlock_irqrestore(&m->lock, flags);

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&m->pg_init_wait, &wait);
}

static void flush_multipath_work(struct multipath *m)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);
	m->pg_init_disabled = 1;
	spin_unlock_irqrestore(&m->lock, flags);

	flush_workqueue(kmpath_handlerd);
	multipath_wait_for_pg_init_completion(m);
	flush_workqueue(kmultipathd);
	flush_work(&m->trigger_event);

	spin_lock_irqsave(&m->lock, flags);
	m->pg_init_disabled = 0;
	spin_unlock_irqrestore(&m->lock, flags);
}

static void multipath_dtr(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	flush_multipath_work(m);
	free_multipath(m);
}

/*
 * Take a path out of use.
 */
static int fail_path(struct pgpath *pgpath)
{
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	if (!pgpath->is_active)
		goto out;

	DMWARN("Failing path %s.", pgpath->path.dev->name);

	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
	pgpath->is_active = 0;
	pgpath->fail_count++;

	m->nr_valid_paths--;

	if (pgpath == m->current_pgpath)
		m->current_pgpath = NULL;

	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
		       pgpath->path.dev->name, m->nr_valid_paths);

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);

	return 0;
}

/*
 * Reinstate a previously-failed path
 */
static int reinstate_path(struct pgpath *pgpath)
{
	int r = 0, run_queue = 0;
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	if (pgpath->is_active)
		goto out;

	if (!pgpath->pg->ps.type->reinstate_path) {
		DMWARN("Reinstate path not supported by path selector %s",
		       pgpath->pg->ps.type->name);
		r = -EINVAL;
		goto out;
	}

	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
	if (r)
		goto out;

	pgpath->is_active = 1;

	if (!m->nr_valid_paths++) {
		m->current_pgpath = NULL;
		run_queue = 1;
	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
			m->pg_init_in_progress++;
	}

	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
		       pgpath->path.dev->name, m->nr_valid_paths);

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);
	if (run_queue)
		dm_table_run_md_queue_async(m->ti->table);

	return r;
}

/*
 * Fail or reinstate all paths that match the provided struct dm_dev.
 */
static int action_dev(struct multipath *m, struct dm_dev *dev,
		      action_fn action)
{
	int r = -EINVAL;
	struct pgpath *pgpath;
	struct priority_group *pg;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(pgpath, &pg->pgpaths, list) {
			if (pgpath->path.dev == dev)
				r = action(pgpath);
		}
	}

	return r;
}

/*
 * Temporarily try to avoid having to use the specified PG
 */
static void bypass_pg(struct multipath *m, struct priority_group *pg,
		      int bypassed)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	pg->bypassed = bypassed;
	m->current_pgpath = NULL;
	m->current_pg = NULL;

	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
}

/*
 * Switch to using the specified PG from the next I/O that gets mapped
 */
static int switch_pg_num(struct multipath *m, const char *pgstr)
{
	struct priority_group *pg;
	unsigned pgnum;
	unsigned long flags;
	char dummy;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to switch_pg_num");
		return -EINVAL;
	}

	spin_lock_irqsave(&m->lock, flags);
	list_for_each_entry(pg, &m->priority_groups, list) {
		pg->bypassed = 0;
		if (--pgnum)
			continue;

		m->current_pgpath = NULL;
		m->current_pg = NULL;
		m->next_pg = pg;
	}
	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
	return 0;
}

/*
 * Set/clear bypassed status of a PG.
 * PGs are numbered upwards from 1 in the order they were declared.
 */
static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
{
	struct priority_group *pg;
	unsigned pgnum;
	char dummy;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to bypass_pg");
		return -EINVAL;
	}

	list_for_each_entry(pg, &m->priority_groups, list) {
		if (!--pgnum)
			break;
	}

	bypass_pg(m, pg, bypassed);
	return 0;
}

/*
 * Should we retry pg_init immediately?
 */
static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
{
	unsigned long flags;
	int limit_reached = 0;

	spin_lock_irqsave(&m->lock, flags);

	if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
		m->pg_init_required = 1;
	else
		limit_reached = 1;

	spin_unlock_irqrestore(&m->lock, flags);

	return limit_reached;
}

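/*
 * Completion callback handed to scsi_dh_activate() by activate_path().
 * 'errors' is a SCSI_DH_* code describing how path activation (pg_init)
 * went for this path.
 */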
static void pg_init_done(void *data, int errors)
{
	struct pgpath *pgpath = data;
	struct priority_group *pg = pgpath->pg;
	struct multipath *m = pg->m;
	unsigned long flags;
	unsigned delay_retry = 0;

	/* device or driver problems */
	switch (errors) {
	case SCSI_DH_OK:
		break;
	case SCSI_DH_NOSYS:
		if (!m->hw_handler_name) {
			errors = 0;
			break;
		}
		DMERR("Could not failover the device: Handler scsi_dh_%s "
		      "Error %d.", m->hw_handler_name, errors);
		/*
		 * Fail path for now, so we do not ping pong
		 */
		fail_path(pgpath);
		break;
	case SCSI_DH_DEV_TEMP_BUSY:
		/*
		 * Probably doing something like FW upgrade on the
		 * controller so try the other pg.
		 */
		bypass_pg(m, pg, 1);
		break;
	case SCSI_DH_RETRY:
		/* Wait before retrying. */
		delay_retry = 1;
	case SCSI_DH_IMM_RETRY:
	case SCSI_DH_RES_TEMP_UNAVAIL:
		if (pg_init_limit_reached(m, pgpath))
			fail_path(pgpath);
		errors = 0;
		break;
	default:
		/*
		 * We probably do not want to fail the path for a device
		 * error, but this is what the old dm did. In future
		 * patches we can do more advanced handling.
		 */
		fail_path(pgpath);
	}

	spin_lock_irqsave(&m->lock, flags);
	if (errors) {
		if (pgpath == m->current_pgpath) {
			DMERR("Could not failover device. Error %d.", errors);
			m->current_pgpath = NULL;
			m->current_pg = NULL;
		}
	} else if (!m->pg_init_required)
		pg->bypassed = 0;

	if (--m->pg_init_in_progress)
		/* Activations of other paths are still on going */
		goto out;

	if (m->pg_init_required) {
		m->pg_init_delay_retry = delay_retry;
		if (__pg_init_all_paths(m))
			goto out;
	}
	m->queue_io = 0;

	/*
	 * Wake up any thread waiting to suspend.
	 */
	wake_up(&m->pg_init_wait);

out:
	spin_unlock_irqrestore(&m->lock, flags);
}

static void activate_path(struct work_struct *work)
{
	struct pgpath *pgpath =
		container_of(work, struct pgpath, activate_path.work);

	if (pgpath->is_active)
		scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
				 pg_init_done, pgpath);
	else
		pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}

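/*
 * Errors that indicate a problem with the request itself rather than
 * with the path it travelled; retrying on another path will not help,
 * so these are passed straight back instead of triggering path failure.
 */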
static int noretry_error(int error)
{
	switch (error) {
	case -EOPNOTSUPP:
	case -EREMOTEIO:
	case -EILSEQ:
	case -ENODATA:
	case -ENOSPC:
		return 1;
	}

	/* Anything else could be a path failure, so should be retried */
	return 0;
}

/*
 * end_io handling
 */
static int do_end_io(struct multipath *m, struct request *clone,
		     int error, struct dm_mpath_io *mpio)
{
	/*
	 * We don't queue any clone request inside the multipath target
	 * during end I/O handling, since those clone requests don't have
	 * bio clones. If we queue them inside the multipath target,
	 * we need to make bio clones, that requires memory allocation.
	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
	 *  don't have bio clones.)
	 * Instead of queueing the clone request here, we queue the original
	 * request into dm core, which will remake a clone request and
	 * clone bios for it and resubmit it later.
	 */
	int r = DM_ENDIO_REQUEUE;
	unsigned long flags;

	if (!error && !clone->errors)
		return 0;	/* I/O complete */

	if (noretry_error(error))
		return error;

	if (mpio->pgpath)
		fail_path(mpio->pgpath);

	spin_lock_irqsave(&m->lock, flags);
	if (!m->nr_valid_paths) {
		if (!m->queue_if_no_path) {
			if (!__must_push_back(m))
				r = -EIO;
		} else {
			if (error == -EBADE)
				r = error;
		}
	}
	spin_unlock_irqrestore(&m->lock, flags);

	return r;
}

static int multipath_end_io(struct dm_target *ti, struct request *clone,
			    int error, union map_info *map_context)
{
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = map_context->ptr;
	struct pgpath *pgpath;
	struct path_selector *ps;
	int r;

	BUG_ON(!mpio);

	r = do_end_io(m, clone, error, mpio);
	pgpath = mpio->pgpath;
	if (pgpath) {
		ps = &pgpath->pg->ps;
		if (ps->type->end_io)
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
	}
	clear_mapinfo(m, map_context);

	return r;
}

/*
 * Suspend can't complete until all the I/O is processed so if
 * the last path fails we must error any remaining I/O.
 * Note that if the freeze_bdev fails while suspending, the
 * queue_if_no_path state is lost - userspace should reset it.
 */
static void multipath_presuspend(struct dm_target *ti)
{
	struct multipath *m = (struct multipath *) ti->private;

	queue_if_no_path(m, 0, 1);
}

static void multipath_postsuspend(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	mutex_lock(&m->work_mutex);
	flush_multipath_work(m);
	mutex_unlock(&m->work_mutex);
}

/*
 * Restore the queue_if_no_path setting.
 */
static void multipath_resume(struct dm_target *ti)
{
	struct multipath *m = (struct multipath *) ti->private;
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);
	m->queue_if_no_path = m->saved_queue_if_no_path;
	spin_unlock_irqrestore(&m->lock, flags);
}

/*
 * Info output has the following format:
 * num_multipath_feature_args [multipath_feature_args]*
 * num_handler_status_args [handler_status_args]*
 * num_groups init_group_number
 *            [A|D|E num_ps_status_args [ps_status_args]*
 *             num_paths num_selector_args
 *             [path_dev A|F fail_count [selector_args]* ]+ ]+
 *
 * Table output has the following format (identical to the constructor string):
 * num_feature_args [features_args]*
 * num_handler_args hw_handler [hw_handler_args]*
 * num_groups init_group_number
 *     [priority selector-name num_ps_args [ps_args]*
 *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
 */
static void multipath_status(struct dm_target *ti, status_type_t type,
			     unsigned status_flags, char *result, unsigned maxlen)
{
	int sz = 0;
	unsigned long flags;
	struct multipath *m = (struct multipath *) ti->private;
	struct priority_group *pg;
	struct pgpath *p;
	unsigned pg_num;
	char state;

	spin_lock_irqsave(&m->lock, flags);

	/* Features */
	if (type == STATUSTYPE_INFO)
		DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count);
	else {
		DMEMIT("%u ", m->queue_if_no_path +
			      (m->pg_init_retries > 0) * 2 +
			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
			      m->retain_attached_hw_handler);
		if (m->queue_if_no_path)
			DMEMIT("queue_if_no_path ");
		if (m->pg_init_retries)
			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
		if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
		if (m->retain_attached_hw_handler)
			DMEMIT("retain_attached_hw_handler ");
	}

	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
		DMEMIT("0 ");
	else
		DMEMIT("1 %s ", m->hw_handler_name);

	DMEMIT("%u ", m->nr_priority_groups);

	if (m->next_pg)
		pg_num = m->next_pg->pg_num;
	else if (m->current_pg)
		pg_num = m->current_pg->pg_num;
	else
		pg_num = (m->nr_priority_groups ? 1 : 0);

	DMEMIT("%u ", pg_num);

	switch (type) {
	case STATUSTYPE_INFO:
		list_for_each_entry(pg, &m->priority_groups, list) {
			if (pg->bypassed)
				state = 'D';	/* Disabled */
			else if (pg == m->current_pg)
				state = 'A';	/* Currently Active */
			else
				state = 'E';	/* Enabled */

			DMEMIT("%c ", state);

			if (pg->ps.type->status)
				sz += pg->ps.type->status(&pg->ps, NULL, type,
							  result + sz,
							  maxlen - sz);
			else
				DMEMIT("0 ");

			DMEMIT("%u %u ", pg->nr_pgpaths,
			       pg->ps.type->info_args);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT("%s %s %u ", p->path.dev->name,
				       p->is_active ? "A" : "F",
				       p->fail_count);
				if (pg->ps.type->status)
					sz += pg->ps.type->status(&pg->ps,
					      &p->path, type, result + sz,
					      maxlen - sz);
			}
		}
		break;

	case STATUSTYPE_TABLE:
		list_for_each_entry(pg, &m->priority_groups, list) {
			DMEMIT("%s ", pg->ps.type->name);

			if (pg->ps.type->status)
				sz += pg->ps.type->status(&pg->ps, NULL, type,
							  result + sz,
							  maxlen - sz);
			else
				DMEMIT("0 ");

			DMEMIT("%u %u ", pg->nr_pgpaths,
			       pg->ps.type->table_args);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT("%s ", p->path.dev->name);
				if (pg->ps.type->status)
					sz += pg->ps.type->status(&pg->ps,
					      &p->path, type, result + sz,
					      maxlen - sz);
			}
		}
		break;
	}

	spin_unlock_irqrestore(&m->lock, flags);
}

/*
 * Messages accepted below (typically delivered via "dmsetup message
 * <map name> 0 <msg>"; the map name and device numbers in the example
 * are illustrative only):
 *
 *   queue_if_no_path / fail_if_no_path
 *   disable_group <pg#> / enable_group <pg#> / switch_group <pg#>
 *   fail_path <path dev> / reinstate_path <path dev>
 *     e.g. dmsetup message mpatha 0 fail_path 8:32
 */
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct dm_dev *dev;
	struct multipath *m = (struct multipath *) ti->private;
	action_fn action;

	mutex_lock(&m->work_mutex);

	if (dm_suspended(ti)) {
		r = -EBUSY;
		goto out;
	}

	if (argc == 1) {
		if (!strcasecmp(argv[0], "queue_if_no_path")) {
			r = queue_if_no_path(m, 1, 0);
			goto out;
		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
			r = queue_if_no_path(m, 0, 0);
			goto out;
		}
	}

	if (argc != 2) {
		DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
		goto out;
	}

	if (!strcasecmp(argv[0], "disable_group")) {
		r = bypass_pg_num(m, argv[1], 1);
		goto out;
	} else if (!strcasecmp(argv[0], "enable_group")) {
		r = bypass_pg_num(m, argv[1], 0);
		goto out;
	} else if (!strcasecmp(argv[0], "switch_group")) {
		r = switch_pg_num(m, argv[1]);
		goto out;
	} else if (!strcasecmp(argv[0], "reinstate_path"))
		action = reinstate_path;
	else if (!strcasecmp(argv[0], "fail_path"))
		action = fail_path;
	else {
		DMWARN("Unrecognised multipath message received: %s", argv[0]);
		goto out;
	}

	r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
	if (r) {
		DMWARN("message: error getting device %s",
		       argv[1]);
		goto out;
	}

	r = action_dev(m, dev, action);

	dm_put_device(ti, dev);

out:
	mutex_unlock(&m->work_mutex);
	return r;
}

/*
 * Pass ioctls through to the block device of the current path, choosing
 * a path first if necessary.  While paths are still being initialised,
 * or I/O is being queued because no path is usable, return -ENOTCONN
 * after nudging pg_init and the request queue along.
 */
static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
			   unsigned long arg)
{
	struct multipath *m = ti->private;
	struct pgpath *pgpath;
	struct block_device *bdev;
	fmode_t mode;
	unsigned long flags;
	int r;

	bdev = NULL;
	mode = 0;
	r = 0;

	spin_lock_irqsave(&m->lock, flags);

	if (!m->current_pgpath)
		__choose_pgpath(m, 0);

	pgpath = m->current_pgpath;

	if (pgpath) {
		bdev = pgpath->path.dev->bdev;
		mode = pgpath->path.dev->mode;
	}

	if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
		r = -ENOTCONN;
	else if (!bdev)
		r = -EIO;

	spin_unlock_irqrestore(&m->lock, flags);

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
		int err = scsi_verify_blk_ioctl(NULL, cmd);
		if (err)
			r = err;
	}

	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
		spin_lock_irqsave(&m->lock, flags);
		if (!m->current_pg) {
			/* Path status changed, redo selection */
			__choose_pgpath(m, 0);
		}
		if (m->pg_init_required)
			__pg_init_all_paths(m);
		spin_unlock_irqrestore(&m->lock, flags);
		dm_table_run_md_queue_async(m->ti->table);
	}

	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}

static int multipath_iterate_devices(struct dm_target *ti,
				     iterate_devices_callout_fn fn, void *data)
{
	struct multipath *m = ti->private;
	struct priority_group *pg;
	struct pgpath *p;
	int ret = 0;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(p, &pg->pgpaths, list) {
			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
			if (ret)
				goto out;
		}
	}

out:
	return ret;
}

static int __pgpath_busy(struct pgpath *pgpath)
{
	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);

	return blk_lld_busy(q);
}

/*
 * We return "busy", only when we can map I/Os but underlying devices
 * are busy (so even if we map I/Os now, the I/Os will wait on
 * the underlying queue).
 * In other words, if we want to kill I/Os or queue them inside us
 * due to map unavailability, we don't return "busy". Otherwise,
 * dm core won't give us the I/Os and we can't do what we want.
 */
static int multipath_busy(struct dm_target *ti)
{
	int busy = 0, has_active = 0;
	struct multipath *m = ti->private;
	struct priority_group *pg;
	struct pgpath *pgpath;
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	/* pg_init in progress or no paths available */
	if (m->pg_init_in_progress ||
	    (!m->nr_valid_paths && m->queue_if_no_path)) {
		busy = 1;
		goto out;
	}
	/* Guess which priority_group will be used at next mapping time */
	if (unlikely(!m->current_pgpath && m->next_pg))
		pg = m->next_pg;
	else if (likely(m->current_pg))
		pg = m->current_pg;
	else
		/*
		 * We don't know which pg will be used at next mapping time.
		 * We don't call __choose_pgpath() here to avoid to trigger
		 * pg_init just by busy checking.
		 * So we don't know whether underlying devices we will be using
		 * at next mapping time are busy or not. Just try mapping.
		 */
		goto out;

	/*
	 * If there is one non-busy active path at least, the path selector
	 * will be able to select it. So we consider such a pg as not busy.
	 */
	busy = 1;
	list_for_each_entry(pgpath, &pg->pgpaths, list)
		if (pgpath->is_active) {
			has_active = 1;

			if (!__pgpath_busy(pgpath)) {
				busy = 0;
				break;
			}
		}

	if (!has_active)
		/*
		 * No active path in this pg, so this pg won't be used and
		 * the current_pg will be changed at next mapping time.
		 * We need to try mapping to determine it.
		 */
		busy = 0;

out:
	spin_unlock_irqrestore(&m->lock, flags);

	return busy;
}

/*-----------------------------------------------------------------
 * Module setup
 *---------------------------------------------------------------*/
static struct target_type multipath_target = {
	.name = "multipath",
	.version = {1, 9, 0},
	.module = THIS_MODULE,
	.ctr = multipath_ctr,
	.dtr = multipath_dtr,
	.map_rq = multipath_map,
	.clone_and_map_rq = multipath_clone_and_map,
	.release_clone_rq = multipath_release_clone,
	.rq_end_io = multipath_end_io,
	.presuspend = multipath_presuspend,
	.postsuspend = multipath_postsuspend,
	.resume = multipath_resume,
	.status = multipath_status,
	.message = multipath_message,
	.ioctl = multipath_ioctl,
	.iterate_devices = multipath_iterate_devices,
	.busy = multipath_busy,
};

static int __init dm_multipath_init(void)
{
	int r;

	/* allocate a slab for the dm_ios */
	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
	if (!_mpio_cache)
		return -ENOMEM;

	r = dm_register_target(&multipath_target);
	if (r < 0) {
		DMERR("register failed %d", r);
		r = -EINVAL;
		goto bad_register_target;
	}

	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
	if (!kmultipathd) {
		DMERR("failed to create workqueue kmpathd");
		r = -ENOMEM;
		goto bad_alloc_kmultipathd;
	}

	/*
	 * A separate workqueue is used to handle the device handlers
	 * to avoid overloading existing workqueue. Overloading the
	 * old workqueue would also create a bottleneck in the
	 * path of the storage hardware device activation.
	 */
	kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
						  WQ_MEM_RECLAIM);
	if (!kmpath_handlerd) {
		DMERR("failed to create workqueue kmpath_handlerd");
		r = -ENOMEM;
		goto bad_alloc_kmpath_handlerd;
	}

	DMINFO("version %u.%u.%u loaded",
	       multipath_target.version[0], multipath_target.version[1],
	       multipath_target.version[2]);

	return 0;

bad_alloc_kmpath_handlerd:
	destroy_workqueue(kmultipathd);
bad_alloc_kmultipathd:
	dm_unregister_target(&multipath_target);
bad_register_target:
	kmem_cache_destroy(_mpio_cache);

	return r;
}

static void __exit dm_multipath_exit(void)
{
	destroy_workqueue(kmpath_handlerd);
	destroy_workqueue(kmultipathd);

	dm_unregister_target(&multipath_target);
	kmem_cache_destroy(_mpio_cache);
}

module_init(dm_multipath_init);
module_exit(dm_multipath_exit);

MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");