1 /* 2 * Copyright (C) 2006-2009 Red Hat, Inc. 3 * 4 * This file is released under the LGPL. 5 */ 6 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/dm-dirty-log.h> 10 #include <linux/device-mapper.h> 11 #include <linux/dm-log-userspace.h> 12 #include <linux/module.h> 13 14 #include "dm-log-userspace-transfer.h" 15 16 #define DM_LOG_USERSPACE_VSN "1.1.0" 17 18 struct flush_entry { 19 int type; 20 region_t region; 21 struct list_head list; 22 }; 23 24 /* 25 * This limit on the number of mark and clear request is, to a degree, 26 * arbitrary. However, there is some basis for the choice in the limits 27 * imposed on the size of data payload by dm-log-userspace-transfer.c: 28 * dm_consult_userspace(). 29 */ 30 #define MAX_FLUSH_GROUP_COUNT 32 31 32 struct log_c { 33 struct dm_target *ti; 34 uint32_t region_size; 35 region_t region_count; 36 uint64_t luid; 37 char uuid[DM_UUID_LEN]; 38 39 char *usr_argv_str; 40 uint32_t usr_argc; 41 42 /* 43 * in_sync_hint gets set when doing is_remote_recovering. It 44 * represents the first region that needs recovery. IOW, the 45 * first zero bit of sync_bits. This can be useful for to limit 46 * traffic for calls like is_remote_recovering and get_resync_work, 47 * but be take care in its use for anything else. 48 */ 49 uint64_t in_sync_hint; 50 51 /* 52 * Mark and clear requests are held until a flush is issued 53 * so that we can group, and thereby limit, the amount of 54 * network traffic between kernel and userspace. The 'flush_lock' 55 * is used to protect these lists. 56 */ 57 spinlock_t flush_lock; 58 struct list_head mark_list; 59 struct list_head clear_list; 60 }; 61 62 static mempool_t *flush_entry_pool; 63 64 static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) 65 { 66 return kmalloc(sizeof(struct flush_entry), gfp_mask); 67 } 68 69 static void flush_entry_free(void *element, void *pool_data) 70 { 71 kfree(element); 72 } 73 74 static int userspace_do_request(struct log_c *lc, const char *uuid, 75 int request_type, char *data, size_t data_size, 76 char *rdata, size_t *rdata_size) 77 { 78 int r; 79 80 /* 81 * If the server isn't there, -ESRCH is returned, 82 * and we must keep trying until the server is 83 * restored. 84 */ 85 retry: 86 r = dm_consult_userspace(uuid, lc->luid, request_type, data, 87 data_size, rdata, rdata_size); 88 89 if (r != -ESRCH) 90 return r; 91 92 DMERR(" Userspace log server not found."); 93 while (1) { 94 set_current_state(TASK_INTERRUPTIBLE); 95 schedule_timeout(2*HZ); 96 DMWARN("Attempting to contact userspace log server..."); 97 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, 98 lc->usr_argv_str, 99 strlen(lc->usr_argv_str) + 1, 100 NULL, NULL); 101 if (!r) 102 break; 103 } 104 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); 105 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, 106 0, NULL, NULL); 107 if (!r) 108 goto retry; 109 110 DMERR("Error trying to resume userspace log: %d", r); 111 112 return -ESRCH; 113 } 114 115 static int build_constructor_string(struct dm_target *ti, 116 unsigned argc, char **argv, 117 char **ctr_str) 118 { 119 int i, str_size; 120 char *str = NULL; 121 122 *ctr_str = NULL; 123 124 for (i = 0, str_size = 0; i < argc; i++) 125 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 126 127 str_size += 20; /* Max number of chars in a printed u64 number */ 128 129 str = kzalloc(str_size, GFP_KERNEL); 130 if (!str) { 131 DMWARN("Unable to allocate memory for constructor string"); 132 return -ENOMEM; 133 } 134 135 str_size = sprintf(str, "%llu", (unsigned long long)ti->len); 136 for (i = 0; i < argc; i++) 137 str_size += sprintf(str + str_size, " %s", argv[i]); 138 139 *ctr_str = str; 140 return str_size; 141 } 142 143 /* 144 * userspace_ctr 145 * 146 * argv contains: 147 * <UUID> <other args> 148 * Where 'other args' is the userspace implementation specific log 149 * arguments. An example might be: 150 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] 151 * 152 * So, this module will strip off the <UUID> for identification purposes 153 * when communicating with userspace about a log; but will pass on everything 154 * else. 155 */ 156 static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 157 unsigned argc, char **argv) 158 { 159 int r = 0; 160 int str_size; 161 char *ctr_str = NULL; 162 struct log_c *lc = NULL; 163 uint64_t rdata; 164 size_t rdata_size = sizeof(rdata); 165 166 if (argc < 3) { 167 DMWARN("Too few arguments to userspace dirty log"); 168 return -EINVAL; 169 } 170 171 lc = kmalloc(sizeof(*lc), GFP_KERNEL); 172 if (!lc) { 173 DMWARN("Unable to allocate userspace log context."); 174 return -ENOMEM; 175 } 176 177 /* The ptr value is sufficient for local unique id */ 178 lc->luid = (unsigned long)lc; 179 180 lc->ti = ti; 181 182 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { 183 DMWARN("UUID argument too long."); 184 kfree(lc); 185 return -EINVAL; 186 } 187 188 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 189 spin_lock_init(&lc->flush_lock); 190 INIT_LIST_HEAD(&lc->mark_list); 191 INIT_LIST_HEAD(&lc->clear_list); 192 193 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 194 if (str_size < 0) { 195 kfree(lc); 196 return str_size; 197 } 198 199 /* Send table string */ 200 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 201 ctr_str, str_size, NULL, NULL); 202 203 if (r < 0) { 204 if (r == -ESRCH) 205 DMERR("Userspace log server not found"); 206 else 207 DMERR("Userspace log server failed to create log"); 208 goto out; 209 } 210 211 /* Since the region size does not change, get it now */ 212 rdata_size = sizeof(rdata); 213 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, 214 NULL, 0, (char *)&rdata, &rdata_size); 215 216 if (r) { 217 DMERR("Failed to get region size of dirty log"); 218 goto out; 219 } 220 221 lc->region_size = (uint32_t)rdata; 222 lc->region_count = dm_sector_div_up(ti->len, lc->region_size); 223 224 out: 225 if (r) { 226 kfree(lc); 227 kfree(ctr_str); 228 } else { 229 lc->usr_argv_str = ctr_str; 230 lc->usr_argc = argc; 231 log->context = lc; 232 } 233 234 return r; 235 } 236 237 static void userspace_dtr(struct dm_dirty_log *log) 238 { 239 struct log_c *lc = log->context; 240 241 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 242 NULL, 0, 243 NULL, NULL); 244 245 kfree(lc->usr_argv_str); 246 kfree(lc); 247 248 return; 249 } 250 251 static int userspace_presuspend(struct dm_dirty_log *log) 252 { 253 int r; 254 struct log_c *lc = log->context; 255 256 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 257 NULL, 0, 258 NULL, NULL); 259 260 return r; 261 } 262 263 static int userspace_postsuspend(struct dm_dirty_log *log) 264 { 265 int r; 266 struct log_c *lc = log->context; 267 268 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 269 NULL, 0, 270 NULL, NULL); 271 272 return r; 273 } 274 275 static int userspace_resume(struct dm_dirty_log *log) 276 { 277 int r; 278 struct log_c *lc = log->context; 279 280 lc->in_sync_hint = 0; 281 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 282 NULL, 0, 283 NULL, NULL); 284 285 return r; 286 } 287 288 static uint32_t userspace_get_region_size(struct dm_dirty_log *log) 289 { 290 struct log_c *lc = log->context; 291 292 return lc->region_size; 293 } 294 295 /* 296 * userspace_is_clean 297 * 298 * Check whether a region is clean. If there is any sort of 299 * failure when consulting the server, we return not clean. 300 * 301 * Returns: 1 if clean, 0 otherwise 302 */ 303 static int userspace_is_clean(struct dm_dirty_log *log, region_t region) 304 { 305 int r; 306 uint64_t region64 = (uint64_t)region; 307 int64_t is_clean; 308 size_t rdata_size; 309 struct log_c *lc = log->context; 310 311 rdata_size = sizeof(is_clean); 312 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, 313 (char *)®ion64, sizeof(region64), 314 (char *)&is_clean, &rdata_size); 315 316 return (r) ? 0 : (int)is_clean; 317 } 318 319 /* 320 * userspace_in_sync 321 * 322 * Check if the region is in-sync. If there is any sort 323 * of failure when consulting the server, we assume that 324 * the region is not in sync. 325 * 326 * If 'can_block' is set, return immediately 327 * 328 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK 329 */ 330 static int userspace_in_sync(struct dm_dirty_log *log, region_t region, 331 int can_block) 332 { 333 int r; 334 uint64_t region64 = region; 335 int64_t in_sync; 336 size_t rdata_size; 337 struct log_c *lc = log->context; 338 339 /* 340 * We can never respond directly - even if in_sync_hint is 341 * set. This is because another machine could see a device 342 * failure and mark the region out-of-sync. If we don't go 343 * to userspace to ask, we might think the region is in-sync 344 * and allow a read to pick up data that is stale. (This is 345 * very unlikely if a device actually fails; but it is very 346 * likely if a connection to one device from one machine fails.) 347 * 348 * There still might be a problem if the mirror caches the region 349 * state as in-sync... but then this call would not be made. So, 350 * that is a mirror problem. 351 */ 352 if (!can_block) 353 return -EWOULDBLOCK; 354 355 rdata_size = sizeof(in_sync); 356 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, 357 (char *)®ion64, sizeof(region64), 358 (char *)&in_sync, &rdata_size); 359 return (r) ? 0 : (int)in_sync; 360 } 361 362 static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) 363 { 364 int r = 0; 365 struct flush_entry *fe; 366 367 list_for_each_entry(fe, flush_list, list) { 368 r = userspace_do_request(lc, lc->uuid, fe->type, 369 (char *)&fe->region, 370 sizeof(fe->region), 371 NULL, NULL); 372 if (r) 373 break; 374 } 375 376 return r; 377 } 378 379 static int flush_by_group(struct log_c *lc, struct list_head *flush_list) 380 { 381 int r = 0; 382 int count; 383 uint32_t type = 0; 384 struct flush_entry *fe, *tmp_fe; 385 LIST_HEAD(tmp_list); 386 uint64_t group[MAX_FLUSH_GROUP_COUNT]; 387 388 /* 389 * Group process the requests 390 */ 391 while (!list_empty(flush_list)) { 392 count = 0; 393 394 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { 395 group[count] = fe->region; 396 count++; 397 398 list_move(&fe->list, &tmp_list); 399 400 type = fe->type; 401 if (count >= MAX_FLUSH_GROUP_COUNT) 402 break; 403 } 404 405 r = userspace_do_request(lc, lc->uuid, type, 406 (char *)(group), 407 count * sizeof(uint64_t), 408 NULL, NULL); 409 if (r) { 410 /* Group send failed. Attempt one-by-one. */ 411 list_splice_init(&tmp_list, flush_list); 412 r = flush_one_by_one(lc, flush_list); 413 break; 414 } 415 } 416 417 /* 418 * Must collect flush_entrys that were successfully processed 419 * as a group so that they will be free'd by the caller. 420 */ 421 list_splice_init(&tmp_list, flush_list); 422 423 return r; 424 } 425 426 /* 427 * userspace_flush 428 * 429 * This function is ok to block. 430 * The flush happens in two stages. First, it sends all 431 * clear/mark requests that are on the list. Then it 432 * tells the server to commit them. This gives the 433 * server a chance to optimise the commit, instead of 434 * doing it for every request. 435 * 436 * Additionally, we could implement another thread that 437 * sends the requests up to the server - reducing the 438 * load on flush. Then the flush would have less in 439 * the list and be responsible for the finishing commit. 440 * 441 * Returns: 0 on success, < 0 on failure 442 */ 443 static int userspace_flush(struct dm_dirty_log *log) 444 { 445 int r = 0; 446 unsigned long flags; 447 struct log_c *lc = log->context; 448 LIST_HEAD(mark_list); 449 LIST_HEAD(clear_list); 450 struct flush_entry *fe, *tmp_fe; 451 452 spin_lock_irqsave(&lc->flush_lock, flags); 453 list_splice_init(&lc->mark_list, &mark_list); 454 list_splice_init(&lc->clear_list, &clear_list); 455 spin_unlock_irqrestore(&lc->flush_lock, flags); 456 457 if (list_empty(&mark_list) && list_empty(&clear_list)) 458 return 0; 459 460 r = flush_by_group(lc, &mark_list); 461 if (r) 462 goto fail; 463 464 r = flush_by_group(lc, &clear_list); 465 if (r) 466 goto fail; 467 468 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 469 NULL, 0, NULL, NULL); 470 471 fail: 472 /* 473 * We can safely remove these entries, even if failure. 474 * Calling code will receive an error and will know that 475 * the log facility has failed. 476 */ 477 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { 478 list_del(&fe->list); 479 mempool_free(fe, flush_entry_pool); 480 } 481 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { 482 list_del(&fe->list); 483 mempool_free(fe, flush_entry_pool); 484 } 485 486 if (r) 487 dm_table_event(lc->ti->table); 488 489 return r; 490 } 491 492 /* 493 * userspace_mark_region 494 * 495 * This function should avoid blocking unless absolutely required. 496 * (Memory allocation is valid for blocking.) 497 */ 498 static void userspace_mark_region(struct dm_dirty_log *log, region_t region) 499 { 500 unsigned long flags; 501 struct log_c *lc = log->context; 502 struct flush_entry *fe; 503 504 /* Wait for an allocation, but _never_ fail */ 505 fe = mempool_alloc(flush_entry_pool, GFP_NOIO); 506 BUG_ON(!fe); 507 508 spin_lock_irqsave(&lc->flush_lock, flags); 509 fe->type = DM_ULOG_MARK_REGION; 510 fe->region = region; 511 list_add(&fe->list, &lc->mark_list); 512 spin_unlock_irqrestore(&lc->flush_lock, flags); 513 514 return; 515 } 516 517 /* 518 * userspace_clear_region 519 * 520 * This function must not block. 521 * So, the alloc can't block. In the worst case, it is ok to 522 * fail. It would simply mean we can't clear the region. 523 * Does nothing to current sync context, but does mean 524 * the region will be re-sync'ed on a reload of the mirror 525 * even though it is in-sync. 526 */ 527 static void userspace_clear_region(struct dm_dirty_log *log, region_t region) 528 { 529 unsigned long flags; 530 struct log_c *lc = log->context; 531 struct flush_entry *fe; 532 533 /* 534 * If we fail to allocate, we skip the clearing of 535 * the region. This doesn't hurt us in any way, except 536 * to cause the region to be resync'ed when the 537 * device is activated next time. 538 */ 539 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); 540 if (!fe) { 541 DMERR("Failed to allocate memory to clear region."); 542 return; 543 } 544 545 spin_lock_irqsave(&lc->flush_lock, flags); 546 fe->type = DM_ULOG_CLEAR_REGION; 547 fe->region = region; 548 list_add(&fe->list, &lc->clear_list); 549 spin_unlock_irqrestore(&lc->flush_lock, flags); 550 551 return; 552 } 553 554 /* 555 * userspace_get_resync_work 556 * 557 * Get a region that needs recovery. It is valid to return 558 * an error for this function. 559 * 560 * Returns: 1 if region filled, 0 if no work, <0 on error 561 */ 562 static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) 563 { 564 int r; 565 size_t rdata_size; 566 struct log_c *lc = log->context; 567 struct { 568 int64_t i; /* 64-bit for mix arch compatibility */ 569 region_t r; 570 } pkg; 571 572 if (lc->in_sync_hint >= lc->region_count) 573 return 0; 574 575 rdata_size = sizeof(pkg); 576 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 577 NULL, 0, 578 (char *)&pkg, &rdata_size); 579 580 *region = pkg.r; 581 return (r) ? r : (int)pkg.i; 582 } 583 584 /* 585 * userspace_set_region_sync 586 * 587 * Set the sync status of a given region. This function 588 * must not fail. 589 */ 590 static void userspace_set_region_sync(struct dm_dirty_log *log, 591 region_t region, int in_sync) 592 { 593 int r; 594 struct log_c *lc = log->context; 595 struct { 596 region_t r; 597 int64_t i; 598 } pkg; 599 600 pkg.r = region; 601 pkg.i = (int64_t)in_sync; 602 603 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 604 (char *)&pkg, sizeof(pkg), 605 NULL, NULL); 606 607 /* 608 * It would be nice to be able to report failures. 609 * However, it is easy emough to detect and resolve. 610 */ 611 return; 612 } 613 614 /* 615 * userspace_get_sync_count 616 * 617 * If there is any sort of failure when consulting the server, 618 * we assume that the sync count is zero. 619 * 620 * Returns: sync count on success, 0 on failure 621 */ 622 static region_t userspace_get_sync_count(struct dm_dirty_log *log) 623 { 624 int r; 625 size_t rdata_size; 626 uint64_t sync_count; 627 struct log_c *lc = log->context; 628 629 rdata_size = sizeof(sync_count); 630 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 631 NULL, 0, 632 (char *)&sync_count, &rdata_size); 633 634 if (r) 635 return 0; 636 637 if (sync_count >= lc->region_count) 638 lc->in_sync_hint = lc->region_count; 639 640 return (region_t)sync_count; 641 } 642 643 /* 644 * userspace_status 645 * 646 * Returns: amount of space consumed 647 */ 648 static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, 649 char *result, unsigned maxlen) 650 { 651 int r = 0; 652 char *table_args; 653 size_t sz = (size_t)maxlen; 654 struct log_c *lc = log->context; 655 656 switch (status_type) { 657 case STATUSTYPE_INFO: 658 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 659 NULL, 0, 660 result, &sz); 661 662 if (r) { 663 sz = 0; 664 DMEMIT("%s 1 COM_FAILURE", log->type->name); 665 } 666 break; 667 case STATUSTYPE_TABLE: 668 sz = 0; 669 table_args = strchr(lc->usr_argv_str, ' '); 670 BUG_ON(!table_args); /* There will always be a ' ' */ 671 table_args++; 672 673 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 674 lc->uuid, table_args); 675 break; 676 } 677 return (r) ? 0 : (int)sz; 678 } 679 680 /* 681 * userspace_is_remote_recovering 682 * 683 * Returns: 1 if region recovering, 0 otherwise 684 */ 685 static int userspace_is_remote_recovering(struct dm_dirty_log *log, 686 region_t region) 687 { 688 int r; 689 uint64_t region64 = region; 690 struct log_c *lc = log->context; 691 static unsigned long long limit; 692 struct { 693 int64_t is_recovering; 694 uint64_t in_sync_hint; 695 } pkg; 696 size_t rdata_size = sizeof(pkg); 697 698 /* 699 * Once the mirror has been reported to be in-sync, 700 * it will never again ask for recovery work. So, 701 * we can safely say there is not a remote machine 702 * recovering if the device is in-sync. (in_sync_hint 703 * must be reset at resume time.) 704 */ 705 if (region < lc->in_sync_hint) 706 return 0; 707 else if (jiffies < limit) 708 return 1; 709 710 limit = jiffies + (HZ / 4); 711 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, 712 (char *)®ion64, sizeof(region64), 713 (char *)&pkg, &rdata_size); 714 if (r) 715 return 1; 716 717 lc->in_sync_hint = pkg.in_sync_hint; 718 719 return (int)pkg.is_recovering; 720 } 721 722 static struct dm_dirty_log_type _userspace_type = { 723 .name = "userspace", 724 .module = THIS_MODULE, 725 .ctr = userspace_ctr, 726 .dtr = userspace_dtr, 727 .presuspend = userspace_presuspend, 728 .postsuspend = userspace_postsuspend, 729 .resume = userspace_resume, 730 .get_region_size = userspace_get_region_size, 731 .is_clean = userspace_is_clean, 732 .in_sync = userspace_in_sync, 733 .flush = userspace_flush, 734 .mark_region = userspace_mark_region, 735 .clear_region = userspace_clear_region, 736 .get_resync_work = userspace_get_resync_work, 737 .set_region_sync = userspace_set_region_sync, 738 .get_sync_count = userspace_get_sync_count, 739 .status = userspace_status, 740 .is_remote_recovering = userspace_is_remote_recovering, 741 }; 742 743 static int __init userspace_dirty_log_init(void) 744 { 745 int r = 0; 746 747 flush_entry_pool = mempool_create(100, flush_entry_alloc, 748 flush_entry_free, NULL); 749 750 if (!flush_entry_pool) { 751 DMWARN("Unable to create flush_entry_pool: No memory."); 752 return -ENOMEM; 753 } 754 755 r = dm_ulog_tfr_init(); 756 if (r) { 757 DMWARN("Unable to initialize userspace log communications"); 758 mempool_destroy(flush_entry_pool); 759 return r; 760 } 761 762 r = dm_dirty_log_type_register(&_userspace_type); 763 if (r) { 764 DMWARN("Couldn't register userspace dirty log type"); 765 dm_ulog_tfr_exit(); 766 mempool_destroy(flush_entry_pool); 767 return r; 768 } 769 770 DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); 771 return 0; 772 } 773 774 static void __exit userspace_dirty_log_exit(void) 775 { 776 dm_dirty_log_type_unregister(&_userspace_type); 777 dm_ulog_tfr_exit(); 778 mempool_destroy(flush_entry_pool); 779 780 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); 781 return; 782 } 783 784 module_init(userspace_dirty_log_init); 785 module_exit(userspace_dirty_log_exit); 786 787 MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); 788 MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); 789 MODULE_LICENSE("GPL"); 790