1 /* 2 * Copyright (C) 2006-2009 Red Hat, Inc. 3 * 4 * This file is released under the LGPL. 5 */ 6 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/dm-dirty-log.h> 10 #include <linux/device-mapper.h> 11 #include <linux/dm-log-userspace.h> 12 13 #include "dm-log-userspace-transfer.h" 14 15 struct flush_entry { 16 int type; 17 region_t region; 18 struct list_head list; 19 }; 20 21 /* 22 * This limit on the number of mark and clear request is, to a degree, 23 * arbitrary. However, there is some basis for the choice in the limits 24 * imposed on the size of data payload by dm-log-userspace-transfer.c: 25 * dm_consult_userspace(). 26 */ 27 #define MAX_FLUSH_GROUP_COUNT 32 28 29 struct log_c { 30 struct dm_target *ti; 31 uint32_t region_size; 32 region_t region_count; 33 uint64_t luid; 34 char uuid[DM_UUID_LEN]; 35 36 char *usr_argv_str; 37 uint32_t usr_argc; 38 39 /* 40 * in_sync_hint gets set when doing is_remote_recovering. It 41 * represents the first region that needs recovery. IOW, the 42 * first zero bit of sync_bits. This can be useful for to limit 43 * traffic for calls like is_remote_recovering and get_resync_work, 44 * but be take care in its use for anything else. 45 */ 46 uint64_t in_sync_hint; 47 48 /* 49 * Mark and clear requests are held until a flush is issued 50 * so that we can group, and thereby limit, the amount of 51 * network traffic between kernel and userspace. The 'flush_lock' 52 * is used to protect these lists. 
53 */ 54 spinlock_t flush_lock; 55 struct list_head mark_list; 56 struct list_head clear_list; 57 }; 58 59 static mempool_t *flush_entry_pool; 60 61 static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) 62 { 63 return kmalloc(sizeof(struct flush_entry), gfp_mask); 64 } 65 66 static void flush_entry_free(void *element, void *pool_data) 67 { 68 kfree(element); 69 } 70 71 static int userspace_do_request(struct log_c *lc, const char *uuid, 72 int request_type, char *data, size_t data_size, 73 char *rdata, size_t *rdata_size) 74 { 75 int r; 76 77 /* 78 * If the server isn't there, -ESRCH is returned, 79 * and we must keep trying until the server is 80 * restored. 81 */ 82 retry: 83 r = dm_consult_userspace(uuid, lc->luid, request_type, data, 84 data_size, rdata, rdata_size); 85 86 if (r != -ESRCH) 87 return r; 88 89 DMERR(" Userspace log server not found."); 90 while (1) { 91 set_current_state(TASK_INTERRUPTIBLE); 92 schedule_timeout(2*HZ); 93 DMWARN("Attempting to contact userspace log server..."); 94 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, 95 lc->usr_argv_str, 96 strlen(lc->usr_argv_str) + 1, 97 NULL, NULL); 98 if (!r) 99 break; 100 } 101 DMINFO("Reconnected to userspace log server... 
DM_ULOG_CTR complete"); 102 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, 103 0, NULL, NULL); 104 if (!r) 105 goto retry; 106 107 DMERR("Error trying to resume userspace log: %d", r); 108 109 return -ESRCH; 110 } 111 112 static int build_constructor_string(struct dm_target *ti, 113 unsigned argc, char **argv, 114 char **ctr_str) 115 { 116 int i, str_size; 117 char *str = NULL; 118 119 *ctr_str = NULL; 120 121 for (i = 0, str_size = 0; i < argc; i++) 122 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 123 124 str_size += 20; /* Max number of chars in a printed u64 number */ 125 126 str = kzalloc(str_size, GFP_KERNEL); 127 if (!str) { 128 DMWARN("Unable to allocate memory for constructor string"); 129 return -ENOMEM; 130 } 131 132 str_size = sprintf(str, "%llu", (unsigned long long)ti->len); 133 for (i = 0; i < argc; i++) 134 str_size += sprintf(str + str_size, " %s", argv[i]); 135 136 *ctr_str = str; 137 return str_size; 138 } 139 140 /* 141 * userspace_ctr 142 * 143 * argv contains: 144 * <UUID> <other args> 145 * Where 'other args' is the userspace implementation specific log 146 * arguments. An example might be: 147 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] 148 * 149 * So, this module will strip off the <UUID> for identification purposes 150 * when communicating with userspace about a log; but will pass on everything 151 * else. 
152 */ 153 static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 154 unsigned argc, char **argv) 155 { 156 int r = 0; 157 int str_size; 158 char *ctr_str = NULL; 159 struct log_c *lc = NULL; 160 uint64_t rdata; 161 size_t rdata_size = sizeof(rdata); 162 163 if (argc < 3) { 164 DMWARN("Too few arguments to userspace dirty log"); 165 return -EINVAL; 166 } 167 168 lc = kmalloc(sizeof(*lc), GFP_KERNEL); 169 if (!lc) { 170 DMWARN("Unable to allocate userspace log context."); 171 return -ENOMEM; 172 } 173 174 /* The ptr value is sufficient for local unique id */ 175 lc->luid = (unsigned long)lc; 176 177 lc->ti = ti; 178 179 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { 180 DMWARN("UUID argument too long."); 181 kfree(lc); 182 return -EINVAL; 183 } 184 185 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 186 spin_lock_init(&lc->flush_lock); 187 INIT_LIST_HEAD(&lc->mark_list); 188 INIT_LIST_HEAD(&lc->clear_list); 189 190 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 191 if (str_size < 0) { 192 kfree(lc); 193 return str_size; 194 } 195 196 /* Send table string */ 197 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 198 ctr_str, str_size, NULL, NULL); 199 200 if (r < 0) { 201 if (r == -ESRCH) 202 DMERR("Userspace log server not found"); 203 else 204 DMERR("Userspace log server failed to create log"); 205 goto out; 206 } 207 208 /* Since the region size does not change, get it now */ 209 rdata_size = sizeof(rdata); 210 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, 211 NULL, 0, (char *)&rdata, &rdata_size); 212 213 if (r) { 214 DMERR("Failed to get region size of dirty log"); 215 goto out; 216 } 217 218 lc->region_size = (uint32_t)rdata; 219 lc->region_count = dm_sector_div_up(ti->len, lc->region_size); 220 221 out: 222 if (r) { 223 kfree(lc); 224 kfree(ctr_str); 225 } else { 226 lc->usr_argv_str = ctr_str; 227 lc->usr_argc = argc; 228 log->context = lc; 229 } 230 231 return r; 232 } 233 234 static void 
userspace_dtr(struct dm_dirty_log *log) 235 { 236 struct log_c *lc = log->context; 237 238 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 239 NULL, 0, 240 NULL, NULL); 241 242 kfree(lc->usr_argv_str); 243 kfree(lc); 244 245 return; 246 } 247 248 static int userspace_presuspend(struct dm_dirty_log *log) 249 { 250 int r; 251 struct log_c *lc = log->context; 252 253 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 254 NULL, 0, 255 NULL, NULL); 256 257 return r; 258 } 259 260 static int userspace_postsuspend(struct dm_dirty_log *log) 261 { 262 int r; 263 struct log_c *lc = log->context; 264 265 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 266 NULL, 0, 267 NULL, NULL); 268 269 return r; 270 } 271 272 static int userspace_resume(struct dm_dirty_log *log) 273 { 274 int r; 275 struct log_c *lc = log->context; 276 277 lc->in_sync_hint = 0; 278 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 279 NULL, 0, 280 NULL, NULL); 281 282 return r; 283 } 284 285 static uint32_t userspace_get_region_size(struct dm_dirty_log *log) 286 { 287 struct log_c *lc = log->context; 288 289 return lc->region_size; 290 } 291 292 /* 293 * userspace_is_clean 294 * 295 * Check whether a region is clean. If there is any sort of 296 * failure when consulting the server, we return not clean. 297 * 298 * Returns: 1 if clean, 0 otherwise 299 */ 300 static int userspace_is_clean(struct dm_dirty_log *log, region_t region) 301 { 302 int r; 303 uint64_t region64 = (uint64_t)region; 304 int64_t is_clean; 305 size_t rdata_size; 306 struct log_c *lc = log->context; 307 308 rdata_size = sizeof(is_clean); 309 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, 310 (char *)®ion64, sizeof(region64), 311 (char *)&is_clean, &rdata_size); 312 313 return (r) ? 0 : (int)is_clean; 314 } 315 316 /* 317 * userspace_in_sync 318 * 319 * Check if the region is in-sync. 
If there is any sort 320 * of failure when consulting the server, we assume that 321 * the region is not in sync. 322 * 323 * If 'can_block' is set, return immediately 324 * 325 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK 326 */ 327 static int userspace_in_sync(struct dm_dirty_log *log, region_t region, 328 int can_block) 329 { 330 int r; 331 uint64_t region64 = region; 332 int64_t in_sync; 333 size_t rdata_size; 334 struct log_c *lc = log->context; 335 336 /* 337 * We can never respond directly - even if in_sync_hint is 338 * set. This is because another machine could see a device 339 * failure and mark the region out-of-sync. If we don't go 340 * to userspace to ask, we might think the region is in-sync 341 * and allow a read to pick up data that is stale. (This is 342 * very unlikely if a device actually fails; but it is very 343 * likely if a connection to one device from one machine fails.) 344 * 345 * There still might be a problem if the mirror caches the region 346 * state as in-sync... but then this call would not be made. So, 347 * that is a mirror problem. 348 */ 349 if (!can_block) 350 return -EWOULDBLOCK; 351 352 rdata_size = sizeof(in_sync); 353 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, 354 (char *)®ion64, sizeof(region64), 355 (char *)&in_sync, &rdata_size); 356 return (r) ? 
0 : (int)in_sync; 357 } 358 359 static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) 360 { 361 int r = 0; 362 struct flush_entry *fe; 363 364 list_for_each_entry(fe, flush_list, list) { 365 r = userspace_do_request(lc, lc->uuid, fe->type, 366 (char *)&fe->region, 367 sizeof(fe->region), 368 NULL, NULL); 369 if (r) 370 break; 371 } 372 373 return r; 374 } 375 376 static int flush_by_group(struct log_c *lc, struct list_head *flush_list) 377 { 378 int r = 0; 379 int count; 380 uint32_t type = 0; 381 struct flush_entry *fe, *tmp_fe; 382 LIST_HEAD(tmp_list); 383 uint64_t group[MAX_FLUSH_GROUP_COUNT]; 384 385 /* 386 * Group process the requests 387 */ 388 while (!list_empty(flush_list)) { 389 count = 0; 390 391 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { 392 group[count] = fe->region; 393 count++; 394 395 list_del(&fe->list); 396 list_add(&fe->list, &tmp_list); 397 398 type = fe->type; 399 if (count >= MAX_FLUSH_GROUP_COUNT) 400 break; 401 } 402 403 r = userspace_do_request(lc, lc->uuid, type, 404 (char *)(group), 405 count * sizeof(uint64_t), 406 NULL, NULL); 407 if (r) { 408 /* Group send failed. Attempt one-by-one. */ 409 list_splice_init(&tmp_list, flush_list); 410 r = flush_one_by_one(lc, flush_list); 411 break; 412 } 413 } 414 415 /* 416 * Must collect flush_entrys that were successfully processed 417 * as a group so that they will be free'd by the caller. 418 */ 419 list_splice_init(&tmp_list, flush_list); 420 421 return r; 422 } 423 424 /* 425 * userspace_flush 426 * 427 * This function is ok to block. 428 * The flush happens in two stages. First, it sends all 429 * clear/mark requests that are on the list. Then it 430 * tells the server to commit them. This gives the 431 * server a chance to optimise the commit, instead of 432 * doing it for every request. 433 * 434 * Additionally, we could implement another thread that 435 * sends the requests up to the server - reducing the 436 * load on flush. 
Then the flush would have less in
 * the list and be responsible for the finishing commit.
 *
 * Returns: 0 on success, < 0 on failure
 */
static int userspace_flush(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;
	struct flush_entry *entry, *next;
	unsigned long irq_flags;
	int rc;
	LIST_HEAD(pending_marks);
	LIST_HEAD(pending_clears);

	/* Detach both queues under the lock; work on private copies. */
	spin_lock_irqsave(&lc->flush_lock, irq_flags);
	list_splice_init(&lc->mark_list, &pending_marks);
	list_splice_init(&lc->clear_list, &pending_clears);
	spin_unlock_irqrestore(&lc->flush_lock, irq_flags);

	/* Nothing queued: no request is sent to the server at all. */
	if (list_empty(&pending_marks) && list_empty(&pending_clears))
		return 0;

	/* Marks first, then clears, then the commit; stop at first error. */
	rc = flush_by_group(lc, &pending_marks);
	if (!rc)
		rc = flush_by_group(lc, &pending_clears);
	if (!rc)
		rc = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
					  NULL, 0, NULL, NULL);

	/*
	 * The entries are released whether or not the flush worked.
	 * On failure the caller gets the error code and thereby learns
	 * that the log facility has failed.
	 */
	list_for_each_entry_safe(entry, next, &pending_marks, list) {
		list_del(&entry->list);
		mempool_free(entry, flush_entry_pool);
	}
	list_for_each_entry_safe(entry, next, &pending_clears, list) {
		list_del(&entry->list);
		mempool_free(entry, flush_entry_pool);
	}

	/* Raise a table event so the failure is noticed. */
	if (rc)
		dm_table_event(lc->ti->table);

	return rc;
}

/*
 * userspace_mark_region
 *
 * This function should avoid blocking unless absolutely required.
 * (Memory allocation is valid for blocking.)
495 */ 496 static void userspace_mark_region(struct dm_dirty_log *log, region_t region) 497 { 498 unsigned long flags; 499 struct log_c *lc = log->context; 500 struct flush_entry *fe; 501 502 /* Wait for an allocation, but _never_ fail */ 503 fe = mempool_alloc(flush_entry_pool, GFP_NOIO); 504 BUG_ON(!fe); 505 506 spin_lock_irqsave(&lc->flush_lock, flags); 507 fe->type = DM_ULOG_MARK_REGION; 508 fe->region = region; 509 list_add(&fe->list, &lc->mark_list); 510 spin_unlock_irqrestore(&lc->flush_lock, flags); 511 512 return; 513 } 514 515 /* 516 * userspace_clear_region 517 * 518 * This function must not block. 519 * So, the alloc can't block. In the worst case, it is ok to 520 * fail. It would simply mean we can't clear the region. 521 * Does nothing to current sync context, but does mean 522 * the region will be re-sync'ed on a reload of the mirror 523 * even though it is in-sync. 524 */ 525 static void userspace_clear_region(struct dm_dirty_log *log, region_t region) 526 { 527 unsigned long flags; 528 struct log_c *lc = log->context; 529 struct flush_entry *fe; 530 531 /* 532 * If we fail to allocate, we skip the clearing of 533 * the region. This doesn't hurt us in any way, except 534 * to cause the region to be resync'ed when the 535 * device is activated next time. 536 */ 537 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); 538 if (!fe) { 539 DMERR("Failed to allocate memory to clear region."); 540 return; 541 } 542 543 spin_lock_irqsave(&lc->flush_lock, flags); 544 fe->type = DM_ULOG_CLEAR_REGION; 545 fe->region = region; 546 list_add(&fe->list, &lc->clear_list); 547 spin_unlock_irqrestore(&lc->flush_lock, flags); 548 549 return; 550 } 551 552 /* 553 * userspace_get_resync_work 554 * 555 * Get a region that needs recovery. It is valid to return 556 * an error for this function. 
557 * 558 * Returns: 1 if region filled, 0 if no work, <0 on error 559 */ 560 static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) 561 { 562 int r; 563 size_t rdata_size; 564 struct log_c *lc = log->context; 565 struct { 566 int64_t i; /* 64-bit for mix arch compatibility */ 567 region_t r; 568 } pkg; 569 570 if (lc->in_sync_hint >= lc->region_count) 571 return 0; 572 573 rdata_size = sizeof(pkg); 574 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 575 NULL, 0, 576 (char *)&pkg, &rdata_size); 577 578 *region = pkg.r; 579 return (r) ? r : (int)pkg.i; 580 } 581 582 /* 583 * userspace_set_region_sync 584 * 585 * Set the sync status of a given region. This function 586 * must not fail. 587 */ 588 static void userspace_set_region_sync(struct dm_dirty_log *log, 589 region_t region, int in_sync) 590 { 591 int r; 592 struct log_c *lc = log->context; 593 struct { 594 region_t r; 595 int64_t i; 596 } pkg; 597 598 pkg.r = region; 599 pkg.i = (int64_t)in_sync; 600 601 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 602 (char *)&pkg, sizeof(pkg), 603 NULL, NULL); 604 605 /* 606 * It would be nice to be able to report failures. 607 * However, it is easy emough to detect and resolve. 608 */ 609 return; 610 } 611 612 /* 613 * userspace_get_sync_count 614 * 615 * If there is any sort of failure when consulting the server, 616 * we assume that the sync count is zero. 
617 * 618 * Returns: sync count on success, 0 on failure 619 */ 620 static region_t userspace_get_sync_count(struct dm_dirty_log *log) 621 { 622 int r; 623 size_t rdata_size; 624 uint64_t sync_count; 625 struct log_c *lc = log->context; 626 627 rdata_size = sizeof(sync_count); 628 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 629 NULL, 0, 630 (char *)&sync_count, &rdata_size); 631 632 if (r) 633 return 0; 634 635 if (sync_count >= lc->region_count) 636 lc->in_sync_hint = lc->region_count; 637 638 return (region_t)sync_count; 639 } 640 641 /* 642 * userspace_status 643 * 644 * Returns: amount of space consumed 645 */ 646 static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, 647 char *result, unsigned maxlen) 648 { 649 int r = 0; 650 char *table_args; 651 size_t sz = (size_t)maxlen; 652 struct log_c *lc = log->context; 653 654 switch (status_type) { 655 case STATUSTYPE_INFO: 656 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 657 NULL, 0, 658 result, &sz); 659 660 if (r) { 661 sz = 0; 662 DMEMIT("%s 1 COM_FAILURE", log->type->name); 663 } 664 break; 665 case STATUSTYPE_TABLE: 666 sz = 0; 667 table_args = strchr(lc->usr_argv_str, ' '); 668 BUG_ON(!table_args); /* There will always be a ' ' */ 669 table_args++; 670 671 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 672 lc->uuid, table_args); 673 break; 674 } 675 return (r) ? 0 : (int)sz; 676 } 677 678 /* 679 * userspace_is_remote_recovering 680 * 681 * Returns: 1 if region recovering, 0 otherwise 682 */ 683 static int userspace_is_remote_recovering(struct dm_dirty_log *log, 684 region_t region) 685 { 686 int r; 687 uint64_t region64 = region; 688 struct log_c *lc = log->context; 689 static unsigned long long limit; 690 struct { 691 int64_t is_recovering; 692 uint64_t in_sync_hint; 693 } pkg; 694 size_t rdata_size = sizeof(pkg); 695 696 /* 697 * Once the mirror has been reported to be in-sync, 698 * it will never again ask for recovery work. 
So, 699 * we can safely say there is not a remote machine 700 * recovering if the device is in-sync. (in_sync_hint 701 * must be reset at resume time.) 702 */ 703 if (region < lc->in_sync_hint) 704 return 0; 705 else if (jiffies < limit) 706 return 1; 707 708 limit = jiffies + (HZ / 4); 709 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, 710 (char *)®ion64, sizeof(region64), 711 (char *)&pkg, &rdata_size); 712 if (r) 713 return 1; 714 715 lc->in_sync_hint = pkg.in_sync_hint; 716 717 return (int)pkg.is_recovering; 718 } 719 720 static struct dm_dirty_log_type _userspace_type = { 721 .name = "userspace", 722 .module = THIS_MODULE, 723 .ctr = userspace_ctr, 724 .dtr = userspace_dtr, 725 .presuspend = userspace_presuspend, 726 .postsuspend = userspace_postsuspend, 727 .resume = userspace_resume, 728 .get_region_size = userspace_get_region_size, 729 .is_clean = userspace_is_clean, 730 .in_sync = userspace_in_sync, 731 .flush = userspace_flush, 732 .mark_region = userspace_mark_region, 733 .clear_region = userspace_clear_region, 734 .get_resync_work = userspace_get_resync_work, 735 .set_region_sync = userspace_set_region_sync, 736 .get_sync_count = userspace_get_sync_count, 737 .status = userspace_status, 738 .is_remote_recovering = userspace_is_remote_recovering, 739 }; 740 741 static int __init userspace_dirty_log_init(void) 742 { 743 int r = 0; 744 745 flush_entry_pool = mempool_create(100, flush_entry_alloc, 746 flush_entry_free, NULL); 747 748 if (!flush_entry_pool) { 749 DMWARN("Unable to create flush_entry_pool: No memory."); 750 return -ENOMEM; 751 } 752 753 r = dm_ulog_tfr_init(); 754 if (r) { 755 DMWARN("Unable to initialize userspace log communications"); 756 mempool_destroy(flush_entry_pool); 757 return r; 758 } 759 760 r = dm_dirty_log_type_register(&_userspace_type); 761 if (r) { 762 DMWARN("Couldn't register userspace dirty log type"); 763 dm_ulog_tfr_exit(); 764 mempool_destroy(flush_entry_pool); 765 return r; 766 } 767 768 
DMINFO("version 1.0.0 loaded"); 769 return 0; 770 } 771 772 static void __exit userspace_dirty_log_exit(void) 773 { 774 dm_dirty_log_type_unregister(&_userspace_type); 775 dm_ulog_tfr_exit(); 776 mempool_destroy(flush_entry_pool); 777 778 DMINFO("version 1.0.0 unloaded"); 779 return; 780 } 781 782 module_init(userspace_dirty_log_init); 783 module_exit(userspace_dirty_log_exit); 784 785 MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); 786 MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); 787 MODULE_LICENSE("GPL"); 788