1 /* 2 * Copyright (C) 2006-2009 Red Hat, Inc. 3 * 4 * This file is released under the LGPL. 5 */ 6 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/dm-dirty-log.h> 10 #include <linux/device-mapper.h> 11 #include <linux/dm-log-userspace.h> 12 13 #include "dm-log-userspace-transfer.h" 14 15 #define DM_LOG_USERSPACE_VSN "1.1.0" 16 17 struct flush_entry { 18 int type; 19 region_t region; 20 struct list_head list; 21 }; 22 23 /* 24 * This limit on the number of mark and clear request is, to a degree, 25 * arbitrary. However, there is some basis for the choice in the limits 26 * imposed on the size of data payload by dm-log-userspace-transfer.c: 27 * dm_consult_userspace(). 28 */ 29 #define MAX_FLUSH_GROUP_COUNT 32 30 31 struct log_c { 32 struct dm_target *ti; 33 uint32_t region_size; 34 region_t region_count; 35 uint64_t luid; 36 char uuid[DM_UUID_LEN]; 37 38 char *usr_argv_str; 39 uint32_t usr_argc; 40 41 /* 42 * in_sync_hint gets set when doing is_remote_recovering. It 43 * represents the first region that needs recovery. IOW, the 44 * first zero bit of sync_bits. This can be useful for to limit 45 * traffic for calls like is_remote_recovering and get_resync_work, 46 * but be take care in its use for anything else. 47 */ 48 uint64_t in_sync_hint; 49 50 /* 51 * Mark and clear requests are held until a flush is issued 52 * so that we can group, and thereby limit, the amount of 53 * network traffic between kernel and userspace. The 'flush_lock' 54 * is used to protect these lists. 55 */ 56 spinlock_t flush_lock; 57 struct list_head mark_list; 58 struct list_head clear_list; 59 }; 60 61 static mempool_t *flush_entry_pool; 62 63 static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) 64 { 65 return kmalloc(sizeof(struct flush_entry), gfp_mask); 66 } 67 68 static void flush_entry_free(void *element, void *pool_data) 69 { 70 kfree(element); 71 } 72 73 static int userspace_do_request(struct log_c *lc, const char *uuid, 74 int request_type, char *data, size_t data_size, 75 char *rdata, size_t *rdata_size) 76 { 77 int r; 78 79 /* 80 * If the server isn't there, -ESRCH is returned, 81 * and we must keep trying until the server is 82 * restored. 83 */ 84 retry: 85 r = dm_consult_userspace(uuid, lc->luid, request_type, data, 86 data_size, rdata, rdata_size); 87 88 if (r != -ESRCH) 89 return r; 90 91 DMERR(" Userspace log server not found."); 92 while (1) { 93 set_current_state(TASK_INTERRUPTIBLE); 94 schedule_timeout(2*HZ); 95 DMWARN("Attempting to contact userspace log server..."); 96 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, 97 lc->usr_argv_str, 98 strlen(lc->usr_argv_str) + 1, 99 NULL, NULL); 100 if (!r) 101 break; 102 } 103 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); 104 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, 105 0, NULL, NULL); 106 if (!r) 107 goto retry; 108 109 DMERR("Error trying to resume userspace log: %d", r); 110 111 return -ESRCH; 112 } 113 114 static int build_constructor_string(struct dm_target *ti, 115 unsigned argc, char **argv, 116 char **ctr_str) 117 { 118 int i, str_size; 119 char *str = NULL; 120 121 *ctr_str = NULL; 122 123 for (i = 0, str_size = 0; i < argc; i++) 124 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 125 126 str_size += 20; /* Max number of chars in a printed u64 number */ 127 128 str = kzalloc(str_size, GFP_KERNEL); 129 if (!str) { 130 DMWARN("Unable to allocate memory for constructor string"); 131 return -ENOMEM; 132 } 133 134 str_size = sprintf(str, "%llu", (unsigned long long)ti->len); 135 for (i = 0; i < argc; i++) 136 str_size += sprintf(str + str_size, " %s", argv[i]); 137 138 *ctr_str = str; 139 return str_size; 140 } 141 142 /* 143 * userspace_ctr 144 * 145 * argv contains: 146 * <UUID> <other args> 147 * Where 'other args' is the userspace implementation specific log 148 * arguments. An example might be: 149 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] 150 * 151 * So, this module will strip off the <UUID> for identification purposes 152 * when communicating with userspace about a log; but will pass on everything 153 * else. 154 */ 155 static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 156 unsigned argc, char **argv) 157 { 158 int r = 0; 159 int str_size; 160 char *ctr_str = NULL; 161 struct log_c *lc = NULL; 162 uint64_t rdata; 163 size_t rdata_size = sizeof(rdata); 164 165 if (argc < 3) { 166 DMWARN("Too few arguments to userspace dirty log"); 167 return -EINVAL; 168 } 169 170 lc = kmalloc(sizeof(*lc), GFP_KERNEL); 171 if (!lc) { 172 DMWARN("Unable to allocate userspace log context."); 173 return -ENOMEM; 174 } 175 176 /* The ptr value is sufficient for local unique id */ 177 lc->luid = (unsigned long)lc; 178 179 lc->ti = ti; 180 181 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { 182 DMWARN("UUID argument too long."); 183 kfree(lc); 184 return -EINVAL; 185 } 186 187 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 188 spin_lock_init(&lc->flush_lock); 189 INIT_LIST_HEAD(&lc->mark_list); 190 INIT_LIST_HEAD(&lc->clear_list); 191 192 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 193 if (str_size < 0) { 194 kfree(lc); 195 return str_size; 196 } 197 198 /* Send table string */ 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 200 ctr_str, str_size, NULL, NULL); 201 202 if (r < 0) { 203 if (r == -ESRCH) 204 DMERR("Userspace log server not found"); 205 else 206 DMERR("Userspace log server failed to create log"); 207 goto out; 208 } 209 210 /* Since the region size does not change, get it now */ 211 rdata_size = sizeof(rdata); 212 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, 213 NULL, 0, (char *)&rdata, &rdata_size); 214 215 if (r) { 216 DMERR("Failed to get region size of dirty log"); 217 goto out; 218 } 219 220 lc->region_size = (uint32_t)rdata; 221 lc->region_count = dm_sector_div_up(ti->len, lc->region_size); 222 223 out: 224 if (r) { 225 kfree(lc); 226 kfree(ctr_str); 227 } else { 228 lc->usr_argv_str = ctr_str; 229 lc->usr_argc = argc; 230 log->context = lc; 231 } 232 233 return r; 234 } 235 236 static void userspace_dtr(struct dm_dirty_log *log) 237 { 238 struct log_c *lc = log->context; 239 240 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 241 NULL, 0, 242 NULL, NULL); 243 244 kfree(lc->usr_argv_str); 245 kfree(lc); 246 247 return; 248 } 249 250 static int userspace_presuspend(struct dm_dirty_log *log) 251 { 252 int r; 253 struct log_c *lc = log->context; 254 255 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 256 NULL, 0, 257 NULL, NULL); 258 259 return r; 260 } 261 262 static int userspace_postsuspend(struct dm_dirty_log *log) 263 { 264 int r; 265 struct log_c *lc = log->context; 266 267 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 268 NULL, 0, 269 NULL, NULL); 270 271 return r; 272 } 273 274 static int userspace_resume(struct dm_dirty_log *log) 275 { 276 int r; 277 struct log_c *lc = log->context; 278 279 lc->in_sync_hint = 0; 280 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 281 NULL, 0, 282 NULL, NULL); 283 284 return r; 285 } 286 287 static uint32_t userspace_get_region_size(struct dm_dirty_log *log) 288 { 289 struct log_c *lc = log->context; 290 291 return lc->region_size; 292 } 293 294 /* 295 * userspace_is_clean 296 * 297 * Check whether a region is clean. If there is any sort of 298 * failure when consulting the server, we return not clean. 299 * 300 * Returns: 1 if clean, 0 otherwise 301 */ 302 static int userspace_is_clean(struct dm_dirty_log *log, region_t region) 303 { 304 int r; 305 uint64_t region64 = (uint64_t)region; 306 int64_t is_clean; 307 size_t rdata_size; 308 struct log_c *lc = log->context; 309 310 rdata_size = sizeof(is_clean); 311 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, 312 (char *)®ion64, sizeof(region64), 313 (char *)&is_clean, &rdata_size); 314 315 return (r) ? 0 : (int)is_clean; 316 } 317 318 /* 319 * userspace_in_sync 320 * 321 * Check if the region is in-sync. If there is any sort 322 * of failure when consulting the server, we assume that 323 * the region is not in sync. 324 * 325 * If 'can_block' is set, return immediately 326 * 327 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK 328 */ 329 static int userspace_in_sync(struct dm_dirty_log *log, region_t region, 330 int can_block) 331 { 332 int r; 333 uint64_t region64 = region; 334 int64_t in_sync; 335 size_t rdata_size; 336 struct log_c *lc = log->context; 337 338 /* 339 * We can never respond directly - even if in_sync_hint is 340 * set. This is because another machine could see a device 341 * failure and mark the region out-of-sync. If we don't go 342 * to userspace to ask, we might think the region is in-sync 343 * and allow a read to pick up data that is stale. (This is 344 * very unlikely if a device actually fails; but it is very 345 * likely if a connection to one device from one machine fails.) 346 * 347 * There still might be a problem if the mirror caches the region 348 * state as in-sync... but then this call would not be made. So, 349 * that is a mirror problem. 350 */ 351 if (!can_block) 352 return -EWOULDBLOCK; 353 354 rdata_size = sizeof(in_sync); 355 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, 356 (char *)®ion64, sizeof(region64), 357 (char *)&in_sync, &rdata_size); 358 return (r) ? 0 : (int)in_sync; 359 } 360 361 static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) 362 { 363 int r = 0; 364 struct flush_entry *fe; 365 366 list_for_each_entry(fe, flush_list, list) { 367 r = userspace_do_request(lc, lc->uuid, fe->type, 368 (char *)&fe->region, 369 sizeof(fe->region), 370 NULL, NULL); 371 if (r) 372 break; 373 } 374 375 return r; 376 } 377 378 static int flush_by_group(struct log_c *lc, struct list_head *flush_list) 379 { 380 int r = 0; 381 int count; 382 uint32_t type = 0; 383 struct flush_entry *fe, *tmp_fe; 384 LIST_HEAD(tmp_list); 385 uint64_t group[MAX_FLUSH_GROUP_COUNT]; 386 387 /* 388 * Group process the requests 389 */ 390 while (!list_empty(flush_list)) { 391 count = 0; 392 393 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { 394 group[count] = fe->region; 395 count++; 396 397 list_move(&fe->list, &tmp_list); 398 399 type = fe->type; 400 if (count >= MAX_FLUSH_GROUP_COUNT) 401 break; 402 } 403 404 r = userspace_do_request(lc, lc->uuid, type, 405 (char *)(group), 406 count * sizeof(uint64_t), 407 NULL, NULL); 408 if (r) { 409 /* Group send failed. Attempt one-by-one. */ 410 list_splice_init(&tmp_list, flush_list); 411 r = flush_one_by_one(lc, flush_list); 412 break; 413 } 414 } 415 416 /* 417 * Must collect flush_entrys that were successfully processed 418 * as a group so that they will be free'd by the caller. 419 */ 420 list_splice_init(&tmp_list, flush_list); 421 422 return r; 423 } 424 425 /* 426 * userspace_flush 427 * 428 * This function is ok to block. 429 * The flush happens in two stages. First, it sends all 430 * clear/mark requests that are on the list. Then it 431 * tells the server to commit them. This gives the 432 * server a chance to optimise the commit, instead of 433 * doing it for every request. 434 * 435 * Additionally, we could implement another thread that 436 * sends the requests up to the server - reducing the 437 * load on flush. Then the flush would have less in 438 * the list and be responsible for the finishing commit. 439 * 440 * Returns: 0 on success, < 0 on failure 441 */ 442 static int userspace_flush(struct dm_dirty_log *log) 443 { 444 int r = 0; 445 unsigned long flags; 446 struct log_c *lc = log->context; 447 LIST_HEAD(mark_list); 448 LIST_HEAD(clear_list); 449 struct flush_entry *fe, *tmp_fe; 450 451 spin_lock_irqsave(&lc->flush_lock, flags); 452 list_splice_init(&lc->mark_list, &mark_list); 453 list_splice_init(&lc->clear_list, &clear_list); 454 spin_unlock_irqrestore(&lc->flush_lock, flags); 455 456 if (list_empty(&mark_list) && list_empty(&clear_list)) 457 return 0; 458 459 r = flush_by_group(lc, &mark_list); 460 if (r) 461 goto fail; 462 463 r = flush_by_group(lc, &clear_list); 464 if (r) 465 goto fail; 466 467 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 468 NULL, 0, NULL, NULL); 469 470 fail: 471 /* 472 * We can safely remove these entries, even if failure. 473 * Calling code will receive an error and will know that 474 * the log facility has failed. 475 */ 476 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { 477 list_del(&fe->list); 478 mempool_free(fe, flush_entry_pool); 479 } 480 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { 481 list_del(&fe->list); 482 mempool_free(fe, flush_entry_pool); 483 } 484 485 if (r) 486 dm_table_event(lc->ti->table); 487 488 return r; 489 } 490 491 /* 492 * userspace_mark_region 493 * 494 * This function should avoid blocking unless absolutely required. 495 * (Memory allocation is valid for blocking.) 496 */ 497 static void userspace_mark_region(struct dm_dirty_log *log, region_t region) 498 { 499 unsigned long flags; 500 struct log_c *lc = log->context; 501 struct flush_entry *fe; 502 503 /* Wait for an allocation, but _never_ fail */ 504 fe = mempool_alloc(flush_entry_pool, GFP_NOIO); 505 BUG_ON(!fe); 506 507 spin_lock_irqsave(&lc->flush_lock, flags); 508 fe->type = DM_ULOG_MARK_REGION; 509 fe->region = region; 510 list_add(&fe->list, &lc->mark_list); 511 spin_unlock_irqrestore(&lc->flush_lock, flags); 512 513 return; 514 } 515 516 /* 517 * userspace_clear_region 518 * 519 * This function must not block. 520 * So, the alloc can't block. In the worst case, it is ok to 521 * fail. It would simply mean we can't clear the region. 522 * Does nothing to current sync context, but does mean 523 * the region will be re-sync'ed on a reload of the mirror 524 * even though it is in-sync. 525 */ 526 static void userspace_clear_region(struct dm_dirty_log *log, region_t region) 527 { 528 unsigned long flags; 529 struct log_c *lc = log->context; 530 struct flush_entry *fe; 531 532 /* 533 * If we fail to allocate, we skip the clearing of 534 * the region. This doesn't hurt us in any way, except 535 * to cause the region to be resync'ed when the 536 * device is activated next time. 537 */ 538 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); 539 if (!fe) { 540 DMERR("Failed to allocate memory to clear region."); 541 return; 542 } 543 544 spin_lock_irqsave(&lc->flush_lock, flags); 545 fe->type = DM_ULOG_CLEAR_REGION; 546 fe->region = region; 547 list_add(&fe->list, &lc->clear_list); 548 spin_unlock_irqrestore(&lc->flush_lock, flags); 549 550 return; 551 } 552 553 /* 554 * userspace_get_resync_work 555 * 556 * Get a region that needs recovery. It is valid to return 557 * an error for this function. 558 * 559 * Returns: 1 if region filled, 0 if no work, <0 on error 560 */ 561 static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) 562 { 563 int r; 564 size_t rdata_size; 565 struct log_c *lc = log->context; 566 struct { 567 int64_t i; /* 64-bit for mix arch compatibility */ 568 region_t r; 569 } pkg; 570 571 if (lc->in_sync_hint >= lc->region_count) 572 return 0; 573 574 rdata_size = sizeof(pkg); 575 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 576 NULL, 0, 577 (char *)&pkg, &rdata_size); 578 579 *region = pkg.r; 580 return (r) ? r : (int)pkg.i; 581 } 582 583 /* 584 * userspace_set_region_sync 585 * 586 * Set the sync status of a given region. This function 587 * must not fail. 588 */ 589 static void userspace_set_region_sync(struct dm_dirty_log *log, 590 region_t region, int in_sync) 591 { 592 int r; 593 struct log_c *lc = log->context; 594 struct { 595 region_t r; 596 int64_t i; 597 } pkg; 598 599 pkg.r = region; 600 pkg.i = (int64_t)in_sync; 601 602 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 603 (char *)&pkg, sizeof(pkg), 604 NULL, NULL); 605 606 /* 607 * It would be nice to be able to report failures. 608 * However, it is easy emough to detect and resolve. 609 */ 610 return; 611 } 612 613 /* 614 * userspace_get_sync_count 615 * 616 * If there is any sort of failure when consulting the server, 617 * we assume that the sync count is zero. 618 * 619 * Returns: sync count on success, 0 on failure 620 */ 621 static region_t userspace_get_sync_count(struct dm_dirty_log *log) 622 { 623 int r; 624 size_t rdata_size; 625 uint64_t sync_count; 626 struct log_c *lc = log->context; 627 628 rdata_size = sizeof(sync_count); 629 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 630 NULL, 0, 631 (char *)&sync_count, &rdata_size); 632 633 if (r) 634 return 0; 635 636 if (sync_count >= lc->region_count) 637 lc->in_sync_hint = lc->region_count; 638 639 return (region_t)sync_count; 640 } 641 642 /* 643 * userspace_status 644 * 645 * Returns: amount of space consumed 646 */ 647 static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, 648 char *result, unsigned maxlen) 649 { 650 int r = 0; 651 char *table_args; 652 size_t sz = (size_t)maxlen; 653 struct log_c *lc = log->context; 654 655 switch (status_type) { 656 case STATUSTYPE_INFO: 657 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 658 NULL, 0, 659 result, &sz); 660 661 if (r) { 662 sz = 0; 663 DMEMIT("%s 1 COM_FAILURE", log->type->name); 664 } 665 break; 666 case STATUSTYPE_TABLE: 667 sz = 0; 668 table_args = strchr(lc->usr_argv_str, ' '); 669 BUG_ON(!table_args); /* There will always be a ' ' */ 670 table_args++; 671 672 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 673 lc->uuid, table_args); 674 break; 675 } 676 return (r) ? 0 : (int)sz; 677 } 678 679 /* 680 * userspace_is_remote_recovering 681 * 682 * Returns: 1 if region recovering, 0 otherwise 683 */ 684 static int userspace_is_remote_recovering(struct dm_dirty_log *log, 685 region_t region) 686 { 687 int r; 688 uint64_t region64 = region; 689 struct log_c *lc = log->context; 690 static unsigned long long limit; 691 struct { 692 int64_t is_recovering; 693 uint64_t in_sync_hint; 694 } pkg; 695 size_t rdata_size = sizeof(pkg); 696 697 /* 698 * Once the mirror has been reported to be in-sync, 699 * it will never again ask for recovery work. So, 700 * we can safely say there is not a remote machine 701 * recovering if the device is in-sync. (in_sync_hint 702 * must be reset at resume time.) 703 */ 704 if (region < lc->in_sync_hint) 705 return 0; 706 else if (jiffies < limit) 707 return 1; 708 709 limit = jiffies + (HZ / 4); 710 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, 711 (char *)®ion64, sizeof(region64), 712 (char *)&pkg, &rdata_size); 713 if (r) 714 return 1; 715 716 lc->in_sync_hint = pkg.in_sync_hint; 717 718 return (int)pkg.is_recovering; 719 } 720 721 static struct dm_dirty_log_type _userspace_type = { 722 .name = "userspace", 723 .module = THIS_MODULE, 724 .ctr = userspace_ctr, 725 .dtr = userspace_dtr, 726 .presuspend = userspace_presuspend, 727 .postsuspend = userspace_postsuspend, 728 .resume = userspace_resume, 729 .get_region_size = userspace_get_region_size, 730 .is_clean = userspace_is_clean, 731 .in_sync = userspace_in_sync, 732 .flush = userspace_flush, 733 .mark_region = userspace_mark_region, 734 .clear_region = userspace_clear_region, 735 .get_resync_work = userspace_get_resync_work, 736 .set_region_sync = userspace_set_region_sync, 737 .get_sync_count = userspace_get_sync_count, 738 .status = userspace_status, 739 .is_remote_recovering = userspace_is_remote_recovering, 740 }; 741 742 static int __init userspace_dirty_log_init(void) 743 { 744 int r = 0; 745 746 flush_entry_pool = mempool_create(100, flush_entry_alloc, 747 flush_entry_free, NULL); 748 749 if (!flush_entry_pool) { 750 DMWARN("Unable to create flush_entry_pool: No memory."); 751 return -ENOMEM; 752 } 753 754 r = dm_ulog_tfr_init(); 755 if (r) { 756 DMWARN("Unable to initialize userspace log communications"); 757 mempool_destroy(flush_entry_pool); 758 return r; 759 } 760 761 r = dm_dirty_log_type_register(&_userspace_type); 762 if (r) { 763 DMWARN("Couldn't register userspace dirty log type"); 764 dm_ulog_tfr_exit(); 765 mempool_destroy(flush_entry_pool); 766 return r; 767 } 768 769 DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); 770 return 0; 771 } 772 773 static void __exit userspace_dirty_log_exit(void) 774 { 775 dm_dirty_log_type_unregister(&_userspace_type); 776 dm_ulog_tfr_exit(); 777 mempool_destroy(flush_entry_pool); 778 779 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); 780 return; 781 } 782 783 module_init(userspace_dirty_log_init); 784 module_exit(userspace_dirty_log_exit); 785 786 MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); 787 MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); 788 MODULE_LICENSE("GPL"); 789