1 /* 2 * Copyright (C) 2006-2009 Red Hat, Inc. 3 * 4 * This file is released under the LGPL. 5 */ 6 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/dm-dirty-log.h> 10 #include <linux/device-mapper.h> 11 #include <linux/dm-log-userspace.h> 12 13 #include "dm-log-userspace-transfer.h" 14 15 struct flush_entry { 16 int type; 17 region_t region; 18 struct list_head list; 19 }; 20 21 struct log_c { 22 struct dm_target *ti; 23 uint32_t region_size; 24 region_t region_count; 25 uint64_t luid; 26 char uuid[DM_UUID_LEN]; 27 28 char *usr_argv_str; 29 uint32_t usr_argc; 30 31 /* 32 * in_sync_hint gets set when doing is_remote_recovering. It 33 * represents the first region that needs recovery. IOW, the 34 * first zero bit of sync_bits. This can be useful for to limit 35 * traffic for calls like is_remote_recovering and get_resync_work, 36 * but be take care in its use for anything else. 37 */ 38 uint64_t in_sync_hint; 39 40 /* 41 * Mark and clear requests are held until a flush is issued 42 * so that we can group, and thereby limit, the amount of 43 * network traffic between kernel and userspace. The 'flush_lock' 44 * is used to protect these lists. 45 */ 46 spinlock_t flush_lock; 47 struct list_head mark_list; 48 struct list_head clear_list; 49 }; 50 51 static mempool_t *flush_entry_pool; 52 53 static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) 54 { 55 return kmalloc(sizeof(struct flush_entry), gfp_mask); 56 } 57 58 static void flush_entry_free(void *element, void *pool_data) 59 { 60 kfree(element); 61 } 62 63 static int userspace_do_request(struct log_c *lc, const char *uuid, 64 int request_type, char *data, size_t data_size, 65 char *rdata, size_t *rdata_size) 66 { 67 int r; 68 69 /* 70 * If the server isn't there, -ESRCH is returned, 71 * and we must keep trying until the server is 72 * restored. 73 */ 74 retry: 75 r = dm_consult_userspace(uuid, lc->luid, request_type, data, 76 data_size, rdata, rdata_size); 77 78 if (r != -ESRCH) 79 return r; 80 81 DMERR(" Userspace log server not found."); 82 while (1) { 83 set_current_state(TASK_INTERRUPTIBLE); 84 schedule_timeout(2*HZ); 85 DMWARN("Attempting to contact userspace log server..."); 86 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, 87 lc->usr_argv_str, 88 strlen(lc->usr_argv_str) + 1, 89 NULL, NULL); 90 if (!r) 91 break; 92 } 93 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); 94 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, 95 0, NULL, NULL); 96 if (!r) 97 goto retry; 98 99 DMERR("Error trying to resume userspace log: %d", r); 100 101 return -ESRCH; 102 } 103 104 static int build_constructor_string(struct dm_target *ti, 105 unsigned argc, char **argv, 106 char **ctr_str) 107 { 108 int i, str_size; 109 char *str = NULL; 110 111 *ctr_str = NULL; 112 113 for (i = 0, str_size = 0; i < argc; i++) 114 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 115 116 str_size += 20; /* Max number of chars in a printed u64 number */ 117 118 str = kzalloc(str_size, GFP_KERNEL); 119 if (!str) { 120 DMWARN("Unable to allocate memory for constructor string"); 121 return -ENOMEM; 122 } 123 124 str_size = sprintf(str, "%llu", (unsigned long long)ti->len); 125 for (i = 0; i < argc; i++) 126 str_size += sprintf(str + str_size, " %s", argv[i]); 127 128 *ctr_str = str; 129 return str_size; 130 } 131 132 /* 133 * userspace_ctr 134 * 135 * argv contains: 136 * <UUID> <other args> 137 * Where 'other args' is the userspace implementation specific log 138 * arguments. An example might be: 139 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] 140 * 141 * So, this module will strip off the <UUID> for identification purposes 142 * when communicating with userspace about a log; but will pass on everything 143 * else. 144 */ 145 static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 146 unsigned argc, char **argv) 147 { 148 int r = 0; 149 int str_size; 150 char *ctr_str = NULL; 151 struct log_c *lc = NULL; 152 uint64_t rdata; 153 size_t rdata_size = sizeof(rdata); 154 155 if (argc < 3) { 156 DMWARN("Too few arguments to userspace dirty log"); 157 return -EINVAL; 158 } 159 160 lc = kmalloc(sizeof(*lc), GFP_KERNEL); 161 if (!lc) { 162 DMWARN("Unable to allocate userspace log context."); 163 return -ENOMEM; 164 } 165 166 /* The ptr value is sufficient for local unique id */ 167 lc->luid = (unsigned long)lc; 168 169 lc->ti = ti; 170 171 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { 172 DMWARN("UUID argument too long."); 173 kfree(lc); 174 return -EINVAL; 175 } 176 177 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 178 spin_lock_init(&lc->flush_lock); 179 INIT_LIST_HEAD(&lc->mark_list); 180 INIT_LIST_HEAD(&lc->clear_list); 181 182 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 183 if (str_size < 0) { 184 kfree(lc); 185 return str_size; 186 } 187 188 /* Send table string */ 189 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 190 ctr_str, str_size, NULL, NULL); 191 192 if (r < 0) { 193 if (r == -ESRCH) 194 DMERR("Userspace log server not found"); 195 else 196 DMERR("Userspace log server failed to create log"); 197 goto out; 198 } 199 200 /* Since the region size does not change, get it now */ 201 rdata_size = sizeof(rdata); 202 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, 203 NULL, 0, (char *)&rdata, &rdata_size); 204 205 if (r) { 206 DMERR("Failed to get region size of dirty log"); 207 goto out; 208 } 209 210 lc->region_size = (uint32_t)rdata; 211 lc->region_count = dm_sector_div_up(ti->len, lc->region_size); 212 213 out: 214 if (r) { 215 kfree(lc); 216 kfree(ctr_str); 217 } else { 218 lc->usr_argv_str = ctr_str; 219 lc->usr_argc = argc; 220 log->context = lc; 221 } 222 223 return r; 224 } 225 226 static void userspace_dtr(struct dm_dirty_log *log) 227 { 228 struct log_c *lc = log->context; 229 230 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 231 NULL, 0, 232 NULL, NULL); 233 234 kfree(lc->usr_argv_str); 235 kfree(lc); 236 237 return; 238 } 239 240 static int userspace_presuspend(struct dm_dirty_log *log) 241 { 242 int r; 243 struct log_c *lc = log->context; 244 245 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 246 NULL, 0, 247 NULL, NULL); 248 249 return r; 250 } 251 252 static int userspace_postsuspend(struct dm_dirty_log *log) 253 { 254 int r; 255 struct log_c *lc = log->context; 256 257 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 258 NULL, 0, 259 NULL, NULL); 260 261 return r; 262 } 263 264 static int userspace_resume(struct dm_dirty_log *log) 265 { 266 int r; 267 struct log_c *lc = log->context; 268 269 lc->in_sync_hint = 0; 270 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 271 NULL, 0, 272 NULL, NULL); 273 274 return r; 275 } 276 277 static uint32_t userspace_get_region_size(struct dm_dirty_log *log) 278 { 279 struct log_c *lc = log->context; 280 281 return lc->region_size; 282 } 283 284 /* 285 * userspace_is_clean 286 * 287 * Check whether a region is clean. If there is any sort of 288 * failure when consulting the server, we return not clean. 289 * 290 * Returns: 1 if clean, 0 otherwise 291 */ 292 static int userspace_is_clean(struct dm_dirty_log *log, region_t region) 293 { 294 int r; 295 uint64_t region64 = (uint64_t)region; 296 int64_t is_clean; 297 size_t rdata_size; 298 struct log_c *lc = log->context; 299 300 rdata_size = sizeof(is_clean); 301 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, 302 (char *)®ion64, sizeof(region64), 303 (char *)&is_clean, &rdata_size); 304 305 return (r) ? 0 : (int)is_clean; 306 } 307 308 /* 309 * userspace_in_sync 310 * 311 * Check if the region is in-sync. If there is any sort 312 * of failure when consulting the server, we assume that 313 * the region is not in sync. 314 * 315 * If 'can_block' is set, return immediately 316 * 317 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK 318 */ 319 static int userspace_in_sync(struct dm_dirty_log *log, region_t region, 320 int can_block) 321 { 322 int r; 323 uint64_t region64 = region; 324 int64_t in_sync; 325 size_t rdata_size; 326 struct log_c *lc = log->context; 327 328 /* 329 * We can never respond directly - even if in_sync_hint is 330 * set. This is because another machine could see a device 331 * failure and mark the region out-of-sync. If we don't go 332 * to userspace to ask, we might think the region is in-sync 333 * and allow a read to pick up data that is stale. (This is 334 * very unlikely if a device actually fails; but it is very 335 * likely if a connection to one device from one machine fails.) 336 * 337 * There still might be a problem if the mirror caches the region 338 * state as in-sync... but then this call would not be made. So, 339 * that is a mirror problem. 340 */ 341 if (!can_block) 342 return -EWOULDBLOCK; 343 344 rdata_size = sizeof(in_sync); 345 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, 346 (char *)®ion64, sizeof(region64), 347 (char *)&in_sync, &rdata_size); 348 return (r) ? 0 : (int)in_sync; 349 } 350 351 /* 352 * userspace_flush 353 * 354 * This function is ok to block. 355 * The flush happens in two stages. First, it sends all 356 * clear/mark requests that are on the list. Then it 357 * tells the server to commit them. This gives the 358 * server a chance to optimise the commit, instead of 359 * doing it for every request. 360 * 361 * Additionally, we could implement another thread that 362 * sends the requests up to the server - reducing the 363 * load on flush. Then the flush would have less in 364 * the list and be responsible for the finishing commit. 365 * 366 * Returns: 0 on success, < 0 on failure 367 */ 368 static int userspace_flush(struct dm_dirty_log *log) 369 { 370 int r = 0; 371 unsigned long flags; 372 struct log_c *lc = log->context; 373 LIST_HEAD(mark_list); 374 LIST_HEAD(clear_list); 375 struct flush_entry *fe, *tmp_fe; 376 377 spin_lock_irqsave(&lc->flush_lock, flags); 378 list_splice_init(&lc->mark_list, &mark_list); 379 list_splice_init(&lc->clear_list, &clear_list); 380 spin_unlock_irqrestore(&lc->flush_lock, flags); 381 382 if (list_empty(&mark_list) && list_empty(&clear_list)) 383 return 0; 384 385 /* 386 * FIXME: Count up requests, group request types, 387 * allocate memory to stick all requests in and 388 * send to server in one go. Failing the allocation, 389 * do it one by one. 390 */ 391 392 list_for_each_entry(fe, &mark_list, list) { 393 r = userspace_do_request(lc, lc->uuid, fe->type, 394 (char *)&fe->region, 395 sizeof(fe->region), 396 NULL, NULL); 397 if (r) 398 goto fail; 399 } 400 401 list_for_each_entry(fe, &clear_list, list) { 402 r = userspace_do_request(lc, lc->uuid, fe->type, 403 (char *)&fe->region, 404 sizeof(fe->region), 405 NULL, NULL); 406 if (r) 407 goto fail; 408 } 409 410 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 411 NULL, 0, NULL, NULL); 412 413 fail: 414 /* 415 * We can safely remove these entries, even if failure. 416 * Calling code will receive an error and will know that 417 * the log facility has failed. 418 */ 419 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { 420 list_del(&fe->list); 421 mempool_free(fe, flush_entry_pool); 422 } 423 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { 424 list_del(&fe->list); 425 mempool_free(fe, flush_entry_pool); 426 } 427 428 if (r) 429 dm_table_event(lc->ti->table); 430 431 return r; 432 } 433 434 /* 435 * userspace_mark_region 436 * 437 * This function should avoid blocking unless absolutely required. 438 * (Memory allocation is valid for blocking.) 439 */ 440 static void userspace_mark_region(struct dm_dirty_log *log, region_t region) 441 { 442 unsigned long flags; 443 struct log_c *lc = log->context; 444 struct flush_entry *fe; 445 446 /* Wait for an allocation, but _never_ fail */ 447 fe = mempool_alloc(flush_entry_pool, GFP_NOIO); 448 BUG_ON(!fe); 449 450 spin_lock_irqsave(&lc->flush_lock, flags); 451 fe->type = DM_ULOG_MARK_REGION; 452 fe->region = region; 453 list_add(&fe->list, &lc->mark_list); 454 spin_unlock_irqrestore(&lc->flush_lock, flags); 455 456 return; 457 } 458 459 /* 460 * userspace_clear_region 461 * 462 * This function must not block. 463 * So, the alloc can't block. In the worst case, it is ok to 464 * fail. It would simply mean we can't clear the region. 465 * Does nothing to current sync context, but does mean 466 * the region will be re-sync'ed on a reload of the mirror 467 * even though it is in-sync. 468 */ 469 static void userspace_clear_region(struct dm_dirty_log *log, region_t region) 470 { 471 unsigned long flags; 472 struct log_c *lc = log->context; 473 struct flush_entry *fe; 474 475 /* 476 * If we fail to allocate, we skip the clearing of 477 * the region. This doesn't hurt us in any way, except 478 * to cause the region to be resync'ed when the 479 * device is activated next time. 480 */ 481 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); 482 if (!fe) { 483 DMERR("Failed to allocate memory to clear region."); 484 return; 485 } 486 487 spin_lock_irqsave(&lc->flush_lock, flags); 488 fe->type = DM_ULOG_CLEAR_REGION; 489 fe->region = region; 490 list_add(&fe->list, &lc->clear_list); 491 spin_unlock_irqrestore(&lc->flush_lock, flags); 492 493 return; 494 } 495 496 /* 497 * userspace_get_resync_work 498 * 499 * Get a region that needs recovery. It is valid to return 500 * an error for this function. 501 * 502 * Returns: 1 if region filled, 0 if no work, <0 on error 503 */ 504 static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) 505 { 506 int r; 507 size_t rdata_size; 508 struct log_c *lc = log->context; 509 struct { 510 int64_t i; /* 64-bit for mix arch compatibility */ 511 region_t r; 512 } pkg; 513 514 if (lc->in_sync_hint >= lc->region_count) 515 return 0; 516 517 rdata_size = sizeof(pkg); 518 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 519 NULL, 0, 520 (char *)&pkg, &rdata_size); 521 522 *region = pkg.r; 523 return (r) ? r : (int)pkg.i; 524 } 525 526 /* 527 * userspace_set_region_sync 528 * 529 * Set the sync status of a given region. This function 530 * must not fail. 531 */ 532 static void userspace_set_region_sync(struct dm_dirty_log *log, 533 region_t region, int in_sync) 534 { 535 int r; 536 struct log_c *lc = log->context; 537 struct { 538 region_t r; 539 int64_t i; 540 } pkg; 541 542 pkg.r = region; 543 pkg.i = (int64_t)in_sync; 544 545 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 546 (char *)&pkg, sizeof(pkg), 547 NULL, NULL); 548 549 /* 550 * It would be nice to be able to report failures. 551 * However, it is easy emough to detect and resolve. 552 */ 553 return; 554 } 555 556 /* 557 * userspace_get_sync_count 558 * 559 * If there is any sort of failure when consulting the server, 560 * we assume that the sync count is zero. 561 * 562 * Returns: sync count on success, 0 on failure 563 */ 564 static region_t userspace_get_sync_count(struct dm_dirty_log *log) 565 { 566 int r; 567 size_t rdata_size; 568 uint64_t sync_count; 569 struct log_c *lc = log->context; 570 571 rdata_size = sizeof(sync_count); 572 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 573 NULL, 0, 574 (char *)&sync_count, &rdata_size); 575 576 if (r) 577 return 0; 578 579 if (sync_count >= lc->region_count) 580 lc->in_sync_hint = lc->region_count; 581 582 return (region_t)sync_count; 583 } 584 585 /* 586 * userspace_status 587 * 588 * Returns: amount of space consumed 589 */ 590 static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, 591 char *result, unsigned maxlen) 592 { 593 int r = 0; 594 char *table_args; 595 size_t sz = (size_t)maxlen; 596 struct log_c *lc = log->context; 597 598 switch (status_type) { 599 case STATUSTYPE_INFO: 600 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 601 NULL, 0, 602 result, &sz); 603 604 if (r) { 605 sz = 0; 606 DMEMIT("%s 1 COM_FAILURE", log->type->name); 607 } 608 break; 609 case STATUSTYPE_TABLE: 610 sz = 0; 611 table_args = strchr(lc->usr_argv_str, ' '); 612 BUG_ON(!table_args); /* There will always be a ' ' */ 613 table_args++; 614 615 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 616 lc->uuid, table_args); 617 break; 618 } 619 return (r) ? 0 : (int)sz; 620 } 621 622 /* 623 * userspace_is_remote_recovering 624 * 625 * Returns: 1 if region recovering, 0 otherwise 626 */ 627 static int userspace_is_remote_recovering(struct dm_dirty_log *log, 628 region_t region) 629 { 630 int r; 631 uint64_t region64 = region; 632 struct log_c *lc = log->context; 633 static unsigned long long limit; 634 struct { 635 int64_t is_recovering; 636 uint64_t in_sync_hint; 637 } pkg; 638 size_t rdata_size = sizeof(pkg); 639 640 /* 641 * Once the mirror has been reported to be in-sync, 642 * it will never again ask for recovery work. So, 643 * we can safely say there is not a remote machine 644 * recovering if the device is in-sync. (in_sync_hint 645 * must be reset at resume time.) 646 */ 647 if (region < lc->in_sync_hint) 648 return 0; 649 else if (jiffies < limit) 650 return 1; 651 652 limit = jiffies + (HZ / 4); 653 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, 654 (char *)®ion64, sizeof(region64), 655 (char *)&pkg, &rdata_size); 656 if (r) 657 return 1; 658 659 lc->in_sync_hint = pkg.in_sync_hint; 660 661 return (int)pkg.is_recovering; 662 } 663 664 static struct dm_dirty_log_type _userspace_type = { 665 .name = "userspace", 666 .module = THIS_MODULE, 667 .ctr = userspace_ctr, 668 .dtr = userspace_dtr, 669 .presuspend = userspace_presuspend, 670 .postsuspend = userspace_postsuspend, 671 .resume = userspace_resume, 672 .get_region_size = userspace_get_region_size, 673 .is_clean = userspace_is_clean, 674 .in_sync = userspace_in_sync, 675 .flush = userspace_flush, 676 .mark_region = userspace_mark_region, 677 .clear_region = userspace_clear_region, 678 .get_resync_work = userspace_get_resync_work, 679 .set_region_sync = userspace_set_region_sync, 680 .get_sync_count = userspace_get_sync_count, 681 .status = userspace_status, 682 .is_remote_recovering = userspace_is_remote_recovering, 683 }; 684 685 static int __init userspace_dirty_log_init(void) 686 { 687 int r = 0; 688 689 flush_entry_pool = mempool_create(100, flush_entry_alloc, 690 flush_entry_free, NULL); 691 692 if (!flush_entry_pool) { 693 DMWARN("Unable to create flush_entry_pool: No memory."); 694 return -ENOMEM; 695 } 696 697 r = dm_ulog_tfr_init(); 698 if (r) { 699 DMWARN("Unable to initialize userspace log communications"); 700 mempool_destroy(flush_entry_pool); 701 return r; 702 } 703 704 r = dm_dirty_log_type_register(&_userspace_type); 705 if (r) { 706 DMWARN("Couldn't register userspace dirty log type"); 707 dm_ulog_tfr_exit(); 708 mempool_destroy(flush_entry_pool); 709 return r; 710 } 711 712 DMINFO("version 1.0.0 loaded"); 713 return 0; 714 } 715 716 static void __exit userspace_dirty_log_exit(void) 717 { 718 dm_dirty_log_type_unregister(&_userspace_type); 719 dm_ulog_tfr_exit(); 720 mempool_destroy(flush_entry_pool); 721 722 DMINFO("version 1.0.0 unloaded"); 723 return; 724 } 725 726 module_init(userspace_dirty_log_init); 727 module_exit(userspace_dirty_log_exit); 728 729 MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); 730 MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); 731 MODULE_LICENSE("GPL"); 732