/*
 * Dirty page rate limit implementation code
 *
 * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
 *
 * Authors:
 *  Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qdict.h"
#include "qapi/error.h"
#include "sysemu/dirtyrate.h"
#include "sysemu/dirtylimit.h"
#include "monitor/hmp.h"
#include "monitor/monitor.h"
#include "exec/memory.h"
#include "exec/target_page.h"
#include "hw/boards.h"
#include "sysemu/kvm.h"
#include "trace.h"
#include "migration/misc.h"
#include "migration/migration.h"

/*
 * Dirtylimit stops adjusting the throttle once the dirty page rate
 * error drops below DIRTYLIMIT_TOLERANCE_RANGE.
 */
#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
/*
 * Increase or decrease the vCPU sleep time linearly if the dirty
 * page rate error percentage is over DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
 * otherwise, adjust the sleep time by a fixed step.
 */
#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT  50
/*
 * Max vCPU sleep time percentage during a cycle composed of the
 * dirty-ring-full interval and the sleep time.
 */
#define DIRTYLIMIT_THROTTLE_PCT_MAX 99

struct {
    VcpuStat stat;
    bool running;
    QemuThread thread;
} *vcpu_dirty_rate_stat;

typedef struct VcpuDirtyLimitState {
    int cpu_index;
    bool enabled;
    /*
     * Quota dirty page rate, unit is MB/s;
     * zero if not enabled.
     */
    uint64_t quota;
} VcpuDirtyLimitState;

struct {
    VcpuDirtyLimitState *states;
    /* Max number of cpus configured by the user */
    int max_cpus;
    /* Number of vCPUs under dirtylimit */
    int limited_nvcpu;
} *dirtylimit_state;

/* protect dirtylimit_state */
static QemuMutex dirtylimit_mutex;

/* dirtylimit processing stops if dirtylimit_quit is true */
static bool dirtylimit_quit;

static void vcpu_dirty_rate_stat_collect(void)
{
    VcpuStat stat;
    int i = 0;
    int64_t period = DIRTYLIMIT_CALC_TIME_MS;

    if (migrate_dirty_limit() &&
        migration_is_active()) {
        period = migrate_vcpu_dirty_limit_period();
    }

    /* calculate vcpu dirtyrate */
    vcpu_calculate_dirtyrate(period,
                             &stat,
                             GLOBAL_DIRTY_LIMIT,
                             false);

    for (i = 0; i < stat.nvcpu; i++) {
        vcpu_dirty_rate_stat->stat.rates[i].id = i;
        /* store atomically: readers use qatomic_read_i64() */
        qatomic_set_i64(&vcpu_dirty_rate_stat->stat.rates[i].dirty_rate,
                        stat.rates[i].dirty_rate);
    }

    g_free(stat.rates);
}

static void *vcpu_dirty_rate_stat_thread(void *opaque)
{
    rcu_register_thread();

    /* start log sync */
    global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);

    while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
        vcpu_dirty_rate_stat_collect();
        if (dirtylimit_in_service()) {
            dirtylimit_process();
        }
    }

    /* stop log sync */
    global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);

    rcu_unregister_thread();
    return NULL;
}

int64_t vcpu_dirty_rate_get(int cpu_index)
{
    DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates;
    return qatomic_read_i64(&rates[cpu_index].dirty_rate);
}

void vcpu_dirty_rate_stat_start(void)
{
    if (qatomic_read(&vcpu_dirty_rate_stat->running)) {
        return;
    }

    qatomic_set(&vcpu_dirty_rate_stat->running, 1);
    qemu_thread_create(&vcpu_dirty_rate_stat->thread,
                       "dirtyrate-stat",
                       vcpu_dirty_rate_stat_thread,
                       NULL,
                       QEMU_THREAD_JOINABLE);
}
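/*
 * Note on the lock dance below: the BQL and the dirtylimit state lock
 * are dropped before joining the stat thread, then retaken. Presumably
 * this avoids a deadlock: the stat thread's collect/process loop takes
 * the dirtylimit state lock (via dirtylimit_process()) and may block
 * on the BQL, so joining it while holding either lock could hang.
 */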
void vcpu_dirty_rate_stat_stop(void)
{
    qatomic_set(&vcpu_dirty_rate_stat->running, 0);
    dirtylimit_state_unlock();
    bql_unlock();
    qemu_thread_join(&vcpu_dirty_rate_stat->thread);
    bql_lock();
    dirtylimit_state_lock();
}

void vcpu_dirty_rate_stat_initialize(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;

    vcpu_dirty_rate_stat =
        g_malloc0(sizeof(*vcpu_dirty_rate_stat));

    vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
    vcpu_dirty_rate_stat->stat.rates =
        g_new0(DirtyRateVcpu, max_cpus);

    vcpu_dirty_rate_stat->running = false;
}

void vcpu_dirty_rate_stat_finalize(void)
{
    g_free(vcpu_dirty_rate_stat->stat.rates);
    vcpu_dirty_rate_stat->stat.rates = NULL;

    g_free(vcpu_dirty_rate_stat);
    vcpu_dirty_rate_stat = NULL;
}

void dirtylimit_state_lock(void)
{
    qemu_mutex_lock(&dirtylimit_mutex);
}

void dirtylimit_state_unlock(void)
{
    qemu_mutex_unlock(&dirtylimit_mutex);
}

static void
__attribute__((__constructor__)) dirtylimit_mutex_init(void)
{
    qemu_mutex_init(&dirtylimit_mutex);
}

static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
{
    return &dirtylimit_state->states[cpu_index];
}

void dirtylimit_state_initialize(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;
    int i;

    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));

    dirtylimit_state->states =
        g_new0(VcpuDirtyLimitState, max_cpus);

    for (i = 0; i < max_cpus; i++) {
        dirtylimit_state->states[i].cpu_index = i;
    }

    dirtylimit_state->max_cpus = max_cpus;
    trace_dirtylimit_state_initialize(max_cpus);
}

void dirtylimit_state_finalize(void)
{
    g_free(dirtylimit_state->states);
    dirtylimit_state->states = NULL;

    g_free(dirtylimit_state);
    dirtylimit_state = NULL;

    trace_dirtylimit_state_finalize();
}

bool dirtylimit_in_service(void)
{
    return !!dirtylimit_state;
}

bool dirtylimit_vcpu_index_valid(int cpu_index)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    return cpu_index >= 0 && cpu_index < ms->smp.max_cpus;
}

/*
 * Return the expected time (in microseconds) the dirty ring takes to
 * fill at the highest dirty page rate observed so far.
 */
static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
{
    static uint64_t max_dirtyrate;
    uint64_t dirty_ring_size_MiB;

    dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size());

    if (max_dirtyrate < dirtyrate) {
        max_dirtyrate = dirtyrate;
    }

    /* MiB / (MiB/s) * 1000000 = microseconds */
    return dirty_ring_size_MiB * 1000000 / max_dirtyrate;
}

static inline bool dirtylimit_done(uint64_t quota,
                                   uint64_t current)
{
    uint64_t min, max;

    min = MIN(quota, current);
    max = MAX(quota, current);

    return (max - min) <= DIRTYLIMIT_TOLERANCE_RANGE;
}
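/*
 * Worked example for the linear adjustment below, with illustrative
 * numbers: quota = 100 MB/s, current = 200 MB/s, so
 *   sleep_pct   = (200 - 100) * 100 / 200 = 50
 *   throttle_us = ring_full_time_us * 50 / (100 - 50)
 *               = ring_full_time_us
 * i.e. the vCPU sleeps as long as it runs each ring-full cycle,
 * roughly halving the dirty page rate toward the quota.
 */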
static inline bool
dirtylimit_need_linear_adjustment(uint64_t quota,
                                  uint64_t current)
{
    uint64_t min, max;

    min = MIN(quota, current);
    max = MAX(quota, current);

    return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
}

static void dirtylimit_set_throttle(CPUState *cpu,
                                    uint64_t quota,
                                    uint64_t current)
{
    int64_t ring_full_time_us = 0;
    uint64_t sleep_pct = 0;
    uint64_t throttle_us = 0;

    if (current == 0) {
        cpu->throttle_us_per_full = 0;
        return;
    }

    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);

    if (dirtylimit_need_linear_adjustment(quota, current)) {
        if (quota < current) {
            sleep_pct = (current - quota) * 100 / current;
            throttle_us =
                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
            cpu->throttle_us_per_full += throttle_us;
        } else {
            sleep_pct = (quota - current) * 100 / quota;
            throttle_us =
                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
            cpu->throttle_us_per_full -= throttle_us;
        }

        trace_dirtylimit_throttle_pct(cpu->cpu_index,
                                      sleep_pct,
                                      throttle_us);
    } else {
        if (quota < current) {
            cpu->throttle_us_per_full += ring_full_time_us / 10;
        } else {
            cpu->throttle_us_per_full -= ring_full_time_us / 10;
        }
    }

    /*
     * TODO: with a large kvm_dirty_ring_size (e.g. 65536), the current
     * dirty page rate may never reach the quota; should we stop
     * increasing the sleep time in that case?
     */
    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);

    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
}

static void dirtylimit_adjust_throttle(CPUState *cpu)
{
    uint64_t quota = 0;
    uint64_t current = 0;
    int cpu_index = cpu->cpu_index;

    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
    current = vcpu_dirty_rate_get(cpu_index);

    if (!dirtylimit_done(quota, current)) {
        dirtylimit_set_throttle(cpu, quota, current);
    }
}

void dirtylimit_process(void)
{
    CPUState *cpu;

    if (!qatomic_read(&dirtylimit_quit)) {
        dirtylimit_state_lock();

        if (!dirtylimit_in_service()) {
            dirtylimit_state_unlock();
            return;
        }

        CPU_FOREACH(cpu) {
            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
                continue;
            }
            dirtylimit_adjust_throttle(cpu);
        }
        dirtylimit_state_unlock();
    }
}

void dirtylimit_change(bool start)
{
    if (start) {
        qatomic_set(&dirtylimit_quit, 0);
    } else {
        qatomic_set(&dirtylimit_quit, 1);
    }
}

void dirtylimit_set_vcpu(int cpu_index,
                         uint64_t quota,
                         bool enable)
{
    trace_dirtylimit_set_vcpu(cpu_index, quota);

    if (enable) {
        dirtylimit_state->states[cpu_index].quota = quota;
        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
            dirtylimit_state->limited_nvcpu++;
        }
    } else {
        dirtylimit_state->states[cpu_index].quota = 0;
        if (dirtylimit_state->states[cpu_index].enabled) {
            dirtylimit_state->limited_nvcpu--;
        }
    }

    dirtylimit_state->states[cpu_index].enabled = enable;
}

void dirtylimit_set_all(uint64_t quota,
                        bool enable)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;
    int i;

    for (i = 0; i < max_cpus; i++) {
        dirtylimit_set_vcpu(i, quota, enable);
    }
}
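/*
 * dirtylimit_vcpu_execute() is expected to run in the vCPU thread on
 * the KVM dirty-ring-full exit path; sleeping here stretches each
 * ring-full cycle, which is what caps the dirty page rate.
 */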
void dirtylimit_vcpu_execute(CPUState *cpu)
{
    if (cpu->throttle_us_per_full) {
        dirtylimit_state_lock();

        if (dirtylimit_in_service() &&
            dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
            dirtylimit_state_unlock();
            trace_dirtylimit_vcpu_execute(cpu->cpu_index,
                                          cpu->throttle_us_per_full);

            g_usleep(cpu->throttle_us_per_full);
            return;
        }

        dirtylimit_state_unlock();
    }
}

static void dirtylimit_init(void)
{
    dirtylimit_state_initialize();
    dirtylimit_change(true);
    vcpu_dirty_rate_stat_initialize();
    vcpu_dirty_rate_stat_start();
}

static void dirtylimit_cleanup(void)
{
    vcpu_dirty_rate_stat_stop();
    vcpu_dirty_rate_stat_finalize();
    dirtylimit_change(false);
    dirtylimit_state_finalize();
}

/*
 * The dirty page rate limit may not be set or cancelled while a
 * migration with the dirty-limit capability enabled is running.
 */
static bool dirtylimit_is_allowed(void)
{
    MigrationState *ms = migrate_get_current();

    if (migration_is_running() &&
        (!qemu_thread_is_self(&ms->thread)) &&
        migrate_dirty_limit() &&
        dirtylimit_in_service()) {
        return false;
    }
    return true;
}

void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
                                 int64_t cpu_index,
                                 Error **errp)
{
    if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
        return;
    }

    if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
        error_setg(errp, "incorrect cpu index specified");
        return;
    }

    if (!dirtylimit_is_allowed()) {
        error_setg(errp, "can't cancel dirty page rate limit while"
                   " migration is running");
        return;
    }

    if (!dirtylimit_in_service()) {
        return;
    }

    dirtylimit_state_lock();

    if (has_cpu_index) {
        dirtylimit_set_vcpu(cpu_index, 0, false);
    } else {
        dirtylimit_set_all(0, false);
    }

    if (!dirtylimit_state->limited_nvcpu) {
        dirtylimit_cleanup();
    }

    dirtylimit_state_unlock();
}

void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
    Error *err = NULL;

    qmp_cancel_vcpu_dirty_limit(cpu_index != -1, cpu_index, &err);
    if (err) {
        hmp_handle_error(mon, err);
        return;
    }

    monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
                   "dirty limit for virtual CPU]\n");
}

void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
                              int64_t cpu_index,
                              uint64_t dirty_rate,
                              Error **errp)
{
    if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
        error_setg(errp, "dirty page limit feature requires KVM with"
                   " accelerator property 'dirty-ring-size' set");
        return;
    }

    if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
        error_setg(errp, "incorrect cpu index specified");
        return;
    }

    if (!dirtylimit_is_allowed()) {
        error_setg(errp, "can't set dirty page rate limit while"
                   " migration is running");
        return;
    }

    if (!dirty_rate) {
        qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp);
        return;
    }

    dirtylimit_state_lock();

    if (!dirtylimit_in_service()) {
        dirtylimit_init();
    }

    if (has_cpu_index) {
        dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
    } else {
        dirtylimit_set_all(dirty_rate, true);
    }

    dirtylimit_state_unlock();
}
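/*
 * Example QMP usage (a sketch; assumes the instance was started with
 * something like "-accel kvm,dirty-ring-size=4096"):
 *
 *   Limit every vCPU to 200 MB/s:
 *     { "execute": "set-vcpu-dirty-limit",
 *       "arguments": { "dirty-rate": 200 } }
 *
 *   Limit only vCPU 0:
 *     { "execute": "set-vcpu-dirty-limit",
 *       "arguments": { "cpu-index": 0, "dirty-rate": 200 } }
 *
 *   Cancel all limits:
 *     { "execute": "cancel-vcpu-dirty-limit" }
 */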
void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate");
    int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
    Error *err = NULL;

    if (dirty_rate < 0) {
        error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate);
        goto out;
    }

    qmp_set_vcpu_dirty_limit(cpu_index != -1, cpu_index, dirty_rate, &err);

out:
    hmp_handle_error(mon, err);
}

/* Return the max throttle time of each virtual CPU */
uint64_t dirtylimit_throttle_time_per_round(void)
{
    CPUState *cpu;
    int64_t max = 0;

    CPU_FOREACH(cpu) {
        if (cpu->throttle_us_per_full > max) {
            max = cpu->throttle_us_per_full;
        }
    }

    return max;
}

/*
 * Estimate the average dirty ring full time of each virtual CPU.
 * Return 0 if the guest doesn't dirty memory.
 */
uint64_t dirtylimit_ring_full_time(void)
{
    CPUState *cpu;
    uint64_t curr_rate = 0;
    int nvcpus = 0;

    CPU_FOREACH(cpu) {
        if (cpu->running) {
            nvcpus++;
            curr_rate += vcpu_dirty_rate_get(cpu->cpu_index);
        }
    }

    if (!curr_rate || !nvcpus) {
        return 0;
    }

    return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus);
}

static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
{
    DirtyLimitInfo *info = NULL;

    info = g_malloc0(sizeof(*info));
    info->cpu_index = cpu_index;
    info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota;
    info->current_rate = vcpu_dirty_rate_get(cpu_index);

    return info;
}

static struct DirtyLimitInfoList *dirtylimit_query_all(void)
{
    int i, index;
    DirtyLimitInfo *info = NULL;
    DirtyLimitInfoList *head = NULL, **tail = &head;

    dirtylimit_state_lock();

    if (!dirtylimit_in_service()) {
        dirtylimit_state_unlock();
        return NULL;
    }

    for (i = 0; i < dirtylimit_state->max_cpus; i++) {
        index = dirtylimit_state->states[i].cpu_index;
        if (dirtylimit_vcpu_get_state(index)->enabled) {
            info = dirtylimit_query_vcpu(index);
            QAPI_LIST_APPEND(tail, info);
        }
    }

    dirtylimit_state_unlock();

    return head;
}

struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp)
{
    return dirtylimit_query_all();
}

void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    DirtyLimitInfoList *info;
    g_autoptr(DirtyLimitInfoList) head = NULL;
    Error *err = NULL;

    if (!dirtylimit_in_service()) {
        monitor_printf(mon, "Dirty page limit not enabled!\n");
        return;
    }

    head = qmp_query_vcpu_dirty_limit(&err);
    if (err) {
        hmp_handle_error(mon, err);
        return;
    }

    for (info = head; info != NULL; info = info->next) {
        monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s),"
                       " current rate %"PRIi64 " (MB/s)\n",
                       info->value->cpu_index,
                       info->value->limit_rate,
                       info->value->current_rate);
    }
}
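/*
 * Example HMP usage (a sketch; argument order assumed to match the
 * "dirty_rate" and optional "cpu_index" qdict keys read above):
 *   (qemu) set_vcpu_dirty_limit 200      <- limit all vCPUs to 200 MB/s
 *   (qemu) info vcpu_dirty_limit         <- show per-vCPU limit/current rate
 *   (qemu) cancel_vcpu_dirty_limit       <- drop all limits
 */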