1 /* 2 * Dirty page rate limit implementation code 3 * 4 * Copyright (c) 2022 CHINA TELECOM CO.,LTD. 5 * 6 * Authors: 7 * Hyman Huang(黄勇) <huangy81@chinatelecom.cn> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or later. 10 * See the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include "qemu/main-loop.h" 15 #include "qapi/qapi-commands-migration.h" 16 #include "qapi/qmp/qdict.h" 17 #include "qapi/error.h" 18 #include "sysemu/dirtyrate.h" 19 #include "sysemu/dirtylimit.h" 20 #include "monitor/hmp.h" 21 #include "monitor/monitor.h" 22 #include "exec/memory.h" 23 #include "exec/target_page.h" 24 #include "hw/boards.h" 25 #include "sysemu/kvm.h" 26 #include "trace.h" 27 #include "migration/misc.h" 28 29 /* 30 * Dirtylimit stop working if dirty page rate error 31 * value less than DIRTYLIMIT_TOLERANCE_RANGE 32 */ 33 #define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */ 34 /* 35 * Plus or minus vcpu sleep time linearly if dirty 36 * page rate error value percentage over 37 * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT. 38 * Otherwise, plus or minus a fixed vcpu sleep time. 39 */ 40 #define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50 41 /* 42 * Max vcpu sleep time percentage during a cycle 43 * composed of dirty ring full and sleep time. 44 */ 45 #define DIRTYLIMIT_THROTTLE_PCT_MAX 99 46 47 struct { 48 VcpuStat stat; 49 bool running; 50 QemuThread thread; 51 } *vcpu_dirty_rate_stat; 52 53 typedef struct VcpuDirtyLimitState { 54 int cpu_index; 55 bool enabled; 56 /* 57 * Quota dirty page rate, unit is MB/s 58 * zero if not enabled. 59 */ 60 uint64_t quota; 61 } VcpuDirtyLimitState; 62 63 struct { 64 VcpuDirtyLimitState *states; 65 /* Max cpus number configured by user */ 66 int max_cpus; 67 /* Number of vcpu under dirtylimit */ 68 int limited_nvcpu; 69 } *dirtylimit_state; 70 71 /* protect dirtylimit_state */ 72 static QemuMutex dirtylimit_mutex; 73 74 /* dirtylimit thread quit if dirtylimit_quit is true */ 75 static bool dirtylimit_quit; 76 77 static void vcpu_dirty_rate_stat_collect(void) 78 { 79 VcpuStat stat; 80 int i = 0; 81 int64_t period = DIRTYLIMIT_CALC_TIME_MS; 82 83 if (migrate_dirty_limit() && 84 migration_is_active()) { 85 period = migrate_vcpu_dirty_limit_period(); 86 } 87 88 /* calculate vcpu dirtyrate */ 89 vcpu_calculate_dirtyrate(period, 90 &stat, 91 GLOBAL_DIRTY_LIMIT, 92 false); 93 94 for (i = 0; i < stat.nvcpu; i++) { 95 vcpu_dirty_rate_stat->stat.rates[i].id = i; 96 vcpu_dirty_rate_stat->stat.rates[i].dirty_rate = 97 stat.rates[i].dirty_rate; 98 } 99 100 g_free(stat.rates); 101 } 102 103 static void *vcpu_dirty_rate_stat_thread(void *opaque) 104 { 105 rcu_register_thread(); 106 107 /* start log sync */ 108 global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true); 109 110 while (qatomic_read(&vcpu_dirty_rate_stat->running)) { 111 vcpu_dirty_rate_stat_collect(); 112 if (dirtylimit_in_service()) { 113 dirtylimit_process(); 114 } 115 } 116 117 /* stop log sync */ 118 global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false); 119 120 rcu_unregister_thread(); 121 return NULL; 122 } 123 124 int64_t vcpu_dirty_rate_get(int cpu_index) 125 { 126 DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates; 127 return qatomic_read_i64(&rates[cpu_index].dirty_rate); 128 } 129 130 void vcpu_dirty_rate_stat_start(void) 131 { 132 if (qatomic_read(&vcpu_dirty_rate_stat->running)) { 133 return; 134 } 135 136 qatomic_set(&vcpu_dirty_rate_stat->running, 1); 137 qemu_thread_create(&vcpu_dirty_rate_stat->thread, 138 "dirtyrate-stat", 139 vcpu_dirty_rate_stat_thread, 140 NULL, 141 QEMU_THREAD_JOINABLE); 142 } 143 144 void vcpu_dirty_rate_stat_stop(void) 145 { 146 qatomic_set(&vcpu_dirty_rate_stat->running, 0); 147 dirtylimit_state_unlock(); 148 bql_unlock(); 149 qemu_thread_join(&vcpu_dirty_rate_stat->thread); 150 bql_lock(); 151 dirtylimit_state_lock(); 152 } 153 154 void vcpu_dirty_rate_stat_initialize(void) 155 { 156 MachineState *ms = MACHINE(qdev_get_machine()); 157 int max_cpus = ms->smp.max_cpus; 158 159 vcpu_dirty_rate_stat = 160 g_malloc0(sizeof(*vcpu_dirty_rate_stat)); 161 162 vcpu_dirty_rate_stat->stat.nvcpu = max_cpus; 163 vcpu_dirty_rate_stat->stat.rates = 164 g_new0(DirtyRateVcpu, max_cpus); 165 166 vcpu_dirty_rate_stat->running = false; 167 } 168 169 void vcpu_dirty_rate_stat_finalize(void) 170 { 171 g_free(vcpu_dirty_rate_stat->stat.rates); 172 vcpu_dirty_rate_stat->stat.rates = NULL; 173 174 g_free(vcpu_dirty_rate_stat); 175 vcpu_dirty_rate_stat = NULL; 176 } 177 178 void dirtylimit_state_lock(void) 179 { 180 qemu_mutex_lock(&dirtylimit_mutex); 181 } 182 183 void dirtylimit_state_unlock(void) 184 { 185 qemu_mutex_unlock(&dirtylimit_mutex); 186 } 187 188 static void 189 __attribute__((__constructor__)) dirtylimit_mutex_init(void) 190 { 191 qemu_mutex_init(&dirtylimit_mutex); 192 } 193 194 static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index) 195 { 196 return &dirtylimit_state->states[cpu_index]; 197 } 198 199 void dirtylimit_state_initialize(void) 200 { 201 MachineState *ms = MACHINE(qdev_get_machine()); 202 int max_cpus = ms->smp.max_cpus; 203 int i; 204 205 dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state)); 206 207 dirtylimit_state->states = 208 g_new0(VcpuDirtyLimitState, max_cpus); 209 210 for (i = 0; i < max_cpus; i++) { 211 dirtylimit_state->states[i].cpu_index = i; 212 } 213 214 dirtylimit_state->max_cpus = max_cpus; 215 trace_dirtylimit_state_initialize(max_cpus); 216 } 217 218 void dirtylimit_state_finalize(void) 219 { 220 g_free(dirtylimit_state->states); 221 dirtylimit_state->states = NULL; 222 223 g_free(dirtylimit_state); 224 dirtylimit_state = NULL; 225 226 trace_dirtylimit_state_finalize(); 227 } 228 229 bool dirtylimit_in_service(void) 230 { 231 return !!dirtylimit_state; 232 } 233 234 bool dirtylimit_vcpu_index_valid(int cpu_index) 235 { 236 MachineState *ms = MACHINE(qdev_get_machine()); 237 238 return !(cpu_index < 0 || 239 cpu_index >= ms->smp.max_cpus); 240 } 241 242 static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate) 243 { 244 static uint64_t max_dirtyrate; 245 uint64_t dirty_ring_size_MiB; 246 247 dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size()); 248 249 if (max_dirtyrate < dirtyrate) { 250 max_dirtyrate = dirtyrate; 251 } 252 253 return dirty_ring_size_MiB * 1000000 / max_dirtyrate; 254 } 255 256 static inline bool dirtylimit_done(uint64_t quota, 257 uint64_t current) 258 { 259 uint64_t min, max; 260 261 min = MIN(quota, current); 262 max = MAX(quota, current); 263 264 return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false; 265 } 266 267 static inline bool 268 dirtylimit_need_linear_adjustment(uint64_t quota, 269 uint64_t current) 270 { 271 uint64_t min, max; 272 273 min = MIN(quota, current); 274 max = MAX(quota, current); 275 276 return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT; 277 } 278 279 static void dirtylimit_set_throttle(CPUState *cpu, 280 uint64_t quota, 281 uint64_t current) 282 { 283 int64_t ring_full_time_us = 0; 284 uint64_t sleep_pct = 0; 285 uint64_t throttle_us = 0; 286 287 if (current == 0) { 288 cpu->throttle_us_per_full = 0; 289 return; 290 } 291 292 ring_full_time_us = dirtylimit_dirty_ring_full_time(current); 293 294 if (dirtylimit_need_linear_adjustment(quota, current)) { 295 if (quota < current) { 296 sleep_pct = (current - quota) * 100 / current; 297 throttle_us = 298 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); 299 cpu->throttle_us_per_full += throttle_us; 300 } else { 301 sleep_pct = (quota - current) * 100 / quota; 302 throttle_us = 303 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); 304 cpu->throttle_us_per_full -= throttle_us; 305 } 306 307 trace_dirtylimit_throttle_pct(cpu->cpu_index, 308 sleep_pct, 309 throttle_us); 310 } else { 311 if (quota < current) { 312 cpu->throttle_us_per_full += ring_full_time_us / 10; 313 } else { 314 cpu->throttle_us_per_full -= ring_full_time_us / 10; 315 } 316 } 317 318 /* 319 * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario), 320 * current dirty page rate may never reach the quota, we should stop 321 * increasing sleep time? 322 */ 323 cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full, 324 ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX); 325 326 cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0); 327 } 328 329 static void dirtylimit_adjust_throttle(CPUState *cpu) 330 { 331 uint64_t quota = 0; 332 uint64_t current = 0; 333 int cpu_index = cpu->cpu_index; 334 335 quota = dirtylimit_vcpu_get_state(cpu_index)->quota; 336 current = vcpu_dirty_rate_get(cpu_index); 337 338 if (!dirtylimit_done(quota, current)) { 339 dirtylimit_set_throttle(cpu, quota, current); 340 } 341 342 return; 343 } 344 345 void dirtylimit_process(void) 346 { 347 CPUState *cpu; 348 349 if (!qatomic_read(&dirtylimit_quit)) { 350 dirtylimit_state_lock(); 351 352 if (!dirtylimit_in_service()) { 353 dirtylimit_state_unlock(); 354 return; 355 } 356 357 CPU_FOREACH(cpu) { 358 if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) { 359 continue; 360 } 361 dirtylimit_adjust_throttle(cpu); 362 } 363 dirtylimit_state_unlock(); 364 } 365 } 366 367 void dirtylimit_change(bool start) 368 { 369 if (start) { 370 qatomic_set(&dirtylimit_quit, 0); 371 } else { 372 qatomic_set(&dirtylimit_quit, 1); 373 } 374 } 375 376 void dirtylimit_set_vcpu(int cpu_index, 377 uint64_t quota, 378 bool enable) 379 { 380 trace_dirtylimit_set_vcpu(cpu_index, quota); 381 382 if (enable) { 383 dirtylimit_state->states[cpu_index].quota = quota; 384 if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) { 385 dirtylimit_state->limited_nvcpu++; 386 } 387 } else { 388 dirtylimit_state->states[cpu_index].quota = 0; 389 if (dirtylimit_state->states[cpu_index].enabled) { 390 dirtylimit_state->limited_nvcpu--; 391 } 392 } 393 394 dirtylimit_state->states[cpu_index].enabled = enable; 395 } 396 397 void dirtylimit_set_all(uint64_t quota, 398 bool enable) 399 { 400 MachineState *ms = MACHINE(qdev_get_machine()); 401 int max_cpus = ms->smp.max_cpus; 402 int i; 403 404 for (i = 0; i < max_cpus; i++) { 405 dirtylimit_set_vcpu(i, quota, enable); 406 } 407 } 408 409 void dirtylimit_vcpu_execute(CPUState *cpu) 410 { 411 if (cpu->throttle_us_per_full) { 412 dirtylimit_state_lock(); 413 414 if (dirtylimit_in_service() && 415 dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) { 416 dirtylimit_state_unlock(); 417 trace_dirtylimit_vcpu_execute(cpu->cpu_index, 418 cpu->throttle_us_per_full); 419 420 g_usleep(cpu->throttle_us_per_full); 421 return; 422 } 423 424 dirtylimit_state_unlock(); 425 } 426 } 427 428 static void dirtylimit_init(void) 429 { 430 dirtylimit_state_initialize(); 431 dirtylimit_change(true); 432 vcpu_dirty_rate_stat_initialize(); 433 vcpu_dirty_rate_stat_start(); 434 } 435 436 static void dirtylimit_cleanup(void) 437 { 438 vcpu_dirty_rate_stat_stop(); 439 vcpu_dirty_rate_stat_finalize(); 440 dirtylimit_change(false); 441 dirtylimit_state_finalize(); 442 } 443 444 /* 445 * dirty page rate limit is not allowed to set if migration 446 * is running with dirty-limit capability enabled. 447 */ 448 static bool dirtylimit_is_allowed(void) 449 { 450 if (migration_is_running() && 451 !migration_thread_is_self() && 452 migrate_dirty_limit() && 453 dirtylimit_in_service()) { 454 return false; 455 } 456 return true; 457 } 458 459 void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index, 460 int64_t cpu_index, 461 Error **errp) 462 { 463 if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { 464 return; 465 } 466 467 if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { 468 error_setg(errp, "incorrect cpu index specified"); 469 return; 470 } 471 472 if (!dirtylimit_is_allowed()) { 473 error_setg(errp, "can't cancel dirty page rate limit while" 474 " migration is running"); 475 return; 476 } 477 478 if (!dirtylimit_in_service()) { 479 return; 480 } 481 482 dirtylimit_state_lock(); 483 484 if (has_cpu_index) { 485 dirtylimit_set_vcpu(cpu_index, 0, false); 486 } else { 487 dirtylimit_set_all(0, false); 488 } 489 490 if (!dirtylimit_state->limited_nvcpu) { 491 dirtylimit_cleanup(); 492 } 493 494 dirtylimit_state_unlock(); 495 } 496 497 void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 498 { 499 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); 500 Error *err = NULL; 501 502 qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err); 503 if (err) { 504 hmp_handle_error(mon, err); 505 return; 506 } 507 508 monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query " 509 "dirty limit for virtual CPU]\n"); 510 } 511 512 void qmp_set_vcpu_dirty_limit(bool has_cpu_index, 513 int64_t cpu_index, 514 uint64_t dirty_rate, 515 Error **errp) 516 { 517 if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { 518 error_setg(errp, "dirty page limit feature requires KVM with" 519 " accelerator property 'dirty-ring-size' set'"); 520 return; 521 } 522 523 if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { 524 error_setg(errp, "incorrect cpu index specified"); 525 return; 526 } 527 528 if (!dirtylimit_is_allowed()) { 529 error_setg(errp, "can't set dirty page rate limit while" 530 " migration is running"); 531 return; 532 } 533 534 if (!dirty_rate) { 535 qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp); 536 return; 537 } 538 539 dirtylimit_state_lock(); 540 541 if (!dirtylimit_in_service()) { 542 dirtylimit_init(); 543 } 544 545 if (has_cpu_index) { 546 dirtylimit_set_vcpu(cpu_index, dirty_rate, true); 547 } else { 548 dirtylimit_set_all(dirty_rate, true); 549 } 550 551 dirtylimit_state_unlock(); 552 } 553 554 void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 555 { 556 int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate"); 557 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); 558 Error *err = NULL; 559 560 if (dirty_rate < 0) { 561 error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate); 562 goto out; 563 } 564 565 qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err); 566 567 out: 568 hmp_handle_error(mon, err); 569 } 570 571 /* Return the max throttle time of each virtual CPU */ 572 uint64_t dirtylimit_throttle_time_per_round(void) 573 { 574 CPUState *cpu; 575 int64_t max = 0; 576 577 CPU_FOREACH(cpu) { 578 if (cpu->throttle_us_per_full > max) { 579 max = cpu->throttle_us_per_full; 580 } 581 } 582 583 return max; 584 } 585 586 /* 587 * Estimate average dirty ring full time of each virtaul CPU. 588 * Return 0 if guest doesn't dirty memory. 589 */ 590 uint64_t dirtylimit_ring_full_time(void) 591 { 592 CPUState *cpu; 593 uint64_t curr_rate = 0; 594 int nvcpus = 0; 595 596 CPU_FOREACH(cpu) { 597 if (cpu->running) { 598 nvcpus++; 599 curr_rate += vcpu_dirty_rate_get(cpu->cpu_index); 600 } 601 } 602 603 if (!curr_rate || !nvcpus) { 604 return 0; 605 } 606 607 return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus); 608 } 609 610 static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index) 611 { 612 DirtyLimitInfo *info = NULL; 613 614 info = g_malloc0(sizeof(*info)); 615 info->cpu_index = cpu_index; 616 info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota; 617 info->current_rate = vcpu_dirty_rate_get(cpu_index); 618 619 return info; 620 } 621 622 static struct DirtyLimitInfoList *dirtylimit_query_all(void) 623 { 624 int i, index; 625 DirtyLimitInfo *info = NULL; 626 DirtyLimitInfoList *head = NULL, **tail = &head; 627 628 dirtylimit_state_lock(); 629 630 if (!dirtylimit_in_service()) { 631 dirtylimit_state_unlock(); 632 return NULL; 633 } 634 635 for (i = 0; i < dirtylimit_state->max_cpus; i++) { 636 index = dirtylimit_state->states[i].cpu_index; 637 if (dirtylimit_vcpu_get_state(index)->enabled) { 638 info = dirtylimit_query_vcpu(index); 639 QAPI_LIST_APPEND(tail, info); 640 } 641 } 642 643 dirtylimit_state_unlock(); 644 645 return head; 646 } 647 648 struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp) 649 { 650 return dirtylimit_query_all(); 651 } 652 653 void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 654 { 655 DirtyLimitInfoList *info; 656 g_autoptr(DirtyLimitInfoList) head = NULL; 657 Error *err = NULL; 658 659 if (!dirtylimit_in_service()) { 660 monitor_printf(mon, "Dirty page limit not enabled!\n"); 661 return; 662 } 663 664 head = qmp_query_vcpu_dirty_limit(&err); 665 if (err) { 666 hmp_handle_error(mon, err); 667 return; 668 } 669 670 for (info = head; info != NULL; info = info->next) { 671 monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s)," 672 " current rate %"PRIi64 " (MB/s)\n", 673 info->value->cpu_index, 674 info->value->limit_rate, 675 info->value->current_rate); 676 } 677 } 678