xref: /openbmc/qemu/system/dirtylimit.c (revision bb6cf6f0168efadb95cf3e41963ec295ad28a941)
1 /*
2  * Dirty page rate limit implementation code
3  *
4  * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
5  *
6  * Authors:
7  *  Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/main-loop.h"
15 #include "qapi/qapi-commands-migration.h"
16 #include "qapi/qmp/qdict.h"
17 #include "qapi/error.h"
18 #include "sysemu/dirtyrate.h"
19 #include "sysemu/dirtylimit.h"
20 #include "monitor/hmp.h"
21 #include "monitor/monitor.h"
22 #include "exec/memory.h"
23 #include "exec/target_page.h"
24 #include "hw/boards.h"
25 #include "sysemu/kvm.h"
26 #include "trace.h"
27 #include "migration/misc.h"
28 #include "migration/migration.h"
29 #include "migration/options.h"
30 
/*
 * The throttle of a vcpu is left untouched while the absolute
 * difference between its quota and its current dirty page rate
 * is no more than DIRTYLIMIT_TOLERANCE_RANGE.
 */
#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
/*
 * When the dirty page rate error, as a percentage of the larger of
 * quota and current rate, exceeds DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT,
 * the vcpu sleep time is increased or decreased linearly.
 * Otherwise it is increased or decreased by a fixed step.
 */
#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
/*
 * Max vcpu sleep time percentage during a cycle
 * composed of dirty ring full and sleep time.
 * dirtylimit_set_throttle() caps the sleep time at the dirty ring
 * full time multiplied by this value.
 */
#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
48 
/*
 * Per-vcpu dirty page rate statistics, refreshed by the
 * "dirtyrate-stat" thread for as long as 'running' is set.
 */
struct {
    /* most recent dirty page rate sample of each vcpu */
    VcpuStat stat;
    /* cleared to make the stat thread leave its loop */
    bool running;
    /* the stat thread itself; joined in vcpu_dirty_rate_stat_stop() */
    QemuThread thread;
} *vcpu_dirty_rate_stat;
54 
/* Dirty page rate limit configuration of a single vcpu */
typedef struct VcpuDirtyLimitState {
    /* index of the vcpu this entry describes */
    int cpu_index;
    /* true while a dirty page rate limit is set on this vcpu */
    bool enabled;
    /*
     * Quota dirty page rate, unit is MB/s
     * zero if not enabled.
     */
    uint64_t quota;
} VcpuDirtyLimitState;
64 
/*
 * Global dirty limit state; NULL while the dirty limit is not in
 * service (see dirtylimit_in_service()).
 */
struct {
    /* one entry per possible vcpu, indexed by cpu_index */
    VcpuDirtyLimitState *states;
    /* Max cpus number configured by user */
    int max_cpus;
    /* Number of vcpu under dirtylimit */
    int limited_nvcpu;
} *dirtylimit_state;
72 
/* protect dirtylimit_state */
static QemuMutex dirtylimit_mutex;

/*
 * dirtylimit_process() skips the throttle adjustment while
 * dirtylimit_quit is true; toggled through dirtylimit_change().
 */
static bool dirtylimit_quit;
78 
79 static void vcpu_dirty_rate_stat_collect(void)
80 {
81     MigrationState *s = migrate_get_current();
82     VcpuStat stat;
83     int i = 0;
84     int64_t period = DIRTYLIMIT_CALC_TIME_MS;
85 
86     if (migrate_dirty_limit() &&
87         migration_is_active(s)) {
88         period = s->parameters.x_vcpu_dirty_limit_period;
89     }
90 
91     /* calculate vcpu dirtyrate */
92     vcpu_calculate_dirtyrate(period,
93                               &stat,
94                               GLOBAL_DIRTY_LIMIT,
95                               false);
96 
97     for (i = 0; i < stat.nvcpu; i++) {
98         vcpu_dirty_rate_stat->stat.rates[i].id = i;
99         vcpu_dirty_rate_stat->stat.rates[i].dirty_rate =
100             stat.rates[i].dirty_rate;
101     }
102 
103     g_free(stat.rates);
104 }
105 
106 static void *vcpu_dirty_rate_stat_thread(void *opaque)
107 {
108     rcu_register_thread();
109 
110     /* start log sync */
111     global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);
112 
113     while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
114         vcpu_dirty_rate_stat_collect();
115         if (dirtylimit_in_service()) {
116             dirtylimit_process();
117         }
118     }
119 
120     /* stop log sync */
121     global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);
122 
123     rcu_unregister_thread();
124     return NULL;
125 }
126 
127 int64_t vcpu_dirty_rate_get(int cpu_index)
128 {
129     DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates;
130     return qatomic_read_i64(&rates[cpu_index].dirty_rate);
131 }
132 
133 void vcpu_dirty_rate_stat_start(void)
134 {
135     if (qatomic_read(&vcpu_dirty_rate_stat->running)) {
136         return;
137     }
138 
139     qatomic_set(&vcpu_dirty_rate_stat->running, 1);
140     qemu_thread_create(&vcpu_dirty_rate_stat->thread,
141                        "dirtyrate-stat",
142                        vcpu_dirty_rate_stat_thread,
143                        NULL,
144                        QEMU_THREAD_JOINABLE);
145 }
146 
/*
 * Ask the dirty rate stat thread to exit and wait until it has.
 *
 * The dirtylimit mutex and the iothread lock are dropped around the
 * join and taken again afterwards, so the caller is expected to hold
 * both on entry.
 */
void vcpu_dirty_rate_stat_stop(void)
{
    qatomic_set(&vcpu_dirty_rate_stat->running, 0);
    /*
     * The stat thread takes the dirtylimit mutex in
     * dirtylimit_process(); joining it with the mutex held could
     * deadlock.
     */
    dirtylimit_state_unlock();
    /*
     * NOTE(review): presumably the stat thread may also need the
     * iothread lock before it can exit - confirm.
     */
    qemu_mutex_unlock_iothread();
    qemu_thread_join(&vcpu_dirty_rate_stat->thread);
    qemu_mutex_lock_iothread();
    dirtylimit_state_lock();
}
156 
157 void vcpu_dirty_rate_stat_initialize(void)
158 {
159     MachineState *ms = MACHINE(qdev_get_machine());
160     int max_cpus = ms->smp.max_cpus;
161 
162     vcpu_dirty_rate_stat =
163         g_malloc0(sizeof(*vcpu_dirty_rate_stat));
164 
165     vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
166     vcpu_dirty_rate_stat->stat.rates =
167         g_new0(DirtyRateVcpu, max_cpus);
168 
169     vcpu_dirty_rate_stat->running = false;
170 }
171 
172 void vcpu_dirty_rate_stat_finalize(void)
173 {
174     g_free(vcpu_dirty_rate_stat->stat.rates);
175     vcpu_dirty_rate_stat->stat.rates = NULL;
176 
177     g_free(vcpu_dirty_rate_stat);
178     vcpu_dirty_rate_stat = NULL;
179 }
180 
/* Acquire the mutex that protects dirtylimit_state. */
void dirtylimit_state_lock(void)
{
    qemu_mutex_lock(&dirtylimit_mutex);
}
185 
/* Release the mutex that protects dirtylimit_state. */
void dirtylimit_state_unlock(void)
{
    qemu_mutex_unlock(&dirtylimit_mutex);
}
190 
191 static void
192 __attribute__((__constructor__)) dirtylimit_mutex_init(void)
193 {
194     qemu_mutex_init(&dirtylimit_mutex);
195 }
196 
197 static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
198 {
199     return &dirtylimit_state->states[cpu_index];
200 }
201 
202 void dirtylimit_state_initialize(void)
203 {
204     MachineState *ms = MACHINE(qdev_get_machine());
205     int max_cpus = ms->smp.max_cpus;
206     int i;
207 
208     dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
209 
210     dirtylimit_state->states =
211             g_new0(VcpuDirtyLimitState, max_cpus);
212 
213     for (i = 0; i < max_cpus; i++) {
214         dirtylimit_state->states[i].cpu_index = i;
215     }
216 
217     dirtylimit_state->max_cpus = max_cpus;
218     trace_dirtylimit_state_initialize(max_cpus);
219 }
220 
221 void dirtylimit_state_finalize(void)
222 {
223     g_free(dirtylimit_state->states);
224     dirtylimit_state->states = NULL;
225 
226     g_free(dirtylimit_state);
227     dirtylimit_state = NULL;
228 
229     trace_dirtylimit_state_finalize();
230 }
231 
232 bool dirtylimit_in_service(void)
233 {
234     return !!dirtylimit_state;
235 }
236 
237 bool dirtylimit_vcpu_index_valid(int cpu_index)
238 {
239     MachineState *ms = MACHINE(qdev_get_machine());
240 
241     return !(cpu_index < 0 ||
242              cpu_index >= ms->smp.max_cpus);
243 }
244 
/*
 * Estimate how long it takes to fill the KVM dirty ring.
 *
 * @dirtyrate: dirty page rate in MB/s
 *
 * The estimate uses the highest rate passed to this function so far
 * (kept in a function-local static), not necessarily @dirtyrate.
 * The result is in microseconds: ring size in MiB * 1000000 / rate.
 *
 * Returns 0 if no non-zero rate has been seen yet, instead of
 * dividing by zero.
 */
static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
{
    static uint64_t max_dirtyrate;
    uint64_t dirty_ring_size_MiB;

    dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size());

    if (max_dirtyrate < dirtyrate) {
        max_dirtyrate = dirtyrate;
    }

    /*
     * Reachable e.g. from dirtylimit_ring_full_time() when the
     * average rate truncates to 0 before any vcpu dirtied memory
     * at a measurable rate.
     */
    if (max_dirtyrate == 0) {
        return 0;
    }

    return dirty_ring_size_MiB * 1000000 / max_dirtyrate;
}
258 
259 static inline bool dirtylimit_done(uint64_t quota,
260                                    uint64_t current)
261 {
262     uint64_t min, max;
263 
264     min = MIN(quota, current);
265     max = MAX(quota, current);
266 
267     return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
268 }
269 
270 static inline bool
271 dirtylimit_need_linear_adjustment(uint64_t quota,
272                                   uint64_t current)
273 {
274     uint64_t min, max;
275 
276     min = MIN(quota, current);
277     max = MAX(quota, current);
278 
279     return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
280 }
281 
282 static void dirtylimit_set_throttle(CPUState *cpu,
283                                     uint64_t quota,
284                                     uint64_t current)
285 {
286     int64_t ring_full_time_us = 0;
287     uint64_t sleep_pct = 0;
288     uint64_t throttle_us = 0;
289 
290     if (current == 0) {
291         cpu->throttle_us_per_full = 0;
292         return;
293     }
294 
295     ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
296 
297     if (dirtylimit_need_linear_adjustment(quota, current)) {
298         if (quota < current) {
299             sleep_pct = (current - quota) * 100 / current;
300             throttle_us =
301                 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
302             cpu->throttle_us_per_full += throttle_us;
303         } else {
304             sleep_pct = (quota - current) * 100 / quota;
305             throttle_us =
306                 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
307             cpu->throttle_us_per_full -= throttle_us;
308         }
309 
310         trace_dirtylimit_throttle_pct(cpu->cpu_index,
311                                       sleep_pct,
312                                       throttle_us);
313     } else {
314         if (quota < current) {
315             cpu->throttle_us_per_full += ring_full_time_us / 10;
316         } else {
317             cpu->throttle_us_per_full -= ring_full_time_us / 10;
318         }
319     }
320 
321     /*
322      * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
323      *       current dirty page rate may never reach the quota, we should stop
324      *       increasing sleep time?
325      */
326     cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
327         ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
328 
329     cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
330 }
331 
332 static void dirtylimit_adjust_throttle(CPUState *cpu)
333 {
334     uint64_t quota = 0;
335     uint64_t current = 0;
336     int cpu_index = cpu->cpu_index;
337 
338     quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
339     current = vcpu_dirty_rate_get(cpu_index);
340 
341     if (!dirtylimit_done(quota, current)) {
342         dirtylimit_set_throttle(cpu, quota, current);
343     }
344 
345     return;
346 }
347 
348 void dirtylimit_process(void)
349 {
350     CPUState *cpu;
351 
352     if (!qatomic_read(&dirtylimit_quit)) {
353         dirtylimit_state_lock();
354 
355         if (!dirtylimit_in_service()) {
356             dirtylimit_state_unlock();
357             return;
358         }
359 
360         CPU_FOREACH(cpu) {
361             if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
362                 continue;
363             }
364             dirtylimit_adjust_throttle(cpu);
365         }
366         dirtylimit_state_unlock();
367     }
368 }
369 
370 void dirtylimit_change(bool start)
371 {
372     if (start) {
373         qatomic_set(&dirtylimit_quit, 0);
374     } else {
375         qatomic_set(&dirtylimit_quit, 1);
376     }
377 }
378 
379 void dirtylimit_set_vcpu(int cpu_index,
380                          uint64_t quota,
381                          bool enable)
382 {
383     trace_dirtylimit_set_vcpu(cpu_index, quota);
384 
385     if (enable) {
386         dirtylimit_state->states[cpu_index].quota = quota;
387         if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
388             dirtylimit_state->limited_nvcpu++;
389         }
390     } else {
391         dirtylimit_state->states[cpu_index].quota = 0;
392         if (dirtylimit_state->states[cpu_index].enabled) {
393             dirtylimit_state->limited_nvcpu--;
394         }
395     }
396 
397     dirtylimit_state->states[cpu_index].enabled = enable;
398 }
399 
400 void dirtylimit_set_all(uint64_t quota,
401                         bool enable)
402 {
403     MachineState *ms = MACHINE(qdev_get_machine());
404     int max_cpus = ms->smp.max_cpus;
405     int i;
406 
407     for (i = 0; i < max_cpus; i++) {
408         dirtylimit_set_vcpu(i, quota, enable);
409     }
410 }
411 
412 void dirtylimit_vcpu_execute(CPUState *cpu)
413 {
414     if (cpu->throttle_us_per_full) {
415         dirtylimit_state_lock();
416 
417         if (dirtylimit_in_service() &&
418             dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
419             dirtylimit_state_unlock();
420             trace_dirtylimit_vcpu_execute(cpu->cpu_index,
421                     cpu->throttle_us_per_full);
422 
423             g_usleep(cpu->throttle_us_per_full);
424             return;
425         }
426 
427         dirtylimit_state_unlock();
428     }
429 }
430 
/*
 * Bring the dirty limit into service: allocate the limit state,
 * enable throttle adjustment, allocate the rate statistics and
 * start the stat thread.
 *
 * Called from qmp_set_vcpu_dirty_limit() with the dirtylimit mutex
 * held.
 */
static void dirtylimit_init(void)
{
    dirtylimit_state_initialize();
    dirtylimit_change(true);
    /* the statistics must exist before the thread that fills them */
    vcpu_dirty_rate_stat_initialize();
    vcpu_dirty_rate_stat_start();
}
438 
/*
 * Take the dirty limit out of service: stop the stat thread, free
 * the rate statistics, disable throttle adjustment and free the
 * limit state.
 *
 * Called from qmp_cancel_vcpu_dirty_limit() with the dirtylimit
 * mutex held.
 */
static void dirtylimit_cleanup(void)
{
    /* join the stat thread before freeing the data it accesses */
    vcpu_dirty_rate_stat_stop();
    vcpu_dirty_rate_stat_finalize();
    dirtylimit_change(false);
    dirtylimit_state_finalize();
}
446 
447 /*
448  * dirty page rate limit is not allowed to set if migration
449  * is running with dirty-limit capability enabled.
450  */
451 static bool dirtylimit_is_allowed(void)
452 {
453     MigrationState *ms = migrate_get_current();
454 
455     if (migration_is_running(ms->state) &&
456         (!qemu_thread_is_self(&ms->thread)) &&
457         migrate_dirty_limit() &&
458         dirtylimit_in_service()) {
459         return false;
460     }
461     return true;
462 }
463 
464 void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
465                                  int64_t cpu_index,
466                                  Error **errp)
467 {
468     if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
469         return;
470     }
471 
472     if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
473         error_setg(errp, "incorrect cpu index specified");
474         return;
475     }
476 
477     if (!dirtylimit_is_allowed()) {
478         error_setg(errp, "can't cancel dirty page rate limit while"
479                    " migration is running");
480         return;
481     }
482 
483     if (!dirtylimit_in_service()) {
484         return;
485     }
486 
487     dirtylimit_state_lock();
488 
489     if (has_cpu_index) {
490         dirtylimit_set_vcpu(cpu_index, 0, false);
491     } else {
492         dirtylimit_set_all(0, false);
493     }
494 
495     if (!dirtylimit_state->limited_nvcpu) {
496         dirtylimit_cleanup();
497     }
498 
499     dirtylimit_state_unlock();
500 }
501 
502 void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
503 {
504     int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
505     Error *err = NULL;
506 
507     qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err);
508     if (err) {
509         hmp_handle_error(mon, err);
510         return;
511     }
512 
513     monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
514                    "dirty limit for virtual CPU]\n");
515 }
516 
517 void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
518                               int64_t cpu_index,
519                               uint64_t dirty_rate,
520                               Error **errp)
521 {
522     if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
523         error_setg(errp, "dirty page limit feature requires KVM with"
524                    " accelerator property 'dirty-ring-size' set'");
525         return;
526     }
527 
528     if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
529         error_setg(errp, "incorrect cpu index specified");
530         return;
531     }
532 
533     if (!dirtylimit_is_allowed()) {
534         error_setg(errp, "can't set dirty page rate limit while"
535                    " migration is running");
536         return;
537     }
538 
539     if (!dirty_rate) {
540         qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp);
541         return;
542     }
543 
544     dirtylimit_state_lock();
545 
546     if (!dirtylimit_in_service()) {
547         dirtylimit_init();
548     }
549 
550     if (has_cpu_index) {
551         dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
552     } else {
553         dirtylimit_set_all(dirty_rate, true);
554     }
555 
556     dirtylimit_state_unlock();
557 }
558 
559 void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
560 {
561     int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate");
562     int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
563     Error *err = NULL;
564 
565     if (dirty_rate < 0) {
566         error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate);
567         goto out;
568     }
569 
570     qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);
571 
572 out:
573     hmp_handle_error(mon, err);
574 }
575 
576 /* Return the max throttle time of each virtual CPU */
577 uint64_t dirtylimit_throttle_time_per_round(void)
578 {
579     CPUState *cpu;
580     int64_t max = 0;
581 
582     CPU_FOREACH(cpu) {
583         if (cpu->throttle_us_per_full > max) {
584             max = cpu->throttle_us_per_full;
585         }
586     }
587 
588     return max;
589 }
590 
591 /*
592  * Estimate average dirty ring full time of each virtaul CPU.
593  * Return 0 if guest doesn't dirty memory.
594  */
595 uint64_t dirtylimit_ring_full_time(void)
596 {
597     CPUState *cpu;
598     uint64_t curr_rate = 0;
599     int nvcpus = 0;
600 
601     CPU_FOREACH(cpu) {
602         if (cpu->running) {
603             nvcpus++;
604             curr_rate += vcpu_dirty_rate_get(cpu->cpu_index);
605         }
606     }
607 
608     if (!curr_rate || !nvcpus) {
609         return 0;
610     }
611 
612     return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus);
613 }
614 
615 static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
616 {
617     DirtyLimitInfo *info = NULL;
618 
619     info = g_malloc0(sizeof(*info));
620     info->cpu_index = cpu_index;
621     info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota;
622     info->current_rate = vcpu_dirty_rate_get(cpu_index);
623 
624     return info;
625 }
626 
627 static struct DirtyLimitInfoList *dirtylimit_query_all(void)
628 {
629     int i, index;
630     DirtyLimitInfo *info = NULL;
631     DirtyLimitInfoList *head = NULL, **tail = &head;
632 
633     dirtylimit_state_lock();
634 
635     if (!dirtylimit_in_service()) {
636         dirtylimit_state_unlock();
637         return NULL;
638     }
639 
640     for (i = 0; i < dirtylimit_state->max_cpus; i++) {
641         index = dirtylimit_state->states[i].cpu_index;
642         if (dirtylimit_vcpu_get_state(index)->enabled) {
643             info = dirtylimit_query_vcpu(index);
644             QAPI_LIST_APPEND(tail, info);
645         }
646     }
647 
648     dirtylimit_state_unlock();
649 
650     return head;
651 }
652 
/*
 * QMP handler: return the dirty limit info of every limited vcpu,
 * as produced by dirtylimit_query_all(). @errp is never set here.
 */
struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp)
{
    return dirtylimit_query_all();
}
657 
658 void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
659 {
660     DirtyLimitInfoList *info;
661     g_autoptr(DirtyLimitInfoList) head = NULL;
662     Error *err = NULL;
663 
664     if (!dirtylimit_in_service()) {
665         monitor_printf(mon, "Dirty page limit not enabled!\n");
666         return;
667     }
668 
669     head = qmp_query_vcpu_dirty_limit(&err);
670     if (err) {
671         hmp_handle_error(mon, err);
672         return;
673     }
674 
675     for (info = head; info != NULL; info = info->next) {
676         monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s),"
677                             " current rate %"PRIi64 " (MB/s)\n",
678                             info->value->cpu_index,
679                             info->value->limit_rate,
680                             info->value->current_rate);
681     }
682 }
683