xref: /openbmc/qemu/block/throttle-groups.c (revision 80adf54e)
1 /*
2  * QEMU block throttling group infrastructure
3  *
4  * Copyright (C) Nodalink, EURL. 2014
5  * Copyright (C) Igalia, S.L. 2015
6  *
7  * Authors:
8  *   Benoît Canet <benoit.canet@nodalink.com>
9  *   Alberto Garcia <berto@igalia.com>
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public License as
13  * published by the Free Software Foundation; either version 2 or
14  * (at your option) version 3 of the License.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, see <http://www.gnu.org/licenses/>.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "sysemu/block-backend.h"
27 #include "block/throttle-groups.h"
28 #include "qemu/queue.h"
29 #include "qemu/thread.h"
30 #include "sysemu/qtest.h"
31 
32 /* The ThrottleGroup structure (with its ThrottleState) is shared
33  * among different BlockBackends and it's independent from
34  * AioContext, so in order to use it from different threads it needs
35  * its own locking.
36  *
37  * This locking is however handled internally in this file, so it's
38  * transparent to outside users.
39  *
40  * The whole ThrottleGroup structure is private and invisible to
41  * outside users, that only use it through its ThrottleState.
42  *
43  * In addition to the ThrottleGroup structure, BlockBackendPublic has
44  * fields that need to be accessed by other members of the group and
45  * therefore also need to be protected by this lock. Once a
46  * BlockBackend is registered in a group those fields can be accessed
47  * by other threads any time.
48  *
49  * Again, all this is handled internally and is mostly transparent to
50  * the outside. The 'throttle_timers' field however has an additional
51  * constraint because it may be temporarily invalid (see for example
52  * blk_set_aio_context()). Therefore in this file a thread will
53  * access some other BlockBackend's timers only after verifying that
54  * that BlockBackend has throttled requests in the queue.
55  */
/* A named throttling group shared by the BlockBackends registered in it.
 * Code in this file reaches it from a member's ThrottleState pointer with
 * container_of(..., ThrottleGroup, ts).
 */
typedef struct ThrottleGroup {
    char *name; /* This is constant during the lifetime of the group */

    QemuMutex lock; /* This lock protects the following four fields */
    ThrottleState ts;                       /* throttling state shared by all members */
    QLIST_HEAD(, BlockBackendPublic) head;  /* members, linked through 'round_robin' */
    BlockBackend *tokens[2];                /* current round-robin token, indexed by is_write */
    bool any_timer_armed[2];                /* whether a member's timer is armed, indexed by is_write */
    /* Assigned only when the group is created (see throttle_group_incref())
     * and only read afterwards in this file. */
    QEMUClockType clock_type;

    /* These two are protected by the global throttle_groups_lock */
    unsigned refcount;                      /* number of references taken with throttle_group_incref() */
    QTAILQ_ENTRY(ThrottleGroup) list;       /* link in the global throttle_groups list */
} ThrottleGroup;
70 
/* Global lock taken by throttle_group_incref() and throttle_group_unref()
 * around every access to the group list and to the groups' refcounts.
 * It is initialised in throttle_groups_init().
 */
static QemuMutex throttle_groups_lock;
/* List of all existing ThrottleGroups; groups are looked up here by name */
static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
    QTAILQ_HEAD_INITIALIZER(throttle_groups);
74 
75 /* Increments the reference count of a ThrottleGroup given its name.
76  *
77  * If no ThrottleGroup is found with the given name a new one is
78  * created.
79  *
80  * @name: the name of the ThrottleGroup
81  * @ret:  the ThrottleState member of the ThrottleGroup
82  */
83 ThrottleState *throttle_group_incref(const char *name)
84 {
85     ThrottleGroup *tg = NULL;
86     ThrottleGroup *iter;
87 
88     qemu_mutex_lock(&throttle_groups_lock);
89 
90     /* Look for an existing group with that name */
91     QTAILQ_FOREACH(iter, &throttle_groups, list) {
92         if (!strcmp(name, iter->name)) {
93             tg = iter;
94             break;
95         }
96     }
97 
98     /* Create a new one if not found */
99     if (!tg) {
100         tg = g_new0(ThrottleGroup, 1);
101         tg->name = g_strdup(name);
102         tg->clock_type = QEMU_CLOCK_REALTIME;
103 
104         if (qtest_enabled()) {
105             /* For testing block IO throttling only */
106             tg->clock_type = QEMU_CLOCK_VIRTUAL;
107         }
108         qemu_mutex_init(&tg->lock);
109         throttle_init(&tg->ts);
110         QLIST_INIT(&tg->head);
111 
112         QTAILQ_INSERT_TAIL(&throttle_groups, tg, list);
113     }
114 
115     tg->refcount++;
116 
117     qemu_mutex_unlock(&throttle_groups_lock);
118 
119     return &tg->ts;
120 }
121 
122 /* Decrease the reference count of a ThrottleGroup.
123  *
124  * When the reference count reaches zero the ThrottleGroup is
125  * destroyed.
126  *
127  * @ts:  The ThrottleGroup to unref, given by its ThrottleState member
128  */
129 void throttle_group_unref(ThrottleState *ts)
130 {
131     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
132 
133     qemu_mutex_lock(&throttle_groups_lock);
134     if (--tg->refcount == 0) {
135         QTAILQ_REMOVE(&throttle_groups, tg, list);
136         qemu_mutex_destroy(&tg->lock);
137         g_free(tg->name);
138         g_free(tg);
139     }
140     qemu_mutex_unlock(&throttle_groups_lock);
141 }
142 
143 /* Get the name from a BlockBackend's ThrottleGroup. The name (and the pointer)
144  * is guaranteed to remain constant during the lifetime of the group.
145  *
146  * @blk:  a BlockBackend that is member of a throttling group
147  * @ret:  the name of the group.
148  */
149 const char *throttle_group_get_name(BlockBackend *blk)
150 {
151     BlockBackendPublic *blkp = blk_get_public(blk);
152     ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
153     return tg->name;
154 }
155 
156 /* Return the next BlockBackend in the round-robin sequence, simulating a
157  * circular list.
158  *
159  * This assumes that tg->lock is held.
160  *
161  * @blk: the current BlockBackend
162  * @ret: the next BlockBackend in the sequence
163  */
164 static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
165 {
166     BlockBackendPublic *blkp = blk_get_public(blk);
167     ThrottleState *ts = blkp->throttle_state;
168     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
169     BlockBackendPublic *next = QLIST_NEXT(blkp, round_robin);
170 
171     if (!next) {
172         next = QLIST_FIRST(&tg->head);
173     }
174 
175     return blk_by_public(next);
176 }
177 
178 /*
179  * Return whether a BlockBackend has pending requests.
180  *
181  * This assumes that tg->lock is held.
182  *
183  * @blk: the BlockBackend
184  * @is_write:  the type of operation (read/write)
185  * @ret:       whether the BlockBackend has pending requests.
186  */
187 static inline bool blk_has_pending_reqs(BlockBackend *blk,
188                                         bool is_write)
189 {
190     const BlockBackendPublic *blkp = blk_get_public(blk);
191     return blkp->pending_reqs[is_write];
192 }
193 
194 /* Return the next BlockBackend in the round-robin sequence with pending I/O
195  * requests.
196  *
197  * This assumes that tg->lock is held.
198  *
199  * @blk:       the current BlockBackend
200  * @is_write:  the type of operation (read/write)
201  * @ret:       the next BlockBackend with pending requests, or blk if there is
202  *             none.
203  */
204 static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
205 {
206     BlockBackendPublic *blkp = blk_get_public(blk);
207     ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
208     BlockBackend *token, *start;
209 
210     start = token = tg->tokens[is_write];
211 
212     /* get next bs round in round robin style */
213     token = throttle_group_next_blk(token);
214     while (token != start && !blk_has_pending_reqs(token, is_write)) {
215         token = throttle_group_next_blk(token);
216     }
217 
218     /* If no IO are queued for scheduling on the next round robin token
219      * then decide the token is the current bs because chances are
220      * the current bs get the current request queued.
221      */
222     if (token == start && !blk_has_pending_reqs(token, is_write)) {
223         token = blk;
224     }
225 
226     /* Either we return the original BB, or one with pending requests */
227     assert(token == blk || blk_has_pending_reqs(token, is_write));
228 
229     return token;
230 }
231 
232 /* Check if the next I/O request for a BlockBackend needs to be throttled or
233  * not. If there's no timer set in this group, set one and update the token
234  * accordingly.
235  *
236  * This assumes that tg->lock is held.
237  *
238  * @blk:        the current BlockBackend
239  * @is_write:   the type of operation (read/write)
240  * @ret:        whether the I/O request needs to be throttled or not
241  */
242 static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write)
243 {
244     BlockBackendPublic *blkp = blk_get_public(blk);
245     ThrottleState *ts = blkp->throttle_state;
246     ThrottleTimers *tt = &blkp->throttle_timers;
247     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
248     bool must_wait;
249 
250     if (atomic_read(&blkp->io_limits_disabled)) {
251         return false;
252     }
253 
254     /* Check if any of the timers in this group is already armed */
255     if (tg->any_timer_armed[is_write]) {
256         return true;
257     }
258 
259     must_wait = throttle_schedule_timer(ts, tt, is_write);
260 
261     /* If a timer just got armed, set blk as the current token */
262     if (must_wait) {
263         tg->tokens[is_write] = blk;
264         tg->any_timer_armed[is_write] = true;
265     }
266 
267     return must_wait;
268 }
269 
270 /* Start the next pending I/O request for a BlockBackend.  Return whether
271  * any request was actually pending.
272  *
273  * @blk:       the current BlockBackend
274  * @is_write:  the type of operation (read/write)
275  */
276 static bool coroutine_fn throttle_group_co_restart_queue(BlockBackend *blk,
277                                                          bool is_write)
278 {
279     BlockBackendPublic *blkp = blk_get_public(blk);
280     bool ret;
281 
282     qemu_co_mutex_lock(&blkp->throttled_reqs_lock);
283     ret = qemu_co_queue_next(&blkp->throttled_reqs[is_write]);
284     qemu_co_mutex_unlock(&blkp->throttled_reqs_lock);
285 
286     return ret;
287 }
288 
289 /* Look for the next pending I/O request and schedule it.
290  *
291  * This assumes that tg->lock is held.
292  *
293  * @blk:       the current BlockBackend
294  * @is_write:  the type of operation (read/write)
295  */
296 static void schedule_next_request(BlockBackend *blk, bool is_write)
297 {
298     BlockBackendPublic *blkp = blk_get_public(blk);
299     ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
300     bool must_wait;
301     BlockBackend *token;
302 
303     /* Check if there's any pending request to schedule next */
304     token = next_throttle_token(blk, is_write);
305     if (!blk_has_pending_reqs(token, is_write)) {
306         return;
307     }
308 
309     /* Set a timer for the request if it needs to be throttled */
310     must_wait = throttle_group_schedule_timer(token, is_write);
311 
312     /* If it doesn't have to wait, queue it for immediate execution */
313     if (!must_wait) {
314         /* Give preference to requests from the current blk */
315         if (qemu_in_coroutine() &&
316             throttle_group_co_restart_queue(blk, is_write)) {
317             token = blk;
318         } else {
319             ThrottleTimers *tt = &blk_get_public(token)->throttle_timers;
320             int64_t now = qemu_clock_get_ns(tg->clock_type);
321             timer_mod(tt->timers[is_write], now);
322             tg->any_timer_armed[is_write] = true;
323         }
324         tg->tokens[is_write] = token;
325     }
326 }
327 
328 /* Check if an I/O request needs to be throttled, wait and set a timer
329  * if necessary, and schedule the next request using a round robin
330  * algorithm.
331  *
332  * @blk:       the current BlockBackend
333  * @bytes:     the number of bytes for this I/O
334  * @is_write:  the type of operation (read/write)
335  */
336 void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
337                                                         unsigned int bytes,
338                                                         bool is_write)
339 {
340     bool must_wait;
341     BlockBackend *token;
342 
343     BlockBackendPublic *blkp = blk_get_public(blk);
344     ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
345     qemu_mutex_lock(&tg->lock);
346 
347     /* First we check if this I/O has to be throttled. */
348     token = next_throttle_token(blk, is_write);
349     must_wait = throttle_group_schedule_timer(token, is_write);
350 
351     /* Wait if there's a timer set or queued requests of this type */
352     if (must_wait || blkp->pending_reqs[is_write]) {
353         blkp->pending_reqs[is_write]++;
354         qemu_mutex_unlock(&tg->lock);
355         qemu_co_mutex_lock(&blkp->throttled_reqs_lock);
356         qemu_co_queue_wait(&blkp->throttled_reqs[is_write],
357                            &blkp->throttled_reqs_lock);
358         qemu_co_mutex_unlock(&blkp->throttled_reqs_lock);
359         qemu_mutex_lock(&tg->lock);
360         blkp->pending_reqs[is_write]--;
361     }
362 
363     /* The I/O will be executed, so do the accounting */
364     throttle_account(blkp->throttle_state, is_write, bytes);
365 
366     /* Schedule the next request */
367     schedule_next_request(blk, is_write);
368 
369     qemu_mutex_unlock(&tg->lock);
370 }
371 
372 typedef struct {
373     BlockBackend *blk;
374     bool is_write;
375 } RestartData;
376 
377 static void coroutine_fn throttle_group_restart_queue_entry(void *opaque)
378 {
379     RestartData *data = opaque;
380     BlockBackend *blk = data->blk;
381     bool is_write = data->is_write;
382     BlockBackendPublic *blkp = blk_get_public(blk);
383     ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
384     bool empty_queue;
385 
386     empty_queue = !throttle_group_co_restart_queue(blk, is_write);
387 
388     /* If the request queue was empty then we have to take care of
389      * scheduling the next one */
390     if (empty_queue) {
391         qemu_mutex_lock(&tg->lock);
392         schedule_next_request(blk, is_write);
393         qemu_mutex_unlock(&tg->lock);
394     }
395 }
396 
397 static void throttle_group_restart_queue(BlockBackend *blk, bool is_write)
398 {
399     Coroutine *co;
400     RestartData rd = {
401         .blk = blk,
402         .is_write = is_write
403     };
404 
405     co = qemu_coroutine_create(throttle_group_restart_queue_entry, &rd);
406     aio_co_enter(blk_get_aio_context(blk), co);
407 }
408 
409 void throttle_group_restart_blk(BlockBackend *blk)
410 {
411     BlockBackendPublic *blkp = blk_get_public(blk);
412 
413     if (blkp->throttle_state) {
414         throttle_group_restart_queue(blk, 0);
415         throttle_group_restart_queue(blk, 1);
416     }
417 }
418 
419 /* Update the throttle configuration for a particular group. Similar
420  * to throttle_config(), but guarantees atomicity within the
421  * throttling group.
422  *
423  * @blk: a BlockBackend that is a member of the group
424  * @cfg: the configuration to set
425  */
426 void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg)
427 {
428     BlockBackendPublic *blkp = blk_get_public(blk);
429     ThrottleState *ts = blkp->throttle_state;
430     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
431     qemu_mutex_lock(&tg->lock);
432     throttle_config(ts, tg->clock_type, cfg);
433     qemu_mutex_unlock(&tg->lock);
434 
435     throttle_group_restart_blk(blk);
436 }
437 
438 /* Get the throttle configuration from a particular group. Similar to
439  * throttle_get_config(), but guarantees atomicity within the
440  * throttling group.
441  *
442  * @blk: a BlockBackend that is a member of the group
443  * @cfg: the configuration will be written here
444  */
445 void throttle_group_get_config(BlockBackend *blk, ThrottleConfig *cfg)
446 {
447     BlockBackendPublic *blkp = blk_get_public(blk);
448     ThrottleState *ts = blkp->throttle_state;
449     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
450     qemu_mutex_lock(&tg->lock);
451     throttle_get_config(ts, cfg);
452     qemu_mutex_unlock(&tg->lock);
453 }
454 
455 /* ThrottleTimers callback. This wakes up a request that was waiting
456  * because it had been throttled.
457  *
458  * @blk:       the BlockBackend whose request had been throttled
459  * @is_write:  the type of operation (read/write)
460  */
461 static void timer_cb(BlockBackend *blk, bool is_write)
462 {
463     BlockBackendPublic *blkp = blk_get_public(blk);
464     ThrottleState *ts = blkp->throttle_state;
465     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
466 
467     /* The timer has just been fired, so we can update the flag */
468     qemu_mutex_lock(&tg->lock);
469     tg->any_timer_armed[is_write] = false;
470     qemu_mutex_unlock(&tg->lock);
471 
472     /* Run the request that was waiting for this timer */
473     throttle_group_restart_queue(blk, is_write);
474 }
475 
476 static void read_timer_cb(void *opaque)
477 {
478     timer_cb(opaque, false);
479 }
480 
481 static void write_timer_cb(void *opaque)
482 {
483     timer_cb(opaque, true);
484 }
485 
486 /* Register a BlockBackend in the throttling group, also initializing its
487  * timers and updating its throttle_state pointer to point to it. If a
488  * throttling group with that name does not exist yet, it will be created.
489  *
490  * @blk:       the BlockBackend to insert
491  * @groupname: the name of the group
492  */
493 void throttle_group_register_blk(BlockBackend *blk, const char *groupname)
494 {
495     int i;
496     BlockBackendPublic *blkp = blk_get_public(blk);
497     ThrottleState *ts = throttle_group_incref(groupname);
498     ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
499     blkp->throttle_state = ts;
500 
501     qemu_mutex_lock(&tg->lock);
502     /* If the ThrottleGroup is new set this BlockBackend as the token */
503     for (i = 0; i < 2; i++) {
504         if (!tg->tokens[i]) {
505             tg->tokens[i] = blk;
506         }
507     }
508 
509     QLIST_INSERT_HEAD(&tg->head, blkp, round_robin);
510 
511     throttle_timers_init(&blkp->throttle_timers,
512                          blk_get_aio_context(blk),
513                          tg->clock_type,
514                          read_timer_cb,
515                          write_timer_cb,
516                          blk);
517 
518     qemu_mutex_unlock(&tg->lock);
519 }
520 
521 /* Unregister a BlockBackend from its group, removing it from the list,
522  * destroying the timers and setting the throttle_state pointer to NULL.
523  *
524  * The BlockBackend must not have pending throttled requests, so the caller has
525  * to drain them first.
526  *
527  * The group will be destroyed if it's empty after this operation.
528  *
529  * @blk: the BlockBackend to remove
530  */
531 void throttle_group_unregister_blk(BlockBackend *blk)
532 {
533     BlockBackendPublic *blkp = blk_get_public(blk);
534     ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts);
535     int i;
536 
537     assert(blkp->pending_reqs[0] == 0 && blkp->pending_reqs[1] == 0);
538     assert(qemu_co_queue_empty(&blkp->throttled_reqs[0]));
539     assert(qemu_co_queue_empty(&blkp->throttled_reqs[1]));
540 
541     qemu_mutex_lock(&tg->lock);
542     for (i = 0; i < 2; i++) {
543         if (tg->tokens[i] == blk) {
544             BlockBackend *token = throttle_group_next_blk(blk);
545             /* Take care of the case where this is the last blk in the group */
546             if (token == blk) {
547                 token = NULL;
548             }
549             tg->tokens[i] = token;
550         }
551     }
552 
553     /* remove the current blk from the list */
554     QLIST_REMOVE(blkp, round_robin);
555     throttle_timers_destroy(&blkp->throttle_timers);
556     qemu_mutex_unlock(&tg->lock);
557 
558     throttle_group_unref(&tg->ts);
559     blkp->throttle_state = NULL;
560 }
561 
/* Initialise the global throttle_groups_lock, which throttle_group_incref()
 * and throttle_group_unref() take around every access to the group list.
 */
static void throttle_groups_init(void)
{
    qemu_mutex_init(&throttle_groups_lock);
}

/* NOTE(review): block_init() is defined outside this file; presumably it
 * registers throttle_groups_init() to run during block layer start-up --
 * confirm that this happens before the first throttle_group_incref(). */
block_init(throttle_groups_init);
568