/*
 * QEMU throttling infrastructure
 *
 * Copyright (C) Nodalink, EURL. 2013-2014
 * Copyright (C) Igalia, S.L. 2015
 *
 * Authors:
 *   Benoît Canet <benoit.canet@nodalink.com>
 *   Alberto Garcia <berto@igalia.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 or
 * (at your option) version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/throttle.h"
#include "qemu/timer.h"
#include "block/aio.h"

/* This function makes a bucket leak
 *
 * @bkt:      the bucket to leak
 * @delta_ns: the time delta
 */
void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns)
{
    double leak;

    /* compute how much to leak */
    leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND;

    /* make the bucket leak */
    bkt->level = MAX(bkt->level - leak, 0);
}

/* Calculate the time delta since the last leak and make proportional leaks
 *
 * @now: the current timestamp in ns
 */
static void throttle_do_leak(ThrottleState *ts, int64_t now)
{
    /* compute the time elapsed since the last leak */
    int64_t delta_ns = now - ts->previous_leak;
    int i;

    ts->previous_leak = now;

    if (delta_ns <= 0) {
        return;
    }

    /* make each bucket leak */
    for (i = 0; i < BUCKETS_COUNT; i++) {
        throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns);
    }
}

/* do the real job of computing the time to wait
 *
 * @limit: the throttling limit
 * @extra: the number of extra units (bytes or operations) to delay
 * @ret:   the time to wait in ns
 */
static int64_t throttle_do_compute_wait(double limit, double extra)
{
    double wait = extra * NANOSECONDS_PER_SECOND;
    wait /= limit;
    return wait;
}

/* This function computes the wait time in ns that a leaky bucket should
 * trigger
 *
 * @bkt: the leaky bucket we operate on
 * @ret: the resulting wait time in ns or 0 if the operation can go through
 */
int64_t throttle_compute_wait(LeakyBucket *bkt)
{
    double extra; /* the number of extra units blocking the io */

    if (!bkt->avg) {
        return 0;
    }

    extra = bkt->level - bkt->max;

    if (extra <= 0) {
        return 0;
    }

    return throttle_do_compute_wait(bkt->avg, extra);
}
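/* Worked example (illustrative values, not part of the original code):
 *
 *     LeakyBucket bkt = { .avg = 100, .max = 200, .level = 250 };
 *     throttle_compute_wait(&bkt);
 *
 * Here extra = 250 - 200 = 50 units over the burst threshold, so the wait
 * is 50 / 100 s = 0.5 s = 500000000 ns.  A bucket whose level is at or
 * below max returns 0 and the operation goes through immediately.
 */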
/* Compute the time that must be waited before this I/O can go through
 *
 * @is_write: true if the current IO is a write, false if it's a read
 * @ret:      time to wait in ns
 */
static int64_t throttle_compute_wait_for(ThrottleState *ts,
                                         bool is_write)
{
    BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,
                                   THROTTLE_OPS_TOTAL,
                                   THROTTLE_BPS_READ,
                                   THROTTLE_OPS_READ},
                                  {THROTTLE_BPS_TOTAL,
                                   THROTTLE_OPS_TOTAL,
                                   THROTTLE_BPS_WRITE,
                                   THROTTLE_OPS_WRITE}, };
    int64_t wait, max_wait = 0;
    int i;

    for (i = 0; i < 4; i++) {
        BucketType index = to_check[is_write][i];
        wait = throttle_compute_wait(&ts->cfg.buckets[index]);
        if (wait > max_wait) {
            max_wait = wait;
        }
    }

    return max_wait;
}

/* compute the timer for this type of operation
 *
 * @is_write:       the type of operation
 * @now:            the current clock timestamp
 * @next_timestamp: the resulting timer
 * @ret:            true if a timer must be set
 */
bool throttle_compute_timer(ThrottleState *ts,
                            bool is_write,
                            int64_t now,
                            int64_t *next_timestamp)
{
    int64_t wait;

    /* leak proportionally to the time elapsed */
    throttle_do_leak(ts, now);

    /* compute the wait time if any */
    wait = throttle_compute_wait_for(ts, is_write);

    /* if the code must wait, compute when the next timer should fire */
    if (wait) {
        *next_timestamp = now + wait;
        return true;
    }

    /* else no need to wait at all */
    *next_timestamp = now;
    return false;
}

/* Add timers to event loop */
void throttle_timers_attach_aio_context(ThrottleTimers *tt,
                                        AioContext *new_context)
{
    tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
                                  tt->read_timer_cb, tt->timer_opaque);
    tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
                                  tt->write_timer_cb, tt->timer_opaque);
}

/* To be called first on the ThrottleState */
void throttle_init(ThrottleState *ts)
{
    memset(ts, 0, sizeof(ThrottleState));
}

/* To be called first on the ThrottleTimers */
void throttle_timers_init(ThrottleTimers *tt,
                          AioContext *aio_context,
                          QEMUClockType clock_type,
                          QEMUTimerCB *read_timer_cb,
                          QEMUTimerCB *write_timer_cb,
                          void *timer_opaque)
{
    memset(tt, 0, sizeof(ThrottleTimers));

    tt->clock_type = clock_type;
    tt->read_timer_cb = read_timer_cb;
    tt->write_timer_cb = write_timer_cb;
    tt->timer_opaque = timer_opaque;
    throttle_timers_attach_aio_context(tt, aio_context);
}

/* destroy a timer */
static void throttle_timer_destroy(QEMUTimer **timer)
{
    assert(*timer != NULL);

    timer_del(*timer);
    timer_free(*timer);
    *timer = NULL;
}

/* Remove timers from event loop */
void throttle_timers_detach_aio_context(ThrottleTimers *tt)
{
    int i;

    for (i = 0; i < 2; i++) {
        throttle_timer_destroy(&tt->timers[i]);
    }
}

/* To be called last on the ThrottleTimers */
void throttle_timers_destroy(ThrottleTimers *tt)
{
    throttle_timers_detach_aio_context(tt);
}

/* Check whether any throttling timer is configured */
bool throttle_timers_are_initialized(ThrottleTimers *tt)
{
    if (tt->timers[0]) {
        return true;
    }

    return false;
}

/* Check whether any throttling must be done
 *
 * @cfg: the throttling configuration to inspect
 * @ret: true if throttling must be done else false
 */
bool throttle_enabled(ThrottleConfig *cfg)
{
    int i;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].avg > 0) {
            return true;
        }
    }

    return false;
}
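/* A minimal sketch of the expected lifecycle, assuming a caller that owns
 * both structures (the callback and variable names here are invented for
 * illustration):
 *
 *     ThrottleState ts;
 *     ThrottleTimers tt;
 *
 *     throttle_init(&ts);
 *     throttle_timers_init(&tt, ctx, QEMU_CLOCK_REALTIME,
 *                          my_read_cb, my_write_cb, opaque);
 *     throttle_config(&ts, &tt, &cfg);
 *
 *     ... the I/O path then calls throttle_schedule_timer() and
 *     ... throttle_account() for each request
 *
 *     throttle_timers_destroy(&tt);   // to be called last
 */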
/* return true if any two throttling parameters conflict
 *
 * @cfg: the throttling configuration to inspect
 * @ret: true if any conflict detected else false
 */
bool throttle_conflicting(ThrottleConfig *cfg)
{
    bool bps_flag, ops_flag;
    bool bps_max_flag, ops_max_flag;

    bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg &&
               (cfg->buckets[THROTTLE_BPS_READ].avg ||
                cfg->buckets[THROTTLE_BPS_WRITE].avg);

    ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
               (cfg->buckets[THROTTLE_OPS_READ].avg ||
                cfg->buckets[THROTTLE_OPS_WRITE].avg);

    bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max &&
                   (cfg->buckets[THROTTLE_BPS_READ].max ||
                    cfg->buckets[THROTTLE_BPS_WRITE].max);

    ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max &&
                   (cfg->buckets[THROTTLE_OPS_READ].max ||
                    cfg->buckets[THROTTLE_OPS_WRITE].max);

    return bps_flag || ops_flag || bps_max_flag || ops_max_flag;
}

/* check if a throttling configuration is valid
 * @cfg: the throttling configuration to inspect
 * @ret: true if valid else false
 */
bool throttle_is_valid(ThrottleConfig *cfg)
{
    int i;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].avg < 0 ||
            cfg->buckets[i].max < 0 ||
            cfg->buckets[i].avg > THROTTLE_VALUE_MAX ||
            cfg->buckets[i].max > THROTTLE_VALUE_MAX) {
            return false;
        }
    }

    return true;
}

/* check if bps_max/iops_max is used without bps/iops
 * @cfg: the throttling configuration to inspect
 */
bool throttle_max_is_missing_limit(ThrottleConfig *cfg)
{
    int i;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].max && !cfg->buckets[i].avg) {
            return true;
        }
    }
    return false;
}

/* fix bucket parameters */
static void throttle_fix_bucket(LeakyBucket *bkt)
{
    double min;

    /* zero bucket level */
    bkt->level = 0;

    /* The following is done to cope with the Linux CFQ block scheduler
     * which groups reads and writes into blocks of 100ms in the guest.
     * When there are two processes, one making reads and the other making
     * writes, CFQ produces a pattern looking like the following:
     * WWWWWWWWWWWRRRRRRRRRRRRRRWWWWWWWWWWWWWwRRRRRRRRRRRRRRRRR
     * Having a max burst value of 100ms of the average will help smooth the
     * throttling
     */
    min = bkt->avg / 10;
    if (bkt->avg && !bkt->max) {
        bkt->max = min;
    }
}

/* take care of canceling a timer */
static void throttle_cancel_timer(QEMUTimer *timer)
{
    assert(timer != NULL);

    timer_del(timer);
}

/* Used to configure the throttle
 *
 * @ts:  the throttle state we are working on
 * @tt:  the throttle timers we use in this aio context
 * @cfg: the config to set
 */
void throttle_config(ThrottleState *ts,
                     ThrottleTimers *tt,
                     ThrottleConfig *cfg)
{
    int i;

    ts->cfg = *cfg;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        throttle_fix_bucket(&ts->cfg.buckets[i]);
    }

    ts->previous_leak = qemu_clock_get_ns(tt->clock_type);

    for (i = 0; i < 2; i++) {
        throttle_cancel_timer(tt->timers[i]);
    }
}

/* used to get config
 *
 * @ts:  the throttle state we are working on
 * @cfg: the config to write
 */
void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg)
{
    *cfg = ts->cfg;
}
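/* A sketch of how a configuration might be checked and applied (values
 * are illustrative, not from the original code):
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1000000;   // 1 MB/s total
 *
 *     assert(throttle_is_valid(&cfg));
 *     assert(!throttle_conflicting(&cfg));
 *     assert(!throttle_max_is_missing_limit(&cfg));
 *     throttle_config(&ts, &tt, &cfg);
 *
 * Since max was left at 0, throttle_fix_bucket() defaults it to
 * avg / 10 = 100000, i.e. a burst allowance worth 100ms (1/10 of a
 * second) of the average rate.  Setting THROTTLE_BPS_READ.avg as well
 * would make throttle_conflicting() return true, since a total limit
 * and a per-direction limit cannot be combined.
 */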
/* Schedule the read or write timer if needed
 *
 * NOTE: this function is not unit tested due to its use of timer_mod
 *
 * @tt:       the timers structure
 * @is_write: the type of operation (read/write)
 * @ret:      true if the timer has been scheduled else false
 */
bool throttle_schedule_timer(ThrottleState *ts,
                             ThrottleTimers *tt,
                             bool is_write)
{
    int64_t now = qemu_clock_get_ns(tt->clock_type);
    int64_t next_timestamp;
    bool must_wait;

    must_wait = throttle_compute_timer(ts,
                                       is_write,
                                       now,
                                       &next_timestamp);

    /* request not throttled */
    if (!must_wait) {
        return false;
    }

    /* request throttled and timer pending -> do nothing */
    if (timer_pending(tt->timers[is_write])) {
        return true;
    }

    /* request throttled and timer not pending -> arm timer */
    timer_mod(tt->timers[is_write], next_timestamp);
    return true;
}

/* do the accounting for this operation
 *
 * @is_write: the type of operation (read/write)
 * @size:     the size of the operation
 */
void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
{
    double units = 1.0;

    /* if cfg.op_size is defined and smaller than size we compute unit count */
    if (ts->cfg.op_size && size > ts->cfg.op_size) {
        units = (double) size / ts->cfg.op_size;
    }

    ts->cfg.buckets[THROTTLE_BPS_TOTAL].level += size;
    ts->cfg.buckets[THROTTLE_OPS_TOTAL].level += units;

    if (is_write) {
        ts->cfg.buckets[THROTTLE_BPS_WRITE].level += size;
        ts->cfg.buckets[THROTTLE_OPS_WRITE].level += units;
    } else {
        ts->cfg.buckets[THROTTLE_BPS_READ].level += size;
        ts->cfg.buckets[THROTTLE_OPS_READ].level += units;
    }
}
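/* Worked example (illustrative numbers): with cfg.op_size = 4096, a 64 KiB
 * write accounted via throttle_account(ts, true, 65536) charges
 * 65536 / 4096 = 16 units against the OPS buckets, while the BPS buckets
 * are charged the full 65536 bytes.  An operation smaller than op_size
 * still counts as a single unit.
 */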