1 /* 2 * QEMU throttling infrastructure 3 * 4 * Copyright (C) Nodalink, EURL. 2013-2014 5 * Copyright (C) Igalia, S.L. 2015 6 * 7 * Authors: 8 * Benoît Canet <benoit.canet@nodalink.com> 9 * Alberto Garcia <berto@igalia.com> 10 * 11 * This program is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU General Public License as 13 * published by the Free Software Foundation; either version 2 or 14 * (at your option) version 3 of the License. 15 * 16 * This program is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 * GNU General Public License for more details. 20 * 21 * You should have received a copy of the GNU General Public License 22 * along with this program; if not, see <http://www.gnu.org/licenses/>. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "qemu/throttle.h" 27 #include "qemu/timer.h" 28 #include "block/aio.h" 29 30 /* This function make a bucket leak 31 * 32 * @bkt: the bucket to make leak 33 * @delta_ns: the time delta 34 */ 35 void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns) 36 { 37 double leak; 38 39 /* compute how much to leak */ 40 leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND; 41 42 /* make the bucket leak */ 43 bkt->level = MAX(bkt->level - leak, 0); 44 } 45 46 /* Calculate the time delta since last leak and make proportionals leaks 47 * 48 * @now: the current timestamp in ns 49 */ 50 static void throttle_do_leak(ThrottleState *ts, int64_t now) 51 { 52 /* compute the time elapsed since the last leak */ 53 int64_t delta_ns = now - ts->previous_leak; 54 int i; 55 56 ts->previous_leak = now; 57 58 if (delta_ns <= 0) { 59 return; 60 } 61 62 /* make each bucket leak */ 63 for (i = 0; i < BUCKETS_COUNT; i++) { 64 throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns); 65 } 66 } 67 68 /* do the real job of computing the time to wait 69 * 70 * @limit: the throttling limit 71 * @extra: the number of operation to delay 72 * @ret: the time to wait in ns 73 */ 74 static int64_t throttle_do_compute_wait(double limit, double extra) 75 { 76 double wait = extra * NANOSECONDS_PER_SECOND; 77 wait /= limit; 78 return wait; 79 } 80 81 /* This function compute the wait time in ns that a leaky bucket should trigger 82 * 83 * @bkt: the leaky bucket we operate on 84 * @ret: the resulting wait time in ns or 0 if the operation can go through 85 */ 86 int64_t throttle_compute_wait(LeakyBucket *bkt) 87 { 88 double extra; /* the number of extra units blocking the io */ 89 90 if (!bkt->avg) { 91 return 0; 92 } 93 94 extra = bkt->level - bkt->max; 95 96 if (extra <= 0) { 97 return 0; 98 } 99 100 return throttle_do_compute_wait(bkt->avg, extra); 101 } 102 103 /* This function compute the time that must be waited while this IO 104 * 105 * @is_write: true if the current IO is a write, false if it's a read 106 * @ret: time to wait 107 */ 108 static int64_t throttle_compute_wait_for(ThrottleState *ts, 109 bool is_write) 110 { 111 BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL, 112 THROTTLE_OPS_TOTAL, 113 THROTTLE_BPS_READ, 114 THROTTLE_OPS_READ}, 115 {THROTTLE_BPS_TOTAL, 116 THROTTLE_OPS_TOTAL, 117 THROTTLE_BPS_WRITE, 118 THROTTLE_OPS_WRITE}, }; 119 int64_t wait, max_wait = 0; 120 int i; 121 122 for (i = 0; i < 4; i++) { 123 BucketType index = to_check[is_write][i]; 124 wait = throttle_compute_wait(&ts->cfg.buckets[index]); 125 if (wait > max_wait) { 126 max_wait = wait; 127 } 128 } 129 130 return max_wait; 131 } 132 133 /* compute the timer for this type of operation 134 * 135 * @is_write: the type of operation 136 * @now: the current clock timestamp 137 * @next_timestamp: the resulting timer 138 * @ret: true if a timer must be set 139 */ 140 bool throttle_compute_timer(ThrottleState *ts, 141 bool is_write, 142 int64_t now, 143 int64_t *next_timestamp) 144 { 145 int64_t wait; 146 147 /* leak proportionally to the time elapsed */ 148 throttle_do_leak(ts, now); 149 150 /* compute the wait time if any */ 151 wait = throttle_compute_wait_for(ts, is_write); 152 153 /* if the code must wait compute when the next timer should fire */ 154 if (wait) { 155 *next_timestamp = now + wait; 156 return true; 157 } 158 159 /* else no need to wait at all */ 160 *next_timestamp = now; 161 return false; 162 } 163 164 /* Add timers to event loop */ 165 void throttle_timers_attach_aio_context(ThrottleTimers *tt, 166 AioContext *new_context) 167 { 168 tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS, 169 tt->read_timer_cb, tt->timer_opaque); 170 tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS, 171 tt->write_timer_cb, tt->timer_opaque); 172 } 173 174 /* To be called first on the ThrottleState */ 175 void throttle_init(ThrottleState *ts) 176 { 177 memset(ts, 0, sizeof(ThrottleState)); 178 } 179 180 /* To be called first on the ThrottleTimers */ 181 void throttle_timers_init(ThrottleTimers *tt, 182 AioContext *aio_context, 183 QEMUClockType clock_type, 184 QEMUTimerCB *read_timer_cb, 185 QEMUTimerCB *write_timer_cb, 186 void *timer_opaque) 187 { 188 memset(tt, 0, sizeof(ThrottleTimers)); 189 190 tt->clock_type = clock_type; 191 tt->read_timer_cb = read_timer_cb; 192 tt->write_timer_cb = write_timer_cb; 193 tt->timer_opaque = timer_opaque; 194 throttle_timers_attach_aio_context(tt, aio_context); 195 } 196 197 /* destroy a timer */ 198 static void throttle_timer_destroy(QEMUTimer **timer) 199 { 200 assert(*timer != NULL); 201 202 timer_del(*timer); 203 timer_free(*timer); 204 *timer = NULL; 205 } 206 207 /* Remove timers from event loop */ 208 void throttle_timers_detach_aio_context(ThrottleTimers *tt) 209 { 210 int i; 211 212 for (i = 0; i < 2; i++) { 213 throttle_timer_destroy(&tt->timers[i]); 214 } 215 } 216 217 /* To be called last on the ThrottleTimers */ 218 void throttle_timers_destroy(ThrottleTimers *tt) 219 { 220 throttle_timers_detach_aio_context(tt); 221 } 222 223 /* is any throttling timer configured */ 224 bool throttle_timers_are_initialized(ThrottleTimers *tt) 225 { 226 if (tt->timers[0]) { 227 return true; 228 } 229 230 return false; 231 } 232 233 /* Does any throttling must be done 234 * 235 * @cfg: the throttling configuration to inspect 236 * @ret: true if throttling must be done else false 237 */ 238 bool throttle_enabled(ThrottleConfig *cfg) 239 { 240 int i; 241 242 for (i = 0; i < BUCKETS_COUNT; i++) { 243 if (cfg->buckets[i].avg > 0) { 244 return true; 245 } 246 } 247 248 return false; 249 } 250 251 /* return true if any two throttling parameters conflicts 252 * 253 * @cfg: the throttling configuration to inspect 254 * @ret: true if any conflict detected else false 255 */ 256 bool throttle_conflicting(ThrottleConfig *cfg) 257 { 258 bool bps_flag, ops_flag; 259 bool bps_max_flag, ops_max_flag; 260 261 bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg && 262 (cfg->buckets[THROTTLE_BPS_READ].avg || 263 cfg->buckets[THROTTLE_BPS_WRITE].avg); 264 265 ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg && 266 (cfg->buckets[THROTTLE_OPS_READ].avg || 267 cfg->buckets[THROTTLE_OPS_WRITE].avg); 268 269 bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max && 270 (cfg->buckets[THROTTLE_BPS_READ].max || 271 cfg->buckets[THROTTLE_BPS_WRITE].max); 272 273 ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max && 274 (cfg->buckets[THROTTLE_OPS_READ].max || 275 cfg->buckets[THROTTLE_OPS_WRITE].max); 276 277 return bps_flag || ops_flag || bps_max_flag || ops_max_flag; 278 } 279 280 /* check if a throttling configuration is valid 281 * @cfg: the throttling configuration to inspect 282 * @ret: true if valid else false 283 */ 284 bool throttle_is_valid(ThrottleConfig *cfg) 285 { 286 int i; 287 288 for (i = 0; i < BUCKETS_COUNT; i++) { 289 if (cfg->buckets[i].avg < 0 || 290 cfg->buckets[i].max < 0 || 291 cfg->buckets[i].avg > THROTTLE_VALUE_MAX || 292 cfg->buckets[i].max > THROTTLE_VALUE_MAX) { 293 return false; 294 } 295 } 296 297 return true; 298 } 299 300 /* check if bps_max/iops_max is used without bps/iops 301 * @cfg: the throttling configuration to inspect 302 */ 303 bool throttle_max_is_missing_limit(ThrottleConfig *cfg) 304 { 305 int i; 306 307 for (i = 0; i < BUCKETS_COUNT; i++) { 308 if (cfg->buckets[i].max && !cfg->buckets[i].avg) { 309 return true; 310 } 311 } 312 return false; 313 } 314 315 /* fix bucket parameters */ 316 static void throttle_fix_bucket(LeakyBucket *bkt) 317 { 318 double min; 319 320 /* zero bucket level */ 321 bkt->level = 0; 322 323 /* The following is done to cope with the Linux CFQ block scheduler 324 * which regroup reads and writes by block of 100ms in the guest. 325 * When they are two process one making reads and one making writes cfq 326 * make a pattern looking like the following: 327 * WWWWWWWWWWWRRRRRRRRRRRRRRWWWWWWWWWWWWWwRRRRRRRRRRRRRRRRR 328 * Having a max burst value of 100ms of the average will help smooth the 329 * throttling 330 */ 331 min = bkt->avg / 10; 332 if (bkt->avg && !bkt->max) { 333 bkt->max = min; 334 } 335 } 336 337 /* take care of canceling a timer */ 338 static void throttle_cancel_timer(QEMUTimer *timer) 339 { 340 assert(timer != NULL); 341 342 timer_del(timer); 343 } 344 345 /* Used to configure the throttle 346 * 347 * @ts: the throttle state we are working on 348 * @tt: the throttle timers we use in this aio context 349 * @cfg: the config to set 350 */ 351 void throttle_config(ThrottleState *ts, 352 ThrottleTimers *tt, 353 ThrottleConfig *cfg) 354 { 355 int i; 356 357 ts->cfg = *cfg; 358 359 for (i = 0; i < BUCKETS_COUNT; i++) { 360 throttle_fix_bucket(&ts->cfg.buckets[i]); 361 } 362 363 ts->previous_leak = qemu_clock_get_ns(tt->clock_type); 364 365 for (i = 0; i < 2; i++) { 366 throttle_cancel_timer(tt->timers[i]); 367 } 368 } 369 370 /* used to get config 371 * 372 * @ts: the throttle state we are working on 373 * @cfg: the config to write 374 */ 375 void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg) 376 { 377 *cfg = ts->cfg; 378 } 379 380 381 /* Schedule the read or write timer if needed 382 * 383 * NOTE: this function is not unit tested due to it's usage of timer_mod 384 * 385 * @tt: the timers structure 386 * @is_write: the type of operation (read/write) 387 * @ret: true if the timer has been scheduled else false 388 */ 389 bool throttle_schedule_timer(ThrottleState *ts, 390 ThrottleTimers *tt, 391 bool is_write) 392 { 393 int64_t now = qemu_clock_get_ns(tt->clock_type); 394 int64_t next_timestamp; 395 bool must_wait; 396 397 must_wait = throttle_compute_timer(ts, 398 is_write, 399 now, 400 &next_timestamp); 401 402 /* request not throttled */ 403 if (!must_wait) { 404 return false; 405 } 406 407 /* request throttled and timer pending -> do nothing */ 408 if (timer_pending(tt->timers[is_write])) { 409 return true; 410 } 411 412 /* request throttled and timer not pending -> arm timer */ 413 timer_mod(tt->timers[is_write], next_timestamp); 414 return true; 415 } 416 417 /* do the accounting for this operation 418 * 419 * @is_write: the type of operation (read/write) 420 * @size: the size of the operation 421 */ 422 void throttle_account(ThrottleState *ts, bool is_write, uint64_t size) 423 { 424 double units = 1.0; 425 426 /* if cfg.op_size is defined and smaller than size we compute unit count */ 427 if (ts->cfg.op_size && size > ts->cfg.op_size) { 428 units = (double) size / ts->cfg.op_size; 429 } 430 431 ts->cfg.buckets[THROTTLE_BPS_TOTAL].level += size; 432 ts->cfg.buckets[THROTTLE_OPS_TOTAL].level += units; 433 434 if (is_write) { 435 ts->cfg.buckets[THROTTLE_BPS_WRITE].level += size; 436 ts->cfg.buckets[THROTTLE_OPS_WRITE].level += units; 437 } else { 438 ts->cfg.buckets[THROTTLE_BPS_READ].level += size; 439 ts->cfg.buckets[THROTTLE_OPS_READ].level += units; 440 } 441 } 442 443