1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2020 Intel Corporation 4 */ 5 6 #include <linux/pm_qos.h> 7 #include <linux/sort.h> 8 9 #include "intel_engine_heartbeat.h" 10 #include "intel_engine_pm.h" 11 #include "intel_gpu_commands.h" 12 #include "intel_gt_clock_utils.h" 13 #include "intel_gt_pm.h" 14 #include "intel_rc6.h" 15 #include "selftest_rps.h" 16 #include "selftests/igt_flush_test.h" 17 #include "selftests/igt_spinner.h" 18 #include "selftests/librapl.h" 19 20 /* Try to isolate the impact of cstates from determing frequency response */ 21 #define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */ 22 23 static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine) 24 { 25 unsigned long old; 26 27 old = fetch_and_zero(&engine->props.heartbeat_interval_ms); 28 29 intel_engine_pm_get(engine); 30 intel_engine_park_heartbeat(engine); 31 32 return old; 33 } 34 35 static void engine_heartbeat_enable(struct intel_engine_cs *engine, 36 unsigned long saved) 37 { 38 intel_engine_pm_put(engine); 39 40 engine->props.heartbeat_interval_ms = saved; 41 } 42 43 static void dummy_rps_work(struct work_struct *wrk) 44 { 45 } 46 47 static int cmp_u64(const void *A, const void *B) 48 { 49 const u64 *a = A, *b = B; 50 51 if (a < b) 52 return -1; 53 else if (a > b) 54 return 1; 55 else 56 return 0; 57 } 58 59 static int cmp_u32(const void *A, const void *B) 60 { 61 const u32 *a = A, *b = B; 62 63 if (a < b) 64 return -1; 65 else if (a > b) 66 return 1; 67 else 68 return 0; 69 } 70 71 static struct i915_vma * 72 create_spin_counter(struct intel_engine_cs *engine, 73 struct i915_address_space *vm, 74 bool srm, 75 u32 **cancel, 76 u32 **counter) 77 { 78 enum { 79 COUNT, 80 INC, 81 __NGPR__, 82 }; 83 #define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x) 84 struct drm_i915_gem_object *obj; 85 struct i915_vma *vma; 86 unsigned long end; 87 u32 *base, *cs; 88 int loop, i; 89 int err; 90 91 obj = i915_gem_object_create_internal(vm->i915, 64 << 10); 92 if (IS_ERR(obj)) 93 return ERR_CAST(obj); 94 95 end = obj->base.size / sizeof(u32) - 1; 96 97 vma = i915_vma_instance(obj, vm, NULL); 98 if (IS_ERR(vma)) { 99 i915_gem_object_put(obj); 100 return vma; 101 } 102 103 err = i915_vma_pin(vma, 0, 0, PIN_USER); 104 if (err) { 105 i915_vma_put(vma); 106 return ERR_PTR(err); 107 } 108 109 base = i915_gem_object_pin_map(obj, I915_MAP_WC); 110 if (IS_ERR(base)) { 111 i915_gem_object_put(obj); 112 return ERR_CAST(base); 113 } 114 cs = base; 115 116 *cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2); 117 for (i = 0; i < __NGPR__; i++) { 118 *cs++ = i915_mmio_reg_offset(CS_GPR(i)); 119 *cs++ = 0; 120 *cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4; 121 *cs++ = 0; 122 } 123 124 *cs++ = MI_LOAD_REGISTER_IMM(1); 125 *cs++ = i915_mmio_reg_offset(CS_GPR(INC)); 126 *cs++ = 1; 127 128 loop = cs - base; 129 130 /* Unroll the loop to avoid MI_BB_START stalls impacting measurements */ 131 for (i = 0; i < 1024; i++) { 132 *cs++ = MI_MATH(4); 133 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT)); 134 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC)); 135 *cs++ = MI_MATH_ADD; 136 *cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU); 137 138 if (srm) { 139 *cs++ = MI_STORE_REGISTER_MEM_GEN8; 140 *cs++ = i915_mmio_reg_offset(CS_GPR(COUNT)); 141 *cs++ = lower_32_bits(vma->node.start + end * sizeof(*cs)); 142 *cs++ = upper_32_bits(vma->node.start + end * sizeof(*cs)); 143 } 144 } 145 146 *cs++ = MI_BATCH_BUFFER_START_GEN8; 147 *cs++ = lower_32_bits(vma->node.start + loop * sizeof(*cs)); 148 *cs++ = upper_32_bits(vma->node.start + loop * sizeof(*cs)); 149 GEM_BUG_ON(cs - base > end); 150 151 i915_gem_object_flush_map(obj); 152 153 *cancel = base + loop; 154 *counter = srm ? memset32(base + end, 0, 1) : NULL; 155 return vma; 156 } 157 158 static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms) 159 { 160 u8 history[64], i; 161 unsigned long end; 162 int sleep; 163 164 i = 0; 165 memset(history, freq, sizeof(history)); 166 sleep = 20; 167 168 /* The PCU does not change instantly, but drifts towards the goal? */ 169 end = jiffies + msecs_to_jiffies(timeout_ms); 170 do { 171 u8 act; 172 173 act = read_cagf(rps); 174 if (time_after(jiffies, end)) 175 return act; 176 177 /* Target acquired */ 178 if (act == freq) 179 return act; 180 181 /* Any change within the last N samples? */ 182 if (!memchr_inv(history, act, sizeof(history))) 183 return act; 184 185 history[i] = act; 186 i = (i + 1) % ARRAY_SIZE(history); 187 188 usleep_range(sleep, 2 * sleep); 189 sleep *= 2; 190 if (sleep > timeout_ms * 20) 191 sleep = timeout_ms * 20; 192 } while (1); 193 } 194 195 static u8 rps_set_check(struct intel_rps *rps, u8 freq) 196 { 197 mutex_lock(&rps->lock); 198 GEM_BUG_ON(!intel_rps_is_active(rps)); 199 intel_rps_set(rps, freq); 200 GEM_BUG_ON(rps->last_freq != freq); 201 mutex_unlock(&rps->lock); 202 203 return wait_for_freq(rps, freq, 50); 204 } 205 206 static void show_pstate_limits(struct intel_rps *rps) 207 { 208 struct drm_i915_private *i915 = rps_to_i915(rps); 209 210 if (IS_BROXTON(i915)) { 211 pr_info("P_STATE_CAP[%x]: 0x%08x\n", 212 i915_mmio_reg_offset(BXT_RP_STATE_CAP), 213 intel_uncore_read(rps_to_uncore(rps), 214 BXT_RP_STATE_CAP)); 215 } else if (IS_GEN(i915, 9)) { 216 pr_info("P_STATE_LIMITS[%x]: 0x%08x\n", 217 i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS), 218 intel_uncore_read(rps_to_uncore(rps), 219 GEN9_RP_STATE_LIMITS)); 220 } 221 } 222 223 int live_rps_clock_interval(void *arg) 224 { 225 struct intel_gt *gt = arg; 226 struct intel_rps *rps = >->rps; 227 void (*saved_work)(struct work_struct *wrk); 228 struct intel_engine_cs *engine; 229 enum intel_engine_id id; 230 struct igt_spinner spin; 231 int err = 0; 232 233 if (!intel_rps_is_enabled(rps)) 234 return 0; 235 236 if (igt_spinner_init(&spin, gt)) 237 return -ENOMEM; 238 239 intel_gt_pm_wait_for_idle(gt); 240 saved_work = rps->work.func; 241 rps->work.func = dummy_rps_work; 242 243 intel_gt_pm_get(gt); 244 intel_rps_disable(>->rps); 245 246 intel_gt_check_clock_frequency(gt); 247 248 for_each_engine(engine, gt, id) { 249 unsigned long saved_heartbeat; 250 struct i915_request *rq; 251 u32 cycles; 252 u64 dt; 253 254 if (!intel_engine_can_store_dword(engine)) 255 continue; 256 257 saved_heartbeat = engine_heartbeat_disable(engine); 258 259 rq = igt_spinner_create_request(&spin, 260 engine->kernel_context, 261 MI_NOOP); 262 if (IS_ERR(rq)) { 263 engine_heartbeat_enable(engine, saved_heartbeat); 264 err = PTR_ERR(rq); 265 break; 266 } 267 268 i915_request_add(rq); 269 270 if (!igt_wait_for_spinner(&spin, rq)) { 271 pr_err("%s: RPS spinner did not start\n", 272 engine->name); 273 igt_spinner_end(&spin); 274 engine_heartbeat_enable(engine, saved_heartbeat); 275 intel_gt_set_wedged(engine->gt); 276 err = -EIO; 277 break; 278 } 279 280 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 281 282 intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0); 283 284 /* Set the evaluation interval to infinity! */ 285 intel_uncore_write_fw(gt->uncore, 286 GEN6_RP_UP_EI, 0xffffffff); 287 intel_uncore_write_fw(gt->uncore, 288 GEN6_RP_UP_THRESHOLD, 0xffffffff); 289 290 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 291 GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG); 292 293 if (wait_for(intel_uncore_read_fw(gt->uncore, 294 GEN6_RP_CUR_UP_EI), 295 10)) { 296 /* Just skip the test; assume lack of HW support */ 297 pr_notice("%s: rps evaluation interval not ticking\n", 298 engine->name); 299 err = -ENODEV; 300 } else { 301 ktime_t dt_[5]; 302 u32 cycles_[5]; 303 int i; 304 305 for (i = 0; i < 5; i++) { 306 preempt_disable(); 307 308 dt_[i] = ktime_get(); 309 cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI); 310 311 udelay(1000); 312 313 dt_[i] = ktime_sub(ktime_get(), dt_[i]); 314 cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI); 315 316 preempt_enable(); 317 } 318 319 /* Use the median of both cycle/dt; close enough */ 320 sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL); 321 cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4; 322 sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL); 323 dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4); 324 } 325 326 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0); 327 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 328 329 igt_spinner_end(&spin); 330 engine_heartbeat_enable(engine, saved_heartbeat); 331 332 if (err == 0) { 333 u64 time = intel_gt_pm_interval_to_ns(gt, cycles); 334 u32 expected = 335 intel_gt_ns_to_pm_interval(gt, dt); 336 337 pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n", 338 engine->name, cycles, time, dt, expected, 339 gt->clock_frequency / 1000); 340 341 if (10 * time < 8 * dt || 342 8 * time > 10 * dt) { 343 pr_err("%s: rps clock time does not match walltime!\n", 344 engine->name); 345 err = -EINVAL; 346 } 347 348 if (10 * expected < 8 * cycles || 349 8 * expected > 10 * cycles) { 350 pr_err("%s: walltime does not match rps clock ticks!\n", 351 engine->name); 352 err = -EINVAL; 353 } 354 } 355 356 if (igt_flush_test(gt->i915)) 357 err = -EIO; 358 359 break; /* once is enough */ 360 } 361 362 intel_rps_enable(>->rps); 363 intel_gt_pm_put(gt); 364 365 igt_spinner_fini(&spin); 366 367 intel_gt_pm_wait_for_idle(gt); 368 rps->work.func = saved_work; 369 370 if (err == -ENODEV) /* skipped, don't report a fail */ 371 err = 0; 372 373 return err; 374 } 375 376 int live_rps_control(void *arg) 377 { 378 struct intel_gt *gt = arg; 379 struct intel_rps *rps = >->rps; 380 void (*saved_work)(struct work_struct *wrk); 381 struct intel_engine_cs *engine; 382 enum intel_engine_id id; 383 struct igt_spinner spin; 384 int err = 0; 385 386 /* 387 * Check that the actual frequency matches our requested frequency, 388 * to verify our control mechanism. We have to be careful that the 389 * PCU may throttle the GPU in which case the actual frequency used 390 * will be lowered than requested. 391 */ 392 393 if (!intel_rps_is_enabled(rps)) 394 return 0; 395 396 if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */ 397 return 0; 398 399 if (igt_spinner_init(&spin, gt)) 400 return -ENOMEM; 401 402 intel_gt_pm_wait_for_idle(gt); 403 saved_work = rps->work.func; 404 rps->work.func = dummy_rps_work; 405 406 intel_gt_pm_get(gt); 407 for_each_engine(engine, gt, id) { 408 unsigned long saved_heartbeat; 409 struct i915_request *rq; 410 ktime_t min_dt, max_dt; 411 int f, limit; 412 int min, max; 413 414 if (!intel_engine_can_store_dword(engine)) 415 continue; 416 417 saved_heartbeat = engine_heartbeat_disable(engine); 418 419 rq = igt_spinner_create_request(&spin, 420 engine->kernel_context, 421 MI_NOOP); 422 if (IS_ERR(rq)) { 423 err = PTR_ERR(rq); 424 break; 425 } 426 427 i915_request_add(rq); 428 429 if (!igt_wait_for_spinner(&spin, rq)) { 430 pr_err("%s: RPS spinner did not start\n", 431 engine->name); 432 igt_spinner_end(&spin); 433 engine_heartbeat_enable(engine, saved_heartbeat); 434 intel_gt_set_wedged(engine->gt); 435 err = -EIO; 436 break; 437 } 438 439 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) { 440 pr_err("%s: could not set minimum frequency [%x], only %x!\n", 441 engine->name, rps->min_freq, read_cagf(rps)); 442 igt_spinner_end(&spin); 443 engine_heartbeat_enable(engine, saved_heartbeat); 444 show_pstate_limits(rps); 445 err = -EINVAL; 446 break; 447 } 448 449 for (f = rps->min_freq + 1; f < rps->max_freq; f++) { 450 if (rps_set_check(rps, f) < f) 451 break; 452 } 453 454 limit = rps_set_check(rps, f); 455 456 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) { 457 pr_err("%s: could not restore minimum frequency [%x], only %x!\n", 458 engine->name, rps->min_freq, read_cagf(rps)); 459 igt_spinner_end(&spin); 460 engine_heartbeat_enable(engine, saved_heartbeat); 461 show_pstate_limits(rps); 462 err = -EINVAL; 463 break; 464 } 465 466 max_dt = ktime_get(); 467 max = rps_set_check(rps, limit); 468 max_dt = ktime_sub(ktime_get(), max_dt); 469 470 min_dt = ktime_get(); 471 min = rps_set_check(rps, rps->min_freq); 472 min_dt = ktime_sub(ktime_get(), min_dt); 473 474 igt_spinner_end(&spin); 475 engine_heartbeat_enable(engine, saved_heartbeat); 476 477 pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n", 478 engine->name, 479 rps->min_freq, intel_gpu_freq(rps, rps->min_freq), 480 rps->max_freq, intel_gpu_freq(rps, rps->max_freq), 481 limit, intel_gpu_freq(rps, limit), 482 min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt)); 483 484 if (limit == rps->min_freq) { 485 pr_err("%s: GPU throttled to minimum!\n", 486 engine->name); 487 show_pstate_limits(rps); 488 err = -ENODEV; 489 break; 490 } 491 492 if (igt_flush_test(gt->i915)) { 493 err = -EIO; 494 break; 495 } 496 } 497 intel_gt_pm_put(gt); 498 499 igt_spinner_fini(&spin); 500 501 intel_gt_pm_wait_for_idle(gt); 502 rps->work.func = saved_work; 503 504 return err; 505 } 506 507 static void show_pcu_config(struct intel_rps *rps) 508 { 509 struct drm_i915_private *i915 = rps_to_i915(rps); 510 unsigned int max_gpu_freq, min_gpu_freq; 511 intel_wakeref_t wakeref; 512 int gpu_freq; 513 514 if (!HAS_LLC(i915)) 515 return; 516 517 min_gpu_freq = rps->min_freq; 518 max_gpu_freq = rps->max_freq; 519 if (INTEL_GEN(i915) >= 9) { 520 /* Convert GT frequency to 50 HZ units */ 521 min_gpu_freq /= GEN9_FREQ_SCALER; 522 max_gpu_freq /= GEN9_FREQ_SCALER; 523 } 524 525 wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm); 526 527 pr_info("%5s %5s %5s\n", "GPU", "eCPU", "eRing"); 528 for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) { 529 int ia_freq = gpu_freq; 530 531 sandybridge_pcode_read(i915, 532 GEN6_PCODE_READ_MIN_FREQ_TABLE, 533 &ia_freq, NULL); 534 535 pr_info("%5d %5d %5d\n", 536 gpu_freq * 50, 537 ((ia_freq >> 0) & 0xff) * 100, 538 ((ia_freq >> 8) & 0xff) * 100); 539 } 540 541 intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref); 542 } 543 544 static u64 __measure_frequency(u32 *cntr, int duration_ms) 545 { 546 u64 dc, dt; 547 548 dt = ktime_get(); 549 dc = READ_ONCE(*cntr); 550 usleep_range(1000 * duration_ms, 2000 * duration_ms); 551 dc = READ_ONCE(*cntr) - dc; 552 dt = ktime_get() - dt; 553 554 return div64_u64(1000 * 1000 * dc, dt); 555 } 556 557 static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq) 558 { 559 u64 x[5]; 560 int i; 561 562 *freq = rps_set_check(rps, *freq); 563 for (i = 0; i < 5; i++) 564 x[i] = __measure_frequency(cntr, 2); 565 *freq = (*freq + read_cagf(rps)) / 2; 566 567 /* A simple triangle filter for better result stability */ 568 sort(x, 5, sizeof(*x), cmp_u64, NULL); 569 return div_u64(x[1] + 2 * x[2] + x[3], 4); 570 } 571 572 static u64 __measure_cs_frequency(struct intel_engine_cs *engine, 573 int duration_ms) 574 { 575 u64 dc, dt; 576 577 dt = ktime_get(); 578 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)); 579 usleep_range(1000 * duration_ms, 2000 * duration_ms); 580 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc; 581 dt = ktime_get() - dt; 582 583 return div64_u64(1000 * 1000 * dc, dt); 584 } 585 586 static u64 measure_cs_frequency_at(struct intel_rps *rps, 587 struct intel_engine_cs *engine, 588 int *freq) 589 { 590 u64 x[5]; 591 int i; 592 593 *freq = rps_set_check(rps, *freq); 594 for (i = 0; i < 5; i++) 595 x[i] = __measure_cs_frequency(engine, 2); 596 *freq = (*freq + read_cagf(rps)) / 2; 597 598 /* A simple triangle filter for better result stability */ 599 sort(x, 5, sizeof(*x), cmp_u64, NULL); 600 return div_u64(x[1] + 2 * x[2] + x[3], 4); 601 } 602 603 static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d) 604 { 605 return f_d * x > f_n * y && f_n * x < f_d * y; 606 } 607 608 int live_rps_frequency_cs(void *arg) 609 { 610 void (*saved_work)(struct work_struct *wrk); 611 struct intel_gt *gt = arg; 612 struct intel_rps *rps = >->rps; 613 struct intel_engine_cs *engine; 614 struct pm_qos_request qos; 615 enum intel_engine_id id; 616 int err = 0; 617 618 /* 619 * The premise is that the GPU does change freqency at our behest. 620 * Let's check there is a correspondence between the requested 621 * frequency, the actual frequency, and the observed clock rate. 622 */ 623 624 if (!intel_rps_is_enabled(rps)) 625 return 0; 626 627 if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */ 628 return 0; 629 630 if (CPU_LATENCY >= 0) 631 cpu_latency_qos_add_request(&qos, CPU_LATENCY); 632 633 intel_gt_pm_wait_for_idle(gt); 634 saved_work = rps->work.func; 635 rps->work.func = dummy_rps_work; 636 637 for_each_engine(engine, gt, id) { 638 unsigned long saved_heartbeat; 639 struct i915_request *rq; 640 struct i915_vma *vma; 641 u32 *cancel, *cntr; 642 struct { 643 u64 count; 644 int freq; 645 } min, max; 646 647 saved_heartbeat = engine_heartbeat_disable(engine); 648 649 vma = create_spin_counter(engine, 650 engine->kernel_context->vm, false, 651 &cancel, &cntr); 652 if (IS_ERR(vma)) { 653 err = PTR_ERR(vma); 654 engine_heartbeat_enable(engine, saved_heartbeat); 655 break; 656 } 657 658 rq = intel_engine_create_kernel_request(engine); 659 if (IS_ERR(rq)) { 660 err = PTR_ERR(rq); 661 goto err_vma; 662 } 663 664 i915_vma_lock(vma); 665 err = i915_request_await_object(rq, vma->obj, false); 666 if (!err) 667 err = i915_vma_move_to_active(vma, rq, 0); 668 if (!err) 669 err = rq->engine->emit_bb_start(rq, 670 vma->node.start, 671 PAGE_SIZE, 0); 672 i915_vma_unlock(vma); 673 i915_request_add(rq); 674 if (err) 675 goto err_vma; 676 677 if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)), 678 10)) { 679 pr_err("%s: timed loop did not start\n", 680 engine->name); 681 goto err_vma; 682 } 683 684 min.freq = rps->min_freq; 685 min.count = measure_cs_frequency_at(rps, engine, &min.freq); 686 687 max.freq = rps->max_freq; 688 max.count = measure_cs_frequency_at(rps, engine, &max.freq); 689 690 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n", 691 engine->name, 692 min.count, intel_gpu_freq(rps, min.freq), 693 max.count, intel_gpu_freq(rps, max.freq), 694 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count, 695 max.freq * min.count)); 696 697 if (!scaled_within(max.freq * min.count, 698 min.freq * max.count, 699 2, 3)) { 700 int f; 701 702 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n", 703 engine->name, 704 max.freq * min.count, 705 min.freq * max.count); 706 show_pcu_config(rps); 707 708 for (f = min.freq + 1; f <= rps->max_freq; f++) { 709 int act = f; 710 u64 count; 711 712 count = measure_cs_frequency_at(rps, engine, &act); 713 if (act < f) 714 break; 715 716 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n", 717 engine->name, 718 act, intel_gpu_freq(rps, act), count, 719 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count, 720 act * min.count)); 721 722 f = act; /* may skip ahead [pcu granularity] */ 723 } 724 725 err = -EINVAL; 726 } 727 728 err_vma: 729 *cancel = MI_BATCH_BUFFER_END; 730 i915_gem_object_flush_map(vma->obj); 731 i915_gem_object_unpin_map(vma->obj); 732 i915_vma_unpin(vma); 733 i915_vma_put(vma); 734 735 engine_heartbeat_enable(engine, saved_heartbeat); 736 if (igt_flush_test(gt->i915)) 737 err = -EIO; 738 if (err) 739 break; 740 } 741 742 intel_gt_pm_wait_for_idle(gt); 743 rps->work.func = saved_work; 744 745 if (CPU_LATENCY >= 0) 746 cpu_latency_qos_remove_request(&qos); 747 748 return err; 749 } 750 751 int live_rps_frequency_srm(void *arg) 752 { 753 void (*saved_work)(struct work_struct *wrk); 754 struct intel_gt *gt = arg; 755 struct intel_rps *rps = >->rps; 756 struct intel_engine_cs *engine; 757 struct pm_qos_request qos; 758 enum intel_engine_id id; 759 int err = 0; 760 761 /* 762 * The premise is that the GPU does change freqency at our behest. 763 * Let's check there is a correspondence between the requested 764 * frequency, the actual frequency, and the observed clock rate. 765 */ 766 767 if (!intel_rps_is_enabled(rps)) 768 return 0; 769 770 if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */ 771 return 0; 772 773 if (CPU_LATENCY >= 0) 774 cpu_latency_qos_add_request(&qos, CPU_LATENCY); 775 776 intel_gt_pm_wait_for_idle(gt); 777 saved_work = rps->work.func; 778 rps->work.func = dummy_rps_work; 779 780 for_each_engine(engine, gt, id) { 781 unsigned long saved_heartbeat; 782 struct i915_request *rq; 783 struct i915_vma *vma; 784 u32 *cancel, *cntr; 785 struct { 786 u64 count; 787 int freq; 788 } min, max; 789 790 saved_heartbeat = engine_heartbeat_disable(engine); 791 792 vma = create_spin_counter(engine, 793 engine->kernel_context->vm, true, 794 &cancel, &cntr); 795 if (IS_ERR(vma)) { 796 err = PTR_ERR(vma); 797 engine_heartbeat_enable(engine, saved_heartbeat); 798 break; 799 } 800 801 rq = intel_engine_create_kernel_request(engine); 802 if (IS_ERR(rq)) { 803 err = PTR_ERR(rq); 804 goto err_vma; 805 } 806 807 i915_vma_lock(vma); 808 err = i915_request_await_object(rq, vma->obj, false); 809 if (!err) 810 err = i915_vma_move_to_active(vma, rq, 0); 811 if (!err) 812 err = rq->engine->emit_bb_start(rq, 813 vma->node.start, 814 PAGE_SIZE, 0); 815 i915_vma_unlock(vma); 816 i915_request_add(rq); 817 if (err) 818 goto err_vma; 819 820 if (wait_for(READ_ONCE(*cntr), 10)) { 821 pr_err("%s: timed loop did not start\n", 822 engine->name); 823 goto err_vma; 824 } 825 826 min.freq = rps->min_freq; 827 min.count = measure_frequency_at(rps, cntr, &min.freq); 828 829 max.freq = rps->max_freq; 830 max.count = measure_frequency_at(rps, cntr, &max.freq); 831 832 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n", 833 engine->name, 834 min.count, intel_gpu_freq(rps, min.freq), 835 max.count, intel_gpu_freq(rps, max.freq), 836 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count, 837 max.freq * min.count)); 838 839 if (!scaled_within(max.freq * min.count, 840 min.freq * max.count, 841 1, 2)) { 842 int f; 843 844 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n", 845 engine->name, 846 max.freq * min.count, 847 min.freq * max.count); 848 show_pcu_config(rps); 849 850 for (f = min.freq + 1; f <= rps->max_freq; f++) { 851 int act = f; 852 u64 count; 853 854 count = measure_frequency_at(rps, cntr, &act); 855 if (act < f) 856 break; 857 858 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n", 859 engine->name, 860 act, intel_gpu_freq(rps, act), count, 861 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count, 862 act * min.count)); 863 864 f = act; /* may skip ahead [pcu granularity] */ 865 } 866 867 err = -EINVAL; 868 } 869 870 err_vma: 871 *cancel = MI_BATCH_BUFFER_END; 872 i915_gem_object_flush_map(vma->obj); 873 i915_gem_object_unpin_map(vma->obj); 874 i915_vma_unpin(vma); 875 i915_vma_put(vma); 876 877 engine_heartbeat_enable(engine, saved_heartbeat); 878 if (igt_flush_test(gt->i915)) 879 err = -EIO; 880 if (err) 881 break; 882 } 883 884 intel_gt_pm_wait_for_idle(gt); 885 rps->work.func = saved_work; 886 887 if (CPU_LATENCY >= 0) 888 cpu_latency_qos_remove_request(&qos); 889 890 return err; 891 } 892 893 static void sleep_for_ei(struct intel_rps *rps, int timeout_us) 894 { 895 /* Flush any previous EI */ 896 usleep_range(timeout_us, 2 * timeout_us); 897 898 /* Reset the interrupt status */ 899 rps_disable_interrupts(rps); 900 GEM_BUG_ON(rps->pm_iir); 901 rps_enable_interrupts(rps); 902 903 /* And then wait for the timeout, for real this time */ 904 usleep_range(2 * timeout_us, 3 * timeout_us); 905 } 906 907 static int __rps_up_interrupt(struct intel_rps *rps, 908 struct intel_engine_cs *engine, 909 struct igt_spinner *spin) 910 { 911 struct intel_uncore *uncore = engine->uncore; 912 struct i915_request *rq; 913 u32 timeout; 914 915 if (!intel_engine_can_store_dword(engine)) 916 return 0; 917 918 rps_set_check(rps, rps->min_freq); 919 920 rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP); 921 if (IS_ERR(rq)) 922 return PTR_ERR(rq); 923 924 i915_request_get(rq); 925 i915_request_add(rq); 926 927 if (!igt_wait_for_spinner(spin, rq)) { 928 pr_err("%s: RPS spinner did not start\n", 929 engine->name); 930 i915_request_put(rq); 931 intel_gt_set_wedged(engine->gt); 932 return -EIO; 933 } 934 935 if (!intel_rps_is_active(rps)) { 936 pr_err("%s: RPS not enabled on starting spinner\n", 937 engine->name); 938 igt_spinner_end(spin); 939 i915_request_put(rq); 940 return -EINVAL; 941 } 942 943 if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) { 944 pr_err("%s: RPS did not register UP interrupt\n", 945 engine->name); 946 i915_request_put(rq); 947 return -EINVAL; 948 } 949 950 if (rps->last_freq != rps->min_freq) { 951 pr_err("%s: RPS did not program min frequency\n", 952 engine->name); 953 i915_request_put(rq); 954 return -EINVAL; 955 } 956 957 timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI); 958 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout); 959 timeout = DIV_ROUND_UP(timeout, 1000); 960 961 sleep_for_ei(rps, timeout); 962 GEM_BUG_ON(i915_request_completed(rq)); 963 964 igt_spinner_end(spin); 965 i915_request_put(rq); 966 967 if (rps->cur_freq != rps->min_freq) { 968 pr_err("%s: Frequency unexpectedly changed [up], now %d!\n", 969 engine->name, intel_rps_read_actual_frequency(rps)); 970 return -EINVAL; 971 } 972 973 if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) { 974 pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n", 975 engine->name, rps->pm_iir, 976 intel_uncore_read(uncore, GEN6_RP_PREV_UP), 977 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD), 978 intel_uncore_read(uncore, GEN6_RP_UP_EI)); 979 return -EINVAL; 980 } 981 982 return 0; 983 } 984 985 static int __rps_down_interrupt(struct intel_rps *rps, 986 struct intel_engine_cs *engine) 987 { 988 struct intel_uncore *uncore = engine->uncore; 989 u32 timeout; 990 991 rps_set_check(rps, rps->max_freq); 992 993 if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) { 994 pr_err("%s: RPS did not register DOWN interrupt\n", 995 engine->name); 996 return -EINVAL; 997 } 998 999 if (rps->last_freq != rps->max_freq) { 1000 pr_err("%s: RPS did not program max frequency\n", 1001 engine->name); 1002 return -EINVAL; 1003 } 1004 1005 timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI); 1006 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout); 1007 timeout = DIV_ROUND_UP(timeout, 1000); 1008 1009 sleep_for_ei(rps, timeout); 1010 1011 if (rps->cur_freq != rps->max_freq) { 1012 pr_err("%s: Frequency unexpectedly changed [down], now %d!\n", 1013 engine->name, 1014 intel_rps_read_actual_frequency(rps)); 1015 return -EINVAL; 1016 } 1017 1018 if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) { 1019 pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n", 1020 engine->name, rps->pm_iir, 1021 intel_uncore_read(uncore, GEN6_RP_PREV_DOWN), 1022 intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD), 1023 intel_uncore_read(uncore, GEN6_RP_DOWN_EI), 1024 intel_uncore_read(uncore, GEN6_RP_PREV_UP), 1025 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD), 1026 intel_uncore_read(uncore, GEN6_RP_UP_EI)); 1027 return -EINVAL; 1028 } 1029 1030 return 0; 1031 } 1032 1033 int live_rps_interrupt(void *arg) 1034 { 1035 struct intel_gt *gt = arg; 1036 struct intel_rps *rps = >->rps; 1037 void (*saved_work)(struct work_struct *wrk); 1038 struct intel_engine_cs *engine; 1039 enum intel_engine_id id; 1040 struct igt_spinner spin; 1041 u32 pm_events; 1042 int err = 0; 1043 1044 /* 1045 * First, let's check whether or not we are receiving interrupts. 1046 */ 1047 1048 if (!intel_rps_has_interrupts(rps)) 1049 return 0; 1050 1051 intel_gt_pm_get(gt); 1052 pm_events = rps->pm_events; 1053 intel_gt_pm_put(gt); 1054 if (!pm_events) { 1055 pr_err("No RPS PM events registered, but RPS is enabled?\n"); 1056 return -ENODEV; 1057 } 1058 1059 if (igt_spinner_init(&spin, gt)) 1060 return -ENOMEM; 1061 1062 intel_gt_pm_wait_for_idle(gt); 1063 saved_work = rps->work.func; 1064 rps->work.func = dummy_rps_work; 1065 1066 for_each_engine(engine, gt, id) { 1067 /* Keep the engine busy with a spinner; expect an UP! */ 1068 if (pm_events & GEN6_PM_RP_UP_THRESHOLD) { 1069 unsigned long saved_heartbeat; 1070 1071 intel_gt_pm_wait_for_idle(engine->gt); 1072 GEM_BUG_ON(intel_rps_is_active(rps)); 1073 1074 saved_heartbeat = engine_heartbeat_disable(engine); 1075 1076 err = __rps_up_interrupt(rps, engine, &spin); 1077 1078 engine_heartbeat_enable(engine, saved_heartbeat); 1079 if (err) 1080 goto out; 1081 1082 intel_gt_pm_wait_for_idle(engine->gt); 1083 } 1084 1085 /* Keep the engine awake but idle and check for DOWN */ 1086 if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) { 1087 unsigned long saved_heartbeat; 1088 1089 saved_heartbeat = engine_heartbeat_disable(engine); 1090 intel_rc6_disable(>->rc6); 1091 1092 err = __rps_down_interrupt(rps, engine); 1093 1094 intel_rc6_enable(>->rc6); 1095 engine_heartbeat_enable(engine, saved_heartbeat); 1096 if (err) 1097 goto out; 1098 } 1099 } 1100 1101 out: 1102 if (igt_flush_test(gt->i915)) 1103 err = -EIO; 1104 1105 igt_spinner_fini(&spin); 1106 1107 intel_gt_pm_wait_for_idle(gt); 1108 rps->work.func = saved_work; 1109 1110 return err; 1111 } 1112 1113 static u64 __measure_power(int duration_ms) 1114 { 1115 u64 dE, dt; 1116 1117 dt = ktime_get(); 1118 dE = librapl_energy_uJ(); 1119 usleep_range(1000 * duration_ms, 2000 * duration_ms); 1120 dE = librapl_energy_uJ() - dE; 1121 dt = ktime_get() - dt; 1122 1123 return div64_u64(1000 * 1000 * dE, dt); 1124 } 1125 1126 static u64 measure_power_at(struct intel_rps *rps, int *freq) 1127 { 1128 u64 x[5]; 1129 int i; 1130 1131 *freq = rps_set_check(rps, *freq); 1132 for (i = 0; i < 5; i++) 1133 x[i] = __measure_power(5); 1134 *freq = (*freq + read_cagf(rps)) / 2; 1135 1136 /* A simple triangle filter for better result stability */ 1137 sort(x, 5, sizeof(*x), cmp_u64, NULL); 1138 return div_u64(x[1] + 2 * x[2] + x[3], 4); 1139 } 1140 1141 int live_rps_power(void *arg) 1142 { 1143 struct intel_gt *gt = arg; 1144 struct intel_rps *rps = >->rps; 1145 void (*saved_work)(struct work_struct *wrk); 1146 struct intel_engine_cs *engine; 1147 enum intel_engine_id id; 1148 struct igt_spinner spin; 1149 int err = 0; 1150 1151 /* 1152 * Our fundamental assumption is that running at lower frequency 1153 * actually saves power. Let's see if our RAPL measurement support 1154 * that theory. 1155 */ 1156 1157 if (!intel_rps_is_enabled(rps)) 1158 return 0; 1159 1160 if (!librapl_energy_uJ()) 1161 return 0; 1162 1163 if (igt_spinner_init(&spin, gt)) 1164 return -ENOMEM; 1165 1166 intel_gt_pm_wait_for_idle(gt); 1167 saved_work = rps->work.func; 1168 rps->work.func = dummy_rps_work; 1169 1170 for_each_engine(engine, gt, id) { 1171 unsigned long saved_heartbeat; 1172 struct i915_request *rq; 1173 struct { 1174 u64 power; 1175 int freq; 1176 } min, max; 1177 1178 if (!intel_engine_can_store_dword(engine)) 1179 continue; 1180 1181 saved_heartbeat = engine_heartbeat_disable(engine); 1182 1183 rq = igt_spinner_create_request(&spin, 1184 engine->kernel_context, 1185 MI_NOOP); 1186 if (IS_ERR(rq)) { 1187 engine_heartbeat_enable(engine, saved_heartbeat); 1188 err = PTR_ERR(rq); 1189 break; 1190 } 1191 1192 i915_request_add(rq); 1193 1194 if (!igt_wait_for_spinner(&spin, rq)) { 1195 pr_err("%s: RPS spinner did not start\n", 1196 engine->name); 1197 igt_spinner_end(&spin); 1198 engine_heartbeat_enable(engine, saved_heartbeat); 1199 intel_gt_set_wedged(engine->gt); 1200 err = -EIO; 1201 break; 1202 } 1203 1204 max.freq = rps->max_freq; 1205 max.power = measure_power_at(rps, &max.freq); 1206 1207 min.freq = rps->min_freq; 1208 min.power = measure_power_at(rps, &min.freq); 1209 1210 igt_spinner_end(&spin); 1211 engine_heartbeat_enable(engine, saved_heartbeat); 1212 1213 pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n", 1214 engine->name, 1215 min.power, intel_gpu_freq(rps, min.freq), 1216 max.power, intel_gpu_freq(rps, max.freq)); 1217 1218 if (10 * min.freq >= 9 * max.freq) { 1219 pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMhz]\n", 1220 min.freq, intel_gpu_freq(rps, min.freq), 1221 max.freq, intel_gpu_freq(rps, max.freq)); 1222 continue; 1223 } 1224 1225 if (11 * min.power > 10 * max.power) { 1226 pr_err("%s: did not conserve power when setting lower frequency!\n", 1227 engine->name); 1228 err = -EINVAL; 1229 break; 1230 } 1231 1232 if (igt_flush_test(gt->i915)) { 1233 err = -EIO; 1234 break; 1235 } 1236 } 1237 1238 igt_spinner_fini(&spin); 1239 1240 intel_gt_pm_wait_for_idle(gt); 1241 rps->work.func = saved_work; 1242 1243 return err; 1244 } 1245 1246 int live_rps_dynamic(void *arg) 1247 { 1248 struct intel_gt *gt = arg; 1249 struct intel_rps *rps = >->rps; 1250 struct intel_engine_cs *engine; 1251 enum intel_engine_id id; 1252 struct igt_spinner spin; 1253 int err = 0; 1254 1255 /* 1256 * We've looked at the bascs, and have established that we 1257 * can change the clock frequency and that the HW will generate 1258 * interrupts based on load. Now we check how we integrate those 1259 * moving parts into dynamic reclocking based on load. 1260 */ 1261 1262 if (!intel_rps_is_enabled(rps)) 1263 return 0; 1264 1265 if (igt_spinner_init(&spin, gt)) 1266 return -ENOMEM; 1267 1268 for_each_engine(engine, gt, id) { 1269 struct i915_request *rq; 1270 struct { 1271 ktime_t dt; 1272 u8 freq; 1273 } min, max; 1274 1275 if (!intel_engine_can_store_dword(engine)) 1276 continue; 1277 1278 intel_gt_pm_wait_for_idle(gt); 1279 GEM_BUG_ON(intel_rps_is_active(rps)); 1280 rps->cur_freq = rps->min_freq; 1281 1282 intel_engine_pm_get(engine); 1283 intel_rc6_disable(>->rc6); 1284 GEM_BUG_ON(rps->last_freq != rps->min_freq); 1285 1286 rq = igt_spinner_create_request(&spin, 1287 engine->kernel_context, 1288 MI_NOOP); 1289 if (IS_ERR(rq)) { 1290 err = PTR_ERR(rq); 1291 goto err; 1292 } 1293 1294 i915_request_add(rq); 1295 1296 max.dt = ktime_get(); 1297 max.freq = wait_for_freq(rps, rps->max_freq, 500); 1298 max.dt = ktime_sub(ktime_get(), max.dt); 1299 1300 igt_spinner_end(&spin); 1301 1302 min.dt = ktime_get(); 1303 min.freq = wait_for_freq(rps, rps->min_freq, 2000); 1304 min.dt = ktime_sub(ktime_get(), min.dt); 1305 1306 pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n", 1307 engine->name, 1308 max.freq, intel_gpu_freq(rps, max.freq), 1309 ktime_to_ns(max.dt), 1310 min.freq, intel_gpu_freq(rps, min.freq), 1311 ktime_to_ns(min.dt)); 1312 if (min.freq >= max.freq) { 1313 pr_err("%s: dynamic reclocking of spinner failed\n!", 1314 engine->name); 1315 err = -EINVAL; 1316 } 1317 1318 err: 1319 intel_rc6_enable(>->rc6); 1320 intel_engine_pm_put(engine); 1321 1322 if (igt_flush_test(gt->i915)) 1323 err = -EIO; 1324 if (err) 1325 break; 1326 } 1327 1328 igt_spinner_fini(&spin); 1329 1330 return err; 1331 } 1332