1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2020 Intel Corporation 4 */ 5 6 #include <linux/pm_qos.h> 7 #include <linux/sort.h> 8 9 #include "intel_engine_heartbeat.h" 10 #include "intel_engine_pm.h" 11 #include "intel_gpu_commands.h" 12 #include "intel_gt_clock_utils.h" 13 #include "intel_gt_pm.h" 14 #include "intel_rc6.h" 15 #include "selftest_engine_heartbeat.h" 16 #include "selftest_rps.h" 17 #include "selftests/igt_flush_test.h" 18 #include "selftests/igt_spinner.h" 19 #include "selftests/librapl.h" 20 21 /* Try to isolate the impact of cstates from determing frequency response */ 22 #define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */ 23 24 static void dummy_rps_work(struct work_struct *wrk) 25 { 26 } 27 28 static int cmp_u64(const void *A, const void *B) 29 { 30 const u64 *a = A, *b = B; 31 32 if (*a < *b) 33 return -1; 34 else if (*a > *b) 35 return 1; 36 else 37 return 0; 38 } 39 40 static int cmp_u32(const void *A, const void *B) 41 { 42 const u32 *a = A, *b = B; 43 44 if (*a < *b) 45 return -1; 46 else if (*a > *b) 47 return 1; 48 else 49 return 0; 50 } 51 52 static struct i915_vma * 53 create_spin_counter(struct intel_engine_cs *engine, 54 struct i915_address_space *vm, 55 bool srm, 56 u32 **cancel, 57 u32 **counter) 58 { 59 enum { 60 COUNT, 61 INC, 62 __NGPR__, 63 }; 64 #define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x) 65 struct drm_i915_gem_object *obj; 66 struct i915_vma *vma; 67 unsigned long end; 68 u32 *base, *cs; 69 int loop, i; 70 int err; 71 72 obj = i915_gem_object_create_internal(vm->i915, 64 << 10); 73 if (IS_ERR(obj)) 74 return ERR_CAST(obj); 75 76 end = obj->base.size / sizeof(u32) - 1; 77 78 vma = i915_vma_instance(obj, vm, NULL); 79 if (IS_ERR(vma)) { 80 err = PTR_ERR(vma); 81 goto err_put; 82 } 83 84 err = i915_vma_pin(vma, 0, 0, PIN_USER); 85 if (err) 86 goto err_unlock; 87 88 i915_vma_lock(vma); 89 90 base = i915_gem_object_pin_map(obj, I915_MAP_WC); 91 if (IS_ERR(base)) { 92 err = PTR_ERR(base); 93 goto err_unpin; 94 } 95 cs = base; 96 97 *cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2); 98 for (i = 0; i < __NGPR__; i++) { 99 *cs++ = i915_mmio_reg_offset(CS_GPR(i)); 100 *cs++ = 0; 101 *cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4; 102 *cs++ = 0; 103 } 104 105 *cs++ = MI_LOAD_REGISTER_IMM(1); 106 *cs++ = i915_mmio_reg_offset(CS_GPR(INC)); 107 *cs++ = 1; 108 109 loop = cs - base; 110 111 /* Unroll the loop to avoid MI_BB_START stalls impacting measurements */ 112 for (i = 0; i < 1024; i++) { 113 *cs++ = MI_MATH(4); 114 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT)); 115 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC)); 116 *cs++ = MI_MATH_ADD; 117 *cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU); 118 119 if (srm) { 120 *cs++ = MI_STORE_REGISTER_MEM_GEN8; 121 *cs++ = i915_mmio_reg_offset(CS_GPR(COUNT)); 122 *cs++ = lower_32_bits(vma->node.start + end * sizeof(*cs)); 123 *cs++ = upper_32_bits(vma->node.start + end * sizeof(*cs)); 124 } 125 } 126 127 *cs++ = MI_BATCH_BUFFER_START_GEN8; 128 *cs++ = lower_32_bits(vma->node.start + loop * sizeof(*cs)); 129 *cs++ = upper_32_bits(vma->node.start + loop * sizeof(*cs)); 130 GEM_BUG_ON(cs - base > end); 131 132 i915_gem_object_flush_map(obj); 133 134 *cancel = base + loop; 135 *counter = srm ? memset32(base + end, 0, 1) : NULL; 136 return vma; 137 138 err_unpin: 139 i915_vma_unpin(vma); 140 err_unlock: 141 i915_vma_unlock(vma); 142 err_put: 143 i915_gem_object_put(obj); 144 return ERR_PTR(err); 145 } 146 147 static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms) 148 { 149 u8 history[64], i; 150 unsigned long end; 151 int sleep; 152 153 i = 0; 154 memset(history, freq, sizeof(history)); 155 sleep = 20; 156 157 /* The PCU does not change instantly, but drifts towards the goal? */ 158 end = jiffies + msecs_to_jiffies(timeout_ms); 159 do { 160 u8 act; 161 162 act = read_cagf(rps); 163 if (time_after(jiffies, end)) 164 return act; 165 166 /* Target acquired */ 167 if (act == freq) 168 return act; 169 170 /* Any change within the last N samples? */ 171 if (!memchr_inv(history, act, sizeof(history))) 172 return act; 173 174 history[i] = act; 175 i = (i + 1) % ARRAY_SIZE(history); 176 177 usleep_range(sleep, 2 * sleep); 178 sleep *= 2; 179 if (sleep > timeout_ms * 20) 180 sleep = timeout_ms * 20; 181 } while (1); 182 } 183 184 static u8 rps_set_check(struct intel_rps *rps, u8 freq) 185 { 186 mutex_lock(&rps->lock); 187 GEM_BUG_ON(!intel_rps_is_active(rps)); 188 intel_rps_set(rps, freq); 189 GEM_BUG_ON(rps->last_freq != freq); 190 mutex_unlock(&rps->lock); 191 192 return wait_for_freq(rps, freq, 50); 193 } 194 195 static void show_pstate_limits(struct intel_rps *rps) 196 { 197 struct drm_i915_private *i915 = rps_to_i915(rps); 198 199 if (IS_BROXTON(i915)) { 200 pr_info("P_STATE_CAP[%x]: 0x%08x\n", 201 i915_mmio_reg_offset(BXT_RP_STATE_CAP), 202 intel_uncore_read(rps_to_uncore(rps), 203 BXT_RP_STATE_CAP)); 204 } else if (IS_GEN(i915, 9)) { 205 pr_info("P_STATE_LIMITS[%x]: 0x%08x\n", 206 i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS), 207 intel_uncore_read(rps_to_uncore(rps), 208 GEN9_RP_STATE_LIMITS)); 209 } 210 } 211 212 int live_rps_clock_interval(void *arg) 213 { 214 struct intel_gt *gt = arg; 215 struct intel_rps *rps = >->rps; 216 void (*saved_work)(struct work_struct *wrk); 217 struct intel_engine_cs *engine; 218 enum intel_engine_id id; 219 struct igt_spinner spin; 220 int err = 0; 221 222 if (!intel_rps_is_enabled(rps)) 223 return 0; 224 225 if (igt_spinner_init(&spin, gt)) 226 return -ENOMEM; 227 228 intel_gt_pm_wait_for_idle(gt); 229 saved_work = rps->work.func; 230 rps->work.func = dummy_rps_work; 231 232 intel_gt_pm_get(gt); 233 intel_rps_disable(>->rps); 234 235 intel_gt_check_clock_frequency(gt); 236 237 for_each_engine(engine, gt, id) { 238 struct i915_request *rq; 239 u32 cycles; 240 u64 dt; 241 242 if (!intel_engine_can_store_dword(engine)) 243 continue; 244 245 st_engine_heartbeat_disable(engine); 246 247 rq = igt_spinner_create_request(&spin, 248 engine->kernel_context, 249 MI_NOOP); 250 if (IS_ERR(rq)) { 251 st_engine_heartbeat_enable(engine); 252 err = PTR_ERR(rq); 253 break; 254 } 255 256 i915_request_add(rq); 257 258 if (!igt_wait_for_spinner(&spin, rq)) { 259 pr_err("%s: RPS spinner did not start\n", 260 engine->name); 261 igt_spinner_end(&spin); 262 st_engine_heartbeat_enable(engine); 263 intel_gt_set_wedged(engine->gt); 264 err = -EIO; 265 break; 266 } 267 268 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 269 270 intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0); 271 272 /* Set the evaluation interval to infinity! */ 273 intel_uncore_write_fw(gt->uncore, 274 GEN6_RP_UP_EI, 0xffffffff); 275 intel_uncore_write_fw(gt->uncore, 276 GEN6_RP_UP_THRESHOLD, 0xffffffff); 277 278 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 279 GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG); 280 281 if (wait_for(intel_uncore_read_fw(gt->uncore, 282 GEN6_RP_CUR_UP_EI), 283 10)) { 284 /* Just skip the test; assume lack of HW support */ 285 pr_notice("%s: rps evaluation interval not ticking\n", 286 engine->name); 287 err = -ENODEV; 288 } else { 289 ktime_t dt_[5]; 290 u32 cycles_[5]; 291 int i; 292 293 for (i = 0; i < 5; i++) { 294 preempt_disable(); 295 296 dt_[i] = ktime_get(); 297 cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI); 298 299 udelay(1000); 300 301 dt_[i] = ktime_sub(ktime_get(), dt_[i]); 302 cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI); 303 304 preempt_enable(); 305 } 306 307 /* Use the median of both cycle/dt; close enough */ 308 sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL); 309 cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4; 310 sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL); 311 dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4); 312 } 313 314 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0); 315 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 316 317 igt_spinner_end(&spin); 318 st_engine_heartbeat_enable(engine); 319 320 if (err == 0) { 321 u64 time = intel_gt_pm_interval_to_ns(gt, cycles); 322 u32 expected = 323 intel_gt_ns_to_pm_interval(gt, dt); 324 325 pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n", 326 engine->name, cycles, time, dt, expected, 327 gt->clock_frequency / 1000); 328 329 if (10 * time < 8 * dt || 330 8 * time > 10 * dt) { 331 pr_err("%s: rps clock time does not match walltime!\n", 332 engine->name); 333 err = -EINVAL; 334 } 335 336 if (10 * expected < 8 * cycles || 337 8 * expected > 10 * cycles) { 338 pr_err("%s: walltime does not match rps clock ticks!\n", 339 engine->name); 340 err = -EINVAL; 341 } 342 } 343 344 if (igt_flush_test(gt->i915)) 345 err = -EIO; 346 347 break; /* once is enough */ 348 } 349 350 intel_rps_enable(>->rps); 351 intel_gt_pm_put(gt); 352 353 igt_spinner_fini(&spin); 354 355 intel_gt_pm_wait_for_idle(gt); 356 rps->work.func = saved_work; 357 358 if (err == -ENODEV) /* skipped, don't report a fail */ 359 err = 0; 360 361 return err; 362 } 363 364 int live_rps_control(void *arg) 365 { 366 struct intel_gt *gt = arg; 367 struct intel_rps *rps = >->rps; 368 void (*saved_work)(struct work_struct *wrk); 369 struct intel_engine_cs *engine; 370 enum intel_engine_id id; 371 struct igt_spinner spin; 372 int err = 0; 373 374 /* 375 * Check that the actual frequency matches our requested frequency, 376 * to verify our control mechanism. We have to be careful that the 377 * PCU may throttle the GPU in which case the actual frequency used 378 * will be lowered than requested. 379 */ 380 381 if (!intel_rps_is_enabled(rps)) 382 return 0; 383 384 if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */ 385 return 0; 386 387 if (igt_spinner_init(&spin, gt)) 388 return -ENOMEM; 389 390 intel_gt_pm_wait_for_idle(gt); 391 saved_work = rps->work.func; 392 rps->work.func = dummy_rps_work; 393 394 intel_gt_pm_get(gt); 395 for_each_engine(engine, gt, id) { 396 struct i915_request *rq; 397 ktime_t min_dt, max_dt; 398 int f, limit; 399 int min, max; 400 401 if (!intel_engine_can_store_dword(engine)) 402 continue; 403 404 st_engine_heartbeat_disable(engine); 405 406 rq = igt_spinner_create_request(&spin, 407 engine->kernel_context, 408 MI_NOOP); 409 if (IS_ERR(rq)) { 410 err = PTR_ERR(rq); 411 break; 412 } 413 414 i915_request_add(rq); 415 416 if (!igt_wait_for_spinner(&spin, rq)) { 417 pr_err("%s: RPS spinner did not start\n", 418 engine->name); 419 igt_spinner_end(&spin); 420 st_engine_heartbeat_enable(engine); 421 intel_gt_set_wedged(engine->gt); 422 err = -EIO; 423 break; 424 } 425 426 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) { 427 pr_err("%s: could not set minimum frequency [%x], only %x!\n", 428 engine->name, rps->min_freq, read_cagf(rps)); 429 igt_spinner_end(&spin); 430 st_engine_heartbeat_enable(engine); 431 show_pstate_limits(rps); 432 err = -EINVAL; 433 break; 434 } 435 436 for (f = rps->min_freq + 1; f < rps->max_freq; f++) { 437 if (rps_set_check(rps, f) < f) 438 break; 439 } 440 441 limit = rps_set_check(rps, f); 442 443 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) { 444 pr_err("%s: could not restore minimum frequency [%x], only %x!\n", 445 engine->name, rps->min_freq, read_cagf(rps)); 446 igt_spinner_end(&spin); 447 st_engine_heartbeat_enable(engine); 448 show_pstate_limits(rps); 449 err = -EINVAL; 450 break; 451 } 452 453 max_dt = ktime_get(); 454 max = rps_set_check(rps, limit); 455 max_dt = ktime_sub(ktime_get(), max_dt); 456 457 min_dt = ktime_get(); 458 min = rps_set_check(rps, rps->min_freq); 459 min_dt = ktime_sub(ktime_get(), min_dt); 460 461 igt_spinner_end(&spin); 462 st_engine_heartbeat_enable(engine); 463 464 pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n", 465 engine->name, 466 rps->min_freq, intel_gpu_freq(rps, rps->min_freq), 467 rps->max_freq, intel_gpu_freq(rps, rps->max_freq), 468 limit, intel_gpu_freq(rps, limit), 469 min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt)); 470 471 if (limit == rps->min_freq) { 472 pr_err("%s: GPU throttled to minimum!\n", 473 engine->name); 474 show_pstate_limits(rps); 475 err = -ENODEV; 476 break; 477 } 478 479 if (igt_flush_test(gt->i915)) { 480 err = -EIO; 481 break; 482 } 483 } 484 intel_gt_pm_put(gt); 485 486 igt_spinner_fini(&spin); 487 488 intel_gt_pm_wait_for_idle(gt); 489 rps->work.func = saved_work; 490 491 return err; 492 } 493 494 static void show_pcu_config(struct intel_rps *rps) 495 { 496 struct drm_i915_private *i915 = rps_to_i915(rps); 497 unsigned int max_gpu_freq, min_gpu_freq; 498 intel_wakeref_t wakeref; 499 int gpu_freq; 500 501 if (!HAS_LLC(i915)) 502 return; 503 504 min_gpu_freq = rps->min_freq; 505 max_gpu_freq = rps->max_freq; 506 if (INTEL_GEN(i915) >= 9) { 507 /* Convert GT frequency to 50 HZ units */ 508 min_gpu_freq /= GEN9_FREQ_SCALER; 509 max_gpu_freq /= GEN9_FREQ_SCALER; 510 } 511 512 wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm); 513 514 pr_info("%5s %5s %5s\n", "GPU", "eCPU", "eRing"); 515 for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) { 516 int ia_freq = gpu_freq; 517 518 sandybridge_pcode_read(i915, 519 GEN6_PCODE_READ_MIN_FREQ_TABLE, 520 &ia_freq, NULL); 521 522 pr_info("%5d %5d %5d\n", 523 gpu_freq * 50, 524 ((ia_freq >> 0) & 0xff) * 100, 525 ((ia_freq >> 8) & 0xff) * 100); 526 } 527 528 intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref); 529 } 530 531 static u64 __measure_frequency(u32 *cntr, int duration_ms) 532 { 533 u64 dc, dt; 534 535 dt = ktime_get(); 536 dc = READ_ONCE(*cntr); 537 usleep_range(1000 * duration_ms, 2000 * duration_ms); 538 dc = READ_ONCE(*cntr) - dc; 539 dt = ktime_get() - dt; 540 541 return div64_u64(1000 * 1000 * dc, dt); 542 } 543 544 static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq) 545 { 546 u64 x[5]; 547 int i; 548 549 *freq = rps_set_check(rps, *freq); 550 for (i = 0; i < 5; i++) 551 x[i] = __measure_frequency(cntr, 2); 552 *freq = (*freq + read_cagf(rps)) / 2; 553 554 /* A simple triangle filter for better result stability */ 555 sort(x, 5, sizeof(*x), cmp_u64, NULL); 556 return div_u64(x[1] + 2 * x[2] + x[3], 4); 557 } 558 559 static u64 __measure_cs_frequency(struct intel_engine_cs *engine, 560 int duration_ms) 561 { 562 u64 dc, dt; 563 564 dt = ktime_get(); 565 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)); 566 usleep_range(1000 * duration_ms, 2000 * duration_ms); 567 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc; 568 dt = ktime_get() - dt; 569 570 return div64_u64(1000 * 1000 * dc, dt); 571 } 572 573 static u64 measure_cs_frequency_at(struct intel_rps *rps, 574 struct intel_engine_cs *engine, 575 int *freq) 576 { 577 u64 x[5]; 578 int i; 579 580 *freq = rps_set_check(rps, *freq); 581 for (i = 0; i < 5; i++) 582 x[i] = __measure_cs_frequency(engine, 2); 583 *freq = (*freq + read_cagf(rps)) / 2; 584 585 /* A simple triangle filter for better result stability */ 586 sort(x, 5, sizeof(*x), cmp_u64, NULL); 587 return div_u64(x[1] + 2 * x[2] + x[3], 4); 588 } 589 590 static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d) 591 { 592 return f_d * x > f_n * y && f_n * x < f_d * y; 593 } 594 595 int live_rps_frequency_cs(void *arg) 596 { 597 void (*saved_work)(struct work_struct *wrk); 598 struct intel_gt *gt = arg; 599 struct intel_rps *rps = >->rps; 600 struct intel_engine_cs *engine; 601 struct pm_qos_request qos; 602 enum intel_engine_id id; 603 int err = 0; 604 605 /* 606 * The premise is that the GPU does change freqency at our behest. 607 * Let's check there is a correspondence between the requested 608 * frequency, the actual frequency, and the observed clock rate. 609 */ 610 611 if (!intel_rps_is_enabled(rps)) 612 return 0; 613 614 if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */ 615 return 0; 616 617 if (CPU_LATENCY >= 0) 618 cpu_latency_qos_add_request(&qos, CPU_LATENCY); 619 620 intel_gt_pm_wait_for_idle(gt); 621 saved_work = rps->work.func; 622 rps->work.func = dummy_rps_work; 623 624 for_each_engine(engine, gt, id) { 625 struct i915_request *rq; 626 struct i915_vma *vma; 627 u32 *cancel, *cntr; 628 struct { 629 u64 count; 630 int freq; 631 } min, max; 632 633 st_engine_heartbeat_disable(engine); 634 635 vma = create_spin_counter(engine, 636 engine->kernel_context->vm, false, 637 &cancel, &cntr); 638 if (IS_ERR(vma)) { 639 err = PTR_ERR(vma); 640 st_engine_heartbeat_enable(engine); 641 break; 642 } 643 644 rq = intel_engine_create_kernel_request(engine); 645 if (IS_ERR(rq)) { 646 err = PTR_ERR(rq); 647 goto err_vma; 648 } 649 650 err = i915_request_await_object(rq, vma->obj, false); 651 if (!err) 652 err = i915_vma_move_to_active(vma, rq, 0); 653 if (!err) 654 err = rq->engine->emit_bb_start(rq, 655 vma->node.start, 656 PAGE_SIZE, 0); 657 i915_request_add(rq); 658 if (err) 659 goto err_vma; 660 661 if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)), 662 10)) { 663 pr_err("%s: timed loop did not start\n", 664 engine->name); 665 goto err_vma; 666 } 667 668 min.freq = rps->min_freq; 669 min.count = measure_cs_frequency_at(rps, engine, &min.freq); 670 671 max.freq = rps->max_freq; 672 max.count = measure_cs_frequency_at(rps, engine, &max.freq); 673 674 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n", 675 engine->name, 676 min.count, intel_gpu_freq(rps, min.freq), 677 max.count, intel_gpu_freq(rps, max.freq), 678 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count, 679 max.freq * min.count)); 680 681 if (!scaled_within(max.freq * min.count, 682 min.freq * max.count, 683 2, 3)) { 684 int f; 685 686 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n", 687 engine->name, 688 max.freq * min.count, 689 min.freq * max.count); 690 show_pcu_config(rps); 691 692 for (f = min.freq + 1; f <= rps->max_freq; f++) { 693 int act = f; 694 u64 count; 695 696 count = measure_cs_frequency_at(rps, engine, &act); 697 if (act < f) 698 break; 699 700 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n", 701 engine->name, 702 act, intel_gpu_freq(rps, act), count, 703 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count, 704 act * min.count)); 705 706 f = act; /* may skip ahead [pcu granularity] */ 707 } 708 709 err = -EINTR; /* ignore error, continue on with test */ 710 } 711 712 err_vma: 713 *cancel = MI_BATCH_BUFFER_END; 714 i915_gem_object_flush_map(vma->obj); 715 i915_gem_object_unpin_map(vma->obj); 716 i915_vma_unpin(vma); 717 i915_vma_unlock(vma); 718 i915_vma_put(vma); 719 720 st_engine_heartbeat_enable(engine); 721 if (igt_flush_test(gt->i915)) 722 err = -EIO; 723 if (err) 724 break; 725 } 726 727 intel_gt_pm_wait_for_idle(gt); 728 rps->work.func = saved_work; 729 730 if (CPU_LATENCY >= 0) 731 cpu_latency_qos_remove_request(&qos); 732 733 return err; 734 } 735 736 int live_rps_frequency_srm(void *arg) 737 { 738 void (*saved_work)(struct work_struct *wrk); 739 struct intel_gt *gt = arg; 740 struct intel_rps *rps = >->rps; 741 struct intel_engine_cs *engine; 742 struct pm_qos_request qos; 743 enum intel_engine_id id; 744 int err = 0; 745 746 /* 747 * The premise is that the GPU does change freqency at our behest. 748 * Let's check there is a correspondence between the requested 749 * frequency, the actual frequency, and the observed clock rate. 750 */ 751 752 if (!intel_rps_is_enabled(rps)) 753 return 0; 754 755 if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */ 756 return 0; 757 758 if (CPU_LATENCY >= 0) 759 cpu_latency_qos_add_request(&qos, CPU_LATENCY); 760 761 intel_gt_pm_wait_for_idle(gt); 762 saved_work = rps->work.func; 763 rps->work.func = dummy_rps_work; 764 765 for_each_engine(engine, gt, id) { 766 struct i915_request *rq; 767 struct i915_vma *vma; 768 u32 *cancel, *cntr; 769 struct { 770 u64 count; 771 int freq; 772 } min, max; 773 774 st_engine_heartbeat_disable(engine); 775 776 vma = create_spin_counter(engine, 777 engine->kernel_context->vm, true, 778 &cancel, &cntr); 779 if (IS_ERR(vma)) { 780 err = PTR_ERR(vma); 781 st_engine_heartbeat_enable(engine); 782 break; 783 } 784 785 rq = intel_engine_create_kernel_request(engine); 786 if (IS_ERR(rq)) { 787 err = PTR_ERR(rq); 788 goto err_vma; 789 } 790 791 err = i915_request_await_object(rq, vma->obj, false); 792 if (!err) 793 err = i915_vma_move_to_active(vma, rq, 0); 794 if (!err) 795 err = rq->engine->emit_bb_start(rq, 796 vma->node.start, 797 PAGE_SIZE, 0); 798 i915_request_add(rq); 799 if (err) 800 goto err_vma; 801 802 if (wait_for(READ_ONCE(*cntr), 10)) { 803 pr_err("%s: timed loop did not start\n", 804 engine->name); 805 goto err_vma; 806 } 807 808 min.freq = rps->min_freq; 809 min.count = measure_frequency_at(rps, cntr, &min.freq); 810 811 max.freq = rps->max_freq; 812 max.count = measure_frequency_at(rps, cntr, &max.freq); 813 814 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n", 815 engine->name, 816 min.count, intel_gpu_freq(rps, min.freq), 817 max.count, intel_gpu_freq(rps, max.freq), 818 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count, 819 max.freq * min.count)); 820 821 if (!scaled_within(max.freq * min.count, 822 min.freq * max.count, 823 1, 2)) { 824 int f; 825 826 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n", 827 engine->name, 828 max.freq * min.count, 829 min.freq * max.count); 830 show_pcu_config(rps); 831 832 for (f = min.freq + 1; f <= rps->max_freq; f++) { 833 int act = f; 834 u64 count; 835 836 count = measure_frequency_at(rps, cntr, &act); 837 if (act < f) 838 break; 839 840 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n", 841 engine->name, 842 act, intel_gpu_freq(rps, act), count, 843 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count, 844 act * min.count)); 845 846 f = act; /* may skip ahead [pcu granularity] */ 847 } 848 849 err = -EINTR; /* ignore error, continue on with test */ 850 } 851 852 err_vma: 853 *cancel = MI_BATCH_BUFFER_END; 854 i915_gem_object_flush_map(vma->obj); 855 i915_gem_object_unpin_map(vma->obj); 856 i915_vma_unpin(vma); 857 i915_vma_unlock(vma); 858 i915_vma_put(vma); 859 860 st_engine_heartbeat_enable(engine); 861 if (igt_flush_test(gt->i915)) 862 err = -EIO; 863 if (err) 864 break; 865 } 866 867 intel_gt_pm_wait_for_idle(gt); 868 rps->work.func = saved_work; 869 870 if (CPU_LATENCY >= 0) 871 cpu_latency_qos_remove_request(&qos); 872 873 return err; 874 } 875 876 static void sleep_for_ei(struct intel_rps *rps, int timeout_us) 877 { 878 /* Flush any previous EI */ 879 usleep_range(timeout_us, 2 * timeout_us); 880 881 /* Reset the interrupt status */ 882 rps_disable_interrupts(rps); 883 GEM_BUG_ON(rps->pm_iir); 884 rps_enable_interrupts(rps); 885 886 /* And then wait for the timeout, for real this time */ 887 usleep_range(2 * timeout_us, 3 * timeout_us); 888 } 889 890 static int __rps_up_interrupt(struct intel_rps *rps, 891 struct intel_engine_cs *engine, 892 struct igt_spinner *spin) 893 { 894 struct intel_uncore *uncore = engine->uncore; 895 struct i915_request *rq; 896 u32 timeout; 897 898 if (!intel_engine_can_store_dword(engine)) 899 return 0; 900 901 rps_set_check(rps, rps->min_freq); 902 903 rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP); 904 if (IS_ERR(rq)) 905 return PTR_ERR(rq); 906 907 i915_request_get(rq); 908 i915_request_add(rq); 909 910 if (!igt_wait_for_spinner(spin, rq)) { 911 pr_err("%s: RPS spinner did not start\n", 912 engine->name); 913 i915_request_put(rq); 914 intel_gt_set_wedged(engine->gt); 915 return -EIO; 916 } 917 918 if (!intel_rps_is_active(rps)) { 919 pr_err("%s: RPS not enabled on starting spinner\n", 920 engine->name); 921 igt_spinner_end(spin); 922 i915_request_put(rq); 923 return -EINVAL; 924 } 925 926 if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) { 927 pr_err("%s: RPS did not register UP interrupt\n", 928 engine->name); 929 i915_request_put(rq); 930 return -EINVAL; 931 } 932 933 if (rps->last_freq != rps->min_freq) { 934 pr_err("%s: RPS did not program min frequency\n", 935 engine->name); 936 i915_request_put(rq); 937 return -EINVAL; 938 } 939 940 timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI); 941 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout); 942 timeout = DIV_ROUND_UP(timeout, 1000); 943 944 sleep_for_ei(rps, timeout); 945 GEM_BUG_ON(i915_request_completed(rq)); 946 947 igt_spinner_end(spin); 948 i915_request_put(rq); 949 950 if (rps->cur_freq != rps->min_freq) { 951 pr_err("%s: Frequency unexpectedly changed [up], now %d!\n", 952 engine->name, intel_rps_read_actual_frequency(rps)); 953 return -EINVAL; 954 } 955 956 if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) { 957 pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n", 958 engine->name, rps->pm_iir, 959 intel_uncore_read(uncore, GEN6_RP_PREV_UP), 960 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD), 961 intel_uncore_read(uncore, GEN6_RP_UP_EI)); 962 return -EINVAL; 963 } 964 965 return 0; 966 } 967 968 static int __rps_down_interrupt(struct intel_rps *rps, 969 struct intel_engine_cs *engine) 970 { 971 struct intel_uncore *uncore = engine->uncore; 972 u32 timeout; 973 974 rps_set_check(rps, rps->max_freq); 975 976 if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) { 977 pr_err("%s: RPS did not register DOWN interrupt\n", 978 engine->name); 979 return -EINVAL; 980 } 981 982 if (rps->last_freq != rps->max_freq) { 983 pr_err("%s: RPS did not program max frequency\n", 984 engine->name); 985 return -EINVAL; 986 } 987 988 timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI); 989 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout); 990 timeout = DIV_ROUND_UP(timeout, 1000); 991 992 sleep_for_ei(rps, timeout); 993 994 if (rps->cur_freq != rps->max_freq) { 995 pr_err("%s: Frequency unexpectedly changed [down], now %d!\n", 996 engine->name, 997 intel_rps_read_actual_frequency(rps)); 998 return -EINVAL; 999 } 1000 1001 if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) { 1002 pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n", 1003 engine->name, rps->pm_iir, 1004 intel_uncore_read(uncore, GEN6_RP_PREV_DOWN), 1005 intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD), 1006 intel_uncore_read(uncore, GEN6_RP_DOWN_EI), 1007 intel_uncore_read(uncore, GEN6_RP_PREV_UP), 1008 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD), 1009 intel_uncore_read(uncore, GEN6_RP_UP_EI)); 1010 return -EINVAL; 1011 } 1012 1013 return 0; 1014 } 1015 1016 int live_rps_interrupt(void *arg) 1017 { 1018 struct intel_gt *gt = arg; 1019 struct intel_rps *rps = >->rps; 1020 void (*saved_work)(struct work_struct *wrk); 1021 struct intel_engine_cs *engine; 1022 enum intel_engine_id id; 1023 struct igt_spinner spin; 1024 u32 pm_events; 1025 int err = 0; 1026 1027 /* 1028 * First, let's check whether or not we are receiving interrupts. 1029 */ 1030 1031 if (!intel_rps_has_interrupts(rps)) 1032 return 0; 1033 1034 intel_gt_pm_get(gt); 1035 pm_events = rps->pm_events; 1036 intel_gt_pm_put(gt); 1037 if (!pm_events) { 1038 pr_err("No RPS PM events registered, but RPS is enabled?\n"); 1039 return -ENODEV; 1040 } 1041 1042 if (igt_spinner_init(&spin, gt)) 1043 return -ENOMEM; 1044 1045 intel_gt_pm_wait_for_idle(gt); 1046 saved_work = rps->work.func; 1047 rps->work.func = dummy_rps_work; 1048 1049 for_each_engine(engine, gt, id) { 1050 /* Keep the engine busy with a spinner; expect an UP! */ 1051 if (pm_events & GEN6_PM_RP_UP_THRESHOLD) { 1052 intel_gt_pm_wait_for_idle(engine->gt); 1053 GEM_BUG_ON(intel_rps_is_active(rps)); 1054 1055 st_engine_heartbeat_disable(engine); 1056 1057 err = __rps_up_interrupt(rps, engine, &spin); 1058 1059 st_engine_heartbeat_enable(engine); 1060 if (err) 1061 goto out; 1062 1063 intel_gt_pm_wait_for_idle(engine->gt); 1064 } 1065 1066 /* Keep the engine awake but idle and check for DOWN */ 1067 if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) { 1068 st_engine_heartbeat_disable(engine); 1069 intel_rc6_disable(>->rc6); 1070 1071 err = __rps_down_interrupt(rps, engine); 1072 1073 intel_rc6_enable(>->rc6); 1074 st_engine_heartbeat_enable(engine); 1075 if (err) 1076 goto out; 1077 } 1078 } 1079 1080 out: 1081 if (igt_flush_test(gt->i915)) 1082 err = -EIO; 1083 1084 igt_spinner_fini(&spin); 1085 1086 intel_gt_pm_wait_for_idle(gt); 1087 rps->work.func = saved_work; 1088 1089 return err; 1090 } 1091 1092 static u64 __measure_power(int duration_ms) 1093 { 1094 u64 dE, dt; 1095 1096 dt = ktime_get(); 1097 dE = librapl_energy_uJ(); 1098 usleep_range(1000 * duration_ms, 2000 * duration_ms); 1099 dE = librapl_energy_uJ() - dE; 1100 dt = ktime_get() - dt; 1101 1102 return div64_u64(1000 * 1000 * dE, dt); 1103 } 1104 1105 static u64 measure_power_at(struct intel_rps *rps, int *freq) 1106 { 1107 u64 x[5]; 1108 int i; 1109 1110 *freq = rps_set_check(rps, *freq); 1111 for (i = 0; i < 5; i++) 1112 x[i] = __measure_power(5); 1113 *freq = (*freq + read_cagf(rps)) / 2; 1114 1115 /* A simple triangle filter for better result stability */ 1116 sort(x, 5, sizeof(*x), cmp_u64, NULL); 1117 return div_u64(x[1] + 2 * x[2] + x[3], 4); 1118 } 1119 1120 int live_rps_power(void *arg) 1121 { 1122 struct intel_gt *gt = arg; 1123 struct intel_rps *rps = >->rps; 1124 void (*saved_work)(struct work_struct *wrk); 1125 struct intel_engine_cs *engine; 1126 enum intel_engine_id id; 1127 struct igt_spinner spin; 1128 int err = 0; 1129 1130 /* 1131 * Our fundamental assumption is that running at lower frequency 1132 * actually saves power. Let's see if our RAPL measurement support 1133 * that theory. 1134 */ 1135 1136 if (!intel_rps_is_enabled(rps)) 1137 return 0; 1138 1139 if (!librapl_energy_uJ()) 1140 return 0; 1141 1142 if (igt_spinner_init(&spin, gt)) 1143 return -ENOMEM; 1144 1145 intel_gt_pm_wait_for_idle(gt); 1146 saved_work = rps->work.func; 1147 rps->work.func = dummy_rps_work; 1148 1149 for_each_engine(engine, gt, id) { 1150 struct i915_request *rq; 1151 struct { 1152 u64 power; 1153 int freq; 1154 } min, max; 1155 1156 if (!intel_engine_can_store_dword(engine)) 1157 continue; 1158 1159 st_engine_heartbeat_disable(engine); 1160 1161 rq = igt_spinner_create_request(&spin, 1162 engine->kernel_context, 1163 MI_NOOP); 1164 if (IS_ERR(rq)) { 1165 st_engine_heartbeat_enable(engine); 1166 err = PTR_ERR(rq); 1167 break; 1168 } 1169 1170 i915_request_add(rq); 1171 1172 if (!igt_wait_for_spinner(&spin, rq)) { 1173 pr_err("%s: RPS spinner did not start\n", 1174 engine->name); 1175 igt_spinner_end(&spin); 1176 st_engine_heartbeat_enable(engine); 1177 intel_gt_set_wedged(engine->gt); 1178 err = -EIO; 1179 break; 1180 } 1181 1182 max.freq = rps->max_freq; 1183 max.power = measure_power_at(rps, &max.freq); 1184 1185 min.freq = rps->min_freq; 1186 min.power = measure_power_at(rps, &min.freq); 1187 1188 igt_spinner_end(&spin); 1189 st_engine_heartbeat_enable(engine); 1190 1191 pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n", 1192 engine->name, 1193 min.power, intel_gpu_freq(rps, min.freq), 1194 max.power, intel_gpu_freq(rps, max.freq)); 1195 1196 if (10 * min.freq >= 9 * max.freq) { 1197 pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMhz]\n", 1198 min.freq, intel_gpu_freq(rps, min.freq), 1199 max.freq, intel_gpu_freq(rps, max.freq)); 1200 continue; 1201 } 1202 1203 if (11 * min.power > 10 * max.power) { 1204 pr_err("%s: did not conserve power when setting lower frequency!\n", 1205 engine->name); 1206 err = -EINVAL; 1207 break; 1208 } 1209 1210 if (igt_flush_test(gt->i915)) { 1211 err = -EIO; 1212 break; 1213 } 1214 } 1215 1216 igt_spinner_fini(&spin); 1217 1218 intel_gt_pm_wait_for_idle(gt); 1219 rps->work.func = saved_work; 1220 1221 return err; 1222 } 1223 1224 int live_rps_dynamic(void *arg) 1225 { 1226 struct intel_gt *gt = arg; 1227 struct intel_rps *rps = >->rps; 1228 struct intel_engine_cs *engine; 1229 enum intel_engine_id id; 1230 struct igt_spinner spin; 1231 int err = 0; 1232 1233 /* 1234 * We've looked at the bascs, and have established that we 1235 * can change the clock frequency and that the HW will generate 1236 * interrupts based on load. Now we check how we integrate those 1237 * moving parts into dynamic reclocking based on load. 1238 */ 1239 1240 if (!intel_rps_is_enabled(rps)) 1241 return 0; 1242 1243 if (igt_spinner_init(&spin, gt)) 1244 return -ENOMEM; 1245 1246 if (intel_rps_has_interrupts(rps)) 1247 pr_info("RPS has interrupt support\n"); 1248 if (intel_rps_uses_timer(rps)) 1249 pr_info("RPS has timer support\n"); 1250 1251 for_each_engine(engine, gt, id) { 1252 struct i915_request *rq; 1253 struct { 1254 ktime_t dt; 1255 u8 freq; 1256 } min, max; 1257 1258 if (!intel_engine_can_store_dword(engine)) 1259 continue; 1260 1261 intel_gt_pm_wait_for_idle(gt); 1262 GEM_BUG_ON(intel_rps_is_active(rps)); 1263 rps->cur_freq = rps->min_freq; 1264 1265 intel_engine_pm_get(engine); 1266 intel_rc6_disable(>->rc6); 1267 GEM_BUG_ON(rps->last_freq != rps->min_freq); 1268 1269 rq = igt_spinner_create_request(&spin, 1270 engine->kernel_context, 1271 MI_NOOP); 1272 if (IS_ERR(rq)) { 1273 err = PTR_ERR(rq); 1274 goto err; 1275 } 1276 1277 i915_request_add(rq); 1278 1279 max.dt = ktime_get(); 1280 max.freq = wait_for_freq(rps, rps->max_freq, 500); 1281 max.dt = ktime_sub(ktime_get(), max.dt); 1282 1283 igt_spinner_end(&spin); 1284 1285 min.dt = ktime_get(); 1286 min.freq = wait_for_freq(rps, rps->min_freq, 2000); 1287 min.dt = ktime_sub(ktime_get(), min.dt); 1288 1289 pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n", 1290 engine->name, 1291 max.freq, intel_gpu_freq(rps, max.freq), 1292 ktime_to_ns(max.dt), 1293 min.freq, intel_gpu_freq(rps, min.freq), 1294 ktime_to_ns(min.dt)); 1295 if (min.freq >= max.freq) { 1296 pr_err("%s: dynamic reclocking of spinner failed\n!", 1297 engine->name); 1298 err = -EINVAL; 1299 } 1300 1301 err: 1302 intel_rc6_enable(>->rc6); 1303 intel_engine_pm_put(engine); 1304 1305 if (igt_flush_test(gt->i915)) 1306 err = -EIO; 1307 if (err) 1308 break; 1309 } 1310 1311 igt_spinner_fini(&spin); 1312 1313 return err; 1314 } 1315