1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Common code for Intel Running Average Power Limit (RAPL) support. 4 * Copyright (c) 2019, Intel Corporation. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/kernel.h> 9 #include <linux/module.h> 10 #include <linux/list.h> 11 #include <linux/types.h> 12 #include <linux/device.h> 13 #include <linux/slab.h> 14 #include <linux/log2.h> 15 #include <linux/bitmap.h> 16 #include <linux/delay.h> 17 #include <linux/sysfs.h> 18 #include <linux/cpu.h> 19 #include <linux/powercap.h> 20 #include <linux/suspend.h> 21 #include <linux/intel_rapl.h> 22 #include <linux/processor.h> 23 #include <linux/platform_device.h> 24 25 #include <asm/iosf_mbi.h> 26 #include <asm/cpu_device_id.h> 27 #include <asm/intel-family.h> 28 29 /* bitmasks for RAPL MSRs, used by primitive access functions */ 30 #define ENERGY_STATUS_MASK 0xffffffff 31 32 #define POWER_LIMIT1_MASK 0x7FFF 33 #define POWER_LIMIT1_ENABLE BIT(15) 34 #define POWER_LIMIT1_CLAMP BIT(16) 35 36 #define POWER_LIMIT2_MASK (0x7FFFULL<<32) 37 #define POWER_LIMIT2_ENABLE BIT_ULL(47) 38 #define POWER_LIMIT2_CLAMP BIT_ULL(48) 39 #define POWER_HIGH_LOCK BIT_ULL(63) 40 #define POWER_LOW_LOCK BIT(31) 41 42 #define POWER_LIMIT4_MASK 0x1FFF 43 44 #define TIME_WINDOW1_MASK (0x7FULL<<17) 45 #define TIME_WINDOW2_MASK (0x7FULL<<49) 46 47 #define POWER_UNIT_OFFSET 0 48 #define POWER_UNIT_MASK 0x0F 49 50 #define ENERGY_UNIT_OFFSET 0x08 51 #define ENERGY_UNIT_MASK 0x1F00 52 53 #define TIME_UNIT_OFFSET 0x10 54 #define TIME_UNIT_MASK 0xF0000 55 56 #define POWER_INFO_MAX_MASK (0x7fffULL<<32) 57 #define POWER_INFO_MIN_MASK (0x7fffULL<<16) 58 #define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) 59 #define POWER_INFO_THERMAL_SPEC_MASK 0x7fff 60 61 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff 62 #define PP_POLICY_MASK 0x1F 63 64 /* Non HW constants */ 65 #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ 66 #define RAPL_PRIMITIVE_DUMMY BIT(2) 67 68 #define TIME_WINDOW_MAX_MSEC 40000 69 #define TIME_WINDOW_MIN_MSEC 250 70 #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ 71 enum unit_type { 72 ARBITRARY_UNIT, /* no translation */ 73 POWER_UNIT, 74 ENERGY_UNIT, 75 TIME_UNIT, 76 }; 77 78 /* per domain data, some are optional */ 79 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) 80 81 #define DOMAIN_STATE_INACTIVE BIT(0) 82 #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) 83 #define DOMAIN_STATE_BIOS_LOCKED BIT(2) 84 85 static const char pl1_name[] = "long_term"; 86 static const char pl2_name[] = "short_term"; 87 static const char pl4_name[] = "peak_power"; 88 89 #define power_zone_to_rapl_domain(_zone) \ 90 container_of(_zone, struct rapl_domain, power_zone) 91 92 struct rapl_defaults { 93 u8 floor_freq_reg_addr; 94 int (*check_unit)(struct rapl_package *rp, int cpu); 95 void (*set_floor_freq)(struct rapl_domain *rd, bool mode); 96 u64 (*compute_time_window)(struct rapl_package *rp, u64 val, 97 bool to_raw); 98 unsigned int dram_domain_energy_unit; 99 unsigned int psys_domain_energy_unit; 100 }; 101 static struct rapl_defaults *rapl_defaults; 102 103 /* Sideband MBI registers */ 104 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2) 105 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf) 106 107 #define PACKAGE_PLN_INT_SAVED BIT(0) 108 #define MAX_PRIM_NAME (32) 109 110 /* per domain data. used to describe individual knobs such that access function 111 * can be consolidated into one instead of many inline functions. 112 */ 113 struct rapl_primitive_info { 114 const char *name; 115 u64 mask; 116 int shift; 117 enum rapl_domain_reg_id id; 118 enum unit_type unit; 119 u32 flag; 120 }; 121 122 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ 123 .name = #p, \ 124 .mask = m, \ 125 .shift = s, \ 126 .id = i, \ 127 .unit = u, \ 128 .flag = f \ 129 } 130 131 static void rapl_init_domains(struct rapl_package *rp); 132 static int rapl_read_data_raw(struct rapl_domain *rd, 133 enum rapl_primitives prim, 134 bool xlate, u64 *data); 135 static int rapl_write_data_raw(struct rapl_domain *rd, 136 enum rapl_primitives prim, 137 unsigned long long value); 138 static u64 rapl_unit_xlate(struct rapl_domain *rd, 139 enum unit_type type, u64 value, int to_raw); 140 static void package_power_limit_irq_save(struct rapl_package *rp); 141 142 static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */ 143 144 static const char *const rapl_domain_names[] = { 145 "package", 146 "core", 147 "uncore", 148 "dram", 149 "psys", 150 }; 151 152 static int get_energy_counter(struct powercap_zone *power_zone, 153 u64 *energy_raw) 154 { 155 struct rapl_domain *rd; 156 u64 energy_now; 157 158 /* prevent CPU hotplug, make sure the RAPL domain does not go 159 * away while reading the counter. 160 */ 161 get_online_cpus(); 162 rd = power_zone_to_rapl_domain(power_zone); 163 164 if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { 165 *energy_raw = energy_now; 166 put_online_cpus(); 167 168 return 0; 169 } 170 put_online_cpus(); 171 172 return -EIO; 173 } 174 175 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy) 176 { 177 struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev); 178 179 *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); 180 return 0; 181 } 182 183 static int release_zone(struct powercap_zone *power_zone) 184 { 185 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 186 struct rapl_package *rp = rd->rp; 187 188 /* package zone is the last zone of a package, we can free 189 * memory here since all children has been unregistered. 190 */ 191 if (rd->id == RAPL_DOMAIN_PACKAGE) { 192 kfree(rd); 193 rp->domains = NULL; 194 } 195 196 return 0; 197 198 } 199 200 static int find_nr_power_limit(struct rapl_domain *rd) 201 { 202 int i, nr_pl = 0; 203 204 for (i = 0; i < NR_POWER_LIMITS; i++) { 205 if (rd->rpl[i].name) 206 nr_pl++; 207 } 208 209 return nr_pl; 210 } 211 212 static int set_domain_enable(struct powercap_zone *power_zone, bool mode) 213 { 214 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 215 216 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) 217 return -EACCES; 218 219 get_online_cpus(); 220 rapl_write_data_raw(rd, PL1_ENABLE, mode); 221 if (rapl_defaults->set_floor_freq) 222 rapl_defaults->set_floor_freq(rd, mode); 223 put_online_cpus(); 224 225 return 0; 226 } 227 228 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode) 229 { 230 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 231 u64 val; 232 233 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 234 *mode = false; 235 return 0; 236 } 237 get_online_cpus(); 238 if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) { 239 put_online_cpus(); 240 return -EIO; 241 } 242 *mode = val; 243 put_online_cpus(); 244 245 return 0; 246 } 247 248 /* per RAPL domain ops, in the order of rapl_domain_type */ 249 static const struct powercap_zone_ops zone_ops[] = { 250 /* RAPL_DOMAIN_PACKAGE */ 251 { 252 .get_energy_uj = get_energy_counter, 253 .get_max_energy_range_uj = get_max_energy_counter, 254 .release = release_zone, 255 .set_enable = set_domain_enable, 256 .get_enable = get_domain_enable, 257 }, 258 /* RAPL_DOMAIN_PP0 */ 259 { 260 .get_energy_uj = get_energy_counter, 261 .get_max_energy_range_uj = get_max_energy_counter, 262 .release = release_zone, 263 .set_enable = set_domain_enable, 264 .get_enable = get_domain_enable, 265 }, 266 /* RAPL_DOMAIN_PP1 */ 267 { 268 .get_energy_uj = get_energy_counter, 269 .get_max_energy_range_uj = get_max_energy_counter, 270 .release = release_zone, 271 .set_enable = set_domain_enable, 272 .get_enable = get_domain_enable, 273 }, 274 /* RAPL_DOMAIN_DRAM */ 275 { 276 .get_energy_uj = get_energy_counter, 277 .get_max_energy_range_uj = get_max_energy_counter, 278 .release = release_zone, 279 .set_enable = set_domain_enable, 280 .get_enable = get_domain_enable, 281 }, 282 /* RAPL_DOMAIN_PLATFORM */ 283 { 284 .get_energy_uj = get_energy_counter, 285 .get_max_energy_range_uj = get_max_energy_counter, 286 .release = release_zone, 287 .set_enable = set_domain_enable, 288 .get_enable = get_domain_enable, 289 }, 290 }; 291 292 /* 293 * Constraint index used by powercap can be different than power limit (PL) 294 * index in that some PLs maybe missing due to non-existent MSRs. So we 295 * need to convert here by finding the valid PLs only (name populated). 296 */ 297 static int contraint_to_pl(struct rapl_domain *rd, int cid) 298 { 299 int i, j; 300 301 for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) { 302 if ((rd->rpl[i].name) && j++ == cid) { 303 pr_debug("%s: index %d\n", __func__, i); 304 return i; 305 } 306 } 307 pr_err("Cannot find matching power limit for constraint %d\n", cid); 308 309 return -EINVAL; 310 } 311 312 static int set_power_limit(struct powercap_zone *power_zone, int cid, 313 u64 power_limit) 314 { 315 struct rapl_domain *rd; 316 struct rapl_package *rp; 317 int ret = 0; 318 int id; 319 320 get_online_cpus(); 321 rd = power_zone_to_rapl_domain(power_zone); 322 id = contraint_to_pl(rd, cid); 323 if (id < 0) { 324 ret = id; 325 goto set_exit; 326 } 327 328 rp = rd->rp; 329 330 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 331 dev_warn(&power_zone->dev, 332 "%s locked by BIOS, monitoring only\n", rd->name); 333 ret = -EACCES; 334 goto set_exit; 335 } 336 337 switch (rd->rpl[id].prim_id) { 338 case PL1_ENABLE: 339 rapl_write_data_raw(rd, POWER_LIMIT1, power_limit); 340 break; 341 case PL2_ENABLE: 342 rapl_write_data_raw(rd, POWER_LIMIT2, power_limit); 343 break; 344 case PL4_ENABLE: 345 rapl_write_data_raw(rd, POWER_LIMIT4, power_limit); 346 break; 347 default: 348 ret = -EINVAL; 349 } 350 if (!ret) 351 package_power_limit_irq_save(rp); 352 set_exit: 353 put_online_cpus(); 354 return ret; 355 } 356 357 static int get_current_power_limit(struct powercap_zone *power_zone, int cid, 358 u64 *data) 359 { 360 struct rapl_domain *rd; 361 u64 val; 362 int prim; 363 int ret = 0; 364 int id; 365 366 get_online_cpus(); 367 rd = power_zone_to_rapl_domain(power_zone); 368 id = contraint_to_pl(rd, cid); 369 if (id < 0) { 370 ret = id; 371 goto get_exit; 372 } 373 374 switch (rd->rpl[id].prim_id) { 375 case PL1_ENABLE: 376 prim = POWER_LIMIT1; 377 break; 378 case PL2_ENABLE: 379 prim = POWER_LIMIT2; 380 break; 381 case PL4_ENABLE: 382 prim = POWER_LIMIT4; 383 break; 384 default: 385 put_online_cpus(); 386 return -EINVAL; 387 } 388 if (rapl_read_data_raw(rd, prim, true, &val)) 389 ret = -EIO; 390 else 391 *data = val; 392 393 get_exit: 394 put_online_cpus(); 395 396 return ret; 397 } 398 399 static int set_time_window(struct powercap_zone *power_zone, int cid, 400 u64 window) 401 { 402 struct rapl_domain *rd; 403 int ret = 0; 404 int id; 405 406 get_online_cpus(); 407 rd = power_zone_to_rapl_domain(power_zone); 408 id = contraint_to_pl(rd, cid); 409 if (id < 0) { 410 ret = id; 411 goto set_time_exit; 412 } 413 414 switch (rd->rpl[id].prim_id) { 415 case PL1_ENABLE: 416 rapl_write_data_raw(rd, TIME_WINDOW1, window); 417 break; 418 case PL2_ENABLE: 419 rapl_write_data_raw(rd, TIME_WINDOW2, window); 420 break; 421 default: 422 ret = -EINVAL; 423 } 424 425 set_time_exit: 426 put_online_cpus(); 427 return ret; 428 } 429 430 static int get_time_window(struct powercap_zone *power_zone, int cid, 431 u64 *data) 432 { 433 struct rapl_domain *rd; 434 u64 val; 435 int ret = 0; 436 int id; 437 438 get_online_cpus(); 439 rd = power_zone_to_rapl_domain(power_zone); 440 id = contraint_to_pl(rd, cid); 441 if (id < 0) { 442 ret = id; 443 goto get_time_exit; 444 } 445 446 switch (rd->rpl[id].prim_id) { 447 case PL1_ENABLE: 448 ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val); 449 break; 450 case PL2_ENABLE: 451 ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val); 452 break; 453 case PL4_ENABLE: 454 /* 455 * Time window parameter is not applicable for PL4 entry 456 * so assigining '0' as default value. 457 */ 458 val = 0; 459 break; 460 default: 461 put_online_cpus(); 462 return -EINVAL; 463 } 464 if (!ret) 465 *data = val; 466 467 get_time_exit: 468 put_online_cpus(); 469 470 return ret; 471 } 472 473 static const char *get_constraint_name(struct powercap_zone *power_zone, 474 int cid) 475 { 476 struct rapl_domain *rd; 477 int id; 478 479 rd = power_zone_to_rapl_domain(power_zone); 480 id = contraint_to_pl(rd, cid); 481 if (id >= 0) 482 return rd->rpl[id].name; 483 484 return NULL; 485 } 486 487 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data) 488 { 489 struct rapl_domain *rd; 490 u64 val; 491 int prim; 492 int ret = 0; 493 494 get_online_cpus(); 495 rd = power_zone_to_rapl_domain(power_zone); 496 switch (rd->rpl[id].prim_id) { 497 case PL1_ENABLE: 498 prim = THERMAL_SPEC_POWER; 499 break; 500 case PL2_ENABLE: 501 prim = MAX_POWER; 502 break; 503 case PL4_ENABLE: 504 prim = MAX_POWER; 505 break; 506 default: 507 put_online_cpus(); 508 return -EINVAL; 509 } 510 if (rapl_read_data_raw(rd, prim, true, &val)) 511 ret = -EIO; 512 else 513 *data = val; 514 515 /* As a generalization rule, PL4 would be around two times PL2. */ 516 if (rd->rpl[id].prim_id == PL4_ENABLE) 517 *data = *data * 2; 518 519 put_online_cpus(); 520 521 return ret; 522 } 523 524 static const struct powercap_zone_constraint_ops constraint_ops = { 525 .set_power_limit_uw = set_power_limit, 526 .get_power_limit_uw = get_current_power_limit, 527 .set_time_window_us = set_time_window, 528 .get_time_window_us = get_time_window, 529 .get_max_power_uw = get_max_power, 530 .get_name = get_constraint_name, 531 }; 532 533 /* called after domain detection and package level data are set */ 534 static void rapl_init_domains(struct rapl_package *rp) 535 { 536 enum rapl_domain_type i; 537 enum rapl_domain_reg_id j; 538 struct rapl_domain *rd = rp->domains; 539 540 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 541 unsigned int mask = rp->domain_map & (1 << i); 542 543 if (!mask) 544 continue; 545 546 rd->rp = rp; 547 rd->name = rapl_domain_names[i]; 548 rd->id = i; 549 rd->rpl[0].prim_id = PL1_ENABLE; 550 rd->rpl[0].name = pl1_name; 551 552 /* 553 * The PL2 power domain is applicable for limits two 554 * and limits three 555 */ 556 if (rp->priv->limits[i] >= 2) { 557 rd->rpl[1].prim_id = PL2_ENABLE; 558 rd->rpl[1].name = pl2_name; 559 } 560 561 /* Enable PL4 domain if the total power limits are three */ 562 if (rp->priv->limits[i] == 3) { 563 rd->rpl[2].prim_id = PL4_ENABLE; 564 rd->rpl[2].name = pl4_name; 565 } 566 567 for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++) 568 rd->regs[j] = rp->priv->regs[i][j]; 569 570 switch (i) { 571 case RAPL_DOMAIN_DRAM: 572 rd->domain_energy_unit = 573 rapl_defaults->dram_domain_energy_unit; 574 if (rd->domain_energy_unit) 575 pr_info("DRAM domain energy unit %dpj\n", 576 rd->domain_energy_unit); 577 break; 578 case RAPL_DOMAIN_PLATFORM: 579 rd->domain_energy_unit = 580 rapl_defaults->psys_domain_energy_unit; 581 if (rd->domain_energy_unit) 582 pr_info("Platform domain energy unit %dpj\n", 583 rd->domain_energy_unit); 584 break; 585 default: 586 break; 587 } 588 rd++; 589 } 590 } 591 592 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, 593 u64 value, int to_raw) 594 { 595 u64 units = 1; 596 struct rapl_package *rp = rd->rp; 597 u64 scale = 1; 598 599 switch (type) { 600 case POWER_UNIT: 601 units = rp->power_unit; 602 break; 603 case ENERGY_UNIT: 604 scale = ENERGY_UNIT_SCALE; 605 /* per domain unit takes precedence */ 606 if (rd->domain_energy_unit) 607 units = rd->domain_energy_unit; 608 else 609 units = rp->energy_unit; 610 break; 611 case TIME_UNIT: 612 return rapl_defaults->compute_time_window(rp, value, to_raw); 613 case ARBITRARY_UNIT: 614 default: 615 return value; 616 }; 617 618 if (to_raw) 619 return div64_u64(value, units) * scale; 620 621 value *= units; 622 623 return div64_u64(value, scale); 624 } 625 626 /* in the order of enum rapl_primitives */ 627 static struct rapl_primitive_info rpi[] = { 628 /* name, mask, shift, msr index, unit divisor */ 629 PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, 630 RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), 631 PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, 632 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 633 PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, 634 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 635 PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0, 636 RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), 637 PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, 638 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 639 PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, 640 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 641 PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, 642 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 643 PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, 644 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 645 PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, 646 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 647 PRIMITIVE_INFO_INIT(PL4_ENABLE, POWER_LIMIT4_MASK, 0, 648 RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), 649 PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, 650 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 651 PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, 652 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 653 PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK, 654 0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 655 PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, 656 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 657 PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, 658 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 659 PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48, 660 RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), 661 PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, 662 RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), 663 PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, 664 RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), 665 /* non-hardware */ 666 PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, 667 RAPL_PRIMITIVE_DERIVED), 668 {NULL, 0, 0, 0}, 669 }; 670 671 /* Read primitive data based on its related struct rapl_primitive_info. 672 * if xlate flag is set, return translated data based on data units, i.e. 673 * time, energy, and power. 674 * RAPL MSRs are non-architectual and are laid out not consistently across 675 * domains. Here we use primitive info to allow writing consolidated access 676 * functions. 677 * For a given primitive, it is processed by MSR mask and shift. Unit conversion 678 * is pre-assigned based on RAPL unit MSRs read at init time. 679 * 63-------------------------- 31--------------------------- 0 680 * | xxxxx (mask) | 681 * | |<- shift ----------------| 682 * 63-------------------------- 31--------------------------- 0 683 */ 684 static int rapl_read_data_raw(struct rapl_domain *rd, 685 enum rapl_primitives prim, bool xlate, u64 *data) 686 { 687 u64 value; 688 struct rapl_primitive_info *rp = &rpi[prim]; 689 struct reg_action ra; 690 int cpu; 691 692 if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY) 693 return -EINVAL; 694 695 ra.reg = rd->regs[rp->id]; 696 if (!ra.reg) 697 return -EINVAL; 698 699 cpu = rd->rp->lead_cpu; 700 701 /* domain with 2 limits has different bit */ 702 if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) { 703 rp->mask = POWER_HIGH_LOCK; 704 rp->shift = 63; 705 } 706 /* non-hardware data are collected by the polling thread */ 707 if (rp->flag & RAPL_PRIMITIVE_DERIVED) { 708 *data = rd->rdd.primitives[prim]; 709 return 0; 710 } 711 712 ra.mask = rp->mask; 713 714 if (rd->rp->priv->read_raw(cpu, &ra)) { 715 pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu); 716 return -EIO; 717 } 718 719 value = ra.value >> rp->shift; 720 721 if (xlate) 722 *data = rapl_unit_xlate(rd, rp->unit, value, 0); 723 else 724 *data = value; 725 726 return 0; 727 } 728 729 /* Similar use of primitive info in the read counterpart */ 730 static int rapl_write_data_raw(struct rapl_domain *rd, 731 enum rapl_primitives prim, 732 unsigned long long value) 733 { 734 struct rapl_primitive_info *rp = &rpi[prim]; 735 int cpu; 736 u64 bits; 737 struct reg_action ra; 738 int ret; 739 740 cpu = rd->rp->lead_cpu; 741 bits = rapl_unit_xlate(rd, rp->unit, value, 1); 742 bits <<= rp->shift; 743 bits &= rp->mask; 744 745 memset(&ra, 0, sizeof(ra)); 746 747 ra.reg = rd->regs[rp->id]; 748 ra.mask = rp->mask; 749 ra.value = bits; 750 751 ret = rd->rp->priv->write_raw(cpu, &ra); 752 753 return ret; 754 } 755 756 /* 757 * Raw RAPL data stored in MSRs are in certain scales. We need to 758 * convert them into standard units based on the units reported in 759 * the RAPL unit MSRs. This is specific to CPUs as the method to 760 * calculate units differ on different CPUs. 761 * We convert the units to below format based on CPUs. 762 * i.e. 763 * energy unit: picoJoules : Represented in picoJoules by default 764 * power unit : microWatts : Represented in milliWatts by default 765 * time unit : microseconds: Represented in seconds by default 766 */ 767 static int rapl_check_unit_core(struct rapl_package *rp, int cpu) 768 { 769 struct reg_action ra; 770 u32 value; 771 772 ra.reg = rp->priv->reg_unit; 773 ra.mask = ~0; 774 if (rp->priv->read_raw(cpu, &ra)) { 775 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 776 rp->priv->reg_unit, cpu); 777 return -ENODEV; 778 } 779 780 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 781 rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value); 782 783 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 784 rp->power_unit = 1000000 / (1 << value); 785 786 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 787 rp->time_unit = 1000000 / (1 << value); 788 789 pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n", 790 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 791 792 return 0; 793 } 794 795 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) 796 { 797 struct reg_action ra; 798 u32 value; 799 800 ra.reg = rp->priv->reg_unit; 801 ra.mask = ~0; 802 if (rp->priv->read_raw(cpu, &ra)) { 803 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 804 rp->priv->reg_unit, cpu); 805 return -ENODEV; 806 } 807 808 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 809 rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value; 810 811 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 812 rp->power_unit = (1 << value) * 1000; 813 814 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 815 rp->time_unit = 1000000 / (1 << value); 816 817 pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n", 818 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 819 820 return 0; 821 } 822 823 static void power_limit_irq_save_cpu(void *info) 824 { 825 u32 l, h = 0; 826 struct rapl_package *rp = (struct rapl_package *)info; 827 828 /* save the state of PLN irq mask bit before disabling it */ 829 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 830 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) { 831 rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE; 832 rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED; 833 } 834 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 835 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 836 } 837 838 /* REVISIT: 839 * When package power limit is set artificially low by RAPL, LVT 840 * thermal interrupt for package power limit should be ignored 841 * since we are not really exceeding the real limit. The intention 842 * is to avoid excessive interrupts while we are trying to save power. 843 * A useful feature might be routing the package_power_limit interrupt 844 * to userspace via eventfd. once we have a usecase, this is simple 845 * to do by adding an atomic notifier. 846 */ 847 848 static void package_power_limit_irq_save(struct rapl_package *rp) 849 { 850 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 851 return; 852 853 smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); 854 } 855 856 /* 857 * Restore per package power limit interrupt enable state. Called from cpu 858 * hotplug code on package removal. 859 */ 860 static void package_power_limit_irq_restore(struct rapl_package *rp) 861 { 862 u32 l, h; 863 864 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 865 return; 866 867 /* irq enable state not saved, nothing to restore */ 868 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) 869 return; 870 871 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 872 873 if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE) 874 l |= PACKAGE_THERM_INT_PLN_ENABLE; 875 else 876 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 877 878 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 879 } 880 881 static void set_floor_freq_default(struct rapl_domain *rd, bool mode) 882 { 883 int nr_powerlimit = find_nr_power_limit(rd); 884 885 /* always enable clamp such that p-state can go below OS requested 886 * range. power capping priority over guranteed frequency. 887 */ 888 rapl_write_data_raw(rd, PL1_CLAMP, mode); 889 890 /* some domains have pl2 */ 891 if (nr_powerlimit > 1) { 892 rapl_write_data_raw(rd, PL2_ENABLE, mode); 893 rapl_write_data_raw(rd, PL2_CLAMP, mode); 894 } 895 } 896 897 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) 898 { 899 static u32 power_ctrl_orig_val; 900 u32 mdata; 901 902 if (!rapl_defaults->floor_freq_reg_addr) { 903 pr_err("Invalid floor frequency config register\n"); 904 return; 905 } 906 907 if (!power_ctrl_orig_val) 908 iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ, 909 rapl_defaults->floor_freq_reg_addr, 910 &power_ctrl_orig_val); 911 mdata = power_ctrl_orig_val; 912 if (enable) { 913 mdata &= ~(0x7f << 8); 914 mdata |= 1 << 8; 915 } 916 iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE, 917 rapl_defaults->floor_freq_reg_addr, mdata); 918 } 919 920 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value, 921 bool to_raw) 922 { 923 u64 f, y; /* fraction and exp. used for time unit */ 924 925 /* 926 * Special processing based on 2^Y*(1+F/4), refer 927 * to Intel Software Developer's manual Vol.3B: CH 14.9.3. 928 */ 929 if (!to_raw) { 930 f = (value & 0x60) >> 5; 931 y = value & 0x1f; 932 value = (1 << y) * (4 + f) * rp->time_unit / 4; 933 } else { 934 do_div(value, rp->time_unit); 935 y = ilog2(value); 936 f = div64_u64(4 * (value - (1 << y)), 1 << y); 937 value = (y & 0x1f) | ((f & 0x3) << 5); 938 } 939 return value; 940 } 941 942 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value, 943 bool to_raw) 944 { 945 /* 946 * Atom time unit encoding is straight forward val * time_unit, 947 * where time_unit is default to 1 sec. Never 0. 948 */ 949 if (!to_raw) 950 return (value) ? value *= rp->time_unit : rp->time_unit; 951 952 value = div64_u64(value, rp->time_unit); 953 954 return value; 955 } 956 957 static const struct rapl_defaults rapl_defaults_core = { 958 .floor_freq_reg_addr = 0, 959 .check_unit = rapl_check_unit_core, 960 .set_floor_freq = set_floor_freq_default, 961 .compute_time_window = rapl_compute_time_window_core, 962 }; 963 964 static const struct rapl_defaults rapl_defaults_hsw_server = { 965 .check_unit = rapl_check_unit_core, 966 .set_floor_freq = set_floor_freq_default, 967 .compute_time_window = rapl_compute_time_window_core, 968 .dram_domain_energy_unit = 15300, 969 }; 970 971 static const struct rapl_defaults rapl_defaults_spr_server = { 972 .check_unit = rapl_check_unit_core, 973 .set_floor_freq = set_floor_freq_default, 974 .compute_time_window = rapl_compute_time_window_core, 975 .dram_domain_energy_unit = 15300, 976 .psys_domain_energy_unit = 1000000000, 977 }; 978 979 static const struct rapl_defaults rapl_defaults_byt = { 980 .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, 981 .check_unit = rapl_check_unit_atom, 982 .set_floor_freq = set_floor_freq_atom, 983 .compute_time_window = rapl_compute_time_window_atom, 984 }; 985 986 static const struct rapl_defaults rapl_defaults_tng = { 987 .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG, 988 .check_unit = rapl_check_unit_atom, 989 .set_floor_freq = set_floor_freq_atom, 990 .compute_time_window = rapl_compute_time_window_atom, 991 }; 992 993 static const struct rapl_defaults rapl_defaults_ann = { 994 .floor_freq_reg_addr = 0, 995 .check_unit = rapl_check_unit_atom, 996 .set_floor_freq = NULL, 997 .compute_time_window = rapl_compute_time_window_atom, 998 }; 999 1000 static const struct rapl_defaults rapl_defaults_cht = { 1001 .floor_freq_reg_addr = 0, 1002 .check_unit = rapl_check_unit_atom, 1003 .set_floor_freq = NULL, 1004 .compute_time_window = rapl_compute_time_window_atom, 1005 }; 1006 1007 static const struct x86_cpu_id rapl_ids[] __initconst = { 1008 X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &rapl_defaults_core), 1009 X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &rapl_defaults_core), 1010 1011 X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &rapl_defaults_core), 1012 X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &rapl_defaults_core), 1013 1014 X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &rapl_defaults_core), 1015 X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &rapl_defaults_core), 1016 X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &rapl_defaults_core), 1017 X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &rapl_defaults_hsw_server), 1018 1019 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &rapl_defaults_core), 1020 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &rapl_defaults_core), 1021 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &rapl_defaults_core), 1022 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &rapl_defaults_hsw_server), 1023 1024 X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &rapl_defaults_core), 1025 X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &rapl_defaults_core), 1026 X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &rapl_defaults_hsw_server), 1027 X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &rapl_defaults_core), 1028 X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &rapl_defaults_core), 1029 X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &rapl_defaults_core), 1030 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &rapl_defaults_core), 1031 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &rapl_defaults_core), 1032 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &rapl_defaults_core), 1033 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &rapl_defaults_hsw_server), 1034 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &rapl_defaults_hsw_server), 1035 X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &rapl_defaults_core), 1036 X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &rapl_defaults_core), 1037 X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &rapl_defaults_core), 1038 X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &rapl_defaults_core), 1039 X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core), 1040 X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core), 1041 X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), 1042 X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core), 1043 1044 X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt), 1045 X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &rapl_defaults_cht), 1046 X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &rapl_defaults_tng), 1047 X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &rapl_defaults_ann), 1048 X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &rapl_defaults_core), 1049 X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &rapl_defaults_core), 1050 X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &rapl_defaults_core), 1051 X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &rapl_defaults_core), 1052 X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &rapl_defaults_core), 1053 X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &rapl_defaults_core), 1054 1055 X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &rapl_defaults_hsw_server), 1056 X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &rapl_defaults_hsw_server), 1057 {} 1058 }; 1059 MODULE_DEVICE_TABLE(x86cpu, rapl_ids); 1060 1061 /* Read once for all raw primitive data for domains */ 1062 static void rapl_update_domain_data(struct rapl_package *rp) 1063 { 1064 int dmn, prim; 1065 u64 val; 1066 1067 for (dmn = 0; dmn < rp->nr_domains; dmn++) { 1068 pr_debug("update %s domain %s data\n", rp->name, 1069 rp->domains[dmn].name); 1070 /* exclude non-raw primitives */ 1071 for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) { 1072 if (!rapl_read_data_raw(&rp->domains[dmn], prim, 1073 rpi[prim].unit, &val)) 1074 rp->domains[dmn].rdd.primitives[prim] = val; 1075 } 1076 } 1077 1078 } 1079 1080 static int rapl_package_register_powercap(struct rapl_package *rp) 1081 { 1082 struct rapl_domain *rd; 1083 struct powercap_zone *power_zone = NULL; 1084 int nr_pl, ret; 1085 1086 /* Update the domain data of the new package */ 1087 rapl_update_domain_data(rp); 1088 1089 /* first we register package domain as the parent zone */ 1090 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1091 if (rd->id == RAPL_DOMAIN_PACKAGE) { 1092 nr_pl = find_nr_power_limit(rd); 1093 pr_debug("register package domain %s\n", rp->name); 1094 power_zone = powercap_register_zone(&rd->power_zone, 1095 rp->priv->control_type, rp->name, 1096 NULL, &zone_ops[rd->id], nr_pl, 1097 &constraint_ops); 1098 if (IS_ERR(power_zone)) { 1099 pr_debug("failed to register power zone %s\n", 1100 rp->name); 1101 return PTR_ERR(power_zone); 1102 } 1103 /* track parent zone in per package/socket data */ 1104 rp->power_zone = power_zone; 1105 /* done, only one package domain per socket */ 1106 break; 1107 } 1108 } 1109 if (!power_zone) { 1110 pr_err("no package domain found, unknown topology!\n"); 1111 return -ENODEV; 1112 } 1113 /* now register domains as children of the socket/package */ 1114 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1115 if (rd->id == RAPL_DOMAIN_PACKAGE) 1116 continue; 1117 /* number of power limits per domain varies */ 1118 nr_pl = find_nr_power_limit(rd); 1119 power_zone = powercap_register_zone(&rd->power_zone, 1120 rp->priv->control_type, 1121 rd->name, rp->power_zone, 1122 &zone_ops[rd->id], nr_pl, 1123 &constraint_ops); 1124 1125 if (IS_ERR(power_zone)) { 1126 pr_debug("failed to register power_zone, %s:%s\n", 1127 rp->name, rd->name); 1128 ret = PTR_ERR(power_zone); 1129 goto err_cleanup; 1130 } 1131 } 1132 return 0; 1133 1134 err_cleanup: 1135 /* 1136 * Clean up previously initialized domains within the package if we 1137 * failed after the first domain setup. 1138 */ 1139 while (--rd >= rp->domains) { 1140 pr_debug("unregister %s domain %s\n", rp->name, rd->name); 1141 powercap_unregister_zone(rp->priv->control_type, 1142 &rd->power_zone); 1143 } 1144 1145 return ret; 1146 } 1147 1148 int rapl_add_platform_domain(struct rapl_if_priv *priv) 1149 { 1150 struct rapl_domain *rd; 1151 struct powercap_zone *power_zone; 1152 struct reg_action ra; 1153 int ret; 1154 1155 ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS]; 1156 ra.mask = ~0; 1157 ret = priv->read_raw(0, &ra); 1158 if (ret || !ra.value) 1159 return -ENODEV; 1160 1161 ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT]; 1162 ra.mask = ~0; 1163 ret = priv->read_raw(0, &ra); 1164 if (ret || !ra.value) 1165 return -ENODEV; 1166 1167 rd = kzalloc(sizeof(*rd), GFP_KERNEL); 1168 if (!rd) 1169 return -ENOMEM; 1170 1171 rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM]; 1172 rd->id = RAPL_DOMAIN_PLATFORM; 1173 rd->regs[RAPL_DOMAIN_REG_LIMIT] = 1174 priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT]; 1175 rd->regs[RAPL_DOMAIN_REG_STATUS] = 1176 priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS]; 1177 rd->rpl[0].prim_id = PL1_ENABLE; 1178 rd->rpl[0].name = pl1_name; 1179 rd->rpl[1].prim_id = PL2_ENABLE; 1180 rd->rpl[1].name = pl2_name; 1181 rd->rp = rapl_find_package_domain(0, priv); 1182 1183 power_zone = powercap_register_zone(&rd->power_zone, priv->control_type, 1184 "psys", NULL, 1185 &zone_ops[RAPL_DOMAIN_PLATFORM], 1186 2, &constraint_ops); 1187 1188 if (IS_ERR(power_zone)) { 1189 kfree(rd); 1190 return PTR_ERR(power_zone); 1191 } 1192 1193 priv->platform_rapl_domain = rd; 1194 1195 return 0; 1196 } 1197 EXPORT_SYMBOL_GPL(rapl_add_platform_domain); 1198 1199 void rapl_remove_platform_domain(struct rapl_if_priv *priv) 1200 { 1201 if (priv->platform_rapl_domain) { 1202 powercap_unregister_zone(priv->control_type, 1203 &priv->platform_rapl_domain->power_zone); 1204 kfree(priv->platform_rapl_domain); 1205 } 1206 } 1207 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain); 1208 1209 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp) 1210 { 1211 struct reg_action ra; 1212 1213 switch (domain) { 1214 case RAPL_DOMAIN_PACKAGE: 1215 case RAPL_DOMAIN_PP0: 1216 case RAPL_DOMAIN_PP1: 1217 case RAPL_DOMAIN_DRAM: 1218 ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS]; 1219 break; 1220 case RAPL_DOMAIN_PLATFORM: 1221 /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */ 1222 return -EINVAL; 1223 default: 1224 pr_err("invalid domain id %d\n", domain); 1225 return -EINVAL; 1226 } 1227 /* make sure domain counters are available and contains non-zero 1228 * values, otherwise skip it. 1229 */ 1230 1231 ra.mask = ~0; 1232 if (rp->priv->read_raw(cpu, &ra) || !ra.value) 1233 return -ENODEV; 1234 1235 return 0; 1236 } 1237 1238 /* 1239 * Check if power limits are available. Two cases when they are not available: 1240 * 1. Locked by BIOS, in this case we still provide read-only access so that 1241 * users can see what limit is set by the BIOS. 1242 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not 1243 * exist at all. In this case, we do not show the constraints in powercap. 1244 * 1245 * Called after domains are detected and initialized. 1246 */ 1247 static void rapl_detect_powerlimit(struct rapl_domain *rd) 1248 { 1249 u64 val64; 1250 int i; 1251 1252 /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */ 1253 if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) { 1254 if (val64) { 1255 pr_info("RAPL %s domain %s locked by BIOS\n", 1256 rd->rp->name, rd->name); 1257 rd->state |= DOMAIN_STATE_BIOS_LOCKED; 1258 } 1259 } 1260 /* check if power limit MSR exists, otherwise domain is monitoring only */ 1261 for (i = 0; i < NR_POWER_LIMITS; i++) { 1262 int prim = rd->rpl[i].prim_id; 1263 1264 if (rapl_read_data_raw(rd, prim, false, &val64)) 1265 rd->rpl[i].name = NULL; 1266 } 1267 } 1268 1269 /* Detect active and valid domains for the given CPU, caller must 1270 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled. 1271 */ 1272 static int rapl_detect_domains(struct rapl_package *rp, int cpu) 1273 { 1274 struct rapl_domain *rd; 1275 int i; 1276 1277 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 1278 /* use physical package id to read counters */ 1279 if (!rapl_check_domain(cpu, i, rp)) { 1280 rp->domain_map |= 1 << i; 1281 pr_info("Found RAPL domain %s\n", rapl_domain_names[i]); 1282 } 1283 } 1284 rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); 1285 if (!rp->nr_domains) { 1286 pr_debug("no valid rapl domains found in %s\n", rp->name); 1287 return -ENODEV; 1288 } 1289 pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name); 1290 1291 rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), 1292 GFP_KERNEL); 1293 if (!rp->domains) 1294 return -ENOMEM; 1295 1296 rapl_init_domains(rp); 1297 1298 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) 1299 rapl_detect_powerlimit(rd); 1300 1301 return 0; 1302 } 1303 1304 /* called from CPU hotplug notifier, hotplug lock held */ 1305 void rapl_remove_package(struct rapl_package *rp) 1306 { 1307 struct rapl_domain *rd, *rd_package = NULL; 1308 1309 package_power_limit_irq_restore(rp); 1310 1311 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1312 rapl_write_data_raw(rd, PL1_ENABLE, 0); 1313 rapl_write_data_raw(rd, PL1_CLAMP, 0); 1314 if (find_nr_power_limit(rd) > 1) { 1315 rapl_write_data_raw(rd, PL2_ENABLE, 0); 1316 rapl_write_data_raw(rd, PL2_CLAMP, 0); 1317 rapl_write_data_raw(rd, PL4_ENABLE, 0); 1318 } 1319 if (rd->id == RAPL_DOMAIN_PACKAGE) { 1320 rd_package = rd; 1321 continue; 1322 } 1323 pr_debug("remove package, undo power limit on %s: %s\n", 1324 rp->name, rd->name); 1325 powercap_unregister_zone(rp->priv->control_type, 1326 &rd->power_zone); 1327 } 1328 /* do parent zone last */ 1329 powercap_unregister_zone(rp->priv->control_type, 1330 &rd_package->power_zone); 1331 list_del(&rp->plist); 1332 kfree(rp); 1333 } 1334 EXPORT_SYMBOL_GPL(rapl_remove_package); 1335 1336 /* caller to ensure CPU hotplug lock is held */ 1337 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv) 1338 { 1339 int id = topology_logical_die_id(cpu); 1340 struct rapl_package *rp; 1341 1342 list_for_each_entry(rp, &rapl_packages, plist) { 1343 if (rp->id == id 1344 && rp->priv->control_type == priv->control_type) 1345 return rp; 1346 } 1347 1348 return NULL; 1349 } 1350 EXPORT_SYMBOL_GPL(rapl_find_package_domain); 1351 1352 /* called from CPU hotplug notifier, hotplug lock held */ 1353 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv) 1354 { 1355 int id = topology_logical_die_id(cpu); 1356 struct rapl_package *rp; 1357 struct cpuinfo_x86 *c = &cpu_data(cpu); 1358 int ret; 1359 1360 if (!rapl_defaults) 1361 return ERR_PTR(-ENODEV); 1362 1363 rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); 1364 if (!rp) 1365 return ERR_PTR(-ENOMEM); 1366 1367 /* add the new package to the list */ 1368 rp->id = id; 1369 rp->lead_cpu = cpu; 1370 rp->priv = priv; 1371 1372 if (topology_max_die_per_package() > 1) 1373 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, 1374 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id); 1375 else 1376 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d", 1377 c->phys_proc_id); 1378 1379 /* check if the package contains valid domains */ 1380 if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) { 1381 ret = -ENODEV; 1382 goto err_free_package; 1383 } 1384 ret = rapl_package_register_powercap(rp); 1385 if (!ret) { 1386 INIT_LIST_HEAD(&rp->plist); 1387 list_add(&rp->plist, &rapl_packages); 1388 return rp; 1389 } 1390 1391 err_free_package: 1392 kfree(rp->domains); 1393 kfree(rp); 1394 return ERR_PTR(ret); 1395 } 1396 EXPORT_SYMBOL_GPL(rapl_add_package); 1397 1398 static void power_limit_state_save(void) 1399 { 1400 struct rapl_package *rp; 1401 struct rapl_domain *rd; 1402 int nr_pl, ret, i; 1403 1404 get_online_cpus(); 1405 list_for_each_entry(rp, &rapl_packages, plist) { 1406 if (!rp->power_zone) 1407 continue; 1408 rd = power_zone_to_rapl_domain(rp->power_zone); 1409 nr_pl = find_nr_power_limit(rd); 1410 for (i = 0; i < nr_pl; i++) { 1411 switch (rd->rpl[i].prim_id) { 1412 case PL1_ENABLE: 1413 ret = rapl_read_data_raw(rd, 1414 POWER_LIMIT1, true, 1415 &rd->rpl[i].last_power_limit); 1416 if (ret) 1417 rd->rpl[i].last_power_limit = 0; 1418 break; 1419 case PL2_ENABLE: 1420 ret = rapl_read_data_raw(rd, 1421 POWER_LIMIT2, true, 1422 &rd->rpl[i].last_power_limit); 1423 if (ret) 1424 rd->rpl[i].last_power_limit = 0; 1425 break; 1426 case PL4_ENABLE: 1427 ret = rapl_read_data_raw(rd, 1428 POWER_LIMIT4, true, 1429 &rd->rpl[i].last_power_limit); 1430 if (ret) 1431 rd->rpl[i].last_power_limit = 0; 1432 break; 1433 } 1434 } 1435 } 1436 put_online_cpus(); 1437 } 1438 1439 static void power_limit_state_restore(void) 1440 { 1441 struct rapl_package *rp; 1442 struct rapl_domain *rd; 1443 int nr_pl, i; 1444 1445 get_online_cpus(); 1446 list_for_each_entry(rp, &rapl_packages, plist) { 1447 if (!rp->power_zone) 1448 continue; 1449 rd = power_zone_to_rapl_domain(rp->power_zone); 1450 nr_pl = find_nr_power_limit(rd); 1451 for (i = 0; i < nr_pl; i++) { 1452 switch (rd->rpl[i].prim_id) { 1453 case PL1_ENABLE: 1454 if (rd->rpl[i].last_power_limit) 1455 rapl_write_data_raw(rd, POWER_LIMIT1, 1456 rd->rpl[i].last_power_limit); 1457 break; 1458 case PL2_ENABLE: 1459 if (rd->rpl[i].last_power_limit) 1460 rapl_write_data_raw(rd, POWER_LIMIT2, 1461 rd->rpl[i].last_power_limit); 1462 break; 1463 case PL4_ENABLE: 1464 if (rd->rpl[i].last_power_limit) 1465 rapl_write_data_raw(rd, POWER_LIMIT4, 1466 rd->rpl[i].last_power_limit); 1467 break; 1468 } 1469 } 1470 } 1471 put_online_cpus(); 1472 } 1473 1474 static int rapl_pm_callback(struct notifier_block *nb, 1475 unsigned long mode, void *_unused) 1476 { 1477 switch (mode) { 1478 case PM_SUSPEND_PREPARE: 1479 power_limit_state_save(); 1480 break; 1481 case PM_POST_SUSPEND: 1482 power_limit_state_restore(); 1483 break; 1484 } 1485 return NOTIFY_OK; 1486 } 1487 1488 static struct notifier_block rapl_pm_notifier = { 1489 .notifier_call = rapl_pm_callback, 1490 }; 1491 1492 static struct platform_device *rapl_msr_platdev; 1493 1494 static int __init rapl_init(void) 1495 { 1496 const struct x86_cpu_id *id; 1497 int ret; 1498 1499 id = x86_match_cpu(rapl_ids); 1500 if (!id) { 1501 pr_err("driver does not support CPU family %d model %d\n", 1502 boot_cpu_data.x86, boot_cpu_data.x86_model); 1503 1504 return -ENODEV; 1505 } 1506 1507 rapl_defaults = (struct rapl_defaults *)id->driver_data; 1508 1509 ret = register_pm_notifier(&rapl_pm_notifier); 1510 if (ret) 1511 return ret; 1512 1513 rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); 1514 if (!rapl_msr_platdev) { 1515 ret = -ENOMEM; 1516 goto end; 1517 } 1518 1519 ret = platform_device_add(rapl_msr_platdev); 1520 if (ret) 1521 platform_device_put(rapl_msr_platdev); 1522 1523 end: 1524 if (ret) 1525 unregister_pm_notifier(&rapl_pm_notifier); 1526 1527 return ret; 1528 } 1529 1530 static void __exit rapl_exit(void) 1531 { 1532 platform_device_unregister(rapl_msr_platdev); 1533 unregister_pm_notifier(&rapl_pm_notifier); 1534 } 1535 1536 fs_initcall(rapl_init); 1537 module_exit(rapl_exit); 1538 1539 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code"); 1540 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>"); 1541 MODULE_LICENSE("GPL v2"); 1542