1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Common code for Intel Running Average Power Limit (RAPL) support. 4 * Copyright (c) 2019, Intel Corporation. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/kernel.h> 9 #include <linux/module.h> 10 #include <linux/list.h> 11 #include <linux/types.h> 12 #include <linux/device.h> 13 #include <linux/slab.h> 14 #include <linux/log2.h> 15 #include <linux/bitmap.h> 16 #include <linux/delay.h> 17 #include <linux/sysfs.h> 18 #include <linux/cpu.h> 19 #include <linux/powercap.h> 20 #include <linux/suspend.h> 21 #include <linux/intel_rapl.h> 22 #include <linux/processor.h> 23 #include <linux/platform_device.h> 24 25 #include <asm/iosf_mbi.h> 26 #include <asm/cpu_device_id.h> 27 #include <asm/intel-family.h> 28 29 /* bitmasks for RAPL MSRs, used by primitive access functions */ 30 #define ENERGY_STATUS_MASK 0xffffffff 31 32 #define POWER_LIMIT1_MASK 0x7FFF 33 #define POWER_LIMIT1_ENABLE BIT(15) 34 #define POWER_LIMIT1_CLAMP BIT(16) 35 36 #define POWER_LIMIT2_MASK (0x7FFFULL<<32) 37 #define POWER_LIMIT2_ENABLE BIT_ULL(47) 38 #define POWER_LIMIT2_CLAMP BIT_ULL(48) 39 #define POWER_HIGH_LOCK BIT_ULL(63) 40 #define POWER_LOW_LOCK BIT(31) 41 42 #define POWER_LIMIT4_MASK 0x1FFF 43 44 #define TIME_WINDOW1_MASK (0x7FULL<<17) 45 #define TIME_WINDOW2_MASK (0x7FULL<<49) 46 47 #define POWER_UNIT_OFFSET 0 48 #define POWER_UNIT_MASK 0x0F 49 50 #define ENERGY_UNIT_OFFSET 0x08 51 #define ENERGY_UNIT_MASK 0x1F00 52 53 #define TIME_UNIT_OFFSET 0x10 54 #define TIME_UNIT_MASK 0xF0000 55 56 #define POWER_INFO_MAX_MASK (0x7fffULL<<32) 57 #define POWER_INFO_MIN_MASK (0x7fffULL<<16) 58 #define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) 59 #define POWER_INFO_THERMAL_SPEC_MASK 0x7fff 60 61 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff 62 #define PP_POLICY_MASK 0x1F 63 64 /* 65 * SPR has different layout for Psys Domain PowerLimit registers. 66 * There are 17 bits of PL1 and PL2 instead of 15 bits. 
67 * The Enable bits and TimeWindow bits are also shifted as a result. 68 */ 69 #define PSYS_POWER_LIMIT1_MASK 0x1FFFF 70 #define PSYS_POWER_LIMIT1_ENABLE BIT(17) 71 72 #define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32) 73 #define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) 74 75 #define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19) 76 #define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51) 77 78 /* Non HW constants */ 79 #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ 80 #define RAPL_PRIMITIVE_DUMMY BIT(2) 81 82 #define TIME_WINDOW_MAX_MSEC 40000 83 #define TIME_WINDOW_MIN_MSEC 250 84 #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ 85 enum unit_type { 86 ARBITRARY_UNIT, /* no translation */ 87 POWER_UNIT, 88 ENERGY_UNIT, 89 TIME_UNIT, 90 }; 91 92 /* per domain data, some are optional */ 93 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) 94 95 #define DOMAIN_STATE_INACTIVE BIT(0) 96 #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) 97 #define DOMAIN_STATE_BIOS_LOCKED BIT(2) 98 99 static const char pl1_name[] = "long_term"; 100 static const char pl2_name[] = "short_term"; 101 static const char pl4_name[] = "peak_power"; 102 103 #define power_zone_to_rapl_domain(_zone) \ 104 container_of(_zone, struct rapl_domain, power_zone) 105 106 struct rapl_defaults { 107 u8 floor_freq_reg_addr; 108 int (*check_unit)(struct rapl_package *rp, int cpu); 109 void (*set_floor_freq)(struct rapl_domain *rd, bool mode); 110 u64 (*compute_time_window)(struct rapl_package *rp, u64 val, 111 bool to_raw); 112 unsigned int dram_domain_energy_unit; 113 unsigned int psys_domain_energy_unit; 114 bool spr_psys_bits; 115 }; 116 static struct rapl_defaults *rapl_defaults; 117 118 /* Sideband MBI registers */ 119 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2) 120 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf) 121 122 #define PACKAGE_PLN_INT_SAVED BIT(0) 123 #define MAX_PRIM_NAME (32) 124 125 /* per domain data. 
used to describe individual knobs such that access function 126 * can be consolidated into one instead of many inline functions. 127 */ 128 struct rapl_primitive_info { 129 const char *name; 130 u64 mask; 131 int shift; 132 enum rapl_domain_reg_id id; 133 enum unit_type unit; 134 u32 flag; 135 }; 136 137 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ 138 .name = #p, \ 139 .mask = m, \ 140 .shift = s, \ 141 .id = i, \ 142 .unit = u, \ 143 .flag = f \ 144 } 145 146 static void rapl_init_domains(struct rapl_package *rp); 147 static int rapl_read_data_raw(struct rapl_domain *rd, 148 enum rapl_primitives prim, 149 bool xlate, u64 *data); 150 static int rapl_write_data_raw(struct rapl_domain *rd, 151 enum rapl_primitives prim, 152 unsigned long long value); 153 static u64 rapl_unit_xlate(struct rapl_domain *rd, 154 enum unit_type type, u64 value, int to_raw); 155 static void package_power_limit_irq_save(struct rapl_package *rp); 156 157 static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */ 158 159 static const char *const rapl_domain_names[] = { 160 "package", 161 "core", 162 "uncore", 163 "dram", 164 "psys", 165 }; 166 167 static int get_energy_counter(struct powercap_zone *power_zone, 168 u64 *energy_raw) 169 { 170 struct rapl_domain *rd; 171 u64 energy_now; 172 173 /* prevent CPU hotplug, make sure the RAPL domain does not go 174 * away while reading the counter. 
175 */ 176 cpus_read_lock(); 177 rd = power_zone_to_rapl_domain(power_zone); 178 179 if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { 180 *energy_raw = energy_now; 181 cpus_read_unlock(); 182 183 return 0; 184 } 185 cpus_read_unlock(); 186 187 return -EIO; 188 } 189 190 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy) 191 { 192 struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev); 193 194 *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); 195 return 0; 196 } 197 198 static int release_zone(struct powercap_zone *power_zone) 199 { 200 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 201 struct rapl_package *rp = rd->rp; 202 203 /* package zone is the last zone of a package, we can free 204 * memory here since all children has been unregistered. 205 */ 206 if (rd->id == RAPL_DOMAIN_PACKAGE) { 207 kfree(rd); 208 rp->domains = NULL; 209 } 210 211 return 0; 212 213 } 214 215 static int find_nr_power_limit(struct rapl_domain *rd) 216 { 217 int i, nr_pl = 0; 218 219 for (i = 0; i < NR_POWER_LIMITS; i++) { 220 if (rd->rpl[i].name) 221 nr_pl++; 222 } 223 224 return nr_pl; 225 } 226 227 static int set_domain_enable(struct powercap_zone *power_zone, bool mode) 228 { 229 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 230 231 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) 232 return -EACCES; 233 234 cpus_read_lock(); 235 rapl_write_data_raw(rd, PL1_ENABLE, mode); 236 if (rapl_defaults->set_floor_freq) 237 rapl_defaults->set_floor_freq(rd, mode); 238 cpus_read_unlock(); 239 240 return 0; 241 } 242 243 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode) 244 { 245 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 246 u64 val; 247 248 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 249 *mode = false; 250 return 0; 251 } 252 cpus_read_lock(); 253 if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) { 254 cpus_read_unlock(); 255 return -EIO; 256 } 257 *mode = 
val; 258 cpus_read_unlock(); 259 260 return 0; 261 } 262 263 /* per RAPL domain ops, in the order of rapl_domain_type */ 264 static const struct powercap_zone_ops zone_ops[] = { 265 /* RAPL_DOMAIN_PACKAGE */ 266 { 267 .get_energy_uj = get_energy_counter, 268 .get_max_energy_range_uj = get_max_energy_counter, 269 .release = release_zone, 270 .set_enable = set_domain_enable, 271 .get_enable = get_domain_enable, 272 }, 273 /* RAPL_DOMAIN_PP0 */ 274 { 275 .get_energy_uj = get_energy_counter, 276 .get_max_energy_range_uj = get_max_energy_counter, 277 .release = release_zone, 278 .set_enable = set_domain_enable, 279 .get_enable = get_domain_enable, 280 }, 281 /* RAPL_DOMAIN_PP1 */ 282 { 283 .get_energy_uj = get_energy_counter, 284 .get_max_energy_range_uj = get_max_energy_counter, 285 .release = release_zone, 286 .set_enable = set_domain_enable, 287 .get_enable = get_domain_enable, 288 }, 289 /* RAPL_DOMAIN_DRAM */ 290 { 291 .get_energy_uj = get_energy_counter, 292 .get_max_energy_range_uj = get_max_energy_counter, 293 .release = release_zone, 294 .set_enable = set_domain_enable, 295 .get_enable = get_domain_enable, 296 }, 297 /* RAPL_DOMAIN_PLATFORM */ 298 { 299 .get_energy_uj = get_energy_counter, 300 .get_max_energy_range_uj = get_max_energy_counter, 301 .release = release_zone, 302 .set_enable = set_domain_enable, 303 .get_enable = get_domain_enable, 304 }, 305 }; 306 307 /* 308 * Constraint index used by powercap can be different than power limit (PL) 309 * index in that some PLs maybe missing due to non-existent MSRs. So we 310 * need to convert here by finding the valid PLs only (name populated). 
311 */ 312 static int contraint_to_pl(struct rapl_domain *rd, int cid) 313 { 314 int i, j; 315 316 for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) { 317 if ((rd->rpl[i].name) && j++ == cid) { 318 pr_debug("%s: index %d\n", __func__, i); 319 return i; 320 } 321 } 322 pr_err("Cannot find matching power limit for constraint %d\n", cid); 323 324 return -EINVAL; 325 } 326 327 static int set_power_limit(struct powercap_zone *power_zone, int cid, 328 u64 power_limit) 329 { 330 struct rapl_domain *rd; 331 struct rapl_package *rp; 332 int ret = 0; 333 int id; 334 335 cpus_read_lock(); 336 rd = power_zone_to_rapl_domain(power_zone); 337 id = contraint_to_pl(rd, cid); 338 if (id < 0) { 339 ret = id; 340 goto set_exit; 341 } 342 343 rp = rd->rp; 344 345 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 346 dev_warn(&power_zone->dev, 347 "%s locked by BIOS, monitoring only\n", rd->name); 348 ret = -EACCES; 349 goto set_exit; 350 } 351 352 switch (rd->rpl[id].prim_id) { 353 case PL1_ENABLE: 354 rapl_write_data_raw(rd, POWER_LIMIT1, power_limit); 355 break; 356 case PL2_ENABLE: 357 rapl_write_data_raw(rd, POWER_LIMIT2, power_limit); 358 break; 359 case PL4_ENABLE: 360 rapl_write_data_raw(rd, POWER_LIMIT4, power_limit); 361 break; 362 default: 363 ret = -EINVAL; 364 } 365 if (!ret) 366 package_power_limit_irq_save(rp); 367 set_exit: 368 cpus_read_unlock(); 369 return ret; 370 } 371 372 static int get_current_power_limit(struct powercap_zone *power_zone, int cid, 373 u64 *data) 374 { 375 struct rapl_domain *rd; 376 u64 val; 377 int prim; 378 int ret = 0; 379 int id; 380 381 cpus_read_lock(); 382 rd = power_zone_to_rapl_domain(power_zone); 383 id = contraint_to_pl(rd, cid); 384 if (id < 0) { 385 ret = id; 386 goto get_exit; 387 } 388 389 switch (rd->rpl[id].prim_id) { 390 case PL1_ENABLE: 391 prim = POWER_LIMIT1; 392 break; 393 case PL2_ENABLE: 394 prim = POWER_LIMIT2; 395 break; 396 case PL4_ENABLE: 397 prim = POWER_LIMIT4; 398 break; 399 default: 400 cpus_read_unlock(); 401 return -EINVAL; 
402 } 403 if (rapl_read_data_raw(rd, prim, true, &val)) 404 ret = -EIO; 405 else 406 *data = val; 407 408 get_exit: 409 cpus_read_unlock(); 410 411 return ret; 412 } 413 414 static int set_time_window(struct powercap_zone *power_zone, int cid, 415 u64 window) 416 { 417 struct rapl_domain *rd; 418 int ret = 0; 419 int id; 420 421 cpus_read_lock(); 422 rd = power_zone_to_rapl_domain(power_zone); 423 id = contraint_to_pl(rd, cid); 424 if (id < 0) { 425 ret = id; 426 goto set_time_exit; 427 } 428 429 switch (rd->rpl[id].prim_id) { 430 case PL1_ENABLE: 431 rapl_write_data_raw(rd, TIME_WINDOW1, window); 432 break; 433 case PL2_ENABLE: 434 rapl_write_data_raw(rd, TIME_WINDOW2, window); 435 break; 436 default: 437 ret = -EINVAL; 438 } 439 440 set_time_exit: 441 cpus_read_unlock(); 442 return ret; 443 } 444 445 static int get_time_window(struct powercap_zone *power_zone, int cid, 446 u64 *data) 447 { 448 struct rapl_domain *rd; 449 u64 val; 450 int ret = 0; 451 int id; 452 453 cpus_read_lock(); 454 rd = power_zone_to_rapl_domain(power_zone); 455 id = contraint_to_pl(rd, cid); 456 if (id < 0) { 457 ret = id; 458 goto get_time_exit; 459 } 460 461 switch (rd->rpl[id].prim_id) { 462 case PL1_ENABLE: 463 ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val); 464 break; 465 case PL2_ENABLE: 466 ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val); 467 break; 468 case PL4_ENABLE: 469 /* 470 * Time window parameter is not applicable for PL4 entry 471 * so assigining '0' as default value. 
472 */ 473 val = 0; 474 break; 475 default: 476 cpus_read_unlock(); 477 return -EINVAL; 478 } 479 if (!ret) 480 *data = val; 481 482 get_time_exit: 483 cpus_read_unlock(); 484 485 return ret; 486 } 487 488 static const char *get_constraint_name(struct powercap_zone *power_zone, 489 int cid) 490 { 491 struct rapl_domain *rd; 492 int id; 493 494 rd = power_zone_to_rapl_domain(power_zone); 495 id = contraint_to_pl(rd, cid); 496 if (id >= 0) 497 return rd->rpl[id].name; 498 499 return NULL; 500 } 501 502 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data) 503 { 504 struct rapl_domain *rd; 505 u64 val; 506 int prim; 507 int ret = 0; 508 509 cpus_read_lock(); 510 rd = power_zone_to_rapl_domain(power_zone); 511 switch (rd->rpl[id].prim_id) { 512 case PL1_ENABLE: 513 prim = THERMAL_SPEC_POWER; 514 break; 515 case PL2_ENABLE: 516 prim = MAX_POWER; 517 break; 518 case PL4_ENABLE: 519 prim = MAX_POWER; 520 break; 521 default: 522 cpus_read_unlock(); 523 return -EINVAL; 524 } 525 if (rapl_read_data_raw(rd, prim, true, &val)) 526 ret = -EIO; 527 else 528 *data = val; 529 530 /* As a generalization rule, PL4 would be around two times PL2. 
*/ 531 if (rd->rpl[id].prim_id == PL4_ENABLE) 532 *data = *data * 2; 533 534 cpus_read_unlock(); 535 536 return ret; 537 } 538 539 static const struct powercap_zone_constraint_ops constraint_ops = { 540 .set_power_limit_uw = set_power_limit, 541 .get_power_limit_uw = get_current_power_limit, 542 .set_time_window_us = set_time_window, 543 .get_time_window_us = get_time_window, 544 .get_max_power_uw = get_max_power, 545 .get_name = get_constraint_name, 546 }; 547 548 /* called after domain detection and package level data are set */ 549 static void rapl_init_domains(struct rapl_package *rp) 550 { 551 enum rapl_domain_type i; 552 enum rapl_domain_reg_id j; 553 struct rapl_domain *rd = rp->domains; 554 555 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 556 unsigned int mask = rp->domain_map & (1 << i); 557 558 if (!mask) 559 continue; 560 561 rd->rp = rp; 562 563 if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) { 564 snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d", 565 topology_physical_package_id(rp->lead_cpu)); 566 } else 567 snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s", 568 rapl_domain_names[i]); 569 570 rd->id = i; 571 rd->rpl[0].prim_id = PL1_ENABLE; 572 rd->rpl[0].name = pl1_name; 573 574 /* 575 * The PL2 power domain is applicable for limits two 576 * and limits three 577 */ 578 if (rp->priv->limits[i] >= 2) { 579 rd->rpl[1].prim_id = PL2_ENABLE; 580 rd->rpl[1].name = pl2_name; 581 } 582 583 /* Enable PL4 domain if the total power limits are three */ 584 if (rp->priv->limits[i] == 3) { 585 rd->rpl[2].prim_id = PL4_ENABLE; 586 rd->rpl[2].name = pl4_name; 587 } 588 589 for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++) 590 rd->regs[j] = rp->priv->regs[i][j]; 591 592 switch (i) { 593 case RAPL_DOMAIN_DRAM: 594 rd->domain_energy_unit = 595 rapl_defaults->dram_domain_energy_unit; 596 if (rd->domain_energy_unit) 597 pr_info("DRAM domain energy unit %dpj\n", 598 rd->domain_energy_unit); 599 break; 600 case RAPL_DOMAIN_PLATFORM: 601 rd->domain_energy_unit = 602 
rapl_defaults->psys_domain_energy_unit; 603 if (rd->domain_energy_unit) 604 pr_info("Platform domain energy unit %dpj\n", 605 rd->domain_energy_unit); 606 break; 607 default: 608 break; 609 } 610 rd++; 611 } 612 } 613 614 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, 615 u64 value, int to_raw) 616 { 617 u64 units = 1; 618 struct rapl_package *rp = rd->rp; 619 u64 scale = 1; 620 621 switch (type) { 622 case POWER_UNIT: 623 units = rp->power_unit; 624 break; 625 case ENERGY_UNIT: 626 scale = ENERGY_UNIT_SCALE; 627 /* per domain unit takes precedence */ 628 if (rd->domain_energy_unit) 629 units = rd->domain_energy_unit; 630 else 631 units = rp->energy_unit; 632 break; 633 case TIME_UNIT: 634 return rapl_defaults->compute_time_window(rp, value, to_raw); 635 case ARBITRARY_UNIT: 636 default: 637 return value; 638 } 639 640 if (to_raw) 641 return div64_u64(value, units) * scale; 642 643 value *= units; 644 645 return div64_u64(value, scale); 646 } 647 648 /* in the order of enum rapl_primitives */ 649 static struct rapl_primitive_info rpi[] = { 650 /* name, mask, shift, msr index, unit divisor */ 651 PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, 652 RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), 653 PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, 654 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 655 PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, 656 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 657 PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0, 658 RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), 659 PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, 660 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 661 PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, 662 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 663 PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, 664 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 665 PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, 666 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 667 
PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, 668 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 669 PRIMITIVE_INFO_INIT(PL4_ENABLE, POWER_LIMIT4_MASK, 0, 670 RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), 671 PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, 672 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 673 PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, 674 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 675 PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK, 676 0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 677 PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, 678 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 679 PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, 680 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 681 PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48, 682 RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), 683 PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, 684 RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), 685 PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, 686 RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), 687 PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, 688 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 689 PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32, 690 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 691 PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17, 692 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 693 PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49, 694 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 695 PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19, 696 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 697 PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51, 698 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 699 /* non-hardware */ 700 PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, 701 RAPL_PRIMITIVE_DERIVED), 702 {NULL, 0, 0, 0}, 703 }; 704 705 static enum rapl_primitives 706 prim_fixups(struct rapl_domain *rd, enum rapl_primitives 
prim) 707 { 708 if (!rapl_defaults->spr_psys_bits) 709 return prim; 710 711 if (rd->id != RAPL_DOMAIN_PLATFORM) 712 return prim; 713 714 switch (prim) { 715 case POWER_LIMIT1: 716 return PSYS_POWER_LIMIT1; 717 case POWER_LIMIT2: 718 return PSYS_POWER_LIMIT2; 719 case PL1_ENABLE: 720 return PSYS_PL1_ENABLE; 721 case PL2_ENABLE: 722 return PSYS_PL2_ENABLE; 723 case TIME_WINDOW1: 724 return PSYS_TIME_WINDOW1; 725 case TIME_WINDOW2: 726 return PSYS_TIME_WINDOW2; 727 default: 728 return prim; 729 } 730 } 731 732 /* Read primitive data based on its related struct rapl_primitive_info. 733 * if xlate flag is set, return translated data based on data units, i.e. 734 * time, energy, and power. 735 * RAPL MSRs are non-architectual and are laid out not consistently across 736 * domains. Here we use primitive info to allow writing consolidated access 737 * functions. 738 * For a given primitive, it is processed by MSR mask and shift. Unit conversion 739 * is pre-assigned based on RAPL unit MSRs read at init time. 
740 * 63-------------------------- 31--------------------------- 0 741 * | xxxxx (mask) | 742 * | |<- shift ----------------| 743 * 63-------------------------- 31--------------------------- 0 744 */ 745 static int rapl_read_data_raw(struct rapl_domain *rd, 746 enum rapl_primitives prim, bool xlate, u64 *data) 747 { 748 u64 value; 749 enum rapl_primitives prim_fixed = prim_fixups(rd, prim); 750 struct rapl_primitive_info *rp = &rpi[prim_fixed]; 751 struct reg_action ra; 752 int cpu; 753 754 if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY) 755 return -EINVAL; 756 757 ra.reg = rd->regs[rp->id]; 758 if (!ra.reg) 759 return -EINVAL; 760 761 cpu = rd->rp->lead_cpu; 762 763 /* domain with 2 limits has different bit */ 764 if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) { 765 rp->mask = POWER_HIGH_LOCK; 766 rp->shift = 63; 767 } 768 /* non-hardware data are collected by the polling thread */ 769 if (rp->flag & RAPL_PRIMITIVE_DERIVED) { 770 *data = rd->rdd.primitives[prim]; 771 return 0; 772 } 773 774 ra.mask = rp->mask; 775 776 if (rd->rp->priv->read_raw(cpu, &ra)) { 777 pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu); 778 return -EIO; 779 } 780 781 value = ra.value >> rp->shift; 782 783 if (xlate) 784 *data = rapl_unit_xlate(rd, rp->unit, value, 0); 785 else 786 *data = value; 787 788 return 0; 789 } 790 791 /* Similar use of primitive info in the read counterpart */ 792 static int rapl_write_data_raw(struct rapl_domain *rd, 793 enum rapl_primitives prim, 794 unsigned long long value) 795 { 796 enum rapl_primitives prim_fixed = prim_fixups(rd, prim); 797 struct rapl_primitive_info *rp = &rpi[prim_fixed]; 798 int cpu; 799 u64 bits; 800 struct reg_action ra; 801 int ret; 802 803 cpu = rd->rp->lead_cpu; 804 bits = rapl_unit_xlate(rd, rp->unit, value, 1); 805 bits <<= rp->shift; 806 bits &= rp->mask; 807 808 memset(&ra, 0, sizeof(ra)); 809 810 ra.reg = rd->regs[rp->id]; 811 ra.mask = rp->mask; 812 ra.value = bits; 813 814 ret = 
rd->rp->priv->write_raw(cpu, &ra); 815 816 return ret; 817 } 818 819 /* 820 * Raw RAPL data stored in MSRs are in certain scales. We need to 821 * convert them into standard units based on the units reported in 822 * the RAPL unit MSRs. This is specific to CPUs as the method to 823 * calculate units differ on different CPUs. 824 * We convert the units to below format based on CPUs. 825 * i.e. 826 * energy unit: picoJoules : Represented in picoJoules by default 827 * power unit : microWatts : Represented in milliWatts by default 828 * time unit : microseconds: Represented in seconds by default 829 */ 830 static int rapl_check_unit_core(struct rapl_package *rp, int cpu) 831 { 832 struct reg_action ra; 833 u32 value; 834 835 ra.reg = rp->priv->reg_unit; 836 ra.mask = ~0; 837 if (rp->priv->read_raw(cpu, &ra)) { 838 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 839 rp->priv->reg_unit, cpu); 840 return -ENODEV; 841 } 842 843 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 844 rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value); 845 846 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 847 rp->power_unit = 1000000 / (1 << value); 848 849 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 850 rp->time_unit = 1000000 / (1 << value); 851 852 pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n", 853 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 854 855 return 0; 856 } 857 858 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) 859 { 860 struct reg_action ra; 861 u32 value; 862 863 ra.reg = rp->priv->reg_unit; 864 ra.mask = ~0; 865 if (rp->priv->read_raw(cpu, &ra)) { 866 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 867 rp->priv->reg_unit, cpu); 868 return -ENODEV; 869 } 870 871 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 872 rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value; 873 874 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 875 
rp->power_unit = (1 << value) * 1000; 876 877 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 878 rp->time_unit = 1000000 / (1 << value); 879 880 pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n", 881 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 882 883 return 0; 884 } 885 886 static void power_limit_irq_save_cpu(void *info) 887 { 888 u32 l, h = 0; 889 struct rapl_package *rp = (struct rapl_package *)info; 890 891 /* save the state of PLN irq mask bit before disabling it */ 892 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 893 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) { 894 rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE; 895 rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED; 896 } 897 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 898 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 899 } 900 901 /* REVISIT: 902 * When package power limit is set artificially low by RAPL, LVT 903 * thermal interrupt for package power limit should be ignored 904 * since we are not really exceeding the real limit. The intention 905 * is to avoid excessive interrupts while we are trying to save power. 906 * A useful feature might be routing the package_power_limit interrupt 907 * to userspace via eventfd. once we have a usecase, this is simple 908 * to do by adding an atomic notifier. 909 */ 910 911 static void package_power_limit_irq_save(struct rapl_package *rp) 912 { 913 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 914 return; 915 916 smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); 917 } 918 919 /* 920 * Restore per package power limit interrupt enable state. Called from cpu 921 * hotplug code on package removal. 
922 */ 923 static void package_power_limit_irq_restore(struct rapl_package *rp) 924 { 925 u32 l, h; 926 927 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 928 return; 929 930 /* irq enable state not saved, nothing to restore */ 931 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) 932 return; 933 934 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 935 936 if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE) 937 l |= PACKAGE_THERM_INT_PLN_ENABLE; 938 else 939 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 940 941 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 942 } 943 944 static void set_floor_freq_default(struct rapl_domain *rd, bool mode) 945 { 946 int nr_powerlimit = find_nr_power_limit(rd); 947 948 /* always enable clamp such that p-state can go below OS requested 949 * range. power capping priority over guranteed frequency. 950 */ 951 rapl_write_data_raw(rd, PL1_CLAMP, mode); 952 953 /* some domains have pl2 */ 954 if (nr_powerlimit > 1) { 955 rapl_write_data_raw(rd, PL2_ENABLE, mode); 956 rapl_write_data_raw(rd, PL2_CLAMP, mode); 957 } 958 } 959 960 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) 961 { 962 static u32 power_ctrl_orig_val; 963 u32 mdata; 964 965 if (!rapl_defaults->floor_freq_reg_addr) { 966 pr_err("Invalid floor frequency config register\n"); 967 return; 968 } 969 970 if (!power_ctrl_orig_val) 971 iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ, 972 rapl_defaults->floor_freq_reg_addr, 973 &power_ctrl_orig_val); 974 mdata = power_ctrl_orig_val; 975 if (enable) { 976 mdata &= ~(0x7f << 8); 977 mdata |= 1 << 8; 978 } 979 iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE, 980 rapl_defaults->floor_freq_reg_addr, mdata); 981 } 982 983 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value, 984 bool to_raw) 985 { 986 u64 f, y; /* fraction and exp. 
used for time unit */

	/*
	 * Special processing based on 2^Y*(1+F/4), refer
	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
	 */
	if (!to_raw) {
		f = (value & 0x60) >> 5;
		y = value & 0x1f;
		value = (1 << y) * (4 + f) * rp->time_unit / 4;
	} else {
		if (value < rp->time_unit)
			return 0;

		do_div(value, rp->time_unit);
		y = ilog2(value);

		/*
		 * The target hardware field is 7 bits wide, so return all ones
		 * if the exponent is too large.
		 */
		if (y > 0x1f)
			return 0x7f;

		f = div64_u64(4 * (value - (1ULL << y)), 1ULL << y);
		value = (y & 0x1f) | ((f & 0x3) << 5);
	}
	return value;
}

/*
 * Time window conversion for Atom parts.
 *
 * @rp:     package whose time_unit is used for the conversion
 * @value:  raw register value (!to_raw) or time value (to_raw)
 * @to_raw: direction of the conversion
 *
 * With @to_raw false the result is value * time_unit, except that a raw
 * value of 0 yields one time_unit. With @to_raw true the result is
 * value divided by time_unit.
 */
static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
					 bool to_raw)
{
	/*
	 * Atom time unit encoding is straight forward val * time_unit,
	 * where time_unit is default to 1 sec. Never 0.
	 */
	if (!to_raw)
		return (value) ? value * rp->time_unit : rp->time_unit;

	value = div64_u64(value, rp->time_unit);

	return value;
}

/*
 * Per-model default callbacks and quirks. Each table below is referenced
 * from rapl_ids[] through the driver_data field of the matching CPU entry.
 */

/* Core callbacks, no floor frequency register address, no unit overrides. */
static const struct rapl_defaults rapl_defaults_core = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
};

/* Core callbacks plus a fixed DRAM domain energy unit. */
static const struct rapl_defaults rapl_defaults_hsw_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.dram_domain_energy_unit = 15300,
};

/*
 * Core callbacks plus a fixed Psys domain energy unit; spr_psys_bits selects
 * the SPR-specific Psys power limit register layout.
 */
static const struct rapl_defaults rapl_defaults_spr_server = {
	.check_unit = rapl_check_unit_core,
	.set_floor_freq = set_floor_freq_default,
	.compute_time_window = rapl_compute_time_window_core,
	.psys_domain_energy_unit = 1000000000,
	.spr_psys_bits = true,
};

/* Atom callbacks, floor frequency controlled through the BYT IOSF register. */
static const struct rapl_defaults rapl_defaults_byt = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom callbacks, floor frequency controlled through the TNG IOSF register. */
static const struct rapl_defaults rapl_defaults_tng = {
	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = set_floor_freq_atom,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom callbacks without a set_floor_freq handler. */
static const struct rapl_defaults rapl_defaults_ann = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Atom callbacks without a set_floor_freq handler. */
static const struct rapl_defaults rapl_defaults_cht = {
	.floor_freq_reg_addr = 0,
	.check_unit = rapl_check_unit_atom,
	.set_floor_freq = NULL,
	.compute_time_window = rapl_compute_time_window_atom,
};

/* Only the unit check is provided; all other callbacks are left NULL. */
static const struct rapl_defaults rapl_defaults_amd = {
	.check_unit = rapl_check_unit_core,
};

/*
 * Supported CPUs. The second argument of each entry becomes the driver_data
 * that rapl_init() installs as the global rapl_defaults.
 */
static const struct x86_cpu_id rapl_ids[] __initconst = {
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&rapl_defaults_hsw_server),

	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE,		&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&rapl_defaults_spr_server),
	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X,	&rapl_defaults_spr_server),
	X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD,		&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT,	&rapl_defaults_byt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,	&rapl_defaults_cht),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&rapl_defaults_tng),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID,	&rapl_defaults_ann),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&rapl_defaults_core),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,	&rapl_defaults_core),

	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&rapl_defaults_hsw_server),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&rapl_defaults_hsw_server),

	X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
	X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
	X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, rapl_ids);

/*
 * Read once for all raw primitive data for domains.
 *
 * Every raw primitive of every domain in @rp is read and cached in
 * rdd.primitives[]. A primitive whose read fails keeps its previous cached
 * value; the failure is not reported.
 */
static void rapl_update_domain_data(struct rapl_package *rp)
{
	int dmn, prim;
	u64 val;

	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
		pr_debug("update %s domain %s data\n", rp->name,
			 rp->domains[dmn].name);
		/* exclude non-raw primitives */
		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
						rpi[prim].unit, &val))
				rp->domains[dmn].rdd.primitives[prim] = val;
		}
	}

}

/*
 * Register all domains of @rp with the powercap framework.
 *
 * The package domain is registered first, without a parent, under the
 * package name and is remembered in rp->power_zone. Every other domain is
 * then registered as a child of that zone, except the platform domain which
 * is registered without a parent.
 *
 * Returns 0 on success, -ENODEV when @rp has no package domain, or the error
 * reported by powercap_register_zone(). When a child registration fails, the
 * domains that precede the failing one in rp->domains are unregistered
 * again before returning.
 */
static int rapl_package_register_powercap(struct rapl_package *rp)
{
	struct rapl_domain *rd;
	struct powercap_zone *power_zone = NULL;
	int nr_pl, ret;

	/* Update the domain data of the new package */
	rapl_update_domain_data(rp);

	/* first we register package domain as the parent zone */
	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		if (rd->id == RAPL_DOMAIN_PACKAGE) {
			nr_pl = find_nr_power_limit(rd);
			pr_debug("register package domain %s\n", rp->name);
			power_zone = powercap_register_zone(&rd->power_zone,
					    rp->priv->control_type, rp->name,
					    NULL, &zone_ops[rd->id], nr_pl,
					    &constraint_ops);
			if (IS_ERR(power_zone)) {
				pr_debug("failed to register power zone %s\n",
					 rp->name);
				return PTR_ERR(power_zone);
			}
			/* track parent zone in per package/socket data */
			rp->power_zone = power_zone;
			/* done, only one package domain per socket */
			break;
		}
	}
	/* power_zone is still NULL only if the loop never saw a package domain */
	if (!power_zone) {
		pr_err("no package domain found, unknown topology!\n");
		return -ENODEV;
	}
	/* now register domains as children of the socket/package */
	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		struct powercap_zone *parent = rp->power_zone;

		/* the package domain was already registered above */
		if (rd->id == RAPL_DOMAIN_PACKAGE)
			continue;
		/* the platform domain is registered as a top-level zone */
		if (rd->id == RAPL_DOMAIN_PLATFORM)
			parent = NULL;
		/* number of power limits per domain varies */
		nr_pl = find_nr_power_limit(rd);
		power_zone = powercap_register_zone(&rd->power_zone,
						    rp->priv->control_type,
						    rd->name, parent,
						    &zone_ops[rd->id], nr_pl,
						    &constraint_ops);

		if (IS_ERR(power_zone)) {
			pr_debug("failed to register power_zone, %s:%s\n",
				 rp->name, rd->name);
			ret = PTR_ERR(power_zone);
			goto err_cleanup;
		}
	}
	return 0;

err_cleanup:
	/*
	 * Clean up previously initialized domains within the package if we
	 * failed after the first domain setup.
	 *
	 * rd points at the domain that failed to register, so the walk
	 * starts at the entry before it and runs back to rp->domains[0].
	 */
	while (--rd >= rp->domains) {
		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
		powercap_unregister_zone(rp->priv->control_type,
					 &rd->power_zone);
	}

	return ret;
}

/*
 * Probe whether @domain is usable on @cpu for package @rp.
 *
 * Returns 0 when the domain's status register can be read and is non-zero,
 * -EINVAL for an unknown domain id, and -ENODEV when the read fails or the
 * masked value is zero.
 */
static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
{
	struct reg_action ra;

	switch (domain) {
	case RAPL_DOMAIN_PACKAGE:
	case RAPL_DOMAIN_PP0:
	case RAPL_DOMAIN_PP1:
	case RAPL_DOMAIN_DRAM:
	case RAPL_DOMAIN_PLATFORM:
		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
		break;
	default:
		pr_err("invalid domain id %d\n", domain);
		return -EINVAL;
	}
	/*
	 * Make sure the domain counters are available and contain non-zero
	 * values, otherwise skip the domain.
	 */

	ra.mask = ENERGY_STATUS_MASK;
	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
		return -ENODEV;

	return 0;
}

/*
 * Check if power limits are available. Two cases when they are not available:
 * 1. Locked by BIOS, in this case we still provide read-only access so that
 *    users can see what limit is set by the BIOS.
 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
 *    exist at all. In this case, we do not show the constraints in powercap.
 *
 * Called after domains are detected and initialized.
 */
static void rapl_detect_powerlimit(struct rapl_domain *rd)
{
	u64 val64;
	int i;

	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
		if (val64) {
			pr_info("RAPL %s domain %s locked by BIOS\n",
				rd->rp->name, rd->name);
			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
		}
	}
	/* check if power limit MSR exists, otherwise domain is monitoring only */
	for (i = 0; i < NR_POWER_LIMITS; i++) {
		int prim = rd->rpl[i].prim_id;

		/*
		 * NOTE(review): a NULL name presumably marks this limit as
		 * absent for the code that counts power limits - confirm
		 * against find_nr_power_limit().
		 */
		if (rapl_read_data_raw(rd, prim, false, &val64))
			rd->rpl[i].name = NULL;
	}
}

/*
 * Detect active and valid domains for the given CPU, caller must
 * ensure the CPU belongs to the targeted package and CPU hotplug is disabled.
 *
 * Fills rp->domain_map and rp->nr_domains, allocates rp->domains and
 * initializes it. Returns 0 on success, -ENODEV when no domain passes
 * rapl_check_domain(), or -ENOMEM when the allocation fails.
 */
static int rapl_detect_domains(struct rapl_package *rp, int cpu)
{
	struct rapl_domain *rd;
	int i;

	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
		/* use physical package id to read counters */
		if (!rapl_check_domain(cpu, i, rp)) {
			rp->domain_map |= 1 << i;
			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
		}
	}
	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
	if (!rp->nr_domains) {
		pr_debug("no valid rapl domains found in %s\n", rp->name);
		return -ENODEV;
	}
	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);

	/*
	 * NOTE(review): one entry more than nr_domains is allocated; the
	 * reason is not visible in this function - confirm before changing.
	 */
	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
			      GFP_KERNEL);
	if (!rp->domains)
		return -ENOMEM;

	rapl_init_domains(rp);

	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
		rapl_detect_powerlimit(rd);

	return 0;
}

/*
 * Tear down a package: called from CPU hotplug notifier, hotplug lock held.
 *
 * For every domain the PL1 enable and clamp bits are cleared, and for
 * domains with more than one power limit the PL2 enable/clamp and PL4 enable
 * bits as well. Child zones are unregistered inside the loop; the package
 * zone is unregistered last. Finally @rp is removed from the package list
 * and freed.
 *
 * NOTE(review): rp->domains is not freed here; presumably it is released
 * elsewhere (e.g. by the zone release path) - confirm. The code also
 * assumes a package domain exists, since rd_package is dereferenced
 * unconditionally.
 */
void rapl_remove_package(struct rapl_package *rp)
{
	struct rapl_domain *rd, *rd_package = NULL;

	package_power_limit_irq_restore(rp);

	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		rapl_write_data_raw(rd, PL1_ENABLE, 0);
		rapl_write_data_raw(rd, PL1_CLAMP, 0);
		if (find_nr_power_limit(rd) > 1) {
			rapl_write_data_raw(rd, PL2_ENABLE, 0);
			rapl_write_data_raw(rd, PL2_CLAMP, 0);
			rapl_write_data_raw(rd, PL4_ENABLE, 0);
		}
		/* remember the package domain, it is unregistered after the loop */
		if (rd->id == RAPL_DOMAIN_PACKAGE) {
			rd_package = rd;
			continue;
		}
		pr_debug("remove package, undo power limit on %s: %s\n",
			 rp->name, rd->name);
		powercap_unregister_zone(rp->priv->control_type,
					 &rd->power_zone);
	}
	/* do parent zone last */
	powercap_unregister_zone(rp->priv->control_type,
				 &rd_package->power_zone);
	list_del(&rp->plist);
	kfree(rp);
}
EXPORT_SYMBOL_GPL(rapl_remove_package);

/*
 * Look up the registered package that @cpu belongs to.
 *
 * A package matches when its id equals the logical die id of @cpu and it
 * was added with the same control type as @priv. Returns the package, or
 * NULL when none matches.
 *
 * caller to ensure CPU hotplug lock is held
 */
struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
{
	int id = topology_logical_die_id(cpu);
	struct rapl_package *rp;

	list_for_each_entry(rp, &rapl_packages, plist) {
		if (rp->id == id
		    && rp->priv->control_type == priv->control_type)
			return rp;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(rapl_find_package_domain);

/*
 * Create, probe and register the package that @cpu belongs to.
 *
 * called from CPU hotplug notifier, hotplug lock held
 *
 * Returns the new package on success. On failure an ERR_PTR is returned:
 * -ENODEV when no rapl_defaults are installed or when domain detection or
 * the unit check fails, -ENOMEM when the package cannot be allocated, or
 * the error from rapl_package_register_powercap().
 */
struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
{
	int id = topology_logical_die_id(cpu);
	struct rapl_package *rp;
	int ret;

	if (!rapl_defaults)
		return ERR_PTR(-ENODEV);

	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
	if (!rp)
		return ERR_PTR(-ENOMEM);

	/* add the new package to the list */
	rp->id = id;
	rp->lead_cpu = cpu;
	rp->priv = priv;

	/* include the die id in the name only when a package has several dies */
	if (topology_max_die_per_package() > 1)
		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
			 "package-%d-die-%d",
			 topology_physical_package_id(cpu), topology_die_id(cpu));
	else
		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
			 topology_physical_package_id(cpu));

	/*
	 * check if the package contains valid domains
	 *
	 * Any failure here is reported as -ENODEV, whatever error code
	 * rapl_detect_domains() or check_unit() returned.
	 */
	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
		ret = -ENODEV;
		goto err_free_package;
	}
	ret = rapl_package_register_powercap(rp);
	if (!ret) {
		INIT_LIST_HEAD(&rp->plist);
		list_add(&rp->plist, &rapl_packages);
		return rp;
	}

err_free_package:
	kfree(rp->domains);
	kfree(rp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(rapl_add_package);

/*
 * Remember the current power limits of every registered package domain.
 *
 * Runs with the CPU hotplug read lock held. For each package that has a
 * power zone, the domain embedding that zone is looked up and, for each of
 * its power limits, the matching POWER_LIMITn primitive is read into
 * last_power_limit. A failed read stores 0, which
 * power_limit_state_restore() treats as "nothing to restore".
 */
static void power_limit_state_save(void)
{
	struct rapl_package *rp;
	struct rapl_domain *rd;
	int nr_pl, ret, i;

	cpus_read_lock();
	list_for_each_entry(rp, &rapl_packages, plist) {
		if (!rp->power_zone)
			continue;
		rd = power_zone_to_rapl_domain(rp->power_zone);
		nr_pl = find_nr_power_limit(rd);
		for (i = 0; i < nr_pl; i++) {
			/* prim_id identifies which limit slot i describes */
			switch (rd->rpl[i].prim_id) {
			case PL1_ENABLE:
				ret = rapl_read_data_raw(rd,
						 POWER_LIMIT1, true,
						 &rd->rpl[i].last_power_limit);
				if (ret)
					rd->rpl[i].last_power_limit = 0;
				break;
			case PL2_ENABLE:
				ret = rapl_read_data_raw(rd,
						 POWER_LIMIT2, true,
						 &rd->rpl[i].last_power_limit);
				if (ret)
					rd->rpl[i].last_power_limit = 0;
				break;
			case PL4_ENABLE:
				ret = rapl_read_data_raw(rd,
						 POWER_LIMIT4, true,
						 &rd->rpl[i].last_power_limit);
				if (ret)
					rd->rpl[i].last_power_limit = 0;
				break;
			}
		}
	}
	cpus_read_unlock();
}

/*
 * Write back the power limits recorded by power_limit_state_save().
 *
 * Runs with the CPU hotplug read lock held. A limit whose saved value is 0
 * is skipped.
 */
static void power_limit_state_restore(void)
{
	struct rapl_package *rp;
	struct rapl_domain *rd;
	int nr_pl, i;

	cpus_read_lock();
	list_for_each_entry(rp, &rapl_packages, plist) {
		if (!rp->power_zone)
			continue;
		rd = power_zone_to_rapl_domain(rp->power_zone);
		nr_pl = find_nr_power_limit(rd);
		for (i = 0; i < nr_pl; i++) {
			switch (rd->rpl[i].prim_id) {
			case PL1_ENABLE:
				if (rd->rpl[i].last_power_limit)
					rapl_write_data_raw(rd, POWER_LIMIT1,
					    rd->rpl[i].last_power_limit);
				break;
			case PL2_ENABLE:
				if (rd->rpl[i].last_power_limit)
					rapl_write_data_raw(rd, POWER_LIMIT2,
					    rd->rpl[i].last_power_limit);
				break;
			case PL4_ENABLE:
				if (rd->rpl[i].last_power_limit)
					rapl_write_data_raw(rd, POWER_LIMIT4,
					    rd->rpl[i].last_power_limit);
				break;
			}
		}
	}
	cpus_read_unlock();
}

/*
 * PM notifier: save the power limits when a suspend is being prepared and
 * restore them after the suspend has finished. Other events are ignored.
 * Always returns NOTIFY_OK.
 */
static int rapl_pm_callback(struct notifier_block *nb,
			    unsigned long mode, void *_unused)
{
	switch (mode) {
	case PM_SUSPEND_PREPARE:
		power_limit_state_save();
		break;
	case PM_POST_SUSPEND:
		power_limit_state_restore();
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block rapl_pm_notifier = {
	.notifier_call = rapl_pm_callback,
};

/* Platform device created in rapl_init() and removed in rapl_exit(). */
static struct platform_device *rapl_msr_platdev;

/*
 * Driver init: match the boot CPU against rapl_ids[], install the matching
 * rapl_defaults, register the PM notifier and create the "intel_rapl_msr"
 * platform device.
 *
 * Returns 0 on success, -ENODEV for an unsupported CPU, -ENOMEM when the
 * platform device cannot be allocated, or the error from
 * register_pm_notifier() / platform_device_add().
 */
static int __init rapl_init(void)
{
	const struct x86_cpu_id *id;
	int ret;

	id = x86_match_cpu(rapl_ids);
	if (!id) {
		pr_err("driver does not support CPU family %d model %d\n",
		       boot_cpu_data.x86, boot_cpu_data.x86_model);

		return -ENODEV;
	}

	rapl_defaults = (struct rapl_defaults *)id->driver_data;

	ret = register_pm_notifier(&rapl_pm_notifier);
	if (ret)
		return ret;

	rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
	if (!rapl_msr_platdev) {
		ret = -ENOMEM;
		goto end;
	}

	/* a device that was allocated but not added is dropped with _put() */
	ret = platform_device_add(rapl_msr_platdev);
	if (ret)
		platform_device_put(rapl_msr_platdev);

end:
	/* reached on success too; the notifier is only undone on failure */
	if (ret)
		unregister_pm_notifier(&rapl_pm_notifier);

	return ret;
}

/* Driver exit: undo rapl_init() in reverse order. */
static void __exit rapl_exit(void)
{
	platform_device_unregister(rapl_msr_platdev);
	unregister_pm_notifier(&rapl_pm_notifier);
}

fs_initcall(rapl_init);
module_exit(rapl_exit);

MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
MODULE_LICENSE("GPL v2");