1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Common code for Intel Running Average Power Limit (RAPL) support. 4 * Copyright (c) 2019, Intel Corporation. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/kernel.h> 9 #include <linux/module.h> 10 #include <linux/list.h> 11 #include <linux/types.h> 12 #include <linux/device.h> 13 #include <linux/slab.h> 14 #include <linux/log2.h> 15 #include <linux/bitmap.h> 16 #include <linux/delay.h> 17 #include <linux/sysfs.h> 18 #include <linux/cpu.h> 19 #include <linux/powercap.h> 20 #include <linux/suspend.h> 21 #include <linux/intel_rapl.h> 22 #include <linux/processor.h> 23 #include <linux/platform_device.h> 24 25 #include <asm/iosf_mbi.h> 26 #include <asm/cpu_device_id.h> 27 #include <asm/intel-family.h> 28 29 /* bitmasks for RAPL MSRs, used by primitive access functions */ 30 #define ENERGY_STATUS_MASK 0xffffffff 31 32 #define POWER_LIMIT1_MASK 0x7FFF 33 #define POWER_LIMIT1_ENABLE BIT(15) 34 #define POWER_LIMIT1_CLAMP BIT(16) 35 36 #define POWER_LIMIT2_MASK (0x7FFFULL<<32) 37 #define POWER_LIMIT2_ENABLE BIT_ULL(47) 38 #define POWER_LIMIT2_CLAMP BIT_ULL(48) 39 #define POWER_HIGH_LOCK BIT_ULL(63) 40 #define POWER_LOW_LOCK BIT(31) 41 42 #define POWER_LIMIT4_MASK 0x1FFF 43 44 #define TIME_WINDOW1_MASK (0x7FULL<<17) 45 #define TIME_WINDOW2_MASK (0x7FULL<<49) 46 47 #define POWER_UNIT_OFFSET 0 48 #define POWER_UNIT_MASK 0x0F 49 50 #define ENERGY_UNIT_OFFSET 0x08 51 #define ENERGY_UNIT_MASK 0x1F00 52 53 #define TIME_UNIT_OFFSET 0x10 54 #define TIME_UNIT_MASK 0xF0000 55 56 #define POWER_INFO_MAX_MASK (0x7fffULL<<32) 57 #define POWER_INFO_MIN_MASK (0x7fffULL<<16) 58 #define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) 59 #define POWER_INFO_THERMAL_SPEC_MASK 0x7fff 60 61 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff 62 #define PP_POLICY_MASK 0x1F 63 64 /* Non HW constants */ 65 #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ 66 #define RAPL_PRIMITIVE_DUMMY BIT(2) 67 68 #define TIME_WINDOW_MAX_MSEC 40000 69 #define TIME_WINDOW_MIN_MSEC 250 70 #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ 71 enum unit_type { 72 ARBITRARY_UNIT, /* no translation */ 73 POWER_UNIT, 74 ENERGY_UNIT, 75 TIME_UNIT, 76 }; 77 78 /* per domain data, some are optional */ 79 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) 80 81 #define DOMAIN_STATE_INACTIVE BIT(0) 82 #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) 83 #define DOMAIN_STATE_BIOS_LOCKED BIT(2) 84 85 static const char pl1_name[] = "long_term"; 86 static const char pl2_name[] = "short_term"; 87 static const char pl4_name[] = "peak_power"; 88 89 #define power_zone_to_rapl_domain(_zone) \ 90 container_of(_zone, struct rapl_domain, power_zone) 91 92 struct rapl_defaults { 93 u8 floor_freq_reg_addr; 94 int (*check_unit)(struct rapl_package *rp, int cpu); 95 void (*set_floor_freq)(struct rapl_domain *rd, bool mode); 96 u64 (*compute_time_window)(struct rapl_package *rp, u64 val, 97 bool to_raw); 98 unsigned int dram_domain_energy_unit; 99 unsigned int psys_domain_energy_unit; 100 }; 101 static struct rapl_defaults *rapl_defaults; 102 103 /* Sideband MBI registers */ 104 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2) 105 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf) 106 107 #define PACKAGE_PLN_INT_SAVED BIT(0) 108 #define MAX_PRIM_NAME (32) 109 110 /* per domain data. used to describe individual knobs such that access function 111 * can be consolidated into one instead of many inline functions. 112 */ 113 struct rapl_primitive_info { 114 const char *name; 115 u64 mask; 116 int shift; 117 enum rapl_domain_reg_id id; 118 enum unit_type unit; 119 u32 flag; 120 }; 121 122 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ 123 .name = #p, \ 124 .mask = m, \ 125 .shift = s, \ 126 .id = i, \ 127 .unit = u, \ 128 .flag = f \ 129 } 130 131 static void rapl_init_domains(struct rapl_package *rp); 132 static int rapl_read_data_raw(struct rapl_domain *rd, 133 enum rapl_primitives prim, 134 bool xlate, u64 *data); 135 static int rapl_write_data_raw(struct rapl_domain *rd, 136 enum rapl_primitives prim, 137 unsigned long long value); 138 static u64 rapl_unit_xlate(struct rapl_domain *rd, 139 enum unit_type type, u64 value, int to_raw); 140 static void package_power_limit_irq_save(struct rapl_package *rp); 141 142 static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */ 143 144 static const char *const rapl_domain_names[] = { 145 "package", 146 "core", 147 "uncore", 148 "dram", 149 "psys", 150 }; 151 152 static int get_energy_counter(struct powercap_zone *power_zone, 153 u64 *energy_raw) 154 { 155 struct rapl_domain *rd; 156 u64 energy_now; 157 158 /* prevent CPU hotplug, make sure the RAPL domain does not go 159 * away while reading the counter. 160 */ 161 cpus_read_lock(); 162 rd = power_zone_to_rapl_domain(power_zone); 163 164 if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { 165 *energy_raw = energy_now; 166 cpus_read_unlock(); 167 168 return 0; 169 } 170 cpus_read_unlock(); 171 172 return -EIO; 173 } 174 175 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy) 176 { 177 struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev); 178 179 *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); 180 return 0; 181 } 182 183 static int release_zone(struct powercap_zone *power_zone) 184 { 185 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 186 struct rapl_package *rp = rd->rp; 187 188 /* package zone is the last zone of a package, we can free 189 * memory here since all children has been unregistered. 190 */ 191 if (rd->id == RAPL_DOMAIN_PACKAGE) { 192 kfree(rd); 193 rp->domains = NULL; 194 } 195 196 return 0; 197 198 } 199 200 static int find_nr_power_limit(struct rapl_domain *rd) 201 { 202 int i, nr_pl = 0; 203 204 for (i = 0; i < NR_POWER_LIMITS; i++) { 205 if (rd->rpl[i].name) 206 nr_pl++; 207 } 208 209 return nr_pl; 210 } 211 212 static int set_domain_enable(struct powercap_zone *power_zone, bool mode) 213 { 214 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 215 216 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) 217 return -EACCES; 218 219 cpus_read_lock(); 220 rapl_write_data_raw(rd, PL1_ENABLE, mode); 221 if (rapl_defaults->set_floor_freq) 222 rapl_defaults->set_floor_freq(rd, mode); 223 cpus_read_unlock(); 224 225 return 0; 226 } 227 228 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode) 229 { 230 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 231 u64 val; 232 233 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 234 *mode = false; 235 return 0; 236 } 237 cpus_read_lock(); 238 if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) { 239 cpus_read_unlock(); 240 return -EIO; 241 } 242 *mode = val; 243 cpus_read_unlock(); 244 245 return 0; 246 } 247 248 /* per RAPL domain ops, in the order of rapl_domain_type */ 249 static const struct powercap_zone_ops zone_ops[] = { 250 /* RAPL_DOMAIN_PACKAGE */ 251 { 252 .get_energy_uj = get_energy_counter, 253 .get_max_energy_range_uj = get_max_energy_counter, 254 .release = release_zone, 255 .set_enable = set_domain_enable, 256 .get_enable = get_domain_enable, 257 }, 258 /* RAPL_DOMAIN_PP0 */ 259 { 260 .get_energy_uj = get_energy_counter, 261 .get_max_energy_range_uj = get_max_energy_counter, 262 .release = release_zone, 263 .set_enable = set_domain_enable, 264 .get_enable = get_domain_enable, 265 }, 266 /* RAPL_DOMAIN_PP1 */ 267 { 268 .get_energy_uj = get_energy_counter, 269 .get_max_energy_range_uj = get_max_energy_counter, 270 .release = release_zone, 271 .set_enable = set_domain_enable, 272 .get_enable = get_domain_enable, 273 }, 274 /* RAPL_DOMAIN_DRAM */ 275 { 276 .get_energy_uj = get_energy_counter, 277 .get_max_energy_range_uj = get_max_energy_counter, 278 .release = release_zone, 279 .set_enable = set_domain_enable, 280 .get_enable = get_domain_enable, 281 }, 282 /* RAPL_DOMAIN_PLATFORM */ 283 { 284 .get_energy_uj = get_energy_counter, 285 .get_max_energy_range_uj = get_max_energy_counter, 286 .release = release_zone, 287 .set_enable = set_domain_enable, 288 .get_enable = get_domain_enable, 289 }, 290 }; 291 292 /* 293 * Constraint index used by powercap can be different than power limit (PL) 294 * index in that some PLs maybe missing due to non-existent MSRs. So we 295 * need to convert here by finding the valid PLs only (name populated). 296 */ 297 static int contraint_to_pl(struct rapl_domain *rd, int cid) 298 { 299 int i, j; 300 301 for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) { 302 if ((rd->rpl[i].name) && j++ == cid) { 303 pr_debug("%s: index %d\n", __func__, i); 304 return i; 305 } 306 } 307 pr_err("Cannot find matching power limit for constraint %d\n", cid); 308 309 return -EINVAL; 310 } 311 312 static int set_power_limit(struct powercap_zone *power_zone, int cid, 313 u64 power_limit) 314 { 315 struct rapl_domain *rd; 316 struct rapl_package *rp; 317 int ret = 0; 318 int id; 319 320 cpus_read_lock(); 321 rd = power_zone_to_rapl_domain(power_zone); 322 id = contraint_to_pl(rd, cid); 323 if (id < 0) { 324 ret = id; 325 goto set_exit; 326 } 327 328 rp = rd->rp; 329 330 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 331 dev_warn(&power_zone->dev, 332 "%s locked by BIOS, monitoring only\n", rd->name); 333 ret = -EACCES; 334 goto set_exit; 335 } 336 337 switch (rd->rpl[id].prim_id) { 338 case PL1_ENABLE: 339 rapl_write_data_raw(rd, POWER_LIMIT1, power_limit); 340 break; 341 case PL2_ENABLE: 342 rapl_write_data_raw(rd, POWER_LIMIT2, power_limit); 343 break; 344 case PL4_ENABLE: 345 rapl_write_data_raw(rd, POWER_LIMIT4, power_limit); 346 break; 347 default: 348 ret = -EINVAL; 349 } 350 if (!ret) 351 package_power_limit_irq_save(rp); 352 set_exit: 353 cpus_read_unlock(); 354 return ret; 355 } 356 357 static int get_current_power_limit(struct powercap_zone *power_zone, int cid, 358 u64 *data) 359 { 360 struct rapl_domain *rd; 361 u64 val; 362 int prim; 363 int ret = 0; 364 int id; 365 366 cpus_read_lock(); 367 rd = power_zone_to_rapl_domain(power_zone); 368 id = contraint_to_pl(rd, cid); 369 if (id < 0) { 370 ret = id; 371 goto get_exit; 372 } 373 374 switch (rd->rpl[id].prim_id) { 375 case PL1_ENABLE: 376 prim = POWER_LIMIT1; 377 break; 378 case PL2_ENABLE: 379 prim = POWER_LIMIT2; 380 break; 381 case PL4_ENABLE: 382 prim = POWER_LIMIT4; 383 break; 384 default: 385 cpus_read_unlock(); 386 return -EINVAL; 387 } 388 if (rapl_read_data_raw(rd, prim, true, &val)) 389 ret = -EIO; 390 else 391 *data = val; 392 393 get_exit: 394 cpus_read_unlock(); 395 396 return ret; 397 } 398 399 static int set_time_window(struct powercap_zone *power_zone, int cid, 400 u64 window) 401 { 402 struct rapl_domain *rd; 403 int ret = 0; 404 int id; 405 406 cpus_read_lock(); 407 rd = power_zone_to_rapl_domain(power_zone); 408 id = contraint_to_pl(rd, cid); 409 if (id < 0) { 410 ret = id; 411 goto set_time_exit; 412 } 413 414 switch (rd->rpl[id].prim_id) { 415 case PL1_ENABLE: 416 rapl_write_data_raw(rd, TIME_WINDOW1, window); 417 break; 418 case PL2_ENABLE: 419 rapl_write_data_raw(rd, TIME_WINDOW2, window); 420 break; 421 default: 422 ret = -EINVAL; 423 } 424 425 set_time_exit: 426 cpus_read_unlock(); 427 return ret; 428 } 429 430 static int get_time_window(struct powercap_zone *power_zone, int cid, 431 u64 *data) 432 { 433 struct rapl_domain *rd; 434 u64 val; 435 int ret = 0; 436 int id; 437 438 cpus_read_lock(); 439 rd = power_zone_to_rapl_domain(power_zone); 440 id = contraint_to_pl(rd, cid); 441 if (id < 0) { 442 ret = id; 443 goto get_time_exit; 444 } 445 446 switch (rd->rpl[id].prim_id) { 447 case PL1_ENABLE: 448 ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val); 449 break; 450 case PL2_ENABLE: 451 ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val); 452 break; 453 case PL4_ENABLE: 454 /* 455 * Time window parameter is not applicable for PL4 entry 456 * so assigining '0' as default value. 457 */ 458 val = 0; 459 break; 460 default: 461 cpus_read_unlock(); 462 return -EINVAL; 463 } 464 if (!ret) 465 *data = val; 466 467 get_time_exit: 468 cpus_read_unlock(); 469 470 return ret; 471 } 472 473 static const char *get_constraint_name(struct powercap_zone *power_zone, 474 int cid) 475 { 476 struct rapl_domain *rd; 477 int id; 478 479 rd = power_zone_to_rapl_domain(power_zone); 480 id = contraint_to_pl(rd, cid); 481 if (id >= 0) 482 return rd->rpl[id].name; 483 484 return NULL; 485 } 486 487 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data) 488 { 489 struct rapl_domain *rd; 490 u64 val; 491 int prim; 492 int ret = 0; 493 494 cpus_read_lock(); 495 rd = power_zone_to_rapl_domain(power_zone); 496 switch (rd->rpl[id].prim_id) { 497 case PL1_ENABLE: 498 prim = THERMAL_SPEC_POWER; 499 break; 500 case PL2_ENABLE: 501 prim = MAX_POWER; 502 break; 503 case PL4_ENABLE: 504 prim = MAX_POWER; 505 break; 506 default: 507 cpus_read_unlock(); 508 return -EINVAL; 509 } 510 if (rapl_read_data_raw(rd, prim, true, &val)) 511 ret = -EIO; 512 else 513 *data = val; 514 515 /* As a generalization rule, PL4 would be around two times PL2. */ 516 if (rd->rpl[id].prim_id == PL4_ENABLE) 517 *data = *data * 2; 518 519 cpus_read_unlock(); 520 521 return ret; 522 } 523 524 static const struct powercap_zone_constraint_ops constraint_ops = { 525 .set_power_limit_uw = set_power_limit, 526 .get_power_limit_uw = get_current_power_limit, 527 .set_time_window_us = set_time_window, 528 .get_time_window_us = get_time_window, 529 .get_max_power_uw = get_max_power, 530 .get_name = get_constraint_name, 531 }; 532 533 /* called after domain detection and package level data are set */ 534 static void rapl_init_domains(struct rapl_package *rp) 535 { 536 enum rapl_domain_type i; 537 enum rapl_domain_reg_id j; 538 struct rapl_domain *rd = rp->domains; 539 540 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 541 unsigned int mask = rp->domain_map & (1 << i); 542 543 if (!mask) 544 continue; 545 546 rd->rp = rp; 547 548 if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) { 549 snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d", 550 topology_physical_package_id(rp->lead_cpu)); 551 } else 552 snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s", 553 rapl_domain_names[i]); 554 555 rd->id = i; 556 rd->rpl[0].prim_id = PL1_ENABLE; 557 rd->rpl[0].name = pl1_name; 558 559 /* 560 * The PL2 power domain is applicable for limits two 561 * and limits three 562 */ 563 if (rp->priv->limits[i] >= 2) { 564 rd->rpl[1].prim_id = PL2_ENABLE; 565 rd->rpl[1].name = pl2_name; 566 } 567 568 /* Enable PL4 domain if the total power limits are three */ 569 if (rp->priv->limits[i] == 3) { 570 rd->rpl[2].prim_id = PL4_ENABLE; 571 rd->rpl[2].name = pl4_name; 572 } 573 574 for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++) 575 rd->regs[j] = rp->priv->regs[i][j]; 576 577 switch (i) { 578 case RAPL_DOMAIN_DRAM: 579 rd->domain_energy_unit = 580 rapl_defaults->dram_domain_energy_unit; 581 if (rd->domain_energy_unit) 582 pr_info("DRAM domain energy unit %dpj\n", 583 rd->domain_energy_unit); 584 break; 585 case RAPL_DOMAIN_PLATFORM: 586 rd->domain_energy_unit = 587 rapl_defaults->psys_domain_energy_unit; 588 if (rd->domain_energy_unit) 589 pr_info("Platform domain energy unit %dpj\n", 590 rd->domain_energy_unit); 591 break; 592 default: 593 break; 594 } 595 rd++; 596 } 597 } 598 599 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, 600 u64 value, int to_raw) 601 { 602 u64 units = 1; 603 struct rapl_package *rp = rd->rp; 604 u64 scale = 1; 605 606 switch (type) { 607 case POWER_UNIT: 608 units = rp->power_unit; 609 break; 610 case ENERGY_UNIT: 611 scale = ENERGY_UNIT_SCALE; 612 /* per domain unit takes precedence */ 613 if (rd->domain_energy_unit) 614 units = rd->domain_energy_unit; 615 else 616 units = rp->energy_unit; 617 break; 618 case TIME_UNIT: 619 return rapl_defaults->compute_time_window(rp, value, to_raw); 620 case ARBITRARY_UNIT: 621 default: 622 return value; 623 } 624 625 if (to_raw) 626 return div64_u64(value, units) * scale; 627 628 value *= units; 629 630 return div64_u64(value, scale); 631 } 632 633 /* in the order of enum rapl_primitives */ 634 static struct rapl_primitive_info rpi[] = { 635 /* name, mask, shift, msr index, unit divisor */ 636 PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, 637 RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), 638 PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, 639 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 640 PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, 641 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 642 PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0, 643 RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), 644 PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, 645 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 646 PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, 647 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 648 PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, 649 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 650 PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, 651 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 652 PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, 653 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 654 PRIMITIVE_INFO_INIT(PL4_ENABLE, POWER_LIMIT4_MASK, 0, 655 RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), 656 PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, 657 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 658 PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, 659 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 660 PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK, 661 0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 662 PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, 663 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 664 PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, 665 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 666 PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48, 667 RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), 668 PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, 669 RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), 670 PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, 671 RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), 672 /* non-hardware */ 673 PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, 674 RAPL_PRIMITIVE_DERIVED), 675 {NULL, 0, 0, 0}, 676 }; 677 678 /* Read primitive data based on its related struct rapl_primitive_info. 679 * if xlate flag is set, return translated data based on data units, i.e. 680 * time, energy, and power. 681 * RAPL MSRs are non-architectual and are laid out not consistently across 682 * domains. Here we use primitive info to allow writing consolidated access 683 * functions. 684 * For a given primitive, it is processed by MSR mask and shift. Unit conversion 685 * is pre-assigned based on RAPL unit MSRs read at init time. 686 * 63-------------------------- 31--------------------------- 0 687 * | xxxxx (mask) | 688 * | |<- shift ----------------| 689 * 63-------------------------- 31--------------------------- 0 690 */ 691 static int rapl_read_data_raw(struct rapl_domain *rd, 692 enum rapl_primitives prim, bool xlate, u64 *data) 693 { 694 u64 value; 695 struct rapl_primitive_info *rp = &rpi[prim]; 696 struct reg_action ra; 697 int cpu; 698 699 if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY) 700 return -EINVAL; 701 702 ra.reg = rd->regs[rp->id]; 703 if (!ra.reg) 704 return -EINVAL; 705 706 cpu = rd->rp->lead_cpu; 707 708 /* domain with 2 limits has different bit */ 709 if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) { 710 rp->mask = POWER_HIGH_LOCK; 711 rp->shift = 63; 712 } 713 /* non-hardware data are collected by the polling thread */ 714 if (rp->flag & RAPL_PRIMITIVE_DERIVED) { 715 *data = rd->rdd.primitives[prim]; 716 return 0; 717 } 718 719 ra.mask = rp->mask; 720 721 if (rd->rp->priv->read_raw(cpu, &ra)) { 722 pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu); 723 return -EIO; 724 } 725 726 value = ra.value >> rp->shift; 727 728 if (xlate) 729 *data = rapl_unit_xlate(rd, rp->unit, value, 0); 730 else 731 *data = value; 732 733 return 0; 734 } 735 736 /* Similar use of primitive info in the read counterpart */ 737 static int rapl_write_data_raw(struct rapl_domain *rd, 738 enum rapl_primitives prim, 739 unsigned long long value) 740 { 741 struct rapl_primitive_info *rp = &rpi[prim]; 742 int cpu; 743 u64 bits; 744 struct reg_action ra; 745 int ret; 746 747 cpu = rd->rp->lead_cpu; 748 bits = rapl_unit_xlate(rd, rp->unit, value, 1); 749 bits <<= rp->shift; 750 bits &= rp->mask; 751 752 memset(&ra, 0, sizeof(ra)); 753 754 ra.reg = rd->regs[rp->id]; 755 ra.mask = rp->mask; 756 ra.value = bits; 757 758 ret = rd->rp->priv->write_raw(cpu, &ra); 759 760 return ret; 761 } 762 763 /* 764 * Raw RAPL data stored in MSRs are in certain scales. We need to 765 * convert them into standard units based on the units reported in 766 * the RAPL unit MSRs. This is specific to CPUs as the method to 767 * calculate units differ on different CPUs. 768 * We convert the units to below format based on CPUs. 769 * i.e. 770 * energy unit: picoJoules : Represented in picoJoules by default 771 * power unit : microWatts : Represented in milliWatts by default 772 * time unit : microseconds: Represented in seconds by default 773 */ 774 static int rapl_check_unit_core(struct rapl_package *rp, int cpu) 775 { 776 struct reg_action ra; 777 u32 value; 778 779 ra.reg = rp->priv->reg_unit; 780 ra.mask = ~0; 781 if (rp->priv->read_raw(cpu, &ra)) { 782 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 783 rp->priv->reg_unit, cpu); 784 return -ENODEV; 785 } 786 787 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 788 rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value); 789 790 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 791 rp->power_unit = 1000000 / (1 << value); 792 793 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 794 rp->time_unit = 1000000 / (1 << value); 795 796 pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n", 797 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 798 799 return 0; 800 } 801 802 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) 803 { 804 struct reg_action ra; 805 u32 value; 806 807 ra.reg = rp->priv->reg_unit; 808 ra.mask = ~0; 809 if (rp->priv->read_raw(cpu, &ra)) { 810 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 811 rp->priv->reg_unit, cpu); 812 return -ENODEV; 813 } 814 815 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 816 rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value; 817 818 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 819 rp->power_unit = (1 << value) * 1000; 820 821 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 822 rp->time_unit = 1000000 / (1 << value); 823 824 pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n", 825 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 826 827 return 0; 828 } 829 830 static void power_limit_irq_save_cpu(void *info) 831 { 832 u32 l, h = 0; 833 struct rapl_package *rp = (struct rapl_package *)info; 834 835 /* save the state of PLN irq mask bit before disabling it */ 836 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 837 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) { 838 rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE; 839 rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED; 840 } 841 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 842 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 843 } 844 845 /* REVISIT: 846 * When package power limit is set artificially low by RAPL, LVT 847 * thermal interrupt for package power limit should be ignored 848 * since we are not really exceeding the real limit. The intention 849 * is to avoid excessive interrupts while we are trying to save power. 850 * A useful feature might be routing the package_power_limit interrupt 851 * to userspace via eventfd. once we have a usecase, this is simple 852 * to do by adding an atomic notifier. 853 */ 854 855 static void package_power_limit_irq_save(struct rapl_package *rp) 856 { 857 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 858 return; 859 860 smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); 861 } 862 863 /* 864 * Restore per package power limit interrupt enable state. Called from cpu 865 * hotplug code on package removal. 866 */ 867 static void package_power_limit_irq_restore(struct rapl_package *rp) 868 { 869 u32 l, h; 870 871 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 872 return; 873 874 /* irq enable state not saved, nothing to restore */ 875 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) 876 return; 877 878 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 879 880 if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE) 881 l |= PACKAGE_THERM_INT_PLN_ENABLE; 882 else 883 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 884 885 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 886 } 887 888 static void set_floor_freq_default(struct rapl_domain *rd, bool mode) 889 { 890 int nr_powerlimit = find_nr_power_limit(rd); 891 892 /* always enable clamp such that p-state can go below OS requested 893 * range. power capping priority over guranteed frequency. 894 */ 895 rapl_write_data_raw(rd, PL1_CLAMP, mode); 896 897 /* some domains have pl2 */ 898 if (nr_powerlimit > 1) { 899 rapl_write_data_raw(rd, PL2_ENABLE, mode); 900 rapl_write_data_raw(rd, PL2_CLAMP, mode); 901 } 902 } 903 904 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) 905 { 906 static u32 power_ctrl_orig_val; 907 u32 mdata; 908 909 if (!rapl_defaults->floor_freq_reg_addr) { 910 pr_err("Invalid floor frequency config register\n"); 911 return; 912 } 913 914 if (!power_ctrl_orig_val) 915 iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ, 916 rapl_defaults->floor_freq_reg_addr, 917 &power_ctrl_orig_val); 918 mdata = power_ctrl_orig_val; 919 if (enable) { 920 mdata &= ~(0x7f << 8); 921 mdata |= 1 << 8; 922 } 923 iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE, 924 rapl_defaults->floor_freq_reg_addr, mdata); 925 } 926 927 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value, 928 bool to_raw) 929 { 930 u64 f, y; /* fraction and exp. used for time unit */ 931 932 /* 933 * Special processing based on 2^Y*(1+F/4), refer 934 * to Intel Software Developer's manual Vol.3B: CH 14.9.3. 935 */ 936 if (!to_raw) { 937 f = (value & 0x60) >> 5; 938 y = value & 0x1f; 939 value = (1 << y) * (4 + f) * rp->time_unit / 4; 940 } else { 941 do_div(value, rp->time_unit); 942 y = ilog2(value); 943 f = div64_u64(4 * (value - (1 << y)), 1 << y); 944 value = (y & 0x1f) | ((f & 0x3) << 5); 945 } 946 return value; 947 } 948 949 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value, 950 bool to_raw) 951 { 952 /* 953 * Atom time unit encoding is straight forward val * time_unit, 954 * where time_unit is default to 1 sec. Never 0. 955 */ 956 if (!to_raw) 957 return (value) ? value *= rp->time_unit : rp->time_unit; 958 959 value = div64_u64(value, rp->time_unit); 960 961 return value; 962 } 963 964 static const struct rapl_defaults rapl_defaults_core = { 965 .floor_freq_reg_addr = 0, 966 .check_unit = rapl_check_unit_core, 967 .set_floor_freq = set_floor_freq_default, 968 .compute_time_window = rapl_compute_time_window_core, 969 }; 970 971 static const struct rapl_defaults rapl_defaults_hsw_server = { 972 .check_unit = rapl_check_unit_core, 973 .set_floor_freq = set_floor_freq_default, 974 .compute_time_window = rapl_compute_time_window_core, 975 .dram_domain_energy_unit = 15300, 976 }; 977 978 static const struct rapl_defaults rapl_defaults_spr_server = { 979 .check_unit = rapl_check_unit_core, 980 .set_floor_freq = set_floor_freq_default, 981 .compute_time_window = rapl_compute_time_window_core, 982 .dram_domain_energy_unit = 15300, 983 .psys_domain_energy_unit = 1000000000, 984 }; 985 986 static const struct rapl_defaults rapl_defaults_byt = { 987 .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, 988 .check_unit = rapl_check_unit_atom, 989 .set_floor_freq = set_floor_freq_atom, 990 .compute_time_window = rapl_compute_time_window_atom, 991 }; 992 993 static const struct rapl_defaults rapl_defaults_tng = { 994 .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG, 995 .check_unit = rapl_check_unit_atom, 996 .set_floor_freq = set_floor_freq_atom, 997 .compute_time_window = rapl_compute_time_window_atom, 998 }; 999 1000 static const struct rapl_defaults rapl_defaults_ann = { 1001 .floor_freq_reg_addr = 0, 1002 .check_unit = rapl_check_unit_atom, 1003 .set_floor_freq = NULL, 1004 .compute_time_window = rapl_compute_time_window_atom, 1005 }; 1006 1007 static const struct rapl_defaults rapl_defaults_cht = { 1008 .floor_freq_reg_addr = 0, 1009 .check_unit = rapl_check_unit_atom, 1010 .set_floor_freq = NULL, 1011 .compute_time_window = rapl_compute_time_window_atom, 1012 }; 1013 1014 static const struct rapl_defaults rapl_defaults_amd = { 1015 .check_unit = rapl_check_unit_core, 1016 }; 1017 1018 static const struct x86_cpu_id rapl_ids[] __initconst = { 1019 X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &rapl_defaults_core), 1020 X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &rapl_defaults_core), 1021 1022 X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &rapl_defaults_core), 1023 X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &rapl_defaults_core), 1024 1025 X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &rapl_defaults_core), 1026 X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &rapl_defaults_core), 1027 X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &rapl_defaults_core), 1028 X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &rapl_defaults_hsw_server), 1029 1030 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &rapl_defaults_core), 1031 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &rapl_defaults_core), 1032 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &rapl_defaults_core), 1033 X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &rapl_defaults_hsw_server), 1034 1035 X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &rapl_defaults_core), 1036 X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &rapl_defaults_core), 1037 X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &rapl_defaults_hsw_server), 1038 X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &rapl_defaults_core), 1039 X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &rapl_defaults_core), 1040 X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &rapl_defaults_core), 1041 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &rapl_defaults_core), 1042 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &rapl_defaults_core), 1043 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &rapl_defaults_core), 1044 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &rapl_defaults_hsw_server), 1045 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &rapl_defaults_hsw_server), 1046 X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &rapl_defaults_core), 1047 X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &rapl_defaults_core), 1048 X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &rapl_defaults_core), 1049 X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &rapl_defaults_core), 1050 X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core), 1051 X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core), 1052 X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &rapl_defaults_core), 1053 X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), 1054 X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core), 1055 1056 X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt), 1057 X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &rapl_defaults_cht), 1058 X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &rapl_defaults_tng), 1059 X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &rapl_defaults_ann), 1060 X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &rapl_defaults_core), 1061 X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &rapl_defaults_core), 1062 X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &rapl_defaults_core), 1063 X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &rapl_defaults_core), 1064 X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &rapl_defaults_core), 1065 X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &rapl_defaults_core), 1066 1067 X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &rapl_defaults_hsw_server), 1068 X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &rapl_defaults_hsw_server), 1069 1070 X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd), 1071 X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd), 1072 X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd), 1073 {} 1074 }; 1075 MODULE_DEVICE_TABLE(x86cpu, rapl_ids); 1076 1077 /* Read once for all raw primitive data for domains */ 1078 static void rapl_update_domain_data(struct rapl_package *rp) 1079 { 1080 int dmn, prim; 1081 u64 val; 1082 1083 for (dmn = 0; dmn < rp->nr_domains; dmn++) { 1084 pr_debug("update %s domain %s data\n", rp->name, 1085 rp->domains[dmn].name); 1086 /* exclude non-raw primitives */ 1087 for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) { 1088 if (!rapl_read_data_raw(&rp->domains[dmn], prim, 1089 rpi[prim].unit, &val)) 1090 rp->domains[dmn].rdd.primitives[prim] = val; 1091 } 1092 } 1093 1094 } 1095 1096 static int rapl_package_register_powercap(struct rapl_package *rp) 1097 { 1098 struct rapl_domain *rd; 1099 struct powercap_zone *power_zone = NULL; 1100 int nr_pl, ret; 1101 1102 /* Update the domain data of the new package */ 1103 rapl_update_domain_data(rp); 1104 1105 /* first we register package domain as the parent zone */ 1106 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1107 if (rd->id == RAPL_DOMAIN_PACKAGE) { 1108 nr_pl = find_nr_power_limit(rd); 1109 pr_debug("register package domain %s\n", rp->name); 1110 power_zone = powercap_register_zone(&rd->power_zone, 1111 rp->priv->control_type, rp->name, 1112 NULL, &zone_ops[rd->id], nr_pl, 1113 &constraint_ops); 1114 if (IS_ERR(power_zone)) { 1115 pr_debug("failed to register power zone %s\n", 1116 rp->name); 1117 return PTR_ERR(power_zone); 1118 } 1119 /* track parent zone in per package/socket data */ 1120 rp->power_zone = power_zone; 1121 /* done, only one package domain per socket */ 1122 break; 1123 } 1124 } 1125 if (!power_zone) { 1126 pr_err("no package domain found, unknown topology!\n"); 1127 return -ENODEV; 1128 } 1129 /* now register domains as children of the socket/package */ 1130 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1131 struct powercap_zone *parent = rp->power_zone; 1132 1133 if (rd->id == RAPL_DOMAIN_PACKAGE) 1134 continue; 1135 if (rd->id == RAPL_DOMAIN_PLATFORM) 1136 parent = NULL; 1137 /* number of power limits per domain varies */ 1138 nr_pl = find_nr_power_limit(rd); 1139 power_zone = powercap_register_zone(&rd->power_zone, 1140 rp->priv->control_type, 1141 rd->name, parent, 1142 &zone_ops[rd->id], nr_pl, 1143 &constraint_ops); 1144 1145 if (IS_ERR(power_zone)) { 1146 pr_debug("failed to register power_zone, %s:%s\n", 1147 rp->name, rd->name); 1148 ret = PTR_ERR(power_zone); 1149 goto err_cleanup; 1150 } 1151 } 1152 return 0; 1153 1154 err_cleanup: 1155 /* 1156 * Clean up previously initialized domains within the package if we 1157 * failed after the first domain setup. 1158 */ 1159 while (--rd >= rp->domains) { 1160 pr_debug("unregister %s domain %s\n", rp->name, rd->name); 1161 powercap_unregister_zone(rp->priv->control_type, 1162 &rd->power_zone); 1163 } 1164 1165 return ret; 1166 } 1167 1168 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp) 1169 { 1170 struct reg_action ra; 1171 1172 switch (domain) { 1173 case RAPL_DOMAIN_PACKAGE: 1174 case RAPL_DOMAIN_PP0: 1175 case RAPL_DOMAIN_PP1: 1176 case RAPL_DOMAIN_DRAM: 1177 case RAPL_DOMAIN_PLATFORM: 1178 ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS]; 1179 break; 1180 default: 1181 pr_err("invalid domain id %d\n", domain); 1182 return -EINVAL; 1183 } 1184 /* make sure domain counters are available and contains non-zero 1185 * values, otherwise skip it. 1186 */ 1187 1188 ra.mask = ENERGY_STATUS_MASK; 1189 if (rp->priv->read_raw(cpu, &ra) || !ra.value) 1190 return -ENODEV; 1191 1192 return 0; 1193 } 1194 1195 /* 1196 * Check if power limits are available. Two cases when they are not available: 1197 * 1. Locked by BIOS, in this case we still provide read-only access so that 1198 * users can see what limit is set by the BIOS. 1199 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not 1200 * exist at all. In this case, we do not show the constraints in powercap. 1201 * 1202 * Called after domains are detected and initialized. 1203 */ 1204 static void rapl_detect_powerlimit(struct rapl_domain *rd) 1205 { 1206 u64 val64; 1207 int i; 1208 1209 /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */ 1210 if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) { 1211 if (val64) { 1212 pr_info("RAPL %s domain %s locked by BIOS\n", 1213 rd->rp->name, rd->name); 1214 rd->state |= DOMAIN_STATE_BIOS_LOCKED; 1215 } 1216 } 1217 /* check if power limit MSR exists, otherwise domain is monitoring only */ 1218 for (i = 0; i < NR_POWER_LIMITS; i++) { 1219 int prim = rd->rpl[i].prim_id; 1220 1221 if (rapl_read_data_raw(rd, prim, false, &val64)) 1222 rd->rpl[i].name = NULL; 1223 } 1224 } 1225 1226 /* Detect active and valid domains for the given CPU, caller must 1227 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled. 1228 */ 1229 static int rapl_detect_domains(struct rapl_package *rp, int cpu) 1230 { 1231 struct rapl_domain *rd; 1232 int i; 1233 1234 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 1235 /* use physical package id to read counters */ 1236 if (!rapl_check_domain(cpu, i, rp)) { 1237 rp->domain_map |= 1 << i; 1238 pr_info("Found RAPL domain %s\n", rapl_domain_names[i]); 1239 } 1240 } 1241 rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); 1242 if (!rp->nr_domains) { 1243 pr_debug("no valid rapl domains found in %s\n", rp->name); 1244 return -ENODEV; 1245 } 1246 pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name); 1247 1248 rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), 1249 GFP_KERNEL); 1250 if (!rp->domains) 1251 return -ENOMEM; 1252 1253 rapl_init_domains(rp); 1254 1255 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) 1256 rapl_detect_powerlimit(rd); 1257 1258 return 0; 1259 } 1260 1261 /* called from CPU hotplug notifier, hotplug lock held */ 1262 void rapl_remove_package(struct rapl_package *rp) 1263 { 1264 struct rapl_domain *rd, *rd_package = NULL; 1265 1266 package_power_limit_irq_restore(rp); 1267 1268 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1269 rapl_write_data_raw(rd, PL1_ENABLE, 0); 1270 rapl_write_data_raw(rd, PL1_CLAMP, 0); 1271 if (find_nr_power_limit(rd) > 1) { 1272 rapl_write_data_raw(rd, PL2_ENABLE, 0); 1273 rapl_write_data_raw(rd, PL2_CLAMP, 0); 1274 rapl_write_data_raw(rd, PL4_ENABLE, 0); 1275 } 1276 if (rd->id == RAPL_DOMAIN_PACKAGE) { 1277 rd_package = rd; 1278 continue; 1279 } 1280 pr_debug("remove package, undo power limit on %s: %s\n", 1281 rp->name, rd->name); 1282 powercap_unregister_zone(rp->priv->control_type, 1283 &rd->power_zone); 1284 } 1285 /* do parent zone last */ 1286 powercap_unregister_zone(rp->priv->control_type, 1287 &rd_package->power_zone); 1288 list_del(&rp->plist); 1289 kfree(rp); 1290 } 1291 EXPORT_SYMBOL_GPL(rapl_remove_package); 1292 1293 /* caller to ensure CPU hotplug lock is held */ 1294 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv) 1295 { 1296 int id = topology_logical_die_id(cpu); 1297 struct rapl_package *rp; 1298 1299 list_for_each_entry(rp, &rapl_packages, plist) { 1300 if (rp->id == id 1301 && rp->priv->control_type == priv->control_type) 1302 return rp; 1303 } 1304 1305 return NULL; 1306 } 1307 EXPORT_SYMBOL_GPL(rapl_find_package_domain); 1308 1309 /* called from CPU hotplug notifier, hotplug lock held */ 1310 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv) 1311 { 1312 int id = topology_logical_die_id(cpu); 1313 struct rapl_package *rp; 1314 int ret; 1315 1316 if (!rapl_defaults) 1317 return ERR_PTR(-ENODEV); 1318 1319 rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); 1320 if (!rp) 1321 return ERR_PTR(-ENOMEM); 1322 1323 /* add the new package to the list */ 1324 rp->id = id; 1325 rp->lead_cpu = cpu; 1326 rp->priv = priv; 1327 1328 if (topology_max_die_per_package() > 1) 1329 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, 1330 "package-%d-die-%d", 1331 topology_physical_package_id(cpu), topology_die_id(cpu)); 1332 else 1333 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d", 1334 topology_physical_package_id(cpu)); 1335 1336 /* check if the package contains valid domains */ 1337 if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) { 1338 ret = -ENODEV; 1339 goto err_free_package; 1340 } 1341 ret = rapl_package_register_powercap(rp); 1342 if (!ret) { 1343 INIT_LIST_HEAD(&rp->plist); 1344 list_add(&rp->plist, &rapl_packages); 1345 return rp; 1346 } 1347 1348 err_free_package: 1349 kfree(rp->domains); 1350 kfree(rp); 1351 return ERR_PTR(ret); 1352 } 1353 EXPORT_SYMBOL_GPL(rapl_add_package); 1354 1355 static void power_limit_state_save(void) 1356 { 1357 struct rapl_package *rp; 1358 struct rapl_domain *rd; 1359 int nr_pl, ret, i; 1360 1361 cpus_read_lock(); 1362 list_for_each_entry(rp, &rapl_packages, plist) { 1363 if (!rp->power_zone) 1364 continue; 1365 rd = power_zone_to_rapl_domain(rp->power_zone); 1366 nr_pl = find_nr_power_limit(rd); 1367 for (i = 0; i < nr_pl; i++) { 1368 switch (rd->rpl[i].prim_id) { 1369 case PL1_ENABLE: 1370 ret = rapl_read_data_raw(rd, 1371 POWER_LIMIT1, true, 1372 &rd->rpl[i].last_power_limit); 1373 if (ret) 1374 rd->rpl[i].last_power_limit = 0; 1375 break; 1376 case PL2_ENABLE: 1377 ret = rapl_read_data_raw(rd, 1378 POWER_LIMIT2, true, 1379 &rd->rpl[i].last_power_limit); 1380 if (ret) 1381 rd->rpl[i].last_power_limit = 0; 1382 break; 1383 case PL4_ENABLE: 1384 ret = rapl_read_data_raw(rd, 1385 POWER_LIMIT4, true, 1386 &rd->rpl[i].last_power_limit); 1387 if (ret) 1388 rd->rpl[i].last_power_limit = 0; 1389 break; 1390 } 1391 } 1392 } 1393 cpus_read_unlock(); 1394 } 1395 1396 static void power_limit_state_restore(void) 1397 { 1398 struct rapl_package *rp; 1399 struct rapl_domain *rd; 1400 int nr_pl, i; 1401 1402 cpus_read_lock(); 1403 list_for_each_entry(rp, &rapl_packages, plist) { 1404 if (!rp->power_zone) 1405 continue; 1406 rd = power_zone_to_rapl_domain(rp->power_zone); 1407 nr_pl = find_nr_power_limit(rd); 1408 for (i = 0; i < nr_pl; i++) { 1409 switch (rd->rpl[i].prim_id) { 1410 case PL1_ENABLE: 1411 if (rd->rpl[i].last_power_limit) 1412 rapl_write_data_raw(rd, POWER_LIMIT1, 1413 rd->rpl[i].last_power_limit); 1414 break; 1415 case PL2_ENABLE: 1416 if (rd->rpl[i].last_power_limit) 1417 rapl_write_data_raw(rd, POWER_LIMIT2, 1418 rd->rpl[i].last_power_limit); 1419 break; 1420 case PL4_ENABLE: 1421 if (rd->rpl[i].last_power_limit) 1422 rapl_write_data_raw(rd, POWER_LIMIT4, 1423 rd->rpl[i].last_power_limit); 1424 break; 1425 } 1426 } 1427 } 1428 cpus_read_unlock(); 1429 } 1430 1431 static int rapl_pm_callback(struct notifier_block *nb, 1432 unsigned long mode, void *_unused) 1433 { 1434 switch (mode) { 1435 case PM_SUSPEND_PREPARE: 1436 power_limit_state_save(); 1437 break; 1438 case PM_POST_SUSPEND: 1439 power_limit_state_restore(); 1440 break; 1441 } 1442 return NOTIFY_OK; 1443 } 1444 1445 static struct notifier_block rapl_pm_notifier = { 1446 .notifier_call = rapl_pm_callback, 1447 }; 1448 1449 static struct platform_device *rapl_msr_platdev; 1450 1451 static int __init rapl_init(void) 1452 { 1453 const struct x86_cpu_id *id; 1454 int ret; 1455 1456 id = x86_match_cpu(rapl_ids); 1457 if (!id) { 1458 pr_err("driver does not support CPU family %d model %d\n", 1459 boot_cpu_data.x86, boot_cpu_data.x86_model); 1460 1461 return -ENODEV; 1462 } 1463 1464 rapl_defaults = (struct rapl_defaults *)id->driver_data; 1465 1466 ret = register_pm_notifier(&rapl_pm_notifier); 1467 if (ret) 1468 return ret; 1469 1470 rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); 1471 if (!rapl_msr_platdev) { 1472 ret = -ENOMEM; 1473 goto end; 1474 } 1475 1476 ret = platform_device_add(rapl_msr_platdev); 1477 if (ret) 1478 platform_device_put(rapl_msr_platdev); 1479 1480 end: 1481 if (ret) 1482 unregister_pm_notifier(&rapl_pm_notifier); 1483 1484 return ret; 1485 } 1486 1487 static void __exit rapl_exit(void) 1488 { 1489 platform_device_unregister(rapl_msr_platdev); 1490 unregister_pm_notifier(&rapl_pm_notifier); 1491 } 1492 1493 fs_initcall(rapl_init); 1494 module_exit(rapl_exit); 1495 1496 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code"); 1497 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>"); 1498 MODULE_LICENSE("GPL v2"); 1499