1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Common code for Intel Running Average Power Limit (RAPL) support. 4 * Copyright (c) 2019, Intel Corporation. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/kernel.h> 9 #include <linux/module.h> 10 #include <linux/list.h> 11 #include <linux/types.h> 12 #include <linux/device.h> 13 #include <linux/slab.h> 14 #include <linux/log2.h> 15 #include <linux/bitmap.h> 16 #include <linux/delay.h> 17 #include <linux/sysfs.h> 18 #include <linux/cpu.h> 19 #include <linux/powercap.h> 20 #include <linux/suspend.h> 21 #include <linux/intel_rapl.h> 22 #include <linux/processor.h> 23 #include <linux/platform_device.h> 24 25 #include <asm/iosf_mbi.h> 26 #include <asm/cpu_device_id.h> 27 #include <asm/intel-family.h> 28 29 /* Local defines */ 30 #define MSR_PLATFORM_POWER_LIMIT 0x0000065C 31 32 /* bitmasks for RAPL MSRs, used by primitive access functions */ 33 #define ENERGY_STATUS_MASK 0xffffffff 34 35 #define POWER_LIMIT1_MASK 0x7FFF 36 #define POWER_LIMIT1_ENABLE BIT(15) 37 #define POWER_LIMIT1_CLAMP BIT(16) 38 39 #define POWER_LIMIT2_MASK (0x7FFFULL<<32) 40 #define POWER_LIMIT2_ENABLE BIT_ULL(47) 41 #define POWER_LIMIT2_CLAMP BIT_ULL(48) 42 #define POWER_HIGH_LOCK BIT_ULL(63) 43 #define POWER_LOW_LOCK BIT(31) 44 45 #define TIME_WINDOW1_MASK (0x7FULL<<17) 46 #define TIME_WINDOW2_MASK (0x7FULL<<49) 47 48 #define POWER_UNIT_OFFSET 0 49 #define POWER_UNIT_MASK 0x0F 50 51 #define ENERGY_UNIT_OFFSET 0x08 52 #define ENERGY_UNIT_MASK 0x1F00 53 54 #define TIME_UNIT_OFFSET 0x10 55 #define TIME_UNIT_MASK 0xF0000 56 57 #define POWER_INFO_MAX_MASK (0x7fffULL<<32) 58 #define POWER_INFO_MIN_MASK (0x7fffULL<<16) 59 #define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) 60 #define POWER_INFO_THERMAL_SPEC_MASK 0x7fff 61 62 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff 63 #define PP_POLICY_MASK 0x1F 64 65 /* Non HW constants */ 66 #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ 67 #define RAPL_PRIMITIVE_DUMMY BIT(2) 68 69 #define TIME_WINDOW_MAX_MSEC 40000 70 #define TIME_WINDOW_MIN_MSEC 250 71 #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ 72 enum unit_type { 73 ARBITRARY_UNIT, /* no translation */ 74 POWER_UNIT, 75 ENERGY_UNIT, 76 TIME_UNIT, 77 }; 78 79 /* per domain data, some are optional */ 80 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) 81 82 #define DOMAIN_STATE_INACTIVE BIT(0) 83 #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) 84 #define DOMAIN_STATE_BIOS_LOCKED BIT(2) 85 86 static const char pl1_name[] = "long_term"; 87 static const char pl2_name[] = "short_term"; 88 89 #define power_zone_to_rapl_domain(_zone) \ 90 container_of(_zone, struct rapl_domain, power_zone) 91 92 struct rapl_defaults { 93 u8 floor_freq_reg_addr; 94 int (*check_unit)(struct rapl_package *rp, int cpu); 95 void (*set_floor_freq)(struct rapl_domain *rd, bool mode); 96 u64 (*compute_time_window)(struct rapl_package *rp, u64 val, 97 bool to_raw); 98 unsigned int dram_domain_energy_unit; 99 }; 100 static struct rapl_defaults *rapl_defaults; 101 102 /* Sideband MBI registers */ 103 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2) 104 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf) 105 106 #define PACKAGE_PLN_INT_SAVED BIT(0) 107 #define MAX_PRIM_NAME (32) 108 109 /* per domain data. used to describe individual knobs such that access function 110 * can be consolidated into one instead of many inline functions. 111 */ 112 struct rapl_primitive_info { 113 const char *name; 114 u64 mask; 115 int shift; 116 enum rapl_domain_reg_id id; 117 enum unit_type unit; 118 u32 flag; 119 }; 120 121 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ 122 .name = #p, \ 123 .mask = m, \ 124 .shift = s, \ 125 .id = i, \ 126 .unit = u, \ 127 .flag = f \ 128 } 129 130 static void rapl_init_domains(struct rapl_package *rp); 131 static int rapl_read_data_raw(struct rapl_domain *rd, 132 enum rapl_primitives prim, 133 bool xlate, u64 *data); 134 static int rapl_write_data_raw(struct rapl_domain *rd, 135 enum rapl_primitives prim, 136 unsigned long long value); 137 static u64 rapl_unit_xlate(struct rapl_domain *rd, 138 enum unit_type type, u64 value, int to_raw); 139 static void package_power_limit_irq_save(struct rapl_package *rp); 140 141 static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */ 142 143 static const char *const rapl_domain_names[] = { 144 "package", 145 "core", 146 "uncore", 147 "dram", 148 "psys", 149 }; 150 151 static int get_energy_counter(struct powercap_zone *power_zone, 152 u64 *energy_raw) 153 { 154 struct rapl_domain *rd; 155 u64 energy_now; 156 157 /* prevent CPU hotplug, make sure the RAPL domain does not go 158 * away while reading the counter. 159 */ 160 get_online_cpus(); 161 rd = power_zone_to_rapl_domain(power_zone); 162 163 if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { 164 *energy_raw = energy_now; 165 put_online_cpus(); 166 167 return 0; 168 } 169 put_online_cpus(); 170 171 return -EIO; 172 } 173 174 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy) 175 { 176 struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev); 177 178 *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); 179 return 0; 180 } 181 182 static int release_zone(struct powercap_zone *power_zone) 183 { 184 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 185 struct rapl_package *rp = rd->rp; 186 187 /* package zone is the last zone of a package, we can free 188 * memory here since all children has been unregistered. 189 */ 190 if (rd->id == RAPL_DOMAIN_PACKAGE) { 191 kfree(rd); 192 rp->domains = NULL; 193 } 194 195 return 0; 196 197 } 198 199 static int find_nr_power_limit(struct rapl_domain *rd) 200 { 201 int i, nr_pl = 0; 202 203 for (i = 0; i < NR_POWER_LIMITS; i++) { 204 if (rd->rpl[i].name) 205 nr_pl++; 206 } 207 208 return nr_pl; 209 } 210 211 static int set_domain_enable(struct powercap_zone *power_zone, bool mode) 212 { 213 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 214 215 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) 216 return -EACCES; 217 218 get_online_cpus(); 219 rapl_write_data_raw(rd, PL1_ENABLE, mode); 220 if (rapl_defaults->set_floor_freq) 221 rapl_defaults->set_floor_freq(rd, mode); 222 put_online_cpus(); 223 224 return 0; 225 } 226 227 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode) 228 { 229 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); 230 u64 val; 231 232 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 233 *mode = false; 234 return 0; 235 } 236 get_online_cpus(); 237 if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) { 238 put_online_cpus(); 239 return -EIO; 240 } 241 *mode = val; 242 put_online_cpus(); 243 244 return 0; 245 } 246 247 /* per RAPL domain ops, in the order of rapl_domain_type */ 248 static const struct powercap_zone_ops zone_ops[] = { 249 /* RAPL_DOMAIN_PACKAGE */ 250 { 251 .get_energy_uj = get_energy_counter, 252 .get_max_energy_range_uj = get_max_energy_counter, 253 .release = release_zone, 254 .set_enable = set_domain_enable, 255 .get_enable = get_domain_enable, 256 }, 257 /* RAPL_DOMAIN_PP0 */ 258 { 259 .get_energy_uj = get_energy_counter, 260 .get_max_energy_range_uj = get_max_energy_counter, 261 .release = release_zone, 262 .set_enable = set_domain_enable, 263 .get_enable = get_domain_enable, 264 }, 265 /* RAPL_DOMAIN_PP1 */ 266 { 267 .get_energy_uj = get_energy_counter, 268 .get_max_energy_range_uj = get_max_energy_counter, 269 .release = release_zone, 270 .set_enable = set_domain_enable, 271 .get_enable = get_domain_enable, 272 }, 273 /* RAPL_DOMAIN_DRAM */ 274 { 275 .get_energy_uj = get_energy_counter, 276 .get_max_energy_range_uj = get_max_energy_counter, 277 .release = release_zone, 278 .set_enable = set_domain_enable, 279 .get_enable = get_domain_enable, 280 }, 281 /* RAPL_DOMAIN_PLATFORM */ 282 { 283 .get_energy_uj = get_energy_counter, 284 .get_max_energy_range_uj = get_max_energy_counter, 285 .release = release_zone, 286 .set_enable = set_domain_enable, 287 .get_enable = get_domain_enable, 288 }, 289 }; 290 291 /* 292 * Constraint index used by powercap can be different than power limit (PL) 293 * index in that some PLs maybe missing due to non-existent MSRs. So we 294 * need to convert here by finding the valid PLs only (name populated). 295 */ 296 static int contraint_to_pl(struct rapl_domain *rd, int cid) 297 { 298 int i, j; 299 300 for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) { 301 if ((rd->rpl[i].name) && j++ == cid) { 302 pr_debug("%s: index %d\n", __func__, i); 303 return i; 304 } 305 } 306 pr_err("Cannot find matching power limit for constraint %d\n", cid); 307 308 return -EINVAL; 309 } 310 311 static int set_power_limit(struct powercap_zone *power_zone, int cid, 312 u64 power_limit) 313 { 314 struct rapl_domain *rd; 315 struct rapl_package *rp; 316 int ret = 0; 317 int id; 318 319 get_online_cpus(); 320 rd = power_zone_to_rapl_domain(power_zone); 321 id = contraint_to_pl(rd, cid); 322 if (id < 0) { 323 ret = id; 324 goto set_exit; 325 } 326 327 rp = rd->rp; 328 329 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { 330 dev_warn(&power_zone->dev, 331 "%s locked by BIOS, monitoring only\n", rd->name); 332 ret = -EACCES; 333 goto set_exit; 334 } 335 336 switch (rd->rpl[id].prim_id) { 337 case PL1_ENABLE: 338 rapl_write_data_raw(rd, POWER_LIMIT1, power_limit); 339 break; 340 case PL2_ENABLE: 341 rapl_write_data_raw(rd, POWER_LIMIT2, power_limit); 342 break; 343 default: 344 ret = -EINVAL; 345 } 346 if (!ret) 347 package_power_limit_irq_save(rp); 348 set_exit: 349 put_online_cpus(); 350 return ret; 351 } 352 353 static int get_current_power_limit(struct powercap_zone *power_zone, int cid, 354 u64 *data) 355 { 356 struct rapl_domain *rd; 357 u64 val; 358 int prim; 359 int ret = 0; 360 int id; 361 362 get_online_cpus(); 363 rd = power_zone_to_rapl_domain(power_zone); 364 id = contraint_to_pl(rd, cid); 365 if (id < 0) { 366 ret = id; 367 goto get_exit; 368 } 369 370 switch (rd->rpl[id].prim_id) { 371 case PL1_ENABLE: 372 prim = POWER_LIMIT1; 373 break; 374 case PL2_ENABLE: 375 prim = POWER_LIMIT2; 376 break; 377 default: 378 put_online_cpus(); 379 return -EINVAL; 380 } 381 if (rapl_read_data_raw(rd, prim, true, &val)) 382 ret = -EIO; 383 else 384 *data = val; 385 386 get_exit: 387 put_online_cpus(); 388 389 return ret; 390 } 391 392 static int set_time_window(struct powercap_zone *power_zone, int cid, 393 u64 window) 394 { 395 struct rapl_domain *rd; 396 int ret = 0; 397 int id; 398 399 get_online_cpus(); 400 rd = power_zone_to_rapl_domain(power_zone); 401 id = contraint_to_pl(rd, cid); 402 if (id < 0) { 403 ret = id; 404 goto set_time_exit; 405 } 406 407 switch (rd->rpl[id].prim_id) { 408 case PL1_ENABLE: 409 rapl_write_data_raw(rd, TIME_WINDOW1, window); 410 break; 411 case PL2_ENABLE: 412 rapl_write_data_raw(rd, TIME_WINDOW2, window); 413 break; 414 default: 415 ret = -EINVAL; 416 } 417 418 set_time_exit: 419 put_online_cpus(); 420 return ret; 421 } 422 423 static int get_time_window(struct powercap_zone *power_zone, int cid, 424 u64 *data) 425 { 426 struct rapl_domain *rd; 427 u64 val; 428 int ret = 0; 429 int id; 430 431 get_online_cpus(); 432 rd = power_zone_to_rapl_domain(power_zone); 433 id = contraint_to_pl(rd, cid); 434 if (id < 0) { 435 ret = id; 436 goto get_time_exit; 437 } 438 439 switch (rd->rpl[id].prim_id) { 440 case PL1_ENABLE: 441 ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val); 442 break; 443 case PL2_ENABLE: 444 ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val); 445 break; 446 default: 447 put_online_cpus(); 448 return -EINVAL; 449 } 450 if (!ret) 451 *data = val; 452 453 get_time_exit: 454 put_online_cpus(); 455 456 return ret; 457 } 458 459 static const char *get_constraint_name(struct powercap_zone *power_zone, 460 int cid) 461 { 462 struct rapl_domain *rd; 463 int id; 464 465 rd = power_zone_to_rapl_domain(power_zone); 466 id = contraint_to_pl(rd, cid); 467 if (id >= 0) 468 return rd->rpl[id].name; 469 470 return NULL; 471 } 472 473 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data) 474 { 475 struct rapl_domain *rd; 476 u64 val; 477 int prim; 478 int ret = 0; 479 480 get_online_cpus(); 481 rd = power_zone_to_rapl_domain(power_zone); 482 switch (rd->rpl[id].prim_id) { 483 case PL1_ENABLE: 484 prim = THERMAL_SPEC_POWER; 485 break; 486 case PL2_ENABLE: 487 prim = MAX_POWER; 488 break; 489 default: 490 put_online_cpus(); 491 return -EINVAL; 492 } 493 if (rapl_read_data_raw(rd, prim, true, &val)) 494 ret = -EIO; 495 else 496 *data = val; 497 498 put_online_cpus(); 499 500 return ret; 501 } 502 503 static const struct powercap_zone_constraint_ops constraint_ops = { 504 .set_power_limit_uw = set_power_limit, 505 .get_power_limit_uw = get_current_power_limit, 506 .set_time_window_us = set_time_window, 507 .get_time_window_us = get_time_window, 508 .get_max_power_uw = get_max_power, 509 .get_name = get_constraint_name, 510 }; 511 512 /* called after domain detection and package level data are set */ 513 static void rapl_init_domains(struct rapl_package *rp) 514 { 515 enum rapl_domain_type i; 516 enum rapl_domain_reg_id j; 517 struct rapl_domain *rd = rp->domains; 518 519 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 520 unsigned int mask = rp->domain_map & (1 << i); 521 522 if (!mask) 523 continue; 524 525 rd->rp = rp; 526 rd->name = rapl_domain_names[i]; 527 rd->id = i; 528 rd->rpl[0].prim_id = PL1_ENABLE; 529 rd->rpl[0].name = pl1_name; 530 /* some domain may support two power limits */ 531 if (rp->priv->limits[i] == 2) { 532 rd->rpl[1].prim_id = PL2_ENABLE; 533 rd->rpl[1].name = pl2_name; 534 } 535 536 for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++) 537 rd->regs[j] = rp->priv->regs[i][j]; 538 539 if (i == RAPL_DOMAIN_DRAM) { 540 rd->domain_energy_unit = 541 rapl_defaults->dram_domain_energy_unit; 542 if (rd->domain_energy_unit) 543 pr_info("DRAM domain energy unit %dpj\n", 544 rd->domain_energy_unit); 545 } 546 rd++; 547 } 548 } 549 550 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, 551 u64 value, int to_raw) 552 { 553 u64 units = 1; 554 struct rapl_package *rp = rd->rp; 555 u64 scale = 1; 556 557 switch (type) { 558 case POWER_UNIT: 559 units = rp->power_unit; 560 break; 561 case ENERGY_UNIT: 562 scale = ENERGY_UNIT_SCALE; 563 /* per domain unit takes precedence */ 564 if (rd->domain_energy_unit) 565 units = rd->domain_energy_unit; 566 else 567 units = rp->energy_unit; 568 break; 569 case TIME_UNIT: 570 return rapl_defaults->compute_time_window(rp, value, to_raw); 571 case ARBITRARY_UNIT: 572 default: 573 return value; 574 }; 575 576 if (to_raw) 577 return div64_u64(value, units) * scale; 578 579 value *= units; 580 581 return div64_u64(value, scale); 582 } 583 584 /* in the order of enum rapl_primitives */ 585 static struct rapl_primitive_info rpi[] = { 586 /* name, mask, shift, msr index, unit divisor */ 587 PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, 588 RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), 589 PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, 590 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 591 PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, 592 RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), 593 PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, 594 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 595 PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, 596 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 597 PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, 598 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 599 PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, 600 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 601 PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, 602 RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), 603 PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, 604 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 605 PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, 606 RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), 607 PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK, 608 0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 609 PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, 610 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 611 PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, 612 RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), 613 PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48, 614 RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), 615 PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, 616 RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), 617 PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, 618 RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), 619 /* non-hardware */ 620 PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, 621 RAPL_PRIMITIVE_DERIVED), 622 {NULL, 0, 0, 0}, 623 }; 624 625 /* Read primitive data based on its related struct rapl_primitive_info. 626 * if xlate flag is set, return translated data based on data units, i.e. 627 * time, energy, and power. 628 * RAPL MSRs are non-architectual and are laid out not consistently across 629 * domains. Here we use primitive info to allow writing consolidated access 630 * functions. 631 * For a given primitive, it is processed by MSR mask and shift. Unit conversion 632 * is pre-assigned based on RAPL unit MSRs read at init time. 633 * 63-------------------------- 31--------------------------- 0 634 * | xxxxx (mask) | 635 * | |<- shift ----------------| 636 * 63-------------------------- 31--------------------------- 0 637 */ 638 static int rapl_read_data_raw(struct rapl_domain *rd, 639 enum rapl_primitives prim, bool xlate, u64 *data) 640 { 641 u64 value; 642 struct rapl_primitive_info *rp = &rpi[prim]; 643 struct reg_action ra; 644 int cpu; 645 646 if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY) 647 return -EINVAL; 648 649 ra.reg = rd->regs[rp->id]; 650 if (!ra.reg) 651 return -EINVAL; 652 653 cpu = rd->rp->lead_cpu; 654 655 /* domain with 2 limits has different bit */ 656 if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) { 657 rp->mask = POWER_HIGH_LOCK; 658 rp->shift = 63; 659 } 660 /* non-hardware data are collected by the polling thread */ 661 if (rp->flag & RAPL_PRIMITIVE_DERIVED) { 662 *data = rd->rdd.primitives[prim]; 663 return 0; 664 } 665 666 ra.mask = rp->mask; 667 668 if (rd->rp->priv->read_raw(cpu, &ra)) { 669 pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu); 670 return -EIO; 671 } 672 673 value = ra.value >> rp->shift; 674 675 if (xlate) 676 *data = rapl_unit_xlate(rd, rp->unit, value, 0); 677 else 678 *data = value; 679 680 return 0; 681 } 682 683 /* Similar use of primitive info in the read counterpart */ 684 static int rapl_write_data_raw(struct rapl_domain *rd, 685 enum rapl_primitives prim, 686 unsigned long long value) 687 { 688 struct rapl_primitive_info *rp = &rpi[prim]; 689 int cpu; 690 u64 bits; 691 struct reg_action ra; 692 int ret; 693 694 cpu = rd->rp->lead_cpu; 695 bits = rapl_unit_xlate(rd, rp->unit, value, 1); 696 bits <<= rp->shift; 697 bits &= rp->mask; 698 699 memset(&ra, 0, sizeof(ra)); 700 701 ra.reg = rd->regs[rp->id]; 702 ra.mask = rp->mask; 703 ra.value = bits; 704 705 ret = rd->rp->priv->write_raw(cpu, &ra); 706 707 return ret; 708 } 709 710 /* 711 * Raw RAPL data stored in MSRs are in certain scales. We need to 712 * convert them into standard units based on the units reported in 713 * the RAPL unit MSRs. This is specific to CPUs as the method to 714 * calculate units differ on different CPUs. 715 * We convert the units to below format based on CPUs. 716 * i.e. 717 * energy unit: picoJoules : Represented in picoJoules by default 718 * power unit : microWatts : Represented in milliWatts by default 719 * time unit : microseconds: Represented in seconds by default 720 */ 721 static int rapl_check_unit_core(struct rapl_package *rp, int cpu) 722 { 723 struct reg_action ra; 724 u32 value; 725 726 ra.reg = rp->priv->reg_unit; 727 ra.mask = ~0; 728 if (rp->priv->read_raw(cpu, &ra)) { 729 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 730 rp->priv->reg_unit, cpu); 731 return -ENODEV; 732 } 733 734 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 735 rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value); 736 737 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 738 rp->power_unit = 1000000 / (1 << value); 739 740 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 741 rp->time_unit = 1000000 / (1 << value); 742 743 pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n", 744 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 745 746 return 0; 747 } 748 749 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) 750 { 751 struct reg_action ra; 752 u32 value; 753 754 ra.reg = rp->priv->reg_unit; 755 ra.mask = ~0; 756 if (rp->priv->read_raw(cpu, &ra)) { 757 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n", 758 rp->priv->reg_unit, cpu); 759 return -ENODEV; 760 } 761 762 value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; 763 rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value; 764 765 value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; 766 rp->power_unit = (1 << value) * 1000; 767 768 value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; 769 rp->time_unit = 1000000 / (1 << value); 770 771 pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n", 772 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit); 773 774 return 0; 775 } 776 777 static void power_limit_irq_save_cpu(void *info) 778 { 779 u32 l, h = 0; 780 struct rapl_package *rp = (struct rapl_package *)info; 781 782 /* save the state of PLN irq mask bit before disabling it */ 783 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 784 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) { 785 rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE; 786 rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED; 787 } 788 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 789 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 790 } 791 792 /* REVISIT: 793 * When package power limit is set artificially low by RAPL, LVT 794 * thermal interrupt for package power limit should be ignored 795 * since we are not really exceeding the real limit. The intention 796 * is to avoid excessive interrupts while we are trying to save power. 797 * A useful feature might be routing the package_power_limit interrupt 798 * to userspace via eventfd. once we have a usecase, this is simple 799 * to do by adding an atomic notifier. 800 */ 801 802 static void package_power_limit_irq_save(struct rapl_package *rp) 803 { 804 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 805 return; 806 807 smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); 808 } 809 810 /* 811 * Restore per package power limit interrupt enable state. Called from cpu 812 * hotplug code on package removal. 813 */ 814 static void package_power_limit_irq_restore(struct rapl_package *rp) 815 { 816 u32 l, h; 817 818 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) 819 return; 820 821 /* irq enable state not saved, nothing to restore */ 822 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) 823 return; 824 825 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 826 827 if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE) 828 l |= PACKAGE_THERM_INT_PLN_ENABLE; 829 else 830 l &= ~PACKAGE_THERM_INT_PLN_ENABLE; 831 832 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 833 } 834 835 static void set_floor_freq_default(struct rapl_domain *rd, bool mode) 836 { 837 int nr_powerlimit = find_nr_power_limit(rd); 838 839 /* always enable clamp such that p-state can go below OS requested 840 * range. power capping priority over guranteed frequency. 841 */ 842 rapl_write_data_raw(rd, PL1_CLAMP, mode); 843 844 /* some domains have pl2 */ 845 if (nr_powerlimit > 1) { 846 rapl_write_data_raw(rd, PL2_ENABLE, mode); 847 rapl_write_data_raw(rd, PL2_CLAMP, mode); 848 } 849 } 850 851 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) 852 { 853 static u32 power_ctrl_orig_val; 854 u32 mdata; 855 856 if (!rapl_defaults->floor_freq_reg_addr) { 857 pr_err("Invalid floor frequency config register\n"); 858 return; 859 } 860 861 if (!power_ctrl_orig_val) 862 iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ, 863 rapl_defaults->floor_freq_reg_addr, 864 &power_ctrl_orig_val); 865 mdata = power_ctrl_orig_val; 866 if (enable) { 867 mdata &= ~(0x7f << 8); 868 mdata |= 1 << 8; 869 } 870 iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE, 871 rapl_defaults->floor_freq_reg_addr, mdata); 872 } 873 874 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value, 875 bool to_raw) 876 { 877 u64 f, y; /* fraction and exp. used for time unit */ 878 879 /* 880 * Special processing based on 2^Y*(1+F/4), refer 881 * to Intel Software Developer's manual Vol.3B: CH 14.9.3. 882 */ 883 if (!to_raw) { 884 f = (value & 0x60) >> 5; 885 y = value & 0x1f; 886 value = (1 << y) * (4 + f) * rp->time_unit / 4; 887 } else { 888 do_div(value, rp->time_unit); 889 y = ilog2(value); 890 f = div64_u64(4 * (value - (1 << y)), 1 << y); 891 value = (y & 0x1f) | ((f & 0x3) << 5); 892 } 893 return value; 894 } 895 896 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value, 897 bool to_raw) 898 { 899 /* 900 * Atom time unit encoding is straight forward val * time_unit, 901 * where time_unit is default to 1 sec. Never 0. 902 */ 903 if (!to_raw) 904 return (value) ? value *= rp->time_unit : rp->time_unit; 905 906 value = div64_u64(value, rp->time_unit); 907 908 return value; 909 } 910 911 static const struct rapl_defaults rapl_defaults_core = { 912 .floor_freq_reg_addr = 0, 913 .check_unit = rapl_check_unit_core, 914 .set_floor_freq = set_floor_freq_default, 915 .compute_time_window = rapl_compute_time_window_core, 916 }; 917 918 static const struct rapl_defaults rapl_defaults_hsw_server = { 919 .check_unit = rapl_check_unit_core, 920 .set_floor_freq = set_floor_freq_default, 921 .compute_time_window = rapl_compute_time_window_core, 922 .dram_domain_energy_unit = 15300, 923 }; 924 925 static const struct rapl_defaults rapl_defaults_byt = { 926 .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, 927 .check_unit = rapl_check_unit_atom, 928 .set_floor_freq = set_floor_freq_atom, 929 .compute_time_window = rapl_compute_time_window_atom, 930 }; 931 932 static const struct rapl_defaults rapl_defaults_tng = { 933 .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG, 934 .check_unit = rapl_check_unit_atom, 935 .set_floor_freq = set_floor_freq_atom, 936 .compute_time_window = rapl_compute_time_window_atom, 937 }; 938 939 static const struct rapl_defaults rapl_defaults_ann = { 940 .floor_freq_reg_addr = 0, 941 .check_unit = rapl_check_unit_atom, 942 .set_floor_freq = NULL, 943 .compute_time_window = rapl_compute_time_window_atom, 944 }; 945 946 static const struct rapl_defaults rapl_defaults_cht = { 947 .floor_freq_reg_addr = 0, 948 .check_unit = rapl_check_unit_atom, 949 .set_floor_freq = NULL, 950 .compute_time_window = rapl_compute_time_window_atom, 951 }; 952 953 static const struct x86_cpu_id rapl_ids[] __initconst = { 954 INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core), 955 INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core), 956 957 INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core), 958 INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core), 959 960 INTEL_CPU_FAM6(HASWELL, rapl_defaults_core), 961 INTEL_CPU_FAM6(HASWELL_L, rapl_defaults_core), 962 INTEL_CPU_FAM6(HASWELL_G, rapl_defaults_core), 963 INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server), 964 965 INTEL_CPU_FAM6(BROADWELL, rapl_defaults_core), 966 INTEL_CPU_FAM6(BROADWELL_G, rapl_defaults_core), 967 INTEL_CPU_FAM6(BROADWELL_D, rapl_defaults_core), 968 INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server), 969 970 INTEL_CPU_FAM6(SKYLAKE, rapl_defaults_core), 971 INTEL_CPU_FAM6(SKYLAKE_L, rapl_defaults_core), 972 INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server), 973 INTEL_CPU_FAM6(KABYLAKE_L, rapl_defaults_core), 974 INTEL_CPU_FAM6(KABYLAKE, rapl_defaults_core), 975 INTEL_CPU_FAM6(CANNONLAKE_L, rapl_defaults_core), 976 INTEL_CPU_FAM6(ICELAKE_L, rapl_defaults_core), 977 INTEL_CPU_FAM6(ICELAKE, rapl_defaults_core), 978 INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core), 979 INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server), 980 INTEL_CPU_FAM6(ICELAKE_D, rapl_defaults_hsw_server), 981 INTEL_CPU_FAM6(COMETLAKE_L, rapl_defaults_core), 982 INTEL_CPU_FAM6(COMETLAKE, rapl_defaults_core), 983 INTEL_CPU_FAM6(TIGERLAKE_L, rapl_defaults_core), 984 985 INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt), 986 INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht), 987 INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng), 988 INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann), 989 INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core), 990 INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core), 991 INTEL_CPU_FAM6(ATOM_GOLDMONT_D, rapl_defaults_core), 992 INTEL_CPU_FAM6(ATOM_TREMONT_D, rapl_defaults_core), 993 INTEL_CPU_FAM6(ATOM_TREMONT_L, rapl_defaults_core), 994 995 INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server), 996 INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server), 997 {} 998 }; 999 1000 MODULE_DEVICE_TABLE(x86cpu, rapl_ids); 1001 1002 /* Read once for all raw primitive data for domains */ 1003 static void rapl_update_domain_data(struct rapl_package *rp) 1004 { 1005 int dmn, prim; 1006 u64 val; 1007 1008 for (dmn = 0; dmn < rp->nr_domains; dmn++) { 1009 pr_debug("update %s domain %s data\n", rp->name, 1010 rp->domains[dmn].name); 1011 /* exclude non-raw primitives */ 1012 for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) { 1013 if (!rapl_read_data_raw(&rp->domains[dmn], prim, 1014 rpi[prim].unit, &val)) 1015 rp->domains[dmn].rdd.primitives[prim] = val; 1016 } 1017 } 1018 1019 } 1020 1021 static int rapl_package_register_powercap(struct rapl_package *rp) 1022 { 1023 struct rapl_domain *rd; 1024 struct powercap_zone *power_zone = NULL; 1025 int nr_pl, ret; 1026 1027 /* Update the domain data of the new package */ 1028 rapl_update_domain_data(rp); 1029 1030 /* first we register package domain as the parent zone */ 1031 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1032 if (rd->id == RAPL_DOMAIN_PACKAGE) { 1033 nr_pl = find_nr_power_limit(rd); 1034 pr_debug("register package domain %s\n", rp->name); 1035 power_zone = powercap_register_zone(&rd->power_zone, 1036 rp->priv->control_type, rp->name, 1037 NULL, &zone_ops[rd->id], nr_pl, 1038 &constraint_ops); 1039 if (IS_ERR(power_zone)) { 1040 pr_debug("failed to register power zone %s\n", 1041 rp->name); 1042 return PTR_ERR(power_zone); 1043 } 1044 /* track parent zone in per package/socket data */ 1045 rp->power_zone = power_zone; 1046 /* done, only one package domain per socket */ 1047 break; 1048 } 1049 } 1050 if (!power_zone) { 1051 pr_err("no package domain found, unknown topology!\n"); 1052 return -ENODEV; 1053 } 1054 /* now register domains as children of the socket/package */ 1055 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1056 if (rd->id == RAPL_DOMAIN_PACKAGE) 1057 continue; 1058 /* number of power limits per domain varies */ 1059 nr_pl = find_nr_power_limit(rd); 1060 power_zone = powercap_register_zone(&rd->power_zone, 1061 rp->priv->control_type, 1062 rd->name, rp->power_zone, 1063 &zone_ops[rd->id], nr_pl, 1064 &constraint_ops); 1065 1066 if (IS_ERR(power_zone)) { 1067 pr_debug("failed to register power_zone, %s:%s\n", 1068 rp->name, rd->name); 1069 ret = PTR_ERR(power_zone); 1070 goto err_cleanup; 1071 } 1072 } 1073 return 0; 1074 1075 err_cleanup: 1076 /* 1077 * Clean up previously initialized domains within the package if we 1078 * failed after the first domain setup. 1079 */ 1080 while (--rd >= rp->domains) { 1081 pr_debug("unregister %s domain %s\n", rp->name, rd->name); 1082 powercap_unregister_zone(rp->priv->control_type, 1083 &rd->power_zone); 1084 } 1085 1086 return ret; 1087 } 1088 1089 int rapl_add_platform_domain(struct rapl_if_priv *priv) 1090 { 1091 struct rapl_domain *rd; 1092 struct powercap_zone *power_zone; 1093 struct reg_action ra; 1094 int ret; 1095 1096 ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS]; 1097 ra.mask = ~0; 1098 ret = priv->read_raw(0, &ra); 1099 if (ret || !ra.value) 1100 return -ENODEV; 1101 1102 ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT]; 1103 ra.mask = ~0; 1104 ret = priv->read_raw(0, &ra); 1105 if (ret || !ra.value) 1106 return -ENODEV; 1107 1108 rd = kzalloc(sizeof(*rd), GFP_KERNEL); 1109 if (!rd) 1110 return -ENOMEM; 1111 1112 rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM]; 1113 rd->id = RAPL_DOMAIN_PLATFORM; 1114 rd->regs[RAPL_DOMAIN_REG_LIMIT] = 1115 priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT]; 1116 rd->regs[RAPL_DOMAIN_REG_STATUS] = 1117 priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS]; 1118 rd->rpl[0].prim_id = PL1_ENABLE; 1119 rd->rpl[0].name = pl1_name; 1120 rd->rpl[1].prim_id = PL2_ENABLE; 1121 rd->rpl[1].name = pl2_name; 1122 rd->rp = rapl_find_package_domain(0, priv); 1123 1124 power_zone = powercap_register_zone(&rd->power_zone, priv->control_type, 1125 "psys", NULL, 1126 &zone_ops[RAPL_DOMAIN_PLATFORM], 1127 2, &constraint_ops); 1128 1129 if (IS_ERR(power_zone)) { 1130 kfree(rd); 1131 return PTR_ERR(power_zone); 1132 } 1133 1134 priv->platform_rapl_domain = rd; 1135 1136 return 0; 1137 } 1138 EXPORT_SYMBOL_GPL(rapl_add_platform_domain); 1139 1140 void rapl_remove_platform_domain(struct rapl_if_priv *priv) 1141 { 1142 if (priv->platform_rapl_domain) { 1143 powercap_unregister_zone(priv->control_type, 1144 &priv->platform_rapl_domain->power_zone); 1145 kfree(priv->platform_rapl_domain); 1146 } 1147 } 1148 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain); 1149 1150 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp) 1151 { 1152 struct reg_action ra; 1153 1154 switch (domain) { 1155 case RAPL_DOMAIN_PACKAGE: 1156 case RAPL_DOMAIN_PP0: 1157 case RAPL_DOMAIN_PP1: 1158 case RAPL_DOMAIN_DRAM: 1159 ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS]; 1160 break; 1161 case RAPL_DOMAIN_PLATFORM: 1162 /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */ 1163 return -EINVAL; 1164 default: 1165 pr_err("invalid domain id %d\n", domain); 1166 return -EINVAL; 1167 } 1168 /* make sure domain counters are available and contains non-zero 1169 * values, otherwise skip it. 1170 */ 1171 1172 ra.mask = ~0; 1173 if (rp->priv->read_raw(cpu, &ra) || !ra.value) 1174 return -ENODEV; 1175 1176 return 0; 1177 } 1178 1179 /* 1180 * Check if power limits are available. Two cases when they are not available: 1181 * 1. Locked by BIOS, in this case we still provide read-only access so that 1182 * users can see what limit is set by the BIOS. 1183 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not 1184 * exist at all. In this case, we do not show the constraints in powercap. 1185 * 1186 * Called after domains are detected and initialized. 1187 */ 1188 static void rapl_detect_powerlimit(struct rapl_domain *rd) 1189 { 1190 u64 val64; 1191 int i; 1192 1193 /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */ 1194 if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) { 1195 if (val64) { 1196 pr_info("RAPL %s domain %s locked by BIOS\n", 1197 rd->rp->name, rd->name); 1198 rd->state |= DOMAIN_STATE_BIOS_LOCKED; 1199 } 1200 } 1201 /* check if power limit MSR exists, otherwise domain is monitoring only */ 1202 for (i = 0; i < NR_POWER_LIMITS; i++) { 1203 int prim = rd->rpl[i].prim_id; 1204 1205 if (rapl_read_data_raw(rd, prim, false, &val64)) 1206 rd->rpl[i].name = NULL; 1207 } 1208 } 1209 1210 /* Detect active and valid domains for the given CPU, caller must 1211 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled. 1212 */ 1213 static int rapl_detect_domains(struct rapl_package *rp, int cpu) 1214 { 1215 struct rapl_domain *rd; 1216 int i; 1217 1218 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 1219 /* use physical package id to read counters */ 1220 if (!rapl_check_domain(cpu, i, rp)) { 1221 rp->domain_map |= 1 << i; 1222 pr_info("Found RAPL domain %s\n", rapl_domain_names[i]); 1223 } 1224 } 1225 rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); 1226 if (!rp->nr_domains) { 1227 pr_debug("no valid rapl domains found in %s\n", rp->name); 1228 return -ENODEV; 1229 } 1230 pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name); 1231 1232 rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), 1233 GFP_KERNEL); 1234 if (!rp->domains) 1235 return -ENOMEM; 1236 1237 rapl_init_domains(rp); 1238 1239 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) 1240 rapl_detect_powerlimit(rd); 1241 1242 return 0; 1243 } 1244 1245 /* called from CPU hotplug notifier, hotplug lock held */ 1246 void rapl_remove_package(struct rapl_package *rp) 1247 { 1248 struct rapl_domain *rd, *rd_package = NULL; 1249 1250 package_power_limit_irq_restore(rp); 1251 1252 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1253 rapl_write_data_raw(rd, PL1_ENABLE, 0); 1254 rapl_write_data_raw(rd, PL1_CLAMP, 0); 1255 if (find_nr_power_limit(rd) > 1) { 1256 rapl_write_data_raw(rd, PL2_ENABLE, 0); 1257 rapl_write_data_raw(rd, PL2_CLAMP, 0); 1258 } 1259 if (rd->id == RAPL_DOMAIN_PACKAGE) { 1260 rd_package = rd; 1261 continue; 1262 } 1263 pr_debug("remove package, undo power limit on %s: %s\n", 1264 rp->name, rd->name); 1265 powercap_unregister_zone(rp->priv->control_type, 1266 &rd->power_zone); 1267 } 1268 /* do parent zone last */ 1269 powercap_unregister_zone(rp->priv->control_type, 1270 &rd_package->power_zone); 1271 list_del(&rp->plist); 1272 kfree(rp); 1273 } 1274 EXPORT_SYMBOL_GPL(rapl_remove_package); 1275 1276 /* caller to ensure CPU hotplug lock is held */ 1277 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv) 1278 { 1279 int id = topology_logical_die_id(cpu); 1280 struct rapl_package *rp; 1281 1282 list_for_each_entry(rp, &rapl_packages, plist) { 1283 if (rp->id == id 1284 && rp->priv->control_type == priv->control_type) 1285 return rp; 1286 } 1287 1288 return NULL; 1289 } 1290 EXPORT_SYMBOL_GPL(rapl_find_package_domain); 1291 1292 /* called from CPU hotplug notifier, hotplug lock held */ 1293 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv) 1294 { 1295 int id = topology_logical_die_id(cpu); 1296 struct rapl_package *rp; 1297 struct cpuinfo_x86 *c = &cpu_data(cpu); 1298 int ret; 1299 1300 if (!rapl_defaults) 1301 return ERR_PTR(-ENODEV); 1302 1303 rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); 1304 if (!rp) 1305 return ERR_PTR(-ENOMEM); 1306 1307 /* add the new package to the list */ 1308 rp->id = id; 1309 rp->lead_cpu = cpu; 1310 rp->priv = priv; 1311 1312 if (topology_max_die_per_package() > 1) 1313 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, 1314 "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id); 1315 else 1316 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d", 1317 c->phys_proc_id); 1318 1319 /* check if the package contains valid domains */ 1320 if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) { 1321 ret = -ENODEV; 1322 goto err_free_package; 1323 } 1324 ret = rapl_package_register_powercap(rp); 1325 if (!ret) { 1326 INIT_LIST_HEAD(&rp->plist); 1327 list_add(&rp->plist, &rapl_packages); 1328 return rp; 1329 } 1330 1331 err_free_package: 1332 kfree(rp->domains); 1333 kfree(rp); 1334 return ERR_PTR(ret); 1335 } 1336 EXPORT_SYMBOL_GPL(rapl_add_package); 1337 1338 static void power_limit_state_save(void) 1339 { 1340 struct rapl_package *rp; 1341 struct rapl_domain *rd; 1342 int nr_pl, ret, i; 1343 1344 get_online_cpus(); 1345 list_for_each_entry(rp, &rapl_packages, plist) { 1346 if (!rp->power_zone) 1347 continue; 1348 rd = power_zone_to_rapl_domain(rp->power_zone); 1349 nr_pl = find_nr_power_limit(rd); 1350 for (i = 0; i < nr_pl; i++) { 1351 switch (rd->rpl[i].prim_id) { 1352 case PL1_ENABLE: 1353 ret = rapl_read_data_raw(rd, 1354 POWER_LIMIT1, true, 1355 &rd->rpl[i].last_power_limit); 1356 if (ret) 1357 rd->rpl[i].last_power_limit = 0; 1358 break; 1359 case PL2_ENABLE: 1360 ret = rapl_read_data_raw(rd, 1361 POWER_LIMIT2, true, 1362 &rd->rpl[i].last_power_limit); 1363 if (ret) 1364 rd->rpl[i].last_power_limit = 0; 1365 break; 1366 } 1367 } 1368 } 1369 put_online_cpus(); 1370 } 1371 1372 static void power_limit_state_restore(void) 1373 { 1374 struct rapl_package *rp; 1375 struct rapl_domain *rd; 1376 int nr_pl, i; 1377 1378 get_online_cpus(); 1379 list_for_each_entry(rp, &rapl_packages, plist) { 1380 if (!rp->power_zone) 1381 continue; 1382 rd = power_zone_to_rapl_domain(rp->power_zone); 1383 nr_pl = find_nr_power_limit(rd); 1384 for (i = 0; i < nr_pl; i++) { 1385 switch (rd->rpl[i].prim_id) { 1386 case PL1_ENABLE: 1387 if (rd->rpl[i].last_power_limit) 1388 rapl_write_data_raw(rd, POWER_LIMIT1, 1389 rd->rpl[i].last_power_limit); 1390 break; 1391 case PL2_ENABLE: 1392 if (rd->rpl[i].last_power_limit) 1393 rapl_write_data_raw(rd, POWER_LIMIT2, 1394 rd->rpl[i].last_power_limit); 1395 break; 1396 } 1397 } 1398 } 1399 put_online_cpus(); 1400 } 1401 1402 static int rapl_pm_callback(struct notifier_block *nb, 1403 unsigned long mode, void *_unused) 1404 { 1405 switch (mode) { 1406 case PM_SUSPEND_PREPARE: 1407 power_limit_state_save(); 1408 break; 1409 case PM_POST_SUSPEND: 1410 power_limit_state_restore(); 1411 break; 1412 } 1413 return NOTIFY_OK; 1414 } 1415 1416 static struct notifier_block rapl_pm_notifier = { 1417 .notifier_call = rapl_pm_callback, 1418 }; 1419 1420 static struct platform_device *rapl_msr_platdev; 1421 1422 static int __init rapl_init(void) 1423 { 1424 const struct x86_cpu_id *id; 1425 int ret; 1426 1427 id = x86_match_cpu(rapl_ids); 1428 if (!id) { 1429 pr_err("driver does not support CPU family %d model %d\n", 1430 boot_cpu_data.x86, boot_cpu_data.x86_model); 1431 1432 return -ENODEV; 1433 } 1434 1435 rapl_defaults = (struct rapl_defaults *)id->driver_data; 1436 1437 ret = register_pm_notifier(&rapl_pm_notifier); 1438 if (ret) 1439 return ret; 1440 1441 rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); 1442 if (!rapl_msr_platdev) { 1443 ret = -ENOMEM; 1444 goto end; 1445 } 1446 1447 ret = platform_device_add(rapl_msr_platdev); 1448 if (ret) 1449 platform_device_put(rapl_msr_platdev); 1450 1451 end: 1452 if (ret) 1453 unregister_pm_notifier(&rapl_pm_notifier); 1454 1455 return ret; 1456 } 1457 1458 static void __exit rapl_exit(void) 1459 { 1460 platform_device_unregister(rapl_msr_platdev); 1461 unregister_pm_notifier(&rapl_pm_notifier); 1462 } 1463 1464 fs_initcall(rapl_init); 1465 module_exit(rapl_exit); 1466 1467 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code"); 1468 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>"); 1469 MODULE_LICENSE("GPL v2"); 1470