// SPDX-License-Identifier: GPL-2.0-only
/*
 * Zhaoxin PMU; like Intel Architectural PerfMon-v2
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/nmi.h>

#include <asm/cpufeature.h>
#include <asm/hardirq.h>
#include <asm/apic.h>

#include "../perf_event.h"

/*
 * Zhaoxin PerfMon, used on zxc and later.
 */
static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {

        [PERF_COUNT_HW_CPU_CYCLES] = 0x0082,
        [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
        [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0515,
        [PERF_COUNT_HW_CACHE_MISSES] = 0x051a,
        [PERF_COUNT_HW_BUS_CYCLES] = 0x0083,
};

static struct event_constraint zxc_event_constraints[] __read_mostly = {

        FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
        EVENT_CONSTRAINT_END
};

static struct event_constraint zxd_event_constraints[] __read_mostly = {

        FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
        FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
        FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
        EVENT_CONSTRAINT_END
};

static __initconst const u64 zxd_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
[C(L1D)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0042,
                [C(RESULT_MISS)] = 0x0538,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0043,
                [C(RESULT_MISS)] = 0x0562,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(L1I)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0300,
                [C(RESULT_MISS)] = 0x0301,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x030a,
                [C(RESULT_MISS)] = 0x030b,
        },
},
[C(LL)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(DTLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0042,
                [C(RESULT_MISS)] = 0x052c,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0043,
                [C(RESULT_MISS)] = 0x0530,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x0564,
                [C(RESULT_MISS)] = 0x0565,
        },
},
[C(ITLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x00c0,
                [C(RESULT_MISS)] = 0x0534,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(BPU)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0700,
                [C(RESULT_MISS)] = 0x0709,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(NODE)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
};
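
/*
 * Note: as in the generic x86 hw_cache_event_ids tables, -1 marks an
 * op/result combination that is not supported and 0 marks one with no
 * event available. The zxe table below follows the same layout as the
 * zxd table above; the two differ mainly in the L1D/DTLB access events
 * and in the branch (BPU) event codes.
 */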
static __initconst const u64 zxe_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
[C(L1D)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0568,
                [C(RESULT_MISS)] = 0x054b,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0669,
                [C(RESULT_MISS)] = 0x0562,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(L1I)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0300,
                [C(RESULT_MISS)] = 0x0301,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x030a,
                [C(RESULT_MISS)] = 0x030b,
        },
},
[C(LL)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0,
                [C(RESULT_MISS)] = 0x0,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0,
                [C(RESULT_MISS)] = 0x0,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x0,
                [C(RESULT_MISS)] = 0x0,
        },
},
[C(DTLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0568,
                [C(RESULT_MISS)] = 0x052c,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0669,
                [C(RESULT_MISS)] = 0x0530,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x0564,
                [C(RESULT_MISS)] = 0x0565,
        },
},
[C(ITLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x00c0,
                [C(RESULT_MISS)] = 0x0534,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(BPU)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0028,
                [C(RESULT_MISS)] = 0x0029,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(NODE)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
};
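
/*
 * Global control helpers: MSR_CORE_PERF_GLOBAL_CTRL gates all counters at
 * once, MSR_CORE_PERF_GLOBAL_STATUS reports the per-counter overflow bits,
 * and writing those bits back to MSR_CORE_PERF_GLOBAL_OVF_CTRL acknowledges
 * them, as in Intel Architectural PerfMon v2.
 */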
static void zhaoxin_pmu_disable_all(void)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}

static void zhaoxin_pmu_enable_all(int added)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
}

static inline u64 zhaoxin_pmu_get_status(void)
{
        u64 status;

        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

        return status;
}

static inline void zhaoxin_pmu_ack_status(u64 ack)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static inline void zxc_pmu_ack_status(u64 ack)
{
        /*
         * ZXC needs global control enabled in order to clear status bits.
         */
        zhaoxin_pmu_enable_all(0);
        zhaoxin_pmu_ack_status(ack);
        zhaoxin_pmu_disable_all();
}
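
/*
 * Fixed counters are controlled through MSR_ARCH_PERFMON_FIXED_CTR_CTRL,
 * which holds a 4-bit field per fixed counter: bit 0 enables ring-0 (OS)
 * counting, bit 1 enables ring-3 (USR) counting and bit 3 enables PMI on
 * overflow.
 */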
static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
{
        int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
        u64 ctrl_val, mask;

        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        wrmsrl(hwc->config_base, ctrl_val);
}

static void zhaoxin_pmu_disable_event(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;

        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                zhaoxin_pmu_disable_fixed(hwc);
                return;
        }

        x86_pmu_disable_event(event);
}

static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
{
        int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;

        /*
         * Enable IRQ generation (0x8),
         * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
         * if requested:
         */
        bits = 0x8ULL;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
                bits |= 0x2;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
                bits |= 0x1;

        bits <<= (idx * 4);
        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        ctrl_val |= bits;
        wrmsrl(hwc->config_base, ctrl_val);
}

static void zhaoxin_pmu_enable_event(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;

        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                zhaoxin_pmu_enable_fixed(hwc);
                return;
        }

        __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
{
        struct perf_sample_data data;
        struct cpu_hw_events *cpuc;
        int handled = 0;
        u64 status;
        int bit;

        cpuc = this_cpu_ptr(&cpu_hw_events);
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        zhaoxin_pmu_disable_all();
        status = zhaoxin_pmu_get_status();
        if (!status)
                goto done;

again:
        if (x86_pmu.enabled_ack)
                zxc_pmu_ack_status(status);
        else
                zhaoxin_pmu_ack_status(status);

        inc_irq_stat(apic_perf_irqs);

        /*
         * CondChgd bit 63 doesn't indicate any counter overflow. Ignore
         * and clear the bit.
         */
        if (__test_and_clear_bit(63, (unsigned long *)&status)) {
                if (!status)
                        goto done;
        }

        for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_event *event = cpuc->events[bit];

                handled++;

                if (!test_bit(bit, cpuc->active_mask))
                        continue;

                x86_perf_event_update(event);
                perf_sample_data_init(&data, 0, event->hw.last_period);

                if (!x86_perf_event_set_period(event))
                        continue;

                if (perf_event_overflow(event, &data, regs))
                        x86_pmu_stop(event, 0);
        }

        /*
         * Repeat if there is more work to be done:
         */
        status = zhaoxin_pmu_get_status();
        if (status)
                goto again;

done:
        zhaoxin_pmu_enable_all(0);
        return handled;
}

static u64 zhaoxin_pmu_event_map(int hw_event)
{
        return zx_pmon_event_map[hw_event];
}

static struct event_constraint *
zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                        struct perf_event *event)
{
        struct event_constraint *c;

        if (x86_pmu.event_constraints) {
                for_each_event_constraint(c, x86_pmu.event_constraints) {
                        if ((event->hw.config & c->cmask) == c->code)
                                return c;
                }
        }

        return &unconstrained;
}

PMU_FORMAT_ATTR(event, "config:0-7");
PMU_FORMAT_ATTR(umask, "config:8-15");
PMU_FORMAT_ATTR(edge, "config:18");
PMU_FORMAT_ATTR(inv, "config:23");
PMU_FORMAT_ATTR(cmask, "config:24-31");

static struct attribute *zx_arch_formats_attr[] = {
        &format_attr_event.attr,
        &format_attr_umask.attr,
        &format_attr_edge.attr,
        &format_attr_inv.attr,
        &format_attr_cmask.attr,
        NULL,
};

static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
{
        u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);

        return x86_event_sysfs_show(page, config, event);
}

static const struct x86_pmu zhaoxin_pmu __initconst = {
        .name = "zhaoxin",
        .handle_irq = zhaoxin_pmu_handle_irq,
        .disable_all = zhaoxin_pmu_disable_all,
        .enable_all = zhaoxin_pmu_enable_all,
        .enable = zhaoxin_pmu_enable_event,
        .disable = zhaoxin_pmu_disable_event,
        .hw_config = x86_pmu_hw_config,
        .schedule_events = x86_schedule_events,
        .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map = zhaoxin_pmu_event_map,
        .max_events = ARRAY_SIZE(zx_pmon_event_map),
        .apic = 1,
        /*
         * For zxd/zxe, reads and writes of the PMCx MSRs are 48 bits wide.
         */
        .max_period = (1ULL << 47) - 1,
        .get_event_constraints = zhaoxin_get_event_constraints,

        .format_attrs = zx_arch_formats_attr,
        .events_sysfs_show = zhaoxin_event_sysfs_show,
};
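
/*
 * Architectural events that CPUID leaf 0xa may flag as unavailable. The
 * table is indexed by bit position in CPUID.0AH:EBX (x86_pmu.events_mask);
 * the quirk below zeroes the zx_pmon_event_map entry for every bit that
 * is set.
 */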
static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
        { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
        { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
        { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
        { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
        { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
        { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
        { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
};

static __init void zhaoxin_arch_events_quirk(void)
{
        int bit;

        /* disable events reported as not present by cpuid */
        for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
                zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
                pr_warn("CPUID marked event: \'%s\' unavailable\n",
                        zx_arch_events_map[bit].name);
        }
}

__init int zhaoxin_pmu_init(void)
{
        union cpuid10_edx edx;
        union cpuid10_eax eax;
        union cpuid10_ebx ebx;
        struct event_constraint *c;
        unsigned int unused;
        int version;

        pr_info("Welcome to zhaoxin pmu!\n");

        /*
         * Check whether the Architectural PerfMon supports
         * hw_event or not.
         */
        cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);

        if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
                return -ENODEV;

        version = eax.split.version_id;
        if (version != 2)
                return -ENODEV;

        x86_pmu = zhaoxin_pmu;
        pr_info("Version check pass!\n");

        x86_pmu.version = version;
        x86_pmu.num_counters = eax.split.num_counters;
        x86_pmu.cntval_bits = eax.split.bit_width;
        x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
        x86_pmu.events_maskl = ebx.full;
        x86_pmu.events_mask_len = eax.split.mask_length;

        x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
        x86_add_quirk(zhaoxin_arch_events_quirk);
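
        /*
         * Family 0x06 covers the ZXC series (older Nano parts are excluded
         * below by model/stepping); family 0x07 covers ZXD (model 0x1b) and
         * ZXE (model 0x3b).
         */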
        switch (boot_cpu_data.x86) {
        case 0x06:
                /*
                 * Support Zhaoxin CPUs from the ZXC series onwards; exclude the Nano series by FMS.
                 * Nano FMS: Family=6, Model=F, Stepping=[0-A][C-D]
                 * ZXC FMS: Family=6, Model=F, Stepping=E-F OR Family=6, Model=0x19, Stepping=0-3
                 */
                if ((boot_cpu_data.x86_model == 0x0f && boot_cpu_data.x86_stepping >= 0x0e) ||
                        boot_cpu_data.x86_model == 0x19) {

                        x86_pmu.max_period = x86_pmu.cntval_mask >> 1;

                        /* Clearing status works only if the global control is enabled on zxc. */
                        x86_pmu.enabled_ack = 1;

                        x86_pmu.event_constraints = zxc_event_constraints;
                        zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
                        zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
                        zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
                        zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;

                        pr_cont("ZXC events, ");
                        break;
                }
                return -ENODEV;

        case 0x07:
                zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
                        X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);

                zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
                        X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);

                switch (boot_cpu_data.x86_model) {
                case 0x1b:
                        memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
                               sizeof(hw_cache_event_ids));

                        x86_pmu.event_constraints = zxd_event_constraints;

                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;

                        pr_cont("ZXD events, ");
                        break;
                case 0x3b:
                        memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
                               sizeof(hw_cache_event_ids));

                        x86_pmu.event_constraints = zxd_event_constraints;

                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;

                        pr_cont("ZXE events, ");
                        break;
                default:
                        return -ENODEV;
                }
                break;

        default:
                return -ENODEV;
        }

        x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
        x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;

        if (x86_pmu.event_constraints) {
                for_each_event_constraint(c, x86_pmu.event_constraints) {
                        c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
                        c->weight += x86_pmu.num_counters;
                }
        }

        return 0;
}