/*
 * Intel(R) Processor Trace PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
 * Programming Reference:
 * http://software.intel.com/en-us/intel-isa-extensions
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>

#include "../perf_event.h"
#include "pt.h"

static DEFINE_PER_CPU(struct pt, pt_ctx);

static struct pt_pmu pt_pmu;

/*
 * Capabilities of Intel PT hardware, such as number of address bits or
 * supported output schemes, are cached and exported to userspace as "caps"
 * attribute group of pt pmu device
 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
 * relevant bits together with intel_pt traces.
 *
 * These are necessary for both trace decoding (payloads_lip, contains address
 * width encoded in IP-related packets), and event configuration (bitmasks with
 * permitted values for certain bit fields).
 */
#define PT_CAP(_n, _l, _r, _m)						\
	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
			    .reg = _r, .mask = _m }

static struct pt_cap_desc {
	const char	*name;
	u32		leaf;
	u8		reg;
	u32		mask;
} pt_caps[] = {
	PT_CAP(max_subleaf,		0, CPUID_EAX, 0xffffffff),
	PT_CAP(cr3_filtering,		0, CPUID_EBX, BIT(0)),
	PT_CAP(psb_cyc,			0, CPUID_EBX, BIT(1)),
	PT_CAP(ip_filtering,		0, CPUID_EBX, BIT(2)),
	PT_CAP(mtc,			0, CPUID_EBX, BIT(3)),
	PT_CAP(ptwrite,			0, CPUID_EBX, BIT(4)),
	PT_CAP(power_event_trace,	0, CPUID_EBX, BIT(5)),
	PT_CAP(topa_output,		0, CPUID_ECX, BIT(0)),
	PT_CAP(topa_multiple_entries,	0, CPUID_ECX, BIT(1)),
	PT_CAP(single_range_output,	0, CPUID_ECX, BIT(2)),
	PT_CAP(payloads_lip,		0, CPUID_ECX, BIT(31)),
	PT_CAP(num_address_ranges,	1, CPUID_EAX, 0x3),
	PT_CAP(mtc_periods,		1, CPUID_EAX, 0xffff0000),
	PT_CAP(cycle_thresholds,	1, CPUID_EBX, 0xffff),
	PT_CAP(psb_periods,		1, CPUID_EBX, 0xffff0000),
};

static u32 pt_cap_get(enum pt_capabilities cap)
{
	struct pt_cap_desc *cd = &pt_caps[cap];
	u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
	unsigned int shift = __ffs(cd->mask);

	return (c & cd->mask) >> shift;
}

static ssize_t pt_cap_show(struct device *cdev,
			   struct device_attribute *attr,
			   char *buf)
{
	struct dev_ext_attribute *ea =
		container_of(attr, struct dev_ext_attribute, attr);
	enum pt_capabilities cap = (long)ea->var;

	return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
}

static struct attribute_group pt_cap_group = {
	.name	= "caps",
};
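
/*
 * The format attributes below map perf_event_attr::config bits to RTIT_CTL
 * fields. For instance (illustrative only; accepted values depend on the
 * capabilities above and are checked in pt_event_valid()), the perf tool can
 * request cycle-accurate mode and an MTC period with something like:
 *
 *   perf record -e intel_pt/cyc=1,mtc=1,mtc_period=3/u -- workload
 */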

PMU_FORMAT_ATTR(cyc,		"config:1"	);
PMU_FORMAT_ATTR(pwr_evt,	"config:4"	);
PMU_FORMAT_ATTR(fup_on_ptw,	"config:5"	);
PMU_FORMAT_ATTR(mtc,		"config:9"	);
PMU_FORMAT_ATTR(tsc,		"config:10"	);
PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
PMU_FORMAT_ATTR(ptw,		"config:12"	);
PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);

static struct attribute *pt_formats_attr[] = {
	&format_attr_cyc.attr,
	&format_attr_pwr_evt.attr,
	&format_attr_fup_on_ptw.attr,
	&format_attr_mtc.attr,
	&format_attr_tsc.attr,
	&format_attr_noretcomp.attr,
	&format_attr_ptw.attr,
	&format_attr_mtc_period.attr,
	&format_attr_cyc_thresh.attr,
	&format_attr_psb_period.attr,
	NULL,
};

static struct attribute_group pt_format_group = {
	.name	= "format",
	.attrs	= pt_formats_attr,
};

static ssize_t
pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
		    char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	switch (pmu_attr->id) {
	case 0:
		return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
	case 1:
		return sprintf(page, "%u:%u\n",
			       pt_pmu.tsc_art_num,
			       pt_pmu.tsc_art_den);
	default:
		break;
	}

	return -EINVAL;
}

PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
	       pt_timing_attr_show);
PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
	       pt_timing_attr_show);

static struct attribute *pt_timing_attr[] = {
	&timing_attr_max_nonturbo_ratio.attr.attr,
	&timing_attr_tsc_art_ratio.attr.attr,
	NULL,
};

static struct attribute_group pt_timing_group = {
	.attrs	= pt_timing_attr,
};

static const struct attribute_group *pt_attr_groups[] = {
	&pt_cap_group,
	&pt_format_group,
	&pt_timing_group,
	NULL,
};

static int __init pt_pmu_hw_init(void)
{
	struct dev_ext_attribute *de_attrs;
	struct attribute **attrs;
	size_t size;
	u64 reg;
	int ret;
	long i;

	rdmsrl(MSR_PLATFORM_INFO, reg);
	pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;

	/*
	 * If available, read the TSC to core crystal clock ratio;
	 * otherwise, a zero numerator stands for "not enumerated",
	 * as per the SDM.
	 */
	if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
		u32 eax, ebx, ecx, edx;

		cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);

		pt_pmu.tsc_art_num = ebx;
		pt_pmu.tsc_art_den = eax;
	}

	if (boot_cpu_has(X86_FEATURE_VMX)) {
		/*
		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
		 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
		 * post-VMXON.
		 */
		rdmsrl(MSR_IA32_VMX_MISC, reg);
		if (reg & BIT(14))
			pt_pmu.vmx = true;
	}

	attrs = NULL;

	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		cpuid_count(20, i,
			    &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
			    &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
			    &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
			    &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
	}

	ret = -ENOMEM;
	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
	attrs = kzalloc(size, GFP_KERNEL);
	if (!attrs)
		goto fail;

	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
	de_attrs = kzalloc(size, GFP_KERNEL);
	if (!de_attrs)
		goto fail;

	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
		struct dev_ext_attribute *de_attr = de_attrs + i;

		de_attr->attr.attr.name = pt_caps[i].name;

		sysfs_attr_init(&de_attr->attr.attr);

		de_attr->attr.attr.mode	= S_IRUGO;
		de_attr->attr.show	= pt_cap_show;
		de_attr->var		= (void *)i;

		attrs[i] = &de_attr->attr.attr;
	}

	pt_cap_group.attrs = attrs;

	return 0;

fail:
	kfree(attrs);

	return ret;
}

#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC	| \
			  RTIT_CTL_CYC_THRESH	| \
			  RTIT_CTL_PSB_FREQ)

#define RTIT_CTL_MTC	(RTIT_CTL_MTC_EN	| \
			 RTIT_CTL_MTC_RANGE)

#define RTIT_CTL_PTW	(RTIT_CTL_PTW_EN	| \
			 RTIT_CTL_FUP_ON_PTW)

#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN		| \
			RTIT_CTL_DISRETC	| \
			RTIT_CTL_CYC_PSB	| \
			RTIT_CTL_MTC		| \
			RTIT_CTL_PWR_EVT_EN	| \
			RTIT_CTL_FUP_ON_PTW	| \
			RTIT_CTL_PTW_EN)
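
/*
 * Illustrative example of the validation in pt_event_valid() below: the
 * psb_periods and cycle_thresholds caps are bitmasks of supported encodings,
 * so a psb_periods value of 0x3f means psb_period encodings 0-5 are valid,
 * while a requested psb_period of 6 is rejected (0 is always allowed).
 */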

static bool pt_event_valid(struct perf_event *event)
{
	u64 config = event->attr.config;
	u64 allowed, requested;

	if ((config & PT_CONFIG_MASK) != config)
		return false;

	if (config & RTIT_CTL_CYC_PSB) {
		if (!pt_cap_get(PT_CAP_psb_cyc))
			return false;

		allowed = pt_cap_get(PT_CAP_psb_periods);
		requested = (config & RTIT_CTL_PSB_FREQ) >>
			RTIT_CTL_PSB_FREQ_OFFSET;
		if (requested && (!(allowed & BIT(requested))))
			return false;

		allowed = pt_cap_get(PT_CAP_cycle_thresholds);
		requested = (config & RTIT_CTL_CYC_THRESH) >>
			RTIT_CTL_CYC_THRESH_OFFSET;
		if (requested && (!(allowed & BIT(requested))))
			return false;
	}

	if (config & RTIT_CTL_MTC) {
		/*
		 * In the unlikely case that CPUID lists valid mtc periods,
		 * but not the mtc capability, drop out here.
		 *
		 * Spec says that setting mtc period bits while mtc bit in
		 * CPUID is 0 will #GP, so better safe than sorry.
		 */
		if (!pt_cap_get(PT_CAP_mtc))
			return false;

		allowed = pt_cap_get(PT_CAP_mtc_periods);
		if (!allowed)
			return false;

		requested = (config & RTIT_CTL_MTC_RANGE) >>
			RTIT_CTL_MTC_RANGE_OFFSET;

		if (!(allowed & BIT(requested)))
			return false;
	}

	if (config & RTIT_CTL_PWR_EVT_EN &&
	    !pt_cap_get(PT_CAP_power_event_trace))
		return false;

	if (config & RTIT_CTL_PTW) {
		if (!pt_cap_get(PT_CAP_ptwrite))
			return false;

		/* FUPonPTW without PTW doesn't make sense */
		if ((config & RTIT_CTL_FUP_ON_PTW) &&
		    !(config & RTIT_CTL_PTW_EN))
			return false;
	}

	return true;
}

/*
 * PT configuration helpers
 * These all are cpu affine and operate on a local PT
 */

/* Address ranges and their corresponding msr configuration registers */
static const struct pt_address_range {
	unsigned long	msr_a;
	unsigned long	msr_b;
	unsigned int	reg_off;
} pt_address_ranges[] = {
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR0_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR0_B,
		.reg_off = RTIT_CTL_ADDR0_OFFSET,
	},
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR1_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR1_B,
		.reg_off = RTIT_CTL_ADDR1_OFFSET,
	},
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR2_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR2_B,
		.reg_off = RTIT_CTL_ADDR2_OFFSET,
	},
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR3_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR3_B,
		.reg_off = RTIT_CTL_ADDR3_OFFSET,
	}
};
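
/*
 * Note (added for clarity): pt_config_filters() programs each enabled range's
 * ADDRn_A/ADDRn_B MSRs and ORs the range's configuration value into RTIT_CTL
 * at reg_off; pt_event_addr_filters_sync() below uses 1 for a "trace only in
 * range" filter and 2 for a TraceStop range, matching the ADDRn_CFG encoding.
 */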

static u64 pt_config_filters(struct perf_event *event)
{
	struct pt_filters *filters = event->hw.addr_filters;
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	unsigned int range = 0;
	u64 rtit_ctl = 0;

	if (!filters)
		return 0;

	perf_event_addr_filters_sync(event);

	for (range = 0; range < filters->nr_filters; range++) {
		struct pt_filter *filter = &filters->filter[range];

		/*
		 * Note, if the range has zero start/end addresses due
		 * to its dynamic object not being loaded yet, we just
		 * go ahead and program a zeroed range, which will simply
		 * produce no data. Note^2: if executable code at 0x0
		 * is a concern, we can set up an "invalid" configuration
		 * such as msr_b < msr_a.
		 */

		/* avoid redundant msr writes */
		if (pt->filters.filter[range].msr_a != filter->msr_a) {
			wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
			pt->filters.filter[range].msr_a = filter->msr_a;
		}

		if (pt->filters.filter[range].msr_b != filter->msr_b) {
			wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
			pt->filters.filter[range].msr_b = filter->msr_b;
		}

		rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
	}

	return rtit_ctl;
}

static void pt_config(struct perf_event *event)
{
	u64 reg;

	if (!event->hw.itrace_started) {
		event->hw.itrace_started = 1;
		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
	}

	reg = pt_config_filters(event);
	reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;

	if (!event->attr.exclude_kernel)
		reg |= RTIT_CTL_OS;
	if (!event->attr.exclude_user)
		reg |= RTIT_CTL_USR;

	reg |= (event->attr.config & PT_CONFIG_MASK);

	event->hw.config = reg;
	wrmsrl(MSR_IA32_RTIT_CTL, reg);
}

static void pt_config_stop(struct perf_event *event)
{
	u64 ctl = READ_ONCE(event->hw.config);

	/* may be already stopped by a PMI */
	if (!(ctl & RTIT_CTL_TRACEEN))
		return;

	ctl &= ~RTIT_CTL_TRACEEN;
	wrmsrl(MSR_IA32_RTIT_CTL, ctl);

	WRITE_ONCE(event->hw.config, ctl);

	/*
	 * A wrmsr that disables trace generation serializes other PT
	 * registers and causes all data packets to be written to memory,
	 * but a fence is required for the data to become globally visible.
	 *
	 * The below WMB, separating data store and aux_head store matches
	 * the consumer's RMB that separates aux_head load and data load.
	 */
	wmb();
}
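
/*
 * Note (added for clarity): the value written to MSR_IA32_RTIT_OUTPUT_MASK
 * below packs the low mask bits (0x7f), the index of the current ToPA entry
 * in bits 31:7 and the offset within the current output region in bits 63:32;
 * pt_read_offset() decodes the same layout back into buffer pointers.
 */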

static void pt_config_buffer(void *buf, unsigned int topa_idx,
			     unsigned int output_off)
{
	u64 reg;

	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));

	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);

	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
}

/*
 * Keep ToPA table-related metadata on the same page as the actual table,
 * taking up a few words from the top
 */

#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
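
/*
 * For example (with 4KiB pages and 8-byte ToPA entries), this works out to
 * (4096 - 40) / 8 - 1 = 506 entries, leaving the tail of the page for the
 * metadata fields of struct topa below; pt_init() has a BUILD_BUG_ON() that
 * checks the whole struct still fits in a page.
 */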

/**
 * struct topa - page-sized ToPA table with metadata at the top
 * @table:	actual ToPA table entries, as understood by PT hardware
 * @list:	linkage to struct pt_buffer's list of tables
 * @phys:	physical address of this page
 * @offset:	offset of the first entry in this table in the buffer
 * @size:	total size of all entries in this table
 * @last:	index of the last initialized entry in this table
 */
struct topa {
	struct topa_entry	table[TENTS_PER_PAGE];
	struct list_head	list;
	u64			phys;
	u64			offset;
	size_t			size;
	int			last;
};

/* make -1 stand for the last table entry */
#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])

/**
 * topa_alloc() - allocate page-sized ToPA table
 * @cpu:	CPU on which to allocate.
 * @gfp:	Allocation flags.
 *
 * Return:	On success, return the pointer to ToPA table page.
 */
static struct topa *topa_alloc(int cpu, gfp_t gfp)
{
	int node = cpu_to_node(cpu);
	struct topa *topa;
	struct page *p;

	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
	if (!p)
		return NULL;

	topa = page_address(p);
	topa->last = 0;
	topa->phys = page_to_phys(p);

	/*
	 * In case of single-entry ToPA, always put the self-referencing END
	 * link as the 2nd entry in the table
	 */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
		TOPA_ENTRY(topa, 1)->end = 1;
	}

	return topa;
}

/**
 * topa_free() - free a page-sized ToPA table
 * @topa:	Table to deallocate.
 */
static void topa_free(struct topa *topa)
{
	free_page((unsigned long)topa);
}

/**
 * topa_insert_table() - insert a ToPA table into a buffer
 * @buf:	PT buffer that's being extended.
 * @topa:	New topa table to be inserted.
 *
 * If it's the first table in this buffer, set up buffer's pointers
 * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
 * current "last" table and adjust the last table pointer to @topa.
 */
static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
{
	struct topa *last = buf->last;

	list_add_tail(&topa->list, &buf->tables);

	if (!buf->first) {
		buf->first = buf->last = buf->cur = topa;
		return;
	}

	topa->offset = last->offset + last->size;
	buf->last = topa;

	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		return;

	BUG_ON(last->last != TENTS_PER_PAGE - 1);

	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
	TOPA_ENTRY(last, -1)->end = 1;
}

/**
 * topa_table_full() - check if a ToPA table is filled up
 * @topa:	ToPA table.
 */
static bool topa_table_full(struct topa *topa)
{
	/* single-entry ToPA is a special case */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		return !!topa->last;

	return topa->last == TENTS_PER_PAGE - 1;
}

/**
 * topa_insert_pages() - create a list of ToPA tables
 * @buf:	PT buffer being initialized.
 * @gfp:	Allocation flags.
 *
 * This initializes a list of ToPA tables with entries from
 * the data_pages provided by rb_alloc_aux().
 *
 * Return:	0 on success or error code.
 */
static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
{
	struct topa *topa = buf->last;
	int order = 0;
	struct page *p;

	p = virt_to_page(buf->data_pages[buf->nr_pages]);
	if (PagePrivate(p))
		order = page_private(p);

	if (topa_table_full(topa)) {
		topa = topa_alloc(buf->cpu, gfp);
		if (!topa)
			return -ENOMEM;

		topa_insert_table(buf, topa);
	}

	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
	TOPA_ENTRY(topa, -1)->size = order;
	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(topa, -1)->intr = 1;
		TOPA_ENTRY(topa, -1)->stop = 1;
	}

	topa->last++;
	topa->size += sizes(order);

	buf->nr_pages += 1ul << order;

	return 0;
}

/**
 * pt_topa_dump() - print ToPA tables and their entries
 * @buf:	PT buffer.
 */
static void pt_topa_dump(struct pt_buffer *buf)
{
	struct topa *topa;

	list_for_each_entry(topa, &buf->tables, list) {
		int i;

		pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
			 topa->phys, topa->offset, topa->size);
		for (i = 0; i < TENTS_PER_PAGE; i++) {
			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
				 &topa->table[i],
				 (unsigned long)topa->table[i].base << TOPA_SHIFT,
				 sizes(topa->table[i].size),
				 topa->table[i].end ? 'E' : ' ',
				 topa->table[i].intr ? 'I' : ' ',
				 topa->table[i].stop ? 'S' : ' ',
				 *(u64 *)&topa->table[i]);
			if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
			     topa->table[i].stop) ||
			    topa->table[i].end)
				break;
		}
	}
}

/**
 * pt_buffer_advance() - advance to the next output region
 * @buf:	PT buffer.
 *
 * Advance the current pointers in the buffer to the next ToPA entry.
 */
static void pt_buffer_advance(struct pt_buffer *buf)
{
	buf->output_off = 0;
	buf->cur_idx++;

	if (buf->cur_idx == buf->cur->last) {
		if (buf->cur == buf->last)
			buf->cur = buf->first;
		else
			buf->cur = list_entry(buf->cur->list.next, struct topa,
					      list);
		buf->cur_idx = 0;
	}
}
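
/*
 * Illustrative example of the wrap-around handling in pt_update_head() below:
 * with a 16-page (64KiB) AUX buffer, if the previous head was at 60KiB and the
 * newly computed position is 4KiB, the write pointer has wrapped, so 64KiB is
 * added before the difference (8KiB of new data) is accounted in data_size.
 */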

/**
 * pt_update_head() - calculate current offsets and sizes
 * @pt:		Per-cpu pt context.
 *
 * Update buffer's current write pointer position and data size.
 */
static void pt_update_head(struct pt *pt)
{
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	u64 topa_idx, base, old;

	/* offset of the first region in this table from the beginning of buf */
	base = buf->cur->offset + buf->output_off;

	/* offset of the current output region within this table */
	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
		base += sizes(buf->cur->table[topa_idx].size);

	if (buf->snapshot) {
		local_set(&buf->data_size, base);
	} else {
		old = (local64_xchg(&buf->head, base) &
		       ((buf->nr_pages << PAGE_SHIFT) - 1));
		if (base < old)
			base += buf->nr_pages << PAGE_SHIFT;

		local_add(base - old, &buf->data_size);
	}
}

/**
 * pt_buffer_region() - obtain current output region's address
 * @buf:	PT buffer.
 */
static void *pt_buffer_region(struct pt_buffer *buf)
{
	return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
}

/**
 * pt_buffer_region_size() - obtain current output region's size
 * @buf:	PT buffer.
 */
static size_t pt_buffer_region_size(struct pt_buffer *buf)
{
	return sizes(buf->cur->table[buf->cur_idx].size);
}

/**
 * pt_handle_status() - take care of possible status conditions
 * @pt:		Per-cpu pt context.
 */
static void pt_handle_status(struct pt *pt)
{
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	int advance = 0;
	u64 status;

	rdmsrl(MSR_IA32_RTIT_STATUS, status);

	if (status & RTIT_STATUS_ERROR) {
		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
		pt_topa_dump(buf);
		status &= ~RTIT_STATUS_ERROR;
	}

	if (status & RTIT_STATUS_STOPPED) {
		status &= ~RTIT_STATUS_STOPPED;

		/*
		 * On systems that only do single-entry ToPA, hitting STOP
		 * means we are already losing data; need to let the decoder
		 * know.
		 */
		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
			local_inc(&buf->lost);
			advance++;
		}
	}

	/*
	 * Also on single-entry ToPA implementations, interrupt will come
	 * before the output reaches its output region's boundary.
	 */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
		void *head = pt_buffer_region(buf);

		/* everything within this margin needs to be zeroed out */
		memset(head + buf->output_off, 0,
		       pt_buffer_region_size(buf) -
		       buf->output_off);
		advance++;
	}

	if (advance)
		pt_buffer_advance(buf);

	wrmsrl(MSR_IA32_RTIT_STATUS, status);
}

/**
 * pt_read_offset() - translate registers into buffer pointers
 * @buf:	PT buffer.
 *
 * Set buffer's output pointers from MSR values.
 */
static void pt_read_offset(struct pt_buffer *buf)
{
	u64 offset, base_topa;

	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
	buf->cur = phys_to_virt(base_topa);

	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
	/* offset within current output region */
	buf->output_off = offset >> 32;
	/* index of current output region within this table */
	buf->cur_idx = (offset & 0xffffff80) >> 7;
}
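
/*
 * Note (added for clarity): topa_index[], built by pt_buffer_setup_topa_index()
 * below, is a per-page reverse lookup into the ToPA entries; the loop in
 * pt_topa_next_entry() simply walks forward until the referenced entry
 * changes, which marks the first page of the next output region.
 */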

/**
 * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
 * @buf:	PT buffer.
 * @pg:		Page offset in the buffer.
 *
 * When advancing to the next output region (ToPA entry), given a page offset
 * into the buffer, we need to find the offset of the first page in the next
 * region.
 */
static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
{
	struct topa_entry *te = buf->topa_index[pg];

	/* one region */
	if (buf->first == buf->last && buf->first->last == 1)
		return pg;

	do {
		pg++;
		pg &= buf->nr_pages - 1;
	} while (buf->topa_index[pg] == te);

	return pg;
}
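
/*
 * Note (added for clarity): in pt_buffer_reset_markers() below, the STOP+INT
 * marks land on the first region that would overwrite data the consumer has
 * not collected yet, and an additional INT mark lands either one page before
 * that or on the wakeup page, whichever comes first.
 */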

/**
 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
 * @buf:	PT buffer.
 * @handle:	Current output handle.
 *
 * Place INT and STOP marks to prevent overwriting old data that the consumer
 * hasn't yet collected and waking up the consumer after a certain fraction of
 * the buffer has filled up. Only needed and sensible for non-snapshot counters.
 *
 * This obviously relies on buf::head to figure out buffer markers, so it has
 * to be called after pt_buffer_reset_offsets() and before the hardware tracing
 * is enabled.
 */
static int pt_buffer_reset_markers(struct pt_buffer *buf,
				   struct perf_output_handle *handle)
{
	unsigned long head = local64_read(&buf->head);
	unsigned long idx, npages, wakeup;

	/* can't stop in the middle of an output region */
	if (buf->output_off + handle->size + 1 <
	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
		return -EINVAL;

	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		return 0;

	/* clear STOP and INT from current entry */
	buf->topa_index[buf->stop_pos]->stop = 0;
	buf->topa_index[buf->stop_pos]->intr = 0;
	buf->topa_index[buf->intr_pos]->intr = 0;

	/* how many pages till the STOP marker */
	npages = handle->size >> PAGE_SHIFT;

	/* if it's on a page boundary, fill up one more page */
	if (!offset_in_page(head + handle->size + 1))
		npages++;

	idx = (head >> PAGE_SHIFT) + npages;
	idx &= buf->nr_pages - 1;
	buf->stop_pos = idx;

	wakeup = handle->wakeup >> PAGE_SHIFT;

	/* in the worst case, wake up the consumer one page before hard stop */
	idx = (head >> PAGE_SHIFT) + npages - 1;
	if (idx > wakeup)
		idx = wakeup;

	idx &= buf->nr_pages - 1;
	buf->intr_pos = idx;

	buf->topa_index[buf->stop_pos]->stop = 1;
	buf->topa_index[buf->stop_pos]->intr = 1;
	buf->topa_index[buf->intr_pos]->intr = 1;

	return 0;
}

/**
 * pt_buffer_setup_topa_index() - build topa_index[] table of regions
 * @buf:	PT buffer.
 *
 * topa_index[] references output regions indexed by offset into the
 * buffer for purposes of quick reverse lookup.
 */
static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
{
	struct topa *cur = buf->first, *prev = buf->last;
	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
	int pg = 0, idx = 0;

	while (pg < buf->nr_pages) {
		int tidx;

		/* pages within one topa entry */
		for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
			buf->topa_index[pg] = te_prev;

		te_prev = te_cur;

		if (idx == cur->last - 1) {
			/* advance to next topa table */
			idx = 0;
			cur = list_entry(cur->list.next, struct topa, list);
		} else {
			idx++;
		}
		te_cur = TOPA_ENTRY(cur, idx);
	}
}

/**
 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
 * @buf:	PT buffer.
 * @head:	Write pointer (aux_head) from AUX buffer.
 *
 * Find the ToPA table and entry corresponding to given @head and set buffer's
 * "current" pointers accordingly. This is done after we have obtained the
 * current aux_head position from a successful call to perf_aux_output_begin()
 * to make sure the hardware is writing to the right place.
 *
 * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
 * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
 * which are used to determine INT and STOP markers' locations by a subsequent
 * call to pt_buffer_reset_markers().
 */
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
{
	int pg;

	if (buf->snapshot)
		head &= (buf->nr_pages << PAGE_SHIFT) - 1;

	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
	pg = pt_topa_next_entry(buf, pg);

	buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
	buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
			(unsigned long)buf->cur) / sizeof(struct topa_entry);
	buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);

	local64_set(&buf->head, head);
	local_set(&buf->data_size, 0);
}

/**
 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
 * @buf:	PT buffer.
 */
static void pt_buffer_fini_topa(struct pt_buffer *buf)
{
	struct topa *topa, *iter;

	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
		/*
		 * right now, this is in free_aux() path only, so
		 * no need to unlink this table from the list
		 */
		topa_free(topa);
	}
}

/**
 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
 * @buf:	PT buffer.
 * @nr_pages:	Number of pages in the buffer.
 * @gfp:	Allocation flags.
 */
static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
			       gfp_t gfp)
{
	struct topa *topa;
	int err;

	topa = topa_alloc(buf->cpu, gfp);
	if (!topa)
		return -ENOMEM;

	topa_insert_table(buf, topa);

	while (buf->nr_pages < nr_pages) {
		err = topa_insert_pages(buf, gfp);
		if (err) {
			pt_buffer_fini_topa(buf);
			return -ENOMEM;
		}
	}

	pt_buffer_setup_topa_index(buf);

	/* link last table to the first one, unless we're double buffering */
	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
		TOPA_ENTRY(buf->last, -1)->end = 1;
	}

	pt_topa_dump(buf);
	return 0;
}

/**
 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
 * @cpu:	Cpu on which to allocate, -1 means current.
 * @pages:	Array of pointers to buffer pages passed from perf core.
 * @nr_pages:	Number of pages in the buffer.
 * @snapshot:	If this is a snapshot/overwrite counter.
 *
 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
 * bookkeeping for an AUX buffer.
 *
 * Return:	Our private PT buffer structure.
 */
static void *
pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
{
	struct pt_buffer *buf;
	int node, ret;

	if (!nr_pages)
		return NULL;

	if (cpu == -1)
		cpu = raw_smp_processor_id();
	node = cpu_to_node(cpu);

	buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
			   GFP_KERNEL, node);
	if (!buf)
		return NULL;

	buf->cpu = cpu;
	buf->snapshot = snapshot;
	buf->data_pages = pages;

	INIT_LIST_HEAD(&buf->tables);

	ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
	if (ret) {
		kfree(buf);
		return NULL;
	}

	return buf;
}

/**
 * pt_buffer_free_aux() - perf AUX deallocation path callback
 * @data:	PT buffer.
 */
static void pt_buffer_free_aux(void *data)
{
	struct pt_buffer *buf = data;

	pt_buffer_fini_topa(buf);
	kfree(buf);
}

static int pt_addr_filters_init(struct perf_event *event)
{
	struct pt_filters *filters;
	int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);

	if (!pt_cap_get(PT_CAP_num_address_ranges))
		return 0;

	filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
	if (!filters)
		return -ENOMEM;

	if (event->parent)
		memcpy(filters, event->parent->hw.addr_filters,
		       sizeof(*filters));

	event->hw.addr_filters = filters;

	return 0;
}

static void pt_addr_filters_fini(struct perf_event *event)
{
	kfree(event->hw.addr_filters);
	event->hw.addr_filters = NULL;
}

static inline bool valid_kernel_ip(unsigned long ip)
{
	return virt_addr_valid(ip) && kernel_ip(ip);
}

static int pt_event_addr_filters_validate(struct list_head *filters)
{
	struct perf_addr_filter *filter;
	int range = 0;

	list_for_each_entry(filter, filters, entry) {
		/* PT doesn't support single address triggers */
		if (!filter->range || !filter->size)
			return -EOPNOTSUPP;

		if (!filter->inode) {
			if (!valid_kernel_ip(filter->offset))
				return -EINVAL;

			if (!valid_kernel_ip(filter->offset + filter->size))
				return -EINVAL;
		}

		if (++range > pt_cap_get(PT_CAP_num_address_ranges))
			return -EOPNOTSUPP;
	}

	return 0;
}
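
/*
 * Illustrative example for pt_event_addr_filters_sync() below: for an
 * object-file filter such as (using perf's --filter syntax, roughly)
 * "filter 0x1000/0x400@/path/to/binary", the core supplies the object's
 * load address in event->addr_filters_offs[], so msr_a becomes load address +
 * 0x1000, msr_b becomes msr_a + 0x400 - 1, and config is 1 (trace only within
 * the range); a "stop" filter gets config 2 instead.
 */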

static void pt_event_addr_filters_sync(struct perf_event *event)
{
	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
	unsigned long msr_a, msr_b, *offs = event->addr_filters_offs;
	struct pt_filters *filters = event->hw.addr_filters;
	struct perf_addr_filter *filter;
	int range = 0;

	if (!filters)
		return;

	list_for_each_entry(filter, &head->list, entry) {
		if (filter->inode && !offs[range]) {
			msr_a = msr_b = 0;
		} else {
			/* apply the offset */
			msr_a = filter->offset + offs[range];
			msr_b = filter->size + msr_a - 1;
		}

		filters->filter[range].msr_a  = msr_a;
		filters->filter[range].msr_b  = msr_b;
		filters->filter[range].config = filter->filter ? 1 : 2;
		range++;
	}

	filters->nr_filters = range;
}

/**
 * intel_pt_interrupt() - PT PMI handler
 */
void intel_pt_interrupt(void)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;
	struct perf_event *event = pt->handle.event;

	/*
	 * There may be a dangling PT bit in the interrupt status register
	 * after PT has been disabled by pt_event_stop(). Make sure we don't
	 * do anything (particularly, re-enable) for this event here.
	 */
	if (!READ_ONCE(pt->handle_nmi))
		return;

	/*
	 * If VMX is on and PT does not support it, don't touch anything.
	 */
	if (READ_ONCE(pt->vmx_on))
		return;

	if (!event)
		return;

	pt_config_stop(event);

	buf = perf_get_aux(&pt->handle);
	if (!buf)
		return;

	pt_read_offset(buf);

	pt_handle_status(pt);

	pt_update_head(pt);

	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
			    local_xchg(&buf->lost, 0));

	if (!event->hw.state) {
		int ret;

		buf = perf_aux_output_begin(&pt->handle, event);
		if (!buf) {
			event->hw.state = PERF_HES_STOPPED;
			return;
		}

		pt_buffer_reset_offsets(buf, pt->handle.head);
		/* snapshot counters don't use PMI, so it's safe */
		ret = pt_buffer_reset_markers(buf, &pt->handle);
		if (ret) {
			perf_aux_output_end(&pt->handle, 0, true);
			return;
		}

		pt_config_buffer(buf->cur->table, buf->cur_idx,
				 buf->output_off);
		pt_config(event);
	}
}

void intel_pt_handle_vmx(int on)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct perf_event *event;
	unsigned long flags;

	/* PT plays nice with VMX, do nothing */
	if (pt_pmu.vmx)
		return;

	/*
	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
	 * sure to not try to set it while VMX is on. Disable
	 * interrupts to avoid racing with pmu callbacks;
	 * concurrent PMI should be handled fine.
	 */
	local_irq_save(flags);
	WRITE_ONCE(pt->vmx_on, on);

	if (on) {
		/* prevent pt_config_stop() from writing RTIT_CTL */
		event = pt->handle.event;
		if (event)
			event->hw.config = 0;
	}
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);

/*
 * PMU callbacks
 */

static void pt_event_start(struct perf_event *event, int mode)
{
	struct hw_perf_event *hwc = &event->hw;
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;

	if (READ_ONCE(pt->vmx_on))
		return;

	buf = perf_aux_output_begin(&pt->handle, event);
	if (!buf)
		goto fail_stop;

	pt_buffer_reset_offsets(buf, pt->handle.head);
	if (!buf->snapshot) {
		if (pt_buffer_reset_markers(buf, &pt->handle))
			goto fail_end_stop;
	}

	WRITE_ONCE(pt->handle_nmi, 1);
	hwc->state = 0;

	pt_config_buffer(buf->cur->table, buf->cur_idx,
			 buf->output_off);
	pt_config(event);

	return;

fail_end_stop:
	perf_aux_output_end(&pt->handle, 0, true);
fail_stop:
	hwc->state = PERF_HES_STOPPED;
}

static void pt_event_stop(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);

	/*
	 * Protect against the PMI racing with disabling wrmsr,
	 * see comment in intel_pt_interrupt().
	 */
	WRITE_ONCE(pt->handle_nmi, 0);

	pt_config_stop(event);

	if (event->hw.state == PERF_HES_STOPPED)
		return;

	event->hw.state = PERF_HES_STOPPED;

	if (mode & PERF_EF_UPDATE) {
		struct pt_buffer *buf = perf_get_aux(&pt->handle);

		if (!buf)
			return;

		if (WARN_ON_ONCE(pt->handle.event != event))
			return;

		pt_read_offset(buf);

		pt_handle_status(pt);

		pt_update_head(pt);

		if (buf->snapshot)
			pt->handle.head =
				local_xchg(&buf->data_size,
					   buf->nr_pages << PAGE_SHIFT);
		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
				    local_xchg(&buf->lost, 0));
	}
}

static void pt_event_del(struct perf_event *event, int mode)
{
	pt_event_stop(event, PERF_EF_UPDATE);
}

static int pt_event_add(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct hw_perf_event *hwc = &event->hw;
	int ret = -EBUSY;

	if (pt->handle.event)
		goto fail;

	if (mode & PERF_EF_START) {
		pt_event_start(event, 0);
		ret = -EINVAL;
		if (hwc->state == PERF_HES_STOPPED)
			goto fail;
	} else {
		hwc->state = PERF_HES_STOPPED;
	}

	ret = 0;
fail:
	return ret;
}

static void pt_event_read(struct perf_event *event)
{
}

static void pt_event_destroy(struct perf_event *event)
{
	pt_addr_filters_fini(event);
	x86_del_exclusive(x86_lbr_exclusive_pt);
}

static int pt_event_init(struct perf_event *event)
{
	if (event->attr.type != pt_pmu.pmu.type)
		return -ENOENT;

	if (!pt_event_valid(event))
		return -EINVAL;

	if (x86_add_exclusive(x86_lbr_exclusive_pt))
		return -EBUSY;

	if (pt_addr_filters_init(event)) {
		x86_del_exclusive(x86_lbr_exclusive_pt);
		return -ENOMEM;
	}

	event->destroy = pt_event_destroy;

	return 0;
}

void cpu_emergency_stop_pt(void)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);

	if (pt->handle.event)
		pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
}

static __init int pt_init(void)
{
	int ret, cpu, prior_warn = 0;

	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);

	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
		return -ENODEV;

	get_online_cpus();
	for_each_online_cpu(cpu) {
		u64 ctl;

		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
		if (!ret && (ctl & RTIT_CTL_TRACEEN))
			prior_warn++;
	}
	put_online_cpus();

	if (prior_warn) {
		x86_add_exclusive(x86_lbr_exclusive_pt);
		pr_warn("PT is enabled at boot time, doing nothing\n");

		return -EBUSY;
	}

	ret = pt_pmu_hw_init();
	if (ret)
		return ret;

	if (!pt_cap_get(PT_CAP_topa_output)) {
		pr_warn("ToPA output is not supported on this CPU\n");
		return -ENODEV;
	}

	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		pt_pmu.pmu.capabilities =
			PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;

	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
	pt_pmu.pmu.attr_groups		 = pt_attr_groups;
	pt_pmu.pmu.task_ctx_nr		 = perf_sw_context;
	pt_pmu.pmu.event_init		 = pt_event_init;
	pt_pmu.pmu.add			 = pt_event_add;
	pt_pmu.pmu.del			 = pt_event_del;
	pt_pmu.pmu.start		 = pt_event_start;
	pt_pmu.pmu.stop			 = pt_event_stop;
	pt_pmu.pmu.read			 = pt_event_read;
	pt_pmu.pmu.setup_aux		 = pt_buffer_setup_aux;
	pt_pmu.pmu.free_aux		 = pt_buffer_free_aux;
	pt_pmu.pmu.addr_filters_sync	 = pt_event_addr_filters_sync;
	pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
	pt_pmu.pmu.nr_addr_filters	 =
		pt_cap_get(PT_CAP_num_address_ranges);

	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);

	return ret;
}
arch_initcall(pt_init);