// SPDX-License-Identifier: GPL-2.0
/*
 * Performance event support for s390x - CPU-measurement Counter Facility
 *
 *  Copyright IBM Corp. 2012, 2023
 *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
 *	       Thomas Richter <tmricht@linux.ibm.com>
 */
#define KMSG_COMPONENT	"cpum_cf"
#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/miscdevice.h>
#include <linux/perf_event.h>

#include <asm/cpu_mf.h>
#include <asm/hwctrset.h>
#include <asm/debug.h>

enum cpumf_ctr_set {
	CPUMF_CTR_SET_BASIC   = 0,	/* Basic Counter Set */
	CPUMF_CTR_SET_USER    = 1,	/* Problem-State Counter Set */
	CPUMF_CTR_SET_CRYPTO  = 2,	/* Crypto-Activity Counter Set */
	CPUMF_CTR_SET_EXT     = 3,	/* Extended Counter Set */
	CPUMF_CTR_SET_MT_DIAG = 4,	/* MT-diagnostic Counter Set */

	/* Maximum number of counter sets */
	CPUMF_CTR_SET_MAX,
};

#define CPUMF_LCCTL_ENABLE_SHIFT	16
#define CPUMF_LCCTL_ACTCTL_SHIFT	0

static inline void ctr_set_enable(u64 *state, u64 ctrsets)
{
	*state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
}

static inline void ctr_set_disable(u64 *state, u64 ctrsets)
{
	*state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
}

static inline void ctr_set_start(u64 *state, u64 ctrsets)
{
	*state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
}

static inline void ctr_set_stop(u64 *state, u64 ctrsets)
{
	*state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
}

static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest)
{
	switch (set) {
	case CPUMF_CTR_SET_BASIC:
		return stcctm(BASIC, range, dest);
	case CPUMF_CTR_SET_USER:
		return stcctm(PROBLEM_STATE, range, dest);
	case CPUMF_CTR_SET_CRYPTO:
		return stcctm(CRYPTO_ACTIVITY, range, dest);
	case CPUMF_CTR_SET_EXT:
		return stcctm(EXTENDED, range, dest);
	case CPUMF_CTR_SET_MT_DIAG:
		return stcctm(MT_DIAG_CLEARING, range, dest);
	case CPUMF_CTR_SET_MAX:
		return 3;
	}
	return 3;
}

struct cpu_cf_events {
	atomic_t	ctr_set[CPUMF_CTR_SET_MAX];
	u64		state;		/* For perf_event_open SVC */
	u64		dev_state;	/* For /dev/hwctr */
	unsigned int	flags;
	size_t used;			/* Bytes used in data */
	size_t usedss;			/* Bytes used in start/stop */
	unsigned char start[PAGE_SIZE];	/* Counter set at event add */
	unsigned char stop[PAGE_SIZE];	/* Counter set at event delete */
	unsigned char data[PAGE_SIZE];	/* Counter set at /dev/hwctr */
	unsigned int sets;		/* # Counter set saved in memory */
};

/* Per-CPU event structure for the counter facility */
static DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events);

static unsigned int cfdiag_cpu_speed;	/* CPU speed for CF_DIAG trailer */
static debug_info_t *cf_dbg;

/*
 * The CPU Measurement query counter information instruction contains
 * information which varies per machine generation, but is constant and
 * does not change when running on a particular machine, such as the counter
 * first and second version number. This is needed to determine the size
 * of counter sets. Extract this information at device driver initialization.
 */
static struct cpumf_ctr_info	cpumf_ctr_info;

#define CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
						/* interval in seconds */

/* Counter sets are stored as data stream in a page sized memory buffer and
 * exported to user space via raw data attached to the event sample data.
 * Each counter set starts with an eight byte header consisting of:
 * - a two byte eye catcher (0xfeef)
 * - a two byte counter set number
 * - a two byte counter set size (indicates the number of counters in this
 *   set)
 * - a two byte reserved value (must be zero) to make the header the same
 *   size as a counter value.
 * All counter values are eight bytes in size.
 *
 * All counter sets are followed by a 64 byte trailer.
 * The trailer consists of a:
 * - flag field indicating valid fields when corresponding bit set
 * - the counter facility first and second version number
 * - the CPU speed if nonzero
 * - the time stamp the counter sets have been collected
 * - the time of day (TOD) base value
 * - the machine type.
 *
 * The counter sets are saved when the process is prepared to be executed on a
 * CPU and saved again when the process is going to be removed from a CPU.
 * The difference of both counter sets is calculated and stored in the event
 * sample data area.
 */
struct cf_ctrset_entry {	/* CPU-M CF counter set entry (8 byte) */
	unsigned int def:16;	/* 0-15  Data Entry Format */
	unsigned int set:16;	/* 16-31 Counter set identifier */
	unsigned int ctr:16;	/* 32-47 Number of stored counters */
	unsigned int res1:16;	/* 48-63 Reserved */
};

struct cf_trailer_entry {	/* CPU-M CF_DIAG trailer (64 byte) */
	/* 0 - 7 */
	union {
		struct {
			unsigned int clock_base:1;	/* TOD clock base set */
			unsigned int speed:1;		/* CPU speed set */
			/* Measurement alerts */
			unsigned int mtda:1;	/* Loss of MT ctr. data alert */
			unsigned int caca:1;	/* Counter auth. change alert */
			unsigned int lcda:1;	/* Loss of counter data alert */
		};
		unsigned long flags;	/* 0-63 All indicators */
	};
	/* 8 - 15 */
	unsigned int cfvn:16;		/* 64-79   Ctr First Version */
	unsigned int csvn:16;		/* 80-95   Ctr Second Version */
	unsigned int cpu_speed:32;	/* 96-127  CPU speed */
	/* 16 - 23 */
	unsigned long timestamp;	/* 128-191 Timestamp (TOD) */
	/* 24 - 55 */
	union {
		struct {
			unsigned long progusage1;
			unsigned long progusage2;
			unsigned long progusage3;
			unsigned long tod_base;
		};
		unsigned long progusage[4];
	};
	/* 56 - 63 */
	unsigned int mach_type:16;	/* Machine type */
	unsigned int res1:16;		/* Reserved */
	unsigned int res2:32;		/* Reserved */
};

/* Create the trailer data at the end of a page. */
static void cfdiag_trailer(struct cf_trailer_entry *te)
{
	struct cpuid cpuid;

	te->cfvn = cpumf_ctr_info.cfvn;		/* Counter version numbers */
	te->csvn = cpumf_ctr_info.csvn;

	get_cpu_id(&cpuid);			/* Machine type */
	te->mach_type = cpuid.machine;
	te->cpu_speed = cfdiag_cpu_speed;
	if (te->cpu_speed)
		te->speed = 1;
	te->clock_base = 1;			/* Save clock base */
	te->tod_base = tod_clock_base.tod;
	te->timestamp = get_tod_clock_fast();
}

/*
 * The number of counters per counter set varies between machine generations,
 * but is constant when running on a particular machine generation.
 * Determine each counter set size at device driver initialization and
 * retrieve it later.
 */
static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX];
static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset)
{
	size_t ctrset_size = 0;

	switch (ctrset) {
	case CPUMF_CTR_SET_BASIC:
		if (cpumf_ctr_info.cfvn >= 1)
			ctrset_size = 6;
		break;
	case CPUMF_CTR_SET_USER:
		if (cpumf_ctr_info.cfvn == 1)
			ctrset_size = 6;
		else if (cpumf_ctr_info.cfvn >= 3)
			ctrset_size = 2;
		break;
	case CPUMF_CTR_SET_CRYPTO:
		if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5)
			ctrset_size = 16;
		else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
			ctrset_size = 20;
		break;
	case CPUMF_CTR_SET_EXT:
		if (cpumf_ctr_info.csvn == 1)
			ctrset_size = 32;
		else if (cpumf_ctr_info.csvn == 2)
			ctrset_size = 48;
		else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5)
			ctrset_size = 128;
		else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
			ctrset_size = 160;
		break;
	case CPUMF_CTR_SET_MT_DIAG:
		if (cpumf_ctr_info.csvn > 3)
			ctrset_size = 48;
		break;
	case CPUMF_CTR_SET_MAX:
		break;
	}
	cpumf_ctr_setsizes[ctrset] = ctrset_size;
}

/*
 * Return the maximum possible counter set size (in number of 8 byte counters)
 * depending on type and model number.
 */
static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset)
{
	return cpumf_ctr_setsizes[ctrset];
}

/* Read a counter set. The counter set number determines the counter set and
 * the CPUM-CF first and second version number determine the number of
 * available counters in each counter set.
 * Each counter set starts with a header containing the counter set number and
 * the number of eight byte counters.
 *
 * The function returns the number of bytes occupied by this counter set
 * including the header.
 * If there is no counter in the counter set, this counter set is useless and
 * zero is returned in this case.
 *
 * Note that the counter sets may not be enabled or active and the stcctm
 * instruction might return error 3. Depending on the error_ok value this is
 * fine, for example when called from the cpumf_pmu_start() call back function.
 */
static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
			       size_t room, bool error_ok)
{
	size_t ctrset_size, need = 0;
	int rc = 3;				/* Assume write failure */

	ctrdata->def = CF_DIAG_CTRSET_DEF;
	ctrdata->set = ctrset;
	ctrdata->res1 = 0;
	ctrset_size = cpum_cf_read_setsize(ctrset);

	if (ctrset_size) {			/* Save data */
		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
		if (need <= room) {
			rc = ctr_stcctm(ctrset, ctrset_size,
					(u64 *)(ctrdata + 1));
		}
		if (rc != 3 || error_ok)
			ctrdata->ctr = ctrset_size;
		else
			need = 0;
	}

	return need;
}

static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
	[CPUMF_CTR_SET_BASIC]	= 0x02,
	[CPUMF_CTR_SET_USER]	= 0x04,
	[CPUMF_CTR_SET_CRYPTO]	= 0x08,
	[CPUMF_CTR_SET_EXT]	= 0x01,
	[CPUMF_CTR_SET_MT_DIAG] = 0x20,
};

/* Read out all counter sets and save them in the provided data buffer.
 * The last 64 bytes hold an artificial trailer entry.
 */
static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
			    bool error_ok)
{
	struct cf_trailer_entry *trailer;
	size_t offset = 0, done;
	int i;

	memset(data, 0, sz);
	sz -= sizeof(*trailer);		/* Always room for trailer */
	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
		struct cf_ctrset_entry *ctrdata = data + offset;

		if (!(auth & cpumf_ctr_ctl[i]))
			continue;	/* Counter set not authorized */

		done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
		offset += done;
	}
	trailer = data + offset;
	cfdiag_trailer(trailer);
	return offset + sizeof(*trailer);
}

/* Calculate the difference for each counter in a counter set. */
static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
{
	for (; --counters >= 0; ++pstart, ++pstop)
		if (*pstop >= *pstart)
			*pstop -= *pstart;
		else
			*pstop = *pstart - *pstop + 1;
}

/* Scan the counter sets and calculate the difference of each counter
 * in each set. The result is the increment of each counter during the
 * period the counter set has been activated.
 *
 * Return true on success.
 */
static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
{
	struct cf_trailer_entry *trailer_start, *trailer_stop;
	struct cf_ctrset_entry *ctrstart, *ctrstop;
	size_t offset = 0;

	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
	do {
		ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
		ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);

		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
			pr_err_once("cpum_cf_diag counter set compare error in set %i\n",
				    ctrstart->set);
			return 0;
		}
		auth &= ~cpumf_ctr_ctl[ctrstart->set];
		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
			cfdiag_diffctrset((u64 *)(ctrstart + 1),
					  (u64 *)(ctrstop + 1), ctrstart->ctr);
			offset += ctrstart->ctr * sizeof(u64) +
				  sizeof(*ctrstart);
		}
	} while (ctrstart->def && auth);

	/* Save time_stamp from start of event in stop's trailer */
	trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
	trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
	trailer_stop->progusage[0] = trailer_start->timestamp;

	return 1;
}

static enum cpumf_ctr_set get_counter_set(u64 event)
{
	int set = CPUMF_CTR_SET_MAX;

	if (event < 32)
		set = CPUMF_CTR_SET_BASIC;
	else if (event < 64)
		set = CPUMF_CTR_SET_USER;
	else if (event < 128)
		set = CPUMF_CTR_SET_CRYPTO;
	else if (event < 288)
		set = CPUMF_CTR_SET_EXT;
	else if (event >= 448 && event < 496)
		set = CPUMF_CTR_SET_MT_DIAG;

	return set;
}

static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set)
{
	u16 mtdiag_ctl;
	int err = 0;

	/* check required version for counter sets */
	switch (set) {
	case CPUMF_CTR_SET_BASIC:
	case CPUMF_CTR_SET_USER:
		if (cpumf_ctr_info.cfvn < 1)
			err = -EOPNOTSUPP;
		break;
	case CPUMF_CTR_SET_CRYPTO:
		if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 &&
		     config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83))
			err = -EOPNOTSUPP;
		break;
	case CPUMF_CTR_SET_EXT:
		if (cpumf_ctr_info.csvn < 1)
			err = -EOPNOTSUPP;
		if ((cpumf_ctr_info.csvn == 1 && config > 159) ||
		    (cpumf_ctr_info.csvn == 2 && config > 175) ||
		    (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 &&
		     config > 255) ||
		    (cpumf_ctr_info.csvn >= 6 && config > 287))
			err = -EOPNOTSUPP;
		break;
	case CPUMF_CTR_SET_MT_DIAG:
		if (cpumf_ctr_info.csvn <= 3)
			err = -EOPNOTSUPP;
		/*
		 * MT-diagnostic counters are read-only. The counter set
		 * is automatically enabled and activated on all CPUs with
		 * multithreading (SMT). Deactivation of multithreading
		 * also disables the counter set. State changes are ignored
		 * by lcctl(). Because Linux controls SMT enablement through
		 * a kernel parameter only, the counter set is either disabled
		 * or enabled and active.
		 *
		 * Thus, the counters can only be used if SMT is on and the
		 * counter set is enabled and active.
		 */
		mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG];
		if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) &&
		      (cpumf_ctr_info.enable_ctl & mtdiag_ctl) &&
		      (cpumf_ctr_info.act_ctl & mtdiag_ctl)))
			err = -EOPNOTSUPP;
		break;
	case CPUMF_CTR_SET_MAX:
		err = -EOPNOTSUPP;
	}

	return err;
}

/*
 * Change the CPUMF state to active.
 * Enable and activate the CPU-counter sets according
 * to the per-cpu control state.
 */
static void cpumf_pmu_enable(struct pmu *pmu)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	int err;

	if (cpuhw->flags & PMU_F_ENABLED)
		return;

	err = lcctl(cpuhw->state | cpuhw->dev_state);
	if (err)
		pr_err("Enabling the performance measuring unit failed with rc=%x\n", err);
	else
		cpuhw->flags |= PMU_F_ENABLED;
}

/*
 * Change the CPUMF state to inactive.
 * Stop the CPU-counter sets (keep them enabled but inactive) according
 * to the per-cpu control state.
 */
static void cpumf_pmu_disable(struct pmu *pmu)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	int err;
	u64 inactive;

	if (!(cpuhw->flags & PMU_F_ENABLED))
		return;

	inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
	inactive |= cpuhw->dev_state;
	err = lcctl(inactive);
	if (err)
		pr_err("Disabling the performance measuring unit failed with rc=%x\n", err);
	else
		cpuhw->flags &= ~PMU_F_ENABLED;
}

#define PMC_INIT	0UL
#define PMC_RELEASE	1UL

static void cpum_cf_setup_cpu(void *flags)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);

	switch ((unsigned long)flags) {
	case PMC_INIT:
		cpuhw->flags |= PMU_F_RESERVED;
		break;

	case PMC_RELEASE:
		cpuhw->flags &= ~PMU_F_RESERVED;
		break;
	}

	/* Disable CPU counter sets */
	lcctl(0);
	debug_sprintf_event(cf_dbg, 5, "%s flags %#lx flags %#x state %#llx\n",
			    __func__, (unsigned long)flags, cpuhw->flags,
			    cpuhw->state);
}

/* Initialize the CPU-measurement counter facility */
static int __kernel_cpumcf_begin(void)
{
	on_each_cpu(cpum_cf_setup_cpu, (void *)PMC_INIT, 1);
	irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);

	return 0;
}

/* Release the CPU-measurement counter facility */
static void __kernel_cpumcf_end(void)
{
	on_each_cpu(cpum_cf_setup_cpu, (void *)PMC_RELEASE, 1);
	irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
}

/* Number of perf events counting hardware events */
static atomic_t num_events = ATOMIC_INIT(0);
/* Used to avoid races in calling reserve/release_cpumf_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

/* Release the PMU if the event is the last perf event */
static void hw_perf_event_destroy(struct perf_event *event)
{
	mutex_lock(&pmc_reserve_mutex);
	if (atomic_dec_return(&num_events) == 0)
		__kernel_cpumcf_end();
	mutex_unlock(&pmc_reserve_mutex);
}

/* CPUMF <-> perf event mappings for kernel+userspace (basic set) */
static const int cpumf_generic_events_basic[] = {
	[PERF_COUNT_HW_CPU_CYCLES]	    = 0,
	[PERF_COUNT_HW_INSTRUCTIONS]	    = 1,
	[PERF_COUNT_HW_CACHE_REFERENCES]    = -1,
	[PERF_COUNT_HW_CACHE_MISSES]	    = -1,
	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1,
	[PERF_COUNT_HW_BRANCH_MISSES]	    = -1,
	[PERF_COUNT_HW_BUS_CYCLES]	    = -1,
};
/* CPUMF <-> perf event mappings for userspace (problem-state set) */
static const int cpumf_generic_events_user[] = {
	[PERF_COUNT_HW_CPU_CYCLES]	    = 32,
	[PERF_COUNT_HW_INSTRUCTIONS]	    = 33,
	[PERF_COUNT_HW_CACHE_REFERENCES]    = -1,
	[PERF_COUNT_HW_CACHE_MISSES]	    = -1,
	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1,
	[PERF_COUNT_HW_BRANCH_MISSES]	    = -1,
	[PERF_COUNT_HW_BUS_CYCLES]	    = -1,
};

static void cpumf_hw_inuse(void)
{
	mutex_lock(&pmc_reserve_mutex);
	if (atomic_inc_return(&num_events) == 1)
		__kernel_cpumcf_begin();
	mutex_unlock(&pmc_reserve_mutex);
}

static int is_userspace_event(u64 ev)
{
	return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
	       cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev;
}

static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	enum cpumf_ctr_set set;
	u64 ev;

	switch (type) {
	case PERF_TYPE_RAW:
		/* Raw events are used to access counters directly,
		 * hence do not permit excludes */
		if (attr->exclude_kernel || attr->exclude_user ||
		    attr->exclude_hv)
			return -EOPNOTSUPP;
		ev = attr->config;
		break;

	case PERF_TYPE_HARDWARE:
		if (is_sampling_event(event))	/* No sampling support */
			return -ENOENT;
		ev = attr->config;
		if (!attr->exclude_user && attr->exclude_kernel) {
			/*
			 * Count user space (problem-state) only
			 * Handle events 32 and 33 as 0:u and 1:u
			 */
			if (!is_userspace_event(ev)) {
				if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
					return -EOPNOTSUPP;
				ev = cpumf_generic_events_user[ev];
			}
		} else if (!attr->exclude_kernel && attr->exclude_user) {
			/* No support for kernel space counters only */
			return -EOPNOTSUPP;
		} else {
			/* Count user and kernel space, incl. events 32 + 33 */
			if (!is_userspace_event(ev)) {
				if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
					return -EOPNOTSUPP;
				ev = cpumf_generic_events_basic[ev];
			}
		}
		break;

	default:
		return -ENOENT;
	}

	if (ev == -1)
		return -ENOENT;

	if (ev > PERF_CPUM_CF_MAX_CTR)
		return -ENOENT;

	/* Obtain the counter set to which the specified counter belongs */
	set = get_counter_set(ev);
	switch (set) {
	case CPUMF_CTR_SET_BASIC:
	case CPUMF_CTR_SET_USER:
	case CPUMF_CTR_SET_CRYPTO:
	case CPUMF_CTR_SET_EXT:
	case CPUMF_CTR_SET_MT_DIAG:
		/*
		 * Use the hardware perf event structure to store the
		 * counter number in the 'config' member and the counter
		 * set number in the 'config_base' as bit mask.
		 * It is later used to enable/disable the counter(s).
		 */
		hwc->config = ev;
		hwc->config_base = cpumf_ctr_ctl[set];
		break;
	case CPUMF_CTR_SET_MAX:
		/* The counter could not be associated to a counter set */
		return -EINVAL;
	}

	/* Initialize for using the CPU-measurement counter facility */
	cpumf_hw_inuse();
	event->destroy = hw_perf_event_destroy;

	/*
	 * Finally, validate version and authorization of the counter set.
	 * If the particular CPU counter set is not authorized,
	 * return with -ENOENT in order to fall back to other
	 * PMUs that might satisfy the event request.
	 */
	if (!(hwc->config_base & cpumf_ctr_info.auth_ctl))
		return -ENOENT;
	return validate_ctr_version(hwc->config, set);
}

/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different
 * attribute::type values:
 * - PERF_TYPE_HARDWARE:
 * - pmu->type:
 * Handle both types of invocations identically. They address the same
 * hardware. The result is different when event modifiers exclude_kernel
 * and/or exclude_user are also set.
 */
static int cpumf_pmu_event_type(struct perf_event *event)
{
	u64 ev = event->attr.config;

	if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev ||
	    cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev ||
	    cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
	    cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev)
		return PERF_TYPE_HARDWARE;
	return PERF_TYPE_RAW;
}

static int cpumf_pmu_event_init(struct perf_event *event)
{
	unsigned int type = event->attr.type;
	int err;

	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW)
		err = __hw_perf_event_init(event, type);
	else if (event->pmu->type == type)
		/* Registered as unknown PMU */
		err = __hw_perf_event_init(event, cpumf_pmu_event_type(event));
	else
		return -ENOENT;

	if (unlikely(err) && event->destroy)
		event->destroy(event);

	return err;
}

static int hw_perf_event_reset(struct perf_event *event)
{
	u64 prev, new;
	int err;

	do {
		prev = local64_read(&event->hw.prev_count);
		err = ecctr(event->hw.config, &new);
		if (err) {
			if (err != 3)
				break;
			/* The counter is not (yet) available. This
			 * might happen if the counter set to which
			 * this counter belongs is in the disabled
			 * state.
			 */
			new = 0;
		}
	} while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);

	return err;
}

static void hw_perf_event_update(struct perf_event *event)
{
	u64 prev, new, delta;
	int err;

	do {
		prev = local64_read(&event->hw.prev_count);
		err = ecctr(event->hw.config, &new);
		if (err)
			return;
	} while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);

	delta = (prev <= new) ? new - prev
			      : (-1ULL - prev) + new + 1;	 /* overflow */
	local64_add(delta, &event->count);
}

static void cpumf_pmu_read(struct perf_event *event)
{
	if (event->hw.state & PERF_HES_STOPPED)
		return;

	hw_perf_event_update(event);
}

static void cpumf_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	struct hw_perf_event *hwc = &event->hw;
	int i;

	if (!(hwc->state & PERF_HES_STOPPED))
		return;

	hwc->state = 0;

	/* (Re-)enable and activate the counter set */
	ctr_set_enable(&cpuhw->state, hwc->config_base);
	ctr_set_start(&cpuhw->state, hwc->config_base);

	/* The counter set to which this counter belongs may already be active.
	 * Because all counters in a set are active, the event->hw.prev_count
	 * needs to be synchronized. At this point, the counter set can be in
	 * the inactive or disabled state.
	 */
	if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
		cpuhw->usedss = cfdiag_getctr(cpuhw->start,
					      sizeof(cpuhw->start),
					      hwc->config_base, true);
	} else {
		hw_perf_event_reset(event);
	}

	/* Increment refcount for counter sets */
	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
		if ((hwc->config_base & cpumf_ctr_ctl[i]))
			atomic_inc(&cpuhw->ctr_set[i]);
}

/* Create perf event sample with the counter sets as raw data. The sample
 * is then pushed to the event subsystem and the function checks for
 * possible event overflows. If an event overflow occurs, the PMU is
 * stopped.
 *
 * Return non-zero if an event overflow occurred.
 */
static int cfdiag_push_sample(struct perf_event *event,
			      struct cpu_cf_events *cpuhw)
{
	struct perf_sample_data data;
	struct perf_raw_record raw;
	struct pt_regs regs;
	int overflow;

	/* Setup perf sample */
	perf_sample_data_init(&data, 0, event->hw.last_period);
	memset(&regs, 0, sizeof(regs));
	memset(&raw, 0, sizeof(raw));

	if (event->attr.sample_type & PERF_SAMPLE_CPU)
		data.cpu_entry.cpu = event->cpu;
	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
		raw.frag.size = cpuhw->usedss;
		raw.frag.data = cpuhw->stop;
		perf_sample_save_raw_data(&data, &raw);
	}

	overflow = perf_event_overflow(event, &data, &regs);
	debug_sprintf_event(cf_dbg, 3,
			    "%s event %#llx sample_type %#llx raw %d ov %d\n",
			    __func__, event->hw.config,
			    event->attr.sample_type, raw.size, overflow);
	if (overflow)
		event->pmu->stop(event, 0);

	perf_event_update_userpage(event);
	return overflow;
}

static void cpumf_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	struct hw_perf_event *hwc = &event->hw;
	int i;

	if (!(hwc->state & PERF_HES_STOPPED)) {
		/* Decrement reference count for this counter set and if this
		 * is the last used counter in the set, clear activation
		 * control and set the counter set state to inactive.
		 */
		for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
			if (!(hwc->config_base & cpumf_ctr_ctl[i]))
				continue;
			if (!atomic_dec_return(&cpuhw->ctr_set[i]))
				ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
		}
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
			local64_inc(&event->count);
			cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
						      sizeof(cpuhw->stop),
						      event->hw.config_base,
						      false);
			if (cfdiag_diffctr(cpuhw, event->hw.config_base))
				cfdiag_push_sample(event, cpuhw);
		} else if (cpuhw->flags & PMU_F_RESERVED) {
			/* Only update when PMU not hotplugged off */
			hw_perf_event_update(event);
		}
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static int cpumf_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);

	ctr_set_enable(&cpuhw->state, event->hw.config_base);
	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (flags & PERF_EF_START)
		cpumf_pmu_start(event, PERF_EF_RELOAD);

	return 0;
}

static void cpumf_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	int i;

	cpumf_pmu_stop(event, PERF_EF_UPDATE);

	/* Check if any counter in the counter set is still used. If not used,
	 * change the counter set to the disabled state. This also clears the
	 * content of all counters in the set.
	 *
	 * When a new perf event has been added but not yet started, this can
	 * clear enable control and reset all counters in a set. Therefore,
	 * cpumf_pmu_start() always has to reenable a counter set.
	 */
	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
		if (!atomic_read(&cpuhw->ctr_set[i]))
			ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
}

/* Performance monitoring unit for s390x */
static struct pmu cpumf_pmu = {
	.task_ctx_nr  = perf_sw_context,
	.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
	.pmu_enable   = cpumf_pmu_enable,
	.pmu_disable  = cpumf_pmu_disable,
	.event_init   = cpumf_pmu_event_init,
	.add	      = cpumf_pmu_add,
	.del	      = cpumf_pmu_del,
	.start	      = cpumf_pmu_start,
	.stop	      = cpumf_pmu_stop,
	.read	      = cpumf_pmu_read,
};

static int cpum_cf_setup(unsigned int cpu, unsigned long flags)
{
	local_irq_disable();
	cpum_cf_setup_cpu((void *)flags);
	local_irq_enable();
	return 0;
}

static int cfset_online_cpu(unsigned int cpu);
static int cpum_cf_online_cpu(unsigned int cpu)
{
	debug_sprintf_event(cf_dbg, 4, "%s cpu %d in_irq %ld\n", __func__,
			    cpu, in_interrupt());
	cpum_cf_setup(cpu, PMC_INIT);
	return cfset_online_cpu(cpu);
}

static int cfset_offline_cpu(unsigned int cpu);
static int cpum_cf_offline_cpu(unsigned int cpu)
{
	debug_sprintf_event(cf_dbg, 4, "%s cpu %d\n", __func__, cpu);
	cfset_offline_cpu(cpu);
	return cpum_cf_setup(cpu, PMC_RELEASE);
}

/* Return true if store counter set multiple instruction is available */
static inline int stccm_avail(void)
{
	return test_facility(142);
}

/* CPU-measurement alerts for the counter facility */
static void cpumf_measurement_alert(struct ext_code ext_code,
				    unsigned int alert, unsigned long unused)
{
	struct cpu_cf_events *cpuhw;

	if (!(alert & CPU_MF_INT_CF_MASK))
		return;

	inc_irq_stat(IRQEXT_CMC);
	cpuhw = this_cpu_ptr(&cpu_cf_events);

	/*
	 * Measurement alerts are shared and might happen when the PMU
	 * is not reserved. Ignore these alerts in this case.
	 */
	if (!(cpuhw->flags & PMU_F_RESERVED))
		return;

	/* counter authorization change alert */
	if (alert & CPU_MF_INT_CF_CACA)
		qctri(&cpumf_ctr_info);

	/* loss of counter data alert */
	if (alert & CPU_MF_INT_CF_LCDA)
		pr_err("CPU[%i] Counter data was lost\n", smp_processor_id());

	/* loss of MT counter data alert */
	if (alert & CPU_MF_INT_CF_MTDA)
		pr_warn("CPU[%i] MT counter data was lost\n",
			smp_processor_id());
}

static int cfset_init(void);
static int __init cpumf_pmu_init(void)
{
	int rc;

	/* Extract counter measurement facility information */
	if (!cpum_cf_avail() || qctri(&cpumf_ctr_info))
		return -ENODEV;

	/* Determine and store counter set sizes for later reference */
	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
		cpum_cf_make_setsize(rc);

	/*
	 * Clear bit 15 of cr0 to unauthorize problem-state to
	 * extract measurement counters
	 */
	ctl_clear_bit(0, 48);

	/* register handler for measurement-alert interruptions */
	rc = register_external_irq(EXT_IRQ_MEASURE_ALERT,
				   cpumf_measurement_alert);
	if (rc) {
		pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc);
		return rc;
	}

	/* Setup s390dbf facility */
	cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
	if (!cf_dbg) {
		pr_err("Registration of s390dbf(cpum_cf) failed\n");
		rc = -ENOMEM;
		goto out1;
	}
	debug_register_view(cf_dbg, &debug_sprintf_view);

	cpumf_pmu.attr_groups = cpumf_cf_event_group();
	rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
	if (rc) {
		pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
		goto out2;
	} else if (stccm_avail()) {	/* Setup counter set device */
		cfset_init();
	}

	rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE,
			       "perf/s390/cf:online",
			       cpum_cf_online_cpu, cpum_cf_offline_cpu);
	return rc;

out2:
	debug_unregister_view(cf_dbg, &debug_sprintf_view);
	debug_unregister(cf_dbg);
out1:
	unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert);
	return rc;
}

/* Support for the CPU Measurement Facility counter set extraction using
 * device /dev/hwctr. This allows user space programs to extract complete
 * counter sets via normal file operations.
 */

static atomic_t cfset_opencnt = ATOMIC_INIT(0);		/* Access count */
static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */
struct cfset_call_on_cpu_parm {		/* Parm struct for smp_call_on_cpu */
	unsigned int sets;		/* Counter set bit mask */
	atomic_t cpus_ack;		/* # CPUs successfully executed func */
};

static struct cfset_session {		/* CPUs and counter set bit mask */
	struct list_head head;		/* Head of list of active processes */
} cfset_session = {
	.head = LIST_HEAD_INIT(cfset_session.head)
};

struct cfset_request {			/* CPUs and counter set bit mask */
	unsigned long ctrset;		/* Bit mask of counter set to read */
	cpumask_t mask;			/* CPU mask to read from */
	struct list_head node;		/* Chain to cfset_session.head */
};

static void cfset_session_init(void)
{
	INIT_LIST_HEAD(&cfset_session.head);
}

/* Remove current request from global bookkeeping. Maintain a counter set bit
 * mask on a per CPU basis.
 * Done in process context under mutex protection.
 */
static void cfset_session_del(struct cfset_request *p)
{
	list_del(&p->node);
}

/* Add current request to global bookkeeping. Maintain a counter set bit mask
 * on a per CPU basis.
 * Done in process context under mutex protection.
 */
static void cfset_session_add(struct cfset_request *p)
{
	list_add(&p->node, &cfset_session.head);
}

/* The /dev/hwctr device access uses PMU_F_IN_USE to mark that the device
 * access path is currently in use.
 * The cpu_cf_events::dev_state is used to denote counter sets in use by this
 * interface. It is always or'ed in. If this interface is not active, its
 * value is zero and no additional counter sets will be included.
 *
 * The cpu_cf_events::state is used by the perf_event_open SVC and remains
 * unchanged.
 *
 * perf_pmu_enable() and perf_pmu_disable() and their call backs
 * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
 * performance measurement subsystem to enable the per-process
 * CPU Measurement counter facility.
 * On x86, the XXX_enable() and XXX_disable() call backs are used to turn
 * off the performance monitoring interrupt (PMI) during scheduling.
 * s390 uses these calls to temporarily stop and resume the active CPU
 * counter sets during scheduling.
 *
 * We do allow concurrent access of the perf_event_open() SVC and /dev/hwctr
 * device access. The perf_event_open() SVC interface makes a lot of effort
 * to only run the counters while the calling process is actively scheduled
 * to run.
 * When the /dev/hwctr interface is also used at the same time, the counter
 * sets will keep running, even when the process is scheduled off a CPU.
 * However this is not a problem and does not lead to wrong counter values
 * for the perf_event_open() SVC. The current counter value will be recorded
 * during schedule-in. At schedule-out time the current counter value is
 * extracted again and the delta is calculated and added to the event.
 */
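/* Worked example (illustrative only, derived from the constants defined
 * above, not an additional interface): with CPUMF_LCCTL_ENABLE_SHIFT == 16
 * and CPUMF_LCCTL_ACTCTL_SHIFT == 0, enabling and activating just the
 * extended counter set (control value 0x01) yields the lcctl() operand
 *
 *	(0x01 << 16) | (0x01 << 0) == 0x10001
 *
 * If perf_event_open() concurrently counts the basic set (control value
 * 0x02), cpu_cf_events::state holds 0x20002 and the combined operand passed
 * to lcctl() is state | dev_state == 0x30003, so both interfaces keep their
 * counter sets enabled and active.
 */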
/* Stop all counter sets via ioctl interface */
static void cfset_ioctl_off(void *parm)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	struct cfset_call_on_cpu_parm *p = parm;
	int rc;

	/* Check if any counter set is used by /dev/hwctr */
	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
		if ((p->sets & cpumf_ctr_ctl[rc])) {
			if (!atomic_dec_return(&cpuhw->ctr_set[rc])) {
				ctr_set_disable(&cpuhw->dev_state,
						cpumf_ctr_ctl[rc]);
				ctr_set_stop(&cpuhw->dev_state,
					     cpumf_ctr_ctl[rc]);
			}
		}
	/* Keep perf_event_open counter sets */
	rc = lcctl(cpuhw->dev_state | cpuhw->state);
	if (rc)
		pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
		       cpuhw->state, S390_HWCTR_DEVICE, rc);
	if (!cpuhw->dev_state)
		cpuhw->flags &= ~PMU_F_IN_USE;
	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
			    __func__, rc, cpuhw->state, cpuhw->dev_state);
}

/* Start counter sets on particular CPU */
static void cfset_ioctl_on(void *parm)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	struct cfset_call_on_cpu_parm *p = parm;
	int rc;

	cpuhw->flags |= PMU_F_IN_USE;
	ctr_set_enable(&cpuhw->dev_state, p->sets);
	ctr_set_start(&cpuhw->dev_state, p->sets);
	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
		if ((p->sets & cpumf_ctr_ctl[rc]))
			atomic_inc(&cpuhw->ctr_set[rc]);
	rc = lcctl(cpuhw->dev_state | cpuhw->state);	/* Start counter sets */
	if (!rc)
		atomic_inc(&p->cpus_ack);
	else
		pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
		       cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
			    __func__, rc, cpuhw->state, cpuhw->dev_state);
}

static void cfset_release_cpu(void *p)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	int rc;

	debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
			    __func__, cpuhw->state, cpuhw->dev_state);
	cpuhw->dev_state = 0;
	rc = lcctl(cpuhw->state);	/* Keep perf_event_open counter sets */
	if (rc)
		pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
		       cpuhw->state, S390_HWCTR_DEVICE, rc);
}

/* This modifies the process CPU mask to adapt it to the currently online
 * CPUs. Offline CPUs can not be addressed. This call terminates the access
 * and is usually followed by close() or a new ioctl(..., START, ...) which
 * creates a new request structure.
 */
static void cfset_all_stop(struct cfset_request *req)
{
	struct cfset_call_on_cpu_parm p = {
		.sets = req->ctrset,
	};

	cpumask_and(&req->mask, &req->mask, cpu_online_mask);
	on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1);
}

/* The release function is also called when the application is terminated
 * without doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
 */
static int cfset_release(struct inode *inode, struct file *file)
{
	mutex_lock(&cfset_ctrset_mutex);
	/* Open followed by close/exit has no private_data */
	if (file->private_data) {
		cfset_all_stop(file->private_data);
		cfset_session_del(file->private_data);
		kfree(file->private_data);
		file->private_data = NULL;
	}
	if (!atomic_dec_return(&cfset_opencnt))
		on_each_cpu(cfset_release_cpu, NULL, 1);
	mutex_unlock(&cfset_ctrset_mutex);

	hw_perf_event_destroy(NULL);
	return 0;
}

static int cfset_open(struct inode *inode, struct file *file)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	mutex_lock(&cfset_ctrset_mutex);
	if (atomic_inc_return(&cfset_opencnt) == 1)
		cfset_session_init();
	mutex_unlock(&cfset_ctrset_mutex);

	cpumf_hw_inuse();
	file->private_data = NULL;
	/* nonseekable_open() never fails */
	return nonseekable_open(inode, file);
}

static int cfset_all_start(struct cfset_request *req)
{
	struct cfset_call_on_cpu_parm p = {
		.sets = req->ctrset,
		.cpus_ack = ATOMIC_INIT(0),
	};
	cpumask_var_t mask;
	int rc = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_and(mask, &req->mask, cpu_online_mask);
	on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
		on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
		rc = -EIO;
		debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__);
	}
	free_cpumask_var(mask);
	return rc;
}

/* Return the maximum required space for all possible CPUs in case one
 * CPU will be onlined during the START, READ, STOP cycles.
 * To find out the size of the counter sets, any one CPU will do. They
 * all have the same counter sets.
 */
static size_t cfset_needspace(unsigned int sets)
{
	size_t bytes = 0;
	int i;

	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
		if (!(sets & cpumf_ctr_ctl[i]))
			continue;
		bytes += cpum_cf_read_setsize(i) * sizeof(u64) +
			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
	}
	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
		 sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
	return bytes;
}

static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
{
	struct s390_ctrset_read __user *ctrset_read;
	unsigned int cpu, cpus, rc = 0;
	void __user *uptr;

	ctrset_read = (struct s390_ctrset_read __user *)arg;
	uptr = ctrset_read->data;
	for_each_cpu(cpu, mask) {
		struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu);
		struct s390_ctrset_cpudata __user *ctrset_cpudata;

		ctrset_cpudata = uptr;
		rc = put_user(cpu, &ctrset_cpudata->cpu_nr);
		rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
		rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
				   cpuhw->used);
		if (rc) {
			rc = -EFAULT;
			goto out;
		}
		uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
		cond_resched();
	}
	cpus = cpumask_weight(mask);
	if (put_user(cpus, &ctrset_read->no_cpus))
		rc = -EFAULT;
out:
	debug_sprintf_event(cf_dbg, 4, "%s rc %d copied %ld\n", __func__, rc,
			    uptr - (void __user *)ctrset_read->data);
	return rc;
}

static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
				int ctrset_size, size_t room)
{
	size_t need = 0;
	int rc = -1;

	need = sizeof(*p) + sizeof(u64) * ctrset_size;
	if (need <= room) {
		p->set = cpumf_ctr_ctl[ctrset];
		p->no_cnts = ctrset_size;
		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
		if (rc == 3)		/* Nothing stored */
			need = 0;
	}
	return need;
}

/* Read all counter sets. */
static void cfset_cpu_read(void *parm)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	struct cfset_call_on_cpu_parm *p = parm;
	int set, set_size;
	size_t space;

	/* No data saved yet */
	cpuhw->used = 0;
	cpuhw->sets = 0;
	memset(cpuhw->data, 0, sizeof(cpuhw->data));

	/* Scan the counter sets */
	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
		struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
						 cpuhw->used;

		if (!(p->sets & cpumf_ctr_ctl[set]))
			continue;	/* Counter set not in list */
		set_size = cpum_cf_read_setsize(set);
		space = sizeof(cpuhw->data) - cpuhw->used;
		space = cfset_cpuset_read(sp, set, set_size, space);
		if (space) {
			cpuhw->used += space;
			cpuhw->sets += 1;
		}
	}
	debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__,
			    cpuhw->sets, cpuhw->used);
}

static int cfset_all_read(unsigned long arg, struct cfset_request *req)
{
	struct cfset_call_on_cpu_parm p;
	cpumask_var_t mask;
	int rc;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	p.sets = req->ctrset;
	cpumask_and(mask, &req->mask, cpu_online_mask);
	on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
	rc = cfset_all_copy(arg, mask);
	free_cpumask_var(mask);
	return rc;
}

static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req)
{
	int ret = -ENODATA;

	if (req && req->ctrset)
		ret = cfset_all_read(arg, req);
	return ret;
}

static long cfset_ioctl_stop(struct file *file)
{
	struct cfset_request *req = file->private_data;
	int ret = -ENXIO;

	if (req) {
		cfset_all_stop(req);
		cfset_session_del(req);
		kfree(req);
		file->private_data = NULL;
		ret = 0;
	}
	return ret;
}

static long cfset_ioctl_start(unsigned long arg, struct file *file)
{
	struct s390_ctrset_start __user *ustart;
	struct s390_ctrset_start start;
	struct cfset_request *preq;
	void __user *umask;
	unsigned int len;
	int ret = 0;
	size_t need;

	if (file->private_data)
		return -EBUSY;
	ustart = (struct s390_ctrset_start __user *)arg;
	if (copy_from_user(&start, ustart, sizeof(start)))
		return -EFAULT;
	if (start.version != S390_HWCTR_START_VERSION)
		return -EINVAL;
	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
		return -EINVAL;		/* Invalid counter set */
	if (!start.counter_sets)
		return -EINVAL;		/* No counter set at all? */

	preq = kzalloc(sizeof(*preq), GFP_KERNEL);
	if (!preq)
		return -ENOMEM;
	cpumask_clear(&preq->mask);
	len = min_t(u64, start.cpumask_len, cpumask_size());
	umask = (void __user *)start.cpumask;
	if (copy_from_user(&preq->mask, umask, len)) {
		kfree(preq);
		return -EFAULT;
	}
	if (cpumask_empty(&preq->mask)) {
		kfree(preq);
		return -EINVAL;
	}
	need = cfset_needspace(start.counter_sets);
	if (put_user(need, &ustart->data_bytes)) {
		kfree(preq);
		return -EFAULT;
	}
	preq->ctrset = start.counter_sets;
	ret = cfset_all_start(preq);
	if (!ret) {
		cfset_session_add(preq);
		file->private_data = preq;
		debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n",
				    __func__, preq->ctrset, need, ret);
	} else {
		kfree(preq);
	}
	return ret;
}

/* Entry point to the /dev/hwctr device interface.
 * The ioctl system call supports three subcommands:
 * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
 *    counter set keeps running until explicitly stopped. Returns the number
 *    of bytes needed to store the counter values. If another S390_HWCTR_START
 *    ioctl subcommand is called without a previous S390_HWCTR_STOP stop
 *    command on the same file descriptor, -EBUSY is returned.
 * S390_HWCTR_READ: Read the counter set values from the CPU list specified
 *    with the S390_HWCTR_START command.
 * S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the
 *    previous S390_HWCTR_START subcommand.
 */
static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	int ret;

	cpus_read_lock();
	mutex_lock(&cfset_ctrset_mutex);
	switch (cmd) {
	case S390_HWCTR_START:
		ret = cfset_ioctl_start(arg, file);
		break;
	case S390_HWCTR_STOP:
		ret = cfset_ioctl_stop(file);
		break;
	case S390_HWCTR_READ:
		ret = cfset_ioctl_read(arg, file->private_data);
		break;
	default:
		ret = -ENOTTY;
		break;
	}
	mutex_unlock(&cfset_ctrset_mutex);
	cpus_read_unlock();
	return ret;
}

static const struct file_operations cfset_fops = {
	.owner = THIS_MODULE,
	.open = cfset_open,
	.release = cfset_release,
	.unlocked_ioctl	= cfset_ioctl,
	.compat_ioctl = cfset_ioctl,
	.llseek = no_llseek
};

static struct miscdevice cfset_dev = {
	.name	= S390_HWCTR_DEVICE,
	.minor	= MISC_DYNAMIC_MINOR,
	.fops	= &cfset_fops,
};

/* Hotplug add of a CPU. Scan through all active processes and add
 * that CPU to the list of CPUs supplied with ioctl(..., START, ...).
 */
static int cfset_online_cpu(unsigned int cpu)
{
	struct cfset_call_on_cpu_parm p;
	struct cfset_request *rp;

	mutex_lock(&cfset_ctrset_mutex);
	if (!list_empty(&cfset_session.head)) {
		list_for_each_entry(rp, &cfset_session.head, node) {
			p.sets = rp->ctrset;
			cfset_ioctl_on(&p);
			cpumask_set_cpu(cpu, &rp->mask);
		}
	}
	mutex_unlock(&cfset_ctrset_mutex);
	return 0;
}

/* Hotplug remove of a CPU. Scan through all active processes and clear
 * that CPU from the list of CPUs supplied with ioctl(..., START, ...).
 */
static int cfset_offline_cpu(unsigned int cpu)
{
	struct cfset_call_on_cpu_parm p;
	struct cfset_request *rp;

	mutex_lock(&cfset_ctrset_mutex);
	if (!list_empty(&cfset_session.head)) {
		list_for_each_entry(rp, &cfset_session.head, node) {
			p.sets = rp->ctrset;
			cfset_ioctl_off(&p);
			cpumask_clear_cpu(cpu, &rp->mask);
		}
	}
	mutex_unlock(&cfset_ctrset_mutex);
	return 0;
}

static void cfdiag_read(struct perf_event *event)
{
	debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__,
			    event->attr.config, local64_read(&event->count));
}

static int get_authctrsets(void)
{
	unsigned long auth = 0;
	enum cpumf_ctr_set i;

	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
		if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i])
			auth |= cpumf_ctr_ctl[i];
	}
	return auth;
}

/* Setup the event. Test for authorized counter sets and only include counter
 * sets which are authorized at the time of the setup. Including unauthorized
 * counter sets results in a specification exception (and panic).
 */
static int cfdiag_event_init2(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	int err = 0;

	/* Set sample_period to indicate sampling */
	event->hw.config = attr->config;
	event->hw.sample_period = attr->sample_period;
	local64_set(&event->hw.period_left, event->hw.sample_period);
	local64_set(&event->count, 0);
	event->hw.last_period = event->hw.sample_period;

	/* Add all authorized counter sets to config_base. The
	 * hardware init function is either called per-cpu or just once
	 * for all CPUs (event->cpu == -1). This depends on whether
	 * counting is started for all CPUs or on a per workload base where
	 * the perf event moves from one CPU to another CPU.
	 * Checking the authorization on any CPU is fine as the hardware
	 * applies the same authorization settings to all CPUs.
	 */
	event->hw.config_base = get_authctrsets();

	/* No authorized counter sets, nothing to count/sample */
	if (!event->hw.config_base)
		err = -EINVAL;

	debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
			    __func__, err, event->hw.config_base);
	return err;
}

static int cfdiag_event_init(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	int err = -ENOENT;

	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
	    event->attr.type != event->pmu->type)
		goto out;

	/* Raw events are used to access counters directly,
	 * hence do not permit excludes.
	 * This event is useless without PERF_SAMPLE_RAW to return counter set
	 * values as raw data.
	 */
	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* Initialize for using the CPU-measurement counter facility */
	cpumf_hw_inuse();
	event->destroy = hw_perf_event_destroy;

	err = cfdiag_event_init2(event);
	if (unlikely(err))
		event->destroy(event);
out:
	return err;
}

/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used
 * to collect the complete counter sets for a scheduled process. Target
 * are complete counter sets attached as raw data to the artificial event.
 * This results in complete counter sets available when a process is
 * scheduled. Contains the delta of every counter while the process was
 * running.
 */
CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);

static struct attribute *cfdiag_events_attr[] = {
	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
	NULL,
};

PMU_FORMAT_ATTR(event, "config:0-63");

static struct attribute *cfdiag_format_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group cfdiag_events_group = {
	.name = "events",
	.attrs = cfdiag_events_attr,
};
static struct attribute_group cfdiag_format_group = {
	.name = "format",
	.attrs = cfdiag_format_attr,
};
static const struct attribute_group *cfdiag_attr_groups[] = {
	&cfdiag_events_group,
	&cfdiag_format_group,
	NULL,
};

/* Performance monitoring unit for event CF_DIAG. Since this event
 * is also started and stopped via the perf_event_open() system call, use
 * the same event enable/disable call back functions. They do not
 * have a pointer to the perf_event structure as first parameter.
 *
 * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
 * Reuse them and distinguish the event (always first parameter) via
 * 'config' member.
 */
static struct pmu cf_diag = {
	.task_ctx_nr  = perf_sw_context,
	.event_init   = cfdiag_event_init,
	.pmu_enable   = cpumf_pmu_enable,
	.pmu_disable  = cpumf_pmu_disable,
	.add	      = cpumf_pmu_add,
	.del	      = cpumf_pmu_del,
	.start	      = cpumf_pmu_start,
	.stop	      = cpumf_pmu_stop,
	.read	      = cfdiag_read,

	.attr_groups  = cfdiag_attr_groups
};

/* Calculate memory needed to store all counter sets together with header and
 * trailer data. This is independent of the counter set authorization which
 * can vary depending on the configuration.
 */
static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
{
	size_t max_size = sizeof(struct cf_trailer_entry);
	enum cpumf_ctr_set i;

	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
		size_t size = cpum_cf_read_setsize(i);

		if (size)
			max_size += size * sizeof(u64) +
				    sizeof(struct cf_ctrset_entry);
	}
	return max_size;
}

/* Get the CPU speed, try sampling facility first and CPU attributes second. */
static void cfdiag_get_cpu_speed(void)
{
	unsigned long mhz;

	if (cpum_sf_avail()) {			/* Sampling facility first */
		struct hws_qsi_info_block si;

		memset(&si, 0, sizeof(si));
		if (!qsi(&si)) {
			cfdiag_cpu_speed = si.cpu_speed;
			return;
		}
	}

	/* Fallback: CPU speed extract static part. Used in case
	 * CPU Measurement Sampling Facility is turned off.
	 */
	mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
	if (mhz != -1UL)
		cfdiag_cpu_speed = mhz & 0xffffffff;
}

static int cfset_init(void)
{
	size_t need;
	int rc;

	cfdiag_get_cpu_speed();
	/* Make sure the counter set data fits into predefined buffer. */
	need = cfdiag_maxsize(&cpumf_ctr_info);
	if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
		pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
		       need);
		return -ENOMEM;
	}

	rc = misc_register(&cfset_dev);
	if (rc) {
		pr_err("Registration of /dev/%s failed rc=%i\n",
		       cfset_dev.name, rc);
		goto out;
	}

	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
	if (rc) {
		misc_deregister(&cfset_dev);
		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
		       rc);
	}
out:
	return rc;
}

device_initcall(cpumf_pmu_init);
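/* Illustrative user-space sketch (not part of this driver) of the /dev/hwctr
 * ioctl sequence documented above. It assumes the uapi definitions from
 * <asm/hwctrset.h> (struct s390_ctrset_start, struct s390_ctrset_read,
 * S390_HWCTR_START/READ/STOP, S390_HWCTR_START_VERSION) and requests the
 * basic counter set (control value 0x02) on CPU 0 only. Error handling is
 * reduced to a minimum; a real tool such as lshwc does considerably more.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <asm/hwctrset.h>
 *
 *	int main(void)
 *	{
 *		__u64 cpumask[1] = { 1 };		// Bit 0 selects CPU 0
 *		struct s390_ctrset_start start = {
 *			.version = S390_HWCTR_START_VERSION,
 *			.counter_sets = 0x02,		// Basic counter set
 *			.cpumask_len = sizeof(cpumask),
 *			.cpumask = cpumask,
 *		};
 *		struct s390_ctrset_read *buf;
 *		int fd;
 *
 *		fd = open("/dev/hwctr", O_RDWR);
 *		if (fd < 0)
 *			return 1;
 *		if (ioctl(fd, S390_HWCTR_START, &start) < 0)
 *			return 1;
 *		// START filled in start.data_bytes with the buffer size needed
 *		buf = malloc(start.data_bytes);
 *		if (buf && !ioctl(fd, S390_HWCTR_READ, buf))
 *			printf("counter data from %llu CPUs\n",
 *			       (unsigned long long)buf->no_cpus);
 *		ioctl(fd, S390_HWCTR_STOP);
 *		free(buf);
 *		close(fd);
 *		return 0;
 *	}
 */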