1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Performance event support for s390x - CPU-measurement Counter Facility 4 * 5 * Copyright IBM Corp. 2012, 2021 6 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> 7 * Thomas Richter <tmricht@linux.ibm.com> 8 */ 9 #define KMSG_COMPONENT "cpum_cf" 10 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 11 12 #include <linux/kernel.h> 13 #include <linux/kernel_stat.h> 14 #include <linux/percpu.h> 15 #include <linux/notifier.h> 16 #include <linux/init.h> 17 #include <linux/export.h> 18 #include <linux/miscdevice.h> 19 20 #include <asm/cpu_mcf.h> 21 #include <asm/hwctrset.h> 22 #include <asm/debug.h> 23 24 static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ 25 static debug_info_t *cf_dbg; 26 27 #define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ 28 /* interval in seconds */ 29 30 /* Counter sets are stored as data stream in a page sized memory buffer and 31 * exported to user space via raw data attached to the event sample data. 32 * Each counter set starts with an eight byte header consisting of: 33 * - a two byte eye catcher (0xfeef) 34 * - a one byte counter set number 35 * - a two byte counter set size (indicates the number of counters in this set) 36 * - a three byte reserved value (must be zero) to make the header the same 37 * size as a counter value. 38 * All counter values are eight byte in size. 39 * 40 * All counter sets are followed by a 64 byte trailer. 41 * The trailer consists of a: 42 * - flag field indicating valid fields when corresponding bit set 43 * - the counter facility first and second version number 44 * - the CPU speed if nonzero 45 * - the time stamp the counter sets have been collected 46 * - the time of day (TOD) base value 47 * - the machine type. 48 * 49 * The counter sets are saved when the process is prepared to be executed on a 50 * CPU and saved again when the process is going to be removed from a CPU. 
51 * The difference of both counter sets are calculated and stored in the event 52 * sample data area. 53 */ 54 struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ 55 unsigned int def:16; /* 0-15 Data Entry Format */ 56 unsigned int set:16; /* 16-31 Counter set identifier */ 57 unsigned int ctr:16; /* 32-47 Number of stored counters */ 58 unsigned int res1:16; /* 48-63 Reserved */ 59 }; 60 61 struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ 62 /* 0 - 7 */ 63 union { 64 struct { 65 unsigned int clock_base:1; /* TOD clock base set */ 66 unsigned int speed:1; /* CPU speed set */ 67 /* Measurement alerts */ 68 unsigned int mtda:1; /* Loss of MT ctr. data alert */ 69 unsigned int caca:1; /* Counter auth. change alert */ 70 unsigned int lcda:1; /* Loss of counter data alert */ 71 }; 72 unsigned long flags; /* 0-63 All indicators */ 73 }; 74 /* 8 - 15 */ 75 unsigned int cfvn:16; /* 64-79 Ctr First Version */ 76 unsigned int csvn:16; /* 80-95 Ctr Second Version */ 77 unsigned int cpu_speed:32; /* 96-127 CPU speed */ 78 /* 16 - 23 */ 79 unsigned long timestamp; /* 128-191 Timestamp (TOD) */ 80 /* 24 - 55 */ 81 union { 82 struct { 83 unsigned long progusage1; 84 unsigned long progusage2; 85 unsigned long progusage3; 86 unsigned long tod_base; 87 }; 88 unsigned long progusage[4]; 89 }; 90 /* 56 - 63 */ 91 unsigned int mach_type:16; /* Machine type */ 92 unsigned int res1:16; /* Reserved */ 93 unsigned int res2:32; /* Reserved */ 94 }; 95 96 /* Create the trailer data at the end of a page. 
*/
static void cfdiag_trailer(struct cf_trailer_entry *te)
{
	struct cpu_cf_events *hw = this_cpu_ptr(&cpu_cf_events);
	struct cpuid id;

	/* Counter facility first and second version number */
	te->cfvn = hw->info.cfvn;
	te->csvn = hw->info.csvn;

	/* Machine type taken from the CPU identification */
	get_cpu_id(&id);
	te->mach_type = id.machine;

	/* The speed indicator is only raised for a nonzero CPU speed */
	te->cpu_speed = cfdiag_cpu_speed;
	if (te->cpu_speed)
		te->speed = 1;

	/* The TOD clock base is always saved and flagged as valid */
	te->clock_base = 1;
	te->tod_base = tod_clock_base.tod;

	/* Time stamp of this trailer */
	te->timestamp = get_tod_clock_fast();
}

/* Read a counter set. The counter set number determines the counter set and
 * the CPUM-CF first and second version number determine the number of
 * available counters in each counter set.
 * Each counter set starts with header containing the counter set number and
 * the number of eight byte counters.
 *
 * The function returns the number of bytes occupied by this counter set
 * including the header.
 * If there is no counter in the counter set, this counter set is useless and
 * zero is returned in this case.
 *
 * Note that the counter sets may not be enabled or active and the stcctm
 * instruction might return error 3. Depending on error_ok value this is ok,
 * for example when called from cpumf_pmu_start() call back function.
129 */ 130 static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, 131 size_t room, bool error_ok) 132 { 133 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 134 size_t ctrset_size, need = 0; 135 int rc = 3; /* Assume write failure */ 136 137 ctrdata->def = CF_DIAG_CTRSET_DEF; 138 ctrdata->set = ctrset; 139 ctrdata->res1 = 0; 140 ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); 141 142 if (ctrset_size) { /* Save data */ 143 need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); 144 if (need <= room) { 145 rc = ctr_stcctm(ctrset, ctrset_size, 146 (u64 *)(ctrdata + 1)); 147 } 148 if (rc != 3 || error_ok) 149 ctrdata->ctr = ctrset_size; 150 else 151 need = 0; 152 } 153 154 debug_sprintf_event(cf_dbg, 3, 155 "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" 156 " need %zd rc %d\n", __func__, ctrset, ctrset_size, 157 cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); 158 return need; 159 } 160 161 static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { 162 [CPUMF_CTR_SET_BASIC] = 0x02, 163 [CPUMF_CTR_SET_USER] = 0x04, 164 [CPUMF_CTR_SET_CRYPTO] = 0x08, 165 [CPUMF_CTR_SET_EXT] = 0x01, 166 [CPUMF_CTR_SET_MT_DIAG] = 0x20, 167 }; 168 169 /* Read out all counter sets and save them in the provided data buffer. 170 * The last 64 byte host an artificial trailer entry. 
171 */ 172 static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, 173 bool error_ok) 174 { 175 struct cf_trailer_entry *trailer; 176 size_t offset = 0, done; 177 int i; 178 179 memset(data, 0, sz); 180 sz -= sizeof(*trailer); /* Always room for trailer */ 181 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 182 struct cf_ctrset_entry *ctrdata = data + offset; 183 184 if (!(auth & cpumf_ctr_ctl[i])) 185 continue; /* Counter set not authorized */ 186 187 done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); 188 offset += done; 189 } 190 trailer = data + offset; 191 cfdiag_trailer(trailer); 192 return offset + sizeof(*trailer); 193 } 194 195 /* Calculate the difference for each counter in a counter set. */ 196 static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) 197 { 198 for (; --counters >= 0; ++pstart, ++pstop) 199 if (*pstop >= *pstart) 200 *pstop -= *pstart; 201 else 202 *pstop = *pstart - *pstop + 1; 203 } 204 205 /* Scan the counter sets and calculate the difference of each counter 206 * in each set. The result is the increment of each counter during the 207 * period the counter set has been activated. 208 * 209 * Return true on success. 
210 */ 211 static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) 212 { 213 struct cf_trailer_entry *trailer_start, *trailer_stop; 214 struct cf_ctrset_entry *ctrstart, *ctrstop; 215 size_t offset = 0; 216 217 auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; 218 do { 219 ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); 220 ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); 221 222 if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { 223 pr_err_once("cpum_cf_diag counter set compare error " 224 "in set %i\n", ctrstart->set); 225 return 0; 226 } 227 auth &= ~cpumf_ctr_ctl[ctrstart->set]; 228 if (ctrstart->def == CF_DIAG_CTRSET_DEF) { 229 cfdiag_diffctrset((u64 *)(ctrstart + 1), 230 (u64 *)(ctrstop + 1), ctrstart->ctr); 231 offset += ctrstart->ctr * sizeof(u64) + 232 sizeof(*ctrstart); 233 } 234 } while (ctrstart->def && auth); 235 236 /* Save time_stamp from start of event in stop's trailer */ 237 trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); 238 trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); 239 trailer_stop->progusage[0] = trailer_start->timestamp; 240 241 return 1; 242 } 243 244 static enum cpumf_ctr_set get_counter_set(u64 event) 245 { 246 int set = CPUMF_CTR_SET_MAX; 247 248 if (event < 32) 249 set = CPUMF_CTR_SET_BASIC; 250 else if (event < 64) 251 set = CPUMF_CTR_SET_USER; 252 else if (event < 128) 253 set = CPUMF_CTR_SET_CRYPTO; 254 else if (event < 288) 255 set = CPUMF_CTR_SET_EXT; 256 else if (event >= 448 && event < 496) 257 set = CPUMF_CTR_SET_MT_DIAG; 258 259 return set; 260 } 261 262 static int validate_ctr_version(const struct hw_perf_event *hwc, 263 enum cpumf_ctr_set set) 264 { 265 struct cpu_cf_events *cpuhw; 266 int err = 0; 267 u16 mtdiag_ctl; 268 269 cpuhw = &get_cpu_var(cpu_cf_events); 270 271 /* check required version for counter sets */ 272 switch (set) { 273 case CPUMF_CTR_SET_BASIC: 274 case CPUMF_CTR_SET_USER: 275 if (cpuhw->info.cfvn < 1) 276 err = -EOPNOTSUPP; 
277 break; 278 case CPUMF_CTR_SET_CRYPTO: 279 if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && 280 hwc->config > 79) || 281 (cpuhw->info.csvn >= 6 && hwc->config > 83)) 282 err = -EOPNOTSUPP; 283 break; 284 case CPUMF_CTR_SET_EXT: 285 if (cpuhw->info.csvn < 1) 286 err = -EOPNOTSUPP; 287 if ((cpuhw->info.csvn == 1 && hwc->config > 159) || 288 (cpuhw->info.csvn == 2 && hwc->config > 175) || 289 (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 290 && hwc->config > 255) || 291 (cpuhw->info.csvn >= 6 && hwc->config > 287)) 292 err = -EOPNOTSUPP; 293 break; 294 case CPUMF_CTR_SET_MT_DIAG: 295 if (cpuhw->info.csvn <= 3) 296 err = -EOPNOTSUPP; 297 /* 298 * MT-diagnostic counters are read-only. The counter set 299 * is automatically enabled and activated on all CPUs with 300 * multithreading (SMT). Deactivation of multithreading 301 * also disables the counter set. State changes are ignored 302 * by lcctl(). Because Linux controls SMT enablement through 303 * a kernel parameter only, the counter set is either disabled 304 * or enabled and active. 305 * 306 * Thus, the counters can only be used if SMT is on and the 307 * counter set is enabled and active. 308 */ 309 mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]; 310 if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && 311 (cpuhw->info.enable_ctl & mtdiag_ctl) && 312 (cpuhw->info.act_ctl & mtdiag_ctl))) 313 err = -EOPNOTSUPP; 314 break; 315 case CPUMF_CTR_SET_MAX: 316 err = -EOPNOTSUPP; 317 } 318 319 put_cpu_var(cpu_cf_events); 320 return err; 321 } 322 323 static int validate_ctr_auth(const struct hw_perf_event *hwc) 324 { 325 struct cpu_cf_events *cpuhw; 326 int err = 0; 327 328 cpuhw = &get_cpu_var(cpu_cf_events); 329 330 /* Check authorization for cpu counter sets. 331 * If the particular CPU counter set is not authorized, 332 * return with -ENOENT in order to fall back to other 333 * PMUs that might suffice the event request. 
334 */ 335 if (!(hwc->config_base & cpuhw->info.auth_ctl)) 336 err = -ENOENT; 337 338 put_cpu_var(cpu_cf_events); 339 return err; 340 } 341 342 /* 343 * Change the CPUMF state to active. 344 * Enable and activate the CPU-counter sets according 345 * to the per-cpu control state. 346 */ 347 static void cpumf_pmu_enable(struct pmu *pmu) 348 { 349 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 350 int err; 351 352 if (cpuhw->flags & PMU_F_ENABLED) 353 return; 354 355 err = lcctl(cpuhw->state | cpuhw->dev_state); 356 if (err) { 357 pr_err("Enabling the performance measuring unit " 358 "failed with rc=%x\n", err); 359 return; 360 } 361 362 cpuhw->flags |= PMU_F_ENABLED; 363 } 364 365 /* 366 * Change the CPUMF state to inactive. 367 * Disable and enable (inactive) the CPU-counter sets according 368 * to the per-cpu control state. 369 */ 370 static void cpumf_pmu_disable(struct pmu *pmu) 371 { 372 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 373 int err; 374 u64 inactive; 375 376 if (!(cpuhw->flags & PMU_F_ENABLED)) 377 return; 378 379 inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); 380 inactive |= cpuhw->dev_state; 381 err = lcctl(inactive); 382 if (err) { 383 pr_err("Disabling the performance measuring unit " 384 "failed with rc=%x\n", err); 385 return; 386 } 387 388 cpuhw->flags &= ~PMU_F_ENABLED; 389 } 390 391 392 /* Number of perf events counting hardware events */ 393 static atomic_t num_events = ATOMIC_INIT(0); 394 /* Used to avoid races in calling reserve/release_cpumf_hardware */ 395 static DEFINE_MUTEX(pmc_reserve_mutex); 396 397 /* Release the PMU if event is the last perf event */ 398 static void hw_perf_event_destroy(struct perf_event *event) 399 { 400 if (!atomic_add_unless(&num_events, -1, 1)) { 401 mutex_lock(&pmc_reserve_mutex); 402 if (atomic_dec_return(&num_events) == 0) 403 __kernel_cpumcf_end(); 404 mutex_unlock(&pmc_reserve_mutex); 405 } 406 } 407 408 /* CPUMF <-> perf event mappings for kernel+userspace 
(basic set) */ 409 static const int cpumf_generic_events_basic[] = { 410 [PERF_COUNT_HW_CPU_CYCLES] = 0, 411 [PERF_COUNT_HW_INSTRUCTIONS] = 1, 412 [PERF_COUNT_HW_CACHE_REFERENCES] = -1, 413 [PERF_COUNT_HW_CACHE_MISSES] = -1, 414 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, 415 [PERF_COUNT_HW_BRANCH_MISSES] = -1, 416 [PERF_COUNT_HW_BUS_CYCLES] = -1, 417 }; 418 /* CPUMF <-> perf event mappings for userspace (problem-state set) */ 419 static const int cpumf_generic_events_user[] = { 420 [PERF_COUNT_HW_CPU_CYCLES] = 32, 421 [PERF_COUNT_HW_INSTRUCTIONS] = 33, 422 [PERF_COUNT_HW_CACHE_REFERENCES] = -1, 423 [PERF_COUNT_HW_CACHE_MISSES] = -1, 424 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, 425 [PERF_COUNT_HW_BRANCH_MISSES] = -1, 426 [PERF_COUNT_HW_BUS_CYCLES] = -1, 427 }; 428 429 static void cpumf_hw_inuse(void) 430 { 431 mutex_lock(&pmc_reserve_mutex); 432 if (atomic_inc_return(&num_events) == 1) 433 __kernel_cpumcf_begin(); 434 mutex_unlock(&pmc_reserve_mutex); 435 } 436 437 static int __hw_perf_event_init(struct perf_event *event, unsigned int type) 438 { 439 struct perf_event_attr *attr = &event->attr; 440 struct hw_perf_event *hwc = &event->hw; 441 enum cpumf_ctr_set set; 442 int err = 0; 443 u64 ev; 444 445 switch (type) { 446 case PERF_TYPE_RAW: 447 /* Raw events are used to access counters directly, 448 * hence do not permit excludes */ 449 if (attr->exclude_kernel || attr->exclude_user || 450 attr->exclude_hv) 451 return -EOPNOTSUPP; 452 ev = attr->config; 453 break; 454 455 case PERF_TYPE_HARDWARE: 456 if (is_sampling_event(event)) /* No sampling support */ 457 return -ENOENT; 458 ev = attr->config; 459 /* Count user space (problem-state) only */ 460 if (!attr->exclude_user && attr->exclude_kernel) { 461 if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) 462 return -EOPNOTSUPP; 463 ev = cpumf_generic_events_user[ev]; 464 465 /* No support for kernel space counters only */ 466 } else if (!attr->exclude_kernel && attr->exclude_user) { 467 return -EOPNOTSUPP; 468 } else { 
/* Count user and kernel space */ 469 if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) 470 return -EOPNOTSUPP; 471 ev = cpumf_generic_events_basic[ev]; 472 } 473 break; 474 475 default: 476 return -ENOENT; 477 } 478 479 if (ev == -1) 480 return -ENOENT; 481 482 if (ev > PERF_CPUM_CF_MAX_CTR) 483 return -ENOENT; 484 485 /* Obtain the counter set to which the specified counter belongs */ 486 set = get_counter_set(ev); 487 switch (set) { 488 case CPUMF_CTR_SET_BASIC: 489 case CPUMF_CTR_SET_USER: 490 case CPUMF_CTR_SET_CRYPTO: 491 case CPUMF_CTR_SET_EXT: 492 case CPUMF_CTR_SET_MT_DIAG: 493 /* 494 * Use the hardware perf event structure to store the 495 * counter number in the 'config' member and the counter 496 * set number in the 'config_base' as bit mask. 497 * It is later used to enable/disable the counter(s). 498 */ 499 hwc->config = ev; 500 hwc->config_base = cpumf_ctr_ctl[set]; 501 break; 502 case CPUMF_CTR_SET_MAX: 503 /* The counter could not be associated to a counter set */ 504 return -EINVAL; 505 } 506 507 /* Initialize for using the CPU-measurement counter facility */ 508 cpumf_hw_inuse(); 509 event->destroy = hw_perf_event_destroy; 510 511 /* Finally, validate version and authorization of the counter set */ 512 err = validate_ctr_auth(hwc); 513 if (!err) 514 err = validate_ctr_version(hwc, set); 515 516 return err; 517 } 518 519 /* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different 520 * attribute::type values: 521 * - PERF_TYPE_HARDWARE: 522 * - pmu->type: 523 * Handle both type of invocations identical. They address the same hardware. 524 * The result is different when event modifiers exclude_kernel and/or 525 * exclude_user are also set. 
526 */ 527 static int cpumf_pmu_event_type(struct perf_event *event) 528 { 529 u64 ev = event->attr.config; 530 531 if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev || 532 cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev || 533 cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || 534 cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev) 535 return PERF_TYPE_HARDWARE; 536 return PERF_TYPE_RAW; 537 } 538 539 static int cpumf_pmu_event_init(struct perf_event *event) 540 { 541 unsigned int type = event->attr.type; 542 int err; 543 544 if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) 545 err = __hw_perf_event_init(event, type); 546 else if (event->pmu->type == type) 547 /* Registered as unknown PMU */ 548 err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); 549 else 550 return -ENOENT; 551 552 if (unlikely(err) && event->destroy) 553 event->destroy(event); 554 555 return err; 556 } 557 558 static int hw_perf_event_reset(struct perf_event *event) 559 { 560 u64 prev, new; 561 int err; 562 563 do { 564 prev = local64_read(&event->hw.prev_count); 565 err = ecctr(event->hw.config, &new); 566 if (err) { 567 if (err != 3) 568 break; 569 /* The counter is not (yet) available. This 570 * might happen if the counter set to which 571 * this counter belongs is in the disabled 572 * state. 573 */ 574 new = 0; 575 } 576 } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); 577 578 return err; 579 } 580 581 static void hw_perf_event_update(struct perf_event *event) 582 { 583 u64 prev, new, delta; 584 int err; 585 586 do { 587 prev = local64_read(&event->hw.prev_count); 588 err = ecctr(event->hw.config, &new); 589 if (err) 590 return; 591 } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); 592 593 delta = (prev <= new) ? 
new - prev 594 : (-1ULL - prev) + new + 1; /* overflow */ 595 local64_add(delta, &event->count); 596 } 597 598 static void cpumf_pmu_read(struct perf_event *event) 599 { 600 if (event->hw.state & PERF_HES_STOPPED) 601 return; 602 603 hw_perf_event_update(event); 604 } 605 606 static void cpumf_pmu_start(struct perf_event *event, int flags) 607 { 608 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 609 struct hw_perf_event *hwc = &event->hw; 610 int i; 611 612 if (!(hwc->state & PERF_HES_STOPPED)) 613 return; 614 615 hwc->state = 0; 616 617 /* (Re-)enable and activate the counter set */ 618 ctr_set_enable(&cpuhw->state, hwc->config_base); 619 ctr_set_start(&cpuhw->state, hwc->config_base); 620 621 /* The counter set to which this counter belongs can be already active. 622 * Because all counters in a set are active, the event->hw.prev_count 623 * needs to be synchronized. At this point, the counter set can be in 624 * the inactive or disabled state. 625 */ 626 if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { 627 cpuhw->usedss = cfdiag_getctr(cpuhw->start, 628 sizeof(cpuhw->start), 629 hwc->config_base, true); 630 } else { 631 hw_perf_event_reset(event); 632 } 633 634 /* Increment refcount for counter sets */ 635 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) 636 if ((hwc->config_base & cpumf_ctr_ctl[i])) 637 atomic_inc(&cpuhw->ctr_set[i]); 638 } 639 640 /* Create perf event sample with the counter sets as raw data. The sample 641 * is then pushed to the event subsystem and the function checks for 642 * possible event overflows. If an event overflow occurs, the PMU is 643 * stopped. 644 * 645 * Return non-zero if an event overflow occurred. 
646 */ 647 static int cfdiag_push_sample(struct perf_event *event, 648 struct cpu_cf_events *cpuhw) 649 { 650 struct perf_sample_data data; 651 struct perf_raw_record raw; 652 struct pt_regs regs; 653 int overflow; 654 655 /* Setup perf sample */ 656 perf_sample_data_init(&data, 0, event->hw.last_period); 657 memset(®s, 0, sizeof(regs)); 658 memset(&raw, 0, sizeof(raw)); 659 660 if (event->attr.sample_type & PERF_SAMPLE_CPU) 661 data.cpu_entry.cpu = event->cpu; 662 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 663 raw.frag.size = cpuhw->usedss; 664 raw.frag.data = cpuhw->stop; 665 raw.size = raw.frag.size; 666 data.raw = &raw; 667 data.sample_flags |= PERF_SAMPLE_RAW; 668 } 669 670 overflow = perf_event_overflow(event, &data, ®s); 671 debug_sprintf_event(cf_dbg, 3, 672 "%s event %#llx sample_type %#llx raw %d ov %d\n", 673 __func__, event->hw.config, 674 event->attr.sample_type, raw.size, overflow); 675 if (overflow) 676 event->pmu->stop(event, 0); 677 678 perf_event_update_userpage(event); 679 return overflow; 680 } 681 682 static void cpumf_pmu_stop(struct perf_event *event, int flags) 683 { 684 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 685 struct hw_perf_event *hwc = &event->hw; 686 int i; 687 688 if (!(hwc->state & PERF_HES_STOPPED)) { 689 /* Decrement reference count for this counter set and if this 690 * is the last used counter in the set, clear activation 691 * control and set the counter set state to inactive. 
692 */ 693 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 694 if (!(hwc->config_base & cpumf_ctr_ctl[i])) 695 continue; 696 if (!atomic_dec_return(&cpuhw->ctr_set[i])) 697 ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); 698 } 699 hwc->state |= PERF_HES_STOPPED; 700 } 701 702 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { 703 if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { 704 local64_inc(&event->count); 705 cpuhw->usedss = cfdiag_getctr(cpuhw->stop, 706 sizeof(cpuhw->stop), 707 event->hw.config_base, 708 false); 709 if (cfdiag_diffctr(cpuhw, event->hw.config_base)) 710 cfdiag_push_sample(event, cpuhw); 711 } else if (cpuhw->flags & PMU_F_RESERVED) { 712 /* Only update when PMU not hotplugged off */ 713 hw_perf_event_update(event); 714 } 715 hwc->state |= PERF_HES_UPTODATE; 716 } 717 } 718 719 static int cpumf_pmu_add(struct perf_event *event, int flags) 720 { 721 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 722 723 ctr_set_enable(&cpuhw->state, event->hw.config_base); 724 event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 725 726 if (flags & PERF_EF_START) 727 cpumf_pmu_start(event, PERF_EF_RELOAD); 728 729 return 0; 730 } 731 732 static void cpumf_pmu_del(struct perf_event *event, int flags) 733 { 734 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 735 int i; 736 737 cpumf_pmu_stop(event, PERF_EF_UPDATE); 738 739 /* Check if any counter in the counter set is still used. If not used, 740 * change the counter set to the disabled state. This also clears the 741 * content of all counters in the set. 742 * 743 * When a new perf event has been added but not yet started, this can 744 * clear enable control and resets all counters in a set. Therefore, 745 * cpumf_pmu_start() always has to reenable a counter set. 
746 */ 747 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) 748 if (!atomic_read(&cpuhw->ctr_set[i])) 749 ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); 750 } 751 752 /* Performance monitoring unit for s390x */ 753 static struct pmu cpumf_pmu = { 754 .task_ctx_nr = perf_sw_context, 755 .capabilities = PERF_PMU_CAP_NO_INTERRUPT, 756 .pmu_enable = cpumf_pmu_enable, 757 .pmu_disable = cpumf_pmu_disable, 758 .event_init = cpumf_pmu_event_init, 759 .add = cpumf_pmu_add, 760 .del = cpumf_pmu_del, 761 .start = cpumf_pmu_start, 762 .stop = cpumf_pmu_stop, 763 .read = cpumf_pmu_read, 764 }; 765 766 static int cfset_init(void); 767 static int __init cpumf_pmu_init(void) 768 { 769 int rc; 770 771 if (!kernel_cpumcf_avail()) 772 return -ENODEV; 773 774 /* Setup s390dbf facility */ 775 cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); 776 if (!cf_dbg) { 777 pr_err("Registration of s390dbf(cpum_cf) failed\n"); 778 return -ENOMEM; 779 } 780 debug_register_view(cf_dbg, &debug_sprintf_view); 781 782 cpumf_pmu.attr_groups = cpumf_cf_event_group(); 783 rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); 784 if (rc) { 785 debug_unregister_view(cf_dbg, &debug_sprintf_view); 786 debug_unregister(cf_dbg); 787 pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); 788 } else if (stccm_avail()) { /* Setup counter set device */ 789 cfset_init(); 790 } 791 return rc; 792 } 793 794 /* Support for the CPU Measurement Facility counter set extraction using 795 * device /dev/hwctr. This allows user space programs to extract complete 796 * counter set via normal file operations. 
797 */ 798 799 static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Access count */ 800 static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ 801 struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ 802 unsigned int sets; /* Counter set bit mask */ 803 atomic_t cpus_ack; /* # CPUs successfully executed func */ 804 }; 805 806 static struct cfset_session { /* CPUs and counter set bit mask */ 807 struct list_head head; /* Head of list of active processes */ 808 } cfset_session = { 809 .head = LIST_HEAD_INIT(cfset_session.head) 810 }; 811 812 struct cfset_request { /* CPUs and counter set bit mask */ 813 unsigned long ctrset; /* Bit mask of counter set to read */ 814 cpumask_t mask; /* CPU mask to read from */ 815 struct list_head node; /* Chain to cfset_session.head */ 816 }; 817 818 static void cfset_session_init(void) 819 { 820 INIT_LIST_HEAD(&cfset_session.head); 821 } 822 823 /* Remove current request from global bookkeeping. Maintain a counter set bit 824 * mask on a per CPU basis. 825 * Done in process context under mutex protection. 826 */ 827 static void cfset_session_del(struct cfset_request *p) 828 { 829 list_del(&p->node); 830 } 831 832 /* Add current request to global bookkeeping. Maintain a counter set bit mask 833 * on a per CPU basis. 834 * Done in process context under mutex protection. 835 */ 836 static void cfset_session_add(struct cfset_request *p) 837 { 838 list_add(&p->node, &cfset_session.head); 839 } 840 841 /* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access 842 * path is currently used. 843 * The cpu_cf_events::dev_state is used to denote counter sets in use by this 844 * interface. It is always or'ed in. If this interface is not active, its 845 * value is zero and no additional counter sets will be included. 846 * 847 * The cpu_cf_events::state is used by the perf_event_open SVC and remains 848 * unchanged. 
*
 * perf_pmu_enable() and perf_pmu_disable() and their call backs
 * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
 * performance measurement subsystem to enable per process
 * CPU Measurement counter facility.
 * The XXX_enable() and XXX_disable functions are used to turn off
 * x86 performance monitoring interrupt (PMI) during scheduling.
 * s390 uses these calls to temporarily stop and resume the active CPU
 * counters sets during scheduling.
 *
 * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr
 * device access.  The perf_event_open() SVC interface makes a lot of effort
 * to only run the counters while the calling process is actively scheduled
 * to run.
 * When /dev/hwctr interface is also used at the same time, the counter sets
 * will keep running, even when the process is scheduled off a CPU.
 * However this is not a problem and does not lead to wrong counter values
 * for the perf_event_open() SVC.  The current counter value will be recorded
 * during schedule-in. At schedule-out time the current counter value is
 * extracted again and the delta is calculated and added to the event.
869 */ 870 /* Stop all counter sets via ioctl interface */ 871 static void cfset_ioctl_off(void *parm) 872 { 873 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 874 struct cfset_call_on_cpu_parm *p = parm; 875 int rc; 876 877 /* Check if any counter set used by /dev/hwc */ 878 for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) 879 if ((p->sets & cpumf_ctr_ctl[rc])) { 880 if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { 881 ctr_set_disable(&cpuhw->dev_state, 882 cpumf_ctr_ctl[rc]); 883 ctr_set_stop(&cpuhw->dev_state, 884 cpumf_ctr_ctl[rc]); 885 } 886 } 887 /* Keep perf_event_open counter sets */ 888 rc = lcctl(cpuhw->dev_state | cpuhw->state); 889 if (rc) 890 pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", 891 cpuhw->state, S390_HWCTR_DEVICE, rc); 892 if (!cpuhw->dev_state) 893 cpuhw->flags &= ~PMU_F_IN_USE; 894 debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", 895 __func__, rc, cpuhw->state, cpuhw->dev_state); 896 } 897 898 /* Start counter sets on particular CPU */ 899 static void cfset_ioctl_on(void *parm) 900 { 901 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 902 struct cfset_call_on_cpu_parm *p = parm; 903 int rc; 904 905 cpuhw->flags |= PMU_F_IN_USE; 906 ctr_set_enable(&cpuhw->dev_state, p->sets); 907 ctr_set_start(&cpuhw->dev_state, p->sets); 908 for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) 909 if ((p->sets & cpumf_ctr_ctl[rc])) 910 atomic_inc(&cpuhw->ctr_set[rc]); 911 rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ 912 if (!rc) 913 atomic_inc(&p->cpus_ack); 914 else 915 pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", 916 cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); 917 debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", 918 __func__, rc, cpuhw->state, cpuhw->dev_state); 919 } 920 921 static void cfset_release_cpu(void *p) 922 { 923 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 924 int rc; 925 
926 debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", 927 __func__, cpuhw->state, cpuhw->dev_state); 928 cpuhw->dev_state = 0; 929 rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ 930 if (rc) 931 pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", 932 cpuhw->state, S390_HWCTR_DEVICE, rc); 933 } 934 935 /* This modifies the process CPU mask to adopt it to the currently online 936 * CPUs. Offline CPUs can not be addresses. This call terminates the access 937 * and is usually followed by close() or a new iotcl(..., START, ...) which 938 * creates a new request structure. 939 */ 940 static void cfset_all_stop(struct cfset_request *req) 941 { 942 struct cfset_call_on_cpu_parm p = { 943 .sets = req->ctrset, 944 }; 945 946 cpumask_and(&req->mask, &req->mask, cpu_online_mask); 947 on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); 948 } 949 950 /* Release function is also called when application gets terminated without 951 * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. 
952 */ 953 static int cfset_release(struct inode *inode, struct file *file) 954 { 955 mutex_lock(&cfset_ctrset_mutex); 956 /* Open followed by close/exit has no private_data */ 957 if (file->private_data) { 958 cfset_all_stop(file->private_data); 959 cfset_session_del(file->private_data); 960 kfree(file->private_data); 961 file->private_data = NULL; 962 } 963 if (!atomic_dec_return(&cfset_opencnt)) 964 on_each_cpu(cfset_release_cpu, NULL, 1); 965 mutex_unlock(&cfset_ctrset_mutex); 966 967 hw_perf_event_destroy(NULL); 968 return 0; 969 } 970 971 static int cfset_open(struct inode *inode, struct file *file) 972 { 973 if (!capable(CAP_SYS_ADMIN)) 974 return -EPERM; 975 mutex_lock(&cfset_ctrset_mutex); 976 if (atomic_inc_return(&cfset_opencnt) == 1) 977 cfset_session_init(); 978 mutex_unlock(&cfset_ctrset_mutex); 979 980 cpumf_hw_inuse(); 981 file->private_data = NULL; 982 /* nonseekable_open() never fails */ 983 return nonseekable_open(inode, file); 984 } 985 986 static int cfset_all_start(struct cfset_request *req) 987 { 988 struct cfset_call_on_cpu_parm p = { 989 .sets = req->ctrset, 990 .cpus_ack = ATOMIC_INIT(0), 991 }; 992 cpumask_var_t mask; 993 int rc = 0; 994 995 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 996 return -ENOMEM; 997 cpumask_and(mask, &req->mask, cpu_online_mask); 998 on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); 999 if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { 1000 on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); 1001 rc = -EIO; 1002 debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__); 1003 } 1004 free_cpumask_var(mask); 1005 return rc; 1006 } 1007 1008 1009 /* Return the maximum required space for all possible CPUs in case one 1010 * CPU will be onlined during the START, READ, STOP cycles. 1011 * To find out the size of the counter sets, any one CPU will do. They 1012 * all have the same counter sets. 
1013 */ 1014 static size_t cfset_needspace(unsigned int sets) 1015 { 1016 struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); 1017 size_t bytes = 0; 1018 int i; 1019 1020 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 1021 if (!(sets & cpumf_ctr_ctl[i])) 1022 continue; 1023 bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + 1024 sizeof(((struct s390_ctrset_setdata *)0)->set) + 1025 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); 1026 } 1027 bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * 1028 (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + 1029 sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); 1030 put_cpu_ptr(&cpu_cf_events); 1031 return bytes; 1032 } 1033 1034 static int cfset_all_copy(unsigned long arg, cpumask_t *mask) 1035 { 1036 struct s390_ctrset_read __user *ctrset_read; 1037 unsigned int cpu, cpus, rc; 1038 void __user *uptr; 1039 1040 ctrset_read = (struct s390_ctrset_read __user *)arg; 1041 uptr = ctrset_read->data; 1042 for_each_cpu(cpu, mask) { 1043 struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu); 1044 struct s390_ctrset_cpudata __user *ctrset_cpudata; 1045 1046 ctrset_cpudata = uptr; 1047 rc = put_user(cpu, &ctrset_cpudata->cpu_nr); 1048 rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); 1049 rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, 1050 cpuhw->used); 1051 if (rc) 1052 return -EFAULT; 1053 uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; 1054 cond_resched(); 1055 } 1056 cpus = cpumask_weight(mask); 1057 if (put_user(cpus, &ctrset_read->no_cpus)) 1058 return -EFAULT; 1059 debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__, 1060 uptr - (void __user *)ctrset_read->data); 1061 return 0; 1062 } 1063 1064 static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, 1065 int ctrset_size, size_t room) 1066 { 1067 size_t need = 0; 1068 int rc = -1; 1069 1070 need = sizeof(*p) + sizeof(u64) * ctrset_size; 1071 if (need <= 
room) { 1072 p->set = cpumf_ctr_ctl[ctrset]; 1073 p->no_cnts = ctrset_size; 1074 rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); 1075 if (rc == 3) /* Nothing stored */ 1076 need = 0; 1077 } 1078 return need; 1079 } 1080 1081 /* Read all counter sets. */ 1082 static void cfset_cpu_read(void *parm) 1083 { 1084 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 1085 struct cfset_call_on_cpu_parm *p = parm; 1086 int set, set_size; 1087 size_t space; 1088 1089 /* No data saved yet */ 1090 cpuhw->used = 0; 1091 cpuhw->sets = 0; 1092 memset(cpuhw->data, 0, sizeof(cpuhw->data)); 1093 1094 /* Scan the counter sets */ 1095 for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { 1096 struct s390_ctrset_setdata *sp = (void *)cpuhw->data + 1097 cpuhw->used; 1098 1099 if (!(p->sets & cpumf_ctr_ctl[set])) 1100 continue; /* Counter set not in list */ 1101 set_size = cpum_cf_ctrset_size(set, &cpuhw->info); 1102 space = sizeof(cpuhw->data) - cpuhw->used; 1103 space = cfset_cpuset_read(sp, set, set_size, space); 1104 if (space) { 1105 cpuhw->used += space; 1106 cpuhw->sets += 1; 1107 } 1108 } 1109 debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__, 1110 cpuhw->sets, cpuhw->used); 1111 } 1112 1113 static int cfset_all_read(unsigned long arg, struct cfset_request *req) 1114 { 1115 struct cfset_call_on_cpu_parm p; 1116 cpumask_var_t mask; 1117 int rc; 1118 1119 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 1120 return -ENOMEM; 1121 1122 p.sets = req->ctrset; 1123 cpumask_and(mask, &req->mask, cpu_online_mask); 1124 on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); 1125 rc = cfset_all_copy(arg, mask); 1126 free_cpumask_var(mask); 1127 return rc; 1128 } 1129 1130 static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) 1131 { 1132 struct s390_ctrset_read read; 1133 int ret = -ENODATA; 1134 1135 if (req && req->ctrset) { 1136 if (copy_from_user(&read, (char __user *)arg, sizeof(read))) 1137 return -EFAULT; 1138 ret = cfset_all_read(arg, 
req); 1139 } 1140 return ret; 1141 } 1142 1143 static long cfset_ioctl_stop(struct file *file) 1144 { 1145 struct cfset_request *req = file->private_data; 1146 int ret = -ENXIO; 1147 1148 if (req) { 1149 cfset_all_stop(req); 1150 cfset_session_del(req); 1151 kfree(req); 1152 file->private_data = NULL; 1153 ret = 0; 1154 } 1155 return ret; 1156 } 1157 1158 static long cfset_ioctl_start(unsigned long arg, struct file *file) 1159 { 1160 struct s390_ctrset_start __user *ustart; 1161 struct s390_ctrset_start start; 1162 struct cfset_request *preq; 1163 void __user *umask; 1164 unsigned int len; 1165 int ret = 0; 1166 size_t need; 1167 1168 if (file->private_data) 1169 return -EBUSY; 1170 ustart = (struct s390_ctrset_start __user *)arg; 1171 if (copy_from_user(&start, ustart, sizeof(start))) 1172 return -EFAULT; 1173 if (start.version != S390_HWCTR_START_VERSION) 1174 return -EINVAL; 1175 if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | 1176 cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | 1177 cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | 1178 cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | 1179 cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) 1180 return -EINVAL; /* Invalid counter set */ 1181 if (!start.counter_sets) 1182 return -EINVAL; /* No counter set at all? 
*/ 1183 1184 preq = kzalloc(sizeof(*preq), GFP_KERNEL); 1185 if (!preq) 1186 return -ENOMEM; 1187 cpumask_clear(&preq->mask); 1188 len = min_t(u64, start.cpumask_len, cpumask_size()); 1189 umask = (void __user *)start.cpumask; 1190 if (copy_from_user(&preq->mask, umask, len)) { 1191 kfree(preq); 1192 return -EFAULT; 1193 } 1194 if (cpumask_empty(&preq->mask)) { 1195 kfree(preq); 1196 return -EINVAL; 1197 } 1198 need = cfset_needspace(start.counter_sets); 1199 if (put_user(need, &ustart->data_bytes)) { 1200 kfree(preq); 1201 return -EFAULT; 1202 } 1203 preq->ctrset = start.counter_sets; 1204 ret = cfset_all_start(preq); 1205 if (!ret) { 1206 cfset_session_add(preq); 1207 file->private_data = preq; 1208 debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n", 1209 __func__, preq->ctrset, need, ret); 1210 } else { 1211 kfree(preq); 1212 } 1213 return ret; 1214 } 1215 1216 /* Entry point to the /dev/hwctr device interface. 1217 * The ioctl system call supports three subcommands: 1218 * S390_HWCTR_START: Start the specified counter sets on a CPU list. The 1219 * counter set keeps running until explicitly stopped. Returns the number 1220 * of bytes needed to store the counter values. If another S390_HWCTR_START 1221 * ioctl subcommand is called without a previous S390_HWCTR_STOP stop 1222 * command on the same file descriptor, -EBUSY is returned. 1223 * S390_HWCTR_READ: Read the counter set values from specified CPU list given 1224 * with the S390_HWCTR_START command. 1225 * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the 1226 * previous S390_HWCTR_START subcommand. 
1227 */ 1228 static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1229 { 1230 int ret; 1231 1232 cpus_read_lock(); 1233 mutex_lock(&cfset_ctrset_mutex); 1234 switch (cmd) { 1235 case S390_HWCTR_START: 1236 ret = cfset_ioctl_start(arg, file); 1237 break; 1238 case S390_HWCTR_STOP: 1239 ret = cfset_ioctl_stop(file); 1240 break; 1241 case S390_HWCTR_READ: 1242 ret = cfset_ioctl_read(arg, file->private_data); 1243 break; 1244 default: 1245 ret = -ENOTTY; 1246 break; 1247 } 1248 mutex_unlock(&cfset_ctrset_mutex); 1249 cpus_read_unlock(); 1250 return ret; 1251 } 1252 1253 static const struct file_operations cfset_fops = { 1254 .owner = THIS_MODULE, 1255 .open = cfset_open, 1256 .release = cfset_release, 1257 .unlocked_ioctl = cfset_ioctl, 1258 .compat_ioctl = cfset_ioctl, 1259 .llseek = no_llseek 1260 }; 1261 1262 static struct miscdevice cfset_dev = { 1263 .name = S390_HWCTR_DEVICE, 1264 .minor = MISC_DYNAMIC_MINOR, 1265 .fops = &cfset_fops, 1266 }; 1267 1268 /* Hotplug add of a CPU. Scan through all active processes and add 1269 * that CPU to the list of CPUs supplied with ioctl(..., START, ...). 1270 */ 1271 int cfset_online_cpu(unsigned int cpu) 1272 { 1273 struct cfset_call_on_cpu_parm p; 1274 struct cfset_request *rp; 1275 1276 mutex_lock(&cfset_ctrset_mutex); 1277 if (!list_empty(&cfset_session.head)) { 1278 list_for_each_entry(rp, &cfset_session.head, node) { 1279 p.sets = rp->ctrset; 1280 cfset_ioctl_on(&p); 1281 cpumask_set_cpu(cpu, &rp->mask); 1282 } 1283 } 1284 mutex_unlock(&cfset_ctrset_mutex); 1285 return 0; 1286 } 1287 1288 /* Hotplug remove of a CPU. Scan through all active processes and clear 1289 * that CPU from the list of CPUs supplied with ioctl(..., START, ...). 
1290 */ 1291 int cfset_offline_cpu(unsigned int cpu) 1292 { 1293 struct cfset_call_on_cpu_parm p; 1294 struct cfset_request *rp; 1295 1296 mutex_lock(&cfset_ctrset_mutex); 1297 if (!list_empty(&cfset_session.head)) { 1298 list_for_each_entry(rp, &cfset_session.head, node) { 1299 p.sets = rp->ctrset; 1300 cfset_ioctl_off(&p); 1301 cpumask_clear_cpu(cpu, &rp->mask); 1302 } 1303 } 1304 mutex_unlock(&cfset_ctrset_mutex); 1305 return 0; 1306 } 1307 1308 static void cfdiag_read(struct perf_event *event) 1309 { 1310 debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__, 1311 event->attr.config, local64_read(&event->count)); 1312 } 1313 1314 static int get_authctrsets(void) 1315 { 1316 struct cpu_cf_events *cpuhw; 1317 unsigned long auth = 0; 1318 enum cpumf_ctr_set i; 1319 1320 cpuhw = &get_cpu_var(cpu_cf_events); 1321 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 1322 if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) 1323 auth |= cpumf_ctr_ctl[i]; 1324 } 1325 put_cpu_var(cpu_cf_events); 1326 return auth; 1327 } 1328 1329 /* Setup the event. Test for authorized counter sets and only include counter 1330 * sets which are authorized at the time of the setup. Including unauthorized 1331 * counter sets result in specification exception (and panic). 1332 */ 1333 static int cfdiag_event_init2(struct perf_event *event) 1334 { 1335 struct perf_event_attr *attr = &event->attr; 1336 int err = 0; 1337 1338 /* Set sample_period to indicate sampling */ 1339 event->hw.config = attr->config; 1340 event->hw.sample_period = attr->sample_period; 1341 local64_set(&event->hw.period_left, event->hw.sample_period); 1342 local64_set(&event->count, 0); 1343 event->hw.last_period = event->hw.sample_period; 1344 1345 /* Add all authorized counter sets to config_base. The 1346 * the hardware init function is either called per-cpu or just once 1347 * for all CPUS (event->cpu == -1). 
This depends on the whether 1348 * counting is started for all CPUs or on a per workload base where 1349 * the perf event moves from one CPU to another CPU. 1350 * Checking the authorization on any CPU is fine as the hardware 1351 * applies the same authorization settings to all CPUs. 1352 */ 1353 event->hw.config_base = get_authctrsets(); 1354 1355 /* No authorized counter sets, nothing to count/sample */ 1356 if (!event->hw.config_base) 1357 err = -EINVAL; 1358 1359 debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n", 1360 __func__, err, event->hw.config_base); 1361 return err; 1362 } 1363 1364 static int cfdiag_event_init(struct perf_event *event) 1365 { 1366 struct perf_event_attr *attr = &event->attr; 1367 int err = -ENOENT; 1368 1369 if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || 1370 event->attr.type != event->pmu->type) 1371 goto out; 1372 1373 /* Raw events are used to access counters directly, 1374 * hence do not permit excludes. 1375 * This event is useless without PERF_SAMPLE_RAW to return counter set 1376 * values as raw data. 1377 */ 1378 if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || 1379 !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { 1380 err = -EOPNOTSUPP; 1381 goto out; 1382 } 1383 1384 /* Initialize for using the CPU-measurement counter facility */ 1385 cpumf_hw_inuse(); 1386 event->destroy = hw_perf_event_destroy; 1387 1388 err = cfdiag_event_init2(event); 1389 if (unlikely(err)) 1390 event->destroy(event); 1391 out: 1392 return err; 1393 } 1394 1395 /* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used 1396 * to collect the complete counter sets for a scheduled process. Target 1397 * are complete counter sets attached as raw data to the artificial event. 1398 * This results in complete counter sets available when a process is 1399 * scheduled. Contains the delta of every counter while the process was 1400 * running. 
1401 */ 1402 CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); 1403 1404 static struct attribute *cfdiag_events_attr[] = { 1405 CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), 1406 NULL, 1407 }; 1408 1409 PMU_FORMAT_ATTR(event, "config:0-63"); 1410 1411 static struct attribute *cfdiag_format_attr[] = { 1412 &format_attr_event.attr, 1413 NULL, 1414 }; 1415 1416 static struct attribute_group cfdiag_events_group = { 1417 .name = "events", 1418 .attrs = cfdiag_events_attr, 1419 }; 1420 static struct attribute_group cfdiag_format_group = { 1421 .name = "format", 1422 .attrs = cfdiag_format_attr, 1423 }; 1424 static const struct attribute_group *cfdiag_attr_groups[] = { 1425 &cfdiag_events_group, 1426 &cfdiag_format_group, 1427 NULL, 1428 }; 1429 1430 /* Performance monitoring unit for event CF_DIAG. Since this event 1431 * is also started and stopped via the perf_event_open() system call, use 1432 * the same event enable/disable call back functions. They do not 1433 * have a pointer to the perf_event strcture as first parameter. 1434 * 1435 * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. 1436 * Reuse them and distinguish the event (always first parameter) via 1437 * 'config' member. 1438 */ 1439 static struct pmu cf_diag = { 1440 .task_ctx_nr = perf_sw_context, 1441 .event_init = cfdiag_event_init, 1442 .pmu_enable = cpumf_pmu_enable, 1443 .pmu_disable = cpumf_pmu_disable, 1444 .add = cpumf_pmu_add, 1445 .del = cpumf_pmu_del, 1446 .start = cpumf_pmu_start, 1447 .stop = cpumf_pmu_stop, 1448 .read = cfdiag_read, 1449 1450 .attr_groups = cfdiag_attr_groups 1451 }; 1452 1453 /* Calculate memory needed to store all counter sets together with header and 1454 * trailer data. This is independent of the counter set authorization which 1455 * can vary depending on the configuration. 
1456 */ 1457 static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) 1458 { 1459 size_t max_size = sizeof(struct cf_trailer_entry); 1460 enum cpumf_ctr_set i; 1461 1462 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 1463 size_t size = cpum_cf_ctrset_size(i, info); 1464 1465 if (size) 1466 max_size += size * sizeof(u64) + 1467 sizeof(struct cf_ctrset_entry); 1468 } 1469 return max_size; 1470 } 1471 1472 /* Get the CPU speed, try sampling facility first and CPU attributes second. */ 1473 static void cfdiag_get_cpu_speed(void) 1474 { 1475 unsigned long mhz; 1476 1477 if (cpum_sf_avail()) { /* Sampling facility first */ 1478 struct hws_qsi_info_block si; 1479 1480 memset(&si, 0, sizeof(si)); 1481 if (!qsi(&si)) { 1482 cfdiag_cpu_speed = si.cpu_speed; 1483 return; 1484 } 1485 } 1486 1487 /* Fallback: CPU speed extract static part. Used in case 1488 * CPU Measurement Sampling Facility is turned off. 1489 */ 1490 mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); 1491 if (mhz != -1UL) 1492 cfdiag_cpu_speed = mhz & 0xffffffff; 1493 } 1494 1495 static int cfset_init(void) 1496 { 1497 struct cpumf_ctr_info info; 1498 size_t need; 1499 int rc; 1500 1501 if (qctri(&info)) 1502 return -ENODEV; 1503 1504 cfdiag_get_cpu_speed(); 1505 /* Make sure the counter set data fits into predefined buffer. */ 1506 need = cfdiag_maxsize(&info); 1507 if (need > sizeof(((struct cpu_cf_events *)0)->start)) { 1508 pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", 1509 need); 1510 return -ENOMEM; 1511 } 1512 1513 rc = misc_register(&cfset_dev); 1514 if (rc) { 1515 pr_err("Registration of /dev/%s failed rc=%i\n", 1516 cfset_dev.name, rc); 1517 goto out; 1518 } 1519 1520 rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); 1521 if (rc) { 1522 misc_deregister(&cfset_dev); 1523 pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", 1524 rc); 1525 } 1526 out: 1527 return rc; 1528 } 1529 1530 device_initcall(cpumf_pmu_init); 1531