1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Hypervisor supplied "24x7" performance counter support 4 * 5 * Author: Cody P Schafer <cody@linux.vnet.ibm.com> 6 * Copyright 2014 IBM Corporation. 7 */ 8 9 #define pr_fmt(fmt) "hv-24x7: " fmt 10 11 #include <linux/perf_event.h> 12 #include <linux/rbtree.h> 13 #include <linux/module.h> 14 #include <linux/slab.h> 15 #include <linux/vmalloc.h> 16 17 #include <asm/cputhreads.h> 18 #include <asm/firmware.h> 19 #include <asm/hvcall.h> 20 #include <asm/io.h> 21 #include <linux/byteorder/generic.h> 22 23 #include <asm/rtas.h> 24 #include "hv-24x7.h" 25 #include "hv-24x7-catalog.h" 26 #include "hv-common.h" 27 28 /* Version of the 24x7 hypervisor API that we should use in this machine. */ 29 static int interface_version; 30 31 /* Whether we have to aggregate result data for some domains. */ 32 static bool aggregate_result_elements; 33 34 static cpumask_t hv_24x7_cpumask; 35 36 static bool domain_is_valid(unsigned domain) 37 { 38 switch (domain) { 39 #define DOMAIN(n, v, x, c) \ 40 case HV_PERF_DOMAIN_##n: \ 41 /* fall through */ 42 #include "hv-24x7-domains.h" 43 #undef DOMAIN 44 return true; 45 default: 46 return false; 47 } 48 } 49 50 static bool is_physical_domain(unsigned domain) 51 { 52 switch (domain) { 53 #define DOMAIN(n, v, x, c) \ 54 case HV_PERF_DOMAIN_##n: \ 55 return c; 56 #include "hv-24x7-domains.h" 57 #undef DOMAIN 58 default: 59 return false; 60 } 61 } 62 63 /* 64 * The Processor Module Information system parameter allows transferring 65 * of certain processor module information from the platform to the OS. 66 * Refer PAPR+ document to get parameter token value as '43'. 67 */ 68 69 #define PROCESSOR_MODULE_INFO 43 70 71 static u32 phys_sockets; /* Physical sockets */ 72 static u32 phys_chipspersocket; /* Physical chips per socket*/ 73 static u32 phys_coresperchip; /* Physical cores per chip */ 74 75 /* 76 * read_24x7_sys_info() 77 * Retrieve the number of sockets and chips per socket and cores per 78 * chip details through the get-system-parameter rtas call. 79 */ 80 void read_24x7_sys_info(void) 81 { 82 int call_status, len, ntypes; 83 84 spin_lock(&rtas_data_buf_lock); 85 86 /* 87 * Making system parameter: chips and sockets and cores per chip 88 * default to 1. 89 */ 90 phys_sockets = 1; 91 phys_chipspersocket = 1; 92 phys_coresperchip = 1; 93 94 call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, 95 NULL, 96 PROCESSOR_MODULE_INFO, 97 __pa(rtas_data_buf), 98 RTAS_DATA_BUF_SIZE); 99 100 if (call_status != 0) { 101 pr_err("Error calling get-system-parameter %d\n", 102 call_status); 103 } else { 104 len = be16_to_cpup((__be16 *)&rtas_data_buf[0]); 105 if (len < 8) 106 goto out; 107 108 ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]); 109 110 if (!ntypes) 111 goto out; 112 113 phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]); 114 phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]); 115 phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]); 116 } 117 118 out: 119 spin_unlock(&rtas_data_buf_lock); 120 } 121 122 /* Domains for which more than one result element are returned for each event. */ 123 static bool domain_needs_aggregation(unsigned int domain) 124 { 125 return aggregate_result_elements && 126 (domain == HV_PERF_DOMAIN_PHYS_CORE || 127 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && 128 domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); 129 } 130 131 static const char *domain_name(unsigned domain) 132 { 133 if (!domain_is_valid(domain)) 134 return NULL; 135 136 switch (domain) { 137 case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip"; 138 case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core"; 139 case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core"; 140 case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip"; 141 case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node"; 142 case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node"; 143 } 144 145 WARN_ON_ONCE(domain); 146 return NULL; 147 } 148 149 static bool catalog_entry_domain_is_valid(unsigned domain) 150 { 151 /* POWER8 doesn't support virtual domains. */ 152 if (interface_version == 1) 153 return is_physical_domain(domain); 154 else 155 return domain_is_valid(domain); 156 } 157 158 /* 159 * TODO: Merging events: 160 * - Think of the hcall as an interface to a 4d array of counters: 161 * - x = domains 162 * - y = indexes in the domain (core, chip, vcpu, node, etc) 163 * - z = offset into the counter space 164 * - w = lpars (guest vms, "logical partitions") 165 * - A single request is: x,y,y_last,z,z_last,w,w_last 166 * - this means we can retrieve a rectangle of counters in y,z for a single x. 167 * 168 * - Things to consider (ignoring w): 169 * - input cost_per_request = 16 170 * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs 171 * - limited number of requests per hcall (must fit into 4K bytes) 172 * - 4k = 16 [buffer header] - 16 [request size] * request_count 173 * - 255 requests per hcall 174 * - sometimes it will be more efficient to read extra data and discard 175 */ 176 177 /* 178 * Example usage: 179 * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/' 180 */ 181 182 /* u3 0-6, one of HV_24X7_PERF_DOMAIN */ 183 EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3); 184 /* u16 */ 185 EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31); 186 EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31); 187 EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31); 188 /* u32, see "data_offset" */ 189 EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63); 190 /* u16 */ 191 EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15); 192 193 EVENT_DEFINE_RANGE(reserved1, config, 4, 15); 194 EVENT_DEFINE_RANGE(reserved2, config1, 16, 63); 195 EVENT_DEFINE_RANGE(reserved3, config2, 0, 63); 196 197 static struct attribute *format_attrs[] = { 198 &format_attr_domain.attr, 199 &format_attr_offset.attr, 200 &format_attr_core.attr, 201 &format_attr_chip.attr, 202 &format_attr_vcpu.attr, 203 &format_attr_lpar.attr, 204 NULL, 205 }; 206 207 static struct attribute_group format_group = { 208 .name = "format", 209 .attrs = format_attrs, 210 }; 211 212 static struct attribute_group event_group = { 213 .name = "events", 214 /* .attrs is set in init */ 215 }; 216 217 static struct attribute_group event_desc_group = { 218 .name = "event_descs", 219 /* .attrs is set in init */ 220 }; 221 222 static struct attribute_group event_long_desc_group = { 223 .name = "event_long_descs", 224 /* .attrs is set in init */ 225 }; 226 227 static struct kmem_cache *hv_page_cache; 228 229 DEFINE_PER_CPU(int, hv_24x7_txn_flags); 230 DEFINE_PER_CPU(int, hv_24x7_txn_err); 231 232 struct hv_24x7_hw { 233 struct perf_event *events[255]; 234 }; 235 236 DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); 237 238 /* 239 * request_buffer and result_buffer are not required to be 4k aligned, 240 * but are not allowed to cross any 4k boundary. Aligning them to 4k is 241 * the simplest way to ensure that. 242 */ 243 #define H24x7_DATA_BUFFER_SIZE 4096 244 DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 245 DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 246 247 static unsigned int max_num_requests(int interface_version) 248 { 249 return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer)) 250 / H24x7_REQUEST_SIZE(interface_version); 251 } 252 253 static char *event_name(struct hv_24x7_event_data *ev, int *len) 254 { 255 *len = be16_to_cpu(ev->event_name_len) - 2; 256 return (char *)ev->remainder; 257 } 258 259 static char *event_desc(struct hv_24x7_event_data *ev, int *len) 260 { 261 unsigned nl = be16_to_cpu(ev->event_name_len); 262 __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2); 263 264 *len = be16_to_cpu(*desc_len) - 2; 265 return (char *)ev->remainder + nl; 266 } 267 268 static char *event_long_desc(struct hv_24x7_event_data *ev, int *len) 269 { 270 unsigned nl = be16_to_cpu(ev->event_name_len); 271 __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2); 272 unsigned desc_len = be16_to_cpu(*desc_len_); 273 __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2); 274 275 *len = be16_to_cpu(*long_desc_len) - 2; 276 return (char *)ev->remainder + nl + desc_len; 277 } 278 279 static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev, 280 void *end) 281 { 282 void *start = ev; 283 284 return (start + offsetof(struct hv_24x7_event_data, remainder)) < end; 285 } 286 287 /* 288 * Things we don't check: 289 * - padding for desc, name, and long/detailed desc is required to be '\0' 290 * bytes. 291 * 292 * Return NULL if we pass end, 293 * Otherwise return the address of the byte just following the event. 294 */ 295 static void *event_end(struct hv_24x7_event_data *ev, void *end) 296 { 297 void *start = ev; 298 __be16 *dl_, *ldl_; 299 unsigned dl, ldl; 300 unsigned nl = be16_to_cpu(ev->event_name_len); 301 302 if (nl < 2) { 303 pr_debug("%s: name length too short: %d", __func__, nl); 304 return NULL; 305 } 306 307 if (start + nl > end) { 308 pr_debug("%s: start=%p + nl=%u > end=%p", 309 __func__, start, nl, end); 310 return NULL; 311 } 312 313 dl_ = (__be16 *)(ev->remainder + nl - 2); 314 if (!IS_ALIGNED((uintptr_t)dl_, 2)) 315 pr_warn("desc len not aligned %p", dl_); 316 dl = be16_to_cpu(*dl_); 317 if (dl < 2) { 318 pr_debug("%s: desc len too short: %d", __func__, dl); 319 return NULL; 320 } 321 322 if (start + nl + dl > end) { 323 pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p", 324 __func__, start, nl, dl, start + nl + dl, end); 325 return NULL; 326 } 327 328 ldl_ = (__be16 *)(ev->remainder + nl + dl - 2); 329 if (!IS_ALIGNED((uintptr_t)ldl_, 2)) 330 pr_warn("long desc len not aligned %p", ldl_); 331 ldl = be16_to_cpu(*ldl_); 332 if (ldl < 2) { 333 pr_debug("%s: long desc len too short (ldl=%u)", 334 __func__, ldl); 335 return NULL; 336 } 337 338 if (start + nl + dl + ldl > end) { 339 pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p", 340 __func__, start, nl, dl, ldl, end); 341 return NULL; 342 } 343 344 return start + nl + dl + ldl; 345 } 346 347 static long h_get_24x7_catalog_page_(unsigned long phys_4096, 348 unsigned long version, unsigned long index) 349 { 350 pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)", 351 phys_4096, version, index); 352 353 WARN_ON(!IS_ALIGNED(phys_4096, 4096)); 354 355 return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE, 356 phys_4096, version, index); 357 } 358 359 static long h_get_24x7_catalog_page(char page[], u64 version, u32 index) 360 { 361 return h_get_24x7_catalog_page_(virt_to_phys(page), 362 version, index); 363 } 364 365 /* 366 * Each event we find in the catalog, will have a sysfs entry. Format the 367 * data for this sysfs entry based on the event's domain. 368 * 369 * Events belonging to the Chip domain can only be monitored in that domain. 370 * i.e the domain for these events is a fixed/knwon value. 371 * 372 * Events belonging to the Core domain can be monitored either in the physical 373 * core or in one of the virtual CPU domains. So the domain value for these 374 * events must be specified by the user (i.e is a required parameter). Format 375 * the Core events with 'domain=?' so the perf-tool can error check required 376 * parameters. 377 * 378 * NOTE: For the Core domain events, rather than making domain a required 379 * parameter we could default it to PHYS_CORE and allowe users to 380 * override the domain to one of the VCPU domains. 381 * 382 * However, this can make the interface a little inconsistent. 383 * 384 * If we set domain=2 (PHYS_CHIP) and allow user to override this field 385 * the user may be tempted to also modify the "offset=x" field in which 386 * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and 387 * HPM_INST (offset=0x20) events. With: 388 * 389 * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/ 390 * 391 * we end up monitoring HPM_INST, while the command line has HPM_PCYC. 392 * 393 * By not assigning a default value to the domain for the Core events, 394 * we can have simple guidelines: 395 * 396 * - Specifying values for parameters with "=?" is required. 397 * 398 * - Specifying (i.e overriding) values for other parameters 399 * is undefined. 400 */ 401 static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain) 402 { 403 const char *sindex; 404 const char *lpar; 405 const char *domain_str; 406 char buf[8]; 407 408 switch (domain) { 409 case HV_PERF_DOMAIN_PHYS_CHIP: 410 snprintf(buf, sizeof(buf), "%d", domain); 411 domain_str = buf; 412 lpar = "0x0"; 413 sindex = "chip"; 414 break; 415 case HV_PERF_DOMAIN_PHYS_CORE: 416 domain_str = "?"; 417 lpar = "0x0"; 418 sindex = "core"; 419 break; 420 default: 421 domain_str = "?"; 422 lpar = "?"; 423 sindex = "vcpu"; 424 } 425 426 return kasprintf(GFP_KERNEL, 427 "domain=%s,offset=0x%x,%s=?,lpar=%s", 428 domain_str, 429 be16_to_cpu(event->event_counter_offs) + 430 be16_to_cpu(event->event_group_record_offs), 431 sindex, 432 lpar); 433 } 434 435 /* Avoid trusting fw to NUL terminate strings */ 436 static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp) 437 { 438 return kasprintf(gfp, "%.*s", max_len, maybe_str); 439 } 440 441 static ssize_t device_show_string(struct device *dev, 442 struct device_attribute *attr, char *buf) 443 { 444 struct dev_ext_attribute *d; 445 446 d = container_of(attr, struct dev_ext_attribute, attr); 447 448 return sprintf(buf, "%s\n", (char *)d->var); 449 } 450 451 static ssize_t cpumask_show(struct device *dev, 452 struct device_attribute *attr, char *buf) 453 { 454 return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask); 455 } 456 457 static ssize_t sockets_show(struct device *dev, 458 struct device_attribute *attr, char *buf) 459 { 460 return sprintf(buf, "%d\n", phys_sockets); 461 } 462 463 static ssize_t chipspersocket_show(struct device *dev, 464 struct device_attribute *attr, char *buf) 465 { 466 return sprintf(buf, "%d\n", phys_chipspersocket); 467 } 468 469 static ssize_t coresperchip_show(struct device *dev, 470 struct device_attribute *attr, char *buf) 471 { 472 return sprintf(buf, "%d\n", phys_coresperchip); 473 } 474 475 static struct attribute *device_str_attr_create_(char *name, char *str) 476 { 477 struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL); 478 479 if (!attr) 480 return NULL; 481 482 sysfs_attr_init(&attr->attr.attr); 483 484 attr->var = str; 485 attr->attr.attr.name = name; 486 attr->attr.attr.mode = 0444; 487 attr->attr.show = device_show_string; 488 489 return &attr->attr.attr; 490 } 491 492 /* 493 * Allocate and initialize strings representing event attributes. 494 * 495 * NOTE: The strings allocated here are never destroyed and continue to 496 * exist till shutdown. This is to allow us to create as many events 497 * from the catalog as possible, even if we encounter errors with some. 498 * In case of changes to error paths in future, these may need to be 499 * freed by the caller. 500 */ 501 static struct attribute *device_str_attr_create(char *name, int name_max, 502 int name_nonce, 503 char *str, size_t str_max) 504 { 505 char *n; 506 char *s = memdup_to_str(str, str_max, GFP_KERNEL); 507 struct attribute *a; 508 509 if (!s) 510 return NULL; 511 512 if (!name_nonce) 513 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name); 514 else 515 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name, 516 name_nonce); 517 if (!n) 518 goto out_s; 519 520 a = device_str_attr_create_(n, s); 521 if (!a) 522 goto out_n; 523 524 return a; 525 out_n: 526 kfree(n); 527 out_s: 528 kfree(s); 529 return NULL; 530 } 531 532 static struct attribute *event_to_attr(unsigned ix, 533 struct hv_24x7_event_data *event, 534 unsigned domain, 535 int nonce) 536 { 537 int event_name_len; 538 char *ev_name, *a_ev_name, *val; 539 struct attribute *attr; 540 541 if (!domain_is_valid(domain)) { 542 pr_warn("catalog event %u has invalid domain %u\n", 543 ix, domain); 544 return NULL; 545 } 546 547 val = event_fmt(event, domain); 548 if (!val) 549 return NULL; 550 551 ev_name = event_name(event, &event_name_len); 552 if (!nonce) 553 a_ev_name = kasprintf(GFP_KERNEL, "%.*s", 554 (int)event_name_len, ev_name); 555 else 556 a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d", 557 (int)event_name_len, ev_name, nonce); 558 559 if (!a_ev_name) 560 goto out_val; 561 562 attr = device_str_attr_create_(a_ev_name, val); 563 if (!attr) 564 goto out_name; 565 566 return attr; 567 out_name: 568 kfree(a_ev_name); 569 out_val: 570 kfree(val); 571 return NULL; 572 } 573 574 static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event, 575 int nonce) 576 { 577 int nl, dl; 578 char *name = event_name(event, &nl); 579 char *desc = event_desc(event, &dl); 580 581 /* If there isn't a description, don't create the sysfs file */ 582 if (!dl) 583 return NULL; 584 585 return device_str_attr_create(name, nl, nonce, desc, dl); 586 } 587 588 static struct attribute * 589 event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce) 590 { 591 int nl, dl; 592 char *name = event_name(event, &nl); 593 char *desc = event_long_desc(event, &dl); 594 595 /* If there isn't a description, don't create the sysfs file */ 596 if (!dl) 597 return NULL; 598 599 return device_str_attr_create(name, nl, nonce, desc, dl); 600 } 601 602 static int event_data_to_attrs(unsigned ix, struct attribute **attrs, 603 struct hv_24x7_event_data *event, int nonce) 604 { 605 *attrs = event_to_attr(ix, event, event->domain, nonce); 606 if (!*attrs) 607 return -1; 608 609 return 0; 610 } 611 612 /* */ 613 struct event_uniq { 614 struct rb_node node; 615 const char *name; 616 int nl; 617 unsigned ct; 618 unsigned domain; 619 }; 620 621 static int memord(const void *d1, size_t s1, const void *d2, size_t s2) 622 { 623 if (s1 < s2) 624 return 1; 625 if (s1 > s2) 626 return -1; 627 628 return memcmp(d1, d2, s1); 629 } 630 631 static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2, 632 size_t s2, unsigned d2) 633 { 634 int r = memord(v1, s1, v2, s2); 635 636 if (r) 637 return r; 638 if (d1 > d2) 639 return 1; 640 if (d2 > d1) 641 return -1; 642 return 0; 643 } 644 645 static int event_uniq_add(struct rb_root *root, const char *name, int nl, 646 unsigned domain) 647 { 648 struct rb_node **new = &(root->rb_node), *parent = NULL; 649 struct event_uniq *data; 650 651 /* Figure out where to put new node */ 652 while (*new) { 653 struct event_uniq *it; 654 int result; 655 656 it = rb_entry(*new, struct event_uniq, node); 657 result = ev_uniq_ord(name, nl, domain, it->name, it->nl, 658 it->domain); 659 660 parent = *new; 661 if (result < 0) 662 new = &((*new)->rb_left); 663 else if (result > 0) 664 new = &((*new)->rb_right); 665 else { 666 it->ct++; 667 pr_info("found a duplicate event %.*s, ct=%u\n", nl, 668 name, it->ct); 669 return it->ct; 670 } 671 } 672 673 data = kmalloc(sizeof(*data), GFP_KERNEL); 674 if (!data) 675 return -ENOMEM; 676 677 *data = (struct event_uniq) { 678 .name = name, 679 .nl = nl, 680 .ct = 0, 681 .domain = domain, 682 }; 683 684 /* Add new node and rebalance tree. */ 685 rb_link_node(&data->node, parent, new); 686 rb_insert_color(&data->node, root); 687 688 /* data->ct */ 689 return 0; 690 } 691 692 static void event_uniq_destroy(struct rb_root *root) 693 { 694 /* 695 * the strings we point to are in the giant block of memory filled by 696 * the catalog, and are freed separately. 697 */ 698 struct event_uniq *pos, *n; 699 700 rbtree_postorder_for_each_entry_safe(pos, n, root, node) 701 kfree(pos); 702 } 703 704 705 /* 706 * ensure the event structure's sizes are self consistent and don't cause us to 707 * read outside of the event 708 * 709 * On success, return the event length in bytes. 710 * Otherwise, return -1 (and print as appropriate). 711 */ 712 static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event, 713 size_t event_idx, 714 size_t event_data_bytes, 715 size_t event_entry_count, 716 size_t offset, void *end) 717 { 718 ssize_t ev_len; 719 void *ev_end, *calc_ev_end; 720 721 if (offset >= event_data_bytes) 722 return -1; 723 724 if (event_idx >= event_entry_count) { 725 pr_devel("catalog event data has %zu bytes of padding after last event\n", 726 event_data_bytes - offset); 727 return -1; 728 } 729 730 if (!event_fixed_portion_is_within(event, end)) { 731 pr_warn("event %zu fixed portion is not within range\n", 732 event_idx); 733 return -1; 734 } 735 736 ev_len = be16_to_cpu(event->length); 737 738 if (ev_len % 16) 739 pr_info("event %zu has length %zu not divisible by 16: event=%pK\n", 740 event_idx, ev_len, event); 741 742 ev_end = (__u8 *)event + ev_len; 743 if (ev_end > end) { 744 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n", 745 event_idx, ev_len, ev_end, end, 746 offset); 747 return -1; 748 } 749 750 calc_ev_end = event_end(event, end); 751 if (!calc_ev_end) { 752 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n", 753 event_idx, event_data_bytes, event, end, 754 offset); 755 return -1; 756 } 757 758 if (calc_ev_end > ev_end) { 759 pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n", 760 event_idx, event, ev_end, offset, calc_ev_end); 761 return -1; 762 } 763 764 return ev_len; 765 } 766 767 #define MAX_4K (SIZE_MAX / 4096) 768 769 static int create_events_from_catalog(struct attribute ***events_, 770 struct attribute ***event_descs_, 771 struct attribute ***event_long_descs_) 772 { 773 long hret; 774 size_t catalog_len, catalog_page_len, event_entry_count, 775 event_data_len, event_data_offs, 776 event_data_bytes, junk_events, event_idx, event_attr_ct, i, 777 attr_max, event_idx_last, desc_ct, long_desc_ct; 778 ssize_t ct, ev_len; 779 uint64_t catalog_version_num; 780 struct attribute **events, **event_descs, **event_long_descs; 781 struct hv_24x7_catalog_page_0 *page_0 = 782 kmem_cache_alloc(hv_page_cache, GFP_KERNEL); 783 void *page = page_0; 784 void *event_data, *end; 785 struct hv_24x7_event_data *event; 786 struct rb_root ev_uniq = RB_ROOT; 787 int ret = 0; 788 789 if (!page) { 790 ret = -ENOMEM; 791 goto e_out; 792 } 793 794 hret = h_get_24x7_catalog_page(page, 0, 0); 795 if (hret) { 796 ret = -EIO; 797 goto e_free; 798 } 799 800 catalog_version_num = be64_to_cpu(page_0->version); 801 catalog_page_len = be32_to_cpu(page_0->length); 802 803 if (MAX_4K < catalog_page_len) { 804 pr_err("invalid page count: %zu\n", catalog_page_len); 805 ret = -EIO; 806 goto e_free; 807 } 808 809 catalog_len = catalog_page_len * 4096; 810 811 event_entry_count = be16_to_cpu(page_0->event_entry_count); 812 event_data_offs = be16_to_cpu(page_0->event_data_offs); 813 event_data_len = be16_to_cpu(page_0->event_data_len); 814 815 pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", 816 catalog_version_num, catalog_len, 817 event_entry_count, event_data_offs, event_data_len); 818 819 if ((MAX_4K < event_data_len) 820 || (MAX_4K < event_data_offs) 821 || (MAX_4K - event_data_offs < event_data_len)) { 822 pr_err("invalid event data offs %zu and/or len %zu\n", 823 event_data_offs, event_data_len); 824 ret = -EIO; 825 goto e_free; 826 } 827 828 if ((event_data_offs + event_data_len) > catalog_page_len) { 829 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n", 830 event_data_offs, 831 event_data_offs + event_data_len, 832 catalog_page_len); 833 ret = -EIO; 834 goto e_free; 835 } 836 837 if (SIZE_MAX - 1 < event_entry_count) { 838 pr_err("event_entry_count %zu is invalid\n", event_entry_count); 839 ret = -EIO; 840 goto e_free; 841 } 842 843 event_data_bytes = event_data_len * 4096; 844 845 /* 846 * event data can span several pages, events can cross between these 847 * pages. Use vmalloc to make this easier. 848 */ 849 event_data = vmalloc(event_data_bytes); 850 if (!event_data) { 851 pr_err("could not allocate event data\n"); 852 ret = -ENOMEM; 853 goto e_free; 854 } 855 856 end = event_data + event_data_bytes; 857 858 /* 859 * using vmalloc_to_phys() like this only works if PAGE_SIZE is 860 * divisible by 4096 861 */ 862 BUILD_BUG_ON(PAGE_SIZE % 4096); 863 864 for (i = 0; i < event_data_len; i++) { 865 hret = h_get_24x7_catalog_page_( 866 vmalloc_to_phys(event_data + i * 4096), 867 catalog_version_num, 868 i + event_data_offs); 869 if (hret) { 870 pr_err("Failed to get event data in page %zu: rc=%ld\n", 871 i + event_data_offs, hret); 872 ret = -EIO; 873 goto e_event_data; 874 } 875 } 876 877 /* 878 * scan the catalog to determine the number of attributes we need, and 879 * verify it at the same time. 880 */ 881 for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0; 882 ; 883 event_idx++, event = (void *)event + ev_len) { 884 size_t offset = (void *)event - (void *)event_data; 885 char *name; 886 int nl; 887 888 ev_len = catalog_event_len_validate(event, event_idx, 889 event_data_bytes, 890 event_entry_count, 891 offset, end); 892 if (ev_len < 0) 893 break; 894 895 name = event_name(event, &nl); 896 897 if (event->event_group_record_len == 0) { 898 pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n", 899 event_idx, nl, name); 900 junk_events++; 901 continue; 902 } 903 904 if (!catalog_entry_domain_is_valid(event->domain)) { 905 pr_info("event %zu (%.*s) has invalid domain %d\n", 906 event_idx, nl, name, event->domain); 907 junk_events++; 908 continue; 909 } 910 911 attr_max++; 912 } 913 914 event_idx_last = event_idx; 915 if (event_idx_last != event_entry_count) 916 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n", 917 event_idx_last, event_entry_count, junk_events); 918 919 events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL); 920 if (!events) { 921 ret = -ENOMEM; 922 goto e_event_data; 923 } 924 925 event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs), 926 GFP_KERNEL); 927 if (!event_descs) { 928 ret = -ENOMEM; 929 goto e_event_attrs; 930 } 931 932 event_long_descs = kmalloc_array(event_idx + 1, 933 sizeof(*event_long_descs), GFP_KERNEL); 934 if (!event_long_descs) { 935 ret = -ENOMEM; 936 goto e_event_descs; 937 } 938 939 /* Iterate over the catalog filling in the attribute vector */ 940 for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0, 941 event = event_data, event_idx = 0; 942 event_idx < event_idx_last; 943 event_idx++, ev_len = be16_to_cpu(event->length), 944 event = (void *)event + ev_len) { 945 char *name; 946 int nl; 947 int nonce; 948 /* 949 * these are the only "bad" events that are intermixed and that 950 * we can ignore without issue. make sure to skip them here 951 */ 952 if (event->event_group_record_len == 0) 953 continue; 954 if (!catalog_entry_domain_is_valid(event->domain)) 955 continue; 956 957 name = event_name(event, &nl); 958 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain); 959 ct = event_data_to_attrs(event_idx, events + event_attr_ct, 960 event, nonce); 961 if (ct < 0) { 962 pr_warn("event %zu (%.*s) creation failure, skipping\n", 963 event_idx, nl, name); 964 junk_events++; 965 } else { 966 event_attr_ct++; 967 event_descs[desc_ct] = event_to_desc_attr(event, nonce); 968 if (event_descs[desc_ct]) 969 desc_ct++; 970 event_long_descs[long_desc_ct] = 971 event_to_long_desc_attr(event, nonce); 972 if (event_long_descs[long_desc_ct]) 973 long_desc_ct++; 974 } 975 } 976 977 pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n", 978 event_idx, event_attr_ct, junk_events, desc_ct); 979 980 events[event_attr_ct] = NULL; 981 event_descs[desc_ct] = NULL; 982 event_long_descs[long_desc_ct] = NULL; 983 984 event_uniq_destroy(&ev_uniq); 985 vfree(event_data); 986 kmem_cache_free(hv_page_cache, page); 987 988 *events_ = events; 989 *event_descs_ = event_descs; 990 *event_long_descs_ = event_long_descs; 991 return 0; 992 993 e_event_descs: 994 kfree(event_descs); 995 e_event_attrs: 996 kfree(events); 997 e_event_data: 998 vfree(event_data); 999 e_free: 1000 kmem_cache_free(hv_page_cache, page); 1001 e_out: 1002 *events_ = NULL; 1003 *event_descs_ = NULL; 1004 *event_long_descs_ = NULL; 1005 return ret; 1006 } 1007 1008 static ssize_t catalog_read(struct file *filp, struct kobject *kobj, 1009 struct bin_attribute *bin_attr, char *buf, 1010 loff_t offset, size_t count) 1011 { 1012 long hret; 1013 ssize_t ret = 0; 1014 size_t catalog_len = 0, catalog_page_len = 0; 1015 loff_t page_offset = 0; 1016 loff_t offset_in_page; 1017 size_t copy_len; 1018 uint64_t catalog_version_num = 0; 1019 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); 1020 struct hv_24x7_catalog_page_0 *page_0 = page; 1021 1022 if (!page) 1023 return -ENOMEM; 1024 1025 hret = h_get_24x7_catalog_page(page, 0, 0); 1026 if (hret) { 1027 ret = -EIO; 1028 goto e_free; 1029 } 1030 1031 catalog_version_num = be64_to_cpu(page_0->version); 1032 catalog_page_len = be32_to_cpu(page_0->length); 1033 catalog_len = catalog_page_len * 4096; 1034 1035 page_offset = offset / 4096; 1036 offset_in_page = offset % 4096; 1037 1038 if (page_offset >= catalog_page_len) 1039 goto e_free; 1040 1041 if (page_offset != 0) { 1042 hret = h_get_24x7_catalog_page(page, catalog_version_num, 1043 page_offset); 1044 if (hret) { 1045 ret = -EIO; 1046 goto e_free; 1047 } 1048 } 1049 1050 copy_len = 4096 - offset_in_page; 1051 if (copy_len > count) 1052 copy_len = count; 1053 1054 memcpy(buf, page+offset_in_page, copy_len); 1055 ret = copy_len; 1056 1057 e_free: 1058 if (hret) 1059 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:" 1060 " rc=%ld\n", 1061 catalog_version_num, page_offset, hret); 1062 kmem_cache_free(hv_page_cache, page); 1063 1064 pr_devel("catalog_read: offset=%lld(%lld) count=%zu " 1065 "catalog_len=%zu(%zu) => %zd\n", offset, page_offset, 1066 count, catalog_len, catalog_page_len, ret); 1067 1068 return ret; 1069 } 1070 1071 static ssize_t domains_show(struct device *dev, struct device_attribute *attr, 1072 char *page) 1073 { 1074 int d, n, count = 0; 1075 const char *str; 1076 1077 for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) { 1078 str = domain_name(d); 1079 if (!str) 1080 continue; 1081 1082 n = sprintf(page, "%d: %s\n", d, str); 1083 if (n < 0) 1084 break; 1085 1086 count += n; 1087 page += n; 1088 } 1089 return count; 1090 } 1091 1092 #define PAGE_0_ATTR(_name, _fmt, _expr) \ 1093 static ssize_t _name##_show(struct device *dev, \ 1094 struct device_attribute *dev_attr, \ 1095 char *buf) \ 1096 { \ 1097 long hret; \ 1098 ssize_t ret = 0; \ 1099 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ 1100 struct hv_24x7_catalog_page_0 *page_0 = page; \ 1101 if (!page) \ 1102 return -ENOMEM; \ 1103 hret = h_get_24x7_catalog_page(page, 0, 0); \ 1104 if (hret) { \ 1105 ret = -EIO; \ 1106 goto e_free; \ 1107 } \ 1108 ret = sprintf(buf, _fmt, _expr); \ 1109 e_free: \ 1110 kmem_cache_free(hv_page_cache, page); \ 1111 return ret; \ 1112 } \ 1113 static DEVICE_ATTR_RO(_name) 1114 1115 PAGE_0_ATTR(catalog_version, "%lld\n", 1116 (unsigned long long)be64_to_cpu(page_0->version)); 1117 PAGE_0_ATTR(catalog_len, "%lld\n", 1118 (unsigned long long)be32_to_cpu(page_0->length) * 4096); 1119 static BIN_ATTR_RO(catalog, 0/* real length varies */); 1120 static DEVICE_ATTR_RO(domains); 1121 static DEVICE_ATTR_RO(sockets); 1122 static DEVICE_ATTR_RO(chipspersocket); 1123 static DEVICE_ATTR_RO(coresperchip); 1124 static DEVICE_ATTR_RO(cpumask); 1125 1126 static struct bin_attribute *if_bin_attrs[] = { 1127 &bin_attr_catalog, 1128 NULL, 1129 }; 1130 1131 static struct attribute *cpumask_attrs[] = { 1132 &dev_attr_cpumask.attr, 1133 NULL, 1134 }; 1135 1136 static struct attribute_group cpumask_attr_group = { 1137 .attrs = cpumask_attrs, 1138 }; 1139 1140 static struct attribute *if_attrs[] = { 1141 &dev_attr_catalog_len.attr, 1142 &dev_attr_catalog_version.attr, 1143 &dev_attr_domains.attr, 1144 &dev_attr_sockets.attr, 1145 &dev_attr_chipspersocket.attr, 1146 &dev_attr_coresperchip.attr, 1147 NULL, 1148 }; 1149 1150 static struct attribute_group if_group = { 1151 .name = "interface", 1152 .bin_attrs = if_bin_attrs, 1153 .attrs = if_attrs, 1154 }; 1155 1156 static const struct attribute_group *attr_groups[] = { 1157 &format_group, 1158 &event_group, 1159 &event_desc_group, 1160 &event_long_desc_group, 1161 &if_group, 1162 &cpumask_attr_group, 1163 NULL, 1164 }; 1165 1166 /* 1167 * Start the process for a new H_GET_24x7_DATA hcall. 1168 */ 1169 static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1170 struct hv_24x7_data_result_buffer *result_buffer) 1171 { 1172 1173 memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1174 memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1175 1176 request_buffer->interface_version = interface_version; 1177 /* memset above set request_buffer->num_requests to 0 */ 1178 } 1179 1180 /* 1181 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected 1182 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'. 1183 */ 1184 static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1185 struct hv_24x7_data_result_buffer *result_buffer) 1186 { 1187 long ret; 1188 1189 /* 1190 * NOTE: Due to variable number of array elements in request and 1191 * result buffer(s), sizeof() is not reliable. Use the actual 1192 * allocated buffer size, H24x7_DATA_BUFFER_SIZE. 1193 */ 1194 ret = plpar_hcall_norets(H_GET_24X7_DATA, 1195 virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, 1196 virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); 1197 1198 if (ret) { 1199 struct hv_24x7_request *req; 1200 1201 req = request_buffer->requests; 1202 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", 1203 req->performance_domain, req->data_offset, 1204 req->starting_ix, req->starting_lpar_ix, 1205 ret, ret, result_buffer->detailed_rc, 1206 result_buffer->failing_request_ix); 1207 return -EIO; 1208 } 1209 1210 return 0; 1211 } 1212 1213 /* 1214 * Add the given @event to the next slot in the 24x7 request_buffer. 1215 * 1216 * Note that H_GET_24X7_DATA hcall allows reading several counters' 1217 * values in a single HCALL. We expect the caller to add events to the 1218 * request buffer one by one, make the HCALL and process the results. 1219 */ 1220 static int add_event_to_24x7_request(struct perf_event *event, 1221 struct hv_24x7_request_buffer *request_buffer) 1222 { 1223 u16 idx; 1224 int i; 1225 size_t req_size; 1226 struct hv_24x7_request *req; 1227 1228 if (request_buffer->num_requests >= 1229 max_num_requests(request_buffer->interface_version)) { 1230 pr_devel("Too many requests for 24x7 HCALL %d\n", 1231 request_buffer->num_requests); 1232 return -EINVAL; 1233 } 1234 1235 switch (event_get_domain(event)) { 1236 case HV_PERF_DOMAIN_PHYS_CHIP: 1237 idx = event_get_chip(event); 1238 break; 1239 case HV_PERF_DOMAIN_PHYS_CORE: 1240 idx = event_get_core(event); 1241 break; 1242 default: 1243 idx = event_get_vcpu(event); 1244 } 1245 1246 req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); 1247 1248 i = request_buffer->num_requests++; 1249 req = (void *) request_buffer->requests + i * req_size; 1250 1251 req->performance_domain = event_get_domain(event); 1252 req->data_size = cpu_to_be16(8); 1253 req->data_offset = cpu_to_be32(event_get_offset(event)); 1254 req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event)); 1255 req->max_num_lpars = cpu_to_be16(1); 1256 req->starting_ix = cpu_to_be16(idx); 1257 req->max_ix = cpu_to_be16(1); 1258 1259 if (request_buffer->interface_version > 1) { 1260 if (domain_needs_aggregation(req->performance_domain)) 1261 req->max_num_thread_groups = -1; 1262 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { 1263 req->starting_thread_group_ix = idx % 2; 1264 req->max_num_thread_groups = 1; 1265 } 1266 } 1267 1268 return 0; 1269 } 1270 1271 /** 1272 * get_count_from_result - get event count from all result elements in result 1273 * 1274 * If the event corresponding to this result needs aggregation of the result 1275 * element values, then this function does that. 1276 * 1277 * @event: Event associated with @res. 1278 * @resb: Result buffer containing @res. 1279 * @res: Result to work on. 1280 * @countp: Output variable containing the event count. 1281 * @next: Optional output variable pointing to the next result in @resb. 1282 */ 1283 static int get_count_from_result(struct perf_event *event, 1284 struct hv_24x7_data_result_buffer *resb, 1285 struct hv_24x7_result *res, u64 *countp, 1286 struct hv_24x7_result **next) 1287 { 1288 u16 num_elements = be16_to_cpu(res->num_elements_returned); 1289 u16 data_size = be16_to_cpu(res->result_element_data_size); 1290 unsigned int data_offset; 1291 void *element_data; 1292 int i; 1293 u64 count; 1294 1295 /* 1296 * We can bail out early if the result is empty. 1297 */ 1298 if (!num_elements) { 1299 pr_debug("Result of request %hhu is empty, nothing to do\n", 1300 res->result_ix); 1301 1302 if (next) 1303 *next = (struct hv_24x7_result *) res->elements; 1304 1305 return -ENODATA; 1306 } 1307 1308 /* 1309 * Since we always specify 1 as the maximum for the smallest resource 1310 * we're requesting, there should to be only one element per result. 1311 * Except when an event needs aggregation, in which case there are more. 1312 */ 1313 if (num_elements != 1 && 1314 !domain_needs_aggregation(event_get_domain(event))) { 1315 pr_err("Error: result of request %hhu has %hu elements\n", 1316 res->result_ix, num_elements); 1317 1318 return -EIO; 1319 } 1320 1321 if (data_size != sizeof(u64)) { 1322 pr_debug("Error: result of request %hhu has data of %hu bytes\n", 1323 res->result_ix, data_size); 1324 1325 return -ENOTSUPP; 1326 } 1327 1328 if (resb->interface_version == 1) 1329 data_offset = offsetof(struct hv_24x7_result_element_v1, 1330 element_data); 1331 else 1332 data_offset = offsetof(struct hv_24x7_result_element_v2, 1333 element_data); 1334 1335 /* Go through the result elements in the result. */ 1336 for (i = count = 0, element_data = res->elements + data_offset; 1337 i < num_elements; 1338 i++, element_data += data_size + data_offset) 1339 count += be64_to_cpu(*((u64 *) element_data)); 1340 1341 *countp = count; 1342 1343 /* The next result is after the last result element. */ 1344 if (next) 1345 *next = element_data - data_offset; 1346 1347 return 0; 1348 } 1349 1350 static int single_24x7_request(struct perf_event *event, u64 *count) 1351 { 1352 int ret; 1353 struct hv_24x7_request_buffer *request_buffer; 1354 struct hv_24x7_data_result_buffer *result_buffer; 1355 1356 BUILD_BUG_ON(sizeof(*request_buffer) > 4096); 1357 BUILD_BUG_ON(sizeof(*result_buffer) > 4096); 1358 1359 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1360 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1361 1362 init_24x7_request(request_buffer, result_buffer); 1363 1364 ret = add_event_to_24x7_request(event, request_buffer); 1365 if (ret) 1366 goto out; 1367 1368 ret = make_24x7_request(request_buffer, result_buffer); 1369 if (ret) 1370 goto out; 1371 1372 /* process result from hcall */ 1373 ret = get_count_from_result(event, result_buffer, 1374 result_buffer->results, count, NULL); 1375 1376 out: 1377 put_cpu_var(hv_24x7_reqb); 1378 put_cpu_var(hv_24x7_resb); 1379 return ret; 1380 } 1381 1382 1383 static int h_24x7_event_init(struct perf_event *event) 1384 { 1385 struct hv_perf_caps caps; 1386 unsigned domain; 1387 unsigned long hret; 1388 u64 ct; 1389 1390 /* Not our event */ 1391 if (event->attr.type != event->pmu->type) 1392 return -ENOENT; 1393 1394 /* Unused areas must be 0 */ 1395 if (event_get_reserved1(event) || 1396 event_get_reserved2(event) || 1397 event_get_reserved3(event)) { 1398 pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n", 1399 event->attr.config, 1400 event_get_reserved1(event), 1401 event->attr.config1, 1402 event_get_reserved2(event), 1403 event->attr.config2, 1404 event_get_reserved3(event)); 1405 return -EINVAL; 1406 } 1407 1408 /* no branch sampling */ 1409 if (has_branch_stack(event)) 1410 return -EOPNOTSUPP; 1411 1412 /* offset must be 8 byte aligned */ 1413 if (event_get_offset(event) % 8) { 1414 pr_devel("bad alignment\n"); 1415 return -EINVAL; 1416 } 1417 1418 domain = event_get_domain(event); 1419 if (domain >= HV_PERF_DOMAIN_MAX) { 1420 pr_devel("invalid domain %d\n", domain); 1421 return -EINVAL; 1422 } 1423 1424 hret = hv_perf_caps_get(&caps); 1425 if (hret) { 1426 pr_devel("could not get capabilities: rc=%ld\n", hret); 1427 return -EIO; 1428 } 1429 1430 /* Physical domains & other lpars require extra capabilities */ 1431 if (!caps.collect_privileged && (is_physical_domain(domain) || 1432 (event_get_lpar(event) != event_get_lpar_max()))) { 1433 pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n", 1434 is_physical_domain(domain), 1435 event_get_lpar(event)); 1436 return -EACCES; 1437 } 1438 1439 /* Get the initial value of the counter for this event */ 1440 if (single_24x7_request(event, &ct)) { 1441 pr_devel("test hcall failed\n"); 1442 return -EIO; 1443 } 1444 (void)local64_xchg(&event->hw.prev_count, ct); 1445 1446 return 0; 1447 } 1448 1449 static u64 h_24x7_get_value(struct perf_event *event) 1450 { 1451 u64 ct; 1452 1453 if (single_24x7_request(event, &ct)) 1454 /* We checked this in event init, shouldn't fail here... */ 1455 return 0; 1456 1457 return ct; 1458 } 1459 1460 static void update_event_count(struct perf_event *event, u64 now) 1461 { 1462 s64 prev; 1463 1464 prev = local64_xchg(&event->hw.prev_count, now); 1465 local64_add(now - prev, &event->count); 1466 } 1467 1468 static void h_24x7_event_read(struct perf_event *event) 1469 { 1470 u64 now; 1471 struct hv_24x7_request_buffer *request_buffer; 1472 struct hv_24x7_hw *h24x7hw; 1473 int txn_flags; 1474 1475 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1476 1477 /* 1478 * If in a READ transaction, add this counter to the list of 1479 * counters to read during the next HCALL (i.e commit_txn()). 1480 * If not in a READ transaction, go ahead and make the HCALL 1481 * to read this counter by itself. 1482 */ 1483 1484 if (txn_flags & PERF_PMU_TXN_READ) { 1485 int i; 1486 int ret; 1487 1488 if (__this_cpu_read(hv_24x7_txn_err)) 1489 return; 1490 1491 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1492 1493 ret = add_event_to_24x7_request(event, request_buffer); 1494 if (ret) { 1495 __this_cpu_write(hv_24x7_txn_err, ret); 1496 } else { 1497 /* 1498 * Associate the event with the HCALL request index, 1499 * so ->commit_txn() can quickly find/update count. 1500 */ 1501 i = request_buffer->num_requests - 1; 1502 1503 h24x7hw = &get_cpu_var(hv_24x7_hw); 1504 h24x7hw->events[i] = event; 1505 put_cpu_var(h24x7hw); 1506 } 1507 1508 put_cpu_var(hv_24x7_reqb); 1509 } else { 1510 now = h_24x7_get_value(event); 1511 update_event_count(event, now); 1512 } 1513 } 1514 1515 static void h_24x7_event_start(struct perf_event *event, int flags) 1516 { 1517 if (flags & PERF_EF_RELOAD) 1518 local64_set(&event->hw.prev_count, h_24x7_get_value(event)); 1519 } 1520 1521 static void h_24x7_event_stop(struct perf_event *event, int flags) 1522 { 1523 h_24x7_event_read(event); 1524 } 1525 1526 static int h_24x7_event_add(struct perf_event *event, int flags) 1527 { 1528 if (flags & PERF_EF_START) 1529 h_24x7_event_start(event, flags); 1530 1531 return 0; 1532 } 1533 1534 /* 1535 * 24x7 counters only support READ transactions. They are 1536 * always counting and dont need/support ADD transactions. 1537 * Cache the flags, but otherwise ignore transactions that 1538 * are not PERF_PMU_TXN_READ. 1539 */ 1540 static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags) 1541 { 1542 struct hv_24x7_request_buffer *request_buffer; 1543 struct hv_24x7_data_result_buffer *result_buffer; 1544 1545 /* We should not be called if we are already in a txn */ 1546 WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags)); 1547 1548 __this_cpu_write(hv_24x7_txn_flags, flags); 1549 if (flags & ~PERF_PMU_TXN_READ) 1550 return; 1551 1552 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1553 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1554 1555 init_24x7_request(request_buffer, result_buffer); 1556 1557 put_cpu_var(hv_24x7_resb); 1558 put_cpu_var(hv_24x7_reqb); 1559 } 1560 1561 /* 1562 * Clean up transaction state. 1563 * 1564 * NOTE: Ignore state of request and result buffers for now. 1565 * We will initialize them during the next read/txn. 1566 */ 1567 static void reset_txn(void) 1568 { 1569 __this_cpu_write(hv_24x7_txn_flags, 0); 1570 __this_cpu_write(hv_24x7_txn_err, 0); 1571 } 1572 1573 /* 1574 * 24x7 counters only support READ transactions. They are always counting 1575 * and dont need/support ADD transactions. Clear ->txn_flags but otherwise 1576 * ignore transactions that are not of type PERF_PMU_TXN_READ. 1577 * 1578 * For READ transactions, submit all pending 24x7 requests (i.e requests 1579 * that were queued by h_24x7_event_read()), to the hypervisor and update 1580 * the event counts. 1581 */ 1582 static int h_24x7_event_commit_txn(struct pmu *pmu) 1583 { 1584 struct hv_24x7_request_buffer *request_buffer; 1585 struct hv_24x7_data_result_buffer *result_buffer; 1586 struct hv_24x7_result *res, *next_res; 1587 u64 count; 1588 int i, ret, txn_flags; 1589 struct hv_24x7_hw *h24x7hw; 1590 1591 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1592 WARN_ON_ONCE(!txn_flags); 1593 1594 ret = 0; 1595 if (txn_flags & ~PERF_PMU_TXN_READ) 1596 goto out; 1597 1598 ret = __this_cpu_read(hv_24x7_txn_err); 1599 if (ret) 1600 goto out; 1601 1602 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1603 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1604 1605 ret = make_24x7_request(request_buffer, result_buffer); 1606 if (ret) 1607 goto put_reqb; 1608 1609 h24x7hw = &get_cpu_var(hv_24x7_hw); 1610 1611 /* Go through results in the result buffer to update event counts. */ 1612 for (i = 0, res = result_buffer->results; 1613 i < result_buffer->num_results; i++, res = next_res) { 1614 struct perf_event *event = h24x7hw->events[res->result_ix]; 1615 1616 ret = get_count_from_result(event, result_buffer, res, &count, 1617 &next_res); 1618 if (ret) 1619 break; 1620 1621 update_event_count(event, count); 1622 } 1623 1624 put_cpu_var(hv_24x7_hw); 1625 1626 put_reqb: 1627 put_cpu_var(hv_24x7_resb); 1628 put_cpu_var(hv_24x7_reqb); 1629 out: 1630 reset_txn(); 1631 return ret; 1632 } 1633 1634 /* 1635 * 24x7 counters only support READ transactions. They are always counting 1636 * and dont need/support ADD transactions. However, regardless of type 1637 * of transaction, all we need to do is cleanup, so we don't have to check 1638 * the type of transaction. 1639 */ 1640 static void h_24x7_event_cancel_txn(struct pmu *pmu) 1641 { 1642 WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags)); 1643 reset_txn(); 1644 } 1645 1646 static struct pmu h_24x7_pmu = { 1647 .task_ctx_nr = perf_invalid_context, 1648 1649 .name = "hv_24x7", 1650 .attr_groups = attr_groups, 1651 .event_init = h_24x7_event_init, 1652 .add = h_24x7_event_add, 1653 .del = h_24x7_event_stop, 1654 .start = h_24x7_event_start, 1655 .stop = h_24x7_event_stop, 1656 .read = h_24x7_event_read, 1657 .start_txn = h_24x7_event_start_txn, 1658 .commit_txn = h_24x7_event_commit_txn, 1659 .cancel_txn = h_24x7_event_cancel_txn, 1660 .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 1661 }; 1662 1663 static int ppc_hv_24x7_cpu_online(unsigned int cpu) 1664 { 1665 if (cpumask_empty(&hv_24x7_cpumask)) 1666 cpumask_set_cpu(cpu, &hv_24x7_cpumask); 1667 1668 return 0; 1669 } 1670 1671 static int ppc_hv_24x7_cpu_offline(unsigned int cpu) 1672 { 1673 int target; 1674 1675 /* Check if exiting cpu is used for collecting 24x7 events */ 1676 if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask)) 1677 return 0; 1678 1679 /* Find a new cpu to collect 24x7 events */ 1680 target = cpumask_last(cpu_active_mask); 1681 1682 if (target < 0 || target >= nr_cpu_ids) { 1683 pr_err("hv_24x7: CPU hotplug init failed\n"); 1684 return -1; 1685 } 1686 1687 /* Migrate 24x7 events to the new target */ 1688 cpumask_set_cpu(target, &hv_24x7_cpumask); 1689 perf_pmu_migrate_context(&h_24x7_pmu, cpu, target); 1690 1691 return 0; 1692 } 1693 1694 static int hv_24x7_cpu_hotplug_init(void) 1695 { 1696 return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, 1697 "perf/powerpc/hv_24x7:online", 1698 ppc_hv_24x7_cpu_online, 1699 ppc_hv_24x7_cpu_offline); 1700 } 1701 1702 static int hv_24x7_init(void) 1703 { 1704 int r; 1705 unsigned long hret; 1706 struct hv_perf_caps caps; 1707 1708 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 1709 pr_debug("not a virtualized system, not enabling\n"); 1710 return -ENODEV; 1711 } else if (!cur_cpu_spec->oprofile_cpu_type) 1712 return -ENODEV; 1713 1714 /* POWER8 only supports v1, while POWER9 only supports v2. */ 1715 if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) 1716 interface_version = 1; 1717 else { 1718 interface_version = 2; 1719 1720 /* SMT8 in POWER9 needs to aggregate result elements. */ 1721 if (threads_per_core == 8) 1722 aggregate_result_elements = true; 1723 } 1724 1725 hret = hv_perf_caps_get(&caps); 1726 if (hret) { 1727 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n", 1728 hret); 1729 return -ENODEV; 1730 } 1731 1732 hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL); 1733 if (!hv_page_cache) 1734 return -ENOMEM; 1735 1736 /* sampling not supported */ 1737 h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1738 1739 r = create_events_from_catalog(&event_group.attrs, 1740 &event_desc_group.attrs, 1741 &event_long_desc_group.attrs); 1742 1743 if (r) 1744 return r; 1745 1746 /* init cpuhotplug */ 1747 r = hv_24x7_cpu_hotplug_init(); 1748 if (r) 1749 return r; 1750 1751 r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1); 1752 if (r) 1753 return r; 1754 1755 read_24x7_sys_info(); 1756 1757 return 0; 1758 } 1759 1760 device_initcall(hv_24x7_init); 1761