// SPDX-License-Identifier: GPL-2.0
/*
 * Support for off-cpu profiling in 'perf record': loads a BPF skeleton
 * that accounts time spent off-cpu per (task, stack, cgroup), and later
 * converts the collected map entries into synthetic PERF_RECORD_SAMPLEs
 * appended to the perf.data file.
 */
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/record.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/cgroup.h"
#include <bpf/bpf.h>

#include "bpf_skel/off_cpu.skel.h"

#define MAX_STACKS 32
/* we don't need actual timestamp, just want to put the samples at last */
#define OFF_CPU_TIMESTAMP (~0ull << 32)

/* global handle to the loaded BPF skeleton; set up in off_cpu_prepare() */
static struct off_cpu_bpf *skel;

/*
 * Key of the skel->maps.off_cpu BPF map: one accumulated off-cpu value per
 * (thread, stack, state, cgroup) tuple.
 * NOTE(review): layout must presumably mirror the struct used by the BPF
 * program in bpf_skel/off_cpu.bpf.c — verify when changing fields.
 */
struct off_cpu_key {
	u32 pid;
	u32 tgid;
	u32 stack_id;	/* index into skel->maps.stacks */
	u32 state;	/* scheduler state of the task; exact semantics on BPF side */
	u64 cgroup_id;
};

/*
 * Scratch buffer used to build one synthetic sample record: the first u64
 * overlays the perf_event_header, the rest is the sample payload.
 */
union off_cpu_data {
	struct perf_event_header hdr;
	u64 array[1024 / sizeof(u64)];
};

/*
 * Add a software BPF-output event named OFFCPU_EVENT to the evlist so the
 * synthesized off-cpu samples have an evsel to attach to in the report.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int off_cpu_config(struct evlist *evlist)
{
	struct evsel *evsel;
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_BPF_OUTPUT,
		.size	= sizeof(attr), /* to capture ABI version */
	};
	char *evname = strdup(OFFCPU_EVENT);

	if (evname == NULL)
		return -ENOMEM;

	evsel = evsel__new(&attr);
	if (!evsel) {
		free(evname);
		return -ENOMEM;
	}

	evsel->core.attr.freq = 1;
	evsel->core.attr.sample_period = 1;
	/* off-cpu analysis depends on stack trace */
	evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;

	evlist__add(evlist, evsel);

	/* replace the auto-generated name with our well-known event name */
	free(evsel->name);
	evsel->name = evname;

	return 0;
}

/*
 * "record_start" perf hook: install the forked workload's pid into the BPF
 * task filter (if no explicit cpu/task filter was set up earlier) and turn
 * on collection in the BPF program.
 */
static void off_cpu_start(void *arg)
{
	struct evlist *evlist = arg;

	/* update task filter for the given workload */
	if (!skel->bss->has_cpu && !skel->bss->has_task &&
	    perf_thread_map__pid(evlist->core.threads, 0) != -1) {
		int fd;
		u32 pid;
		u8 val = 1;

		skel->bss->has_task = 1;
		fd = bpf_map__fd(skel->maps.task_filter);
		pid = perf_thread_map__pid(evlist->core.threads, 0);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	skel->bss->enabled = 1;
}

/*
 * "record_end" perf hook: stop collection and tear down the skeleton.
 * NOTE(review): off_cpu_write() also clears 'enabled' and reads the maps;
 * presumably it runs before this hook fires — confirm against builtin-record.
 */
static void off_cpu_finish(void *arg __maybe_unused)
{
	skel->bss->enabled = 0;
	off_cpu_bpf__destroy(skel);
}

/* v5.18 kernel added prev_state arg, so it needs to check the signature */
static void check_sched_switch_args(void)
{
	const struct btf *btf = bpf_object__btf(skel->obj);
	const struct btf_type *t1, *t2, *t3;
	u32 type_id;

	type_id = btf__find_by_name_kind(btf, "bpf_trace_sched_switch",
					 BTF_KIND_TYPEDEF);
	if ((s32)type_id < 0)
		return;

	/* walk typedef -> pointer -> function prototype */
	t1 = btf__type_by_id(btf, type_id);
	if (t1 == NULL)
		return;

	t2 = btf__type_by_id(btf, t1->type);
	if (t2 == NULL || !btf_is_ptr(t2))
		return;

	t3 = btf__type_by_id(btf, t2->type);
	if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 4) {
		/* new format: pass prev_state as 4th arg */
		skel->rodata->has_prev_state = true;
	}
}

/*
 * Open, configure, load and attach the off-cpu BPF skeleton, sizing and
 * populating the cpu/task/cgroup filter maps from the record target, and
 * register the record_start/record_end hooks.
 * Returns 0 on success, -1 on any failure (skeleton destroyed on error).
 */
int off_cpu_prepare(struct evlist *evlist, struct target *target,
		    struct record_opts *opts)
{
	int err, fd, i;
	int ncpus = 1, ntasks = 1, ncgrps = 1;

	if (off_cpu_config(evlist) < 0) {
		pr_err("Failed to config off-cpu BPF event\n");
		return -1;
	}

	skel = off_cpu_bpf__open();
	if (!skel) {
		pr_err("Failed to open off-cpu BPF skeleton\n");
		return -1;
	}

	/*
	 * Map max_entries must be set before load; the maps are filled in
	 * after off_cpu_bpf__load() below.
	 */
	/* don't need to set cpu filter for system-wide mode */
	if (target->cpu_list) {
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
		bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
	}

	if (target__has_task(target)) {
		ntasks = perf_thread_map__nr(evlist->core.threads);
		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
	}

	if (evlist__first(evlist)->cgrp) {
		ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
		bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
	}

	if (opts->record_cgroup) {
		skel->rodata->needs_cgroup = true;

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
	}

	set_max_rlimit();
	check_sched_switch_args();

	err = off_cpu_bpf__load(skel);
	if (err) {
		pr_err("Failed to load off-cpu skeleton\n");
		goto out;
	}

	/* populate the cpu filter with the requested cpus */
	if (target->cpu_list) {
		u32 cpu;
		u8 val = 1;

		skel->bss->has_cpu = 1;
		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	/* populate the task filter with the target threads */
	if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		skel->bss->has_task = 1;
		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	/* populate the cgroup filter from each evsel's cgroup */
	if (evlist__first(evlist)->cgrp) {
		struct evsel *evsel;
		u8 val = 1;

		skel->bss->has_cgroup = 1;
		fd = bpf_map__fd(skel->maps.cgroup_filter);

		evlist__for_each_entry(evlist, evsel) {
			struct cgroup *cgrp = evsel->cgrp;

			if (cgrp == NULL)
				continue;

			/* lazily resolve the cgroup id if not read yet */
			if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
				pr_err("Failed to read cgroup id of %s\n",
				       cgrp->name);
				goto out;
			}

			bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
		}
	}

	err = off_cpu_bpf__attach(skel);
	if (err) {
		pr_err("Failed to attach off-cpu BPF skeleton\n");
		goto out;
	}

	if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
		pr_err("Failed to attach off-cpu skeleton\n");
		goto out;
	}

	return 0;

out:
	off_cpu_bpf__destroy(skel);
	return -1;
}

/*
 * Drain the BPF off_cpu map and synthesize one PERF_RECORD_SAMPLE per key
 * directly into the perf.data file.  The sample payload is laid out field
 * by field according to the evsel's sample_type flag order, so it must stay
 * in sync with what the ABI (and perf's sample parser) expects.
 * Returns the number of bytes written (possibly partial on write error).
 */
int off_cpu_write(struct perf_session *session)
{
	int bytes = 0, size;
	int fd, stack;
	u64 sample_type, val, sid = 0;
	struct evsel *evsel;
	struct perf_data_file *file = &session->data->file;
	struct off_cpu_key prev, key;
	union off_cpu_data data = {
		.hdr = {
			.type = PERF_RECORD_SAMPLE,
			.misc = PERF_RECORD_MISC_USER,
		},
	};
	u64 tstamp = OFF_CPU_TIMESTAMP;

	/* stop the BPF program from updating the maps while we read them */
	skel->bss->enabled = 0;

	evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return 0;
	}

	sample_type = evsel->core.attr.sample_type;

	/* use the first sample id for all synthesized records */
	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
		if (evsel->core.id)
			sid = evsel->core.id[0];
	}

	fd = bpf_map__fd(skel->maps.off_cpu);
	stack = bpf_map__fd(skel->maps.stacks);
	memset(&prev, 0, sizeof(prev));

	/* iterate all keys in the off_cpu map */
	while (!bpf_map_get_next_key(fd, &prev, &key)) {
		int n = 1; /* start from perf_event_header */
		int ip_pos = -1;

		bpf_map_lookup_elem(fd, &key, &val);

		if (sample_type & PERF_SAMPLE_IDENTIFIER)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_IP) {
			ip_pos = n;
			data.array[n++] = 0; /* will be updated */
		}
		if (sample_type & PERF_SAMPLE_TID)
			data.array[n++] = (u64)key.pid << 32 | key.tgid;
		if (sample_type & PERF_SAMPLE_TIME)
			data.array[n++] = tstamp;
		if (sample_type & PERF_SAMPLE_ID)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_CPU)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_PERIOD)
			data.array[n++] = val;	/* accumulated off-cpu time */
		if (sample_type & PERF_SAMPLE_CALLCHAIN) {
			int len = 0;

			/* data.array[n] is callchain->nr (updated later) */
			data.array[n + 1] = PERF_CONTEXT_USER;
			data.array[n + 2] = 0;

			/* copy the stack and count its (zero-terminated) entries */
			bpf_map_lookup_elem(stack, &key.stack_id, &data.array[n + 2]);
			while (data.array[n + 2 + len])
				len++;

			/* update length of callchain */
			data.array[n] = len + 1;

			/* update sample ip with the first callchain entry */
			if (ip_pos >= 0)
				data.array[ip_pos] = data.array[n + 2];

			/* calculate sample callchain data array length */
			n += len + 2;
		}
		if (sample_type & PERF_SAMPLE_CGROUP)
			data.array[n++] = key.cgroup_id;
		/* TODO: handle more sample types */

		size = n * sizeof(u64);
		data.hdr.size = size;
		bytes += size;

		if (perf_data_file__write(file, &data, size) < 0) {
			pr_err("failed to write perf data, error: %m\n");
			return bytes;
		}

		prev = key;
		/* increase dummy timestamp to sort later samples */
		tstamp++;
	}
	return bytes;
}