1944138f0SNamhyung Kim // SPDX-License-Identifier: GPL-2.0
2944138f0SNamhyung Kim 
3944138f0SNamhyung Kim /* Copyright (c) 2021 Facebook */
4944138f0SNamhyung Kim /* Copyright (c) 2021 Google */
5944138f0SNamhyung Kim 
6944138f0SNamhyung Kim #include <assert.h>
7944138f0SNamhyung Kim #include <limits.h>
8944138f0SNamhyung Kim #include <unistd.h>
9944138f0SNamhyung Kim #include <sys/file.h>
10944138f0SNamhyung Kim #include <sys/time.h>
11944138f0SNamhyung Kim #include <sys/resource.h>
12944138f0SNamhyung Kim #include <linux/err.h>
13944138f0SNamhyung Kim #include <linux/zalloc.h>
14944138f0SNamhyung Kim #include <linux/perf_event.h>
15944138f0SNamhyung Kim #include <api/fs/fs.h>
16944138f0SNamhyung Kim #include <perf/bpf_perf.h>
17944138f0SNamhyung Kim 
18944138f0SNamhyung Kim #include "affinity.h"
19944138f0SNamhyung Kim #include "bpf_counter.h"
20944138f0SNamhyung Kim #include "cgroup.h"
21944138f0SNamhyung Kim #include "counts.h"
22944138f0SNamhyung Kim #include "debug.h"
23944138f0SNamhyung Kim #include "evsel.h"
24944138f0SNamhyung Kim #include "evlist.h"
25944138f0SNamhyung Kim #include "target.h"
26944138f0SNamhyung Kim #include "cpumap.h"
27944138f0SNamhyung Kim #include "thread_map.h"
28944138f0SNamhyung Kim 
29944138f0SNamhyung Kim #include "bpf_skel/bperf_cgroup.skel.h"
30944138f0SNamhyung Kim 
/*
 * Software event that fires on every cgroup switch; it is the attach
 * point for the BPF program that snapshots counters at switch time.
 */
static struct perf_event_attr cgrp_switch_attr = {
	.type = PERF_TYPE_SOFTWARE,
	.config = PERF_COUNT_SW_CGROUP_SWITCHES,
	.size = sizeof(cgrp_switch_attr),
	.sample_period = 1,
	.disabled = 1,
};

/* evsel wrapping the cgroup-switch event, opened on every cpu */
static struct evsel *cgrp_switch;
/* single shared BPF skeleton, loaded once for the whole evlist */
static struct bperf_cgroup_bpf *skel;

/* perf_event fd of @evt at cpu index @cpu (thread index 0) */
#define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))
43944138f0SNamhyung Kim 
/*
 * Open, size and load the bperf cgroup skeleton, attach its switch
 * program to a PERF_COUNT_SW_CGROUP_SWITCHES event on each cpu, open
 * one cgroup-less copy of each distinct event, and populate the BPF
 * maps (perf_event fds per cpu, cgroup-id -> index).
 *
 * Returns 0 on success, a negative value on failure.
 */
static int bperf_load_program(struct evlist *evlist)
{
	struct bpf_link *link;
	struct evsel *evsel;
	struct cgroup *cgrp, *leader_cgrp;
	__u32 i, cpu;
	__u32 nr_cpus = evlist->core.all_cpus->nr;
	int total_cpus = cpu__max_cpu();
	int map_size, map_fd;
	int prog_fd, err;

	skel = bperf_cgroup_bpf__open();
	if (!skel) {
		pr_err("Failed to open cgroup skeleton\n");
		return -1;
	}

	/* the evlist holds nr_cgroups copies of each event */
	skel->rodata->num_cpus = total_cpus;
	skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups;

	BUG_ON(evlist->core.nr_entries % nr_cgroups != 0);

	/* we need one copy of events per cpu for reading */
	map_size = total_cpus * evlist->core.nr_entries / nr_cgroups;
	bpf_map__resize(skel->maps.events, map_size);
	bpf_map__resize(skel->maps.cgrp_idx, nr_cgroups);
	/* previous result is saved in a per-cpu array */
	map_size = evlist->core.nr_entries / nr_cgroups;
	bpf_map__resize(skel->maps.prev_readings, map_size);
	/* cgroup result needs all events (per-cpu) */
	map_size = evlist->core.nr_entries;
	bpf_map__resize(skel->maps.cgrp_readings, map_size);

	/* raise RLIMIT_MEMLOCK so the maps above can be created */
	set_max_rlimit();

	err = bperf_cgroup_bpf__load(skel);
	if (err) {
		pr_err("Failed to load cgroup skeleton\n");
		goto out;
	}

	if (cgroup_is_v2("perf_event") > 0)
		skel->bss->use_cgroup_v2 = 1;

	err = -1;

	/* open the switch event on every cpu and attach the BPF prog to it */
	cgrp_switch = evsel__new(&cgrp_switch_attr);
	if (evsel__open_per_cpu(cgrp_switch, evlist->core.all_cpus, -1) < 0) {
		pr_err("Failed to open cgroup switches event\n");
		goto out;
	}

	for (i = 0; i < nr_cpus; i++) {
		link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch,
						      FD(cgrp_switch, i));
		if (IS_ERR(link)) {
			pr_err("Failed to attach cgroup program\n");
			err = PTR_ERR(link);
			goto out;
		}
	}

	/*
	 * Update cgrp_idx map from cgroup-id to event index.
	 */
	cgrp = NULL;
	i = 0;

	evlist__for_each_entry(evlist, evsel) {
		/*
		 * Only events belonging to the first cgroup are opened as
		 * real perf events (with the cgroup temporarily cleared);
		 * all other cgroups' copies are read from the BPF maps.
		 */
		if (cgrp == NULL || evsel->cgrp == leader_cgrp) {
			leader_cgrp = evsel->cgrp;
			evsel->cgrp = NULL;

			/* open single copy of the events w/o cgroup */
			err = evsel__open_per_cpu(evsel, evlist->core.all_cpus, -1);
			if (err) {
				pr_err("Failed to open first cgroup events\n");
				goto out;
			}

			/* register each cpu's fd in the events map */
			map_fd = bpf_map__fd(skel->maps.events);
			for (cpu = 0; cpu < nr_cpus; cpu++) {
				int fd = FD(evsel, cpu);
				__u32 idx = evsel->core.idx * total_cpus +
					evlist->core.all_cpus->map[cpu];

				err = bpf_map_update_elem(map_fd, &idx, &fd,
							  BPF_ANY);
				if (err < 0) {
					pr_err("Failed to update perf_event fd\n");
					goto out;
				}
			}

			evsel->cgrp = leader_cgrp;
		}
		evsel->supported = true;

		/* one cgrp_idx entry per distinct cgroup */
		if (evsel->cgrp == cgrp)
			continue;

		cgrp = evsel->cgrp;

		if (read_cgroup_id(cgrp) < 0) {
			pr_err("Failed to get cgroup id\n");
			err = -1;
			goto out;
		}

		map_fd = bpf_map__fd(skel->maps.cgrp_idx);
		err = bpf_map_update_elem(map_fd, &cgrp->id, &i, BPF_ANY);
		if (err < 0) {
			pr_err("Failed to update cgroup index map\n");
			goto out;
		}

		i++;
	}

	/*
	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
	 * whether the kernel support it
	 */
	prog_fd = bpf_program__fd(skel->progs.trigger_read);
	err = bperf_trigger_reading(prog_fd, 0);
	if (err) {
		pr_warning("The kernel does not support test_run for raw_tp BPF programs.\n"
			   "Therefore, --for-each-cgroup might show inaccurate readings\n");
		err = 0;
	}

	/*
	 * NOTE(review): on failure, the loaded skeleton/links are not freed
	 * here — presumably bperf_cgrp__destroy() handles teardown; confirm
	 * error paths don't leak the skeleton.
	 */
out:
	return err;
}
178944138f0SNamhyung Kim 
179944138f0SNamhyung Kim static int bperf_cgrp__load(struct evsel *evsel,
180944138f0SNamhyung Kim 			    struct target *target __maybe_unused)
181944138f0SNamhyung Kim {
182944138f0SNamhyung Kim 	static bool bperf_loaded = false;
183944138f0SNamhyung Kim 
184944138f0SNamhyung Kim 	evsel->bperf_leader_prog_fd = -1;
185944138f0SNamhyung Kim 	evsel->bperf_leader_link_fd = -1;
186944138f0SNamhyung Kim 
187944138f0SNamhyung Kim 	if (!bperf_loaded && bperf_load_program(evsel->evlist))
188944138f0SNamhyung Kim 		return -1;
189944138f0SNamhyung Kim 
190944138f0SNamhyung Kim 	bperf_loaded = true;
191944138f0SNamhyung Kim 	/* just to bypass bpf_counter_skip() */
192944138f0SNamhyung Kim 	evsel->follower_skel = (struct bperf_follower_bpf *)skel;
193944138f0SNamhyung Kim 
194944138f0SNamhyung Kim 	return 0;
195944138f0SNamhyung Kim }
196944138f0SNamhyung Kim 
197944138f0SNamhyung Kim static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused,
198944138f0SNamhyung Kim 				  int cpu __maybe_unused, int fd __maybe_unused)
199944138f0SNamhyung Kim {
200944138f0SNamhyung Kim 	/* nothing to do */
201944138f0SNamhyung Kim 	return 0;
202944138f0SNamhyung Kim }
203944138f0SNamhyung Kim 
204944138f0SNamhyung Kim /*
205944138f0SNamhyung Kim  * trigger the leader prog on each cpu, so the cgrp_reading map could get
206944138f0SNamhyung Kim  * the latest results.
207944138f0SNamhyung Kim  */
208944138f0SNamhyung Kim static int bperf_cgrp__sync_counters(struct evlist *evlist)
209944138f0SNamhyung Kim {
210944138f0SNamhyung Kim 	int i, cpu;
211944138f0SNamhyung Kim 	int nr_cpus = evlist->core.all_cpus->nr;
212944138f0SNamhyung Kim 	int prog_fd = bpf_program__fd(skel->progs.trigger_read);
213944138f0SNamhyung Kim 
214944138f0SNamhyung Kim 	for (i = 0; i < nr_cpus; i++) {
215944138f0SNamhyung Kim 		cpu = evlist->core.all_cpus->map[i];
216944138f0SNamhyung Kim 		bperf_trigger_reading(prog_fd, cpu);
217944138f0SNamhyung Kim 	}
218944138f0SNamhyung Kim 
219944138f0SNamhyung Kim 	return 0;
220944138f0SNamhyung Kim }
221944138f0SNamhyung Kim 
222944138f0SNamhyung Kim static int bperf_cgrp__enable(struct evsel *evsel)
223944138f0SNamhyung Kim {
224*38fe0e01SJiri Olsa 	if (evsel->core.idx)
225944138f0SNamhyung Kim 		return 0;
226944138f0SNamhyung Kim 
227944138f0SNamhyung Kim 	bperf_cgrp__sync_counters(evsel->evlist);
228944138f0SNamhyung Kim 
229944138f0SNamhyung Kim 	skel->bss->enabled = 1;
230944138f0SNamhyung Kim 	return 0;
231944138f0SNamhyung Kim }
232944138f0SNamhyung Kim 
233944138f0SNamhyung Kim static int bperf_cgrp__disable(struct evsel *evsel)
234944138f0SNamhyung Kim {
235*38fe0e01SJiri Olsa 	if (evsel->core.idx)
236944138f0SNamhyung Kim 		return 0;
237944138f0SNamhyung Kim 
238944138f0SNamhyung Kim 	bperf_cgrp__sync_counters(evsel->evlist);
239944138f0SNamhyung Kim 
240944138f0SNamhyung Kim 	skel->bss->enabled = 0;
241944138f0SNamhyung Kim 	return 0;
242944138f0SNamhyung Kim }
243944138f0SNamhyung Kim 
244944138f0SNamhyung Kim static int bperf_cgrp__read(struct evsel *evsel)
245944138f0SNamhyung Kim {
246944138f0SNamhyung Kim 	struct evlist *evlist = evsel->evlist;
247944138f0SNamhyung Kim 	int i, cpu, nr_cpus = evlist->core.all_cpus->nr;
248944138f0SNamhyung Kim 	int total_cpus = cpu__max_cpu();
249944138f0SNamhyung Kim 	struct perf_counts_values *counts;
250944138f0SNamhyung Kim 	struct bpf_perf_event_value *values;
251944138f0SNamhyung Kim 	int reading_map_fd, err = 0;
252944138f0SNamhyung Kim 	__u32 idx;
253944138f0SNamhyung Kim 
254*38fe0e01SJiri Olsa 	if (evsel->core.idx)
255944138f0SNamhyung Kim 		return 0;
256944138f0SNamhyung Kim 
257944138f0SNamhyung Kim 	bperf_cgrp__sync_counters(evsel->evlist);
258944138f0SNamhyung Kim 
259944138f0SNamhyung Kim 	values = calloc(total_cpus, sizeof(*values));
260944138f0SNamhyung Kim 	if (values == NULL)
261944138f0SNamhyung Kim 		return -ENOMEM;
262944138f0SNamhyung Kim 
263944138f0SNamhyung Kim 	reading_map_fd = bpf_map__fd(skel->maps.cgrp_readings);
264944138f0SNamhyung Kim 
265944138f0SNamhyung Kim 	evlist__for_each_entry(evlist, evsel) {
266*38fe0e01SJiri Olsa 		idx = evsel->core.idx;
267944138f0SNamhyung Kim 		err = bpf_map_lookup_elem(reading_map_fd, &idx, values);
268944138f0SNamhyung Kim 		if (err) {
269944138f0SNamhyung Kim 			pr_err("bpf map lookup falied: idx=%u, event=%s, cgrp=%s\n",
270944138f0SNamhyung Kim 			       idx, evsel__name(evsel), evsel->cgrp->name);
271944138f0SNamhyung Kim 			goto out;
272944138f0SNamhyung Kim 		}
273944138f0SNamhyung Kim 
274944138f0SNamhyung Kim 		for (i = 0; i < nr_cpus; i++) {
275944138f0SNamhyung Kim 			cpu = evlist->core.all_cpus->map[i];
276944138f0SNamhyung Kim 
277944138f0SNamhyung Kim 			counts = perf_counts(evsel->counts, i, 0);
278944138f0SNamhyung Kim 			counts->val = values[cpu].counter;
279944138f0SNamhyung Kim 			counts->ena = values[cpu].enabled;
280944138f0SNamhyung Kim 			counts->run = values[cpu].running;
281944138f0SNamhyung Kim 		}
282944138f0SNamhyung Kim 	}
283944138f0SNamhyung Kim 
284944138f0SNamhyung Kim out:
285944138f0SNamhyung Kim 	free(values);
286944138f0SNamhyung Kim 	return err;
287944138f0SNamhyung Kim }
288944138f0SNamhyung Kim 
289944138f0SNamhyung Kim static int bperf_cgrp__destroy(struct evsel *evsel)
290944138f0SNamhyung Kim {
291*38fe0e01SJiri Olsa 	if (evsel->core.idx)
292944138f0SNamhyung Kim 		return 0;
293944138f0SNamhyung Kim 
294944138f0SNamhyung Kim 	bperf_cgroup_bpf__destroy(skel);
295944138f0SNamhyung Kim 	evsel__delete(cgrp_switch);  // it'll destroy on_switch progs too
296944138f0SNamhyung Kim 
297944138f0SNamhyung Kim 	return 0;
298944138f0SNamhyung Kim }
299944138f0SNamhyung Kim 
/* bpf_counter callbacks for per-cgroup counting (--for-each-cgroup) */
struct bpf_counter_ops bperf_cgrp_ops = {
	.load       = bperf_cgrp__load,
	.enable     = bperf_cgrp__enable,
	.disable    = bperf_cgrp__disable,
	.read       = bperf_cgrp__read,
	.install_pe = bperf_cgrp__install_pe,
	.destroy    = bperf_cgrp__destroy,
};
308