1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 // Copyright (c) 2021 Facebook
3 // Copyright (c) 2021 Google
4 #include "vmlinux.h"
5 #include <bpf/bpf_helpers.h>
6 #include <bpf/bpf_tracing.h>
7 #include <bpf/bpf_core_read.h>
8 
9 #define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
10 #define MAX_EVENTS  32  // max events per cgroup: arbitrary
11 
// NOTE: many of the maps and global data will be modified from userspace
//       (the perf tool) using the skeleton helpers before loading.
14 
// single set of global perf events to measure
//
// Read in bperf_cgroup_count() with key = event index * num_cpus + cpu.
// NOTE(review): max_entries is only 1 here; presumably it is resized from
// userspace before loading (see the NOTE near the top of the file) - confirm
// against the loader.
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");
22 
// from cgroup id to event index
//
// key:   64-bit cgroup id
// value: 32-bit index; bperf_cgroup_count() uses it as the cgroup part of
//        the cgrp_readings key (index * num_events + event index)
// NOTE(review): max_entries is only 1 here; presumably it is resized from
// userspace before loading - confirm against the loader.
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");
30 
// per-cpu event snapshots to calculate delta
//
// key:   event index (0 .. num_events - 1)
// value: the last struct bpf_perf_event_value stored by bperf_cgroup_count()
// NOTE(review): no max_entries is given here; presumably it is set from
// userspace before loading - confirm against the loader.
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");
37 
// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
//
// key:   cgroup index * num_events + event index
// value: accumulated counter/enabled/running deltas
// NOTE(review): no max_entries is given here; presumably it is set from
// userspace before loading - confirm against the loader.
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
45 
/*
 * new kernel cgroup definition
 *
 * Minimal view of struct cgroup declaring only the fields this file reads.
 * Field accesses are recorded for relocation via preserve_access_index.
 * In this layout the ancestors are stored as cgroup pointers indexed by
 * hierarchy level; get_cgroup_v1_ancestor_id() probes for this field to
 * decide which layout the running kernel has.
 */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));
51 
/*
 * old kernel cgroup definition
 *
 * Minimal view of struct cgroup declaring only the fields this file reads.
 * Field accesses are recorded for relocation via preserve_access_index.
 * In this layout the ancestor ids are stored directly as 64-bit values
 * indexed by hierarchy level; used by get_cgroup_v1_ancestor_id() when the
 * 'ancestors' field of the new layout does not exist.
 */
struct cgroup___old {
	int level;
	u64 ancestor_ids[];
} __attribute__((preserve_access_index));
57 
// number of events handled per invocation; the event loop in
// bperf_cgroup_count() stops at this value (and never exceeds MAX_EVENTS)
const volatile __u32 num_events = 1;
// stride for the events map: key = event index * num_cpus + current cpu
const volatile __u32 num_cpus = 1;

// when zero, snapshots are still refreshed but nothing is accumulated
// into cgrp_readings
int enabled = 0;
// non-zero selects get_cgroup_v2_idx(), zero selects get_cgroup_v1_idx()
int use_cgroup_v2 = 0;
// cached perf_event subsystem id; -1 means "not resolved yet" and makes
// get_cgroup_v1_idx() resolve it on first use
int perf_subsys_id = -1;
64 
65 static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
66 {
67 	/* recast pointer to capture new type for compiler */
68 	struct cgroup___new *cgrp_new = (void *)cgrp;
69 
70 	if (bpf_core_field_exists(cgrp_new->ancestors)) {
71 		return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
72 	} else {
73 		/* recast pointer to capture old type for compiler */
74 		struct cgroup___old *cgrp_old = (void *)cgrp;
75 
76 		return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
77 	}
78 }
79 
/*
 * Collect the map indexes of the current task's perf_event cgroup and its
 * ancestors (cgroup v1 path).
 *
 * @cgrps: output array receiving the indexes found in the cgrp_idx map
 * @size:  capacity of @cgrps; the walk stops once this many are stored
 *
 * Walks hierarchy levels 0..level of the task's cgroup, bounded by
 * MAX_LEVELS iterations.  Levels whose cgroup id has no entry in the
 * cgrp_idx map are skipped.
 *
 * Returns the number of entries written to @cgrps.
 */
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	// NOTE(review): 'register' is presumably here for the BPF verifier, as
	// commented in bperf_cgroup_count() - confirm before changing
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	// resolve the perf_event subsystem id once and cache it in the global
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	// cgroup of the current task for the perf_event subsystem, and its depth
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		// level is inclusive: index 'level' is still visited
		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;	// this level has no entry in the map

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;		// output array is full
	}

	return cnt;
}
119 
/*
 * Collect the map indexes of the current task's cgroup ancestors
 * (cgroup v2 path).
 *
 * @cgrps: output array receiving the indexes found in the cgrp_idx map
 * @size:  capacity of @cgrps; the walk stops once this many are stored
 *
 * Queries ancestor levels 0, 1, ... (at most MAX_LEVELS) with
 * bpf_get_current_ancestor_cgroup_id() and stops at the first level that
 * yields id 0.  Levels whose cgroup id has no entry in the cgrp_idx map
 * are skipped.
 *
 * Returns the number of entries written to @cgrps.
 */
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	// NOTE(review): 'register' is presumably here for the BPF verifier, as
	// commented in bperf_cgroup_count() - confirm before changing
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		// an id of 0 ends the walk
		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;	// this level has no entry in the map

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;		// output array is full
	}

	return cnt;
}
144 
/*
 * Account the perf event values of the current CPU to the cgroups of the
 * current task.
 *
 * For each event index below num_events (at most MAX_EVENTS):
 *   1. fetch the previous snapshot for this event from prev_readings,
 *   2. read the current value from the events map,
 *   3. if 'enabled' is set, add the difference to cgrp_readings for every
 *      cgroup index collected for the current task,
 *   4. store the current value as the new snapshot.
 *
 * Always returns 0.
 */
static int bperf_cgroup_count(void)
{
	register __u32 idx = 0;  // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	// NOTE: this local array shadows the global cgrp_idx map inside
	// this function
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	// collect the cgroup indexes to charge (v1 or v2 hierarchy walk)
	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	// constant bound MAX_EVENTS on the loop, real limit checked inside
	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			// no snapshot found: try to store a zeroed one and look it
			// up again; give up on this event if it is still missing
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;	// read failed: keep the old snapshot untouched

		if (enabled) {
			// change since the previous snapshot on this CPU
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			// constant bound MAX_LEVELS on the loop, real limit
			// (cgrp_cnt) checked inside
			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					// no existing entry: try to store the delta
					// as the initial value
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		// remember the current value even when accumulation is disabled
		*prev_val = val;
	}
	return 0;
}
213 
// This will be attached to cgroup-switches event for each cpu
//
// Runs the per-cgroup accounting in bperf_cgroup_count() and returns its
// result.
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}
220 
// Program placed in the "raw_tp/sched_switch" section; runs the same
// accounting as on_cgrp_switch and returns its result.
// NOTE(review): presumably lets userspace force an update on demand -
// confirm against the loader.
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}
226 
// license string emitted into the "license" section of the object
char LICENSE[] SEC("license") = "Dual BSD/GPL";
228