// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global data will be modified before loading
//       from userspace (the perf tool) using the skeleton helpers.

// single set of global perf events to measure
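// indexed by (event index * num_cpus + cpu); resized by the perf tool
// before loading (see NOTE above)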
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to cgroup index
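// only cgroups selected for monitoring have an entry here; lookups for
// other cgroups fail and those levels are skipped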
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
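// (one entry per event, keyed by the event index)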
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from user space
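// indexed by (cgroup index * num_events + event index)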
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

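// writable globals set from the perf tool (see NOTE above);
// perf_subsys_id is resolved lazily in get_cgroup_v1_idx() below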
int enabled = 0;
int use_cgroup_v2 = 0;
int perf_subsys_id = -1;

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

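	// resolve the perf_event cgroup subsystem id once, using a CO-RE
	// enum value relocation when the compiler supports it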
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

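	// walk from the root down to the task's own cgroup level, recording
	// the map index of every ancestor that is being monitored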
	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

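	// walk the ancestors of the current task's cgroup on the default
	// (v2) hierarchy; the helper returns 0 once past the deepest level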
	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static int bperf_cgroup_count(void)
{
	register __u32 idx = 0;  // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

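	// collect the map indices of all monitored cgroups the current task
	// belongs to (the task's own cgroup and its ancestors)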
	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

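		// while counting is enabled, add the delta since the previous
		// snapshot to every monitored cgroup of the current task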
		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

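		// always save the current reading as the next snapshot, even
		// while counting is disabled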
		*prev_val = val;
	}
	return 0;
}

// This will be attached to the cgroup-switches event on each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

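// The perf tool can also run this program on demand (typically via
// BPF_PROG_TEST_RUN on each cpu) to flush the latest deltas before
// reading cgrp_readings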
194 SEC("raw_tp/sched_switch")
195 int BPF_PROG(trigger_read)
196 {
197 	return bperf_cgroup_count();
198 }
199 
200 char LICENSE[] SEC("license") = "Dual BSD/GPL";