// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global data will be modified before loading
// from userspace (the perf tool) using the skeleton helpers.

// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to event index
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

int enabled = 0;
int use_cgroup_v2 = 0;
int perf_subsys_id = -1;

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static int bperf_cgroup_count(void)
{
	register __u32 idx = 0; // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is a per-cpu array for the diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from the global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		*prev_val = val;
	}
	return 0;
}

// This will be attached to the cgroup-switches event on each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";