1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 // Copyright (c) 2022 Google 3 #include "vmlinux.h" 4 #include <bpf/bpf_helpers.h> 5 #include <bpf/bpf_tracing.h> 6 #include <bpf/bpf_core_read.h> 7 8 /* task->flags for off-cpu analysis */ 9 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 10 11 /* task->state for off-cpu analysis */ 12 #define TASK_INTERRUPTIBLE 0x0001 13 #define TASK_UNINTERRUPTIBLE 0x0002 14 15 /* create a new thread */ 16 #define CLONE_THREAD 0x10000 17 18 #define MAX_STACKS 32 19 #define MAX_ENTRIES 102400 20 21 struct tstamp_data { 22 __u32 stack_id; 23 __u32 state; 24 __u64 timestamp; 25 }; 26 27 struct offcpu_key { 28 __u32 pid; 29 __u32 tgid; 30 __u32 stack_id; 31 __u32 state; 32 __u64 cgroup_id; 33 }; 34 35 struct { 36 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 37 __uint(key_size, sizeof(__u32)); 38 __uint(value_size, MAX_STACKS * sizeof(__u64)); 39 __uint(max_entries, MAX_ENTRIES); 40 } stacks SEC(".maps"); 41 42 struct { 43 __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 44 __uint(map_flags, BPF_F_NO_PREALLOC); 45 __type(key, int); 46 __type(value, struct tstamp_data); 47 } tstamp SEC(".maps"); 48 49 struct { 50 __uint(type, BPF_MAP_TYPE_HASH); 51 __uint(key_size, sizeof(struct offcpu_key)); 52 __uint(value_size, sizeof(__u64)); 53 __uint(max_entries, MAX_ENTRIES); 54 } off_cpu SEC(".maps"); 55 56 struct { 57 __uint(type, BPF_MAP_TYPE_HASH); 58 __uint(key_size, sizeof(__u32)); 59 __uint(value_size, sizeof(__u8)); 60 __uint(max_entries, 1); 61 } cpu_filter SEC(".maps"); 62 63 struct { 64 __uint(type, BPF_MAP_TYPE_HASH); 65 __uint(key_size, sizeof(__u32)); 66 __uint(value_size, sizeof(__u8)); 67 __uint(max_entries, 1); 68 } task_filter SEC(".maps"); 69 70 struct { 71 __uint(type, BPF_MAP_TYPE_HASH); 72 __uint(key_size, sizeof(__u64)); 73 __uint(value_size, sizeof(__u8)); 74 __uint(max_entries, 1); 75 } cgroup_filter SEC(".maps"); 76 77 /* new kernel task_struct definition */ 78 struct task_struct___new { 79 long __state; 80 } __attribute__((preserve_access_index)); 81 82 /* old kernel task_struct definition */ 83 struct task_struct___old { 84 long state; 85 } __attribute__((preserve_access_index)); 86 87 int enabled = 0; 88 int has_cpu = 0; 89 int has_task = 0; 90 int has_cgroup = 0; 91 int uses_tgid = 0; 92 93 const volatile bool has_prev_state = false; 94 const volatile bool needs_cgroup = false; 95 const volatile bool uses_cgroup_v1 = false; 96 97 /* 98 * Old kernel used to call it task_struct->state and now it's '__state'. 99 * Use BPF CO-RE "ignored suffix rule" to deal with it like below: 100 * 101 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes 102 */ 103 static inline int get_task_state(struct task_struct *t) 104 { 105 /* recast pointer to capture new type for compiler */ 106 struct task_struct___new *t_new = (void *)t; 107 108 if (bpf_core_field_exists(t_new->__state)) { 109 return BPF_CORE_READ(t_new, __state); 110 } else { 111 /* recast pointer to capture old type for compiler */ 112 struct task_struct___old *t_old = (void *)t; 113 114 return BPF_CORE_READ(t_old, state); 115 } 116 } 117 118 static inline __u64 get_cgroup_id(struct task_struct *t) 119 { 120 struct cgroup *cgrp; 121 122 if (uses_cgroup_v1) 123 cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup); 124 else 125 cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp); 126 127 return BPF_CORE_READ(cgrp, kn, id); 128 } 129 130 static inline int can_record(struct task_struct *t, int state) 131 { 132 /* kernel threads don't have user stack */ 133 if (t->flags & PF_KTHREAD) 134 return 0; 135 136 if (state != TASK_INTERRUPTIBLE && 137 state != TASK_UNINTERRUPTIBLE) 138 return 0; 139 140 if (has_cpu) { 141 __u32 cpu = bpf_get_smp_processor_id(); 142 __u8 *ok; 143 144 ok = bpf_map_lookup_elem(&cpu_filter, &cpu); 145 if (!ok) 146 return 0; 147 } 148 149 if (has_task) { 150 __u8 *ok; 151 __u32 pid; 152 153 if (uses_tgid) 154 pid = t->tgid; 155 else 156 pid = t->pid; 157 158 ok = bpf_map_lookup_elem(&task_filter, &pid); 159 if (!ok) 160 return 0; 161 } 162 163 if (has_cgroup) { 164 __u8 *ok; 165 __u64 cgrp_id = get_cgroup_id(t); 166 167 ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); 168 if (!ok) 169 return 0; 170 } 171 172 return 1; 173 } 174 175 static int off_cpu_stat(u64 *ctx, struct task_struct *prev, 176 struct task_struct *next, int state) 177 { 178 __u64 ts; 179 __u32 stack_id; 180 struct tstamp_data *pelem; 181 182 ts = bpf_ktime_get_ns(); 183 184 if (!can_record(prev, state)) 185 goto next; 186 187 stack_id = bpf_get_stackid(ctx, &stacks, 188 BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); 189 190 pelem = bpf_task_storage_get(&tstamp, prev, NULL, 191 BPF_LOCAL_STORAGE_GET_F_CREATE); 192 if (!pelem) 193 goto next; 194 195 pelem->timestamp = ts; 196 pelem->state = state; 197 pelem->stack_id = stack_id; 198 199 next: 200 pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); 201 202 if (pelem && pelem->timestamp) { 203 struct offcpu_key key = { 204 .pid = next->pid, 205 .tgid = next->tgid, 206 .stack_id = pelem->stack_id, 207 .state = pelem->state, 208 .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, 209 }; 210 __u64 delta = ts - pelem->timestamp; 211 __u64 *total; 212 213 total = bpf_map_lookup_elem(&off_cpu, &key); 214 if (total) 215 *total += delta; 216 else 217 bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); 218 219 /* prevent to reuse the timestamp later */ 220 pelem->timestamp = 0; 221 } 222 223 return 0; 224 } 225 226 SEC("tp_btf/task_newtask") 227 int on_newtask(u64 *ctx) 228 { 229 struct task_struct *task; 230 u64 clone_flags; 231 u32 pid; 232 u8 val = 1; 233 234 if (!uses_tgid) 235 return 0; 236 237 task = (struct task_struct *)bpf_get_current_task(); 238 239 pid = BPF_CORE_READ(task, tgid); 240 if (!bpf_map_lookup_elem(&task_filter, &pid)) 241 return 0; 242 243 task = (struct task_struct *)ctx[0]; 244 clone_flags = ctx[1]; 245 246 pid = task->tgid; 247 if (!(clone_flags & CLONE_THREAD)) 248 bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); 249 250 return 0; 251 } 252 253 SEC("tp_btf/sched_switch") 254 int on_switch(u64 *ctx) 255 { 256 struct task_struct *prev, *next; 257 int prev_state; 258 259 if (!enabled) 260 return 0; 261 262 prev = (struct task_struct *)ctx[1]; 263 next = (struct task_struct *)ctx[2]; 264 265 if (has_prev_state) 266 prev_state = (int)ctx[3]; 267 else 268 prev_state = get_task_state(prev); 269 270 return off_cpu_stat(ctx, prev, next, prev_state); 271 } 272 273 char LICENSE[] SEC("license") = "Dual BSD/GPL"; 274