// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD	0x00200000	/* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

#define MAX_STACKS	32
#define MAX_ENTRIES	102400

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

/*
 * Older kernels call it task_struct->state; newer kernels renamed it to
 * '__state'.  Use the BPF CO-RE "ignored suffix rule" to handle both, as
 * described in:
 *
 *   https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	if (bpf_core_field_exists(t->__state))
		return BPF_CORE_READ(t, __state);

	/* recast pointer to capture task_struct___old type for compiler */
	struct task_struct___old *t_old = (void *)t;

	/* now use old "state" name of the field */
	return BPF_CORE_READ(t_old, state);
}

static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (uses_cgroup_v1)
		cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
	else
		cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);

	return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

/*
 * Record the sched-out timestamp and user stack for @prev, and if @next was
 * previously recorded, accumulate its off-CPU time into the off_cpu map.
 */
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	/*
	 * sched_switch arguments: ctx[0] = preempt, ctx[1] = prev,
	 * ctx[2] = next, and on newer kernels ctx[3] = prev_state.
	 */
	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";