// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags bit for off-cpu analysis */
#define PF_KTHREAD 0x00200000	/* I am a kernel thread */

/* task->state values accounted as off-cpu sleep */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* max depth of a single captured user stack trace */
#define MAX_STACKS 32
/* capacity of the stack-trace and accounting hash maps */
#define MAX_ENTRIES 102400

/* per-task scratch state: when and why a task last went off-cpu */
struct tstamp_data {
	__u32 stack_id;		/* id returned by bpf_get_stackid() */
	__u32 state;		/* scheduler state at switch-out */
	__u64 timestamp;	/* bpf_ktime_get_ns() at switch-out */
};

/* aggregation key for the off_cpu accounting map */
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
};

/* user stack traces, looked up by the id stored in offcpu_key */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* per-task storage holding the switch-out timestamp/state */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

/* accumulated off-cpu time in ns, keyed by offcpu_key */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

/* optional CPU filter: presence of a CPU key means "record it" */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

/* optional task filter: presence of a PID key means "record it" */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

/* old kernel task_struct definition: the field was named 'state' */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

/* knobs set by user space before the program is attached */
int enabled = 0;	/* start/stop recording */
int has_cpu = 0;	/* cpu_filter map is populated */
int has_task = 0;	/* task_filter map is populated */

/* whether this kernel's sched_switch tracepoint passes prev_state */
const volatile bool has_prev_state = false;

/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
 */
/*
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recent kernels: the field is named '__state' */
	if (bpf_core_field_exists(t->__state))
		return BPF_CORE_READ(t, __state);

	/* recast pointer to capture task_struct___old type for compiler */
	struct task_struct___old *t_old = (void *)t;

	/* now use old "state" name of the field */
	return BPF_CORE_READ(t_old, state);
}

/*
 * Decide whether this switch-out of @t (scheduler state @state) should
 * be recorded.  Returns 1 to record, 0 to skip.
 */
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	/* only account sleeping states, not runnable/preempted tasks */
	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		/* current CPU not listed in the filter map -> skip */
		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = t->pid;

		/* task not listed in the filter map -> skip */
		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	return 1;
}

/*
 * Called on every context switch: stamp @prev as it goes off-cpu and,
 * if @next was stamped earlier, add its off-cpu delta to the off_cpu
 * map under (pid, tgid, stack_id, state).
 */
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	/* NOTE(review): bpf_get_stackid() can return a negative errno,
	 * which is stored into the __u32 id unchecked -- presumably user
	 * space tolerates such ids; confirm against the consumer. */
	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	/* record when/why prev went off-cpu, for when it runs again */
	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	/* no create flag: only tasks stamped at switch-out are counted */
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		/* accumulate into the existing entry, or create one */
		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent to reuse the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	/* raw tracepoint args: prev task at ctx[1], next task at ctx[2] */
	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		/* kernel passes prev_state as an extra argument */
		prev_state = (int)ctx[3];
	else
		/* otherwise read it from the task struct via CO-RE */
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";