// SPDX-License-Identifier: GPL-2.0-only
/*
 * Functions to manage eBPF programs attached to cgroup subsystems
 *
 * Copyright 2022 Google LLC.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

char _license[] SEC("license") = "GPL";

/*
 * Start times are stored per-task, not per-cgroup, as multiple tasks in one
 * cgroup can perform reclaim concurrently.
 */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} vmscan_start_time SEC(".maps");

struct vmscan_percpu {
	/* Previous percpu state, to figure out if we have new updates */
	__u64 prev;
	/* Current percpu state */
	__u64 state;
};

struct vmscan {
	/* State propagated through children, pending aggregation */
	__u64 pending;
	/* Total state, including all cpus and all children */
	__u64 state;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, 100);
	__type(key, __u64);
	__type(value, struct vmscan_percpu);
} pcpu_cgroup_vmscan_elapsed SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 100);
	__type(key, __u64);
	__type(value, struct vmscan);
} cgroup_vmscan_elapsed SEC(".maps");

extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;

static struct cgroup *task_memcg(struct task_struct *task)
{
	int cgrp_id;

#if __has_builtin(__builtin_preserve_enum_value)
	cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id, memory_cgrp_id);
#else
	cgrp_id = memory_cgrp_id;
#endif
	return task->cgroups->subsys[cgrp_id]->cgroup;
}

static uint64_t cgroup_id(struct cgroup *cgrp)
{
	return cgrp->kn->id;
}

static int create_vmscan_percpu_elem(__u64 cg_id, __u64 state)
{
	struct vmscan_percpu pcpu_init = {.state = state, .prev = 0};

	return bpf_map_update_elem(&pcpu_cgroup_vmscan_elapsed, &cg_id,
				   &pcpu_init, BPF_NOEXIST);
}

static int create_vmscan_elem(__u64 cg_id, __u64 state, __u64 pending)
{
	struct vmscan init = {.state = state, .pending = pending};

	return bpf_map_update_elem(&cgroup_vmscan_elapsed, &cg_id,
				   &init, BPF_NOEXIST);
}

SEC("tp_btf/mm_vmscan_memcg_reclaim_begin")
int BPF_PROG(vmscan_start, int order, gfp_t gfp_flags)
{
	struct task_struct *task = bpf_get_current_task_btf();
	__u64 *start_time_ptr;

	start_time_ptr = bpf_task_storage_get(&vmscan_start_time, task, 0,
					      BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (start_time_ptr)
		*start_time_ptr = bpf_ktime_get_ns();
	return 0;
}

SEC("tp_btf/mm_vmscan_memcg_reclaim_end")
int BPF_PROG(vmscan_end, unsigned long nr_reclaimed)
{
	struct vmscan_percpu *pcpu_stat;
	struct task_struct *current = bpf_get_current_task_btf();
	struct cgroup *cgrp;
	__u64 *start_time_ptr;
	__u64 current_elapsed, cg_id;
	__u64 end_time = bpf_ktime_get_ns();

	/*
	 * cgrp is the first parent cgroup of current that has memcg enabled in
	 * its subtree_control, or NULL if memcg is disabled in the entire tree.
	 * In a cgroup hierarchy like this:
	 *                           a
	 *                          / \
	 *                         b   c
	 * If "a" has memcg enabled, while "b" doesn't, then processes in "b"
	 * will accumulate their stats directly to "a". This makes sure that no
	 * stats are lost from processes in leaf cgroups that don't have memcg
	 * enabled, but only exposes stats for cgroups that have memcg enabled.
	 */
	cgrp = task_memcg(current);
	if (!cgrp)
		return 0;

	cg_id = cgroup_id(cgrp);
	start_time_ptr = bpf_task_storage_get(&vmscan_start_time, current, 0,
					      BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!start_time_ptr)
		return 0;

	current_elapsed = end_time - *start_time_ptr;
	pcpu_stat = bpf_map_lookup_elem(&pcpu_cgroup_vmscan_elapsed,
					&cg_id);
	if (pcpu_stat)
		pcpu_stat->state += current_elapsed;
	else if (create_vmscan_percpu_elem(cg_id, current_elapsed))
		return 0;

	cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id());
	return 0;
}

SEC("fentry/bpf_rstat_flush")
int BPF_PROG(vmscan_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu)
{
	struct vmscan_percpu *pcpu_stat;
	struct vmscan *total_stat, *parent_stat;
	__u64 cg_id = cgroup_id(cgrp);
	__u64 parent_cg_id = parent ? cgroup_id(parent) : 0;
	__u64 *pcpu_vmscan;
	__u64 state;
	__u64 delta = 0;

	/* Add CPU changes on this level since the last flush */
	pcpu_stat = bpf_map_lookup_percpu_elem(&pcpu_cgroup_vmscan_elapsed,
					       &cg_id, cpu);
	if (pcpu_stat) {
		state = pcpu_stat->state;
		delta += state - pcpu_stat->prev;
		pcpu_stat->prev = state;
	}

	total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id);
	if (!total_stat) {
		if (create_vmscan_elem(cg_id, delta, 0))
			return 0;

		goto update_parent;
	}

	/* Collect pending stats from subtree */
	if (total_stat->pending) {
		delta += total_stat->pending;
		total_stat->pending = 0;
	}

	/* Propagate changes to this cgroup's total */
	total_stat->state += delta;

update_parent:
	/* Skip if there are no changes to propagate, or no parent */
	if (!delta || !parent_cg_id)
		return 0;

	/* Propagate changes to cgroup's parent */
	parent_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed,
					  &parent_cg_id);
	if (parent_stat)
		parent_stat->pending += delta;
	else
		create_vmscan_elem(parent_cg_id, 0, delta);
	return 0;
}

SEC("iter.s/cgroup")
int BPF_PROG(dump_vmscan, struct bpf_iter_meta *meta, struct cgroup *cgrp)
{
	struct seq_file *seq = meta->seq;
	struct vmscan *total_stat;
	__u64 cg_id = cgrp ? cgroup_id(cgrp) : 0;

	/* Do nothing for the terminal call */
	if (!cg_id)
		return 1;

	/* Flush the stats to make sure we get the most updated numbers */
	cgroup_rstat_flush(cgrp);

	total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id);
	if (!total_stat) {
		BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: 0\n",
			       cg_id);
	} else {
		BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: %llu\n",
			       cg_id, total_stat->state);
	}

	/*
	 * We only dump stats for one cgroup here, so return 1 to stop
	 * iteration after the first cgroup.
	 */
	return 1;
}
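
/*
 * Userspace usage sketch (illustrative only, not part of this BPF object):
 * the tp_btf and fentry programs above can be auto-attached from a libbpf
 * skeleton, while the cgroup iterator needs explicit attach parameters.
 * Assuming a generated skeleton handle "skel" for this object and an open
 * file descriptor "cgroup_fd" for the cgroup of interest (both hypothetical
 * names; error handling omitted), reading the stats could look like:
 *
 *	union bpf_iter_link_info linfo = {};
 *	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
 *	struct bpf_link *link;
 *	char buf[128];
 *	int iter_fd;
 *
 *	linfo.cgroup.cgroup_fd = cgroup_fd;
 *	linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
 *	opts.link_info = &linfo;
 *	opts.link_info_len = sizeof(linfo);
 *
 *	link = bpf_program__attach_iter(skel->progs.dump_vmscan, &opts);
 *	iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	while (read(iter_fd, buf, sizeof(buf)) > 0)
 *		printf("%s", buf);	// "cg_id: ..., total_vmscan_delay: ..."
 */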