1*88886309SYosry Ahmed // SPDX-License-Identifier: GPL-2.0-only 2*88886309SYosry Ahmed /* 3*88886309SYosry Ahmed * Functions to manage eBPF programs attached to cgroup subsystems 4*88886309SYosry Ahmed * 5*88886309SYosry Ahmed * Copyright 2022 Google LLC. 6*88886309SYosry Ahmed */ 7*88886309SYosry Ahmed #include <asm-generic/errno.h> 8*88886309SYosry Ahmed #include <errno.h> 9*88886309SYosry Ahmed #include <sys/types.h> 10*88886309SYosry Ahmed #include <sys/mount.h> 11*88886309SYosry Ahmed #include <sys/stat.h> 12*88886309SYosry Ahmed #include <unistd.h> 13*88886309SYosry Ahmed 14*88886309SYosry Ahmed #include <test_progs.h> 15*88886309SYosry Ahmed #include <bpf/libbpf.h> 16*88886309SYosry Ahmed #include <bpf/bpf.h> 17*88886309SYosry Ahmed 18*88886309SYosry Ahmed #include "cgroup_helpers.h" 19*88886309SYosry Ahmed #include "cgroup_hierarchical_stats.skel.h" 20*88886309SYosry Ahmed 21*88886309SYosry Ahmed #define PAGE_SIZE 4096 22*88886309SYosry Ahmed #define MB(x) (x << 20) 23*88886309SYosry Ahmed 24*88886309SYosry Ahmed #define BPFFS_ROOT "/sys/fs/bpf/" 25*88886309SYosry Ahmed #define BPFFS_VMSCAN BPFFS_ROOT"vmscan/" 26*88886309SYosry Ahmed 27*88886309SYosry Ahmed #define CG_ROOT_NAME "root" 28*88886309SYosry Ahmed #define CG_ROOT_ID 1 29*88886309SYosry Ahmed 30*88886309SYosry Ahmed #define CGROUP_PATH(p, n) {.path = p"/"n, .name = n} 31*88886309SYosry Ahmed 32*88886309SYosry Ahmed static struct { 33*88886309SYosry Ahmed const char *path, *name; 34*88886309SYosry Ahmed unsigned long long id; 35*88886309SYosry Ahmed int fd; 36*88886309SYosry Ahmed } cgroups[] = { 37*88886309SYosry Ahmed CGROUP_PATH("/", "test"), 38*88886309SYosry Ahmed CGROUP_PATH("/test", "child1"), 39*88886309SYosry Ahmed CGROUP_PATH("/test", "child2"), 40*88886309SYosry Ahmed CGROUP_PATH("/test/child1", "child1_1"), 41*88886309SYosry Ahmed CGROUP_PATH("/test/child1", "child1_2"), 42*88886309SYosry Ahmed CGROUP_PATH("/test/child2", "child2_1"), 43*88886309SYosry Ahmed CGROUP_PATH("/test/child2", "child2_2"), 44*88886309SYosry Ahmed }; 45*88886309SYosry Ahmed 46*88886309SYosry Ahmed #define N_CGROUPS ARRAY_SIZE(cgroups) 47*88886309SYosry Ahmed #define N_NON_LEAF_CGROUPS 3 48*88886309SYosry Ahmed 49*88886309SYosry Ahmed static int root_cgroup_fd; 50*88886309SYosry Ahmed static bool mounted_bpffs; 51*88886309SYosry Ahmed 52*88886309SYosry Ahmed /* reads file at 'path' to 'buf', returns 0 on success. */ 53*88886309SYosry Ahmed static int read_from_file(const char *path, char *buf, size_t size) 54*88886309SYosry Ahmed { 55*88886309SYosry Ahmed int fd, len; 56*88886309SYosry Ahmed 57*88886309SYosry Ahmed fd = open(path, O_RDONLY); 58*88886309SYosry Ahmed if (fd < 0) 59*88886309SYosry Ahmed return fd; 60*88886309SYosry Ahmed 61*88886309SYosry Ahmed len = read(fd, buf, size); 62*88886309SYosry Ahmed close(fd); 63*88886309SYosry Ahmed if (len < 0) 64*88886309SYosry Ahmed return len; 65*88886309SYosry Ahmed 66*88886309SYosry Ahmed buf[len] = 0; 67*88886309SYosry Ahmed return 0; 68*88886309SYosry Ahmed } 69*88886309SYosry Ahmed 70*88886309SYosry Ahmed /* mounts bpffs and mkdir for reading stats, returns 0 on success. */ 71*88886309SYosry Ahmed static int setup_bpffs(void) 72*88886309SYosry Ahmed { 73*88886309SYosry Ahmed int err; 74*88886309SYosry Ahmed 75*88886309SYosry Ahmed /* Mount bpffs */ 76*88886309SYosry Ahmed err = mount("bpf", BPFFS_ROOT, "bpf", 0, NULL); 77*88886309SYosry Ahmed mounted_bpffs = !err; 78*88886309SYosry Ahmed if (ASSERT_FALSE(err && errno != EBUSY, "mount")) 79*88886309SYosry Ahmed return err; 80*88886309SYosry Ahmed 81*88886309SYosry Ahmed /* Create a directory to contain stat files in bpffs */ 82*88886309SYosry Ahmed err = mkdir(BPFFS_VMSCAN, 0755); 83*88886309SYosry Ahmed if (!ASSERT_OK(err, "mkdir")) 84*88886309SYosry Ahmed return err; 85*88886309SYosry Ahmed 86*88886309SYosry Ahmed return 0; 87*88886309SYosry Ahmed } 88*88886309SYosry Ahmed 89*88886309SYosry Ahmed static void cleanup_bpffs(void) 90*88886309SYosry Ahmed { 91*88886309SYosry Ahmed /* Remove created directory in bpffs */ 92*88886309SYosry Ahmed ASSERT_OK(rmdir(BPFFS_VMSCAN), "rmdir "BPFFS_VMSCAN); 93*88886309SYosry Ahmed 94*88886309SYosry Ahmed /* Unmount bpffs, if it wasn't already mounted when we started */ 95*88886309SYosry Ahmed if (mounted_bpffs) 96*88886309SYosry Ahmed return; 97*88886309SYosry Ahmed 98*88886309SYosry Ahmed ASSERT_OK(umount(BPFFS_ROOT), "unmount bpffs"); 99*88886309SYosry Ahmed } 100*88886309SYosry Ahmed 101*88886309SYosry Ahmed /* sets up cgroups, returns 0 on success. */ 102*88886309SYosry Ahmed static int setup_cgroups(void) 103*88886309SYosry Ahmed { 104*88886309SYosry Ahmed int i, fd, err; 105*88886309SYosry Ahmed 106*88886309SYosry Ahmed err = setup_cgroup_environment(); 107*88886309SYosry Ahmed if (!ASSERT_OK(err, "setup_cgroup_environment")) 108*88886309SYosry Ahmed return err; 109*88886309SYosry Ahmed 110*88886309SYosry Ahmed root_cgroup_fd = get_root_cgroup(); 111*88886309SYosry Ahmed if (!ASSERT_GE(root_cgroup_fd, 0, "get_root_cgroup")) 112*88886309SYosry Ahmed return root_cgroup_fd; 113*88886309SYosry Ahmed 114*88886309SYosry Ahmed for (i = 0; i < N_CGROUPS; i++) { 115*88886309SYosry Ahmed fd = create_and_get_cgroup(cgroups[i].path); 116*88886309SYosry Ahmed if (!ASSERT_GE(fd, 0, "create_and_get_cgroup")) 117*88886309SYosry Ahmed return fd; 118*88886309SYosry Ahmed 119*88886309SYosry Ahmed cgroups[i].fd = fd; 120*88886309SYosry Ahmed cgroups[i].id = get_cgroup_id(cgroups[i].path); 121*88886309SYosry Ahmed 122*88886309SYosry Ahmed /* 123*88886309SYosry Ahmed * Enable memcg controller for the entire hierarchy. 124*88886309SYosry Ahmed * Note that stats are collected for all cgroups in a hierarchy 125*88886309SYosry Ahmed * with memcg enabled anyway, but are only exposed for cgroups 126*88886309SYosry Ahmed * that have memcg enabled. 127*88886309SYosry Ahmed */ 128*88886309SYosry Ahmed if (i < N_NON_LEAF_CGROUPS) { 129*88886309SYosry Ahmed err = enable_controllers(cgroups[i].path, "memory"); 130*88886309SYosry Ahmed if (!ASSERT_OK(err, "enable_controllers")) 131*88886309SYosry Ahmed return err; 132*88886309SYosry Ahmed } 133*88886309SYosry Ahmed } 134*88886309SYosry Ahmed return 0; 135*88886309SYosry Ahmed } 136*88886309SYosry Ahmed 137*88886309SYosry Ahmed static void cleanup_cgroups(void) 138*88886309SYosry Ahmed { 139*88886309SYosry Ahmed close(root_cgroup_fd); 140*88886309SYosry Ahmed for (int i = 0; i < N_CGROUPS; i++) 141*88886309SYosry Ahmed close(cgroups[i].fd); 142*88886309SYosry Ahmed cleanup_cgroup_environment(); 143*88886309SYosry Ahmed } 144*88886309SYosry Ahmed 145*88886309SYosry Ahmed /* Sets up cgroup hiearchary, returns 0 on success. */ 146*88886309SYosry Ahmed static int setup_hierarchy(void) 147*88886309SYosry Ahmed { 148*88886309SYosry Ahmed return setup_bpffs() || setup_cgroups(); 149*88886309SYosry Ahmed } 150*88886309SYosry Ahmed 151*88886309SYosry Ahmed static void destroy_hierarchy(void) 152*88886309SYosry Ahmed { 153*88886309SYosry Ahmed cleanup_cgroups(); 154*88886309SYosry Ahmed cleanup_bpffs(); 155*88886309SYosry Ahmed } 156*88886309SYosry Ahmed 157*88886309SYosry Ahmed static int reclaimer(const char *cgroup_path, size_t size) 158*88886309SYosry Ahmed { 159*88886309SYosry Ahmed static char size_buf[128]; 160*88886309SYosry Ahmed char *buf, *ptr; 161*88886309SYosry Ahmed int err; 162*88886309SYosry Ahmed 163*88886309SYosry Ahmed /* Join cgroup in the parent process workdir */ 164*88886309SYosry Ahmed if (join_parent_cgroup(cgroup_path)) 165*88886309SYosry Ahmed return EACCES; 166*88886309SYosry Ahmed 167*88886309SYosry Ahmed /* Allocate memory */ 168*88886309SYosry Ahmed buf = malloc(size); 169*88886309SYosry Ahmed if (!buf) 170*88886309SYosry Ahmed return ENOMEM; 171*88886309SYosry Ahmed 172*88886309SYosry Ahmed /* Write to memory to make sure it's actually allocated */ 173*88886309SYosry Ahmed for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) 174*88886309SYosry Ahmed *ptr = 1; 175*88886309SYosry Ahmed 176*88886309SYosry Ahmed /* Try to reclaim memory */ 177*88886309SYosry Ahmed snprintf(size_buf, 128, "%lu", size); 178*88886309SYosry Ahmed err = write_cgroup_file_parent(cgroup_path, "memory.reclaim", size_buf); 179*88886309SYosry Ahmed 180*88886309SYosry Ahmed free(buf); 181*88886309SYosry Ahmed /* memory.reclaim returns EAGAIN if the amount is not fully reclaimed */ 182*88886309SYosry Ahmed if (err && errno != EAGAIN) 183*88886309SYosry Ahmed return errno; 184*88886309SYosry Ahmed 185*88886309SYosry Ahmed return 0; 186*88886309SYosry Ahmed } 187*88886309SYosry Ahmed 188*88886309SYosry Ahmed static int induce_vmscan(void) 189*88886309SYosry Ahmed { 190*88886309SYosry Ahmed int i, status; 191*88886309SYosry Ahmed 192*88886309SYosry Ahmed /* 193*88886309SYosry Ahmed * In every leaf cgroup, run a child process that allocates some memory 194*88886309SYosry Ahmed * and attempts to reclaim some of it. 195*88886309SYosry Ahmed */ 196*88886309SYosry Ahmed for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++) { 197*88886309SYosry Ahmed pid_t pid; 198*88886309SYosry Ahmed 199*88886309SYosry Ahmed /* Create reclaimer child */ 200*88886309SYosry Ahmed pid = fork(); 201*88886309SYosry Ahmed if (pid == 0) { 202*88886309SYosry Ahmed status = reclaimer(cgroups[i].path, MB(5)); 203*88886309SYosry Ahmed exit(status); 204*88886309SYosry Ahmed } 205*88886309SYosry Ahmed 206*88886309SYosry Ahmed /* Cleanup reclaimer child */ 207*88886309SYosry Ahmed waitpid(pid, &status, 0); 208*88886309SYosry Ahmed ASSERT_TRUE(WIFEXITED(status), "reclaimer exited"); 209*88886309SYosry Ahmed ASSERT_EQ(WEXITSTATUS(status), 0, "reclaim exit code"); 210*88886309SYosry Ahmed } 211*88886309SYosry Ahmed return 0; 212*88886309SYosry Ahmed } 213*88886309SYosry Ahmed 214*88886309SYosry Ahmed static unsigned long long 215*88886309SYosry Ahmed get_cgroup_vmscan_delay(unsigned long long cgroup_id, const char *file_name) 216*88886309SYosry Ahmed { 217*88886309SYosry Ahmed unsigned long long vmscan = 0, id = 0; 218*88886309SYosry Ahmed static char buf[128], path[128]; 219*88886309SYosry Ahmed 220*88886309SYosry Ahmed /* For every cgroup, read the file generated by cgroup_iter */ 221*88886309SYosry Ahmed snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); 222*88886309SYosry Ahmed if (!ASSERT_OK(read_from_file(path, buf, 128), "read cgroup_iter")) 223*88886309SYosry Ahmed return 0; 224*88886309SYosry Ahmed 225*88886309SYosry Ahmed /* Check the output file formatting */ 226*88886309SYosry Ahmed ASSERT_EQ(sscanf(buf, "cg_id: %llu, total_vmscan_delay: %llu\n", 227*88886309SYosry Ahmed &id, &vmscan), 2, "output format"); 228*88886309SYosry Ahmed 229*88886309SYosry Ahmed /* Check that the cgroup_id is displayed correctly */ 230*88886309SYosry Ahmed ASSERT_EQ(id, cgroup_id, "cgroup_id"); 231*88886309SYosry Ahmed /* Check that the vmscan reading is non-zero */ 232*88886309SYosry Ahmed ASSERT_GT(vmscan, 0, "vmscan_reading"); 233*88886309SYosry Ahmed return vmscan; 234*88886309SYosry Ahmed } 235*88886309SYosry Ahmed 236*88886309SYosry Ahmed static void check_vmscan_stats(void) 237*88886309SYosry Ahmed { 238*88886309SYosry Ahmed unsigned long long vmscan_readings[N_CGROUPS], vmscan_root; 239*88886309SYosry Ahmed int i; 240*88886309SYosry Ahmed 241*88886309SYosry Ahmed for (i = 0; i < N_CGROUPS; i++) { 242*88886309SYosry Ahmed vmscan_readings[i] = get_cgroup_vmscan_delay(cgroups[i].id, 243*88886309SYosry Ahmed cgroups[i].name); 244*88886309SYosry Ahmed } 245*88886309SYosry Ahmed 246*88886309SYosry Ahmed /* Read stats for root too */ 247*88886309SYosry Ahmed vmscan_root = get_cgroup_vmscan_delay(CG_ROOT_ID, CG_ROOT_NAME); 248*88886309SYosry Ahmed 249*88886309SYosry Ahmed /* Check that child1 == child1_1 + child1_2 */ 250*88886309SYosry Ahmed ASSERT_EQ(vmscan_readings[1], vmscan_readings[3] + vmscan_readings[4], 251*88886309SYosry Ahmed "child1_vmscan"); 252*88886309SYosry Ahmed /* Check that child2 == child2_1 + child2_2 */ 253*88886309SYosry Ahmed ASSERT_EQ(vmscan_readings[2], vmscan_readings[5] + vmscan_readings[6], 254*88886309SYosry Ahmed "child2_vmscan"); 255*88886309SYosry Ahmed /* Check that test == child1 + child2 */ 256*88886309SYosry Ahmed ASSERT_EQ(vmscan_readings[0], vmscan_readings[1] + vmscan_readings[2], 257*88886309SYosry Ahmed "test_vmscan"); 258*88886309SYosry Ahmed /* Check that root >= test */ 259*88886309SYosry Ahmed ASSERT_GE(vmscan_root, vmscan_readings[1], "root_vmscan"); 260*88886309SYosry Ahmed } 261*88886309SYosry Ahmed 262*88886309SYosry Ahmed /* Creates iter link and pins in bpffs, returns 0 on success, -errno on failure. 263*88886309SYosry Ahmed */ 264*88886309SYosry Ahmed static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj, 265*88886309SYosry Ahmed int cgroup_fd, const char *file_name) 266*88886309SYosry Ahmed { 267*88886309SYosry Ahmed DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); 268*88886309SYosry Ahmed union bpf_iter_link_info linfo = {}; 269*88886309SYosry Ahmed struct bpf_link *link; 270*88886309SYosry Ahmed static char path[128]; 271*88886309SYosry Ahmed int err; 272*88886309SYosry Ahmed 273*88886309SYosry Ahmed /* 274*88886309SYosry Ahmed * Create an iter link, parameterized by cgroup_fd. We only want to 275*88886309SYosry Ahmed * traverse one cgroup, so set the traversal order to "self". 276*88886309SYosry Ahmed */ 277*88886309SYosry Ahmed linfo.cgroup.cgroup_fd = cgroup_fd; 278*88886309SYosry Ahmed linfo.cgroup.order = BPF_ITER_SELF_ONLY; 279*88886309SYosry Ahmed opts.link_info = &linfo; 280*88886309SYosry Ahmed opts.link_info_len = sizeof(linfo); 281*88886309SYosry Ahmed link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts); 282*88886309SYosry Ahmed if (!ASSERT_OK_PTR(link, "attach_iter")) 283*88886309SYosry Ahmed return -EFAULT; 284*88886309SYosry Ahmed 285*88886309SYosry Ahmed /* Pin the link to a bpffs file */ 286*88886309SYosry Ahmed snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); 287*88886309SYosry Ahmed err = bpf_link__pin(link, path); 288*88886309SYosry Ahmed ASSERT_OK(err, "pin cgroup_iter"); 289*88886309SYosry Ahmed 290*88886309SYosry Ahmed /* Remove the link, leaving only the ref held by the pinned file */ 291*88886309SYosry Ahmed bpf_link__destroy(link); 292*88886309SYosry Ahmed return err; 293*88886309SYosry Ahmed } 294*88886309SYosry Ahmed 295*88886309SYosry Ahmed /* Sets up programs for collecting stats, returns 0 on success. */ 296*88886309SYosry Ahmed static int setup_progs(struct cgroup_hierarchical_stats **skel) 297*88886309SYosry Ahmed { 298*88886309SYosry Ahmed int i, err; 299*88886309SYosry Ahmed 300*88886309SYosry Ahmed *skel = cgroup_hierarchical_stats__open_and_load(); 301*88886309SYosry Ahmed if (!ASSERT_OK_PTR(*skel, "open_and_load")) 302*88886309SYosry Ahmed return 1; 303*88886309SYosry Ahmed 304*88886309SYosry Ahmed /* Attach cgroup_iter program that will dump the stats to cgroups */ 305*88886309SYosry Ahmed for (i = 0; i < N_CGROUPS; i++) { 306*88886309SYosry Ahmed err = setup_cgroup_iter(*skel, cgroups[i].fd, cgroups[i].name); 307*88886309SYosry Ahmed if (!ASSERT_OK(err, "setup_cgroup_iter")) 308*88886309SYosry Ahmed return err; 309*88886309SYosry Ahmed } 310*88886309SYosry Ahmed 311*88886309SYosry Ahmed /* Also dump stats for root */ 312*88886309SYosry Ahmed err = setup_cgroup_iter(*skel, root_cgroup_fd, CG_ROOT_NAME); 313*88886309SYosry Ahmed if (!ASSERT_OK(err, "setup_cgroup_iter")) 314*88886309SYosry Ahmed return err; 315*88886309SYosry Ahmed 316*88886309SYosry Ahmed bpf_program__set_autoattach((*skel)->progs.dump_vmscan, false); 317*88886309SYosry Ahmed err = cgroup_hierarchical_stats__attach(*skel); 318*88886309SYosry Ahmed if (!ASSERT_OK(err, "attach")) 319*88886309SYosry Ahmed return err; 320*88886309SYosry Ahmed 321*88886309SYosry Ahmed return 0; 322*88886309SYosry Ahmed } 323*88886309SYosry Ahmed 324*88886309SYosry Ahmed static void destroy_progs(struct cgroup_hierarchical_stats *skel) 325*88886309SYosry Ahmed { 326*88886309SYosry Ahmed static char path[128]; 327*88886309SYosry Ahmed int i; 328*88886309SYosry Ahmed 329*88886309SYosry Ahmed for (i = 0; i < N_CGROUPS; i++) { 330*88886309SYosry Ahmed /* Delete files in bpffs that cgroup_iters are pinned in */ 331*88886309SYosry Ahmed snprintf(path, 128, "%s%s", BPFFS_VMSCAN, 332*88886309SYosry Ahmed cgroups[i].name); 333*88886309SYosry Ahmed ASSERT_OK(remove(path), "remove cgroup_iter pin"); 334*88886309SYosry Ahmed } 335*88886309SYosry Ahmed 336*88886309SYosry Ahmed /* Delete root file in bpffs */ 337*88886309SYosry Ahmed snprintf(path, 128, "%s%s", BPFFS_VMSCAN, CG_ROOT_NAME); 338*88886309SYosry Ahmed ASSERT_OK(remove(path), "remove cgroup_iter root pin"); 339*88886309SYosry Ahmed cgroup_hierarchical_stats__destroy(skel); 340*88886309SYosry Ahmed } 341*88886309SYosry Ahmed 342*88886309SYosry Ahmed void test_cgroup_hierarchical_stats(void) 343*88886309SYosry Ahmed { 344*88886309SYosry Ahmed struct cgroup_hierarchical_stats *skel = NULL; 345*88886309SYosry Ahmed 346*88886309SYosry Ahmed if (setup_hierarchy()) 347*88886309SYosry Ahmed goto hierarchy_cleanup; 348*88886309SYosry Ahmed if (setup_progs(&skel)) 349*88886309SYosry Ahmed goto cleanup; 350*88886309SYosry Ahmed if (induce_vmscan()) 351*88886309SYosry Ahmed goto cleanup; 352*88886309SYosry Ahmed check_vmscan_stats(); 353*88886309SYosry Ahmed cleanup: 354*88886309SYosry Ahmed destroy_progs(skel); 355*88886309SYosry Ahmed hierarchy_cleanup: 356*88886309SYosry Ahmed destroy_hierarchy(); 357*88886309SYosry Ahmed } 358