1 //SPDX-License-Identifier: GPL-2.0 2 #include <linux/bpf-cgroup.h> 3 #include <linux/bpf.h> 4 #include <linux/btf.h> 5 #include <linux/bug.h> 6 #include <linux/filter.h> 7 #include <linux/mm.h> 8 #include <linux/rbtree.h> 9 #include <linux/slab.h> 10 #include <uapi/linux/btf.h> 11 12 DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); 13 14 #ifdef CONFIG_CGROUP_BPF 15 16 #include "../cgroup/cgroup-internal.h" 17 18 #define LOCAL_STORAGE_CREATE_FLAG_MASK \ 19 (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) 20 21 struct bpf_cgroup_storage_map { 22 struct bpf_map map; 23 24 spinlock_t lock; 25 struct rb_root root; 26 struct list_head list; 27 }; 28 29 static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) 30 { 31 return container_of(map, struct bpf_cgroup_storage_map, map); 32 } 33 34 static bool attach_type_isolated(const struct bpf_map *map) 35 { 36 return map->key_size == sizeof(struct bpf_cgroup_storage_key); 37 } 38 39 static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, 40 const void *_key1, const void *_key2) 41 { 42 if (attach_type_isolated(&map->map)) { 43 const struct bpf_cgroup_storage_key *key1 = _key1; 44 const struct bpf_cgroup_storage_key *key2 = _key2; 45 46 if (key1->cgroup_inode_id < key2->cgroup_inode_id) 47 return -1; 48 else if (key1->cgroup_inode_id > key2->cgroup_inode_id) 49 return 1; 50 else if (key1->attach_type < key2->attach_type) 51 return -1; 52 else if (key1->attach_type > key2->attach_type) 53 return 1; 54 } else { 55 const __u64 *cgroup_inode_id1 = _key1; 56 const __u64 *cgroup_inode_id2 = _key2; 57 58 if (*cgroup_inode_id1 < *cgroup_inode_id2) 59 return -1; 60 else if (*cgroup_inode_id1 > *cgroup_inode_id2) 61 return 1; 62 } 63 return 0; 64 } 65 66 struct bpf_cgroup_storage * 67 cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, 68 void *key, bool locked) 69 { 70 struct rb_root *root = &map->root; 71 struct rb_node *node; 72 73 if (!locked) 74 spin_lock_bh(&map->lock); 75 76 node = root->rb_node; 77 while (node) { 78 struct bpf_cgroup_storage *storage; 79 80 storage = container_of(node, struct bpf_cgroup_storage, node); 81 82 switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { 83 case -1: 84 node = node->rb_left; 85 break; 86 case 1: 87 node = node->rb_right; 88 break; 89 default: 90 if (!locked) 91 spin_unlock_bh(&map->lock); 92 return storage; 93 } 94 } 95 96 if (!locked) 97 spin_unlock_bh(&map->lock); 98 99 return NULL; 100 } 101 102 static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, 103 struct bpf_cgroup_storage *storage) 104 { 105 struct rb_root *root = &map->root; 106 struct rb_node **new = &(root->rb_node), *parent = NULL; 107 108 while (*new) { 109 struct bpf_cgroup_storage *this; 110 111 this = container_of(*new, struct bpf_cgroup_storage, node); 112 113 parent = *new; 114 switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { 115 case -1: 116 new = &((*new)->rb_left); 117 break; 118 case 1: 119 new = &((*new)->rb_right); 120 break; 121 default: 122 return -EEXIST; 123 } 124 } 125 126 rb_link_node(&storage->node, parent, new); 127 rb_insert_color(&storage->node, root); 128 129 return 0; 130 } 131 132 static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) 133 { 134 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 135 struct bpf_cgroup_storage *storage; 136 137 storage = cgroup_storage_lookup(map, key, false); 138 if (!storage) 139 return NULL; 140 141 return &READ_ONCE(storage->buf)->data[0]; 142 } 143 144 static int cgroup_storage_update_elem(struct bpf_map *map, void *key, 145 void *value, u64 flags) 146 { 147 struct bpf_cgroup_storage *storage; 148 struct bpf_storage_buffer *new; 149 150 if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) 151 return -EINVAL; 152 153 if (unlikely((flags & BPF_F_LOCK) && 154 !map_value_has_spin_lock(map))) 155 return -EINVAL; 156 157 storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, 158 key, false); 159 if (!storage) 160 return -ENOENT; 161 162 if (flags & BPF_F_LOCK) { 163 copy_map_value_locked(map, storage->buf->data, value, false); 164 return 0; 165 } 166 167 new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + 168 map->value_size, 169 __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, 170 map->numa_node); 171 if (!new) 172 return -ENOMEM; 173 174 memcpy(&new->data[0], value, map->value_size); 175 check_and_init_map_lock(map, new->data); 176 177 new = xchg(&storage->buf, new); 178 kfree_rcu(new, rcu); 179 180 return 0; 181 } 182 183 int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, 184 void *value) 185 { 186 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 187 struct bpf_cgroup_storage *storage; 188 int cpu, off = 0; 189 u32 size; 190 191 rcu_read_lock(); 192 storage = cgroup_storage_lookup(map, key, false); 193 if (!storage) { 194 rcu_read_unlock(); 195 return -ENOENT; 196 } 197 198 /* per_cpu areas are zero-filled and bpf programs can only 199 * access 'value_size' of them, so copying rounded areas 200 * will not leak any kernel data 201 */ 202 size = round_up(_map->value_size, 8); 203 for_each_possible_cpu(cpu) { 204 bpf_long_memcpy(value + off, 205 per_cpu_ptr(storage->percpu_buf, cpu), size); 206 off += size; 207 } 208 rcu_read_unlock(); 209 return 0; 210 } 211 212 int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, 213 void *value, u64 map_flags) 214 { 215 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 216 struct bpf_cgroup_storage *storage; 217 int cpu, off = 0; 218 u32 size; 219 220 if (map_flags != BPF_ANY && map_flags != BPF_EXIST) 221 return -EINVAL; 222 223 rcu_read_lock(); 224 storage = cgroup_storage_lookup(map, key, false); 225 if (!storage) { 226 rcu_read_unlock(); 227 return -ENOENT; 228 } 229 230 /* the user space will provide round_up(value_size, 8) bytes that 231 * will be copied into per-cpu area. bpf programs can only access 232 * value_size of it. During lookup the same extra bytes will be 233 * returned or zeros which were zero-filled by percpu_alloc, 234 * so no kernel data leaks possible 235 */ 236 size = round_up(_map->value_size, 8); 237 for_each_possible_cpu(cpu) { 238 bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), 239 value + off, size); 240 off += size; 241 } 242 rcu_read_unlock(); 243 return 0; 244 } 245 246 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, 247 void *_next_key) 248 { 249 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 250 struct bpf_cgroup_storage *storage; 251 252 spin_lock_bh(&map->lock); 253 254 if (list_empty(&map->list)) 255 goto enoent; 256 257 if (key) { 258 storage = cgroup_storage_lookup(map, key, true); 259 if (!storage) 260 goto enoent; 261 262 storage = list_next_entry(storage, list_map); 263 if (!storage) 264 goto enoent; 265 } else { 266 storage = list_first_entry(&map->list, 267 struct bpf_cgroup_storage, list_map); 268 } 269 270 spin_unlock_bh(&map->lock); 271 272 if (attach_type_isolated(&map->map)) { 273 struct bpf_cgroup_storage_key *next = _next_key; 274 *next = storage->key; 275 } else { 276 __u64 *next = _next_key; 277 *next = storage->key.cgroup_inode_id; 278 } 279 return 0; 280 281 enoent: 282 spin_unlock_bh(&map->lock); 283 return -ENOENT; 284 } 285 286 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) 287 { 288 int numa_node = bpf_map_attr_numa_node(attr); 289 struct bpf_cgroup_storage_map *map; 290 291 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && 292 attr->key_size != sizeof(__u64)) 293 return ERR_PTR(-EINVAL); 294 295 if (attr->value_size == 0) 296 return ERR_PTR(-EINVAL); 297 298 if (attr->value_size > PAGE_SIZE) 299 return ERR_PTR(-E2BIG); 300 301 if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || 302 !bpf_map_flags_access_ok(attr->map_flags)) 303 return ERR_PTR(-EINVAL); 304 305 if (attr->max_entries) 306 /* max_entries is not used and enforced to be 0 */ 307 return ERR_PTR(-EINVAL); 308 309 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 310 __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); 311 if (!map) 312 return ERR_PTR(-ENOMEM); 313 314 /* copy mandatory map attributes */ 315 bpf_map_init_from_attr(&map->map, attr); 316 317 spin_lock_init(&map->lock); 318 map->root = RB_ROOT; 319 INIT_LIST_HEAD(&map->list); 320 321 return &map->map; 322 } 323 324 static void cgroup_storage_map_free(struct bpf_map *_map) 325 { 326 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 327 struct list_head *storages = &map->list; 328 struct bpf_cgroup_storage *storage, *stmp; 329 330 mutex_lock(&cgroup_mutex); 331 332 list_for_each_entry_safe(storage, stmp, storages, list_map) { 333 bpf_cgroup_storage_unlink(storage); 334 bpf_cgroup_storage_free(storage); 335 } 336 337 mutex_unlock(&cgroup_mutex); 338 339 WARN_ON(!RB_EMPTY_ROOT(&map->root)); 340 WARN_ON(!list_empty(&map->list)); 341 342 kfree(map); 343 } 344 345 static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) 346 { 347 return -EINVAL; 348 } 349 350 static int cgroup_storage_check_btf(const struct bpf_map *map, 351 const struct btf *btf, 352 const struct btf_type *key_type, 353 const struct btf_type *value_type) 354 { 355 if (attach_type_isolated(map)) { 356 struct btf_member *m; 357 u32 offset, size; 358 359 /* Key is expected to be of struct bpf_cgroup_storage_key type, 360 * which is: 361 * struct bpf_cgroup_storage_key { 362 * __u64 cgroup_inode_id; 363 * __u32 attach_type; 364 * }; 365 */ 366 367 /* 368 * Key_type must be a structure with two fields. 369 */ 370 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || 371 BTF_INFO_VLEN(key_type->info) != 2) 372 return -EINVAL; 373 374 /* 375 * The first field must be a 64 bit integer at 0 offset. 376 */ 377 m = (struct btf_member *)(key_type + 1); 378 size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); 379 if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) 380 return -EINVAL; 381 382 /* 383 * The second field must be a 32 bit integer at 64 bit offset. 384 */ 385 m++; 386 offset = offsetof(struct bpf_cgroup_storage_key, attach_type); 387 size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); 388 if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) 389 return -EINVAL; 390 } else { 391 u32 int_data; 392 393 /* 394 * Key is expected to be u64, which stores the cgroup_inode_id 395 */ 396 397 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 398 return -EINVAL; 399 400 int_data = *(u32 *)(key_type + 1); 401 if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) 402 return -EINVAL; 403 } 404 405 return 0; 406 } 407 408 static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, 409 struct seq_file *m) 410 { 411 enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); 412 struct bpf_cgroup_storage *storage; 413 int cpu; 414 415 rcu_read_lock(); 416 storage = cgroup_storage_lookup(map_to_storage(map), key, false); 417 if (!storage) { 418 rcu_read_unlock(); 419 return; 420 } 421 422 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 423 stype = cgroup_storage_type(map); 424 if (stype == BPF_CGROUP_STORAGE_SHARED) { 425 seq_puts(m, ": "); 426 btf_type_seq_show(map->btf, map->btf_value_type_id, 427 &READ_ONCE(storage->buf)->data[0], m); 428 seq_puts(m, "\n"); 429 } else { 430 seq_puts(m, ": {\n"); 431 for_each_possible_cpu(cpu) { 432 seq_printf(m, "\tcpu%d: ", cpu); 433 btf_type_seq_show(map->btf, map->btf_value_type_id, 434 per_cpu_ptr(storage->percpu_buf, cpu), 435 m); 436 seq_puts(m, "\n"); 437 } 438 seq_puts(m, "}\n"); 439 } 440 rcu_read_unlock(); 441 } 442 443 static int cgroup_storage_map_btf_id; 444 const struct bpf_map_ops cgroup_storage_map_ops = { 445 .map_alloc = cgroup_storage_map_alloc, 446 .map_free = cgroup_storage_map_free, 447 .map_get_next_key = cgroup_storage_get_next_key, 448 .map_lookup_elem = cgroup_storage_lookup_elem, 449 .map_update_elem = cgroup_storage_update_elem, 450 .map_delete_elem = cgroup_storage_delete_elem, 451 .map_check_btf = cgroup_storage_check_btf, 452 .map_seq_show_elem = cgroup_storage_seq_show_elem, 453 .map_btf_name = "bpf_cgroup_storage_map", 454 .map_btf_id = &cgroup_storage_map_btf_id, 455 }; 456 457 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) 458 { 459 enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); 460 461 if (aux->cgroup_storage[stype] && 462 aux->cgroup_storage[stype] != _map) 463 return -EBUSY; 464 465 aux->cgroup_storage[stype] = _map; 466 return 0; 467 } 468 469 static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) 470 { 471 size_t size; 472 473 if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { 474 size = sizeof(struct bpf_storage_buffer) + map->value_size; 475 *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, 476 PAGE_SIZE) >> PAGE_SHIFT; 477 } else { 478 size = map->value_size; 479 *pages = round_up(round_up(size, 8) * num_possible_cpus(), 480 PAGE_SIZE) >> PAGE_SHIFT; 481 } 482 483 return size; 484 } 485 486 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, 487 enum bpf_cgroup_storage_type stype) 488 { 489 const gfp_t gfp = __GFP_ZERO | GFP_USER; 490 struct bpf_cgroup_storage *storage; 491 struct bpf_map *map; 492 size_t size; 493 u32 pages; 494 495 map = prog->aux->cgroup_storage[stype]; 496 if (!map) 497 return NULL; 498 499 size = bpf_cgroup_storage_calculate_size(map, &pages); 500 501 storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), 502 gfp, map->numa_node); 503 if (!storage) 504 goto enomem; 505 506 if (stype == BPF_CGROUP_STORAGE_SHARED) { 507 storage->buf = bpf_map_kmalloc_node(map, size, gfp, 508 map->numa_node); 509 if (!storage->buf) 510 goto enomem; 511 check_and_init_map_lock(map, storage->buf->data); 512 } else { 513 storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); 514 if (!storage->percpu_buf) 515 goto enomem; 516 } 517 518 storage->map = (struct bpf_cgroup_storage_map *)map; 519 520 return storage; 521 522 enomem: 523 kfree(storage); 524 return ERR_PTR(-ENOMEM); 525 } 526 527 static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) 528 { 529 struct bpf_cgroup_storage *storage = 530 container_of(rcu, struct bpf_cgroup_storage, rcu); 531 532 kfree(storage->buf); 533 kfree(storage); 534 } 535 536 static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) 537 { 538 struct bpf_cgroup_storage *storage = 539 container_of(rcu, struct bpf_cgroup_storage, rcu); 540 541 free_percpu(storage->percpu_buf); 542 kfree(storage); 543 } 544 545 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) 546 { 547 enum bpf_cgroup_storage_type stype; 548 struct bpf_map *map; 549 550 if (!storage) 551 return; 552 553 map = &storage->map->map; 554 stype = cgroup_storage_type(map); 555 if (stype == BPF_CGROUP_STORAGE_SHARED) 556 call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); 557 else 558 call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); 559 } 560 561 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, 562 struct cgroup *cgroup, 563 enum bpf_attach_type type) 564 { 565 struct bpf_cgroup_storage_map *map; 566 567 if (!storage) 568 return; 569 570 storage->key.attach_type = type; 571 storage->key.cgroup_inode_id = cgroup_id(cgroup); 572 573 map = storage->map; 574 575 spin_lock_bh(&map->lock); 576 WARN_ON(cgroup_storage_insert(map, storage)); 577 list_add(&storage->list_map, &map->list); 578 list_add(&storage->list_cg, &cgroup->bpf.storages); 579 spin_unlock_bh(&map->lock); 580 } 581 582 void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) 583 { 584 struct bpf_cgroup_storage_map *map; 585 struct rb_root *root; 586 587 if (!storage) 588 return; 589 590 map = storage->map; 591 592 spin_lock_bh(&map->lock); 593 root = &map->root; 594 rb_erase(&storage->node, root); 595 596 list_del(&storage->list_map); 597 list_del(&storage->list_cg); 598 spin_unlock_bh(&map->lock); 599 } 600 601 #endif 602