1 //SPDX-License-Identifier: GPL-2.0 2 #include <linux/bpf-cgroup.h> 3 #include <linux/bpf.h> 4 #include <linux/btf.h> 5 #include <linux/bug.h> 6 #include <linux/filter.h> 7 #include <linux/mm.h> 8 #include <linux/rbtree.h> 9 #include <linux/slab.h> 10 #include <uapi/linux/btf.h> 11 12 DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); 13 14 #ifdef CONFIG_CGROUP_BPF 15 16 #include "../cgroup/cgroup-internal.h" 17 18 #define LOCAL_STORAGE_CREATE_FLAG_MASK \ 19 (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) 20 21 struct bpf_cgroup_storage_map { 22 struct bpf_map map; 23 24 spinlock_t lock; 25 struct rb_root root; 26 struct list_head list; 27 }; 28 29 static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) 30 { 31 return container_of(map, struct bpf_cgroup_storage_map, map); 32 } 33 34 static bool attach_type_isolated(const struct bpf_map *map) 35 { 36 return map->key_size == sizeof(struct bpf_cgroup_storage_key); 37 } 38 39 static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, 40 const void *_key1, const void *_key2) 41 { 42 if (attach_type_isolated(&map->map)) { 43 const struct bpf_cgroup_storage_key *key1 = _key1; 44 const struct bpf_cgroup_storage_key *key2 = _key2; 45 46 if (key1->cgroup_inode_id < key2->cgroup_inode_id) 47 return -1; 48 else if (key1->cgroup_inode_id > key2->cgroup_inode_id) 49 return 1; 50 else if (key1->attach_type < key2->attach_type) 51 return -1; 52 else if (key1->attach_type > key2->attach_type) 53 return 1; 54 } else { 55 const __u64 *cgroup_inode_id1 = _key1; 56 const __u64 *cgroup_inode_id2 = _key2; 57 58 if (*cgroup_inode_id1 < *cgroup_inode_id2) 59 return -1; 60 else if (*cgroup_inode_id1 > *cgroup_inode_id2) 61 return 1; 62 } 63 return 0; 64 } 65 66 struct bpf_cgroup_storage * 67 cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, 68 void *key, bool locked) 69 { 70 struct rb_root *root = &map->root; 71 struct rb_node *node; 72 73 if (!locked) 74 spin_lock_bh(&map->lock); 75 76 node = root->rb_node; 77 while (node) { 78 struct bpf_cgroup_storage *storage; 79 80 storage = container_of(node, struct bpf_cgroup_storage, node); 81 82 switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { 83 case -1: 84 node = node->rb_left; 85 break; 86 case 1: 87 node = node->rb_right; 88 break; 89 default: 90 if (!locked) 91 spin_unlock_bh(&map->lock); 92 return storage; 93 } 94 } 95 96 if (!locked) 97 spin_unlock_bh(&map->lock); 98 99 return NULL; 100 } 101 102 static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, 103 struct bpf_cgroup_storage *storage) 104 { 105 struct rb_root *root = &map->root; 106 struct rb_node **new = &(root->rb_node), *parent = NULL; 107 108 while (*new) { 109 struct bpf_cgroup_storage *this; 110 111 this = container_of(*new, struct bpf_cgroup_storage, node); 112 113 parent = *new; 114 switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { 115 case -1: 116 new = &((*new)->rb_left); 117 break; 118 case 1: 119 new = &((*new)->rb_right); 120 break; 121 default: 122 return -EEXIST; 123 } 124 } 125 126 rb_link_node(&storage->node, parent, new); 127 rb_insert_color(&storage->node, root); 128 129 return 0; 130 } 131 132 static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) 133 { 134 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 135 struct bpf_cgroup_storage *storage; 136 137 storage = cgroup_storage_lookup(map, key, false); 138 if (!storage) 139 return NULL; 140 141 return &READ_ONCE(storage->buf)->data[0]; 142 } 143 144 static int cgroup_storage_update_elem(struct bpf_map *map, void *key, 145 void *value, u64 flags) 146 { 147 struct bpf_cgroup_storage *storage; 148 struct bpf_storage_buffer *new; 149 150 if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) 151 return -EINVAL; 152 153 if (unlikely((flags & BPF_F_LOCK) && 154 !map_value_has_spin_lock(map))) 155 return -EINVAL; 156 157 storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, 158 key, false); 159 if (!storage) 160 return -ENOENT; 161 162 if (flags & BPF_F_LOCK) { 163 copy_map_value_locked(map, storage->buf->data, value, false); 164 return 0; 165 } 166 167 new = kmalloc_node(sizeof(struct bpf_storage_buffer) + 168 map->value_size, 169 __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, 170 map->numa_node); 171 if (!new) 172 return -ENOMEM; 173 174 memcpy(&new->data[0], value, map->value_size); 175 check_and_init_map_lock(map, new->data); 176 177 new = xchg(&storage->buf, new); 178 kfree_rcu(new, rcu); 179 180 return 0; 181 } 182 183 int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, 184 void *value) 185 { 186 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 187 struct bpf_cgroup_storage *storage; 188 int cpu, off = 0; 189 u32 size; 190 191 rcu_read_lock(); 192 storage = cgroup_storage_lookup(map, key, false); 193 if (!storage) { 194 rcu_read_unlock(); 195 return -ENOENT; 196 } 197 198 /* per_cpu areas are zero-filled and bpf programs can only 199 * access 'value_size' of them, so copying rounded areas 200 * will not leak any kernel data 201 */ 202 size = round_up(_map->value_size, 8); 203 for_each_possible_cpu(cpu) { 204 bpf_long_memcpy(value + off, 205 per_cpu_ptr(storage->percpu_buf, cpu), size); 206 off += size; 207 } 208 rcu_read_unlock(); 209 return 0; 210 } 211 212 int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, 213 void *value, u64 map_flags) 214 { 215 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 216 struct bpf_cgroup_storage *storage; 217 int cpu, off = 0; 218 u32 size; 219 220 if (map_flags != BPF_ANY && map_flags != BPF_EXIST) 221 return -EINVAL; 222 223 rcu_read_lock(); 224 storage = cgroup_storage_lookup(map, key, false); 225 if (!storage) { 226 rcu_read_unlock(); 227 return -ENOENT; 228 } 229 230 /* the user space will provide round_up(value_size, 8) bytes that 231 * will be copied into per-cpu area. bpf programs can only access 232 * value_size of it. During lookup the same extra bytes will be 233 * returned or zeros which were zero-filled by percpu_alloc, 234 * so no kernel data leaks possible 235 */ 236 size = round_up(_map->value_size, 8); 237 for_each_possible_cpu(cpu) { 238 bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), 239 value + off, size); 240 off += size; 241 } 242 rcu_read_unlock(); 243 return 0; 244 } 245 246 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, 247 void *_next_key) 248 { 249 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 250 struct bpf_cgroup_storage *storage; 251 252 spin_lock_bh(&map->lock); 253 254 if (list_empty(&map->list)) 255 goto enoent; 256 257 if (key) { 258 storage = cgroup_storage_lookup(map, key, true); 259 if (!storage) 260 goto enoent; 261 262 storage = list_next_entry(storage, list_map); 263 if (!storage) 264 goto enoent; 265 } else { 266 storage = list_first_entry(&map->list, 267 struct bpf_cgroup_storage, list_map); 268 } 269 270 spin_unlock_bh(&map->lock); 271 272 if (attach_type_isolated(&map->map)) { 273 struct bpf_cgroup_storage_key *next = _next_key; 274 *next = storage->key; 275 } else { 276 __u64 *next = _next_key; 277 *next = storage->key.cgroup_inode_id; 278 } 279 return 0; 280 281 enoent: 282 spin_unlock_bh(&map->lock); 283 return -ENOENT; 284 } 285 286 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) 287 { 288 int numa_node = bpf_map_attr_numa_node(attr); 289 struct bpf_cgroup_storage_map *map; 290 struct bpf_map_memory mem; 291 int ret; 292 293 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && 294 attr->key_size != sizeof(__u64)) 295 return ERR_PTR(-EINVAL); 296 297 if (attr->value_size == 0) 298 return ERR_PTR(-EINVAL); 299 300 if (attr->value_size > PAGE_SIZE) 301 return ERR_PTR(-E2BIG); 302 303 if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || 304 !bpf_map_flags_access_ok(attr->map_flags)) 305 return ERR_PTR(-EINVAL); 306 307 if (attr->max_entries) 308 /* max_entries is not used and enforced to be 0 */ 309 return ERR_PTR(-EINVAL); 310 311 ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); 312 if (ret < 0) 313 return ERR_PTR(ret); 314 315 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 316 __GFP_ZERO | GFP_USER, numa_node); 317 if (!map) { 318 bpf_map_charge_finish(&mem); 319 return ERR_PTR(-ENOMEM); 320 } 321 322 bpf_map_charge_move(&map->map.memory, &mem); 323 324 /* copy mandatory map attributes */ 325 bpf_map_init_from_attr(&map->map, attr); 326 327 spin_lock_init(&map->lock); 328 map->root = RB_ROOT; 329 INIT_LIST_HEAD(&map->list); 330 331 return &map->map; 332 } 333 334 static void cgroup_storage_map_free(struct bpf_map *_map) 335 { 336 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 337 struct list_head *storages = &map->list; 338 struct bpf_cgroup_storage *storage, *stmp; 339 340 mutex_lock(&cgroup_mutex); 341 342 list_for_each_entry_safe(storage, stmp, storages, list_map) { 343 bpf_cgroup_storage_unlink(storage); 344 bpf_cgroup_storage_free(storage); 345 } 346 347 mutex_unlock(&cgroup_mutex); 348 349 WARN_ON(!RB_EMPTY_ROOT(&map->root)); 350 WARN_ON(!list_empty(&map->list)); 351 352 kfree(map); 353 } 354 355 static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) 356 { 357 return -EINVAL; 358 } 359 360 static int cgroup_storage_check_btf(const struct bpf_map *map, 361 const struct btf *btf, 362 const struct btf_type *key_type, 363 const struct btf_type *value_type) 364 { 365 if (attach_type_isolated(map)) { 366 struct btf_member *m; 367 u32 offset, size; 368 369 /* Key is expected to be of struct bpf_cgroup_storage_key type, 370 * which is: 371 * struct bpf_cgroup_storage_key { 372 * __u64 cgroup_inode_id; 373 * __u32 attach_type; 374 * }; 375 */ 376 377 /* 378 * Key_type must be a structure with two fields. 379 */ 380 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || 381 BTF_INFO_VLEN(key_type->info) != 2) 382 return -EINVAL; 383 384 /* 385 * The first field must be a 64 bit integer at 0 offset. 386 */ 387 m = (struct btf_member *)(key_type + 1); 388 size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); 389 if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) 390 return -EINVAL; 391 392 /* 393 * The second field must be a 32 bit integer at 64 bit offset. 394 */ 395 m++; 396 offset = offsetof(struct bpf_cgroup_storage_key, attach_type); 397 size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); 398 if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) 399 return -EINVAL; 400 } else { 401 u32 int_data; 402 403 /* 404 * Key is expected to be u64, which stores the cgroup_inode_id 405 */ 406 407 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 408 return -EINVAL; 409 410 int_data = *(u32 *)(key_type + 1); 411 if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) 412 return -EINVAL; 413 } 414 415 return 0; 416 } 417 418 static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, 419 struct seq_file *m) 420 { 421 enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); 422 struct bpf_cgroup_storage *storage; 423 int cpu; 424 425 rcu_read_lock(); 426 storage = cgroup_storage_lookup(map_to_storage(map), key, false); 427 if (!storage) { 428 rcu_read_unlock(); 429 return; 430 } 431 432 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 433 stype = cgroup_storage_type(map); 434 if (stype == BPF_CGROUP_STORAGE_SHARED) { 435 seq_puts(m, ": "); 436 btf_type_seq_show(map->btf, map->btf_value_type_id, 437 &READ_ONCE(storage->buf)->data[0], m); 438 seq_puts(m, "\n"); 439 } else { 440 seq_puts(m, ": {\n"); 441 for_each_possible_cpu(cpu) { 442 seq_printf(m, "\tcpu%d: ", cpu); 443 btf_type_seq_show(map->btf, map->btf_value_type_id, 444 per_cpu_ptr(storage->percpu_buf, cpu), 445 m); 446 seq_puts(m, "\n"); 447 } 448 seq_puts(m, "}\n"); 449 } 450 rcu_read_unlock(); 451 } 452 453 static int cgroup_storage_map_btf_id; 454 const struct bpf_map_ops cgroup_storage_map_ops = { 455 .map_alloc = cgroup_storage_map_alloc, 456 .map_free = cgroup_storage_map_free, 457 .map_get_next_key = cgroup_storage_get_next_key, 458 .map_lookup_elem = cgroup_storage_lookup_elem, 459 .map_update_elem = cgroup_storage_update_elem, 460 .map_delete_elem = cgroup_storage_delete_elem, 461 .map_check_btf = cgroup_storage_check_btf, 462 .map_seq_show_elem = cgroup_storage_seq_show_elem, 463 .map_btf_name = "bpf_cgroup_storage_map", 464 .map_btf_id = &cgroup_storage_map_btf_id, 465 }; 466 467 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) 468 { 469 enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); 470 471 if (aux->cgroup_storage[stype] && 472 aux->cgroup_storage[stype] != _map) 473 return -EBUSY; 474 475 aux->cgroup_storage[stype] = _map; 476 return 0; 477 } 478 479 static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) 480 { 481 size_t size; 482 483 if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { 484 size = sizeof(struct bpf_storage_buffer) + map->value_size; 485 *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, 486 PAGE_SIZE) >> PAGE_SHIFT; 487 } else { 488 size = map->value_size; 489 *pages = round_up(round_up(size, 8) * num_possible_cpus(), 490 PAGE_SIZE) >> PAGE_SHIFT; 491 } 492 493 return size; 494 } 495 496 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, 497 enum bpf_cgroup_storage_type stype) 498 { 499 struct bpf_cgroup_storage *storage; 500 struct bpf_map *map; 501 gfp_t flags; 502 size_t size; 503 u32 pages; 504 505 map = prog->aux->cgroup_storage[stype]; 506 if (!map) 507 return NULL; 508 509 size = bpf_cgroup_storage_calculate_size(map, &pages); 510 511 if (bpf_map_charge_memlock(map, pages)) 512 return ERR_PTR(-EPERM); 513 514 storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), 515 __GFP_ZERO | GFP_USER, map->numa_node); 516 if (!storage) 517 goto enomem; 518 519 flags = __GFP_ZERO | GFP_USER; 520 521 if (stype == BPF_CGROUP_STORAGE_SHARED) { 522 storage->buf = kmalloc_node(size, flags, map->numa_node); 523 if (!storage->buf) 524 goto enomem; 525 check_and_init_map_lock(map, storage->buf->data); 526 } else { 527 storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); 528 if (!storage->percpu_buf) 529 goto enomem; 530 } 531 532 storage->map = (struct bpf_cgroup_storage_map *)map; 533 534 return storage; 535 536 enomem: 537 bpf_map_uncharge_memlock(map, pages); 538 kfree(storage); 539 return ERR_PTR(-ENOMEM); 540 } 541 542 static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) 543 { 544 struct bpf_cgroup_storage *storage = 545 container_of(rcu, struct bpf_cgroup_storage, rcu); 546 547 kfree(storage->buf); 548 kfree(storage); 549 } 550 551 static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) 552 { 553 struct bpf_cgroup_storage *storage = 554 container_of(rcu, struct bpf_cgroup_storage, rcu); 555 556 free_percpu(storage->percpu_buf); 557 kfree(storage); 558 } 559 560 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) 561 { 562 enum bpf_cgroup_storage_type stype; 563 struct bpf_map *map; 564 u32 pages; 565 566 if (!storage) 567 return; 568 569 map = &storage->map->map; 570 571 bpf_cgroup_storage_calculate_size(map, &pages); 572 bpf_map_uncharge_memlock(map, pages); 573 574 stype = cgroup_storage_type(map); 575 if (stype == BPF_CGROUP_STORAGE_SHARED) 576 call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); 577 else 578 call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); 579 } 580 581 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, 582 struct cgroup *cgroup, 583 enum bpf_attach_type type) 584 { 585 struct bpf_cgroup_storage_map *map; 586 587 if (!storage) 588 return; 589 590 storage->key.attach_type = type; 591 storage->key.cgroup_inode_id = cgroup_id(cgroup); 592 593 map = storage->map; 594 595 spin_lock_bh(&map->lock); 596 WARN_ON(cgroup_storage_insert(map, storage)); 597 list_add(&storage->list_map, &map->list); 598 list_add(&storage->list_cg, &cgroup->bpf.storages); 599 spin_unlock_bh(&map->lock); 600 } 601 602 void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) 603 { 604 struct bpf_cgroup_storage_map *map; 605 struct rb_root *root; 606 607 if (!storage) 608 return; 609 610 map = storage->map; 611 612 spin_lock_bh(&map->lock); 613 root = &map->root; 614 rb_erase(&storage->node, root); 615 616 list_del(&storage->list_map); 617 list_del(&storage->list_cg); 618 spin_unlock_bh(&map->lock); 619 } 620 621 #endif 622