1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/bpf-cgroup.h> 3 #include <linux/bpf.h> 4 #include <linux/bpf_local_storage.h> 5 #include <linux/btf.h> 6 #include <linux/bug.h> 7 #include <linux/filter.h> 8 #include <linux/mm.h> 9 #include <linux/rbtree.h> 10 #include <linux/slab.h> 11 #include <uapi/linux/btf.h> 12 #include <linux/btf_ids.h> 13 14 #ifdef CONFIG_CGROUP_BPF 15 16 #include "../cgroup/cgroup-internal.h" 17 18 #define LOCAL_STORAGE_CREATE_FLAG_MASK \ 19 (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) 20 21 struct bpf_cgroup_storage_map { 22 struct bpf_map map; 23 24 spinlock_t lock; 25 struct rb_root root; 26 struct list_head list; 27 }; 28 29 static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) 30 { 31 return container_of(map, struct bpf_cgroup_storage_map, map); 32 } 33 34 static bool attach_type_isolated(const struct bpf_map *map) 35 { 36 return map->key_size == sizeof(struct bpf_cgroup_storage_key); 37 } 38 39 static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, 40 const void *_key1, const void *_key2) 41 { 42 if (attach_type_isolated(&map->map)) { 43 const struct bpf_cgroup_storage_key *key1 = _key1; 44 const struct bpf_cgroup_storage_key *key2 = _key2; 45 46 if (key1->cgroup_inode_id < key2->cgroup_inode_id) 47 return -1; 48 else if (key1->cgroup_inode_id > key2->cgroup_inode_id) 49 return 1; 50 else if (key1->attach_type < key2->attach_type) 51 return -1; 52 else if (key1->attach_type > key2->attach_type) 53 return 1; 54 } else { 55 const __u64 *cgroup_inode_id1 = _key1; 56 const __u64 *cgroup_inode_id2 = _key2; 57 58 if (*cgroup_inode_id1 < *cgroup_inode_id2) 59 return -1; 60 else if (*cgroup_inode_id1 > *cgroup_inode_id2) 61 return 1; 62 } 63 return 0; 64 } 65 66 struct bpf_cgroup_storage * 67 cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, 68 void *key, bool locked) 69 { 70 struct rb_root *root = &map->root; 71 struct rb_node *node; 72 73 if (!locked) 74 spin_lock_bh(&map->lock); 75 76 node = root->rb_node; 77 while (node) { 78 struct bpf_cgroup_storage *storage; 79 80 storage = container_of(node, struct bpf_cgroup_storage, node); 81 82 switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { 83 case -1: 84 node = node->rb_left; 85 break; 86 case 1: 87 node = node->rb_right; 88 break; 89 default: 90 if (!locked) 91 spin_unlock_bh(&map->lock); 92 return storage; 93 } 94 } 95 96 if (!locked) 97 spin_unlock_bh(&map->lock); 98 99 return NULL; 100 } 101 102 static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, 103 struct bpf_cgroup_storage *storage) 104 { 105 struct rb_root *root = &map->root; 106 struct rb_node **new = &(root->rb_node), *parent = NULL; 107 108 while (*new) { 109 struct bpf_cgroup_storage *this; 110 111 this = container_of(*new, struct bpf_cgroup_storage, node); 112 113 parent = *new; 114 switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { 115 case -1: 116 new = &((*new)->rb_left); 117 break; 118 case 1: 119 new = &((*new)->rb_right); 120 break; 121 default: 122 return -EEXIST; 123 } 124 } 125 126 rb_link_node(&storage->node, parent, new); 127 rb_insert_color(&storage->node, root); 128 129 return 0; 130 } 131 132 static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) 133 { 134 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 135 struct bpf_cgroup_storage *storage; 136 137 storage = cgroup_storage_lookup(map, key, false); 138 if (!storage) 139 return NULL; 140 141 return &READ_ONCE(storage->buf)->data[0]; 142 } 143 144 static long cgroup_storage_update_elem(struct bpf_map *map, void *key, 145 void *value, u64 flags) 146 { 147 struct bpf_cgroup_storage *storage; 148 struct bpf_storage_buffer *new; 149 150 if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) 151 return -EINVAL; 152 153 if (unlikely((flags & BPF_F_LOCK) && 154 !btf_record_has_field(map->record, BPF_SPIN_LOCK))) 155 return -EINVAL; 156 157 storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, 158 key, false); 159 if (!storage) 160 return -ENOENT; 161 162 if (flags & BPF_F_LOCK) { 163 copy_map_value_locked(map, storage->buf->data, value, false); 164 return 0; 165 } 166 167 new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), 168 __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, 169 map->numa_node); 170 if (!new) 171 return -ENOMEM; 172 173 memcpy(&new->data[0], value, map->value_size); 174 check_and_init_map_value(map, new->data); 175 176 new = xchg(&storage->buf, new); 177 kfree_rcu(new, rcu); 178 179 return 0; 180 } 181 182 int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, 183 void *value) 184 { 185 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 186 struct bpf_cgroup_storage *storage; 187 int cpu, off = 0; 188 u32 size; 189 190 rcu_read_lock(); 191 storage = cgroup_storage_lookup(map, key, false); 192 if (!storage) { 193 rcu_read_unlock(); 194 return -ENOENT; 195 } 196 197 /* per_cpu areas are zero-filled and bpf programs can only 198 * access 'value_size' of them, so copying rounded areas 199 * will not leak any kernel data 200 */ 201 size = round_up(_map->value_size, 8); 202 for_each_possible_cpu(cpu) { 203 bpf_long_memcpy(value + off, 204 per_cpu_ptr(storage->percpu_buf, cpu), size); 205 off += size; 206 } 207 rcu_read_unlock(); 208 return 0; 209 } 210 211 int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, 212 void *value, u64 map_flags) 213 { 214 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 215 struct bpf_cgroup_storage *storage; 216 int cpu, off = 0; 217 u32 size; 218 219 if (map_flags != BPF_ANY && map_flags != BPF_EXIST) 220 return -EINVAL; 221 222 rcu_read_lock(); 223 storage = cgroup_storage_lookup(map, key, false); 224 if (!storage) { 225 rcu_read_unlock(); 226 return -ENOENT; 227 } 228 229 /* the user space will provide round_up(value_size, 8) bytes that 230 * will be copied into per-cpu area. bpf programs can only access 231 * value_size of it. During lookup the same extra bytes will be 232 * returned or zeros which were zero-filled by percpu_alloc, 233 * so no kernel data leaks possible 234 */ 235 size = round_up(_map->value_size, 8); 236 for_each_possible_cpu(cpu) { 237 bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), 238 value + off, size); 239 off += size; 240 } 241 rcu_read_unlock(); 242 return 0; 243 } 244 245 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, 246 void *_next_key) 247 { 248 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 249 struct bpf_cgroup_storage *storage; 250 251 spin_lock_bh(&map->lock); 252 253 if (list_empty(&map->list)) 254 goto enoent; 255 256 if (key) { 257 storage = cgroup_storage_lookup(map, key, true); 258 if (!storage) 259 goto enoent; 260 261 storage = list_next_entry(storage, list_map); 262 if (!storage) 263 goto enoent; 264 } else { 265 storage = list_first_entry(&map->list, 266 struct bpf_cgroup_storage, list_map); 267 } 268 269 spin_unlock_bh(&map->lock); 270 271 if (attach_type_isolated(&map->map)) { 272 struct bpf_cgroup_storage_key *next = _next_key; 273 *next = storage->key; 274 } else { 275 __u64 *next = _next_key; 276 *next = storage->key.cgroup_inode_id; 277 } 278 return 0; 279 280 enoent: 281 spin_unlock_bh(&map->lock); 282 return -ENOENT; 283 } 284 285 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) 286 { 287 __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; 288 int numa_node = bpf_map_attr_numa_node(attr); 289 struct bpf_cgroup_storage_map *map; 290 291 /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu 292 * is the same as other local storages. 293 */ 294 if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 295 max_value_size = min_t(__u32, max_value_size, 296 PCPU_MIN_UNIT_SIZE); 297 298 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && 299 attr->key_size != sizeof(__u64)) 300 return ERR_PTR(-EINVAL); 301 302 if (attr->value_size == 0) 303 return ERR_PTR(-EINVAL); 304 305 if (attr->value_size > max_value_size) 306 return ERR_PTR(-E2BIG); 307 308 if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || 309 !bpf_map_flags_access_ok(attr->map_flags)) 310 return ERR_PTR(-EINVAL); 311 312 if (attr->max_entries) 313 /* max_entries is not used and enforced to be 0 */ 314 return ERR_PTR(-EINVAL); 315 316 map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node); 317 if (!map) 318 return ERR_PTR(-ENOMEM); 319 320 /* copy mandatory map attributes */ 321 bpf_map_init_from_attr(&map->map, attr); 322 323 spin_lock_init(&map->lock); 324 map->root = RB_ROOT; 325 INIT_LIST_HEAD(&map->list); 326 327 return &map->map; 328 } 329 330 static void cgroup_storage_map_free(struct bpf_map *_map) 331 { 332 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 333 struct list_head *storages = &map->list; 334 struct bpf_cgroup_storage *storage, *stmp; 335 336 mutex_lock(&cgroup_mutex); 337 338 list_for_each_entry_safe(storage, stmp, storages, list_map) { 339 bpf_cgroup_storage_unlink(storage); 340 bpf_cgroup_storage_free(storage); 341 } 342 343 mutex_unlock(&cgroup_mutex); 344 345 WARN_ON(!RB_EMPTY_ROOT(&map->root)); 346 WARN_ON(!list_empty(&map->list)); 347 348 bpf_map_area_free(map); 349 } 350 351 static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) 352 { 353 return -EINVAL; 354 } 355 356 static int cgroup_storage_check_btf(const struct bpf_map *map, 357 const struct btf *btf, 358 const struct btf_type *key_type, 359 const struct btf_type *value_type) 360 { 361 if (attach_type_isolated(map)) { 362 struct btf_member *m; 363 u32 offset, size; 364 365 /* Key is expected to be of struct bpf_cgroup_storage_key type, 366 * which is: 367 * struct bpf_cgroup_storage_key { 368 * __u64 cgroup_inode_id; 369 * __u32 attach_type; 370 * }; 371 */ 372 373 /* 374 * Key_type must be a structure with two fields. 375 */ 376 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || 377 BTF_INFO_VLEN(key_type->info) != 2) 378 return -EINVAL; 379 380 /* 381 * The first field must be a 64 bit integer at 0 offset. 382 */ 383 m = (struct btf_member *)(key_type + 1); 384 size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); 385 if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) 386 return -EINVAL; 387 388 /* 389 * The second field must be a 32 bit integer at 64 bit offset. 390 */ 391 m++; 392 offset = offsetof(struct bpf_cgroup_storage_key, attach_type); 393 size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); 394 if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) 395 return -EINVAL; 396 } else { 397 u32 int_data; 398 399 /* 400 * Key is expected to be u64, which stores the cgroup_inode_id 401 */ 402 403 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 404 return -EINVAL; 405 406 int_data = *(u32 *)(key_type + 1); 407 if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) 408 return -EINVAL; 409 } 410 411 return 0; 412 } 413 414 static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, 415 struct seq_file *m) 416 { 417 enum bpf_cgroup_storage_type stype; 418 struct bpf_cgroup_storage *storage; 419 int cpu; 420 421 rcu_read_lock(); 422 storage = cgroup_storage_lookup(map_to_storage(map), key, false); 423 if (!storage) { 424 rcu_read_unlock(); 425 return; 426 } 427 428 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 429 stype = cgroup_storage_type(map); 430 if (stype == BPF_CGROUP_STORAGE_SHARED) { 431 seq_puts(m, ": "); 432 btf_type_seq_show(map->btf, map->btf_value_type_id, 433 &READ_ONCE(storage->buf)->data[0], m); 434 seq_puts(m, "\n"); 435 } else { 436 seq_puts(m, ": {\n"); 437 for_each_possible_cpu(cpu) { 438 seq_printf(m, "\tcpu%d: ", cpu); 439 btf_type_seq_show(map->btf, map->btf_value_type_id, 440 per_cpu_ptr(storage->percpu_buf, cpu), 441 m); 442 seq_puts(m, "\n"); 443 } 444 seq_puts(m, "}\n"); 445 } 446 rcu_read_unlock(); 447 } 448 449 static u64 cgroup_storage_map_usage(const struct bpf_map *map) 450 { 451 /* Currently the dynamically allocated elements are not counted. */ 452 return sizeof(struct bpf_cgroup_storage_map); 453 } 454 455 BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, 456 bpf_cgroup_storage_map) 457 const struct bpf_map_ops cgroup_storage_map_ops = { 458 .map_alloc = cgroup_storage_map_alloc, 459 .map_free = cgroup_storage_map_free, 460 .map_get_next_key = cgroup_storage_get_next_key, 461 .map_lookup_elem = cgroup_storage_lookup_elem, 462 .map_update_elem = cgroup_storage_update_elem, 463 .map_delete_elem = cgroup_storage_delete_elem, 464 .map_check_btf = cgroup_storage_check_btf, 465 .map_seq_show_elem = cgroup_storage_seq_show_elem, 466 .map_mem_usage = cgroup_storage_map_usage, 467 .map_btf_id = &cgroup_storage_map_btf_ids[0], 468 }; 469 470 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) 471 { 472 enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); 473 474 if (aux->cgroup_storage[stype] && 475 aux->cgroup_storage[stype] != _map) 476 return -EBUSY; 477 478 aux->cgroup_storage[stype] = _map; 479 return 0; 480 } 481 482 static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) 483 { 484 size_t size; 485 486 if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { 487 size = sizeof(struct bpf_storage_buffer) + map->value_size; 488 *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, 489 PAGE_SIZE) >> PAGE_SHIFT; 490 } else { 491 size = map->value_size; 492 *pages = round_up(round_up(size, 8) * num_possible_cpus(), 493 PAGE_SIZE) >> PAGE_SHIFT; 494 } 495 496 return size; 497 } 498 499 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, 500 enum bpf_cgroup_storage_type stype) 501 { 502 const gfp_t gfp = __GFP_ZERO | GFP_USER; 503 struct bpf_cgroup_storage *storage; 504 struct bpf_map *map; 505 size_t size; 506 u32 pages; 507 508 map = prog->aux->cgroup_storage[stype]; 509 if (!map) 510 return NULL; 511 512 size = bpf_cgroup_storage_calculate_size(map, &pages); 513 514 storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), 515 gfp, map->numa_node); 516 if (!storage) 517 goto enomem; 518 519 if (stype == BPF_CGROUP_STORAGE_SHARED) { 520 storage->buf = bpf_map_kmalloc_node(map, size, gfp, 521 map->numa_node); 522 if (!storage->buf) 523 goto enomem; 524 check_and_init_map_value(map, storage->buf->data); 525 } else { 526 storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); 527 if (!storage->percpu_buf) 528 goto enomem; 529 } 530 531 storage->map = (struct bpf_cgroup_storage_map *)map; 532 533 return storage; 534 535 enomem: 536 kfree(storage); 537 return ERR_PTR(-ENOMEM); 538 } 539 540 static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) 541 { 542 struct bpf_cgroup_storage *storage = 543 container_of(rcu, struct bpf_cgroup_storage, rcu); 544 545 kfree(storage->buf); 546 kfree(storage); 547 } 548 549 static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) 550 { 551 struct bpf_cgroup_storage *storage = 552 container_of(rcu, struct bpf_cgroup_storage, rcu); 553 554 free_percpu(storage->percpu_buf); 555 kfree(storage); 556 } 557 558 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) 559 { 560 enum bpf_cgroup_storage_type stype; 561 struct bpf_map *map; 562 563 if (!storage) 564 return; 565 566 map = &storage->map->map; 567 stype = cgroup_storage_type(map); 568 if (stype == BPF_CGROUP_STORAGE_SHARED) 569 call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); 570 else 571 call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); 572 } 573 574 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, 575 struct cgroup *cgroup, 576 enum bpf_attach_type type) 577 { 578 struct bpf_cgroup_storage_map *map; 579 580 if (!storage) 581 return; 582 583 storage->key.attach_type = type; 584 storage->key.cgroup_inode_id = cgroup_id(cgroup); 585 586 map = storage->map; 587 588 spin_lock_bh(&map->lock); 589 WARN_ON(cgroup_storage_insert(map, storage)); 590 list_add(&storage->list_map, &map->list); 591 list_add(&storage->list_cg, &cgroup->bpf.storages); 592 spin_unlock_bh(&map->lock); 593 } 594 595 void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) 596 { 597 struct bpf_cgroup_storage_map *map; 598 struct rb_root *root; 599 600 if (!storage) 601 return; 602 603 map = storage->map; 604 605 spin_lock_bh(&map->lock); 606 root = &map->root; 607 rb_erase(&storage->node, root); 608 609 list_del(&storage->list_map); 610 list_del(&storage->list_cg); 611 spin_unlock_bh(&map->lock); 612 } 613 614 #endif 615