// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>

enum bpf_struct_ops_state {
	BPF_STRUCT_OPS_STATE_INIT,
	BPF_STRUCT_OPS_STATE_INUSE,
	BPF_STRUCT_OPS_STATE_TOBEFREE,
};

#define BPF_STRUCT_OPS_COMMON_VALUE			\
	refcount_t refcnt;				\
	enum bpf_struct_ops_state state

struct bpf_struct_ops_value {
	BPF_STRUCT_OPS_COMMON_VALUE;
	char data[] ____cacheline_aligned_in_smp;
};

struct bpf_struct_ops_map {
	struct bpf_map map;
	struct rcu_head rcu;
	const struct bpf_struct_ops *st_ops;
	/* protect map_update */
	struct mutex lock;
	/* progs has all the bpf_progs that are populated
	 * to the func ptrs of the kernel's struct
	 * (in kvalue.data).
	 */
	struct bpf_prog **progs;
	/* image is a page that has all the trampolines
	 * that store the func args before calling the bpf_prog.
	 * A PAGE_SIZE "image" is enough to store all the trampolines
	 * for "progs[]".
	 */
	void *image;
	/* uvalue->data stores the kernel struct
	 * (e.g. tcp_congestion_ops) that is more useful
	 * to userspace than the kvalue.  For example,
	 * the bpf_prog's id is stored instead of the kernel
	 * address of a func ptr.
	 */
	struct bpf_struct_ops_value *uvalue;
	/* kvalue.data stores the actual kernel's struct
	 * (e.g. tcp_congestion_ops) that will be
	 * registered to the kernel subsystem.
	 */
	struct bpf_struct_ops_value kvalue;
};

#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)

/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
 * the map's value exposed to the userspace and its btf-type-id is
 * stored at the map->btf_vmlinux_value_type_id.
 */
#define BPF_STRUCT_OPS_TYPE(_name)				\
extern struct bpf_struct_ops bpf_##_name;			\
								\
struct bpf_struct_ops_##_name {					\
	BPF_STRUCT_OPS_COMMON_VALUE;				\
	struct _name data ____cacheline_aligned_in_smp;	\
};
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
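
/* For illustration only (not compiled): with _name == tcp_congestion_ops,
 * the BPF_STRUCT_OPS_TYPE() macro above expands roughly to
 *
 *	extern struct bpf_struct_ops bpf_tcp_congestion_ops;
 *
 *	struct bpf_struct_ops_tcp_congestion_ops {
 *		refcount_t refcnt;
 *		enum bpf_struct_ops_state state;
 *		struct tcp_congestion_ops data ____cacheline_aligned_in_smp;
 *	};
 *
 * i.e. the VALUE_PREFIX-ed struct that userspace reads and writes as the
 * map value.
 */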

enum {
#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
	__NR_BPF_STRUCT_OPS_TYPE,
};

static struct bpf_struct_ops * const bpf_struct_ops[] = {
#define BPF_STRUCT_OPS_TYPE(_name)				\
	[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
};

const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
	.test_run = bpf_struct_ops_test_run,
#endif
};

static const struct btf_type *module_type;
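
/* bpf_struct_ops_init() runs when btf_vmlinux is parsed.  For each
 * struct_ops known to the kernel it resolves the BTF ids of the ops type
 * and of its VALUE_PREFIX-ed value type, distills a func model for every
 * func ptr member and runs the subsystem's ->init().  Failures are only
 * pr_warn()'ed and the affected struct_ops is skipped (left unresolved),
 * so it cannot be used later.
 */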
void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
{
	s32 type_id, value_id, module_id;
	const struct btf_member *member;
	struct bpf_struct_ops *st_ops;
	const struct btf_type *t;
	char value_name[128];
	const char *mname;
	u32 i, j;

	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

	module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
	if (module_id < 0) {
		pr_warn("Cannot find struct module in btf_vmlinux\n");
		return;
	}
	module_type = btf_type_by_id(btf, module_id);

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		st_ops = bpf_struct_ops[i];

		if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
		    sizeof(value_name)) {
			pr_warn("struct_ops name %s is too long\n",
				st_ops->name);
			continue;
		}
		sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);

		value_id = btf_find_by_name_kind(btf, value_name,
						 BTF_KIND_STRUCT);
		if (value_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				value_name);
			continue;
		}

		type_id = btf_find_by_name_kind(btf, st_ops->name,
						BTF_KIND_STRUCT);
		if (type_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				st_ops->name);
			continue;
		}
		t = btf_type_by_id(btf, type_id);
		if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
			pr_warn("Cannot support #%u members in struct %s\n",
				btf_type_vlen(t), st_ops->name);
			continue;
		}

		for_each_member(j, t, member) {
			const struct btf_type *func_proto;

			mname = btf_name_by_offset(btf, member->name_off);
			if (!*mname) {
				pr_warn("anon member in struct %s is not supported\n",
					st_ops->name);
				break;
			}

			if (__btf_member_bitfield_size(t, member)) {
				pr_warn("bit field member %s in struct %s is not supported\n",
					mname, st_ops->name);
				break;
			}

			func_proto = btf_type_resolve_func_ptr(btf,
							       member->type,
							       NULL);
			if (func_proto &&
			    btf_distill_func_proto(log, btf,
						   func_proto, mname,
						   &st_ops->func_models[j])) {
				pr_warn("Error in parsing func ptr %s in struct %s\n",
					mname, st_ops->name);
				break;
			}
		}

		if (j == btf_type_vlen(t)) {
			if (st_ops->init(btf)) {
				pr_warn("Error in init bpf_struct_ops %s\n",
					st_ops->name);
			} else {
				st_ops->type_id = type_id;
				st_ops->type = t;
				st_ops->value_id = value_id;
				st_ops->value_type = btf_type_by_id(btf,
								    value_id);
			}
		}
	}
}

extern struct btf *btf_vmlinux;

static const struct bpf_struct_ops *
bpf_struct_ops_find_value(u32 value_id)
{
	unsigned int i;

	if (!value_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->value_id == value_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
{
	unsigned int i;

	if (!type_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->type_id == type_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
					   void *next_key)
{
	if (key && *(u32 *)key == 0)
		return -ENOENT;

	*(u32 *)next_key = 0;
	return 0;
}

int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	enum bpf_struct_ops_state state;

	if (unlikely(*(u32 *)key != 0))
		return -ENOENT;

	kvalue = &st_map->kvalue;
	/* Pair with smp_store_release() during map_update */
	state = smp_load_acquire(&kvalue->state);
	if (state == BPF_STRUCT_OPS_STATE_INIT) {
		memset(value, 0, map->value_size);
		return 0;
	}

	/* No lock is needed.  state and refcnt do not need
	 * to be updated together under atomic context.
	 */
	uvalue = value;
	memcpy(uvalue, st_map->uvalue, map->value_size);
	uvalue->state = state;
	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));

	return 0;
}

static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
	const struct btf_type *t = st_map->st_ops->type;
	u32 i;

	for (i = 0; i < btf_type_vlen(t); i++) {
		if (st_map->progs[i]) {
			bpf_prog_put(st_map->progs[i]);
			st_map->progs[i] = NULL;
		}
	}
}

static int check_zero_holes(const struct btf_type *t, void *data)
{
	const struct btf_member *member;
	u32 i, moff, msize, prev_mend = 0;
	const struct btf_type *mtype;

	for_each_member(i, t, member) {
		moff = __btf_member_bit_offset(t, member) / 8;
		if (moff > prev_mend &&
		    memchr_inv(data + prev_mend, 0, moff - prev_mend))
			return -EINVAL;

		mtype = btf_type_by_id(btf_vmlinux, member->type);
		mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
		if (IS_ERR(mtype))
			return PTR_ERR(mtype);
		prev_mend = moff + msize;
	}

	if (t->size > prev_mend &&
	    memchr_inv(data + prev_mend, 0, t->size - prev_mend))
		return -EINVAL;

	return 0;
}
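
/* Prepare, in @image, a trampoline for one struct_ops member: the
 * trampoline saves the kernel caller's args as described by @model and
 * then calls @prog as an FENTRY program.  When the member returns a value
 * (model->ret_size > 0), BPF_TRAMP_F_RET_FENTRY_RET makes the trampoline
 * return the prog's return value.  Returns the number of bytes used in
 * @image, or a negative errno.
 */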
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
				      struct bpf_prog *prog,
				      const struct btf_func_model *model,
				      void *image, void *image_end)
{
	u32 flags;

	tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
	tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
	return arch_prepare_bpf_trampoline(NULL, image, image_end,
					   model, flags, tprogs, NULL);
}

static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
					  void *value, u64 flags)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops *st_ops = st_map->st_ops;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	const struct btf_member *member;
	const struct btf_type *t = st_ops->type;
	struct bpf_tramp_progs *tprogs = NULL;
	void *udata, *kdata;
	int prog_fd, err = 0;
	void *image, *image_end;
	u32 i;

	if (flags)
		return -EINVAL;

	if (*(u32 *)key != 0)
		return -E2BIG;

	err = check_zero_holes(st_ops->value_type, value);
	if (err)
		return err;

	uvalue = value;
	err = check_zero_holes(t, uvalue->data);
	if (err)
		return err;

	if (uvalue->state || refcount_read(&uvalue->refcnt))
		return -EINVAL;

	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
	if (!tprogs)
		return -ENOMEM;

	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;

	mutex_lock(&st_map->lock);

	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
		err = -EBUSY;
		goto unlock;
	}

	memcpy(uvalue, value, map->value_size);

	udata = &uvalue->data;
	kdata = &kvalue->data;
	image = st_map->image;
	image_end = st_map->image + PAGE_SIZE;
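
	/* Populate kvalue.data member by member: a "struct module *" owner
	 * ptr is set to BPF_MODULE_OWNER, each func ptr gets a trampoline
	 * that calls the corresponding bpf_prog, and any member that
	 * ->init_member() does not handle must otherwise be zero.
	 */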
	for_each_member(i, t, member) {
		const struct btf_type *mtype, *ptype;
		struct bpf_prog *prog;
		u32 moff;

		moff = __btf_member_bit_offset(t, member) / 8;
		ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
		if (ptype == module_type) {
			if (*(void **)(udata + moff))
				goto reset_unlock;
			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
			continue;
		}

		err = st_ops->init_member(t, member, kdata, udata);
		if (err < 0)
			goto reset_unlock;

		/* The ->init_member() has handled this member */
		if (err > 0)
			continue;

		/* If st_ops->init_member does not handle it,
		 * we will only handle func ptrs and zeroed members
		 * here.  Reject everything else.
		 */

		/* All non-func-ptr members must be 0 */
		if (!ptype || !btf_type_is_func_proto(ptype)) {
			u32 msize;

			mtype = btf_type_by_id(btf_vmlinux, member->type);
			mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
			if (IS_ERR(mtype)) {
				err = PTR_ERR(mtype);
				goto reset_unlock;
			}

			if (memchr_inv(udata + moff, 0, msize)) {
				err = -EINVAL;
				goto reset_unlock;
			}

			continue;
		}

		prog_fd = (int)(*(unsigned long *)(udata + moff));
		/* Similar check to the attr->attach_prog_fd */
		if (!prog_fd)
			continue;

		prog = bpf_prog_get(prog_fd);
		if (IS_ERR(prog)) {
			err = PTR_ERR(prog);
			goto reset_unlock;
		}
		st_map->progs[i] = prog;

		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
		    prog->aux->attach_btf_id != st_ops->type_id ||
		    prog->expected_attach_type != i) {
			err = -EINVAL;
			goto reset_unlock;
		}

		err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
							&st_ops->func_models[i],
							image, image_end);
		if (err < 0)
			goto reset_unlock;

		*(void **)(kdata + moff) = image;
		image += err;

		/* put the prog id into udata */
		*(unsigned long *)(udata + moff) = prog->aux->id;
	}

	refcount_set(&kvalue->refcnt, 1);
	bpf_map_inc(map);

	set_memory_ro((long)st_map->image, 1);
	set_memory_x((long)st_map->image, 1);
	err = st_ops->reg(kdata);
	if (likely(!err)) {
		/* Pair with smp_load_acquire() during lookup_elem().
		 * It ensures the above udata updates (e.g. prog->aux->id)
		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
		 */
		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
		goto unlock;
	}

	/* Error during st_ops->reg().  It is very unlikely since
	 * the above init_member() should have caught it earlier
	 * before reg().  The only possibility is if there was a race
	 * in registering the struct_ops (under the same name) to
	 * a sub-system through different struct_ops maps.
	 */
	set_memory_nx((long)st_map->image, 1);
	set_memory_rw((long)st_map->image, 1);
	bpf_map_put(map);

reset_unlock:
	bpf_struct_ops_map_put_progs(st_map);
	memset(uvalue, 0, map->value_size);
	memset(kvalue, 0, map->value_size);
unlock:
	kfree(tprogs);
	mutex_unlock(&st_map->lock);
	return err;
}
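
/* Deleting the (only) element tears down a registered struct_ops: the map
 * moves from INUSE to TOBEFREE, kvalue.data is unregistered from the
 * subsystem and the registration ref taken at update time is dropped.
 * A map already marked TOBEFREE returns -EINPROGRESS; a never-registered
 * one returns -ENOENT.
 */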
static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
	enum bpf_struct_ops_state prev_state;
	struct bpf_struct_ops_map *st_map;

	st_map = (struct bpf_struct_ops_map *)map;
	prev_state = cmpxchg(&st_map->kvalue.state,
			     BPF_STRUCT_OPS_STATE_INUSE,
			     BPF_STRUCT_OPS_STATE_TOBEFREE);
	switch (prev_state) {
	case BPF_STRUCT_OPS_STATE_INUSE:
		st_map->st_ops->unreg(&st_map->kvalue.data);
		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
			bpf_map_put(map);
		return 0;
	case BPF_STRUCT_OPS_STATE_TOBEFREE:
		return -EINPROGRESS;
	case BPF_STRUCT_OPS_STATE_INIT:
		return -ENOENT;
	default:
		WARN_ON_ONCE(1);
		/* Should never happen.  Treat it as not found. */
		return -ENOENT;
	}
}

static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
					     struct seq_file *m)
{
	void *value;
	int err;

	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		return;

	err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	if (!err) {
		btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
				  value, m);
		seq_puts(m, "\n");
	}

	kfree(value);
}

static void bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	if (st_map->progs)
		bpf_struct_ops_map_put_progs(st_map);
	bpf_map_area_free(st_map->progs);
	bpf_jit_free_exec(st_map->image);
	bpf_map_area_free(st_map->uvalue);
	bpf_map_area_free(st_map);
}

static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
		return -EINVAL;
	return 0;
}

static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
	const struct bpf_struct_ops *st_ops;
	size_t st_map_size;
	struct bpf_struct_ops_map *st_map;
	const struct btf_type *t, *vt;
	struct bpf_map *map;

	if (!bpf_capable())
		return ERR_PTR(-EPERM);

	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
	if (!st_ops)
		return ERR_PTR(-ENOTSUPP);

	vt = st_ops->value_type;
	if (attr->value_size != vt->size)
		return ERR_PTR(-EINVAL);

	t = st_ops->type;

	st_map_size = sizeof(*st_map) +
		/* kvalue stores the
		 * struct bpf_struct_ops_tcp_congestion_ops
		 */
		(vt->size - sizeof(struct bpf_struct_ops_value));

	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
	if (!st_map)
		return ERR_PTR(-ENOMEM);

	st_map->st_ops = st_ops;
	map = &st_map->map;

	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
	st_map->progs =
		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
				   NUMA_NO_NODE);
	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
	if (!st_map->uvalue || !st_map->progs || !st_map->image) {
		bpf_struct_ops_map_free(map);
		return ERR_PTR(-ENOMEM);
	}

	mutex_init(&st_map->lock);
	set_vm_flush_reset_perms(st_map->image);
	bpf_map_init_from_attr(map, attr);

	return map;
}

BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
	.map_alloc_check = bpf_struct_ops_map_alloc_check,
	.map_alloc = bpf_struct_ops_map_alloc,
	.map_free = bpf_struct_ops_map_free,
	.map_get_next_key = bpf_struct_ops_map_get_next_key,
	.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
	.map_delete_elem = bpf_struct_ops_map_delete_elem,
	.map_update_elem = bpf_struct_ops_map_update_elem,
	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
	.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
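
/* bpf_struct_ops_get()/bpf_struct_ops_put() below let the owning subsystem
 * take and drop extra references on a registered struct_ops (identified by
 * the kdata ptr it was registered with) so that the map, and in particular
 * its trampoline image, stays alive while the ops is still in use.
 */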
/* "const void *" because some subsystem is
 * passing a const (e.g. const struct tcp_congestion_ops *)
 */
bool bpf_struct_ops_get(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);

	return refcount_inc_not_zero(&kvalue->refcnt);
}

static void bpf_struct_ops_put_rcu(struct rcu_head *head)
{
	struct bpf_struct_ops_map *st_map;

	st_map = container_of(head, struct bpf_struct_ops_map, rcu);
	bpf_map_put(&st_map->map);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	if (refcount_dec_and_test(&kvalue->refcnt)) {
		struct bpf_struct_ops_map *st_map;

		st_map = container_of(kvalue, struct bpf_struct_ops_map,
				      kvalue);
		/* The struct_ops's function may switch to another struct_ops.
		 *
		 * For example, bpf_tcp_cc_x->init() may switch to
		 * another tcp_cc_y by calling
		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
		 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
		 * and its map->refcnt may reach 0 which then frees its
		 * trampoline image while tcp_cc_x is still running.
		 *
		 * Thus, an rcu grace period is needed here.
		 */
		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
	}
}