1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2019 Facebook */ 3 4 #include <linux/bpf.h> 5 #include <linux/bpf_verifier.h> 6 #include <linux/btf.h> 7 #include <linux/filter.h> 8 #include <linux/slab.h> 9 #include <linux/numa.h> 10 #include <linux/seq_file.h> 11 #include <linux/refcount.h> 12 #include <linux/mutex.h> 13 14 enum bpf_struct_ops_state { 15 BPF_STRUCT_OPS_STATE_INIT, 16 BPF_STRUCT_OPS_STATE_INUSE, 17 BPF_STRUCT_OPS_STATE_TOBEFREE, 18 }; 19 20 #define BPF_STRUCT_OPS_COMMON_VALUE \ 21 refcount_t refcnt; \ 22 enum bpf_struct_ops_state state 23 24 struct bpf_struct_ops_value { 25 BPF_STRUCT_OPS_COMMON_VALUE; 26 char data[] ____cacheline_aligned_in_smp; 27 }; 28 29 struct bpf_struct_ops_map { 30 struct bpf_map map; 31 struct rcu_head rcu; 32 const struct bpf_struct_ops *st_ops; 33 /* protect map_update */ 34 struct mutex lock; 35 /* progs has all the bpf_prog that is populated 36 * to the func ptr of the kernel's struct 37 * (in kvalue.data). 38 */ 39 struct bpf_prog **progs; 40 /* image is a page that has all the trampolines 41 * that stores the func args before calling the bpf_prog. 42 * A PAGE_SIZE "image" is enough to store all trampoline for 43 * "progs[]". 44 */ 45 void *image; 46 /* uvalue->data stores the kernel struct 47 * (e.g. tcp_congestion_ops) that is more useful 48 * to userspace than the kvalue. For example, 49 * the bpf_prog's id is stored instead of the kernel 50 * address of a func ptr. 51 */ 52 struct bpf_struct_ops_value *uvalue; 53 /* kvalue.data stores the actual kernel's struct 54 * (e.g. tcp_congestion_ops) that will be 55 * registered to the kernel subsystem. 56 */ 57 struct bpf_struct_ops_value kvalue; 58 }; 59 60 #define VALUE_PREFIX "bpf_struct_ops_" 61 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) 62 63 /* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is 64 * the map's value exposed to the userspace and its btf-type-id is 65 * stored at the map->btf_vmlinux_value_type_id. 66 * 67 */ 68 #define BPF_STRUCT_OPS_TYPE(_name) \ 69 extern struct bpf_struct_ops bpf_##_name; \ 70 \ 71 struct bpf_struct_ops_##_name { \ 72 BPF_STRUCT_OPS_COMMON_VALUE; \ 73 struct _name data ____cacheline_aligned_in_smp; \ 74 }; 75 #include "bpf_struct_ops_types.h" 76 #undef BPF_STRUCT_OPS_TYPE 77 78 enum { 79 #define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name, 80 #include "bpf_struct_ops_types.h" 81 #undef BPF_STRUCT_OPS_TYPE 82 __NR_BPF_STRUCT_OPS_TYPE, 83 }; 84 85 static struct bpf_struct_ops * const bpf_struct_ops[] = { 86 #define BPF_STRUCT_OPS_TYPE(_name) \ 87 [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name, 88 #include "bpf_struct_ops_types.h" 89 #undef BPF_STRUCT_OPS_TYPE 90 }; 91 92 const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { 93 }; 94 95 const struct bpf_prog_ops bpf_struct_ops_prog_ops = { 96 #ifdef CONFIG_NET 97 .test_run = bpf_struct_ops_test_run, 98 #endif 99 }; 100 101 static const struct btf_type *module_type; 102 103 void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) 104 { 105 s32 type_id, value_id, module_id; 106 const struct btf_member *member; 107 struct bpf_struct_ops *st_ops; 108 const struct btf_type *t; 109 char value_name[128]; 110 const char *mname; 111 u32 i, j; 112 113 /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ 114 #define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); 115 #include "bpf_struct_ops_types.h" 116 #undef BPF_STRUCT_OPS_TYPE 117 118 module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); 119 if (module_id < 0) { 120 pr_warn("Cannot find struct module in btf_vmlinux\n"); 121 return; 122 } 123 module_type = btf_type_by_id(btf, module_id); 124 125 for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { 126 st_ops = bpf_struct_ops[i]; 127 128 if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= 129 sizeof(value_name)) { 130 pr_warn("struct_ops name %s is too long\n", 131 st_ops->name); 132 continue; 133 } 134 sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); 135 136 value_id = btf_find_by_name_kind(btf, value_name, 137 BTF_KIND_STRUCT); 138 if (value_id < 0) { 139 pr_warn("Cannot find struct %s in btf_vmlinux\n", 140 value_name); 141 continue; 142 } 143 144 type_id = btf_find_by_name_kind(btf, st_ops->name, 145 BTF_KIND_STRUCT); 146 if (type_id < 0) { 147 pr_warn("Cannot find struct %s in btf_vmlinux\n", 148 st_ops->name); 149 continue; 150 } 151 t = btf_type_by_id(btf, type_id); 152 if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { 153 pr_warn("Cannot support #%u members in struct %s\n", 154 btf_type_vlen(t), st_ops->name); 155 continue; 156 } 157 158 for_each_member(j, t, member) { 159 const struct btf_type *func_proto; 160 161 mname = btf_name_by_offset(btf, member->name_off); 162 if (!*mname) { 163 pr_warn("anon member in struct %s is not supported\n", 164 st_ops->name); 165 break; 166 } 167 168 if (__btf_member_bitfield_size(t, member)) { 169 pr_warn("bit field member %s in struct %s is not supported\n", 170 mname, st_ops->name); 171 break; 172 } 173 174 func_proto = btf_type_resolve_func_ptr(btf, 175 member->type, 176 NULL); 177 if (func_proto && 178 btf_distill_func_proto(log, btf, 179 func_proto, mname, 180 &st_ops->func_models[j])) { 181 pr_warn("Error in parsing func ptr %s in struct %s\n", 182 mname, st_ops->name); 183 break; 184 } 185 } 186 187 if (j == btf_type_vlen(t)) { 188 if (st_ops->init(btf)) { 189 pr_warn("Error in init bpf_struct_ops %s\n", 190 st_ops->name); 191 } else { 192 st_ops->type_id = type_id; 193 st_ops->type = t; 194 st_ops->value_id = value_id; 195 st_ops->value_type = btf_type_by_id(btf, 196 value_id); 197 } 198 } 199 } 200 } 201 202 extern struct btf *btf_vmlinux; 203 204 static const struct bpf_struct_ops * 205 bpf_struct_ops_find_value(u32 value_id) 206 { 207 unsigned int i; 208 209 if (!value_id || !btf_vmlinux) 210 return NULL; 211 212 for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { 213 if (bpf_struct_ops[i]->value_id == value_id) 214 return bpf_struct_ops[i]; 215 } 216 217 return NULL; 218 } 219 220 const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) 221 { 222 unsigned int i; 223 224 if (!type_id || !btf_vmlinux) 225 return NULL; 226 227 for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { 228 if (bpf_struct_ops[i]->type_id == type_id) 229 return bpf_struct_ops[i]; 230 } 231 232 return NULL; 233 } 234 235 static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, 236 void *next_key) 237 { 238 if (key && *(u32 *)key == 0) 239 return -ENOENT; 240 241 *(u32 *)next_key = 0; 242 return 0; 243 } 244 245 int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, 246 void *value) 247 { 248 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 249 struct bpf_struct_ops_value *uvalue, *kvalue; 250 enum bpf_struct_ops_state state; 251 252 if (unlikely(*(u32 *)key != 0)) 253 return -ENOENT; 254 255 kvalue = &st_map->kvalue; 256 /* Pair with smp_store_release() during map_update */ 257 state = smp_load_acquire(&kvalue->state); 258 if (state == BPF_STRUCT_OPS_STATE_INIT) { 259 memset(value, 0, map->value_size); 260 return 0; 261 } 262 263 /* No lock is needed. state and refcnt do not need 264 * to be updated together under atomic context. 265 */ 266 uvalue = (struct bpf_struct_ops_value *)value; 267 memcpy(uvalue, st_map->uvalue, map->value_size); 268 uvalue->state = state; 269 refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); 270 271 return 0; 272 } 273 274 static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) 275 { 276 return ERR_PTR(-EINVAL); 277 } 278 279 static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) 280 { 281 const struct btf_type *t = st_map->st_ops->type; 282 u32 i; 283 284 for (i = 0; i < btf_type_vlen(t); i++) { 285 if (st_map->progs[i]) { 286 bpf_prog_put(st_map->progs[i]); 287 st_map->progs[i] = NULL; 288 } 289 } 290 } 291 292 static int check_zero_holes(const struct btf_type *t, void *data) 293 { 294 const struct btf_member *member; 295 u32 i, moff, msize, prev_mend = 0; 296 const struct btf_type *mtype; 297 298 for_each_member(i, t, member) { 299 moff = __btf_member_bit_offset(t, member) / 8; 300 if (moff > prev_mend && 301 memchr_inv(data + prev_mend, 0, moff - prev_mend)) 302 return -EINVAL; 303 304 mtype = btf_type_by_id(btf_vmlinux, member->type); 305 mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); 306 if (IS_ERR(mtype)) 307 return PTR_ERR(mtype); 308 prev_mend = moff + msize; 309 } 310 311 if (t->size > prev_mend && 312 memchr_inv(data + prev_mend, 0, t->size - prev_mend)) 313 return -EINVAL; 314 315 return 0; 316 } 317 318 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, 319 struct bpf_prog *prog, 320 const struct btf_func_model *model, 321 void *image, void *image_end) 322 { 323 u32 flags; 324 325 tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; 326 tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; 327 flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; 328 return arch_prepare_bpf_trampoline(NULL, image, image_end, 329 model, flags, tprogs, NULL); 330 } 331 332 static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, 333 void *value, u64 flags) 334 { 335 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 336 const struct bpf_struct_ops *st_ops = st_map->st_ops; 337 struct bpf_struct_ops_value *uvalue, *kvalue; 338 const struct btf_member *member; 339 const struct btf_type *t = st_ops->type; 340 struct bpf_tramp_progs *tprogs = NULL; 341 void *udata, *kdata; 342 int prog_fd, err = 0; 343 void *image, *image_end; 344 u32 i; 345 346 if (flags) 347 return -EINVAL; 348 349 if (*(u32 *)key != 0) 350 return -E2BIG; 351 352 err = check_zero_holes(st_ops->value_type, value); 353 if (err) 354 return err; 355 356 uvalue = (struct bpf_struct_ops_value *)value; 357 err = check_zero_holes(t, uvalue->data); 358 if (err) 359 return err; 360 361 if (uvalue->state || refcount_read(&uvalue->refcnt)) 362 return -EINVAL; 363 364 tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); 365 if (!tprogs) 366 return -ENOMEM; 367 368 uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; 369 kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; 370 371 mutex_lock(&st_map->lock); 372 373 if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { 374 err = -EBUSY; 375 goto unlock; 376 } 377 378 memcpy(uvalue, value, map->value_size); 379 380 udata = &uvalue->data; 381 kdata = &kvalue->data; 382 image = st_map->image; 383 image_end = st_map->image + PAGE_SIZE; 384 385 for_each_member(i, t, member) { 386 const struct btf_type *mtype, *ptype; 387 struct bpf_prog *prog; 388 u32 moff; 389 390 moff = __btf_member_bit_offset(t, member) / 8; 391 ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); 392 if (ptype == module_type) { 393 if (*(void **)(udata + moff)) 394 goto reset_unlock; 395 *(void **)(kdata + moff) = BPF_MODULE_OWNER; 396 continue; 397 } 398 399 err = st_ops->init_member(t, member, kdata, udata); 400 if (err < 0) 401 goto reset_unlock; 402 403 /* The ->init_member() has handled this member */ 404 if (err > 0) 405 continue; 406 407 /* If st_ops->init_member does not handle it, 408 * we will only handle func ptrs and zero-ed members 409 * here. Reject everything else. 410 */ 411 412 /* All non func ptr member must be 0 */ 413 if (!ptype || !btf_type_is_func_proto(ptype)) { 414 u32 msize; 415 416 mtype = btf_type_by_id(btf_vmlinux, member->type); 417 mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); 418 if (IS_ERR(mtype)) { 419 err = PTR_ERR(mtype); 420 goto reset_unlock; 421 } 422 423 if (memchr_inv(udata + moff, 0, msize)) { 424 err = -EINVAL; 425 goto reset_unlock; 426 } 427 428 continue; 429 } 430 431 prog_fd = (int)(*(unsigned long *)(udata + moff)); 432 /* Similar check as the attr->attach_prog_fd */ 433 if (!prog_fd) 434 continue; 435 436 prog = bpf_prog_get(prog_fd); 437 if (IS_ERR(prog)) { 438 err = PTR_ERR(prog); 439 goto reset_unlock; 440 } 441 st_map->progs[i] = prog; 442 443 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || 444 prog->aux->attach_btf_id != st_ops->type_id || 445 prog->expected_attach_type != i) { 446 err = -EINVAL; 447 goto reset_unlock; 448 } 449 450 err = bpf_struct_ops_prepare_trampoline(tprogs, prog, 451 &st_ops->func_models[i], 452 image, image_end); 453 if (err < 0) 454 goto reset_unlock; 455 456 *(void **)(kdata + moff) = image; 457 image += err; 458 459 /* put prog_id to udata */ 460 *(unsigned long *)(udata + moff) = prog->aux->id; 461 } 462 463 refcount_set(&kvalue->refcnt, 1); 464 bpf_map_inc(map); 465 466 set_memory_ro((long)st_map->image, 1); 467 set_memory_x((long)st_map->image, 1); 468 err = st_ops->reg(kdata); 469 if (likely(!err)) { 470 /* Pair with smp_load_acquire() during lookup_elem(). 471 * It ensures the above udata updates (e.g. prog->aux->id) 472 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. 473 */ 474 smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); 475 goto unlock; 476 } 477 478 /* Error during st_ops->reg(). It is very unlikely since 479 * the above init_member() should have caught it earlier 480 * before reg(). The only possibility is if there was a race 481 * in registering the struct_ops (under the same name) to 482 * a sub-system through different struct_ops's maps. 483 */ 484 set_memory_nx((long)st_map->image, 1); 485 set_memory_rw((long)st_map->image, 1); 486 bpf_map_put(map); 487 488 reset_unlock: 489 bpf_struct_ops_map_put_progs(st_map); 490 memset(uvalue, 0, map->value_size); 491 memset(kvalue, 0, map->value_size); 492 unlock: 493 kfree(tprogs); 494 mutex_unlock(&st_map->lock); 495 return err; 496 } 497 498 static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) 499 { 500 enum bpf_struct_ops_state prev_state; 501 struct bpf_struct_ops_map *st_map; 502 503 st_map = (struct bpf_struct_ops_map *)map; 504 prev_state = cmpxchg(&st_map->kvalue.state, 505 BPF_STRUCT_OPS_STATE_INUSE, 506 BPF_STRUCT_OPS_STATE_TOBEFREE); 507 switch (prev_state) { 508 case BPF_STRUCT_OPS_STATE_INUSE: 509 st_map->st_ops->unreg(&st_map->kvalue.data); 510 if (refcount_dec_and_test(&st_map->kvalue.refcnt)) 511 bpf_map_put(map); 512 return 0; 513 case BPF_STRUCT_OPS_STATE_TOBEFREE: 514 return -EINPROGRESS; 515 case BPF_STRUCT_OPS_STATE_INIT: 516 return -ENOENT; 517 default: 518 WARN_ON_ONCE(1); 519 /* Should never happen. Treat it as not found. */ 520 return -ENOENT; 521 } 522 } 523 524 static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, 525 struct seq_file *m) 526 { 527 void *value; 528 int err; 529 530 value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); 531 if (!value) 532 return; 533 534 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); 535 if (!err) { 536 btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, 537 value, m); 538 seq_puts(m, "\n"); 539 } 540 541 kfree(value); 542 } 543 544 static void bpf_struct_ops_map_free(struct bpf_map *map) 545 { 546 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 547 548 if (st_map->progs) 549 bpf_struct_ops_map_put_progs(st_map); 550 bpf_map_area_free(st_map->progs); 551 bpf_jit_free_exec(st_map->image); 552 bpf_map_area_free(st_map->uvalue); 553 bpf_map_area_free(st_map); 554 } 555 556 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) 557 { 558 if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || 559 attr->map_flags || !attr->btf_vmlinux_value_type_id) 560 return -EINVAL; 561 return 0; 562 } 563 564 static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) 565 { 566 const struct bpf_struct_ops *st_ops; 567 size_t st_map_size; 568 struct bpf_struct_ops_map *st_map; 569 const struct btf_type *t, *vt; 570 struct bpf_map *map; 571 572 if (!bpf_capable()) 573 return ERR_PTR(-EPERM); 574 575 st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); 576 if (!st_ops) 577 return ERR_PTR(-ENOTSUPP); 578 579 vt = st_ops->value_type; 580 if (attr->value_size != vt->size) 581 return ERR_PTR(-EINVAL); 582 583 t = st_ops->type; 584 585 st_map_size = sizeof(*st_map) + 586 /* kvalue stores the 587 * struct bpf_struct_ops_tcp_congestions_ops 588 */ 589 (vt->size - sizeof(struct bpf_struct_ops_value)); 590 591 st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); 592 if (!st_map) 593 return ERR_PTR(-ENOMEM); 594 595 st_map->st_ops = st_ops; 596 map = &st_map->map; 597 598 st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); 599 st_map->progs = 600 bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), 601 NUMA_NO_NODE); 602 st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); 603 if (!st_map->uvalue || !st_map->progs || !st_map->image) { 604 bpf_struct_ops_map_free(map); 605 return ERR_PTR(-ENOMEM); 606 } 607 608 mutex_init(&st_map->lock); 609 set_vm_flush_reset_perms(st_map->image); 610 bpf_map_init_from_attr(map, attr); 611 612 return map; 613 } 614 615 static int bpf_struct_ops_map_btf_id; 616 const struct bpf_map_ops bpf_struct_ops_map_ops = { 617 .map_alloc_check = bpf_struct_ops_map_alloc_check, 618 .map_alloc = bpf_struct_ops_map_alloc, 619 .map_free = bpf_struct_ops_map_free, 620 .map_get_next_key = bpf_struct_ops_map_get_next_key, 621 .map_lookup_elem = bpf_struct_ops_map_lookup_elem, 622 .map_delete_elem = bpf_struct_ops_map_delete_elem, 623 .map_update_elem = bpf_struct_ops_map_update_elem, 624 .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, 625 .map_btf_name = "bpf_struct_ops_map", 626 .map_btf_id = &bpf_struct_ops_map_btf_id, 627 }; 628 629 /* "const void *" because some subsystem is 630 * passing a const (e.g. const struct tcp_congestion_ops *) 631 */ 632 bool bpf_struct_ops_get(const void *kdata) 633 { 634 struct bpf_struct_ops_value *kvalue; 635 636 kvalue = container_of(kdata, struct bpf_struct_ops_value, data); 637 638 return refcount_inc_not_zero(&kvalue->refcnt); 639 } 640 641 static void bpf_struct_ops_put_rcu(struct rcu_head *head) 642 { 643 struct bpf_struct_ops_map *st_map; 644 645 st_map = container_of(head, struct bpf_struct_ops_map, rcu); 646 bpf_map_put(&st_map->map); 647 } 648 649 void bpf_struct_ops_put(const void *kdata) 650 { 651 struct bpf_struct_ops_value *kvalue; 652 653 kvalue = container_of(kdata, struct bpf_struct_ops_value, data); 654 if (refcount_dec_and_test(&kvalue->refcnt)) { 655 struct bpf_struct_ops_map *st_map; 656 657 st_map = container_of(kvalue, struct bpf_struct_ops_map, 658 kvalue); 659 /* The struct_ops's function may switch to another struct_ops. 660 * 661 * For example, bpf_tcp_cc_x->init() may switch to 662 * another tcp_cc_y by calling 663 * setsockopt(TCP_CONGESTION, "tcp_cc_y"). 664 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called 665 * and its map->refcnt may reach 0 which then free its 666 * trampoline image while tcp_cc_x is still running. 667 * 668 * Thus, a rcu grace period is needed here. 669 */ 670 call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); 671 } 672 } 673