1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2019 Facebook */ 3 4 #include <linux/bpf.h> 5 #include <linux/bpf_verifier.h> 6 #include <linux/btf.h> 7 #include <linux/filter.h> 8 #include <linux/slab.h> 9 #include <linux/numa.h> 10 #include <linux/seq_file.h> 11 #include <linux/refcount.h> 12 #include <linux/mutex.h> 13 #include <linux/btf_ids.h> 14 #include <linux/rcupdate_wait.h> 15 16 enum bpf_struct_ops_state { 17 BPF_STRUCT_OPS_STATE_INIT, 18 BPF_STRUCT_OPS_STATE_INUSE, 19 BPF_STRUCT_OPS_STATE_TOBEFREE, 20 BPF_STRUCT_OPS_STATE_READY, 21 }; 22 23 #define BPF_STRUCT_OPS_COMMON_VALUE \ 24 refcount_t refcnt; \ 25 enum bpf_struct_ops_state state 26 27 struct bpf_struct_ops_value { 28 BPF_STRUCT_OPS_COMMON_VALUE; 29 char data[] ____cacheline_aligned_in_smp; 30 }; 31 32 struct bpf_struct_ops_map { 33 struct bpf_map map; 34 struct rcu_head rcu; 35 const struct bpf_struct_ops *st_ops; 36 /* protect map_update */ 37 struct mutex lock; 38 /* link has all the bpf_links that is populated 39 * to the func ptr of the kernel's struct 40 * (in kvalue.data). 41 */ 42 struct bpf_link **links; 43 /* image is a page that has all the trampolines 44 * that stores the func args before calling the bpf_prog. 45 * A PAGE_SIZE "image" is enough to store all trampoline for 46 * "links[]". 47 */ 48 void *image; 49 /* uvalue->data stores the kernel struct 50 * (e.g. tcp_congestion_ops) that is more useful 51 * to userspace than the kvalue. For example, 52 * the bpf_prog's id is stored instead of the kernel 53 * address of a func ptr. 54 */ 55 struct bpf_struct_ops_value *uvalue; 56 /* kvalue.data stores the actual kernel's struct 57 * (e.g. tcp_congestion_ops) that will be 58 * registered to the kernel subsystem. 59 */ 60 struct bpf_struct_ops_value kvalue; 61 }; 62 63 struct bpf_struct_ops_link { 64 struct bpf_link link; 65 struct bpf_map __rcu *map; 66 }; 67 68 static DEFINE_MUTEX(update_mutex); 69 70 #define VALUE_PREFIX "bpf_struct_ops_" 71 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) 72 73 /* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is 74 * the map's value exposed to the userspace and its btf-type-id is 75 * stored at the map->btf_vmlinux_value_type_id. 76 * 77 */ 78 #define BPF_STRUCT_OPS_TYPE(_name) \ 79 extern struct bpf_struct_ops bpf_##_name; \ 80 \ 81 struct bpf_struct_ops_##_name { \ 82 BPF_STRUCT_OPS_COMMON_VALUE; \ 83 struct _name data ____cacheline_aligned_in_smp; \ 84 }; 85 #include "bpf_struct_ops_types.h" 86 #undef BPF_STRUCT_OPS_TYPE 87 88 enum { 89 #define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name, 90 #include "bpf_struct_ops_types.h" 91 #undef BPF_STRUCT_OPS_TYPE 92 __NR_BPF_STRUCT_OPS_TYPE, 93 }; 94 95 static struct bpf_struct_ops * const bpf_struct_ops[] = { 96 #define BPF_STRUCT_OPS_TYPE(_name) \ 97 [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name, 98 #include "bpf_struct_ops_types.h" 99 #undef BPF_STRUCT_OPS_TYPE 100 }; 101 102 const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { 103 }; 104 105 const struct bpf_prog_ops bpf_struct_ops_prog_ops = { 106 #ifdef CONFIG_NET 107 .test_run = bpf_struct_ops_test_run, 108 #endif 109 }; 110 111 static const struct btf_type *module_type; 112 113 void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) 114 { 115 s32 type_id, value_id, module_id; 116 const struct btf_member *member; 117 struct bpf_struct_ops *st_ops; 118 const struct btf_type *t; 119 char value_name[128]; 120 const char *mname; 121 u32 i, j; 122 123 /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ 124 #define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); 125 #include "bpf_struct_ops_types.h" 126 #undef BPF_STRUCT_OPS_TYPE 127 128 module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); 129 if (module_id < 0) { 130 pr_warn("Cannot find struct module in btf_vmlinux\n"); 131 return; 132 } 133 module_type = btf_type_by_id(btf, module_id); 134 135 for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { 136 st_ops = bpf_struct_ops[i]; 137 138 if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= 139 sizeof(value_name)) { 140 pr_warn("struct_ops name %s is too long\n", 141 st_ops->name); 142 continue; 143 } 144 sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); 145 146 value_id = btf_find_by_name_kind(btf, value_name, 147 BTF_KIND_STRUCT); 148 if (value_id < 0) { 149 pr_warn("Cannot find struct %s in btf_vmlinux\n", 150 value_name); 151 continue; 152 } 153 154 type_id = btf_find_by_name_kind(btf, st_ops->name, 155 BTF_KIND_STRUCT); 156 if (type_id < 0) { 157 pr_warn("Cannot find struct %s in btf_vmlinux\n", 158 st_ops->name); 159 continue; 160 } 161 t = btf_type_by_id(btf, type_id); 162 if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { 163 pr_warn("Cannot support #%u members in struct %s\n", 164 btf_type_vlen(t), st_ops->name); 165 continue; 166 } 167 168 for_each_member(j, t, member) { 169 const struct btf_type *func_proto; 170 171 mname = btf_name_by_offset(btf, member->name_off); 172 if (!*mname) { 173 pr_warn("anon member in struct %s is not supported\n", 174 st_ops->name); 175 break; 176 } 177 178 if (__btf_member_bitfield_size(t, member)) { 179 pr_warn("bit field member %s in struct %s is not supported\n", 180 mname, st_ops->name); 181 break; 182 } 183 184 func_proto = btf_type_resolve_func_ptr(btf, 185 member->type, 186 NULL); 187 if (func_proto && 188 btf_distill_func_proto(log, btf, 189 func_proto, mname, 190 &st_ops->func_models[j])) { 191 pr_warn("Error in parsing func ptr %s in struct %s\n", 192 mname, st_ops->name); 193 break; 194 } 195 } 196 197 if (j == btf_type_vlen(t)) { 198 if (st_ops->init(btf)) { 199 pr_warn("Error in init bpf_struct_ops %s\n", 200 st_ops->name); 201 } else { 202 st_ops->type_id = type_id; 203 st_ops->type = t; 204 st_ops->value_id = value_id; 205 st_ops->value_type = btf_type_by_id(btf, 206 value_id); 207 } 208 } 209 } 210 } 211 212 extern struct btf *btf_vmlinux; 213 214 static const struct bpf_struct_ops * 215 bpf_struct_ops_find_value(u32 value_id) 216 { 217 unsigned int i; 218 219 if (!value_id || !btf_vmlinux) 220 return NULL; 221 222 for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { 223 if (bpf_struct_ops[i]->value_id == value_id) 224 return bpf_struct_ops[i]; 225 } 226 227 return NULL; 228 } 229 230 const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) 231 { 232 unsigned int i; 233 234 if (!type_id || !btf_vmlinux) 235 return NULL; 236 237 for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { 238 if (bpf_struct_ops[i]->type_id == type_id) 239 return bpf_struct_ops[i]; 240 } 241 242 return NULL; 243 } 244 245 static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, 246 void *next_key) 247 { 248 if (key && *(u32 *)key == 0) 249 return -ENOENT; 250 251 *(u32 *)next_key = 0; 252 return 0; 253 } 254 255 int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, 256 void *value) 257 { 258 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 259 struct bpf_struct_ops_value *uvalue, *kvalue; 260 enum bpf_struct_ops_state state; 261 s64 refcnt; 262 263 if (unlikely(*(u32 *)key != 0)) 264 return -ENOENT; 265 266 kvalue = &st_map->kvalue; 267 /* Pair with smp_store_release() during map_update */ 268 state = smp_load_acquire(&kvalue->state); 269 if (state == BPF_STRUCT_OPS_STATE_INIT) { 270 memset(value, 0, map->value_size); 271 return 0; 272 } 273 274 /* No lock is needed. state and refcnt do not need 275 * to be updated together under atomic context. 276 */ 277 uvalue = value; 278 memcpy(uvalue, st_map->uvalue, map->value_size); 279 uvalue->state = state; 280 281 /* This value offers the user space a general estimate of how 282 * many sockets are still utilizing this struct_ops for TCP 283 * congestion control. The number might not be exact, but it 284 * should sufficiently meet our present goals. 285 */ 286 refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt); 287 refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0)); 288 289 return 0; 290 } 291 292 static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) 293 { 294 return ERR_PTR(-EINVAL); 295 } 296 297 static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) 298 { 299 const struct btf_type *t = st_map->st_ops->type; 300 u32 i; 301 302 for (i = 0; i < btf_type_vlen(t); i++) { 303 if (st_map->links[i]) { 304 bpf_link_put(st_map->links[i]); 305 st_map->links[i] = NULL; 306 } 307 } 308 } 309 310 static int check_zero_holes(const struct btf_type *t, void *data) 311 { 312 const struct btf_member *member; 313 u32 i, moff, msize, prev_mend = 0; 314 const struct btf_type *mtype; 315 316 for_each_member(i, t, member) { 317 moff = __btf_member_bit_offset(t, member) / 8; 318 if (moff > prev_mend && 319 memchr_inv(data + prev_mend, 0, moff - prev_mend)) 320 return -EINVAL; 321 322 mtype = btf_type_by_id(btf_vmlinux, member->type); 323 mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); 324 if (IS_ERR(mtype)) 325 return PTR_ERR(mtype); 326 prev_mend = moff + msize; 327 } 328 329 if (t->size > prev_mend && 330 memchr_inv(data + prev_mend, 0, t->size - prev_mend)) 331 return -EINVAL; 332 333 return 0; 334 } 335 336 static void bpf_struct_ops_link_release(struct bpf_link *link) 337 { 338 } 339 340 static void bpf_struct_ops_link_dealloc(struct bpf_link *link) 341 { 342 struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); 343 344 kfree(tlink); 345 } 346 347 const struct bpf_link_ops bpf_struct_ops_link_lops = { 348 .release = bpf_struct_ops_link_release, 349 .dealloc = bpf_struct_ops_link_dealloc, 350 }; 351 352 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, 353 struct bpf_tramp_link *link, 354 const struct btf_func_model *model, 355 void *image, void *image_end) 356 { 357 u32 flags; 358 359 tlinks[BPF_TRAMP_FENTRY].links[0] = link; 360 tlinks[BPF_TRAMP_FENTRY].nr_links = 1; 361 /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, 362 * and it must be used alone. 363 */ 364 flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; 365 return arch_prepare_bpf_trampoline(NULL, image, image_end, 366 model, flags, tlinks, NULL); 367 } 368 369 static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, 370 void *value, u64 flags) 371 { 372 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 373 const struct bpf_struct_ops *st_ops = st_map->st_ops; 374 struct bpf_struct_ops_value *uvalue, *kvalue; 375 const struct btf_member *member; 376 const struct btf_type *t = st_ops->type; 377 struct bpf_tramp_links *tlinks = NULL; 378 void *udata, *kdata; 379 int prog_fd, err = 0; 380 void *image, *image_end; 381 u32 i; 382 383 if (flags) 384 return -EINVAL; 385 386 if (*(u32 *)key != 0) 387 return -E2BIG; 388 389 err = check_zero_holes(st_ops->value_type, value); 390 if (err) 391 return err; 392 393 uvalue = value; 394 err = check_zero_holes(t, uvalue->data); 395 if (err) 396 return err; 397 398 if (uvalue->state || refcount_read(&uvalue->refcnt)) 399 return -EINVAL; 400 401 tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); 402 if (!tlinks) 403 return -ENOMEM; 404 405 uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; 406 kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; 407 408 mutex_lock(&st_map->lock); 409 410 if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { 411 err = -EBUSY; 412 goto unlock; 413 } 414 415 memcpy(uvalue, value, map->value_size); 416 417 udata = &uvalue->data; 418 kdata = &kvalue->data; 419 image = st_map->image; 420 image_end = st_map->image + PAGE_SIZE; 421 422 for_each_member(i, t, member) { 423 const struct btf_type *mtype, *ptype; 424 struct bpf_prog *prog; 425 struct bpf_tramp_link *link; 426 u32 moff; 427 428 moff = __btf_member_bit_offset(t, member) / 8; 429 ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); 430 if (ptype == module_type) { 431 if (*(void **)(udata + moff)) 432 goto reset_unlock; 433 *(void **)(kdata + moff) = BPF_MODULE_OWNER; 434 continue; 435 } 436 437 err = st_ops->init_member(t, member, kdata, udata); 438 if (err < 0) 439 goto reset_unlock; 440 441 /* The ->init_member() has handled this member */ 442 if (err > 0) 443 continue; 444 445 /* If st_ops->init_member does not handle it, 446 * we will only handle func ptrs and zero-ed members 447 * here. Reject everything else. 448 */ 449 450 /* All non func ptr member must be 0 */ 451 if (!ptype || !btf_type_is_func_proto(ptype)) { 452 u32 msize; 453 454 mtype = btf_type_by_id(btf_vmlinux, member->type); 455 mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); 456 if (IS_ERR(mtype)) { 457 err = PTR_ERR(mtype); 458 goto reset_unlock; 459 } 460 461 if (memchr_inv(udata + moff, 0, msize)) { 462 err = -EINVAL; 463 goto reset_unlock; 464 } 465 466 continue; 467 } 468 469 prog_fd = (int)(*(unsigned long *)(udata + moff)); 470 /* Similar check as the attr->attach_prog_fd */ 471 if (!prog_fd) 472 continue; 473 474 prog = bpf_prog_get(prog_fd); 475 if (IS_ERR(prog)) { 476 err = PTR_ERR(prog); 477 goto reset_unlock; 478 } 479 480 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || 481 prog->aux->attach_btf_id != st_ops->type_id || 482 prog->expected_attach_type != i) { 483 bpf_prog_put(prog); 484 err = -EINVAL; 485 goto reset_unlock; 486 } 487 488 link = kzalloc(sizeof(*link), GFP_USER); 489 if (!link) { 490 bpf_prog_put(prog); 491 err = -ENOMEM; 492 goto reset_unlock; 493 } 494 bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, 495 &bpf_struct_ops_link_lops, prog); 496 st_map->links[i] = &link->link; 497 498 err = bpf_struct_ops_prepare_trampoline(tlinks, link, 499 &st_ops->func_models[i], 500 image, image_end); 501 if (err < 0) 502 goto reset_unlock; 503 504 *(void **)(kdata + moff) = image; 505 image += err; 506 507 /* put prog_id to udata */ 508 *(unsigned long *)(udata + moff) = prog->aux->id; 509 } 510 511 if (st_map->map.map_flags & BPF_F_LINK) { 512 err = st_ops->validate(kdata); 513 if (err) 514 goto reset_unlock; 515 set_memory_rox((long)st_map->image, 1); 516 /* Let bpf_link handle registration & unregistration. 517 * 518 * Pair with smp_load_acquire() during lookup_elem(). 519 */ 520 smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY); 521 goto unlock; 522 } 523 524 set_memory_rox((long)st_map->image, 1); 525 err = st_ops->reg(kdata); 526 if (likely(!err)) { 527 /* This refcnt increment on the map here after 528 * 'st_ops->reg()' is secure since the state of the 529 * map must be set to INIT at this moment, and thus 530 * bpf_struct_ops_map_delete_elem() can't unregister 531 * or transition it to TOBEFREE concurrently. 532 */ 533 bpf_map_inc(map); 534 /* Pair with smp_load_acquire() during lookup_elem(). 535 * It ensures the above udata updates (e.g. prog->aux->id) 536 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. 537 */ 538 smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); 539 goto unlock; 540 } 541 542 /* Error during st_ops->reg(). Can happen if this struct_ops needs to be 543 * verified as a whole, after all init_member() calls. Can also happen if 544 * there was a race in registering the struct_ops (under the same name) to 545 * a sub-system through different struct_ops's maps. 546 */ 547 set_memory_nx((long)st_map->image, 1); 548 set_memory_rw((long)st_map->image, 1); 549 550 reset_unlock: 551 bpf_struct_ops_map_put_progs(st_map); 552 memset(uvalue, 0, map->value_size); 553 memset(kvalue, 0, map->value_size); 554 unlock: 555 kfree(tlinks); 556 mutex_unlock(&st_map->lock); 557 return err; 558 } 559 560 static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) 561 { 562 enum bpf_struct_ops_state prev_state; 563 struct bpf_struct_ops_map *st_map; 564 565 st_map = (struct bpf_struct_ops_map *)map; 566 if (st_map->map.map_flags & BPF_F_LINK) 567 return -EOPNOTSUPP; 568 569 prev_state = cmpxchg(&st_map->kvalue.state, 570 BPF_STRUCT_OPS_STATE_INUSE, 571 BPF_STRUCT_OPS_STATE_TOBEFREE); 572 switch (prev_state) { 573 case BPF_STRUCT_OPS_STATE_INUSE: 574 st_map->st_ops->unreg(&st_map->kvalue.data); 575 bpf_map_put(map); 576 return 0; 577 case BPF_STRUCT_OPS_STATE_TOBEFREE: 578 return -EINPROGRESS; 579 case BPF_STRUCT_OPS_STATE_INIT: 580 return -ENOENT; 581 default: 582 WARN_ON_ONCE(1); 583 /* Should never happen. Treat it as not found. */ 584 return -ENOENT; 585 } 586 } 587 588 static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, 589 struct seq_file *m) 590 { 591 void *value; 592 int err; 593 594 value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); 595 if (!value) 596 return; 597 598 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); 599 if (!err) { 600 btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, 601 value, m); 602 seq_puts(m, "\n"); 603 } 604 605 kfree(value); 606 } 607 608 static void __bpf_struct_ops_map_free(struct bpf_map *map) 609 { 610 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 611 612 if (st_map->links) 613 bpf_struct_ops_map_put_progs(st_map); 614 bpf_map_area_free(st_map->links); 615 bpf_jit_free_exec(st_map->image); 616 bpf_map_area_free(st_map->uvalue); 617 bpf_map_area_free(st_map); 618 } 619 620 static void bpf_struct_ops_map_free(struct bpf_map *map) 621 { 622 /* The struct_ops's function may switch to another struct_ops. 623 * 624 * For example, bpf_tcp_cc_x->init() may switch to 625 * another tcp_cc_y by calling 626 * setsockopt(TCP_CONGESTION, "tcp_cc_y"). 627 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called 628 * and its refcount may reach 0 which then free its 629 * trampoline image while tcp_cc_x is still running. 630 * 631 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog 632 * to finish. bpf-tcp-cc prog is non sleepable. 633 * A rcu_tasks gp is to wait for the last few insn 634 * in the tramopline image to finish before releasing 635 * the trampoline image. 636 */ 637 synchronize_rcu_mult(call_rcu, call_rcu_tasks); 638 639 __bpf_struct_ops_map_free(map); 640 } 641 642 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) 643 { 644 if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || 645 (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) 646 return -EINVAL; 647 return 0; 648 } 649 650 static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) 651 { 652 const struct bpf_struct_ops *st_ops; 653 size_t st_map_size; 654 struct bpf_struct_ops_map *st_map; 655 const struct btf_type *t, *vt; 656 struct bpf_map *map; 657 658 if (!bpf_capable()) 659 return ERR_PTR(-EPERM); 660 661 st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); 662 if (!st_ops) 663 return ERR_PTR(-ENOTSUPP); 664 665 vt = st_ops->value_type; 666 if (attr->value_size != vt->size) 667 return ERR_PTR(-EINVAL); 668 669 if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update)) 670 return ERR_PTR(-EOPNOTSUPP); 671 672 t = st_ops->type; 673 674 st_map_size = sizeof(*st_map) + 675 /* kvalue stores the 676 * struct bpf_struct_ops_tcp_congestions_ops 677 */ 678 (vt->size - sizeof(struct bpf_struct_ops_value)); 679 680 st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); 681 if (!st_map) 682 return ERR_PTR(-ENOMEM); 683 684 st_map->st_ops = st_ops; 685 map = &st_map->map; 686 687 st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); 688 st_map->links = 689 bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), 690 NUMA_NO_NODE); 691 st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); 692 if (!st_map->uvalue || !st_map->links || !st_map->image) { 693 __bpf_struct_ops_map_free(map); 694 return ERR_PTR(-ENOMEM); 695 } 696 697 mutex_init(&st_map->lock); 698 set_vm_flush_reset_perms(st_map->image); 699 bpf_map_init_from_attr(map, attr); 700 701 return map; 702 } 703 704 static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) 705 { 706 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 707 const struct bpf_struct_ops *st_ops = st_map->st_ops; 708 const struct btf_type *vt = st_ops->value_type; 709 u64 usage; 710 711 usage = sizeof(*st_map) + 712 vt->size - sizeof(struct bpf_struct_ops_value); 713 usage += vt->size; 714 usage += btf_type_vlen(vt) * sizeof(struct bpf_links *); 715 usage += PAGE_SIZE; 716 return usage; 717 } 718 719 BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) 720 const struct bpf_map_ops bpf_struct_ops_map_ops = { 721 .map_alloc_check = bpf_struct_ops_map_alloc_check, 722 .map_alloc = bpf_struct_ops_map_alloc, 723 .map_free = bpf_struct_ops_map_free, 724 .map_get_next_key = bpf_struct_ops_map_get_next_key, 725 .map_lookup_elem = bpf_struct_ops_map_lookup_elem, 726 .map_delete_elem = bpf_struct_ops_map_delete_elem, 727 .map_update_elem = bpf_struct_ops_map_update_elem, 728 .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, 729 .map_mem_usage = bpf_struct_ops_map_mem_usage, 730 .map_btf_id = &bpf_struct_ops_map_btf_ids[0], 731 }; 732 733 /* "const void *" because some subsystem is 734 * passing a const (e.g. const struct tcp_congestion_ops *) 735 */ 736 bool bpf_struct_ops_get(const void *kdata) 737 { 738 struct bpf_struct_ops_value *kvalue; 739 struct bpf_struct_ops_map *st_map; 740 struct bpf_map *map; 741 742 kvalue = container_of(kdata, struct bpf_struct_ops_value, data); 743 st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); 744 745 map = __bpf_map_inc_not_zero(&st_map->map, false); 746 return !IS_ERR(map); 747 } 748 749 void bpf_struct_ops_put(const void *kdata) 750 { 751 struct bpf_struct_ops_value *kvalue; 752 struct bpf_struct_ops_map *st_map; 753 754 kvalue = container_of(kdata, struct bpf_struct_ops_value, data); 755 st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); 756 757 bpf_map_put(&st_map->map); 758 } 759 760 static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) 761 { 762 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 763 764 return map->map_type == BPF_MAP_TYPE_STRUCT_OPS && 765 map->map_flags & BPF_F_LINK && 766 /* Pair with smp_store_release() during map_update */ 767 smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY; 768 } 769 770 static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) 771 { 772 struct bpf_struct_ops_link *st_link; 773 struct bpf_struct_ops_map *st_map; 774 775 st_link = container_of(link, struct bpf_struct_ops_link, link); 776 st_map = (struct bpf_struct_ops_map *) 777 rcu_dereference_protected(st_link->map, true); 778 if (st_map) { 779 /* st_link->map can be NULL if 780 * bpf_struct_ops_link_create() fails to register. 781 */ 782 st_map->st_ops->unreg(&st_map->kvalue.data); 783 bpf_map_put(&st_map->map); 784 } 785 kfree(st_link); 786 } 787 788 static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, 789 struct seq_file *seq) 790 { 791 struct bpf_struct_ops_link *st_link; 792 struct bpf_map *map; 793 794 st_link = container_of(link, struct bpf_struct_ops_link, link); 795 rcu_read_lock(); 796 map = rcu_dereference(st_link->map); 797 seq_printf(seq, "map_id:\t%d\n", map->id); 798 rcu_read_unlock(); 799 } 800 801 static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, 802 struct bpf_link_info *info) 803 { 804 struct bpf_struct_ops_link *st_link; 805 struct bpf_map *map; 806 807 st_link = container_of(link, struct bpf_struct_ops_link, link); 808 rcu_read_lock(); 809 map = rcu_dereference(st_link->map); 810 info->struct_ops.map_id = map->id; 811 rcu_read_unlock(); 812 return 0; 813 } 814 815 static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map, 816 struct bpf_map *expected_old_map) 817 { 818 struct bpf_struct_ops_map *st_map, *old_st_map; 819 struct bpf_map *old_map; 820 struct bpf_struct_ops_link *st_link; 821 int err = 0; 822 823 st_link = container_of(link, struct bpf_struct_ops_link, link); 824 st_map = container_of(new_map, struct bpf_struct_ops_map, map); 825 826 if (!bpf_struct_ops_valid_to_reg(new_map)) 827 return -EINVAL; 828 829 mutex_lock(&update_mutex); 830 831 old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); 832 if (expected_old_map && old_map != expected_old_map) { 833 err = -EPERM; 834 goto err_out; 835 } 836 837 old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); 838 /* The new and old struct_ops must be the same type. */ 839 if (st_map->st_ops != old_st_map->st_ops) { 840 err = -EINVAL; 841 goto err_out; 842 } 843 844 err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); 845 if (err) 846 goto err_out; 847 848 bpf_map_inc(new_map); 849 rcu_assign_pointer(st_link->map, new_map); 850 bpf_map_put(old_map); 851 852 err_out: 853 mutex_unlock(&update_mutex); 854 855 return err; 856 } 857 858 static const struct bpf_link_ops bpf_struct_ops_map_lops = { 859 .dealloc = bpf_struct_ops_map_link_dealloc, 860 .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, 861 .fill_link_info = bpf_struct_ops_map_link_fill_link_info, 862 .update_map = bpf_struct_ops_map_link_update, 863 }; 864 865 int bpf_struct_ops_link_create(union bpf_attr *attr) 866 { 867 struct bpf_struct_ops_link *link = NULL; 868 struct bpf_link_primer link_primer; 869 struct bpf_struct_ops_map *st_map; 870 struct bpf_map *map; 871 int err; 872 873 map = bpf_map_get(attr->link_create.map_fd); 874 if (IS_ERR(map)) 875 return PTR_ERR(map); 876 877 st_map = (struct bpf_struct_ops_map *)map; 878 879 if (!bpf_struct_ops_valid_to_reg(map)) { 880 err = -EINVAL; 881 goto err_out; 882 } 883 884 link = kzalloc(sizeof(*link), GFP_USER); 885 if (!link) { 886 err = -ENOMEM; 887 goto err_out; 888 } 889 bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); 890 891 err = bpf_link_prime(&link->link, &link_primer); 892 if (err) 893 goto err_out; 894 895 err = st_map->st_ops->reg(st_map->kvalue.data); 896 if (err) { 897 bpf_link_cleanup(&link_primer); 898 link = NULL; 899 goto err_out; 900 } 901 RCU_INIT_POINTER(link->map, map); 902 903 return bpf_link_settle(&link_primer); 904 905 err_out: 906 bpf_map_put(map); 907 kfree(link); 908 return err; 909 } 910 911