/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		free_percpu(array->pptrs[i]);
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
	}

	return 0;
}

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	struct bpf_array *array;
	u64 array_size;
	u32 elem_size;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 || attr->map_flags)
		return ERR_PTR(-EINVAL);

	if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	elem_size = round_up(attr->value_size, 8);

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) attr->max_entries * sizeof(void *);
	else
		array_size += (u64) attr->max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	/* allocate all map elements and zero-initialize them */
	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
	if (!array) {
		array = vzalloc(array_size);
		if (!array)
			return ERR_PTR(-ENOMEM);
	}

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->elem_size = elem_size;

	if (!percpu)
		goto out;

	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

	if (array_size >= U32_MAX - PAGE_SIZE ||
	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
		kvfree(array);
		return ERR_PTR(-ENOMEM);
	}
out:
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	return &array->map;
}
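
/*
 * Illustrative sketch, not part of the kernel build: how user space might
 * create a BPF_MAP_TYPE_ARRAY that ends up in array_map_alloc() above.
 * key_size must be exactly 4 and value_size is rounded up to 8 bytes
 * internally.  create_array_map() is a name made up for the example;
 * error handling is omitted.
 *
 *	#define _GNU_SOURCE
 *	#include <linux/bpf.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int create_array_map(__u32 value_size, __u32 max_entries)
 *	{
 *		union bpf_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.map_type    = BPF_MAP_TYPE_ARRAY;
 *		attr.key_size    = 4;
 *		attr.value_size  = value_size;
 *		attr.max_entries = max_entries;
 *
 *		return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *	}
 */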

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * index;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
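
/*
 * Illustrative sketch, not part of the kernel build: a syscall lookup on a
 * BPF_MAP_TYPE_PERCPU_ARRAY lands in bpf_percpu_array_copy() above, so the
 * user buffer must provide round_up(value_size, 8) bytes for every possible
 * CPU (the possible-CPU count can be read from
 * /sys/devices/system/cpu/possible).  lookup_percpu_elem() is a name made up
 * for the example; error handling is omitted.
 *
 *	#define _GNU_SOURCE
 *	#include <linux/bpf.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int lookup_percpu_elem(int map_fd, __u32 key, void *values)
 *	{
 *		union bpf_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.map_fd = map_fd;
 *		attr.key    = (__u64)(unsigned long)&key;
 *		attr.value  = (__u64)(unsigned long)values;
 *
 *		return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr,
 *			       sizeof(attr));
 *	}
 */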

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index]),
		       value, map->value_size);
	else
		memcpy(array->value + array->elem_size * index,
		       value, map->value_size);
	return 0;
}

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
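
/*
 * Illustrative sketch, not part of the kernel build: updating one element of
 * a plain BPF_MAP_TYPE_ARRAY from user space, matching the flag checks in
 * array_map_update_elem() above.  All elements are pre-allocated, so
 * BPF_NOEXIST always fails with -EEXIST while BPF_ANY and BPF_EXIST succeed
 * for an in-range index.  update_array_elem() is a name made up for the
 * example; error handling is omitted.
 *
 *	#define _GNU_SOURCE
 *	#include <linux/bpf.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int update_array_elem(int map_fd, __u32 key, const void *value)
 *	{
 *		union bpf_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.map_fd = map_fd;
 *		attr.key    = (__u64)(unsigned long)&key;
 *		attr.value  = (__u64)(unsigned long)value;
 *		attr.flags  = BPF_ANY;
 *
 *		return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr,
 *			       sizeof(attr));
 *	}
 */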

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	kvfree(array);
}

static const struct bpf_map_ops array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list array_type __read_mostly = {
	.ops = &array_ops,
	.type = BPF_MAP_TYPE_ARRAY,
};

static const struct bpf_map_ops percpu_array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list percpu_array_type __read_mostly = {
	.ops = &percpu_array_ops,
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};

static int __init register_array_map(void)
{
	bpf_register_map_type(&array_type);
	bpf_register_map_type(&percpu_array_type);
	return 0;
}
late_initcall(register_array_map);

static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);
	kvfree(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static const struct bpf_map_ops prog_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
};

static struct bpf_map_type_list prog_array_type __read_mostly = {
	.ops = &prog_array_ops,
	.type = BPF_MAP_TYPE_PROG_ARRAY,
};

static int __init register_prog_array_map(void)
{
	bpf_register_map_type(&prog_array_type);
	return 0;
}
late_initcall(register_prog_array_map);
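
/*
 * Illustrative sketch, not part of the kernel build: populating one slot of a
 * BPF_MAP_TYPE_PROG_ARRAY from user space.  The value is the file descriptor
 * of an already loaded eBPF program, and bpf_fd_array_map_update_elem() above
 * accepts only BPF_ANY as the update flag.  Programs later jump through the
 * slot with the bpf_tail_call() helper.  set_tail_call_slot() is a name made
 * up for the example; error handling is omitted.
 *
 *	#define _GNU_SOURCE
 *	#include <linux/bpf.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int set_tail_call_slot(int prog_array_fd, __u32 index,
 *				      __u32 prog_fd)
 *	{
 *		union bpf_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.map_fd = prog_array_fd;
 *		attr.key    = (__u64)(unsigned long)&index;
 *		attr.value  = (__u64)(unsigned long)&prog_fd;
 *		attr.flags  = BPF_ANY;
 *
 *		return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr,
 *			       sizeof(attr));
 *	}
 */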

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	const struct perf_event_attr *attr;
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	event = perf_file->private_data;
	ee = ERR_PTR(-EINVAL);

	attr = perf_event_attrs(event);
	if (IS_ERR(attr) || attr->inherit)
		goto err_out;

	switch (attr->type) {
	case PERF_TYPE_SOFTWARE:
		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
			goto err_out;
		/* fall-through */
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
		ee = bpf_event_entry_gen(perf_file, map_file);
		if (ee)
			return ee;
		ee = ERR_PTR(-ENOMEM);
		/* fall-through */
	default:
		break;
	}

err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static const struct bpf_map_ops perf_event_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

static struct bpf_map_type_list perf_event_array_type __read_mostly = {
	.ops = &perf_event_array_ops,
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

static int __init register_perf_event_array_map(void)
{
	bpf_register_map_type(&perf_event_array_type);
	return 0;
}
late_initcall(register_perf_event_array_map);
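
/*
 * Illustrative sketch, not part of the kernel build: the kind of perf event
 * fd that perf_event_fd_array_get_ptr() above accepts - here a software
 * event with PERF_COUNT_SW_BPF_OUTPUT and inherit left unset.  Such an fd is
 * typically opened once per CPU (pid = -1, group_fd = -1, flags = 0) and
 * stored at index == cpu with BPF_MAP_UPDATE_ELEM so that programs can use
 * bpf_perf_event_output().  open_bpf_output_event() is a name made up for
 * the example; error handling is omitted.
 *
 *	#define _GNU_SOURCE
 *	#include <linux/perf_event.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int open_bpf_output_event(int cpu)
 *	{
 *		struct perf_event_attr attr;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.size        = sizeof(attr);
 *		attr.type        = PERF_TYPE_SOFTWARE;
 *		attr.config      = PERF_COUNT_SW_BPF_OUTPUT;
 *		attr.sample_type = PERF_SAMPLE_RAW;
 *
 *		return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
 *	}
 */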

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static const struct bpf_map_ops cgroup_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};

static struct bpf_map_type_list cgroup_array_type __read_mostly = {
	.ops = &cgroup_array_ops,
	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
};

static int __init register_cgroup_array_map(void)
{
	bpf_register_map_type(&cgroup_array_type);
	return 0;
}
late_initcall(register_cgroup_array_map);
#endif
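
/*
 * Illustrative sketch, not part of the kernel build: a BPF_MAP_TYPE_CGROUP_ARRAY
 * slot is filled with the file descriptor of a cgroup v2 directory, which
 * cgroup_fd_array_get_ptr() above resolves via cgroup_get_from_fd().  The
 * cgroup path is assumed to be a cgroup v2 directory (the mount point varies
 * between systems), and store_cgroup_fd() is a name made up for the example;
 * error handling is omitted.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/bpf.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int store_cgroup_fd(int cgroup_array_fd, __u32 index,
 *				   const char *cgroup_path)
 *	{
 *		union bpf_attr attr;
 *		__u32 cgroup_fd;
 *
 *		cgroup_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.map_fd = cgroup_array_fd;
 *		attr.key    = (__u64)(unsigned long)&index;
 *		attr.value  = (__u64)(unsigned long)&cgroup_fd;
 *		attr.flags  = BPF_ANY;
 *
 *		return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr,
 *			       sizeof(attr));
 *	}
 */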