// SPDX-License-Identifier: GPL-2.0
/*
 * queue_stack_maps.c: BPF queue and stack maps
 *
 * Copyright (c) 2018 Politecnico di Torino
 */
#include <linux/bpf.h>
#include <linux/list.h>
#include <linux/slab.h>
#include "percpu_freelist.h"

#define QUEUE_STACK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)


struct bpf_queue_stack {
	struct bpf_map map;
	raw_spinlock_t lock;
	u32 head, tail;
	u32 size; /* max_entries + 1 */

	char elements[0] __aligned(8);
};

static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
{
	return container_of(map, struct bpf_queue_stack, map);
}

static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs)
{
	return qs->head == qs->tail;
}

static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
{
	u32 head = qs->head + 1;

	if (unlikely(head >= qs->size))
		head = 0;

	return head == qs->tail;
}

/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 0 ||
	    attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)
		return -EINVAL;

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return -E2BIG;

	return 0;
}

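/* Allocate the ring buffer that backs both queue and stack maps. One spare
 * slot (size = max_entries + 1) keeps a full ring distinguishable from an
 * empty one: empty is head == tail, full is (head + 1) % size == tail. The
 * memory is charged against the memlock limit before the allocation is made.
 */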
static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
	int ret, numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_queue_stack *qs;
	u32 size, value_size;
	u64 queue_size, cost;

	size = attr->max_entries + 1;
	value_size = attr->value_size;

	queue_size = sizeof(*qs) + (u64) value_size * size;

	cost = queue_size;
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-E2BIG);

	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	ret = bpf_map_precharge_memlock(cost);
	if (ret < 0)
		return ERR_PTR(ret);

	qs = bpf_map_area_alloc(queue_size, numa_node);
	if (!qs)
		return ERR_PTR(-ENOMEM);

	memset(qs, 0, sizeof(*qs));

	bpf_map_init_from_attr(&qs->map, attr);

	qs->map.pages = cost;
	qs->size = size;

	raw_spin_lock_init(&qs->lock);

	return &qs->map;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void queue_stack_map_free(struct bpf_map *map)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	bpf_map_area_free(qs);
}

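/* Element access for both map flavours: queue maps consume from the tail
 * (FIFO), stack maps consume from just below the head (LIFO). Both copy the
 * element out under qs->lock and, when @delete is set, also advance the
 * corresponding index so the slot can be reused.
 */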
static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);
	unsigned long flags;
	int err = 0;
	void *ptr;

	raw_spin_lock_irqsave(&qs->lock, flags);

	if (queue_stack_map_is_empty(qs)) {
		err = -ENOENT;
		goto out;
	}

	ptr = &qs->elements[qs->tail * qs->map.value_size];
	memcpy(value, ptr, qs->map.value_size);

	if (delete) {
		if (unlikely(++qs->tail >= qs->size))
			qs->tail = 0;
	}

out:
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	return err;
}


static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);
	unsigned long flags;
	int err = 0;
	void *ptr;
	u32 index;

	raw_spin_lock_irqsave(&qs->lock, flags);

	if (queue_stack_map_is_empty(qs)) {
		err = -ENOENT;
		goto out;
	}

	index = qs->head - 1;
	if (unlikely(index >= qs->size))
		index = qs->size - 1;

	ptr = &qs->elements[index * qs->map.value_size];
	memcpy(value, ptr, qs->map.value_size);

	if (delete)
		qs->head = index;

out:
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	return err;
}

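/* Thin wrappers wired into the bpf_map_ops tables below. From a BPF program
 * they back the bpf_map_peek_elem() and bpf_map_pop_elem() helpers; from user
 * space the peek and pop paths are reached through the map lookup and
 * lookup-and-delete syscall commands, respectively.
 */
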
/* Called from syscall or from eBPF program */
static int queue_map_peek_elem(struct bpf_map *map, void *value)
{
	return __queue_map_get(map, value, false);
}

/* Called from syscall or from eBPF program */
static int stack_map_peek_elem(struct bpf_map *map, void *value)
{
	return __stack_map_get(map, value, false);
}

/* Called from syscall or from eBPF program */
static int queue_map_pop_elem(struct bpf_map *map, void *value)
{
	return __queue_map_get(map, value, true);
}

/* Called from syscall or from eBPF program */
static int stack_map_pop_elem(struct bpf_map *map, void *value)
{
	return __stack_map_get(map, value, true);
}

/* Called from syscall or from eBPF program */
static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
				     u64 flags)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);
	unsigned long irq_flags;
	int err = 0;
	void *dst;

	/* BPF_EXIST is used to force making room for a new element in case the
	 * map is full
	 */
	bool replace = (flags & BPF_EXIST);

	/* Check supported flags for queue and stack maps */
	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
		return -EINVAL;

	raw_spin_lock_irqsave(&qs->lock, irq_flags);

	if (queue_stack_map_is_full(qs)) {
		if (!replace) {
			err = -E2BIG;
			goto out;
		}
		/* advance tail pointer to overwrite oldest element */
		if (unlikely(++qs->tail >= qs->size))
			qs->tail = 0;
	}

	dst = &qs->elements[qs->head * qs->map.value_size];
	memcpy(dst, value, qs->map.value_size);

	if (unlikely(++qs->head >= qs->size))
		qs->head = 0;

out:
	raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
	return err;
}

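/* Queue and stack maps are keyless (alloc_check rejects key_size != 0), so
 * the generic per-key operations below are refused; the push, pop and peek
 * paths above are the only way to touch the map contents.
 */
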
/* Called from syscall or from eBPF program */
static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* Called from syscall or from eBPF program */
static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called from syscall */
static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
					void *next_key)
{
	return -EINVAL;
}

const struct bpf_map_ops queue_map_ops = {
	.map_alloc_check = queue_stack_map_alloc_check,
	.map_alloc = queue_stack_map_alloc,
	.map_free = queue_stack_map_free,
	.map_lookup_elem = queue_stack_map_lookup_elem,
	.map_update_elem = queue_stack_map_update_elem,
	.map_delete_elem = queue_stack_map_delete_elem,
	.map_push_elem = queue_stack_map_push_elem,
	.map_pop_elem = queue_map_pop_elem,
	.map_peek_elem = queue_map_peek_elem,
	.map_get_next_key = queue_stack_map_get_next_key,
};

const struct bpf_map_ops stack_map_ops = {
	.map_alloc_check = queue_stack_map_alloc_check,
	.map_alloc = queue_stack_map_alloc,
	.map_free = queue_stack_map_free,
	.map_lookup_elem = queue_stack_map_lookup_elem,
	.map_update_elem = queue_stack_map_update_elem,
	.map_delete_elem = queue_stack_map_delete_elem,
	.map_push_elem = queue_stack_map_push_elem,
	.map_pop_elem = stack_map_pop_elem,
	.map_peek_elem = stack_map_peek_elem,
	.map_get_next_key = queue_stack_map_get_next_key,
};

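/* Illustrative sketch (not part of this file): how a BPF program might feed a
 * queue map through the push helper that queue_stack_map_push_elem() backs.
 * The map name "rx_ts", the program name and the legacy bpf_map_def
 * definition style from the selftests' bpf_helpers.h are assumptions of the
 * example, as is the availability of bpf_map_push_elem() to the chosen
 * program type.
 *
 *	#include <linux/bpf.h>
 *	#include "bpf_helpers.h"
 *
 *	struct bpf_map_def SEC("maps") rx_ts = {
 *		.type		= BPF_MAP_TYPE_QUEUE,
 *		.key_size	= 0,		// queue/stack maps are keyless
 *		.value_size	= sizeof(__u64),
 *		.max_entries	= 64,
 *	};
 *
 *	SEC("xdp")
 *	int push_rx_timestamp(struct xdp_md *ctx)
 *	{
 *		__u64 ts = bpf_ktime_get_ns();
 *
 *		// BPF_EXIST makes room by dropping the oldest element when full
 *		bpf_map_push_elem(&rx_ts, &ts, BPF_EXIST);
 *		return XDP_PASS;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 *
 * User space can then drain the queue with the lookup-and-delete map command,
 * e.g. via bpf_map_lookup_and_delete_elem(map_fd, NULL, &value).
 */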