#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

/* Maximum size of ring buffer area is limited by 32-bit page offset within
 * record header, counted in pages. Reserve 8 bits for extensibility, and take
 * into account few extra pages for consumer/producer pages and
 * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
 * ring buffer.
 */
#define RINGBUF_MAX_DATA_SZ \
	(((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)

struct bpf_ringbuf {
	wait_queue_head_t waitq;
	struct irq_work work;
	u64 mask;
	struct page **pages;
	int nr_pages;
	spinlock_t spinlock ____cacheline_aligned_in_smp;
	/* For user-space producer ring buffers, an atomic_t busy bit is used
	 * to synchronize access to the ring buffers in the kernel, rather than
	 * the spinlock that is used for kernel-producer ring buffers. This is
	 * done because the ring buffer must hold a lock across a BPF program's
	 * callback:
	 *
	 *    __bpf_user_ringbuf_peek() // lock acquired
	 * -> program callback_fn()
	 * -> __bpf_user_ringbuf_sample_release() // lock released
	 *
	 * It is unsafe and incorrect to hold an IRQ spinlock across what could
	 * be a long execution window, so we instead simply disallow concurrent
	 * access to the ring buffer by kernel consumers, and return -EBUSY from
	 * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
	 */
	atomic_t busy ____cacheline_aligned_in_smp;
	/* Consumer and producer counters are put into separate pages to
	 * allow each position to be mapped with different permissions.
	 * This prevents a user-space application from modifying the
	 * position and ruining in-kernel tracking. The permissions of the
	 * pages depend on who is producing samples: user-space or the
	 * kernel.
	 *
	 * Kernel-producer
	 * ---------------
	 * The producer position and data pages are mapped as r/o in
	 * userspace. For this approach, bits in the header of samples are
	 * used to signal to user-space, and to other producers, whether a
	 * sample is currently being written.
	 *
	 * User-space producer
	 * -------------------
	 * Only the page containing the consumer position is mapped r/o in
	 * user-space. User-space producers also use bits of the header to
	 * communicate to the kernel, but the kernel must carefully check and
	 * validate each sample to ensure that they're correctly formatted, and
	 * fully contained within the ring buffer.
	 */
	unsigned long consumer_pos __aligned(PAGE_SIZE);
	unsigned long producer_pos __aligned(PAGE_SIZE);
	char data[] __aligned(PAGE_SIZE);
};

/* Map wrapper: ties the generic struct bpf_map to its backing ring buffer. */
struct bpf_ringbuf_map {
	struct bpf_map map;
	struct bpf_ringbuf *rb;
};

/* 8-byte ring buffer record header structure */
struct bpf_ringbuf_hdr {
	u32 len;
	u32 pg_off;
};

/* Allocate the page array and vmap() it into a contiguous kernel mapping.
 * Data pages are inserted into the page array twice (see diagram below) so
 * that records wrapping past the end of the data area remain virtually
 * contiguous. Returns NULL on any allocation failure.
 */
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
			    __GFP_NOWARN | __GFP_ZERO;
	int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
	int nr_data_pages = data_sz >> PAGE_SHIFT;
	int nr_pages = nr_meta_pages + nr_data_pages;
	struct page **pages, *page;
	struct bpf_ringbuf *rb;
	size_t array_size;
	int i;

	/* Each data page is mapped twice to allow "virtual"
	 * continuous read of samples wrapping around the end of ring
	 * buffer area:
	 * ------------------------------------------------------
	 * | meta pages |  real data pages  |  same data pages  |
	 * ------------------------------------------------------
	 * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
	 * ------------------------------------------------------
	 * |            | TA             DA | TA             DA |
	 * ------------------------------------------------------
	 *                               ^^^^^^^
	 *                                  |
	 * Here, no need to worry about special handling of wrapped-around
	 * data due to double-mapped data pages. This works both in kernel and
	 * when mmap()'ed in user-space, simplifying both kernel and
	 * user-space implementations significantly.
	 */
	array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
	pages = bpf_map_area_alloc(array_size, numa_node);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(numa_node, flags, 0);
		if (!page) {
			/* free only the pages allocated so far */
			nr_pages = i;
			goto err_free_pages;
		}
		pages[i] = page;
		/* data pages get a second entry to create the double mapping */
		if (i >= nr_meta_pages)
			pages[nr_data_pages + i] = page;
	}

	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
		  VM_MAP | VM_USERMAP, PAGE_KERNEL);
	if (rb) {
		/* pages array is referenced via rb->pages; not a leak */
		kmemleak_not_leak(pages);
		rb->pages = pages;
		rb->nr_pages = nr_pages;
		return rb;
	}

err_free_pages:
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
	return NULL;
}

/* irq_work callback: wake up epoll/poll waiters on the ring buffer. */
static void bpf_ringbuf_notify(struct irq_work *work)
{
	struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

	wake_up_all(&rb->waitq);
}

/* Allocate and initialize a ring buffer of data_sz bytes (power of 2,
 * page-aligned — validated by the caller). Returns NULL on failure.
 */
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
	struct bpf_ringbuf *rb;

	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
	if (!rb)
		return NULL;

	spin_lock_init(&rb->spinlock);
	atomic_set(&rb->busy, 0);
	init_waitqueue_head(&rb->waitq);
	init_irq_work(&rb->work, bpf_ringbuf_notify);

	rb->mask = data_sz - 1;
	rb->consumer_pos = 0;
	rb->producer_pos = 0;

	return rb;
}

/* .map_alloc callback: validate attributes and create the ringbuf map.
 * max_entries is the data size in bytes and must be a page-aligned power
 * of 2; key/value sizes must be zero.
 */
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
	struct bpf_ringbuf_map *rb_map;

	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->key_size || attr->value_size ||
	    !is_power_of_2(attr->max_entries) ||
	    !PAGE_ALIGNED(attr->max_entries))
		return ERR_PTR(-EINVAL);

#ifdef CONFIG_64BIT
	/* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
	if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
		return ERR_PTR(-E2BIG);
#endif

	rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
	if (!rb_map)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&rb_map->map, attr);

	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
	if (!rb_map->rb) {
		bpf_map_area_free(rb_map);
		return ERR_PTR(-ENOMEM);
	}

	return &rb_map->map;
}

/* Tear down a ring buffer: unmap the vmap() area, then release every
 * backing page and the page array itself.
 */
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
	/* copy pages pointer and nr_pages to local variable, as we are going
	 * to unmap rb itself with vunmap() below
	 */
	struct page **pages = rb->pages;
	int i, nr_pages = rb->nr_pages;

	vunmap(rb);
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
}

/* .map_free callback: free the ring buffer, then the map wrapper. */
static void ringbuf_map_free(struct bpf_map *map)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	bpf_ringbuf_free(rb_map->rb);
	bpf_map_area_free(rb_map);
}

/* Ring buffer maps have no key/value element semantics; all generic
 * element operations below report -ENOTSUPP.
 */
static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-ENOTSUPP);
}

static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
				   u64 flags)
{
	return -ENOTSUPP;
}

static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{
	return -ENOTSUPP;
}

/* mmap() handler for kernel-producer ring buffers: user-space may map the
 * consumer position page writable; everything else is read-only.
 */
static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		/* allow writable mapping for the consumer_pos only */
		if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EPERM;
	} else {
		vm_flags_clear(vma, VM_MAYWRITE);
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb,
				   vma->vm_pgoff + RINGBUF_PGOFF);
}

/* mmap() handler for user-space-producer ring buffers: the consumer
 * position page must stay read-only for user-space; producer position
 * and data pages may be mapped writable.
 */
static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		if (vma->vm_pgoff == 0)
			/* Disallow writable mappings to the consumer pointer,
			 * and allow writable mappings to both the producer
			 * position, and the ring buffer data itself.
			 */
			return -EPERM;
	} else {
		vm_flags_clear(vma, VM_MAYWRITE);
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}

/* Bytes of produced-but-not-consumed data currently in the ring buffer. */
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
	unsigned long cons_pos, prod_pos;

	cons_pos = smp_load_acquire(&rb->consumer_pos);
	prod_pos = smp_load_acquire(&rb->producer_pos);
	return prod_pos - cons_pos;
}

/* Total data capacity in bytes (mask is data_sz - 1, data_sz power of 2). */
static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
{
	return rb->mask + 1;
}

/* poll() for kernel-producer ring buffers: readable when data is pending. */
static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb))
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}

/* poll() for user-space-producer ring buffers: writable while not full. */
static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
		return EPOLLOUT | EPOLLWRNORM;
	return 0;
}

BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_kern,
	.map_poll = ringbuf_map_poll_kern,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_btf_id = &ringbuf_map_btf_ids[0],
};

BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops user_ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_user,
	.map_poll = ringbuf_map_poll_user,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_btf_id = &user_ringbuf_map_btf_ids[0],
};

/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
 * calculate offset from record metadata to ring buffer in pages, rounded
 * down. This page offset is stored as part of record metadata and allows to
 * restore struct bpf_ringbuf * from record pointer. This page offset is
 * stored at offset 4 of record metadata header.
 */
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
				     struct bpf_ringbuf_hdr *hdr)
{
	return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

/* Given pointer to ring buffer record header, restore pointer to struct
 * bpf_ringbuf itself by using page offset stored at offset 4
 */
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
	unsigned long addr = (unsigned long)(void *)hdr;
	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

	return (void*)((addr & PAGE_MASK) - off);
}

/* Reserve `size` bytes (plus header, rounded up to 8) in the ring buffer.
 * Returns a pointer to the sample payload, or NULL if the buffer is full,
 * the size is invalid, or the spinlock cannot be taken from NMI context.
 */
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
	unsigned long cons_pos, prod_pos,
		      new_prod_pos, flags;
	u32 len, pg_off;
	struct bpf_ringbuf_hdr *hdr;

	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
		return NULL;

	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
	if (len > ringbuf_total_data_sz(rb))
		return NULL;

	cons_pos = smp_load_acquire(&rb->consumer_pos);

	if (in_nmi()) {
		/* can't sleep-spin in NMI context; fail fast if contended */
		if (!spin_trylock_irqsave(&rb->spinlock, flags))
			return NULL;
	} else {
		spin_lock_irqsave(&rb->spinlock, flags);
	}

	prod_pos = rb->producer_pos;
	new_prod_pos = prod_pos + len;

	/* check for out of ringbuf space by ensuring producer position
	 * doesn't advance more than (ringbuf_size - 1) ahead
	 */
	if (new_prod_pos - cons_pos > rb->mask) {
		spin_unlock_irqrestore(&rb->spinlock, flags);
		return NULL;
	}

	hdr = (void *)rb->data + (prod_pos & rb->mask);
	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
	/* BUSY bit marks the record as in-flight until commit/discard */
	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
	hdr->pg_off = pg_off;

	/* pairs with consumer's smp_load_acquire() */
	smp_store_release(&rb->producer_pos, new_prod_pos);

	spin_unlock_irqrestore(&rb->spinlock, flags);

	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}

/* bpf_ringbuf_reserve() helper: no flags are supported; returns a pointer
 * to the reserved sample (as unsigned long) or 0 on failure.
 */
BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
	struct bpf_ringbuf_map *rb_map;

	if (unlikely(flags))
		return 0;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
	.func = bpf_ringbuf_reserve,
	.ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
	.arg3_type = ARG_ANYTHING,
};

/* Finalize a previously reserved sample: clear the BUSY bit (and set the
 * DISCARD bit if requested), then notify the consumer per wakeup flags.
 */
static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
	unsigned long rec_pos, cons_pos;
	struct bpf_ringbuf_hdr *hdr;
	struct bpf_ringbuf *rb;
	u32 new_len;

	hdr = sample - BPF_RINGBUF_HDR_SZ;
	rb = bpf_ringbuf_restore_from_rec(hdr);
	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
	if (discard)
		new_len |= BPF_RINGBUF_DISCARD_BIT;

	/* update record header with correct final size prefix */
	xchg(&hdr->len, new_len);

	/* if consumer caught up and is waiting for our record, notify about
	 * new data availability
	 */
	rec_pos = (void *)hdr - (void *)rb->data;
	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
		irq_work_queue(&rb->work);
}

/* bpf_ringbuf_submit() helper: commit a reserved sample as valid data. */
BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
	.func = bpf_ringbuf_submit,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

/* bpf_ringbuf_discard() helper: commit a reserved sample as discarded. */
BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, true /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
	.func = bpf_ringbuf_discard,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

/* bpf_ringbuf_output() helper: reserve + copy + submit in one call.
 * Returns 0 on success, -EINVAL on bad flags, -EAGAIN when full.
 */
BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
	   u64, flags)
{
	struct bpf_ringbuf_map *rb_map;
	void *rec;

	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
		return -EINVAL;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	rec = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!rec)
		return -EAGAIN;

	memcpy(rec, data, size);
	bpf_ringbuf_commit(rec, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
	.func = bpf_ringbuf_output,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
	.arg4_type = ARG_ANYTHING,
};

/* bpf_ringbuf_query() helper: report availability/size/positions. */
BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
	struct bpf_ringbuf *rb;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	switch (flags) {
	case BPF_RB_AVAIL_DATA:
		return ringbuf_avail_data_sz(rb);
	case BPF_RB_RING_SIZE:
		return ringbuf_total_data_sz(rb);
	case BPF_RB_CONS_POS:
		return smp_load_acquire(&rb->consumer_pos);
	case BPF_RB_PROD_POS:
		return smp_load_acquire(&rb->producer_pos);
	default:
		return 0;
	}
}

const struct bpf_func_proto bpf_ringbuf_query_proto = {
	.func = bpf_ringbuf_query,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
};

/* dynptr variant of reserve: on success initializes *ptr over the reserved
 * sample; on any failure sets *ptr to null and returns a negative error.
 */
BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
	   struct bpf_dynptr_kern *, ptr)
{
	struct bpf_ringbuf_map *rb_map;
	void *sample;
	int err;

	if (unlikely(flags)) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	err = bpf_dynptr_check_size(size);
	if (err) {
		bpf_dynptr_set_null(ptr);
		return err;
	}

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!sample) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
	.func = bpf_ringbuf_reserve_dynptr,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
};

/* dynptr variant of submit: a null dynptr is a no-op. */
BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
	.func = bpf_ringbuf_submit_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

/* dynptr variant of discard: a null dynptr is a no-op. */
BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
	.func = bpf_ringbuf_discard_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

/* Peek at the next sample written by a user-space producer, validating the
 * untrusted header fields (length, alignment, fit within the buffer) before
 * exposing it. NOTE(review): definition continues past the end of this
 * chunk; the remainder is not visible here.
 */
static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
{
	int err;
	u32 hdr_len, sample_len, total_len, flags, *hdr;
	u64 cons_pos, prod_pos;

	/* Synchronizes with smp_store_release() in user-space producer. */
	prod_pos = smp_load_acquire(&rb->producer_pos);
	if (prod_pos % 8)
		return -EINVAL;

	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
	cons_pos = smp_load_acquire(&rb->consumer_pos);
	if (cons_pos >= prod_pos)
		return -ENODATA;

	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
	/* Synchronizes with smp_store_release() in user-space producer. */
	hdr_len = smp_load_acquire(hdr);
	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
	sample_len = hdr_len & ~flags;
	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);

	/* The sample must fit within the region advertised by the producer position. */
	if (total_len > prod_pos - cons_pos)
		return -EINVAL;

	/* The sample must fit within the data region of the ring buffer. */
	if (total_len > ringbuf_total_data_sz(rb))
		return -E2BIG;

	/* The sample must fit into a struct bpf_dynptr. */
	err = bpf_dynptr_check_size(sample_len);
	if (err)
		return -E2BIG;

	if (flags & BPF_RINGBUF_DISCARD_BIT) {
		/* If the discard bit is set, the sample should be skipped.
		 *
		 * Update the consumer pos, and return -EAGAIN so the caller
		 * knows to skip this sample and try to read the next one.
68420571567SDavid Vernet */ 68520571567SDavid Vernet smp_store_release(&rb->consumer_pos, cons_pos + total_len); 68620571567SDavid Vernet return -EAGAIN; 68720571567SDavid Vernet } 68820571567SDavid Vernet 68920571567SDavid Vernet if (flags & BPF_RINGBUF_BUSY_BIT) 69020571567SDavid Vernet return -ENODATA; 69120571567SDavid Vernet 69220571567SDavid Vernet *sample = (void *)((uintptr_t)rb->data + 69320571567SDavid Vernet (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); 69420571567SDavid Vernet *size = sample_len; 69520571567SDavid Vernet return 0; 69620571567SDavid Vernet } 69720571567SDavid Vernet 69820571567SDavid Vernet static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) 69920571567SDavid Vernet { 70020571567SDavid Vernet u64 consumer_pos; 70120571567SDavid Vernet u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); 70220571567SDavid Vernet 70320571567SDavid Vernet /* Using smp_load_acquire() is unnecessary here, as the busy-bit 70420571567SDavid Vernet * prevents another task from writing to consumer_pos after it was read 70520571567SDavid Vernet * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). 70620571567SDavid Vernet */ 70720571567SDavid Vernet consumer_pos = rb->consumer_pos; 70820571567SDavid Vernet /* Synchronizes with smp_load_acquire() in user-space producer. 
*/ 70920571567SDavid Vernet smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); 71020571567SDavid Vernet } 71120571567SDavid Vernet 71220571567SDavid Vernet BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, 71320571567SDavid Vernet void *, callback_fn, void *, callback_ctx, u64, flags) 71420571567SDavid Vernet { 71520571567SDavid Vernet struct bpf_ringbuf *rb; 71620571567SDavid Vernet long samples, discarded_samples = 0, ret = 0; 71720571567SDavid Vernet bpf_callback_t callback = (bpf_callback_t)callback_fn; 71820571567SDavid Vernet u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; 71920571567SDavid Vernet int busy = 0; 72020571567SDavid Vernet 72120571567SDavid Vernet if (unlikely(flags & ~wakeup_flags)) 72220571567SDavid Vernet return -EINVAL; 72320571567SDavid Vernet 72420571567SDavid Vernet rb = container_of(map, struct bpf_ringbuf_map, map)->rb; 72520571567SDavid Vernet 72620571567SDavid Vernet /* If another consumer is already consuming a sample, wait for them to finish. 
*/ 72720571567SDavid Vernet if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) 72820571567SDavid Vernet return -EBUSY; 72920571567SDavid Vernet 73020571567SDavid Vernet for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { 73120571567SDavid Vernet int err; 73220571567SDavid Vernet u32 size; 73320571567SDavid Vernet void *sample; 73420571567SDavid Vernet struct bpf_dynptr_kern dynptr; 73520571567SDavid Vernet 73620571567SDavid Vernet err = __bpf_user_ringbuf_peek(rb, &sample, &size); 73720571567SDavid Vernet if (err) { 73820571567SDavid Vernet if (err == -ENODATA) { 73920571567SDavid Vernet break; 74020571567SDavid Vernet } else if (err == -EAGAIN) { 74120571567SDavid Vernet discarded_samples++; 74220571567SDavid Vernet continue; 74320571567SDavid Vernet } else { 74420571567SDavid Vernet ret = err; 74520571567SDavid Vernet goto schedule_work_return; 74620571567SDavid Vernet } 74720571567SDavid Vernet } 74820571567SDavid Vernet 74920571567SDavid Vernet bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); 75020571567SDavid Vernet ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); 75120571567SDavid Vernet __bpf_user_ringbuf_sample_release(rb, size, flags); 75220571567SDavid Vernet } 75320571567SDavid Vernet ret = samples - discarded_samples; 75420571567SDavid Vernet 75520571567SDavid Vernet schedule_work_return: 75620571567SDavid Vernet /* Prevent the clearing of the busy-bit from being reordered before the 75720571567SDavid Vernet * storing of any rb consumer or producer positions. 
75820571567SDavid Vernet */ 75920571567SDavid Vernet smp_mb__before_atomic(); 76020571567SDavid Vernet atomic_set(&rb->busy, 0); 76120571567SDavid Vernet 76220571567SDavid Vernet if (flags & BPF_RB_FORCE_WAKEUP) 76320571567SDavid Vernet irq_work_queue(&rb->work); 76420571567SDavid Vernet else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) 76520571567SDavid Vernet irq_work_queue(&rb->work); 76620571567SDavid Vernet return ret; 76720571567SDavid Vernet } 76820571567SDavid Vernet 76920571567SDavid Vernet const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { 77020571567SDavid Vernet .func = bpf_user_ringbuf_drain, 77120571567SDavid Vernet .ret_type = RET_INTEGER, 77220571567SDavid Vernet .arg1_type = ARG_CONST_MAP_PTR, 77320571567SDavid Vernet .arg2_type = ARG_PTR_TO_FUNC, 77420571567SDavid Vernet .arg3_type = ARG_PTR_TO_STACK_OR_NULL, 77520571567SDavid Vernet .arg4_type = ARG_ANYTHING, 77620571567SDavid Vernet }; 777