1457f4436SAndrii Nakryiko #include <linux/bpf.h> 2457f4436SAndrii Nakryiko #include <linux/btf.h> 3457f4436SAndrii Nakryiko #include <linux/err.h> 4457f4436SAndrii Nakryiko #include <linux/irq_work.h> 5457f4436SAndrii Nakryiko #include <linux/slab.h> 6457f4436SAndrii Nakryiko #include <linux/filter.h> 7457f4436SAndrii Nakryiko #include <linux/mm.h> 8457f4436SAndrii Nakryiko #include <linux/vmalloc.h> 9457f4436SAndrii Nakryiko #include <linux/wait.h> 10457f4436SAndrii Nakryiko #include <linux/poll.h> 11ccff81e1SRustam Kovhaev #include <linux/kmemleak.h> 12457f4436SAndrii Nakryiko #include <uapi/linux/btf.h> 13c317ab71SMenglong Dong #include <linux/btf_ids.h> 14457f4436SAndrii Nakryiko 15457f4436SAndrii Nakryiko #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) 16457f4436SAndrii Nakryiko 17457f4436SAndrii Nakryiko /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ 18457f4436SAndrii Nakryiko #define RINGBUF_PGOFF \ 19457f4436SAndrii Nakryiko (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) 20457f4436SAndrii Nakryiko /* consumer page and producer page */ 21457f4436SAndrii Nakryiko #define RINGBUF_POS_PAGES 2 222f7e4ab2SYafang Shao #define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES) 23457f4436SAndrii Nakryiko 24457f4436SAndrii Nakryiko #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) 25457f4436SAndrii Nakryiko 26457f4436SAndrii Nakryiko /* Maximum size of ring buffer area is limited by 32-bit page offset within 27457f4436SAndrii Nakryiko * record header, counted in pages. Reserve 8 bits for extensibility, and take 28457f4436SAndrii Nakryiko * into account few extra pages for consumer/producer pages and 29457f4436SAndrii Nakryiko * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single 30457f4436SAndrii Nakryiko * ring buffer. 
31457f4436SAndrii Nakryiko */ 32457f4436SAndrii Nakryiko #define RINGBUF_MAX_DATA_SZ \ 33457f4436SAndrii Nakryiko (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) 34457f4436SAndrii Nakryiko 35457f4436SAndrii Nakryiko struct bpf_ringbuf { 36457f4436SAndrii Nakryiko wait_queue_head_t waitq; 37457f4436SAndrii Nakryiko struct irq_work work; 38457f4436SAndrii Nakryiko u64 mask; 39457f4436SAndrii Nakryiko struct page **pages; 40457f4436SAndrii Nakryiko int nr_pages; 41457f4436SAndrii Nakryiko spinlock_t spinlock ____cacheline_aligned_in_smp; 4220571567SDavid Vernet /* For user-space producer ring buffers, an atomic_t busy bit is used 4320571567SDavid Vernet * to synchronize access to the ring buffers in the kernel, rather than 4420571567SDavid Vernet * the spinlock that is used for kernel-producer ring buffers. This is 4520571567SDavid Vernet * done because the ring buffer must hold a lock across a BPF program's 4620571567SDavid Vernet * callback: 4720571567SDavid Vernet * 4820571567SDavid Vernet * __bpf_user_ringbuf_peek() // lock acquired 4920571567SDavid Vernet * -> program callback_fn() 5020571567SDavid Vernet * -> __bpf_user_ringbuf_sample_release() // lock released 5120571567SDavid Vernet * 5220571567SDavid Vernet * It is unsafe and incorrect to hold an IRQ spinlock across what could 5320571567SDavid Vernet * be a long execution window, so we instead simply disallow concurrent 5420571567SDavid Vernet * access to the ring buffer by kernel consumers, and return -EBUSY from 5520571567SDavid Vernet * __bpf_user_ringbuf_peek() if the busy bit is held by another task. 5620571567SDavid Vernet */ 5720571567SDavid Vernet atomic_t busy ____cacheline_aligned_in_smp; 58583c1f42SDavid Vernet /* Consumer and producer counters are put into separate pages to 59583c1f42SDavid Vernet * allow each position to be mapped with different permissions. 
60583c1f42SDavid Vernet * This prevents a user-space application from modifying the 61583c1f42SDavid Vernet * position and ruining in-kernel tracking. The permissions of the 62583c1f42SDavid Vernet * pages depend on who is producing samples: user-space or the 63583c1f42SDavid Vernet * kernel. 64583c1f42SDavid Vernet * 65583c1f42SDavid Vernet * Kernel-producer 66583c1f42SDavid Vernet * --------------- 67583c1f42SDavid Vernet * The producer position and data pages are mapped as r/o in 68583c1f42SDavid Vernet * userspace. For this approach, bits in the header of samples are 69583c1f42SDavid Vernet * used to signal to user-space, and to other producers, whether a 70583c1f42SDavid Vernet * sample is currently being written. 71583c1f42SDavid Vernet * 72583c1f42SDavid Vernet * User-space producer 73583c1f42SDavid Vernet * ------------------- 74583c1f42SDavid Vernet * Only the page containing the consumer position is mapped r/o in 75583c1f42SDavid Vernet * user-space. User-space producers also use bits of the header to 76583c1f42SDavid Vernet * communicate to the kernel, but the kernel must carefully check and 77583c1f42SDavid Vernet * validate each sample to ensure that they're correctly formatted, and 78583c1f42SDavid Vernet * fully contained within the ring buffer. 
79457f4436SAndrii Nakryiko */ 80457f4436SAndrii Nakryiko unsigned long consumer_pos __aligned(PAGE_SIZE); 81457f4436SAndrii Nakryiko unsigned long producer_pos __aligned(PAGE_SIZE); 82457f4436SAndrii Nakryiko char data[] __aligned(PAGE_SIZE); 83457f4436SAndrii Nakryiko }; 84457f4436SAndrii Nakryiko 85457f4436SAndrii Nakryiko struct bpf_ringbuf_map { 86457f4436SAndrii Nakryiko struct bpf_map map; 87457f4436SAndrii Nakryiko struct bpf_ringbuf *rb; 88457f4436SAndrii Nakryiko }; 89457f4436SAndrii Nakryiko 90457f4436SAndrii Nakryiko /* 8-byte ring buffer record header structure */ 91457f4436SAndrii Nakryiko struct bpf_ringbuf_hdr { 92457f4436SAndrii Nakryiko u32 len; 93457f4436SAndrii Nakryiko u32 pg_off; 94457f4436SAndrii Nakryiko }; 95457f4436SAndrii Nakryiko 96457f4436SAndrii Nakryiko static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) 97457f4436SAndrii Nakryiko { 98be4035c7SRoman Gushchin const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | 99be4035c7SRoman Gushchin __GFP_NOWARN | __GFP_ZERO; 1002f7e4ab2SYafang Shao int nr_meta_pages = RINGBUF_NR_META_PAGES; 101457f4436SAndrii Nakryiko int nr_data_pages = data_sz >> PAGE_SHIFT; 102457f4436SAndrii Nakryiko int nr_pages = nr_meta_pages + nr_data_pages; 103457f4436SAndrii Nakryiko struct page **pages, *page; 104457f4436SAndrii Nakryiko struct bpf_ringbuf *rb; 105457f4436SAndrii Nakryiko size_t array_size; 106457f4436SAndrii Nakryiko int i; 107457f4436SAndrii Nakryiko 108457f4436SAndrii Nakryiko /* Each data page is mapped twice to allow "virtual" 109457f4436SAndrii Nakryiko * continuous read of samples wrapping around the end of ring 110457f4436SAndrii Nakryiko * buffer area: 111457f4436SAndrii Nakryiko * ------------------------------------------------------ 112457f4436SAndrii Nakryiko * | meta pages | real data pages | same data pages | 113457f4436SAndrii Nakryiko * ------------------------------------------------------ 114457f4436SAndrii Nakryiko * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 
4 5 6 7 8 9 | 115457f4436SAndrii Nakryiko * ------------------------------------------------------ 116457f4436SAndrii Nakryiko * | | TA DA | TA DA | 117457f4436SAndrii Nakryiko * ------------------------------------------------------ 118457f4436SAndrii Nakryiko * ^^^^^^^ 119457f4436SAndrii Nakryiko * | 120457f4436SAndrii Nakryiko * Here, no need to worry about special handling of wrapped-around 121457f4436SAndrii Nakryiko * data due to double-mapped data pages. This works both in kernel and 122457f4436SAndrii Nakryiko * when mmap()'ed in user-space, simplifying both kernel and 123457f4436SAndrii Nakryiko * user-space implementations significantly. 124457f4436SAndrii Nakryiko */ 125457f4436SAndrii Nakryiko array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); 126be4035c7SRoman Gushchin pages = bpf_map_area_alloc(array_size, numa_node); 127457f4436SAndrii Nakryiko if (!pages) 128457f4436SAndrii Nakryiko return NULL; 129457f4436SAndrii Nakryiko 130457f4436SAndrii Nakryiko for (i = 0; i < nr_pages; i++) { 131457f4436SAndrii Nakryiko page = alloc_pages_node(numa_node, flags, 0); 132457f4436SAndrii Nakryiko if (!page) { 133457f4436SAndrii Nakryiko nr_pages = i; 134457f4436SAndrii Nakryiko goto err_free_pages; 135457f4436SAndrii Nakryiko } 136457f4436SAndrii Nakryiko pages[i] = page; 137457f4436SAndrii Nakryiko if (i >= nr_meta_pages) 138457f4436SAndrii Nakryiko pages[nr_data_pages + i] = page; 139457f4436SAndrii Nakryiko } 140457f4436SAndrii Nakryiko 141457f4436SAndrii Nakryiko rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, 142b293dcc4SHou Tao VM_MAP | VM_USERMAP, PAGE_KERNEL); 143457f4436SAndrii Nakryiko if (rb) { 144ccff81e1SRustam Kovhaev kmemleak_not_leak(pages); 145457f4436SAndrii Nakryiko rb->pages = pages; 146457f4436SAndrii Nakryiko rb->nr_pages = nr_pages; 147457f4436SAndrii Nakryiko return rb; 148457f4436SAndrii Nakryiko } 149457f4436SAndrii Nakryiko 150457f4436SAndrii Nakryiko err_free_pages: 151457f4436SAndrii Nakryiko for (i = 0; i < 
nr_pages; i++) 152457f4436SAndrii Nakryiko __free_page(pages[i]); 1538f58ee54SYafang Shao bpf_map_area_free(pages); 154457f4436SAndrii Nakryiko return NULL; 155457f4436SAndrii Nakryiko } 156457f4436SAndrii Nakryiko 157457f4436SAndrii Nakryiko static void bpf_ringbuf_notify(struct irq_work *work) 158457f4436SAndrii Nakryiko { 159457f4436SAndrii Nakryiko struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); 160457f4436SAndrii Nakryiko 161457f4436SAndrii Nakryiko wake_up_all(&rb->waitq); 162457f4436SAndrii Nakryiko } 163457f4436SAndrii Nakryiko 164457f4436SAndrii Nakryiko static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) 165457f4436SAndrii Nakryiko { 166457f4436SAndrii Nakryiko struct bpf_ringbuf *rb; 167457f4436SAndrii Nakryiko 168457f4436SAndrii Nakryiko rb = bpf_ringbuf_area_alloc(data_sz, numa_node); 169457f4436SAndrii Nakryiko if (!rb) 170abbdd081SRoman Gushchin return NULL; 171457f4436SAndrii Nakryiko 172457f4436SAndrii Nakryiko spin_lock_init(&rb->spinlock); 17320571567SDavid Vernet atomic_set(&rb->busy, 0); 174457f4436SAndrii Nakryiko init_waitqueue_head(&rb->waitq); 175457f4436SAndrii Nakryiko init_irq_work(&rb->work, bpf_ringbuf_notify); 176457f4436SAndrii Nakryiko 177457f4436SAndrii Nakryiko rb->mask = data_sz - 1; 178457f4436SAndrii Nakryiko rb->consumer_pos = 0; 179457f4436SAndrii Nakryiko rb->producer_pos = 0; 180457f4436SAndrii Nakryiko 181457f4436SAndrii Nakryiko return rb; 182457f4436SAndrii Nakryiko } 183457f4436SAndrii Nakryiko 184457f4436SAndrii Nakryiko static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) 185457f4436SAndrii Nakryiko { 186457f4436SAndrii Nakryiko struct bpf_ringbuf_map *rb_map; 187457f4436SAndrii Nakryiko 188457f4436SAndrii Nakryiko if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) 189457f4436SAndrii Nakryiko return ERR_PTR(-EINVAL); 190457f4436SAndrii Nakryiko 191457f4436SAndrii Nakryiko if (attr->key_size || attr->value_size || 192517bbe19SAndrii Nakryiko 
!is_power_of_2(attr->max_entries) || 193517bbe19SAndrii Nakryiko !PAGE_ALIGNED(attr->max_entries)) 194457f4436SAndrii Nakryiko return ERR_PTR(-EINVAL); 195457f4436SAndrii Nakryiko 196517bbe19SAndrii Nakryiko #ifdef CONFIG_64BIT 197517bbe19SAndrii Nakryiko /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ 198517bbe19SAndrii Nakryiko if (attr->max_entries > RINGBUF_MAX_DATA_SZ) 199517bbe19SAndrii Nakryiko return ERR_PTR(-E2BIG); 200517bbe19SAndrii Nakryiko #endif 201517bbe19SAndrii Nakryiko 20273cf09a3SYafang Shao rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); 203457f4436SAndrii Nakryiko if (!rb_map) 204457f4436SAndrii Nakryiko return ERR_PTR(-ENOMEM); 205457f4436SAndrii Nakryiko 206457f4436SAndrii Nakryiko bpf_map_init_from_attr(&rb_map->map, attr); 207457f4436SAndrii Nakryiko 208457f4436SAndrii Nakryiko rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); 209abbdd081SRoman Gushchin if (!rb_map->rb) { 21073cf09a3SYafang Shao bpf_map_area_free(rb_map); 211abbdd081SRoman Gushchin return ERR_PTR(-ENOMEM); 212457f4436SAndrii Nakryiko } 213457f4436SAndrii Nakryiko 214457f4436SAndrii Nakryiko return &rb_map->map; 215457f4436SAndrii Nakryiko } 216457f4436SAndrii Nakryiko 217457f4436SAndrii Nakryiko static void bpf_ringbuf_free(struct bpf_ringbuf *rb) 218457f4436SAndrii Nakryiko { 219457f4436SAndrii Nakryiko /* copy pages pointer and nr_pages to local variable, as we are going 220457f4436SAndrii Nakryiko * to unmap rb itself with vunmap() below 221457f4436SAndrii Nakryiko */ 222457f4436SAndrii Nakryiko struct page **pages = rb->pages; 223457f4436SAndrii Nakryiko int i, nr_pages = rb->nr_pages; 224457f4436SAndrii Nakryiko 225457f4436SAndrii Nakryiko vunmap(rb); 226457f4436SAndrii Nakryiko for (i = 0; i < nr_pages; i++) 227457f4436SAndrii Nakryiko __free_page(pages[i]); 2288f58ee54SYafang Shao bpf_map_area_free(pages); 229457f4436SAndrii Nakryiko } 230457f4436SAndrii Nakryiko 231457f4436SAndrii Nakryiko static void 
ringbuf_map_free(struct bpf_map *map) 232457f4436SAndrii Nakryiko { 233457f4436SAndrii Nakryiko struct bpf_ringbuf_map *rb_map; 234457f4436SAndrii Nakryiko 235457f4436SAndrii Nakryiko rb_map = container_of(map, struct bpf_ringbuf_map, map); 236457f4436SAndrii Nakryiko bpf_ringbuf_free(rb_map->rb); 23773cf09a3SYafang Shao bpf_map_area_free(rb_map); 238457f4436SAndrii Nakryiko } 239457f4436SAndrii Nakryiko 240457f4436SAndrii Nakryiko static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) 241457f4436SAndrii Nakryiko { 242457f4436SAndrii Nakryiko return ERR_PTR(-ENOTSUPP); 243457f4436SAndrii Nakryiko } 244457f4436SAndrii Nakryiko 245*d7ba4cc9SJP Kobryn static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, 246457f4436SAndrii Nakryiko u64 flags) 247457f4436SAndrii Nakryiko { 248457f4436SAndrii Nakryiko return -ENOTSUPP; 249457f4436SAndrii Nakryiko } 250457f4436SAndrii Nakryiko 251*d7ba4cc9SJP Kobryn static long ringbuf_map_delete_elem(struct bpf_map *map, void *key) 252457f4436SAndrii Nakryiko { 253457f4436SAndrii Nakryiko return -ENOTSUPP; 254457f4436SAndrii Nakryiko } 255457f4436SAndrii Nakryiko 256457f4436SAndrii Nakryiko static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, 257457f4436SAndrii Nakryiko void *next_key) 258457f4436SAndrii Nakryiko { 259457f4436SAndrii Nakryiko return -ENOTSUPP; 260457f4436SAndrii Nakryiko } 261457f4436SAndrii Nakryiko 262583c1f42SDavid Vernet static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) 263457f4436SAndrii Nakryiko { 264457f4436SAndrii Nakryiko struct bpf_ringbuf_map *rb_map; 265457f4436SAndrii Nakryiko 266457f4436SAndrii Nakryiko rb_map = container_of(map, struct bpf_ringbuf_map, map); 267457f4436SAndrii Nakryiko 26804ea3086SAndrii Nakryiko if (vma->vm_flags & VM_WRITE) { 26904ea3086SAndrii Nakryiko /* allow writable mapping for the consumer_pos only */ 27004ea3086SAndrii Nakryiko if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != 
PAGE_SIZE) 27104ea3086SAndrii Nakryiko return -EPERM; 27204ea3086SAndrii Nakryiko } else { 2731c71222eSSuren Baghdasaryan vm_flags_clear(vma, VM_MAYWRITE); 27404ea3086SAndrii Nakryiko } 27504ea3086SAndrii Nakryiko /* remap_vmalloc_range() checks size and offset constraints */ 276457f4436SAndrii Nakryiko return remap_vmalloc_range(vma, rb_map->rb, 277457f4436SAndrii Nakryiko vma->vm_pgoff + RINGBUF_PGOFF); 278457f4436SAndrii Nakryiko } 279457f4436SAndrii Nakryiko 280583c1f42SDavid Vernet static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) 281583c1f42SDavid Vernet { 282583c1f42SDavid Vernet struct bpf_ringbuf_map *rb_map; 283583c1f42SDavid Vernet 284583c1f42SDavid Vernet rb_map = container_of(map, struct bpf_ringbuf_map, map); 285583c1f42SDavid Vernet 286583c1f42SDavid Vernet if (vma->vm_flags & VM_WRITE) { 287583c1f42SDavid Vernet if (vma->vm_pgoff == 0) 288583c1f42SDavid Vernet /* Disallow writable mappings to the consumer pointer, 289583c1f42SDavid Vernet * and allow writable mappings to both the producer 290583c1f42SDavid Vernet * position, and the ring buffer data itself. 
291583c1f42SDavid Vernet */ 292583c1f42SDavid Vernet return -EPERM; 293583c1f42SDavid Vernet } else { 2941c71222eSSuren Baghdasaryan vm_flags_clear(vma, VM_MAYWRITE); 295583c1f42SDavid Vernet } 296583c1f42SDavid Vernet /* remap_vmalloc_range() checks size and offset constraints */ 297583c1f42SDavid Vernet return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); 298583c1f42SDavid Vernet } 299583c1f42SDavid Vernet 300457f4436SAndrii Nakryiko static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) 301457f4436SAndrii Nakryiko { 302457f4436SAndrii Nakryiko unsigned long cons_pos, prod_pos; 303457f4436SAndrii Nakryiko 304457f4436SAndrii Nakryiko cons_pos = smp_load_acquire(&rb->consumer_pos); 305457f4436SAndrii Nakryiko prod_pos = smp_load_acquire(&rb->producer_pos); 306457f4436SAndrii Nakryiko return prod_pos - cons_pos; 307457f4436SAndrii Nakryiko } 308457f4436SAndrii Nakryiko 30920571567SDavid Vernet static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) 31020571567SDavid Vernet { 31120571567SDavid Vernet return rb->mask + 1; 31220571567SDavid Vernet } 31320571567SDavid Vernet 31420571567SDavid Vernet static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, 315457f4436SAndrii Nakryiko struct poll_table_struct *pts) 316457f4436SAndrii Nakryiko { 317457f4436SAndrii Nakryiko struct bpf_ringbuf_map *rb_map; 318457f4436SAndrii Nakryiko 319457f4436SAndrii Nakryiko rb_map = container_of(map, struct bpf_ringbuf_map, map); 320457f4436SAndrii Nakryiko poll_wait(filp, &rb_map->rb->waitq, pts); 321457f4436SAndrii Nakryiko 322457f4436SAndrii Nakryiko if (ringbuf_avail_data_sz(rb_map->rb)) 323457f4436SAndrii Nakryiko return EPOLLIN | EPOLLRDNORM; 324457f4436SAndrii Nakryiko return 0; 325457f4436SAndrii Nakryiko } 326457f4436SAndrii Nakryiko 32720571567SDavid Vernet static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, 32820571567SDavid Vernet struct poll_table_struct *pts) 32920571567SDavid Vernet { 
33020571567SDavid Vernet struct bpf_ringbuf_map *rb_map; 33120571567SDavid Vernet 33220571567SDavid Vernet rb_map = container_of(map, struct bpf_ringbuf_map, map); 33320571567SDavid Vernet poll_wait(filp, &rb_map->rb->waitq, pts); 33420571567SDavid Vernet 33520571567SDavid Vernet if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) 33620571567SDavid Vernet return EPOLLOUT | EPOLLWRNORM; 33720571567SDavid Vernet return 0; 33820571567SDavid Vernet } 33920571567SDavid Vernet 3402f7e4ab2SYafang Shao static u64 ringbuf_map_mem_usage(const struct bpf_map *map) 3412f7e4ab2SYafang Shao { 3422f7e4ab2SYafang Shao struct bpf_ringbuf *rb; 3432f7e4ab2SYafang Shao int nr_data_pages; 3442f7e4ab2SYafang Shao int nr_meta_pages; 3452f7e4ab2SYafang Shao u64 usage = sizeof(struct bpf_ringbuf_map); 3462f7e4ab2SYafang Shao 3472f7e4ab2SYafang Shao rb = container_of(map, struct bpf_ringbuf_map, map)->rb; 3482f7e4ab2SYafang Shao usage += (u64)rb->nr_pages << PAGE_SHIFT; 3492f7e4ab2SYafang Shao nr_meta_pages = RINGBUF_NR_META_PAGES; 3502f7e4ab2SYafang Shao nr_data_pages = map->max_entries >> PAGE_SHIFT; 3512f7e4ab2SYafang Shao usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *); 3522f7e4ab2SYafang Shao return usage; 3532f7e4ab2SYafang Shao } 3542f7e4ab2SYafang Shao 355c317ab71SMenglong Dong BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) 356457f4436SAndrii Nakryiko const struct bpf_map_ops ringbuf_map_ops = { 357f4d05259SMartin KaFai Lau .map_meta_equal = bpf_map_meta_equal, 358457f4436SAndrii Nakryiko .map_alloc = ringbuf_map_alloc, 359457f4436SAndrii Nakryiko .map_free = ringbuf_map_free, 360583c1f42SDavid Vernet .map_mmap = ringbuf_map_mmap_kern, 36120571567SDavid Vernet .map_poll = ringbuf_map_poll_kern, 362457f4436SAndrii Nakryiko .map_lookup_elem = ringbuf_map_lookup_elem, 363457f4436SAndrii Nakryiko .map_update_elem = ringbuf_map_update_elem, 364457f4436SAndrii Nakryiko .map_delete_elem = ringbuf_map_delete_elem, 
365457f4436SAndrii Nakryiko .map_get_next_key = ringbuf_map_get_next_key, 3662f7e4ab2SYafang Shao .map_mem_usage = ringbuf_map_mem_usage, 367c317ab71SMenglong Dong .map_btf_id = &ringbuf_map_btf_ids[0], 368457f4436SAndrii Nakryiko }; 369457f4436SAndrii Nakryiko 370583c1f42SDavid Vernet BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) 371583c1f42SDavid Vernet const struct bpf_map_ops user_ringbuf_map_ops = { 372583c1f42SDavid Vernet .map_meta_equal = bpf_map_meta_equal, 373583c1f42SDavid Vernet .map_alloc = ringbuf_map_alloc, 374583c1f42SDavid Vernet .map_free = ringbuf_map_free, 375583c1f42SDavid Vernet .map_mmap = ringbuf_map_mmap_user, 37620571567SDavid Vernet .map_poll = ringbuf_map_poll_user, 377583c1f42SDavid Vernet .map_lookup_elem = ringbuf_map_lookup_elem, 378583c1f42SDavid Vernet .map_update_elem = ringbuf_map_update_elem, 379583c1f42SDavid Vernet .map_delete_elem = ringbuf_map_delete_elem, 380583c1f42SDavid Vernet .map_get_next_key = ringbuf_map_get_next_key, 3812f7e4ab2SYafang Shao .map_mem_usage = ringbuf_map_mem_usage, 382583c1f42SDavid Vernet .map_btf_id = &user_ringbuf_map_btf_ids[0], 383583c1f42SDavid Vernet }; 384583c1f42SDavid Vernet 385457f4436SAndrii Nakryiko /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, 386457f4436SAndrii Nakryiko * calculate offset from record metadata to ring buffer in pages, rounded 387457f4436SAndrii Nakryiko * down. This page offset is stored as part of record metadata and allows to 388457f4436SAndrii Nakryiko * restore struct bpf_ringbuf * from record pointer. This page offset is 389457f4436SAndrii Nakryiko * stored at offset 4 of record metadata header. 
390457f4436SAndrii Nakryiko */ 391457f4436SAndrii Nakryiko static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, 392457f4436SAndrii Nakryiko struct bpf_ringbuf_hdr *hdr) 393457f4436SAndrii Nakryiko { 394457f4436SAndrii Nakryiko return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; 395457f4436SAndrii Nakryiko } 396457f4436SAndrii Nakryiko 397457f4436SAndrii Nakryiko /* Given pointer to ring buffer record header, restore pointer to struct 398457f4436SAndrii Nakryiko * bpf_ringbuf itself by using page offset stored at offset 4 399457f4436SAndrii Nakryiko */ 400457f4436SAndrii Nakryiko static struct bpf_ringbuf * 401457f4436SAndrii Nakryiko bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) 402457f4436SAndrii Nakryiko { 403457f4436SAndrii Nakryiko unsigned long addr = (unsigned long)(void *)hdr; 404457f4436SAndrii Nakryiko unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; 405457f4436SAndrii Nakryiko 406457f4436SAndrii Nakryiko return (void*)((addr & PAGE_MASK) - off); 407457f4436SAndrii Nakryiko } 408457f4436SAndrii Nakryiko 409457f4436SAndrii Nakryiko static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) 410457f4436SAndrii Nakryiko { 411457f4436SAndrii Nakryiko unsigned long cons_pos, prod_pos, new_prod_pos, flags; 412457f4436SAndrii Nakryiko u32 len, pg_off; 413457f4436SAndrii Nakryiko struct bpf_ringbuf_hdr *hdr; 414457f4436SAndrii Nakryiko 415457f4436SAndrii Nakryiko if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) 416457f4436SAndrii Nakryiko return NULL; 417457f4436SAndrii Nakryiko 418457f4436SAndrii Nakryiko len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); 41920571567SDavid Vernet if (len > ringbuf_total_data_sz(rb)) 4204b81ccebSThadeu Lima de Souza Cascardo return NULL; 4214b81ccebSThadeu Lima de Souza Cascardo 422457f4436SAndrii Nakryiko cons_pos = smp_load_acquire(&rb->consumer_pos); 423457f4436SAndrii Nakryiko 424457f4436SAndrii Nakryiko if (in_nmi()) { 425457f4436SAndrii Nakryiko if (!spin_trylock_irqsave(&rb->spinlock, flags)) 
426457f4436SAndrii Nakryiko return NULL; 427457f4436SAndrii Nakryiko } else { 428457f4436SAndrii Nakryiko spin_lock_irqsave(&rb->spinlock, flags); 429457f4436SAndrii Nakryiko } 430457f4436SAndrii Nakryiko 431457f4436SAndrii Nakryiko prod_pos = rb->producer_pos; 432457f4436SAndrii Nakryiko new_prod_pos = prod_pos + len; 433457f4436SAndrii Nakryiko 434457f4436SAndrii Nakryiko /* check for out of ringbuf space by ensuring producer position 435457f4436SAndrii Nakryiko * doesn't advance more than (ringbuf_size - 1) ahead 436457f4436SAndrii Nakryiko */ 437457f4436SAndrii Nakryiko if (new_prod_pos - cons_pos > rb->mask) { 438457f4436SAndrii Nakryiko spin_unlock_irqrestore(&rb->spinlock, flags); 439457f4436SAndrii Nakryiko return NULL; 440457f4436SAndrii Nakryiko } 441457f4436SAndrii Nakryiko 442457f4436SAndrii Nakryiko hdr = (void *)rb->data + (prod_pos & rb->mask); 443457f4436SAndrii Nakryiko pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); 444457f4436SAndrii Nakryiko hdr->len = size | BPF_RINGBUF_BUSY_BIT; 445457f4436SAndrii Nakryiko hdr->pg_off = pg_off; 446457f4436SAndrii Nakryiko 447457f4436SAndrii Nakryiko /* pairs with consumer's smp_load_acquire() */ 448457f4436SAndrii Nakryiko smp_store_release(&rb->producer_pos, new_prod_pos); 449457f4436SAndrii Nakryiko 450457f4436SAndrii Nakryiko spin_unlock_irqrestore(&rb->spinlock, flags); 451457f4436SAndrii Nakryiko 452457f4436SAndrii Nakryiko return (void *)hdr + BPF_RINGBUF_HDR_SZ; 453457f4436SAndrii Nakryiko } 454457f4436SAndrii Nakryiko 455457f4436SAndrii Nakryiko BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) 456457f4436SAndrii Nakryiko { 457457f4436SAndrii Nakryiko struct bpf_ringbuf_map *rb_map; 458457f4436SAndrii Nakryiko 459457f4436SAndrii Nakryiko if (unlikely(flags)) 460457f4436SAndrii Nakryiko return 0; 461457f4436SAndrii Nakryiko 462457f4436SAndrii Nakryiko rb_map = container_of(map, struct bpf_ringbuf_map, map); 463457f4436SAndrii Nakryiko return (unsigned 
long)__bpf_ringbuf_reserve(rb_map->rb, size); 464457f4436SAndrii Nakryiko } 465457f4436SAndrii Nakryiko 466457f4436SAndrii Nakryiko const struct bpf_func_proto bpf_ringbuf_reserve_proto = { 467457f4436SAndrii Nakryiko .func = bpf_ringbuf_reserve, 468894f2a8bSKumar Kartikeya Dwivedi .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL, 469457f4436SAndrii Nakryiko .arg1_type = ARG_CONST_MAP_PTR, 470457f4436SAndrii Nakryiko .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, 471457f4436SAndrii Nakryiko .arg3_type = ARG_ANYTHING, 472457f4436SAndrii Nakryiko }; 473457f4436SAndrii Nakryiko 474457f4436SAndrii Nakryiko static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) 475457f4436SAndrii Nakryiko { 476457f4436SAndrii Nakryiko unsigned long rec_pos, cons_pos; 477457f4436SAndrii Nakryiko struct bpf_ringbuf_hdr *hdr; 478457f4436SAndrii Nakryiko struct bpf_ringbuf *rb; 479457f4436SAndrii Nakryiko u32 new_len; 480457f4436SAndrii Nakryiko 481457f4436SAndrii Nakryiko hdr = sample - BPF_RINGBUF_HDR_SZ; 482457f4436SAndrii Nakryiko rb = bpf_ringbuf_restore_from_rec(hdr); 483457f4436SAndrii Nakryiko new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; 484457f4436SAndrii Nakryiko if (discard) 485457f4436SAndrii Nakryiko new_len |= BPF_RINGBUF_DISCARD_BIT; 486457f4436SAndrii Nakryiko 487457f4436SAndrii Nakryiko /* update record header with correct final size prefix */ 488457f4436SAndrii Nakryiko xchg(&hdr->len, new_len); 489457f4436SAndrii Nakryiko 490457f4436SAndrii Nakryiko /* if consumer caught up and is waiting for our record, notify about 491457f4436SAndrii Nakryiko * new data availability 492457f4436SAndrii Nakryiko */ 493457f4436SAndrii Nakryiko rec_pos = (void *)hdr - (void *)rb->data; 494457f4436SAndrii Nakryiko cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; 495457f4436SAndrii Nakryiko 496457f4436SAndrii Nakryiko if (flags & BPF_RB_FORCE_WAKEUP) 497457f4436SAndrii Nakryiko irq_work_queue(&rb->work); 498457f4436SAndrii Nakryiko else if (cons_pos == rec_pos && !(flags & 
BPF_RB_NO_WAKEUP)) 499457f4436SAndrii Nakryiko irq_work_queue(&rb->work); 500457f4436SAndrii Nakryiko } 501457f4436SAndrii Nakryiko 502457f4436SAndrii Nakryiko BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) 503457f4436SAndrii Nakryiko { 504457f4436SAndrii Nakryiko bpf_ringbuf_commit(sample, flags, false /* discard */); 505457f4436SAndrii Nakryiko return 0; 506457f4436SAndrii Nakryiko } 507457f4436SAndrii Nakryiko 508457f4436SAndrii Nakryiko const struct bpf_func_proto bpf_ringbuf_submit_proto = { 509457f4436SAndrii Nakryiko .func = bpf_ringbuf_submit, 510457f4436SAndrii Nakryiko .ret_type = RET_VOID, 511894f2a8bSKumar Kartikeya Dwivedi .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, 512457f4436SAndrii Nakryiko .arg2_type = ARG_ANYTHING, 513457f4436SAndrii Nakryiko }; 514457f4436SAndrii Nakryiko 515457f4436SAndrii Nakryiko BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) 516457f4436SAndrii Nakryiko { 517457f4436SAndrii Nakryiko bpf_ringbuf_commit(sample, flags, true /* discard */); 518457f4436SAndrii Nakryiko return 0; 519457f4436SAndrii Nakryiko } 520457f4436SAndrii Nakryiko 521457f4436SAndrii Nakryiko const struct bpf_func_proto bpf_ringbuf_discard_proto = { 522457f4436SAndrii Nakryiko .func = bpf_ringbuf_discard, 523457f4436SAndrii Nakryiko .ret_type = RET_VOID, 524894f2a8bSKumar Kartikeya Dwivedi .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, 525457f4436SAndrii Nakryiko .arg2_type = ARG_ANYTHING, 526457f4436SAndrii Nakryiko }; 527457f4436SAndrii Nakryiko 528457f4436SAndrii Nakryiko BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, 529457f4436SAndrii Nakryiko u64, flags) 530457f4436SAndrii Nakryiko { 531457f4436SAndrii Nakryiko struct bpf_ringbuf_map *rb_map; 532457f4436SAndrii Nakryiko void *rec; 533457f4436SAndrii Nakryiko 534457f4436SAndrii Nakryiko if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) 535457f4436SAndrii Nakryiko return -EINVAL; 536457f4436SAndrii Nakryiko 537457f4436SAndrii 
Nakryiko rb_map = container_of(map, struct bpf_ringbuf_map, map); 538457f4436SAndrii Nakryiko rec = __bpf_ringbuf_reserve(rb_map->rb, size); 539457f4436SAndrii Nakryiko if (!rec) 540457f4436SAndrii Nakryiko return -EAGAIN; 541457f4436SAndrii Nakryiko 542457f4436SAndrii Nakryiko memcpy(rec, data, size); 543457f4436SAndrii Nakryiko bpf_ringbuf_commit(rec, flags, false /* discard */); 544457f4436SAndrii Nakryiko return 0; 545457f4436SAndrii Nakryiko } 546457f4436SAndrii Nakryiko 547457f4436SAndrii Nakryiko const struct bpf_func_proto bpf_ringbuf_output_proto = { 548457f4436SAndrii Nakryiko .func = bpf_ringbuf_output, 549457f4436SAndrii Nakryiko .ret_type = RET_INTEGER, 550457f4436SAndrii Nakryiko .arg1_type = ARG_CONST_MAP_PTR, 551216e3cd2SHao Luo .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 552457f4436SAndrii Nakryiko .arg3_type = ARG_CONST_SIZE_OR_ZERO, 553457f4436SAndrii Nakryiko .arg4_type = ARG_ANYTHING, 554457f4436SAndrii Nakryiko }; 555457f4436SAndrii Nakryiko 556457f4436SAndrii Nakryiko BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) 557457f4436SAndrii Nakryiko { 558457f4436SAndrii Nakryiko struct bpf_ringbuf *rb; 559457f4436SAndrii Nakryiko 560457f4436SAndrii Nakryiko rb = container_of(map, struct bpf_ringbuf_map, map)->rb; 561457f4436SAndrii Nakryiko 562457f4436SAndrii Nakryiko switch (flags) { 563457f4436SAndrii Nakryiko case BPF_RB_AVAIL_DATA: 564457f4436SAndrii Nakryiko return ringbuf_avail_data_sz(rb); 565457f4436SAndrii Nakryiko case BPF_RB_RING_SIZE: 56620571567SDavid Vernet return ringbuf_total_data_sz(rb); 567457f4436SAndrii Nakryiko case BPF_RB_CONS_POS: 568457f4436SAndrii Nakryiko return smp_load_acquire(&rb->consumer_pos); 569457f4436SAndrii Nakryiko case BPF_RB_PROD_POS: 570457f4436SAndrii Nakryiko return smp_load_acquire(&rb->producer_pos); 571457f4436SAndrii Nakryiko default: 572457f4436SAndrii Nakryiko return 0; 573457f4436SAndrii Nakryiko } 574457f4436SAndrii Nakryiko } 575457f4436SAndrii Nakryiko 576457f4436SAndrii Nakryiko 
const struct bpf_func_proto bpf_ringbuf_query_proto = {
	.func		= bpf_ringbuf_query,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
};

/* BPF helper: reserve a sample and initialize a ringbuf-backed dynptr over
 * it. On any failure the dynptr is set to NULL so that a subsequent
 * submit/discard on it is a harmless no-op. Returns 0 on success, -EINVAL
 * on non-zero flags or reservation failure, or the error from the dynptr
 * size check.
 */
BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
	   struct bpf_dynptr_kern *, ptr)
{
	struct bpf_ringbuf_map *rb_map;
	void *sample;
	int err;

	/* No flags are currently defined for reserve. */
	if (unlikely(flags)) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	/* The requested size must be representable inside a dynptr. */
	err = bpf_dynptr_check_size(size);
	if (err) {
		bpf_dynptr_set_null(ptr);
		return err;
	}

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!sample) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
	.func		= bpf_ringbuf_reserve_dynptr,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
};

/* BPF helper: submit the sample wrapped by a ringbuf dynptr. A NULL-data
 * dynptr (e.g. after a failed reserve) is silently ignored; the dynptr is
 * invalidated afterwards either way.
 */
BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
	.func		= bpf_ringbuf_submit_dynptr,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type	= ARG_ANYTHING,
};

/* BPF helper: discard the sample wrapped by a ringbuf dynptr. Mirrors
 * bpf_ringbuf_submit_dynptr() but marks the record discarded.
 */
BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
	.func		= bpf_ringbuf_discard_dynptr,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type	= ARG_ANYTHING,
};

/* Peek at the next sample of a user-space-producer ring buffer.
 *
 * On success, *sample points at the payload (past the record header) and
 * *size is the payload length. Return values:
 *   -EINVAL  misaligned producer position, or a sample that claims to extend
 *            past the advertised producer position;
 *   -ENODATA ring empty, or the next sample is still being written (busy bit);
 *   -EAGAIN  a discarded sample was skipped (consumer_pos already advanced);
 *   -E2BIG   sample too large for the ring or for a dynptr.
 *
 * Caller must hold rb->busy (see bpf_user_ringbuf_drain()); consumer_pos is
 * only advanced here for discarded samples, otherwise via
 * __bpf_user_ringbuf_sample_release().
 */
static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
{
	int err;
	u32 hdr_len, sample_len, total_len, flags, *hdr;
	u64 cons_pos, prod_pos;

	/* Synchronizes with smp_store_release() in user-space producer. */
	prod_pos = smp_load_acquire(&rb->producer_pos);
	/* Positions advance in 8-byte steps; anything else means user space
	 * corrupted the producer position.
	 */
	if (prod_pos % 8)
		return -EINVAL;

	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
	cons_pos = smp_load_acquire(&rb->consumer_pos);
	if (cons_pos >= prod_pos)
		return -ENODATA;

	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
	/* Synchronizes with smp_store_release() in user-space producer. */
	hdr_len = smp_load_acquire(hdr);
	/* The top bits of the header carry the busy/discard flags; the rest is
	 * the sample length.
	 */
	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
	sample_len = hdr_len & ~flags;
	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);

	/* The sample must fit within the region advertised by the producer position. */
	if (total_len > prod_pos - cons_pos)
		return -EINVAL;

	/* The sample must fit within the data region of the ring buffer. */
	if (total_len > ringbuf_total_data_sz(rb))
		return -E2BIG;

	/* The sample must fit into a struct bpf_dynptr. */
	err = bpf_dynptr_check_size(sample_len);
	if (err)
		return -E2BIG;

	if (flags & BPF_RINGBUF_DISCARD_BIT) {
		/* If the discard bit is set, the sample should be skipped.
		 *
		 * Update the consumer pos, and return -EAGAIN so the caller
		 * knows to skip this sample and try to read the next one.
		 */
		smp_store_release(&rb->consumer_pos, cons_pos + total_len);
		return -EAGAIN;
	}

	if (flags & BPF_RINGBUF_BUSY_BIT)
		return -ENODATA;

	*sample = (void *)((uintptr_t)rb->data +
			   (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
	*size = sample_len;
	return 0;
}

/* Release the sample returned by __bpf_user_ringbuf_peek() by advancing the
 * consumer position past the whole record (header + payload, rounded up to
 * 8 bytes). Caller must still hold rb->busy.
 */
static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
{
	u64 consumer_pos;
	u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);

	/* Using smp_load_acquire() is unnecessary here, as the busy-bit
	 * prevents another task from writing to consumer_pos after it was read
	 * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
	 */
	consumer_pos = rb->consumer_pos;
	/* Synchronizes with smp_load_acquire() in user-space producer. */
	smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
}

/* BPF helper: drain up to BPF_MAX_USER_RINGBUF_SAMPLES samples from a
 * user-space-producer ring buffer, invoking callback_fn(dynptr, ctx) on each.
 * Iteration also stops when the ring is empty or the callback returns
 * non-zero. Returns the number of samples consumed (excluding discarded
 * ones), -EINVAL on unknown flags, -EBUSY if another consumer holds the
 * ring, or the error from a failed peek. Concurrent kernel consumers are
 * excluded via the rb->busy bit (see the comment on struct bpf_ringbuf).
 */
BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
	   void *, callback_fn, void *, callback_ctx, u64, flags)
{
	struct bpf_ringbuf *rb;
	long samples, discarded_samples = 0, ret = 0;
	bpf_callback_t callback = (bpf_callback_t)callback_fn;
	u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
	int busy = 0;

	if (unlikely(flags & ~wakeup_flags))
		return -EINVAL;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	/* If another consumer is already consuming a sample, wait for them to finish. */
	if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
		return -EBUSY;

	for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
		int err;
		u32 size;
		void *sample;
		struct bpf_dynptr_kern dynptr;

		err = __bpf_user_ringbuf_peek(rb, &sample, &size);
		if (err) {
			if (err == -ENODATA) {
				/* Ring empty (or next sample still busy). */
				break;
			} else if (err == -EAGAIN) {
				/* Discarded sample: already skipped by peek. */
				discarded_samples++;
				continue;
			} else {
				ret = err;
				goto schedule_work_return;
			}
		}

		bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
		ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
		__bpf_user_ringbuf_sample_release(rb, size, flags);
	}
	/* A non-zero callback return only stops iteration; the helper still
	 * reports the number of non-discarded samples processed.
	 */
	ret = samples - discarded_samples;

schedule_work_return:
	/* Prevent the clearing of the busy-bit from being reordered before the
	 * storing of any rb consumer or producer positions.
	 */
	smp_mb__before_atomic();
	atomic_set(&rb->busy, 0);

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
		irq_work_queue(&rb->work);
	return ret;
}

const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
	.func		= bpf_user_ringbuf_drain,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_FUNC,
	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type	= ARG_ANYTHING,
};