/*
 * Performance events ring-buffer code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/circ_buf.h>

#include "internal.h"

static void perf_output_wakeup(struct perf_output_handle *handle)
{
	atomic_set(&handle->rb->poll, POLL_IN);

	handle->event->pending_wakeup = 1;
	irq_work_queue(&handle->event->pending);
}

/*
 * We need to ensure a later event_id doesn't publish a head when a former
 * event isn't done writing. However since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event completes.
 */
static void perf_output_get_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;

	preempt_disable();
	local_inc(&rb->nest);
	handle->wakeup = local_read(&rb->wakeup);
}

static void perf_output_put_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;
	unsigned long head;

again:
	head = local_read(&rb->head);

	/*
	 * IRQ/NMI can happen here, which means we can miss a head update.
	 */

	if (!local_dec_and_test(&rb->nest))
		goto out;

	/*
	 * Since the mmap() consumer (userspace) can run on a different CPU:
	 *
	 *   kernel				user
	 *
	 *   READ ->data_tail			READ ->data_head
	 *   smp_mb()	(A)			smp_rmb()	(C)
	 *   WRITE $data			READ $data
	 *   smp_wmb()	(B)			smp_mb()	(D)
	 *   STORE ->data_head			WRITE ->data_tail
	 *
	 * Where A pairs with D, and B pairs with C.
	 *
	 * I don't think A needs to be a full barrier because we won't in fact
	 * write data until we see the store from userspace. So we simply don't
	 * issue the data WRITE until we observe it. Be conservative for now.
	 *
	 * OTOH, D needs to be a full barrier since it separates the data READ
	 * from the tail WRITE.
	 *
	 * For B a WMB is sufficient since it separates two WRITEs, and for C
	 * an RMB is sufficient since it separates two READs.
	 *
	 * See perf_output_begin().
	 */
	smp_wmb();
	rb->user_page->data_head = head;

	/*
	 * Now check if we missed an update -- rely on previous implied
	 * compiler barriers to force a re-read.
	 */
	if (unlikely(head != local_read(&rb->head))) {
		local_inc(&rb->nest);
		goto again;
	}

	if (handle->wakeup != local_read(&rb->wakeup))
		perf_output_wakeup(handle);

out:
	preempt_enable();
}
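
/*
 * For illustration only -- a minimal sketch of the userspace consumer
 * side that the pairing diagram above describes, assuming the buffer is
 * mapped with the struct perf_event_mmap_page in the first page and the
 * data area one page later. The names "base", "mask", rmb()/mb() and
 * process_record() are illustrative, not part of this file:
 *
 *	struct perf_event_mmap_page *pc = base;
 *	unsigned char *data = base + page_size;
 *	u64 tail = pc->data_tail;
 *	u64 head = pc->data_head;		// READ ->data_head
 *
 *	rmb();					// (C), pairs with (B)
 *	while (tail != head)			// READ $data
 *		tail += process_record(data + (tail & mask));
 *	mb();					// (D), pairs with (A)
 *	pc->data_tail = tail;			// WRITE ->data_tail
 *
 * The rmb() orders the ->data_head read before the data reads, and the
 * full barrier orders the data reads before publishing the new tail that
 * perf_output_begin() will observe.
 */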

int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size)
{
	struct ring_buffer *rb;
	unsigned long tail, offset, head;
	int have_lost, page_shift;
	struct {
		struct perf_event_header header;
		u64 id;
		u64 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	rb = rcu_dereference(event->rb);
	if (unlikely(!rb))
		goto out;

	if (unlikely(!rb->nr_pages))
		goto out;

	handle->rb = rb;
	handle->event = event;

	have_lost = local_read(&rb->lost);
	if (unlikely(have_lost)) {
		size += sizeof(lost_event);
		if (event->attr.sample_id_all)
			size += event->id_header_size;
	}

	perf_output_get_handle(handle);

	do {
		tail = ACCESS_ONCE(rb->user_page->data_tail);
		offset = head = local_read(&rb->head);
		if (!rb->overwrite &&
		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
			goto fail;
		head += size;
	} while (local_cmpxchg(&rb->head, offset, head) != offset);

	/*
	 * Separate the userpage->tail read from the data stores below.
	 * Matches the MB userspace SHOULD issue after reading the data
	 * and before storing the new tail position.
	 *
	 * See perf_output_put_handle().
	 */
	smp_mb();

	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
		local_add(rb->watermark, &rb->wakeup);

	page_shift = PAGE_SHIFT + page_order(rb);

	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
	offset &= (1UL << page_shift) - 1;
	handle->addr = rb->data_pages[handle->page] + offset;
	handle->size = (1UL << page_shift) - offset;

	if (unlikely(have_lost)) {
		struct perf_sample_data sample_data;

		lost_event.header.size = sizeof(lost_event);
		lost_event.header.type = PERF_RECORD_LOST;
		lost_event.header.misc = 0;
		lost_event.id = event->id;
		lost_event.lost = local_xchg(&rb->lost, 0);

		perf_event_header__init_id(&lost_event.header,
					   &sample_data, event);
		perf_output_put(handle, lost_event);
		perf_event__output_id_sample(event, handle, &sample_data);
	}

	return 0;

fail:
	local_inc(&rb->lost);
	perf_output_put_handle(handle);
out:
	rcu_read_unlock();

	return -ENOSPC;
}

unsigned int perf_output_copy(struct perf_output_handle *handle,
			      const void *buf, unsigned int len)
{
	return __output_copy(handle, buf, len);
}

unsigned int perf_output_skip(struct perf_output_handle *handle,
			      unsigned int len)
{
	return __output_skip(handle, NULL, len);
}

void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}
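
/*
 * For illustration only -- a rough sketch of how callers are expected to
 * drive the output API above (the record layout and the value are made
 * up for the example; real callers emit proper PERF_RECORD_* records):
 *
 *	struct perf_output_handle handle;
 *	struct {
 *		struct perf_event_header header;
 *		u64 value;
 *	} rec = {
 *		.header = {
 *			.type = PERF_RECORD_SAMPLE,
 *			.misc = 0,
 *			.size = sizeof(rec),
 *		},
 *		.value = 42,
 *	};
 *
 *	if (perf_output_begin(&handle, event, rec.header.size))
 *		return;		// -ENOSPC: no buffer or no room, record dropped
 *	perf_output_put(&handle, rec);
 *	perf_output_end(&handle);
 *
 * perf_output_begin() pins the buffer (RCU + rb->nest) and reserves the
 * requested bytes; perf_output_end() publishes ->data_head from the
 * outer-most nesting level and drops the references again.
 */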

static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
	long max_size = perf_data_size(rb);

	if (watermark)
		rb->watermark = min(max_size, watermark);

	if (!rb->watermark)
		rb->watermark = max_size / 2;

	if (flags & RING_BUFFER_WRITABLE)
		rb->overwrite = 0;
	else
		rb->overwrite = 1;

	atomic_set(&rb->refcount, 1);

	INIT_LIST_HEAD(&rb->event_list);
	spin_lock_init(&rb->event_lock);
}

#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	if (pgoff > rb->nr_pages)
		return NULL;

	if (pgoff == 0)
		return virt_to_page(rb->user_page);

	return virt_to_page(rb->data_pages[pgoff - 1]);
}

static void *perf_mmap_alloc_page(int cpu)
{
	struct page *page;
	int node;

	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	if (!page)
		return NULL;

	return page_address(page);
}

struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	int i;

	size = sizeof(struct ring_buffer);
	size += nr_pages * sizeof(void *);

	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	rb->user_page = perf_mmap_alloc_page(cpu);
	if (!rb->user_page)
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
		if (!rb->data_pages[i])
			goto fail_data_pages;
	}

	rb->nr_pages = nr_pages;

	ring_buffer_init(rb, watermark, flags);

	return rb;

fail_data_pages:
	for (i--; i >= 0; i--)
		free_page((unsigned long)rb->data_pages[i]);

	free_page((unsigned long)rb->user_page);

fail_user_page:
	kfree(rb);

fail:
	return NULL;
}

static void perf_mmap_free_page(unsigned long addr)
{
	struct page *page = virt_to_page((void *)addr);

	page->mapping = NULL;
	__free_page(page);
}

void rb_free(struct ring_buffer *rb)
{
	int i;

	perf_mmap_free_page((unsigned long)rb->user_page);
	for (i = 0; i < rb->nr_pages; i++)
		perf_mmap_free_page((unsigned long)rb->data_pages[i]);
	kfree(rb);
}

#else
static int data_page_nr(struct ring_buffer *rb)
{
	return rb->nr_pages << page_order(rb);
}

struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	/* The '>' counts in the user page. */
	if (pgoff > data_page_nr(rb))
		return NULL;

	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}

static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}

static void rb_free_work(struct work_struct *work)
{
	struct ring_buffer *rb;
	void *base;
	int i, nr;

	rb = container_of(work, struct ring_buffer, work);
	nr = data_page_nr(rb);

	base = rb->user_page;
	/* The '<=' counts in the user page. */
	for (i = 0; i <= nr; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
	kfree(rb);
}

void rb_free(struct ring_buffer *rb)
{
	schedule_work(&rb->work);
}

struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	void *all_buf;

	size = sizeof(struct ring_buffer);
	size += sizeof(void *);

	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	INIT_WORK(&rb->work, rb_free_work);

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

	rb->user_page = all_buf;
	rb->data_pages[0] = all_buf + PAGE_SIZE;
	rb->page_order = ilog2(nr_pages);
	rb->nr_pages = !!nr_pages;

	ring_buffer_init(rb, watermark, flags);

	return rb;

fail_all_buf:
	kfree(rb);

fail:
	return NULL;
}

#endif
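
/*
 * For illustration only -- with the CONFIG_PERF_USE_VMALLOC fallback
 * above, the whole buffer is one virtually contiguous allocation.
 * Assuming the usual power-of-two request, e.g. nr_pages == 8, the
 * layout seen by perf_mmap_to_page() is:
 *
 *	pgoff 0		-> user_page (struct perf_event_mmap_page)
 *	pgoff 1..8	-> data, addressed as data_pages[0] + offset
 *
 * rb->nr_pages is forced to !!nr_pages (0 or 1) and the real size is
 * carried in rb->page_order, so perf_data_size() and the index masking
 * in perf_output_begin() keep working on a single "super page".
 */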