1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Performance events ring-buffer code: 4 * 5 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 7 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra 8 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 9 */ 10 11 #include <linux/perf_event.h> 12 #include <linux/vmalloc.h> 13 #include <linux/slab.h> 14 #include <linux/circ_buf.h> 15 #include <linux/poll.h> 16 #include <linux/nospec.h> 17 18 #include "internal.h" 19 20 static void perf_output_wakeup(struct perf_output_handle *handle) 21 { 22 atomic_set(&handle->rb->poll, EPOLLIN); 23 24 handle->event->pending_wakeup = 1; 25 irq_work_queue(&handle->event->pending); 26 } 27 28 /* 29 * We need to ensure a later event_id doesn't publish a head when a former 30 * event isn't done writing. However since we need to deal with NMIs we 31 * cannot fully serialize things. 32 * 33 * We only publish the head (and generate a wakeup) when the outer-most 34 * event completes. 35 */ 36 static void perf_output_get_handle(struct perf_output_handle *handle) 37 { 38 struct perf_buffer *rb = handle->rb; 39 40 preempt_disable(); 41 42 /* 43 * Avoid an explicit LOAD/STORE such that architectures with memops 44 * can use them. 45 */ 46 (*(volatile unsigned int *)&rb->nest)++; 47 handle->wakeup = local_read(&rb->wakeup); 48 } 49 50 static void perf_output_put_handle(struct perf_output_handle *handle) 51 { 52 struct perf_buffer *rb = handle->rb; 53 unsigned long head; 54 unsigned int nest; 55 56 /* 57 * If this isn't the outermost nesting, we don't have to update 58 * @rb->user_page->data_head. 59 */ 60 nest = READ_ONCE(rb->nest); 61 if (nest > 1) { 62 WRITE_ONCE(rb->nest, nest - 1); 63 goto out; 64 } 65 66 again: 67 /* 68 * In order to avoid publishing a head value that goes backwards, 69 * we must ensure the load of @rb->head happens after we've 70 * incremented @rb->nest. 71 * 72 * Otherwise we can observe a @rb->head value before one published 73 * by an IRQ/NMI happening between the load and the increment. 74 */ 75 barrier(); 76 head = local_read(&rb->head); 77 78 /* 79 * IRQ/NMI can happen here and advance @rb->head, causing our 80 * load above to be stale. 81 */ 82 83 /* 84 * Since the mmap() consumer (userspace) can run on a different CPU: 85 * 86 * kernel user 87 * 88 * if (LOAD ->data_tail) { LOAD ->data_head 89 * (A) smp_rmb() (C) 90 * STORE $data LOAD $data 91 * smp_wmb() (B) smp_mb() (D) 92 * STORE ->data_head STORE ->data_tail 93 * } 94 * 95 * Where A pairs with D, and B pairs with C. 96 * 97 * In our case (A) is a control dependency that separates the load of 98 * the ->data_tail and the stores of $data. In case ->data_tail 99 * indicates there is no room in the buffer to store $data we do not. 100 * 101 * D needs to be a full barrier since it separates the data READ 102 * from the tail WRITE. 103 * 104 * For B a WMB is sufficient since it separates two WRITEs, and for C 105 * an RMB is sufficient since it separates two READs. 106 * 107 * See perf_output_begin(). 108 */ 109 smp_wmb(); /* B, matches C */ 110 WRITE_ONCE(rb->user_page->data_head, head); 111 112 /* 113 * We must publish the head before decrementing the nest count, 114 * otherwise an IRQ/NMI can publish a more recent head value and our 115 * write will (temporarily) publish a stale value. 116 */ 117 barrier(); 118 WRITE_ONCE(rb->nest, 0); 119 120 /* 121 * Ensure we decrement @rb->nest before we validate the @rb->head. 122 * Otherwise we cannot be sure we caught the 'last' nested update. 123 */ 124 barrier(); 125 if (unlikely(head != local_read(&rb->head))) { 126 WRITE_ONCE(rb->nest, 1); 127 goto again; 128 } 129 130 if (handle->wakeup != local_read(&rb->wakeup)) 131 perf_output_wakeup(handle); 132 133 out: 134 preempt_enable(); 135 } 136 137 static __always_inline bool 138 ring_buffer_has_space(unsigned long head, unsigned long tail, 139 unsigned long data_size, unsigned int size, 140 bool backward) 141 { 142 if (!backward) 143 return CIRC_SPACE(head, tail, data_size) >= size; 144 else 145 return CIRC_SPACE(tail, head, data_size) >= size; 146 } 147 148 static __always_inline int 149 __perf_output_begin(struct perf_output_handle *handle, 150 struct perf_sample_data *data, 151 struct perf_event *event, unsigned int size, 152 bool backward) 153 { 154 struct perf_buffer *rb; 155 unsigned long tail, offset, head; 156 int have_lost, page_shift; 157 struct { 158 struct perf_event_header header; 159 u64 id; 160 u64 lost; 161 } lost_event; 162 163 rcu_read_lock(); 164 /* 165 * For inherited events we send all the output towards the parent. 166 */ 167 if (event->parent) 168 event = event->parent; 169 170 rb = rcu_dereference(event->rb); 171 if (unlikely(!rb)) 172 goto out; 173 174 if (unlikely(rb->paused)) { 175 if (rb->nr_pages) 176 local_inc(&rb->lost); 177 goto out; 178 } 179 180 handle->rb = rb; 181 handle->event = event; 182 183 have_lost = local_read(&rb->lost); 184 if (unlikely(have_lost)) { 185 size += sizeof(lost_event); 186 if (event->attr.sample_id_all) 187 size += event->id_header_size; 188 } 189 190 perf_output_get_handle(handle); 191 192 do { 193 tail = READ_ONCE(rb->user_page->data_tail); 194 offset = head = local_read(&rb->head); 195 if (!rb->overwrite) { 196 if (unlikely(!ring_buffer_has_space(head, tail, 197 perf_data_size(rb), 198 size, backward))) 199 goto fail; 200 } 201 202 /* 203 * The above forms a control dependency barrier separating the 204 * @tail load above from the data stores below. Since the @tail 205 * load is required to compute the branch to fail below. 206 * 207 * A, matches D; the full memory barrier userspace SHOULD issue 208 * after reading the data and before storing the new tail 209 * position. 210 * 211 * See perf_output_put_handle(). 212 */ 213 214 if (!backward) 215 head += size; 216 else 217 head -= size; 218 } while (local_cmpxchg(&rb->head, offset, head) != offset); 219 220 if (backward) { 221 offset = head; 222 head = (u64)(-head); 223 } 224 225 /* 226 * We rely on the implied barrier() by local_cmpxchg() to ensure 227 * none of the data stores below can be lifted up by the compiler. 228 */ 229 230 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) 231 local_add(rb->watermark, &rb->wakeup); 232 233 page_shift = PAGE_SHIFT + page_order(rb); 234 235 handle->page = (offset >> page_shift) & (rb->nr_pages - 1); 236 offset &= (1UL << page_shift) - 1; 237 handle->addr = rb->data_pages[handle->page] + offset; 238 handle->size = (1UL << page_shift) - offset; 239 240 if (unlikely(have_lost)) { 241 lost_event.header.size = sizeof(lost_event); 242 lost_event.header.type = PERF_RECORD_LOST; 243 lost_event.header.misc = 0; 244 lost_event.id = event->id; 245 lost_event.lost = local_xchg(&rb->lost, 0); 246 247 /* XXX mostly redundant; @data is already fully initializes */ 248 perf_event_header__init_id(&lost_event.header, data, event); 249 perf_output_put(handle, lost_event); 250 perf_event__output_id_sample(event, handle, data); 251 } 252 253 return 0; 254 255 fail: 256 local_inc(&rb->lost); 257 perf_output_put_handle(handle); 258 out: 259 rcu_read_unlock(); 260 261 return -ENOSPC; 262 } 263 264 int perf_output_begin_forward(struct perf_output_handle *handle, 265 struct perf_sample_data *data, 266 struct perf_event *event, unsigned int size) 267 { 268 return __perf_output_begin(handle, data, event, size, false); 269 } 270 271 int perf_output_begin_backward(struct perf_output_handle *handle, 272 struct perf_sample_data *data, 273 struct perf_event *event, unsigned int size) 274 { 275 return __perf_output_begin(handle, data, event, size, true); 276 } 277 278 int perf_output_begin(struct perf_output_handle *handle, 279 struct perf_sample_data *data, 280 struct perf_event *event, unsigned int size) 281 { 282 283 return __perf_output_begin(handle, data, event, size, 284 unlikely(is_write_backward(event))); 285 } 286 287 unsigned int perf_output_copy(struct perf_output_handle *handle, 288 const void *buf, unsigned int len) 289 { 290 return __output_copy(handle, buf, len); 291 } 292 293 unsigned int perf_output_skip(struct perf_output_handle *handle, 294 unsigned int len) 295 { 296 return __output_skip(handle, NULL, len); 297 } 298 299 void perf_output_end(struct perf_output_handle *handle) 300 { 301 perf_output_put_handle(handle); 302 rcu_read_unlock(); 303 } 304 305 static void 306 ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) 307 { 308 long max_size = perf_data_size(rb); 309 310 if (watermark) 311 rb->watermark = min(max_size, watermark); 312 313 if (!rb->watermark) 314 rb->watermark = max_size / 2; 315 316 if (flags & RING_BUFFER_WRITABLE) 317 rb->overwrite = 0; 318 else 319 rb->overwrite = 1; 320 321 refcount_set(&rb->refcount, 1); 322 323 INIT_LIST_HEAD(&rb->event_list); 324 spin_lock_init(&rb->event_lock); 325 326 /* 327 * perf_output_begin() only checks rb->paused, therefore 328 * rb->paused must be true if we have no pages for output. 329 */ 330 if (!rb->nr_pages) 331 rb->paused = 1; 332 } 333 334 void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) 335 { 336 /* 337 * OVERWRITE is determined by perf_aux_output_end() and can't 338 * be passed in directly. 339 */ 340 if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) 341 return; 342 343 handle->aux_flags |= flags; 344 } 345 EXPORT_SYMBOL_GPL(perf_aux_output_flag); 346 347 /* 348 * This is called before hardware starts writing to the AUX area to 349 * obtain an output handle and make sure there's room in the buffer. 350 * When the capture completes, call perf_aux_output_end() to commit 351 * the recorded data to the buffer. 352 * 353 * The ordering is similar to that of perf_output_{begin,end}, with 354 * the exception of (B), which should be taken care of by the pmu 355 * driver, since ordering rules will differ depending on hardware. 356 * 357 * Call this from pmu::start(); see the comment in perf_aux_output_end() 358 * about its use in pmu callbacks. Both can also be called from the PMI 359 * handler if needed. 360 */ 361 void *perf_aux_output_begin(struct perf_output_handle *handle, 362 struct perf_event *event) 363 { 364 struct perf_event *output_event = event; 365 unsigned long aux_head, aux_tail; 366 struct perf_buffer *rb; 367 unsigned int nest; 368 369 if (output_event->parent) 370 output_event = output_event->parent; 371 372 /* 373 * Since this will typically be open across pmu::add/pmu::del, we 374 * grab ring_buffer's refcount instead of holding rcu read lock 375 * to make sure it doesn't disappear under us. 376 */ 377 rb = ring_buffer_get(output_event); 378 if (!rb) 379 return NULL; 380 381 if (!rb_has_aux(rb)) 382 goto err; 383 384 /* 385 * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), 386 * about to get freed, so we leave immediately. 387 * 388 * Checking rb::aux_mmap_count and rb::refcount has to be done in 389 * the same order, see perf_mmap_close. Otherwise we end up freeing 390 * aux pages in this path, which is a bug, because in_atomic(). 391 */ 392 if (!atomic_read(&rb->aux_mmap_count)) 393 goto err; 394 395 if (!refcount_inc_not_zero(&rb->aux_refcount)) 396 goto err; 397 398 nest = READ_ONCE(rb->aux_nest); 399 /* 400 * Nesting is not supported for AUX area, make sure nested 401 * writers are caught early 402 */ 403 if (WARN_ON_ONCE(nest)) 404 goto err_put; 405 406 WRITE_ONCE(rb->aux_nest, nest + 1); 407 408 aux_head = rb->aux_head; 409 410 handle->rb = rb; 411 handle->event = event; 412 handle->head = aux_head; 413 handle->size = 0; 414 handle->aux_flags = 0; 415 416 /* 417 * In overwrite mode, AUX data stores do not depend on aux_tail, 418 * therefore (A) control dependency barrier does not exist. The 419 * (B) <-> (C) ordering is still observed by the pmu driver. 420 */ 421 if (!rb->aux_overwrite) { 422 aux_tail = READ_ONCE(rb->user_page->aux_tail); 423 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 424 if (aux_head - aux_tail < perf_aux_size(rb)) 425 handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); 426 427 /* 428 * handle->size computation depends on aux_tail load; this forms a 429 * control dependency barrier separating aux_tail load from aux data 430 * store that will be enabled on successful return 431 */ 432 if (!handle->size) { /* A, matches D */ 433 event->pending_disable = smp_processor_id(); 434 perf_output_wakeup(handle); 435 WRITE_ONCE(rb->aux_nest, 0); 436 goto err_put; 437 } 438 } 439 440 return handle->rb->aux_priv; 441 442 err_put: 443 /* can't be last */ 444 rb_free_aux(rb); 445 446 err: 447 ring_buffer_put(rb); 448 handle->event = NULL; 449 450 return NULL; 451 } 452 EXPORT_SYMBOL_GPL(perf_aux_output_begin); 453 454 static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb) 455 { 456 if (rb->aux_overwrite) 457 return false; 458 459 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { 460 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); 461 return true; 462 } 463 464 return false; 465 } 466 467 /* 468 * Commit the data written by hardware into the ring buffer by adjusting 469 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the 470 * pmu driver's responsibility to observe ordering rules of the hardware, 471 * so that all the data is externally visible before this is called. 472 * 473 * Note: this has to be called from pmu::stop() callback, as the assumption 474 * of the AUX buffer management code is that after pmu::stop(), the AUX 475 * transaction must be stopped and therefore drop the AUX reference count. 476 */ 477 void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) 478 { 479 bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); 480 struct perf_buffer *rb = handle->rb; 481 unsigned long aux_head; 482 483 /* in overwrite mode, driver provides aux_head via handle */ 484 if (rb->aux_overwrite) { 485 handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; 486 487 aux_head = handle->head; 488 rb->aux_head = aux_head; 489 } else { 490 handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; 491 492 aux_head = rb->aux_head; 493 rb->aux_head += size; 494 } 495 496 /* 497 * Only send RECORD_AUX if we have something useful to communicate 498 * 499 * Note: the OVERWRITE records by themselves are not considered 500 * useful, as they don't communicate any *new* information, 501 * aside from the short-lived offset, that becomes history at 502 * the next event sched-in and therefore isn't useful. 503 * The userspace that needs to copy out AUX data in overwrite 504 * mode should know to use user_page::aux_head for the actual 505 * offset. So, from now on we don't output AUX records that 506 * have *only* OVERWRITE flag set. 507 */ 508 if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) 509 perf_event_aux_event(handle->event, aux_head, size, 510 handle->aux_flags); 511 512 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); 513 if (rb_need_aux_wakeup(rb)) 514 wakeup = true; 515 516 if (wakeup) { 517 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) 518 handle->event->pending_disable = smp_processor_id(); 519 perf_output_wakeup(handle); 520 } 521 522 handle->event = NULL; 523 524 WRITE_ONCE(rb->aux_nest, 0); 525 /* can't be last */ 526 rb_free_aux(rb); 527 ring_buffer_put(rb); 528 } 529 EXPORT_SYMBOL_GPL(perf_aux_output_end); 530 531 /* 532 * Skip over a given number of bytes in the AUX buffer, due to, for example, 533 * hardware's alignment constraints. 534 */ 535 int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) 536 { 537 struct perf_buffer *rb = handle->rb; 538 539 if (size > handle->size) 540 return -ENOSPC; 541 542 rb->aux_head += size; 543 544 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); 545 if (rb_need_aux_wakeup(rb)) { 546 perf_output_wakeup(handle); 547 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 548 } 549 550 handle->head = rb->aux_head; 551 handle->size -= size; 552 553 return 0; 554 } 555 EXPORT_SYMBOL_GPL(perf_aux_output_skip); 556 557 void *perf_get_aux(struct perf_output_handle *handle) 558 { 559 /* this is only valid between perf_aux_output_begin and *_end */ 560 if (!handle->event) 561 return NULL; 562 563 return handle->rb->aux_priv; 564 } 565 EXPORT_SYMBOL_GPL(perf_get_aux); 566 567 /* 568 * Copy out AUX data from an AUX handle. 569 */ 570 long perf_output_copy_aux(struct perf_output_handle *aux_handle, 571 struct perf_output_handle *handle, 572 unsigned long from, unsigned long to) 573 { 574 struct perf_buffer *rb = aux_handle->rb; 575 unsigned long tocopy, remainder, len = 0; 576 void *addr; 577 578 from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; 579 to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; 580 581 do { 582 tocopy = PAGE_SIZE - offset_in_page(from); 583 if (to > from) 584 tocopy = min(tocopy, to - from); 585 if (!tocopy) 586 break; 587 588 addr = rb->aux_pages[from >> PAGE_SHIFT]; 589 addr += offset_in_page(from); 590 591 remainder = perf_output_copy(handle, addr, tocopy); 592 if (remainder) 593 return -EFAULT; 594 595 len += tocopy; 596 from += tocopy; 597 from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; 598 } while (to != from); 599 600 return len; 601 } 602 603 #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) 604 605 static struct page *rb_alloc_aux_page(int node, int order) 606 { 607 struct page *page; 608 609 if (order > MAX_ORDER) 610 order = MAX_ORDER; 611 612 do { 613 page = alloc_pages_node(node, PERF_AUX_GFP, order); 614 } while (!page && order--); 615 616 if (page && order) { 617 /* 618 * Communicate the allocation size to the driver: 619 * if we managed to secure a high-order allocation, 620 * set its first page's private to this order; 621 * !PagePrivate(page) means it's just a normal page. 622 */ 623 split_page(page, order); 624 SetPagePrivate(page); 625 set_page_private(page, order); 626 } 627 628 return page; 629 } 630 631 static void rb_free_aux_page(struct perf_buffer *rb, int idx) 632 { 633 struct page *page = virt_to_page(rb->aux_pages[idx]); 634 635 ClearPagePrivate(page); 636 page->mapping = NULL; 637 __free_page(page); 638 } 639 640 static void __rb_free_aux(struct perf_buffer *rb) 641 { 642 int pg; 643 644 /* 645 * Should never happen, the last reference should be dropped from 646 * perf_mmap_close() path, which first stops aux transactions (which 647 * in turn are the atomic holders of aux_refcount) and then does the 648 * last rb_free_aux(). 649 */ 650 WARN_ON_ONCE(in_atomic()); 651 652 if (rb->aux_priv) { 653 rb->free_aux(rb->aux_priv); 654 rb->free_aux = NULL; 655 rb->aux_priv = NULL; 656 } 657 658 if (rb->aux_nr_pages) { 659 for (pg = 0; pg < rb->aux_nr_pages; pg++) 660 rb_free_aux_page(rb, pg); 661 662 kfree(rb->aux_pages); 663 rb->aux_nr_pages = 0; 664 } 665 } 666 667 int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, 668 pgoff_t pgoff, int nr_pages, long watermark, int flags) 669 { 670 bool overwrite = !(flags & RING_BUFFER_WRITABLE); 671 int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); 672 int ret = -ENOMEM, max_order; 673 674 if (!has_aux(event)) 675 return -EOPNOTSUPP; 676 677 /* 678 * We need to start with the max_order that fits in nr_pages, 679 * not the other way around, hence ilog2() and not get_order. 680 */ 681 max_order = ilog2(nr_pages); 682 683 /* 684 * PMU requests more than one contiguous chunks of memory 685 * for SW double buffering 686 */ 687 if (!overwrite) { 688 if (!max_order) 689 return -EINVAL; 690 691 max_order--; 692 } 693 694 rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, 695 node); 696 if (!rb->aux_pages) 697 return -ENOMEM; 698 699 rb->free_aux = event->pmu->free_aux; 700 for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { 701 struct page *page; 702 int last, order; 703 704 order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); 705 page = rb_alloc_aux_page(node, order); 706 if (!page) 707 goto out; 708 709 for (last = rb->aux_nr_pages + (1 << page_private(page)); 710 last > rb->aux_nr_pages; rb->aux_nr_pages++) 711 rb->aux_pages[rb->aux_nr_pages] = page_address(page++); 712 } 713 714 /* 715 * In overwrite mode, PMUs that don't support SG may not handle more 716 * than one contiguous allocation, since they rely on PMI to do double 717 * buffering. In this case, the entire buffer has to be one contiguous 718 * chunk. 719 */ 720 if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && 721 overwrite) { 722 struct page *page = virt_to_page(rb->aux_pages[0]); 723 724 if (page_private(page) != max_order) 725 goto out; 726 } 727 728 rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, 729 overwrite); 730 if (!rb->aux_priv) 731 goto out; 732 733 ret = 0; 734 735 /* 736 * aux_pages (and pmu driver's private data, aux_priv) will be 737 * referenced in both producer's and consumer's contexts, thus 738 * we keep a refcount here to make sure either of the two can 739 * reference them safely. 740 */ 741 refcount_set(&rb->aux_refcount, 1); 742 743 rb->aux_overwrite = overwrite; 744 rb->aux_watermark = watermark; 745 746 if (!rb->aux_watermark && !rb->aux_overwrite) 747 rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); 748 749 out: 750 if (!ret) 751 rb->aux_pgoff = pgoff; 752 else 753 __rb_free_aux(rb); 754 755 return ret; 756 } 757 758 void rb_free_aux(struct perf_buffer *rb) 759 { 760 if (refcount_dec_and_test(&rb->aux_refcount)) 761 __rb_free_aux(rb); 762 } 763 764 #ifndef CONFIG_PERF_USE_VMALLOC 765 766 /* 767 * Back perf_mmap() with regular GFP_KERNEL-0 pages. 768 */ 769 770 static struct page * 771 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) 772 { 773 if (pgoff > rb->nr_pages) 774 return NULL; 775 776 if (pgoff == 0) 777 return virt_to_page(rb->user_page); 778 779 return virt_to_page(rb->data_pages[pgoff - 1]); 780 } 781 782 static void *perf_mmap_alloc_page(int cpu) 783 { 784 struct page *page; 785 int node; 786 787 node = (cpu == -1) ? cpu : cpu_to_node(cpu); 788 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 789 if (!page) 790 return NULL; 791 792 return page_address(page); 793 } 794 795 static void perf_mmap_free_page(void *addr) 796 { 797 struct page *page = virt_to_page(addr); 798 799 page->mapping = NULL; 800 __free_page(page); 801 } 802 803 struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) 804 { 805 struct perf_buffer *rb; 806 unsigned long size; 807 int i; 808 809 size = sizeof(struct perf_buffer); 810 size += nr_pages * sizeof(void *); 811 812 if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) 813 goto fail; 814 815 rb = kzalloc(size, GFP_KERNEL); 816 if (!rb) 817 goto fail; 818 819 rb->user_page = perf_mmap_alloc_page(cpu); 820 if (!rb->user_page) 821 goto fail_user_page; 822 823 for (i = 0; i < nr_pages; i++) { 824 rb->data_pages[i] = perf_mmap_alloc_page(cpu); 825 if (!rb->data_pages[i]) 826 goto fail_data_pages; 827 } 828 829 rb->nr_pages = nr_pages; 830 831 ring_buffer_init(rb, watermark, flags); 832 833 return rb; 834 835 fail_data_pages: 836 for (i--; i >= 0; i--) 837 perf_mmap_free_page(rb->data_pages[i]); 838 839 perf_mmap_free_page(rb->user_page); 840 841 fail_user_page: 842 kfree(rb); 843 844 fail: 845 return NULL; 846 } 847 848 void rb_free(struct perf_buffer *rb) 849 { 850 int i; 851 852 perf_mmap_free_page(rb->user_page); 853 for (i = 0; i < rb->nr_pages; i++) 854 perf_mmap_free_page(rb->data_pages[i]); 855 kfree(rb); 856 } 857 858 #else 859 static int data_page_nr(struct perf_buffer *rb) 860 { 861 return rb->nr_pages << page_order(rb); 862 } 863 864 static struct page * 865 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) 866 { 867 /* The '>' counts in the user page. */ 868 if (pgoff > data_page_nr(rb)) 869 return NULL; 870 871 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); 872 } 873 874 static void perf_mmap_unmark_page(void *addr) 875 { 876 struct page *page = vmalloc_to_page(addr); 877 878 page->mapping = NULL; 879 } 880 881 static void rb_free_work(struct work_struct *work) 882 { 883 struct perf_buffer *rb; 884 void *base; 885 int i, nr; 886 887 rb = container_of(work, struct perf_buffer, work); 888 nr = data_page_nr(rb); 889 890 base = rb->user_page; 891 /* The '<=' counts in the user page. */ 892 for (i = 0; i <= nr; i++) 893 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 894 895 vfree(base); 896 kfree(rb); 897 } 898 899 void rb_free(struct perf_buffer *rb) 900 { 901 schedule_work(&rb->work); 902 } 903 904 struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) 905 { 906 struct perf_buffer *rb; 907 unsigned long size; 908 void *all_buf; 909 910 size = sizeof(struct perf_buffer); 911 size += sizeof(void *); 912 913 rb = kzalloc(size, GFP_KERNEL); 914 if (!rb) 915 goto fail; 916 917 INIT_WORK(&rb->work, rb_free_work); 918 919 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 920 if (!all_buf) 921 goto fail_all_buf; 922 923 rb->user_page = all_buf; 924 rb->data_pages[0] = all_buf + PAGE_SIZE; 925 if (nr_pages) { 926 rb->nr_pages = 1; 927 rb->page_order = ilog2(nr_pages); 928 } 929 930 ring_buffer_init(rb, watermark, flags); 931 932 return rb; 933 934 fail_all_buf: 935 kfree(rb); 936 937 fail: 938 return NULL; 939 } 940 941 #endif 942 943 struct page * 944 perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) 945 { 946 if (rb->aux_nr_pages) { 947 /* above AUX space */ 948 if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) 949 return NULL; 950 951 /* AUX space */ 952 if (pgoff >= rb->aux_pgoff) { 953 int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); 954 return virt_to_page(rb->aux_pages[aux_pgoff]); 955 } 956 } 957 958 return __perf_mmap_to_page(rb, pgoff); 959 } 960