// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011-2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Parts came from evlist.c builtin-{top,stat,record}.c, see those files for further
 * copyright notes.
 */

#include <sys/mman.h>
#include <inttypes.h>
#include <asm/bug.h>
#ifdef HAVE_LIBNUMA_SUPPORT
#include <numaif.h>
#endif
#include "debug.h"
#include "event.h"
#include "mmap.h"
#include "util.h" /* page_size */

size_t perf_mmap__mmap_len(struct perf_mmap *map)
{
	return map->mask + 1 + page_size;
}

/* When check_messup is true, 'end' must point to a good entry */
static union perf_event *perf_mmap__read(struct perf_mmap *map,
					 u64 *startp, u64 end)
{
	unsigned char *data = map->base + page_size;
	union perf_event *event = NULL;
	int diff = end - *startp;

	if (diff >= (int)sizeof(event->header)) {
		size_t size;

		event = (union perf_event *)&data[*startp & map->mask];
		size = event->header.size;

		if (size < sizeof(event->header) || diff < (int)size)
			return NULL;

		/*
		 * Event straddles the mmap boundary -- header should always
		 * be inside due to u64 alignment of output.
		 */
		if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) {
			unsigned int offset = *startp;
			unsigned int len = min(sizeof(*event), size), cpy;
			void *dst = map->event_copy;

			do {
				cpy = min(map->mask + 1 - (offset & map->mask), len);
				memcpy(dst, &data[offset & map->mask], cpy);
				offset += cpy;
				dst += cpy;
				len -= cpy;
			} while (len);

			event = (union perf_event *)map->event_copy;
		}

		*startp += size;
	}

	return event;
}

/*
 * Read events from the ring buffer one by one.
 * Return one event for each call.
 *
 * Usage:
 * perf_mmap__read_init()
 * while(event = perf_mmap__read_event()) {
 *	//process the event
 *	perf_mmap__consume()
 * }
 * perf_mmap__read_done()
 */
union perf_event *perf_mmap__read_event(struct perf_mmap *map)
{
	union perf_event *event;

	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return NULL;

	/* non-overwrite doesn't pause the ringbuffer */
	if (!map->overwrite)
		map->end = perf_mmap__read_head(map);

	event = perf_mmap__read(map, &map->start, map->end);

	if (!map->overwrite)
		map->prev = map->start;

	return event;
}
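
/*
 * A slightly fuller version of the usage pattern above (a sketch only;
 * process_event() is a hypothetical per-event handler, not a perf API):
 *
 *	ret = perf_mmap__read_init(map);
 *	if (ret < 0)
 *		return;	// -EAGAIN: not enough data yet, -ENOENT: map gone
 *
 *	while ((event = perf_mmap__read_event(map)) != NULL) {
 *		process_event(event);
 *		perf_mmap__consume(map);
 *	}
 *
 *	perf_mmap__read_done(map);
 */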

static bool perf_mmap__empty(struct perf_mmap *map)
{
	return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
}

void perf_mmap__get(struct perf_mmap *map)
{
	refcount_inc(&map->refcnt);
}

void perf_mmap__put(struct perf_mmap *map)
{
	BUG_ON(map->base && refcount_read(&map->refcnt) == 0);

	if (refcount_dec_and_test(&map->refcnt))
		perf_mmap__munmap(map);
}

void perf_mmap__consume(struct perf_mmap *map)
{
	if (!map->overwrite) {
		u64 old = map->prev;

		perf_mmap__write_tail(map, old);
	}

	if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map))
		perf_mmap__put(map);
}

int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
			       struct auxtrace_mmap_params *mp __maybe_unused,
			       void *userpg __maybe_unused,
			       int fd __maybe_unused)
{
	return 0;
}

void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
{
}

void __weak auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp __maybe_unused,
				       off_t auxtrace_offset __maybe_unused,
				       unsigned int auxtrace_pages __maybe_unused,
				       bool auxtrace_overwrite __maybe_unused)
{
}

void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __maybe_unused,
					  struct perf_evlist *evlist __maybe_unused,
					  int idx __maybe_unused,
					  bool per_cpu __maybe_unused)
{
}

#ifdef HAVE_AIO_SUPPORT
static int perf_mmap__aio_enabled(struct perf_mmap *map)
{
	return map->aio.nr_cblocks > 0;
}

#ifdef HAVE_LIBNUMA_SUPPORT
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
{
	map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
				  MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (map->aio.data[idx] == MAP_FAILED) {
		map->aio.data[idx] = NULL;
		return -1;
	}

	return 0;
}

static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
{
	if (map->aio.data[idx]) {
		munmap(map->aio.data[idx], perf_mmap__mmap_len(map));
		map->aio.data[idx] = NULL;
	}
}

static int perf_mmap__aio_bind(struct perf_mmap *map, int idx, int cpu, int affinity)
{
	void *data;
	size_t mmap_len;
	unsigned long node_mask;

	if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
		data = map->aio.data[idx];
		mmap_len = perf_mmap__mmap_len(map);
		node_mask = 1UL << cpu__get_node(cpu);
		if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
			pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n",
			       data, data + mmap_len, cpu__get_node(cpu));
			return -1;
		}
	}

	return 0;
}
#else /* !HAVE_LIBNUMA_SUPPORT */
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
{
	map->aio.data[idx] = malloc(perf_mmap__mmap_len(map));
	if (map->aio.data[idx] == NULL)
		return -1;

	return 0;
}

static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
{
	zfree(&(map->aio.data[idx]));
}

static int perf_mmap__aio_bind(struct perf_mmap *map __maybe_unused, int idx __maybe_unused,
			       int cpu __maybe_unused, int affinity __maybe_unused)
{
	return 0;
}
#endif
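
/*
 * Note on the two implementations above: with libnuma the AIO data buffers
 * are allocated with mmap() so that perf_mmap__aio_bind() can mbind() their
 * pages to the NUMA node of the CPU owning the ring buffer; without libnuma
 * they are plain malloc()ed and binding is a no-op.
 */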

static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
{
	int delta_max, i, prio, ret;

	map->aio.nr_cblocks = mp->nr_cblocks;
	if (map->aio.nr_cblocks) {
		map->aio.aiocb = calloc(map->aio.nr_cblocks, sizeof(struct aiocb *));
		if (!map->aio.aiocb) {
			pr_debug2("failed to allocate aiocb for data buffer, error %m\n");
			return -1;
		}
		map->aio.cblocks = calloc(map->aio.nr_cblocks, sizeof(struct aiocb));
		if (!map->aio.cblocks) {
			pr_debug2("failed to allocate cblocks for data buffer, error %m\n");
			return -1;
		}
		map->aio.data = calloc(map->aio.nr_cblocks, sizeof(void *));
		if (!map->aio.data) {
			pr_debug2("failed to allocate data buffer, error %m\n");
			return -1;
		}
		delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
		for (i = 0; i < map->aio.nr_cblocks; ++i) {
			ret = perf_mmap__aio_alloc(map, i);
			if (ret == -1) {
				pr_debug2("failed to allocate data buffer area, error %m");
				return -1;
			}
			ret = perf_mmap__aio_bind(map, i, map->cpu, mp->affinity);
			if (ret == -1)
				return -1;
			/*
			 * Use a cblock.aio_fildes value different from -1
			 * to denote a started aio write operation on the
			 * cblock, so an explicit record__aio_sync() call is
			 * required before the cblock may be reused again.
			 */
			map->aio.cblocks[i].aio_fildes = -1;
			/*
			 * Allocate cblocks with decreasing priority delta to get
			 * faster aio write system calls, because queued requests
			 * are kept in separate per-prio queues and adding
			 * a new request will iterate through a shorter per-prio
			 * list. Blocks with numbers higher than
			 * _SC_AIO_PRIO_DELTA_MAX go with priority 0.
			 */
			prio = delta_max - i;
			map->aio.cblocks[i].aio_reqprio = prio >= 0 ? prio : 0;
		}
	}

	return 0;
}
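
/*
 * Sketch of how a caller is expected to drive the cblocks set up above
 * (loosely modelled on the aio code in builtin-record.c; output_fd, idx,
 * size and file_offset are placeholders):
 *
 *	struct aiocb *cblock = &map->aio.cblocks[idx];
 *
 *	cblock->aio_fildes = output_fd;		// != -1 marks it in flight
 *	cblock->aio_buf    = map->aio.data[idx];
 *	cblock->aio_nbytes = size;
 *	cblock->aio_offset = file_offset;
 *	if (aio_write(cblock))			// POSIX AIO submission
 *		cblock->aio_fildes = -1;	// failed, cblock stays free
 *
 * Completion is later polled with aio_error()/aio_return(), after which
 * aio_fildes is reset to -1 so the cblock can be reused.
 */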

static void perf_mmap__aio_munmap(struct perf_mmap *map)
{
	int i;

	for (i = 0; i < map->aio.nr_cblocks; ++i)
		perf_mmap__aio_free(map, i);
	if (map->aio.data)
		zfree(&map->aio.data);
	zfree(&map->aio.cblocks);
	zfree(&map->aio.aiocb);
}
#else /* !HAVE_AIO_SUPPORT */
static int perf_mmap__aio_enabled(struct perf_mmap *map __maybe_unused)
{
	return 0;
}

static int perf_mmap__aio_mmap(struct perf_mmap *map __maybe_unused,
			       struct mmap_params *mp __maybe_unused)
{
	return 0;
}

static void perf_mmap__aio_munmap(struct perf_mmap *map __maybe_unused)
{
}
#endif

void perf_mmap__munmap(struct perf_mmap *map)
{
	perf_mmap__aio_munmap(map);
	if (map->data != NULL) {
		munmap(map->data, perf_mmap__mmap_len(map));
		map->data = NULL;
	}
	if (map->base != NULL) {
		munmap(map->base, perf_mmap__mmap_len(map));
		map->base = NULL;
		map->fd = -1;
		refcount_set(&map->refcnt, 0);
	}
	auxtrace_mmap__munmap(&map->auxtrace_mmap);
}

static void build_node_mask(int node, cpu_set_t *mask)
{
	int c, cpu, nr_cpus;
	const struct cpu_map *cpu_map = NULL;

	cpu_map = cpu_map__online();
	if (!cpu_map)
		return;

	nr_cpus = cpu_map__nr(cpu_map);
	for (c = 0; c < nr_cpus; c++) {
		cpu = cpu_map->map[c]; /* map c index to online cpu index */
		if (cpu__get_node(cpu) == node)
			CPU_SET(cpu, mask);
	}
}

static void perf_mmap__setup_affinity_mask(struct perf_mmap *map, struct mmap_params *mp)
{
	CPU_ZERO(&map->affinity_mask);
	if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
		build_node_mask(cpu__get_node(map->cpu), &map->affinity_mask);
	else if (mp->affinity == PERF_AFFINITY_CPU)
		CPU_SET(map->cpu, &map->affinity_mask);
}
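
/*
 * The affinity mask built above is not applied here: callers that request a
 * non-default --affinity mode (e.g. perf record) are expected to move the
 * reading thread onto it before draining the map, so the buffer is touched
 * from the right CPU/NUMA node. A minimal sketch, assuming the caller runs
 * in the thread that reads this map:
 *
 *	sched_setaffinity(0, sizeof(map->affinity_mask), &map->affinity_mask);
 */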

int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
{
	/*
	 * The last one will be done at perf_mmap__consume(), so that we
	 * make sure we don't prevent tools from consuming every last event in
	 * the ring buffer.
	 *
	 * I.e. we can get the POLLHUP meaning that the fd doesn't exist
	 * anymore, but the last events for it are still in the ring buffer,
	 * waiting to be consumed.
	 *
	 * Tools can choose to ignore this at their own discretion, but the
	 * evlist layer can't just drop it when filtering events in
	 * perf_evlist__filter_pollfd().
	 */
	refcount_set(&map->refcnt, 2);
	map->prev = 0;
	map->mask = mp->mask;
	map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
			 MAP_SHARED, fd, 0);
	if (map->base == MAP_FAILED) {
		pr_debug2("failed to mmap perf event ring buffer, error %d\n",
			  errno);
		map->base = NULL;
		return -1;
	}
	map->fd = fd;
	map->cpu = cpu;

	perf_mmap__setup_affinity_mask(map, mp);

	map->flush = mp->flush;

	map->comp_level = mp->comp_level;

	if (map->comp_level && !perf_mmap__aio_enabled(map)) {
		map->data = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
				 MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
		if (map->data == MAP_FAILED) {
			pr_debug2("failed to mmap data buffer, error %d\n",
				  errno);
			map->data = NULL;
			return -1;
		}
	}

	if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
				&mp->auxtrace_mp, map->base, fd))
		return -1;

	return perf_mmap__aio_mmap(map, mp);
}

static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end)
{
	struct perf_event_header *pheader;
	u64 evt_head = *start;
	int size = mask + 1;

	pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start);
	pheader = (struct perf_event_header *)(buf + (*start & mask));
	while (true) {
		if (evt_head - *start >= (unsigned int)size) {
			pr_debug("Finished reading overwrite ring buffer: rewind\n");
			if (evt_head - *start > (unsigned int)size)
				evt_head -= pheader->size;
			*end = evt_head;
			return 0;
		}

		pheader = (struct perf_event_header *)(buf + (evt_head & mask));

		if (pheader->size == 0) {
			pr_debug("Finished reading overwrite ring buffer: get start\n");
			*end = evt_head;
			return 0;
		}

		evt_head += pheader->size;
		pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
	}
	WARN_ONCE(1, "Shouldn't get here\n");
	return -1;
}

/*
 * Report the start and end of the available data in the ring buffer.
 */
static int __perf_mmap__read_init(struct perf_mmap *md)
{
	u64 head = perf_mmap__read_head(md);
	u64 old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;

	md->start = md->overwrite ? head : old;
	md->end = md->overwrite ? old : head;

	if ((md->end - md->start) < md->flush)
		return -EAGAIN;

	size = md->end - md->start;
	if (size > (unsigned long)(md->mask) + 1) {
		if (!md->overwrite) {
			WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");

			md->prev = head;
			perf_mmap__consume(md);
			return -EAGAIN;
		}

		/*
		 * Backward ring buffer is full. We still have a chance to read
		 * most of the data from it.
		 */
		if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end))
			return -EINVAL;
	}

	return 0;
}

int perf_mmap__read_init(struct perf_mmap *map)
{
	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return -ENOENT;

	return __perf_mmap__read_init(map);
}
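
/*
 * perf_mmap__push() drains everything between md->start and md->end in at
 * most two chunks (two when the data wraps around the end of the buffer) and
 * hands each chunk to the caller's push() callback. It returns 0 on success,
 * 1 when there is not enough data to flush yet (-EAGAIN internally) and -1
 * on error. A sketch of a callback that appends to a file descriptor,
 * assuming 'to' carries the fd and using perf's writen() helper:
 *
 *	static int pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
 *	{
 *		int fd = *(int *)to;
 *
 *		return writen(fd, buf, size) == (ssize_t)size ? 0 : -1;
 *	}
 */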

int perf_mmap__push(struct perf_mmap *md, void *to,
		    int push(struct perf_mmap *map, void *to, void *buf, size_t size))
{
	u64 head = perf_mmap__read_head(md);
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	rc = perf_mmap__read_init(md);
	if (rc < 0)
		return (rc == -EAGAIN) ? 1 : -1;

	size = md->end - md->start;

	if ((md->start & md->mask) + size != (md->end & md->mask)) {
		buf = &data[md->start & md->mask];
		size = md->mask + 1 - (md->start & md->mask);
		md->start += size;

		if (push(md, to, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[md->start & md->mask];
	size = md->end - md->start;
	md->start += size;

	if (push(md, to, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = head;
	perf_mmap__consume(md);
out:
	return rc;
}

/*
 * Mandatory for overwrite mode.
 * The direction of overwrite mode is backward.
 * The last perf_mmap__read() will set tail to map->prev.
 * Need to correct map->prev to head, which is the end of the next read.
 */
void perf_mmap__read_done(struct perf_mmap *map)
{
	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return;

	map->prev = perf_mmap__read_head(map);
}
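
/*
 * Overwrite-mode reading, for reference (a sketch of the pairing described
 * above; process() is a placeholder). The kernel keeps overwriting the
 * buffer, so callers such as perf top pause the output around the read
 * (e.g. via the PERF_EVENT_IOC_PAUSE_OUTPUT ioctl at the evlist level):
 *
 *	// pause overwrite ring buffers
 *	perf_mmap__read_init(map);		// start = head, end = prev
 *	while ((event = perf_mmap__read_event(map)) != NULL)
 *		process(event);
 *	perf_mmap__read_done(map);		// prev = head, ready for next round
 *	// resume overwrite ring buffers
 */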