1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright IBM Corp. 2018
4  * Auxtrace support for s390 CPU-Measurement Sampling Facility
5  *
6  * Author(s):  Thomas Richter <tmricht@linux.ibm.com>
7  *
8  * Auxiliary traces are collected during 'perf record' using the rbd000 event
9  * (example below). Several PERF_RECORD_XXX records are generated during recording:
10  *
11  * PERF_RECORD_AUX:
12  *	Records that new data landed in the AUX buffer part.
13  * PERF_RECORD_AUXTRACE:
14  *	Defines auxtrace data. Followed by the actual data. The contents of
15  *	the auxtrace data is dependent on the event and the CPU.
16  *	This record is generated by the perf record command. For details
17  *	see Documentation/perf.data-file-format.txt.
18  * PERF_RECORD_AUXTRACE_INFO:
19  *	Defines a table of contents for PERF_RECORD_AUXTRACE records. This
20  *	record is generated during the 'perf record' command. Each record contains up
21  *	to 256 entries describing offset and size of the AUXTRACE data in the
22  *	perf.data file.
23  * PERF_RECORD_AUXTRACE_ERROR:
24  *	Indicates an error during AUXTRACE collection such as buffer overflow.
25  * PERF_RECORD_FINISHED_ROUND:
26  *	Perf events are not necessarily in time stamp order, as they can be
27  *	collected in parallel on different CPUs. If the events are to be
28  *	processed in time order, they need to be sorted first.
29  *	Perf report guarantees that there is no reordering over a
30  *	PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a
31  *	time stamp lower than this record are processed (and displayed) before
32  *	the succeeding perf records are processed.
33  *
34  * These records are evaluated during the perf report command.
35  *
36  * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for
37  * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info()
38  * below.
39  * Auxiliary trace data is collected per CPU. To merge the data into the report
40  * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace
41  * data is in ascending time stamp order.
42  *
43  * Each queue has a doubly linked list of auxtrace_buffers. This list contains
44  * the offset and size of a CPU's auxtrace data. During auxtrace processing
45  * the data portion is mmap()'ed.
46  *
47  * To sort the queues in chronological order, all queue access is controlled
48  * by the auxtrace_heap. This is basically a stack; each stack element has
49  * two entries, the queue number and a time stamp. The stack is sorted by
50  * the time stamps: the highest time stamp is at the bottom, the lowest
51  * (nearest) time stamp is at the top. That sort order is maintained at all
52  * times!
53  *
54  * After the auxtrace infrastructure has been set up, the auxtrace queues are
55  * filled with data (offset/size pairs) and the auxtrace_heap is populated.
56  *
57  * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues.
58  * Each record is handled by s390_cpumsf_process_event(). The time stamp of
59  * the perf record is compared with the time stamp located on the auxtrace_heap
60  * top element. If that time stamp is lower than the time stamp from the
61  * record sample, the auxtrace queues will be processed. As auxtrace queues
62  * control many auxtrace_buffers and each buffer can be quite large, the
63  * auxtrace buffer might be processed only partially. In this case the
64  * position in the auxtrace_buffer of that queue is remembered and the time
65  * stamp of the last processed entry of the auxtrace_buffer replaces the
66  * current auxtrace_heap top.
67  *
68  * 3. Auxtrace_queues might run out of data and are fed by the
69  * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
70  *
71  * Event Generation
72  * Each sampling-data entry in the auxiliary trace data generates a perf
73  * sample. This sample is filled with data from the auxtrace such as PID/TID,
74  * instruction address, CPU state, etc. This sample is processed with
75  * perf_session__deliver_synth_event() to be included in the GUI.
77  *
78  * 4. A PERF_RECORD_FINISHED_ROUND event is used to process all remaining
79  * auxiliary trace entries until the time stamp of this record is reached by
80  * the auxtrace_heap top. This is triggered by ordered_events->deliver().
81  *
82  *
83  * Perf event processing.
84  * Event processing of PERF_RECORD_XXX entries relies on their time stamps.
85  * This is the function call sequence:
86  *
87  * __cmd_report()
88  * |
89  * perf_session__process_events()
90  * |
91  * __perf_session__process_events()
92  * |
93  * perf_session__process_event()
94  * |  This function splits the PERF_RECORD_XXX records.
95  * |  - Those generated by the perf record command (type number equal to or
96  * |    higher than PERF_RECORD_USER_TYPE_START) are handled by
97  * |    perf_session__process_user_event() (see below)
98  * |  - Those generated by the kernel are handled by
99  * |    perf_evlist__parse_sample_timestamp()
100  * |
101  * perf_evlist__parse_sample_timestamp()
102  * |  Extract time stamp from sample data.
103  * |
104  * perf_session__queue_event()
105  * |  If the timestamp is positive, the sample is entered into an ordered_events
106  * |  list, sorted by timestamp. The event processing is deferred until
107  * |  later (see perf_session__process_user_event()).
108  * |  Other timestamps (0 or -1) are handled immediately by
109  * |  perf_session__deliver_event(). These are events generated at start-up
110  * |  of the perf record command. They create PERF_RECORD_COMM and
111  * |  PERF_RECORD_MMAP* records, which describe the running processes and
112  * |  their memory mappings. They are needed at the beginning to enable
113  * |  perf report to build process trees and memory maps.
114  * |
115  * perf_session__deliver_event()
116  * |  Delivers a PERF_RECORD_XXX entry for handling.
117  * |
118  * auxtrace__process_event()
119  * |  The timestamp of the PERF_RECORD_XXX entry is taken to correlate with
120  * |  time stamps from the auxiliary trace buffers. This enables
121  * |  synchronization between auxiliary trace data and the events in the
122  * |  perf.data file.
123  * |
124  * machine__deliver_event()
125  * |  Handles the PERF_RECORD_XXX event. This depends on the record type.
126  *    It might update the process tree, update a process memory map or enter
127  *    a sample with IP and call chain data into the GUI data pool.
128  *
129  *
130  * Deferred processing, as determined by perf_session__process_user_event(),
131  * is finally done when a PERF_RECORD_FINISHED_ROUND event is encountered.
132  * These events are generated by the perf record command.
133  * The timestamp of the PERF_RECORD_FINISHED_ROUND event is taken to process
134  * all PERF_RECORD_XXX entries stored in the ordered_events list. This list was
135  * built up while reading the perf.data file.
136  * Each event is now processed by calling perf_session__deliver_event().
137  * This enables time synchronization between the data in the perf.data file and
138  * the data in the auxiliary trace buffers.
139  */
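
/*
 * Example usage (illustrative only; the exact event syntax and options
 * depend on the installed kernel and the PMU setup):
 *
 *	# perf record -e rbd000 -a -- sleep 10
 *	# perf report --stdio
 *
 * The raw event rbd000 selects the CPU-Measurement Sampling Facility.
 * perf record stores the collected sampling data in the AUX area of the
 * perf.data file and perf report decodes it using the functions below.
 */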
140 
141 #include <endian.h>
142 #include <errno.h>
143 #include <byteswap.h>
144 #include <inttypes.h>
145 #include <linux/kernel.h>
146 #include <linux/types.h>
147 #include <linux/bitops.h>
148 #include <linux/log2.h>
149 
150 #include "cpumap.h"
151 #include "color.h"
152 #include "evsel.h"
153 #include "evlist.h"
154 #include "machine.h"
155 #include "session.h"
156 #include "util.h"
157 #include "thread.h"
158 #include "debug.h"
159 #include "auxtrace.h"
160 #include "s390-cpumsf.h"
161 #include "s390-cpumsf-kernel.h"
162 
163 struct s390_cpumsf {
164 	struct auxtrace		auxtrace;
165 	struct auxtrace_queues	queues;
166 	struct auxtrace_heap	heap;
167 	struct perf_session	*session;
168 	struct machine		*machine;
169 	u32			auxtrace_type;
170 	u32			pmu_type;
171 	u16			machine_type;
172 	bool			data_queued;
173 };
174 
175 struct s390_cpumsf_queue {
176 	struct s390_cpumsf	*sf;
177 	unsigned int		queue_nr;
178 	struct auxtrace_buffer	*buffer;
179 	int			cpu;
180 };
181 
182 /* Display s390 CPU measurement facility basic-sampling data entry */
183 static bool s390_cpumsf_basic_show(const char *color, size_t pos,
184 				   struct hws_basic_entry *basic)
185 {
186 	if (basic->def != 1) {
187 		pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos);
188 		return false;
189 	}
190 	color_fprintf(stdout, color, "    [%#08zx] Basic   Def:%04x Inst:%#04x"
191 		      " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n"
192 		      "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n",
193 		      pos, basic->def, basic->U,
194 		      basic->T ? 'T' : ' ',
195 		      basic->W ? 'W' : ' ',
196 		      basic->P ? 'P' : ' ',
197 		      basic->I ? 'I' : ' ',
198 		      basic->AS, basic->prim_asn, basic->ia, basic->CL,
199 		      basic->hpp, basic->gpp);
200 	return true;
201 }
202 
203 /* Display s390 CPU measurement facility diagnostic-sampling data entry */
204 static bool s390_cpumsf_diag_show(const char *color, size_t pos,
205 				  struct hws_diag_entry *diag)
206 {
207 	if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) {
208 		pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos);
209 		return false;
210 	}
211 	color_fprintf(stdout, color, "    [%#08zx] Diag    Def:%04x %c\n",
212 		      pos, diag->def, diag->I ? 'I' : ' ');
213 	return true;
214 }
215 
216 /* Return TOD timestamp contained in a trailer entry */
217 static unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
218 {
219 	/* te->t set: TOD in STCKE format, bytes 8-15
220 	 * te->t not set: TOD in STCK format, bytes 0-7
221 	 */
222 	unsigned long long ts;
223 
224 	memcpy(&ts, &te->timestamp[te->t], sizeof(ts));
225 	return ts;
226 }
227 
228 /* Display s390 CPU measurement facility trailer entry */
229 static bool s390_cpumsf_trailer_show(const char *color, size_t pos,
230 				     struct hws_trailer_entry *te)
231 {
232 	if (te->bsdes != sizeof(struct hws_basic_entry)) {
233 		pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos);
234 		return false;
235 	}
236 	color_fprintf(stdout, color, "    [%#08zx] Trailer %c%c%c bsdes:%d"
237 		      " dsdes:%d Overflow:%lld Time:%#llx\n"
238 		      "\t\tC:%d TOD:%#lx 1:%#llx 2:%#llx\n",
239 		      pos,
240 		      te->f ? 'F' : ' ',
241 		      te->a ? 'A' : ' ',
242 		      te->t ? 'T' : ' ',
243 		      te->bsdes, te->dsdes, te->overflow,
244 		      trailer_timestamp(te), te->clock_base, te->progusage2,
245 		      te->progusage[0], te->progusage[1]);
246 	return true;
247 }
248 
249 /* Test a sample data block. It must be 4KB or a multiple thereof in size and
250  * 4KB page aligned. Each sample data page has a trailer entry at the
251  * end which contains the sample entry data sizes.
252  *
253  * Return true if the sample data block passes the checks and set the
254  * basic set entry size and diagnostic set entry size.
255  *
256  * Return false on failure.
257  *
258  * Note: Old hardware does not set the basic or diagnostic entry sizes
259  * in the trailer entry. Use the type number instead.
260  */
261 static bool s390_cpumsf_validate(int machine_type,
262 				 unsigned char *buf, size_t len,
263 				 unsigned short *bsdes,
264 				 unsigned short *dsdes)
265 {
266 	struct hws_basic_entry *basic = (struct hws_basic_entry *)buf;
267 	struct hws_trailer_entry *te;
268 
269 	*dsdes = *bsdes = 0;
270 	if (len & (S390_CPUMSF_PAGESZ - 1))	/* Illegal size */
271 		return false;
272 	if (basic->def != 1)		/* No basic set entry, must be first */
273 		return false;
274 	/* Check for trailer entry at end of SDB */
275 	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
276 					      - sizeof(*te));
277 	*bsdes = te->bsdes;
278 	*dsdes = te->dsdes;
279 	if (!te->bsdes && !te->dsdes) {
280 		/* Very old hardware, use CPUID */
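		/* Machine types: 2097/2098 = z10 EC/BC, 2817/2818 = z196/z114,
		 * 2827/2828 = zEC12/zBC12.
		 */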
281 		switch (machine_type) {
282 		case 2097:
283 		case 2098:
284 			*dsdes = 64;
285 			*bsdes = 32;
286 			break;
287 		case 2817:
288 		case 2818:
289 			*dsdes = 74;
290 			*bsdes = 32;
291 			break;
292 		case 2827:
293 		case 2828:
294 			*dsdes = 85;
295 			*bsdes = 32;
296 			break;
297 		default:
298 			/* Illegal trailer entry */
299 			return false;
300 		}
301 	}
302 	return true;
303 }
304 
305 /* Return true if there is room for another entry */
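/*
 * Worked example (sizes are illustrative): with S390_CPUMSF_PAGESZ = 4096
 * and assuming a 64 byte trailer entry, the payload per SDB page is 4032
 * bytes. At page offset 4000 another entry of bsdes + dsdes = 96 bytes no
 * longer fits (4032 - 4000 = 32 < 96), so false is returned and the caller
 * proceeds to the trailer entry of this page.
 */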
306 static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos)
307 {
308 	size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry);
309 
310 	if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz)
311 		return false;
312 	return true;
313 }
314 
315 /* Dump an auxiliary buffer. These buffers are multiples of
316  * 4KB SDB pages.
317  */
318 static void s390_cpumsf_dump(struct s390_cpumsf *sf,
319 			     unsigned char *buf, size_t len)
320 {
321 	const char *color = PERF_COLOR_BLUE;
322 	struct hws_basic_entry *basic;
323 	struct hws_diag_entry *diag;
324 	unsigned short bsdes, dsdes;
325 	size_t pos = 0;
326 
327 	color_fprintf(stdout, color,
328 		      ". ... s390 AUX data: size %zu bytes\n",
329 		      len);
330 
331 	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
332 				  &dsdes)) {
333 		pr_err("Invalid AUX trace data block size:%zu"
334 		       " (type:%d bsdes:%hd dsdes:%hd)\n",
335 		       len, sf->machine_type, bsdes, dsdes);
336 		return;
337 	}
338 
339 	/* The s390 kernel always returns fully occupied 4KB blocks,
340 	 * no partially filled SDBs.
341 	 */
342 	while (pos < len) {
343 		/* Handle Basic entry */
344 		basic = (struct hws_basic_entry *)(buf + pos);
345 		if (s390_cpumsf_basic_show(color, pos, basic))
346 			pos += bsdes;
347 		else
348 			return;
349 
350 		/* Handle Diagnostic entry */
351 		diag = (struct hws_diag_entry *)(buf + pos);
352 		if (s390_cpumsf_diag_show(color, pos, diag))
353 			pos += dsdes;
354 		else
355 			return;
356 
357 		/* Check for trailer entry */
358 		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
359 			/* Show trailer entry */
360 			struct hws_trailer_entry te;
361 
362 			pos = (pos + S390_CPUMSF_PAGESZ)
363 			       & ~(S390_CPUMSF_PAGESZ - 1);
364 			pos -= sizeof(te);
365 			memcpy(&te, buf + pos, sizeof(te));
366 			/* Set descriptor sizes in case of old hardware
367 			 * where these values are not set.
368 			 */
369 			te.bsdes = bsdes;
370 			te.dsdes = dsdes;
371 			if (s390_cpumsf_trailer_show(color, pos, &te))
372 				pos += sizeof(te);
373 			else
374 				return;
375 		}
376 	}
377 }
378 
379 static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf,
380 				   size_t len)
381 {
382 	printf(".\n");
383 	s390_cpumsf_dump(sf, buf, len);
384 }
385 
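/* The s390 kernel stores the PID of the current task in the CPU's
 * program parameter (LPP instruction). The sampling hardware records it
 * in the basic entry's hpp field, so its lower 32 bits are used as the
 * PID/TID of the sample below.
 */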
386 #define	S390_LPP_PID_MASK	0xffffffff
387 
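/* Build a synthetic PERF_RECORD_SAMPLE from one basic-sampling entry and
 * deliver it to the session. The cpumode is derived from the entry's P and
 * CL bits, with a heuristic fallback for old hardware that does not set CL.
 */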
388 static bool s390_cpumsf_make_event(size_t pos,
389 				   struct hws_basic_entry *basic,
390 				   struct s390_cpumsf_queue *sfq)
391 {
392 	struct perf_sample sample = {
393 				.ip = basic->ia,
394 				.pid = basic->hpp & S390_LPP_PID_MASK,
395 				.tid = basic->hpp & S390_LPP_PID_MASK,
396 				.cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN,
397 				.cpu = sfq->cpu,
398 				.period = 1
399 			    };
400 	union perf_event event;
401 
402 	memset(&event, 0, sizeof(event));
403 	if (basic->CL == 1)	/* Native LPAR mode */
404 		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
405 					  : PERF_RECORD_MISC_KERNEL;
406 	else if (basic->CL == 2)	/* Guest kernel/user space */
407 		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
408 					  : PERF_RECORD_MISC_GUEST_KERNEL;
409 	else if (basic->gpp || basic->prim_asn != 0xffff)
410 		/* Use heuristics on old hardware */
411 		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
412 					  : PERF_RECORD_MISC_GUEST_KERNEL;
413 	else
414 		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
415 					  : PERF_RECORD_MISC_KERNEL;
416 
417 	event.sample.header.type = PERF_RECORD_SAMPLE;
418 	event.sample.header.misc = sample.cpumode;
419 	event.sample.header.size = sizeof(struct perf_event_header);
420 
421 	pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n",
422 		 __func__, pos, sample.ip, basic->P, basic->CL, sample.pid,
423 		 sample.tid, sample.cpumode, sample.cpu);
424 	if (perf_session__deliver_synth_event(sfq->sf->session, &event,
425 					      &sample)) {
426 		pr_err("s390 Auxiliary Trace: failed to deliver event\n");
427 		return false;
428 	}
429 	return true;
430 }
431 
432 static unsigned long long get_trailer_time(const unsigned char *buf)
433 {
434 	struct hws_trailer_entry *te;
435 	unsigned long long aux_time;
436 
437 	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
438 					      - sizeof(*te));
439 
440 	if (!te->clock_base)	/* TOD_CLOCK_BASE value missing */
441 		return 0;
442 
443 	/* Convert the time stamp in the trailer entry to nanoseconds
444 	 * (calculation taken from the arch/s390 function tod_to_ns()).
445 	 * The TOD_CLOCK_BASE value is stored in trailer entry member progusage2.
446 	 */
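	/*
	 * The TOD clock ticks in units of 2^-12 microseconds, so
	 * ns = tod * 1000 / 4096 = tod * 125 / 512. Computing it as
	 * (tod >> 9) * 125 + ((tod & 0x1ff) * 125 >> 9) avoids overflowing
	 * the 64-bit intermediate product.
	 */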
447 	aux_time = trailer_timestamp(te) - te->progusage2;
448 	aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9);
449 	return aux_time;
450 }
451 
452 /* Process the data samples of a single queue. The first parameter is a
453  * pointer to the queue, the second parameter is the time stamp. This
454  * is the time stamp:
455  * - of the event that triggered this processing.
456  * - or the time stamp when the last processing of this queue stopped.
457  *   In this case it stopped at a 4KB page boundary and recorded the
458  *   position where to continue processing on the next invocation
459  *   (see buffer->use_data and buffer->use_size).
460  *
461  * When this function returns the second parameter is updated to
462  * reflect the time stamp of the last processed auxiliary data entry
463  * (taken from the trailer entry of that page). The caller uses this
464  * returned time stamp to record the last processed entry in this
465  * queue.
466  *
467  * The function returns:
468  * 0:  Processing successful. The second parameter returns the
469  *     time stamp from the trailer entry until which position
470  *     processing took place. Subsequent calls resume from this
471  *     position.
472  * <0: An error occurred during processing. The second parameter
473  *     returns the maximum time stamp.
474  * >0: Done on this queue. The second parameter returns the
475  *     maximum time stamp.
476  */
477 static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
478 {
479 	struct s390_cpumsf *sf = sfq->sf;
480 	unsigned char *buf = sfq->buffer->use_data;
481 	size_t len = sfq->buffer->use_size;
482 	struct hws_basic_entry *basic;
483 	unsigned short bsdes, dsdes;
484 	size_t pos = 0;
485 	int err = 1;
486 	u64 aux_ts;
487 
488 	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
489 				  &dsdes)) {
490 		*ts = ~0ULL;
491 		return -1;
492 	}
493 
494 	/* Get trailer entry time stamp and check if entries in
495 	 * this auxiliary page are ready for processing. If the
496 	 * time stamp of the first entry is too high, the whole buffer
497 	 * can be skipped. In this case return the time stamp.
498 	 */
499 	aux_ts = get_trailer_time(buf);
500 	if (!aux_ts) {
501 		pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n",
502 		       sfq->buffer->data_offset);
503 		aux_ts = ~0ULL;
504 		goto out;
505 	}
506 	if (aux_ts > *ts) {
507 		*ts = aux_ts;
508 		return 0;
509 	}
510 
511 	while (pos < len) {
512 		/* Handle Basic entry */
513 		basic = (struct hws_basic_entry *)(buf + pos);
514 		if (s390_cpumsf_make_event(pos, basic, sfq))
515 			pos += bsdes;
516 		else {
517 			err = -EBADF;
518 			goto out;
519 		}
520 
521 		pos += dsdes;	/* Skip diagnostic entry */
522 
523 		/* Check for trailer entry */
524 		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
525 			pos = (pos + S390_CPUMSF_PAGESZ)
526 			       & ~(S390_CPUMSF_PAGESZ - 1);
527 			/* Check existence of next page */
528 			if (pos >= len)
529 				break;
530 			aux_ts = get_trailer_time(buf + pos);
531 			if (!aux_ts) {
532 				aux_ts = ~0ULL;
533 				goto out;
534 			}
535 			if (aux_ts > *ts) {
536 				*ts = aux_ts;
537 				sfq->buffer->use_data += pos;
538 				sfq->buffer->use_size -= pos;
539 				return 0;
540 			}
541 		}
542 	}
543 out:
544 	*ts = aux_ts;
545 	sfq->buffer->use_size = 0;
546 	sfq->buffer->use_data = NULL;
547 	return err;	/* Buffer completely scanned or error */
548 }
549 
550 /* Run the s390 auxiliary trace decoder.
551  * Select the queue buffer to operate on; the caller already selected
552  * the proper queue, depending on the second parameter 'ts'.
553  * This is the time stamp until which the auxiliary entries should
554  * be processed. This value is updated by called functions and
555  * returned to the caller.
556  *
557  * Resume processing in the current buffer. If there is no buffer
558  * get a new buffer from the queue and setup start position for
559  * processing.
560  * When a buffer is completely processed remove it from the queue
561  * before returning.
562  *
563  * This function returns
564  * 1: When the queue is empty. Second parameter will be set to
565  *    maximum time stamp.
566  * 0: Normal processing done.
567  * <0: Error during queue buffer setup. This causes the caller
568  *     to stop processing completely.
569  */
570 static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
571 				   u64 *ts)
572 {
573 
574 	struct auxtrace_buffer *buffer;
575 	struct auxtrace_queue *queue;
576 	int err;
577 
578 	queue = &sfq->sf->queues.queue_array[sfq->queue_nr];
579 
580 	/* Get buffer and last position in buffer to resume
581 	 * decoding the auxiliary entries. One buffer might be large
582 	 * and decoding might stop in between. This depends on the time
583 	 * stamp of the trailer entry in each page of the auxiliary
584 	 * data and the time stamp of the event triggering the decoding.
585 	 */
586 	if (sfq->buffer == NULL) {
587 		sfq->buffer = buffer = auxtrace_buffer__next(queue,
588 							     sfq->buffer);
589 		if (!buffer) {
590 			*ts = ~0ULL;
591 			return 1;	/* Processing done on this queue */
592 		}
593 		/* Start with a new buffer on this queue */
594 		if (buffer->data) {
595 			buffer->use_size = buffer->size;
596 			buffer->use_data = buffer->data;
597 		}
598 	} else
599 		buffer = sfq->buffer;
600 
601 	if (!buffer->data) {
602 		int fd = perf_data__fd(sfq->sf->session->data);
603 
604 		buffer->data = auxtrace_buffer__get_data(buffer, fd);
605 		if (!buffer->data)
606 			return -ENOMEM;
607 		buffer->use_size = buffer->size;
608 		buffer->use_data = buffer->data;
609 	}
610 	pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
611 		  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
612 		  buffer->size, buffer->use_size);
613 	err = s390_cpumsf_samples(sfq, ts);
614 
615 	/* If non-zero, there is either an error (err < 0) or the buffer is
616 	 * completely done (err > 0). The error is unrecoverable (usually
617 	 * some descriptors could not be read successfully), so continue with
618 	 * the next buffer.
619 	 * In both cases the parameter 'ts' has been updated.
620 	 */
621 	if (err) {
622 		sfq->buffer = NULL;
623 		list_del(&buffer->list);
624 		auxtrace_buffer__free(buffer);
625 		if (err > 0)		/* Buffer done, no error */
626 			err = 0;
627 	}
628 	return err;
629 }
630 
631 static struct s390_cpumsf_queue *
632 s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
633 {
634 	struct s390_cpumsf_queue *sfq;
635 
636 	sfq = zalloc(sizeof(struct s390_cpumsf_queue));
637 	if (sfq == NULL)
638 		return NULL;
639 
640 	sfq->sf = sf;
641 	sfq->queue_nr = queue_nr;
642 	sfq->cpu = -1;
643 	return sfq;
644 }
645 
646 static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf,
647 				   struct auxtrace_queue *queue,
648 				   unsigned int queue_nr, u64 ts)
649 {
650 	struct s390_cpumsf_queue *sfq = queue->priv;
651 
652 	if (list_empty(&queue->head))
653 		return 0;
654 
655 	if (sfq == NULL) {
656 		sfq = s390_cpumsf_alloc_queue(sf, queue_nr);
657 		if (!sfq)
658 			return -ENOMEM;
659 		queue->priv = sfq;
660 
661 		if (queue->cpu != -1)
662 			sfq->cpu = queue->cpu;
663 	}
664 	return auxtrace_heap__add(&sf->heap, queue_nr, ts);
665 }
666 
667 static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts)
668 {
669 	unsigned int i;
670 	int ret = 0;
671 
672 	for (i = 0; i < sf->queues.nr_queues; i++) {
673 		ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i],
674 					      i, ts);
675 		if (ret)
676 			break;
677 	}
678 	return ret;
679 }
680 
681 static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts)
682 {
683 	if (!sf->queues.new_data)
684 		return 0;
685 
686 	sf->queues.new_data = false;
687 	return s390_cpumsf_setup_queues(sf, ts);
688 }
689 
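/* Process all queues whose next entry has a time stamp below 'timestamp'.
 * The queue with the lowest time stamp is popped from the auxtrace_heap,
 * decoded up to 'timestamp' (or just past the next queue's time stamp,
 * whichever is lower) and then re-added with its updated time stamp,
 * unless the queue is exhausted.
 */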
690 static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp)
691 {
692 	unsigned int queue_nr;
693 	u64 ts;
694 	int ret;
695 
696 	while (1) {
697 		struct auxtrace_queue *queue;
698 		struct s390_cpumsf_queue *sfq;
699 
700 		if (!sf->heap.heap_cnt)
701 			return 0;
702 
703 		if (sf->heap.heap_array[0].ordinal >= timestamp)
704 			return 0;
705 
706 		queue_nr = sf->heap.heap_array[0].queue_nr;
707 		queue = &sf->queues.queue_array[queue_nr];
708 		sfq = queue->priv;
709 
710 		auxtrace_heap__pop(&sf->heap);
711 		if (sf->heap.heap_cnt) {
712 			ts = sf->heap.heap_array[0].ordinal + 1;
713 			if (ts > timestamp)
714 				ts = timestamp;
715 		} else {
716 			ts = timestamp;
717 		}
718 
719 		ret = s390_cpumsf_run_decoder(sfq, &ts);
720 		if (ret < 0) {
721 			auxtrace_heap__add(&sf->heap, queue_nr, ts);
722 			return ret;
723 		}
724 		if (!ret) {
725 			ret = auxtrace_heap__add(&sf->heap, queue_nr, ts);
726 			if (ret < 0)
727 				return ret;
728 		}
729 	}
730 	return 0;
731 }
732 
733 static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu,
734 				   pid_t pid, pid_t tid, u64 ip)
735 {
736 	char msg[MAX_AUXTRACE_ERROR_MSG];
737 	union perf_event event;
738 	int err;
739 
740 	strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1);
741 	auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
742 			     code, cpu, pid, tid, ip, msg);
743 
744 	err = perf_session__deliver_synth_event(sf->session, &event, NULL);
745 	if (err)
746 		pr_err("s390 Auxiliary Trace: failed to deliver error event,"
747 			" error %d\n", err);
748 	return err;
749 }
750 
751 static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample)
752 {
753 	return s390_cpumsf_synth_error(sf, 1, sample->cpu,
754 				       sample->pid, sample->tid, 0);
755 }
756 
757 static int
758 s390_cpumsf_process_event(struct perf_session *session,
759 			  union perf_event *event,
760 			  struct perf_sample *sample,
761 			  struct perf_tool *tool)
762 {
763 	struct s390_cpumsf *sf = container_of(session->auxtrace,
764 					      struct s390_cpumsf,
765 					      auxtrace);
766 	u64 timestamp = sample->time;
767 	int err = 0;
768 
769 	if (dump_trace)
770 		return 0;
771 
772 	if (!tool->ordered_events) {
773 		pr_err("s390 Auxiliary Trace requires ordered events\n");
774 		return -EINVAL;
775 	}
776 
777 	if (event->header.type == PERF_RECORD_AUX &&
778 	    event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
779 		return s390_cpumsf_lost(sf, sample);
780 
781 	if (timestamp) {
782 		err = s390_cpumsf_update_queues(sf, timestamp);
783 		if (!err)
784 			err = s390_cpumsf_process_queues(sf, timestamp);
785 	}
786 	return err;
787 }
788 
789 struct s390_cpumsf_synth {
790 	struct perf_tool cpumsf_tool;
791 	struct perf_session *session;
792 };
793 
794 static int
795 s390_cpumsf_process_auxtrace_event(struct perf_session *session,
796 				   union perf_event *event,
797 				   struct perf_tool *tool __maybe_unused)
798 {
799 	struct s390_cpumsf *sf = container_of(session->auxtrace,
800 					      struct s390_cpumsf,
801 					      auxtrace);
802 
803 	int fd = perf_data__fd(session->data);
804 	struct auxtrace_buffer *buffer;
805 	off_t data_offset;
806 	int err;
807 
808 	if (sf->data_queued)
809 		return 0;
810 
811 	if (perf_data__is_pipe(session->data)) {
812 		data_offset = 0;
813 	} else {
814 		data_offset = lseek(fd, 0, SEEK_CUR);
815 		if (data_offset == -1)
816 			return -errno;
817 	}
818 
819 	err = auxtrace_queues__add_event(&sf->queues, session, event,
820 					 data_offset, &buffer);
821 	if (err)
822 		return err;
823 
824 	/* Dump here after copying piped trace out of the pipe */
825 	if (dump_trace) {
826 		if (auxtrace_buffer__get_data(buffer, fd)) {
827 			s390_cpumsf_dump_event(sf, buffer->data,
828 					       buffer->size);
829 			auxtrace_buffer__put_data(buffer);
830 		}
831 	}
832 	return 0;
833 }
834 
835 static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused)
836 {
837 }
838 
839 static int s390_cpumsf_flush(struct perf_session *session __maybe_unused,
840 			     struct perf_tool *tool __maybe_unused)
841 {
842 	return 0;
843 }
844 
845 static void s390_cpumsf_free_queues(struct perf_session *session)
846 {
847 	struct s390_cpumsf *sf = container_of(session->auxtrace,
848 					      struct s390_cpumsf,
849 					      auxtrace);
850 	struct auxtrace_queues *queues = &sf->queues;
851 	unsigned int i;
852 
853 	for (i = 0; i < queues->nr_queues; i++)
854 		zfree(&queues->queue_array[i].priv);
855 	auxtrace_queues__free(queues);
856 }
857 
858 static void s390_cpumsf_free(struct perf_session *session)
859 {
860 	struct s390_cpumsf *sf = container_of(session->auxtrace,
861 					      struct s390_cpumsf,
862 					      auxtrace);
863 
864 	auxtrace_heap__free(&sf->heap);
865 	s390_cpumsf_free_queues(session);
866 	session->auxtrace = NULL;
867 	free(sf);
868 }
869 
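/* Extract the machine type from the cpuid string, which is the number
 * after the first comma. For example (string format assumed here), a
 * cpuid of "IBM,2827,..." yields machine type 2827 (zEC12).
 */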
870 static int s390_cpumsf_get_type(const char *cpuid)
871 {
872 	int ret, family = 0;
873 
874 	ret = sscanf(cpuid, "%*[^,],%u", &family);
875 	return (ret == 1) ? family : 0;
876 }
877 
878 /* Check itrace options set on the perf report command.
879  * Return true if none are set or all specified options can be
880  * handled on s390.
881  * Return false otherwise.
882  */
883 static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
884 {
885 	if (!itops || !itops->set)
886 		return true;
887 	pr_err("No --itrace options supported\n");
888 	return false;
889 }
890 
891 int s390_cpumsf_process_auxtrace_info(union perf_event *event,
892 				      struct perf_session *session)
893 {
894 	struct auxtrace_info_event *auxtrace_info = &event->auxtrace_info;
895 	struct s390_cpumsf *sf;
896 	int err;
897 
898 	if (auxtrace_info->header.size < sizeof(struct auxtrace_info_event))
899 		return -EINVAL;
900 
901 	sf = zalloc(sizeof(struct s390_cpumsf));
902 	if (sf == NULL)
903 		return -ENOMEM;
904 
905 	if (!check_auxtrace_itrace(session->itrace_synth_opts)) {
906 		err = -EINVAL;
907 		goto err_free;
908 	}
909 
910 	err = auxtrace_queues__init(&sf->queues);
911 	if (err)
912 		goto err_free;
913 
914 	sf->session = session;
915 	sf->machine = &session->machines.host; /* No kvm support */
916 	sf->auxtrace_type = auxtrace_info->type;
917 	sf->pmu_type = PERF_TYPE_RAW;
918 	sf->machine_type = s390_cpumsf_get_type(session->evlist->env->cpuid);
919 
920 	sf->auxtrace.process_event = s390_cpumsf_process_event;
921 	sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event;
922 	sf->auxtrace.flush_events = s390_cpumsf_flush;
923 	sf->auxtrace.free_events = s390_cpumsf_free_events;
924 	sf->auxtrace.free = s390_cpumsf_free;
925 	session->auxtrace = &sf->auxtrace;
926 
927 	if (dump_trace)
928 		return 0;
929 
930 	err = auxtrace_queues__process_index(&sf->queues, session);
931 	if (err)
932 		goto err_free_queues;
933 
934 	if (sf->queues.populated)
935 		sf->data_queued = true;
936 
937 	return 0;
938 
939 err_free_queues:
940 	auxtrace_queues__free(&sf->queues);
941 	session->auxtrace = NULL;
942 err_free:
943 	free(sf);
944 	return err;
945 }
946