1b9efd75bSJin Yao[
2b9efd75bSJin Yao    {
3a7c1aaa6SIan Rogers        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
4a7c1aaa6SIan Rogers        "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS",
5a7c1aaa6SIan Rogers        "MetricGroup": "PGO;TopdownL1;tma_L1_group",
6a7c1aaa6SIan Rogers        "MetricName": "tma_frontend_bound",
7a7c1aaa6SIan Rogers        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible for fetching operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
8a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
9a7c1aaa6SIan Rogers    },
10a7c1aaa6SIan Rogers    {
11a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
12a7c1aaa6SIan Rogers        "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / SLOTS",
13a7c1aaa6SIan Rogers        "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
14a7c1aaa6SIan Rogers        "MetricName": "tma_fetch_latency",
15a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
16a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
17a7c1aaa6SIan Rogers    },
18a7c1aaa6SIan Rogers    {
19a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
20a7c1aaa6SIan Rogers        "MetricExpr": "ICACHE_16B.IFDATA_STALL / CLKS",
21a7c1aaa6SIan Rogers        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group",
22a7c1aaa6SIan Rogers        "MetricName": "tma_icache_misses",
23a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
24a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
25a7c1aaa6SIan Rogers    },
26a7c1aaa6SIan Rogers    {
27a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
28a7c1aaa6SIan Rogers        "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS",
29a7c1aaa6SIan Rogers        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group",
30a7c1aaa6SIan Rogers        "MetricName": "tma_itlb_misses",
31a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
32a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
33a7c1aaa6SIan Rogers    },
34a7c1aaa6SIan Rogers    {
35a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
36a7c1aaa6SIan Rogers        "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches",
37a7c1aaa6SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
38a7c1aaa6SIan Rogers        "MetricName": "tma_branch_resteers",
39a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of mispredicted branches. For example; branchy code with lots of mispredictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
40a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
41a7c1aaa6SIan Rogers    },
42a7c1aaa6SIan Rogers    {
43a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
44*69f685e0SIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
45a7c1aaa6SIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group",
46a7c1aaa6SIan Rogers        "MetricName": "tma_mispredicts_resteers",
47a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage.  Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
48a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
49a7c1aaa6SIan Rogers    },
50a7c1aaa6SIan Rogers    {
51a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
52*69f685e0SIan Rogers        "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
53a7c1aaa6SIan Rogers        "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group",
54a7c1aaa6SIan Rogers        "MetricName": "tma_clears_resteers",
55a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears.  Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
56a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
57a7c1aaa6SIan Rogers    },
58a7c1aaa6SIan Rogers    {
59a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
60a7c1aaa6SIan Rogers        "MetricExpr": "10 * BACLEARS.ANY / CLKS",
61a7c1aaa6SIan Rogers        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group",
62a7c1aaa6SIan Rogers        "MetricName": "tma_unknown_branches",
63a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY",
64a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
65a7c1aaa6SIan Rogers    },
66a7c1aaa6SIan Rogers    {
67a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
68a7c1aaa6SIan Rogers        "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS",
69a7c1aaa6SIan Rogers        "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group",
70a7c1aaa6SIan Rogers        "MetricName": "tma_dsb_switches",
71a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS",
72a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
73a7c1aaa6SIan Rogers    },
74a7c1aaa6SIan Rogers    {
75a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
76a7c1aaa6SIan Rogers        "MetricExpr": "ILD_STALL.LCP / CLKS",
77a7c1aaa6SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
78a7c1aaa6SIan Rogers        "MetricName": "tma_lcp",
79a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.",
80a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
81a7c1aaa6SIan Rogers    },
82a7c1aaa6SIan Rogers    {
83a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
84a7c1aaa6SIan Rogers        "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS",
85a7c1aaa6SIan Rogers        "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group",
86a7c1aaa6SIan Rogers        "MetricName": "tma_ms_switches",
87a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES",
88a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
89a7c1aaa6SIan Rogers    },
90a7c1aaa6SIan Rogers    {
91a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
92a7c1aaa6SIan Rogers        "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
93a7c1aaa6SIan Rogers        "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
94a7c1aaa6SIan Rogers        "MetricName": "tma_fetch_bandwidth",
95a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS",
96a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
97a7c1aaa6SIan Rogers    },
98a7c1aaa6SIan Rogers    {
99a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
100a7c1aaa6SIan Rogers        "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / CORE_CLKS / 2",
101a7c1aaa6SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
102a7c1aaa6SIan Rogers        "MetricName": "tma_mite",
103a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
104a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
105a7c1aaa6SIan Rogers    },
106a7c1aaa6SIan Rogers    {
107a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder",
108a7c1aaa6SIan Rogers        "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS",
109a7c1aaa6SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group",
110a7c1aaa6SIan Rogers        "MetricName": "tma_decoder0_alone",
111a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
112a7c1aaa6SIan Rogers    },
113a7c1aaa6SIan Rogers    {
114a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline",
115a7c1aaa6SIan Rogers        "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / CLKS",
116a7c1aaa6SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group",
117a7c1aaa6SIan Rogers        "MetricName": "tma_mite_4wide",
118a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
119a7c1aaa6SIan Rogers    },
120a7c1aaa6SIan Rogers    {
121a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
122a7c1aaa6SIan Rogers        "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / CORE_CLKS / 2",
123a7c1aaa6SIan Rogers        "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
124a7c1aaa6SIan Rogers        "MetricName": "tma_dsb",
125a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
126a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
127a7c1aaa6SIan Rogers    },
128a7c1aaa6SIan Rogers    {
129a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit",
130a7c1aaa6SIan Rogers        "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / CORE_CLKS / 2",
131a7c1aaa6SIan Rogers        "MetricGroup": "FetchBW;LSD;TopdownL3;tma_fetch_bandwidth_group",
132a7c1aaa6SIan Rogers        "MetricName": "tma_lsd",
133a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit.  LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.",
134a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
135a7c1aaa6SIan Rogers    },
136a7c1aaa6SIan Rogers    {
137a7c1aaa6SIan Rogers        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
138a7c1aaa6SIan Rogers        "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)",
139a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL1;tma_L1_group",
140a7c1aaa6SIan Rogers        "MetricName": "tma_bad_speculation",
141a7c1aaa6SIan Rogers        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to mispredicted branches is categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
142a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
143a7c1aaa6SIan Rogers    },
144a7c1aaa6SIan Rogers    {
145a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
146*69f685e0SIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * tma_bad_speculation",
147a7c1aaa6SIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group",
148a7c1aaa6SIan Rogers        "MetricName": "tma_branch_mispredicts",
149a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
150a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
151a7c1aaa6SIan Rogers    },
152a7c1aaa6SIan Rogers    {
153a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
154a7c1aaa6SIan Rogers        "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)",
155a7c1aaa6SIan Rogers        "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group",
156a7c1aaa6SIan Rogers        "MetricName": "tma_machine_clears",
157a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls when the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT",
158a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
159a7c1aaa6SIan Rogers    },
160a7c1aaa6SIan Rogers    {
161a7c1aaa6SIan Rogers        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
162*69f685e0SIan Rogers        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / SLOTS",
163a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL1;tma_L1_group",
164a7c1aaa6SIan Rogers        "MetricName": "tma_backend_bound",
165a7c1aaa6SIan Rogers        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
166a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
167a7c1aaa6SIan Rogers    },
168a7c1aaa6SIan Rogers    {
169a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
170*69f685e0SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * tma_backend_bound",
171a7c1aaa6SIan Rogers        "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group",
172a7c1aaa6SIan Rogers        "MetricName": "tma_memory_bound",
173a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincide with execution unit starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
174a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
175a7c1aaa6SIan Rogers    },
176a7c1aaa6SIan Rogers    {
177a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
178a7c1aaa6SIan Rogers        "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)",
179a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
180a7c1aaa6SIan Rogers        "MetricName": "tma_l1_bound",
181a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS",
182a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
183a7c1aaa6SIan Rogers    },
184a7c1aaa6SIan Rogers    {
185a7c1aaa6SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
186a7c1aaa6SIan Rogers        "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS",
187a7c1aaa6SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group",
188a7c1aaa6SIan Rogers        "MetricName": "tma_dtlb_load",
189a7c1aaa6SIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS",
190a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
191a7c1aaa6SIan Rogers    },
192a7c1aaa6SIan Rogers    {
193a7c1aaa6SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)",
194a7c1aaa6SIan Rogers        "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss",
195a7c1aaa6SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group",
196a7c1aaa6SIan Rogers        "MetricName": "tma_load_stlb_hit",
197a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
198a7c1aaa6SIan Rogers    },
199a7c1aaa6SIan Rogers    {
200a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk",
201a7c1aaa6SIan Rogers        "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS",
202a7c1aaa6SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group",
203a7c1aaa6SIan Rogers        "MetricName": "tma_load_stlb_miss",
204a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
205a7c1aaa6SIan Rogers    },
206a7c1aaa6SIan Rogers    {
207a7c1aaa6SIan Rogers        "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
208a7c1aaa6SIan Rogers        "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS",
209a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL4;tma_l1_bound_group",
210a7c1aaa6SIan Rogers        "MetricName": "tma_store_fwd_blk",
211a7c1aaa6SIan Rogers        "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
212a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
213a7c1aaa6SIan Rogers    },
214a7c1aaa6SIan Rogers    {
215a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
216*69f685e0SIan Rogers        "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS",
217a7c1aaa6SIan Rogers        "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group",
218a7c1aaa6SIan Rogers        "MetricName": "tma_lock_latency",
219a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS",
220a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
221a7c1aaa6SIan Rogers    },
222a7c1aaa6SIan Rogers    {
223a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - loads that cross a 64-byte cache line boundary",
224a7c1aaa6SIan Rogers        "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS",
225a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL4;tma_l1_bound_group",
226a7c1aaa6SIan Rogers        "MetricName": "tma_split_loads",
227a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - loads that cross a 64-byte cache line boundary.  Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS",
228a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
229a7c1aaa6SIan Rogers    },
230a7c1aaa6SIan Rogers    {
231a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
232a7c1aaa6SIan Rogers        "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS",
233a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL4;tma_l1_bound_group",
234a7c1aaa6SIan Rogers        "MetricName": "tma_4k_aliasing",
235a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incurs a few cycles of load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
236a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
237a7c1aaa6SIan Rogers    },
238a7c1aaa6SIan Rogers    {
239a7c1aaa6SIan Rogers        "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
240a7c1aaa6SIan Rogers        "MetricExpr": "L1D_PEND_MISS.FB_FULL / CLKS",
241a7c1aaa6SIan Rogers        "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group",
242a7c1aaa6SIan Rogers        "MetricName": "tma_fb_full",
243a7c1aaa6SIan Rogers        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints at approaching bandwidth limits (to L2 cache; L3 cache or external memory).",
244a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
245a7c1aaa6SIan Rogers    },
246a7c1aaa6SIan Rogers    {
247a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
248*69f685e0SIan Rogers        "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)",
249a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
250a7c1aaa6SIan Rogers        "MetricName": "tma_l2_bound",
251a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
252a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
253a7c1aaa6SIan Rogers    },
254a7c1aaa6SIan Rogers    {
255a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core",
256a7c1aaa6SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS",
257a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
258a7c1aaa6SIan Rogers        "MetricName": "tma_l3_bound",
259a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
260a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
261a7c1aaa6SIan Rogers    },
262a7c1aaa6SIan Rogers    {
263a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
264*69f685e0SIan Rogers        "MetricExpr": "(49 * Average_Frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * Average_Frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / CLKS",
265a7c1aaa6SIan Rogers        "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group",
266a7c1aaa6SIan Rogers        "MetricName": "tma_contested_accesses",
267a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS",
268a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
269a7c1aaa6SIan Rogers    },
270a7c1aaa6SIan Rogers    {
271a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
272*69f685e0SIan Rogers        "MetricExpr": "48 * Average_Frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / CLKS",
273a7c1aaa6SIan Rogers        "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group",
274a7c1aaa6SIan Rogers        "MetricName": "tma_data_sharing",
275a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD",
276a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
277a7c1aaa6SIan Rogers    },
278a7c1aaa6SIan Rogers    {
279a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
280*69f685e0SIan Rogers        "MetricExpr": "17.5 * Average_Frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / CLKS",
281a7c1aaa6SIan Rogers        "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group",
282a7c1aaa6SIan Rogers        "MetricName": "tma_l3_hit_latency",
283a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
284a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
285a7c1aaa6SIan Rogers    },
286a7c1aaa6SIan Rogers    {
287a7c1aaa6SIan Rogers        "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
288a7c1aaa6SIan Rogers        "MetricExpr": "L1D_PEND_MISS.L2_STALL / CLKS",
289a7c1aaa6SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group",
290a7c1aaa6SIan Rogers        "MetricName": "tma_sq_full",
291a7c1aaa6SIan Rogers        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.",
292a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
293a7c1aaa6SIan Rogers    },
294a7c1aaa6SIan Rogers    {
295a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
296*69f685e0SIan Rogers        "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS - tma_l2_bound",
297a7c1aaa6SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
298a7c1aaa6SIan Rogers        "MetricName": "tma_dram_bound",
299a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
300a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
301a7c1aaa6SIan Rogers    },
302a7c1aaa6SIan Rogers    {
303a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
304a7c1aaa6SIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS",
305a7c1aaa6SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group",
306a7c1aaa6SIan Rogers        "MetricName": "tma_mem_bandwidth",
307a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).",
308a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
309a7c1aaa6SIan Rogers    },
310a7c1aaa6SIan Rogers    {
311a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
312a7c1aaa6SIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth",
313a7c1aaa6SIan Rogers        "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group",
314a7c1aaa6SIan Rogers        "MetricName": "tma_mem_latency",
315a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).",
316a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
317a7c1aaa6SIan Rogers    },
318a7c1aaa6SIan Rogers    {
319a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write",
320a7c1aaa6SIan Rogers        "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS",
321a7c1aaa6SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
322a7c1aaa6SIan Rogers        "MetricName": "tma_store_bound",
323a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS",
324a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
325a7c1aaa6SIan Rogers    },
326a7c1aaa6SIan Rogers    {
327a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
328*69f685e0SIan Rogers        "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
329a7c1aaa6SIan Rogers        "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group",
330a7c1aaa6SIan Rogers        "MetricName": "tma_store_latency",
331a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually have less impact on out-of-order core performance; however; holding resources for a longer time can lead to undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)",
332a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
333a7c1aaa6SIan Rogers    },
334a7c1aaa6SIan Rogers    {
335a7c1aaa6SIan Rogers        "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
336*69f685e0SIan Rogers        "MetricExpr": "54 * Average_Frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS",
337a7c1aaa6SIan Rogers        "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group",
338a7c1aaa6SIan Rogers        "MetricName": "tma_false_sharing",
339a7c1aaa6SIan Rogers        "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line.  Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
340a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
341a7c1aaa6SIan Rogers    },
342a7c1aaa6SIan Rogers    {
343a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents rate of split store accesses",
344a7c1aaa6SIan Rogers        "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS",
345a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL4;tma_store_bound_group",
346a7c1aaa6SIan Rogers        "MetricName": "tma_split_stores",
347a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents rate of split store accesses.  Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS",
348a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
349a7c1aaa6SIan Rogers    },
350a7c1aaa6SIan Rogers    {
351a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming stores optimize out a read request required by RFO stores",
352a7c1aaa6SIan Rogers        "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / CLKS",
353a7c1aaa6SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_store_bound_group",
354a7c1aaa6SIan Rogers        "MetricName": "tma_streaming_stores",
355a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming stores optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE",
356a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
357a7c1aaa6SIan Rogers    },
358a7c1aaa6SIan Rogers    {
359a7c1aaa6SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
360a7c1aaa6SIan Rogers        "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS",
361a7c1aaa6SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group",
362a7c1aaa6SIan Rogers        "MetricName": "tma_dtlb_store",
363a7c1aaa6SIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS",
364a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
365a7c1aaa6SIan Rogers    },
366a7c1aaa6SIan Rogers    {
367a7c1aaa6SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)",
368a7c1aaa6SIan Rogers        "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss",
369a7c1aaa6SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group",
370a7c1aaa6SIan Rogers        "MetricName": "tma_store_stlb_hit",
371a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
372a7c1aaa6SIan Rogers    },
373a7c1aaa6SIan Rogers    {
374a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk",
375a7c1aaa6SIan Rogers        "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS",
376a7c1aaa6SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group",
377a7c1aaa6SIan Rogers        "MetricName": "tma_store_stlb_miss",
378a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
379a7c1aaa6SIan Rogers    },
380a7c1aaa6SIan Rogers    {
381a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck",
382a7c1aaa6SIan Rogers        "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)",
383a7c1aaa6SIan Rogers        "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group",
384a7c1aaa6SIan Rogers        "MetricName": "tma_core_bound",
385a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
386a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
387a7c1aaa6SIan Rogers    },
388a7c1aaa6SIan Rogers    {
389a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
390a7c1aaa6SIan Rogers        "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS",
391a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL3;tma_core_bound_group",
392a7c1aaa6SIan Rogers        "MetricName": "tma_divider",
393a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
394a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
395a7c1aaa6SIan Rogers    },
396a7c1aaa6SIan Rogers    {
397a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
398*69f685e0SIan Rogers        "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS)",
399a7c1aaa6SIan Rogers        "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group",
400a7c1aaa6SIan Rogers        "MetricName": "tma_ports_utilization",
401a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related).  Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
402a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
403a7c1aaa6SIan Rogers    },
404a7c1aaa6SIan Rogers    {
405a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
406a7c1aaa6SIan Rogers        "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / CLKS + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS",
407a7c1aaa6SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
408a7c1aaa6SIan Rogers        "MetricName": "tma_ports_utilized_0",
409a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
410a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
411a7c1aaa6SIan Rogers    },
412a7c1aaa6SIan Rogers    {
413a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
414a7c1aaa6SIan Rogers        "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / CLKS",
415a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL5;tma_ports_utilized_0_group",
416a7c1aaa6SIan Rogers        "MetricName": "tma_serializing_operation",
417a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD",
418a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
419a7c1aaa6SIan Rogers    },
420a7c1aaa6SIan Rogers    {
421a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
422a7c1aaa6SIan Rogers        "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / CLKS",
423a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL6;tma_serializing_operation_group",
424a7c1aaa6SIan Rogers        "MetricName": "tma_slow_pause",
425a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
426a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
427a7c1aaa6SIan Rogers    },
428a7c1aaa6SIan Rogers    {
429a7c1aaa6SIan Rogers        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
430a7c1aaa6SIan Rogers        "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
431a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL5;tma_ports_utilized_0_group",
432a7c1aaa6SIan Rogers        "MetricName": "tma_mixing_vectors",
433a7c1aaa6SIan Rogers        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.",
434a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
435a7c1aaa6SIan Rogers    },
436a7c1aaa6SIan Rogers    {
437a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
438a7c1aaa6SIan Rogers        "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / CLKS",
439a7c1aaa6SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
440a7c1aaa6SIan Rogers        "MetricName": "tma_ports_utilized_1",
441a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Sample with: EXE_ACTIVITY.1_PORTS_UTIL",
442a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
443a7c1aaa6SIan Rogers    },
444a7c1aaa6SIan Rogers    {
445a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
446a7c1aaa6SIan Rogers        "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / CLKS",
447a7c1aaa6SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
448a7c1aaa6SIan Rogers        "MetricName": "tma_ports_utilized_2",
449a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL",
450a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
451a7c1aaa6SIan Rogers    },
452a7c1aaa6SIan Rogers    {
453a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
454a7c1aaa6SIan Rogers        "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / CLKS",
455a7c1aaa6SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
456a7c1aaa6SIan Rogers        "MetricName": "tma_ports_utilized_3m",
457a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
458a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
459a7c1aaa6SIan Rogers    },
460a7c1aaa6SIan Rogers    {
461a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
462a7c1aaa6SIan Rogers        "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * CORE_CLKS)",
463a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
464a7c1aaa6SIan Rogers        "MetricName": "tma_alu_op_utilization",
465a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
466a7c1aaa6SIan Rogers    },
467a7c1aaa6SIan Rogers    {
468a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0",
469a7c1aaa6SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_0 / CORE_CLKS",
470a7c1aaa6SIan Rogers        "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group",
471a7c1aaa6SIan Rogers        "MetricName": "tma_port_0",
472a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
473a7c1aaa6SIan Rogers    },
474a7c1aaa6SIan Rogers    {
475a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1",
476a7c1aaa6SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_1 / CORE_CLKS",
477a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
478a7c1aaa6SIan Rogers        "MetricName": "tma_port_1",
479a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
480a7c1aaa6SIan Rogers    },
481a7c1aaa6SIan Rogers    {
482a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5",
483a7c1aaa6SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_5 / CORE_CLKS",
484a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
485a7c1aaa6SIan Rogers        "MetricName": "tma_port_5",
486a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
487a7c1aaa6SIan Rogers    },
488a7c1aaa6SIan Rogers    {
489a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6",
490a7c1aaa6SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_6 / CORE_CLKS",
491a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
492a7c1aaa6SIan Rogers        "MetricName": "tma_port_6",
493a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
494a7c1aaa6SIan Rogers    },
495a7c1aaa6SIan Rogers    {
496a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3",
497a7c1aaa6SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * CORE_CLKS)",
498a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
499a7c1aaa6SIan Rogers        "MetricName": "tma_load_op_utilization",
500a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
501a7c1aaa6SIan Rogers    },
502a7c1aaa6SIan Rogers    {
503a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8",
504a7c1aaa6SIan Rogers        "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * CORE_CLKS)",
505a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
506a7c1aaa6SIan Rogers        "MetricName": "tma_store_op_utilization",
507a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
508a7c1aaa6SIan Rogers    },
509a7c1aaa6SIan Rogers    {
510a7c1aaa6SIan Rogers        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
511a7c1aaa6SIan Rogers        "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * SLOTS",
512a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL1;tma_L1_group",
513a7c1aaa6SIan Rogers        "MetricName": "tma_retiring",
514a7c1aaa6SIan Rogers        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided.  Sample with: UOPS_RETIRED.SLOTS",
515a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
516a7c1aaa6SIan Rogers    },
517a7c1aaa6SIan Rogers    {
518a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
519a7c1aaa6SIan Rogers        "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)",
520a7c1aaa6SIan Rogers        "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
521a7c1aaa6SIan Rogers        "MetricName": "tma_light_operations",
522a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
523a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
524a7c1aaa6SIan Rogers    },
525a7c1aaa6SIan Rogers    {
526a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
527a7c1aaa6SIan Rogers        "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
528a7c1aaa6SIan Rogers        "MetricGroup": "HPC;TopdownL3;tma_light_operations_group",
529a7c1aaa6SIan Rogers        "MetricName": "tma_fp_arith",
530a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
531a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
532a7c1aaa6SIan Rogers    },
533a7c1aaa6SIan Rogers    {
534a7c1aaa6SIan Rogers        "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
535a7c1aaa6SIan Rogers        "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD",
536a7c1aaa6SIan Rogers        "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group",
537a7c1aaa6SIan Rogers        "MetricName": "tma_x87_use",
538a7c1aaa6SIan Rogers        "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
539a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
540a7c1aaa6SIan Rogers    },
541a7c1aaa6SIan Rogers    {
542a7c1aaa6SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
543a7c1aaa6SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / (tma_retiring * SLOTS)",
544a7c1aaa6SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
545a7c1aaa6SIan Rogers        "MetricName": "tma_fp_scalar",
546a7c1aaa6SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.",
547a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
548a7c1aaa6SIan Rogers    },
549a7c1aaa6SIan Rogers    {
550a7c1aaa6SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
551a7c1aaa6SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)",
552a7c1aaa6SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
553a7c1aaa6SIan Rogers        "MetricName": "tma_fp_vector",
554a7c1aaa6SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.",
555a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
556a7c1aaa6SIan Rogers    },
557a7c1aaa6SIan Rogers    {
558a7c1aaa6SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
559a7c1aaa6SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * SLOTS)",
560a7c1aaa6SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
561a7c1aaa6SIan Rogers        "MetricName": "tma_fp_vector_128b",
562a7c1aaa6SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.",
563a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
564a7c1aaa6SIan Rogers    },
565a7c1aaa6SIan Rogers    {
566a7c1aaa6SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
567a7c1aaa6SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)",
568a7c1aaa6SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
569a7c1aaa6SIan Rogers        "MetricName": "tma_fp_vector_256b",
570a7c1aaa6SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.",
571a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
572a7c1aaa6SIan Rogers    },
573a7c1aaa6SIan Rogers    {
574a7c1aaa6SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors",
575a7c1aaa6SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * SLOTS)",
576a7c1aaa6SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
577a7c1aaa6SIan Rogers        "MetricName": "tma_fp_vector_512b",
578a7c1aaa6SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. May overcount due to FMA double counting.",
579a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
580a7c1aaa6SIan Rogers    },
581a7c1aaa6SIan Rogers    {
582a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.",
583a7c1aaa6SIan Rogers        "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY",
584a7c1aaa6SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
585a7c1aaa6SIan Rogers        "MetricName": "tma_memory_operations",
586a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
587a7c1aaa6SIan Rogers    },
588a7c1aaa6SIan Rogers    {
589a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.",
590a7c1aaa6SIan Rogers        "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * SLOTS)",
591a7c1aaa6SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
592a7c1aaa6SIan Rogers        "MetricName": "tma_branch_instructions",
593a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
594a7c1aaa6SIan Rogers    },
595a7c1aaa6SIan Rogers    {
596a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
597a7c1aaa6SIan Rogers        "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * SLOTS)",
598a7c1aaa6SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
599a7c1aaa6SIan Rogers        "MetricName": "tma_nop_instructions",
600a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
601a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
602a7c1aaa6SIan Rogers    },
603a7c1aaa6SIan Rogers    {
604a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting",
605a7c1aaa6SIan Rogers        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))",
606a7c1aaa6SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
607a7c1aaa6SIan Rogers        "MetricName": "tma_other_light_ops",
608a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
609a7c1aaa6SIan Rogers    },
610a7c1aaa6SIan Rogers    {
611a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences",
612a7c1aaa6SIan Rogers        "MetricExpr": "tma_microcode_sequencer + tma_retiring * (UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=1@) / IDQ.MITE_UOPS",
613a7c1aaa6SIan Rogers        "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
614a7c1aaa6SIan Rogers        "MetricName": "tma_heavy_operations",
615a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.",
616a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
617a7c1aaa6SIan Rogers    },
618a7c1aaa6SIan Rogers    {
619a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops",
620a7c1aaa6SIan Rogers        "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer",
621a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL3;tma_heavy_operations_group",
622a7c1aaa6SIan Rogers        "MetricName": "tma_few_uops_instructions",
623a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions.",
624a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
625a7c1aaa6SIan Rogers    },
626a7c1aaa6SIan Rogers    {
627a7c1aaa6SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
628*69f685e0SIan Rogers        "MetricExpr": "tma_retiring * SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / SLOTS",
629a7c1aaa6SIan Rogers        "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group",
630a7c1aaa6SIan Rogers        "MetricName": "tma_microcode_sequencer",
631a7c1aaa6SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS",
632a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
633a7c1aaa6SIan Rogers    },
634a7c1aaa6SIan Rogers    {
635a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
636a7c1aaa6SIan Rogers        "MetricExpr": "100 * ASSISTS.ANY / SLOTS",
637a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
638a7c1aaa6SIan Rogers        "MetricName": "tma_assists",
639a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: ASSISTS.ANY",
640a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
641a7c1aaa6SIan Rogers    },
642a7c1aaa6SIan Rogers    {
643a7c1aaa6SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
644a7c1aaa6SIan Rogers        "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
645a7c1aaa6SIan Rogers        "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
646a7c1aaa6SIan Rogers        "MetricName": "tma_cisc",
647a7c1aaa6SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.",
648a7c1aaa6SIan Rogers        "ScaleUnit": "100%"
649a7c1aaa6SIan Rogers    },
650a7c1aaa6SIan Rogers    {
651a7c1aaa6SIan Rogers        "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
652a7c1aaa6SIan Rogers        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
653a7c1aaa6SIan Rogers        "MetricGroup": "Bad;BadSpec;BrMispredicts",
654a7c1aaa6SIan Rogers        "MetricName": "Mispredictions"
655a7c1aaa6SIan Rogers    },
656a7c1aaa6SIan Rogers    {
657a7c1aaa6SIan Rogers        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
658*69f685e0SIan Rogers        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
659a7c1aaa6SIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
660a7c1aaa6SIan Rogers        "MetricName": "Memory_Bandwidth"
661a7c1aaa6SIan Rogers    },
662a7c1aaa6SIan Rogers    {
663a7c1aaa6SIan Rogers        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
664*69f685e0SIan Rogers        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
665a7c1aaa6SIan Rogers        "MetricGroup": "Mem;MemoryLat;Offcore",
666a7c1aaa6SIan Rogers        "MetricName": "Memory_Latency"
667a7c1aaa6SIan Rogers    },
668a7c1aaa6SIan Rogers    {
669a7c1aaa6SIan Rogers        "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
670*69f685e0SIan Rogers        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
671a7c1aaa6SIan Rogers        "MetricGroup": "Mem;MemoryTLB;Offcore",
672a7c1aaa6SIan Rogers        "MetricName": "Memory_Data_TLBs"
673a7c1aaa6SIan Rogers    },
674a7c1aaa6SIan Rogers    {
6755e1dd4f2SIan Rogers        "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
676a7c1aaa6SIan Rogers        "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)",
6775e1dd4f2SIan Rogers        "MetricGroup": "Ret",
6785e1dd4f2SIan Rogers        "MetricName": "Branching_Overhead"
6795e1dd4f2SIan Rogers    },
6805e1dd4f2SIan Rogers    {
6815e1dd4f2SIan Rogers        "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
682a7c1aaa6SIan Rogers        "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
6835e1dd4f2SIan Rogers        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB",
6845e1dd4f2SIan Rogers        "MetricName": "Big_Code"
6855e1dd4f2SIan Rogers    },
6865e1dd4f2SIan Rogers    {
687a7c1aaa6SIan Rogers        "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
688a7c1aaa6SIan Rogers        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code",
689a7c1aaa6SIan Rogers        "MetricGroup": "Fed;FetchBW;Frontend",
690a7c1aaa6SIan Rogers        "MetricName": "Instruction_Fetch_BW"
691a7c1aaa6SIan Rogers    },
692a7c1aaa6SIan Rogers    {
693b9efd75bSJin Yao        "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
694a7c1aaa6SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / CLKS",
6955e1dd4f2SIan Rogers        "MetricGroup": "Ret;Summary",
696b9efd75bSJin Yao        "MetricName": "IPC"
697b9efd75bSJin Yao    },
698b9efd75bSJin Yao    {
699a7c1aaa6SIan Rogers        "BriefDescription": "Uops Per Instruction",
700*69f685e0SIan Rogers        "MetricExpr": "tma_retiring * SLOTS / INST_RETIRED.ANY",
701a7c1aaa6SIan Rogers        "MetricGroup": "Pipeline;Ret;Retire",
702a7c1aaa6SIan Rogers        "MetricName": "UPI"
703a7c1aaa6SIan Rogers    },
704a7c1aaa6SIan Rogers    {
705a7c1aaa6SIan Rogers        "BriefDescription": "Uops per taken branch",
706*69f685e0SIan Rogers        "MetricExpr": "tma_retiring * SLOTS / BR_INST_RETIRED.NEAR_TAKEN",
707a7c1aaa6SIan Rogers        "MetricGroup": "Branches;Fed;FetchBW",
708a7c1aaa6SIan Rogers        "MetricName": "UpTB"
709a7c1aaa6SIan Rogers    },
710a7c1aaa6SIan Rogers    {
711b9efd75bSJin Yao        "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
712a7c1aaa6SIan Rogers        "MetricExpr": "1 / IPC",
713a7c1aaa6SIan Rogers        "MetricGroup": "Mem;Pipeline",
714b9efd75bSJin Yao        "MetricName": "CPI"
715b9efd75bSJin Yao    },
716b9efd75bSJin Yao    {
717b9efd75bSJin Yao        "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
718b9efd75bSJin Yao        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
719b9efd75bSJin Yao        "MetricGroup": "Pipeline",
720b9efd75bSJin Yao        "MetricName": "CLKS"
721b9efd75bSJin Yao    },
722b9efd75bSJin Yao    {
7235e1dd4f2SIan Rogers        "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
7245e1dd4f2SIan Rogers        "MetricExpr": "TOPDOWN.SLOTS",
725a7c1aaa6SIan Rogers        "MetricGroup": "tma_L1_group",
7265e1dd4f2SIan Rogers        "MetricName": "SLOTS"
7275e1dd4f2SIan Rogers    },
7285e1dd4f2SIan Rogers    {
7295e1dd4f2SIan Rogers        "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor",
730*69f685e0SIan Rogers        "MetricExpr": "(SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)",
731a7c1aaa6SIan Rogers        "MetricGroup": "SMT;tma_L1_group",
7325e1dd4f2SIan Rogers        "MetricName": "Slots_Utilization"
7335e1dd4f2SIan Rogers    },
7345e1dd4f2SIan Rogers    {
7355e1dd4f2SIan Rogers        "BriefDescription": "The ratio of Executed- by Issued-Uops",
7365e1dd4f2SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY",
7375e1dd4f2SIan Rogers        "MetricGroup": "Cor;Pipeline",
7385e1dd4f2SIan Rogers        "MetricName": "Execute_per_Issue",
7395e1dd4f2SIan Rogers        "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggests high rate of \"execute\" at rename stage."
7405e1dd4f2SIan Rogers    },
7415e1dd4f2SIan Rogers    {
7425e1dd4f2SIan Rogers        "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
743a7c1aaa6SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS",
744a7c1aaa6SIan Rogers        "MetricGroup": "Ret;SMT;tma_L1_group",
745b9efd75bSJin Yao        "MetricName": "CoreIPC"
746b9efd75bSJin Yao    },
747b9efd75bSJin Yao    {
748b9efd75bSJin Yao        "BriefDescription": "Floating Point Operations Per Cycle",
749*69f685e0SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / CORE_CLKS",
750a7c1aaa6SIan Rogers        "MetricGroup": "Flops;Ret",
751b9efd75bSJin Yao        "MetricName": "FLOPc"
752b9efd75bSJin Yao    },
753b9efd75bSJin Yao    {
7545e1dd4f2SIan Rogers        "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
755*69f685e0SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)) / (2 * CORE_CLKS)",
7565e1dd4f2SIan Rogers        "MetricGroup": "Cor;Flops;HPC",
7575e1dd4f2SIan Rogers        "MetricName": "FP_Arith_Utilization",
7585e1dd4f2SIan Rogers        "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
759b9efd75bSJin Yao    },
760b9efd75bSJin Yao    {
7615e1dd4f2SIan Rogers        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
762*69f685e0SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
7635e1dd4f2SIan Rogers        "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
7645e1dd4f2SIan Rogers        "MetricName": "ILP"
765b9efd75bSJin Yao    },
766b9efd75bSJin Yao    {
767a7c1aaa6SIan Rogers        "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
768*69f685e0SIan Rogers        "MetricExpr": "((1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0)",
769a7c1aaa6SIan Rogers        "MetricGroup": "Cor;SMT",
770a7c1aaa6SIan Rogers        "MetricName": "Core_Bound_Likely"
771a7c1aaa6SIan Rogers    },
772a7c1aaa6SIan Rogers    {
773b9efd75bSJin Yao        "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
774b9efd75bSJin Yao        "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
775b9efd75bSJin Yao        "MetricGroup": "SMT",
776b9efd75bSJin Yao        "MetricName": "CORE_CLKS"
777b9efd75bSJin Yao    },
778b9efd75bSJin Yao    {
779b9efd75bSJin Yao        "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)",
780b9efd75bSJin Yao        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS",
781b9efd75bSJin Yao        "MetricGroup": "InsType",
782b9efd75bSJin Yao        "MetricName": "IpLoad"
783b9efd75bSJin Yao    },
784b9efd75bSJin Yao    {
785b9efd75bSJin Yao        "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
786b9efd75bSJin Yao        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
787b9efd75bSJin Yao        "MetricGroup": "InsType",
788b9efd75bSJin Yao        "MetricName": "IpStore"
789b9efd75bSJin Yao    },
790b9efd75bSJin Yao    {
791b9efd75bSJin Yao        "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
792b9efd75bSJin Yao        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
7935e1dd4f2SIan Rogers        "MetricGroup": "Branches;Fed;InsType",
794b9efd75bSJin Yao        "MetricName": "IpBranch"
795b9efd75bSJin Yao    },
796b9efd75bSJin Yao    {
797b9efd75bSJin Yao        "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)",
798b9efd75bSJin Yao        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
7995e1dd4f2SIan Rogers        "MetricGroup": "Branches;Fed;PGO",
800b9efd75bSJin Yao        "MetricName": "IpCall"
801b9efd75bSJin Yao    },
802b9efd75bSJin Yao    {
8035e1dd4f2SIan Rogers        "BriefDescription": "Instructions per taken branch",
8045e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
8055e1dd4f2SIan Rogers        "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO",
8065e1dd4f2SIan Rogers        "MetricName": "IpTB"
8075e1dd4f2SIan Rogers    },
8085e1dd4f2SIan Rogers    {
809b9efd75bSJin Yao        "BriefDescription": "Branch instructions per taken branch",
810b9efd75bSJin Yao        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
8115e1dd4f2SIan Rogers        "MetricGroup": "Branches;Fed;PGO",
812b9efd75bSJin Yao        "MetricName": "BpTkBranch"
813b9efd75bSJin Yao    },
814b9efd75bSJin Yao    {
815b9efd75bSJin Yao        "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
816*69f685e0SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
8175e1dd4f2SIan Rogers        "MetricGroup": "Flops;InsType",
818b9efd75bSJin Yao        "MetricName": "IpFLOP"
819b9efd75bSJin Yao    },
820b9efd75bSJin Yao    {
8215e1dd4f2SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
822*69f685e0SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE))",
8235e1dd4f2SIan Rogers        "MetricGroup": "Flops;InsType",
8245e1dd4f2SIan Rogers        "MetricName": "IpArith",
8255e1dd4f2SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
8265e1dd4f2SIan Rogers    },
8275e1dd4f2SIan Rogers    {
8285e1dd4f2SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
8295e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
8305e1dd4f2SIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
8315e1dd4f2SIan Rogers        "MetricName": "IpArith_Scalar_SP",
8325e1dd4f2SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
8335e1dd4f2SIan Rogers    },
8345e1dd4f2SIan Rogers    {
8355e1dd4f2SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
8365e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
8375e1dd4f2SIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
8385e1dd4f2SIan Rogers        "MetricName": "IpArith_Scalar_DP",
8395e1dd4f2SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
8405e1dd4f2SIan Rogers    },
8415e1dd4f2SIan Rogers    {
8425e1dd4f2SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
8435e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
8445e1dd4f2SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
8455e1dd4f2SIan Rogers        "MetricName": "IpArith_AVX128",
8465e1dd4f2SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
8475e1dd4f2SIan Rogers    },
8485e1dd4f2SIan Rogers    {
8495e1dd4f2SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
8505e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
8515e1dd4f2SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
8525e1dd4f2SIan Rogers        "MetricName": "IpArith_AVX256",
8535e1dd4f2SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
8545e1dd4f2SIan Rogers    },
8555e1dd4f2SIan Rogers    {
8565e1dd4f2SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
8575e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
8585e1dd4f2SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
8595e1dd4f2SIan Rogers        "MetricName": "IpArith_AVX512",
8605e1dd4f2SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
8615e1dd4f2SIan Rogers    },
8625e1dd4f2SIan Rogers    {
8635e1dd4f2SIan Rogers        "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)",
8645e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@",
8655e1dd4f2SIan Rogers        "MetricGroup": "Prefetches",
8665e1dd4f2SIan Rogers        "MetricName": "IpSWPF"
8675e1dd4f2SIan Rogers    },
8685e1dd4f2SIan Rogers    {
869a7c1aaa6SIan Rogers        "BriefDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST",
870b9efd75bSJin Yao        "MetricExpr": "INST_RETIRED.ANY",
871a7c1aaa6SIan Rogers        "MetricGroup": "Summary;tma_L1_group",
872b9efd75bSJin Yao        "MetricName": "Instructions"
873b9efd75bSJin Yao    },
874b9efd75bSJin Yao    {
875a7c1aaa6SIan Rogers        "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
876*69f685e0SIan Rogers        "MetricExpr": "tma_retiring * SLOTS / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
877a7c1aaa6SIan Rogers        "MetricGroup": "Pipeline;Ret",
878a7c1aaa6SIan Rogers        "MetricName": "Retire"
879a7c1aaa6SIan Rogers    },
880a7c1aaa6SIan Rogers    {
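8815e1dd4f2SIan Rogers        "BriefDescription": "Average number of Uops executed per cycle, counted over cycles where at least one uop was executed (per Logical Processor)",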
8825e1dd4f2SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
8835e1dd4f2SIan Rogers        "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
8845e1dd4f2SIan Rogers        "MetricName": "Execute"
8855e1dd4f2SIan Rogers    },
8865e1dd4f2SIan Rogers    {
8875e1dd4f2SIan Rogers        "BriefDescription": "Average number of Uops issued by front-end when it issued something",
8885e1dd4f2SIan Rogers        "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@",
8895e1dd4f2SIan Rogers        "MetricGroup": "Fed;FetchBW",
8905e1dd4f2SIan Rogers        "MetricName": "Fetch_UpC"
8915e1dd4f2SIan Rogers    },
8925e1dd4f2SIan Rogers    {
893b9efd75bSJin Yao        "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)",
894b9efd75bSJin Yao        "MetricExpr": "LSD.UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
8955e1dd4f2SIan Rogers        "MetricGroup": "Fed;LSD",
896b9efd75bSJin Yao        "MetricName": "LSD_Coverage"
897b9efd75bSJin Yao    },
898b9efd75bSJin Yao    {
899b9efd75bSJin Yao        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
900b9efd75bSJin Yao        "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
9015e1dd4f2SIan Rogers        "MetricGroup": "DSB;Fed;FetchBW",
902b9efd75bSJin Yao        "MetricName": "DSB_Coverage"
903b9efd75bSJin Yao    },
904b9efd75bSJin Yao    {
9055e1dd4f2SIan Rogers        "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.",
9065e1dd4f2SIan Rogers        "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@",
9075e1dd4f2SIan Rogers        "MetricGroup": "DSBmiss",
9085e1dd4f2SIan Rogers        "MetricName": "DSB_Switch_Cost"
9095e1dd4f2SIan Rogers    },
9105e1dd4f2SIan Rogers    {
911a7c1aaa6SIan Rogers        "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.",
912a7c1aaa6SIan Rogers        "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))",
913a7c1aaa6SIan Rogers        "MetricGroup": "DSBmiss;Fed",
914a7c1aaa6SIan Rogers        "MetricName": "DSB_Misses"
915a7c1aaa6SIan Rogers    },
916a7c1aaa6SIan Rogers    {
9175e1dd4f2SIan Rogers        "BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)",
9185e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS",
9195e1dd4f2SIan Rogers        "MetricGroup": "DSBmiss;Fed",
9205e1dd4f2SIan Rogers        "MetricName": "IpDSB_Miss_Ret"
9215e1dd4f2SIan Rogers    },
9225e1dd4f2SIan Rogers    {
9235e1dd4f2SIan Rogers        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
9245e1dd4f2SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
9255e1dd4f2SIan Rogers        "MetricGroup": "Bad;BadSpec;BrMispredicts",
9265e1dd4f2SIan Rogers        "MetricName": "IpMispredict"
9275e1dd4f2SIan Rogers    },
9285e1dd4f2SIan Rogers    {
929a7c1aaa6SIan Rogers        "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
930a7c1aaa6SIan Rogers        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES",
931a7c1aaa6SIan Rogers        "MetricGroup": "Bad;BrMispredicts",
932a7c1aaa6SIan Rogers        "MetricName": "Branch_Misprediction_Cost"
933a7c1aaa6SIan Rogers    },
934a7c1aaa6SIan Rogers    {
9355e1dd4f2SIan Rogers        "BriefDescription": "Fraction of branches that are non-taken conditionals",
9365e1dd4f2SIan Rogers        "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES",
9375e1dd4f2SIan Rogers        "MetricGroup": "Bad;Branches;CodeGen;PGO",
9385e1dd4f2SIan Rogers        "MetricName": "Cond_NT"
9395e1dd4f2SIan Rogers    },
9405e1dd4f2SIan Rogers    {
9415e1dd4f2SIan Rogers        "BriefDescription": "Fraction of branches that are taken conditionals",
9425e1dd4f2SIan Rogers        "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES",
9435e1dd4f2SIan Rogers        "MetricGroup": "Bad;Branches;CodeGen;PGO",
9445e1dd4f2SIan Rogers        "MetricName": "Cond_TK"
9455e1dd4f2SIan Rogers    },
9465e1dd4f2SIan Rogers    {
9475e1dd4f2SIan Rogers        "BriefDescription": "Fraction of branches that are CALL or RET",
9485e1dd4f2SIan Rogers        "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
9495e1dd4f2SIan Rogers        "MetricGroup": "Bad;Branches",
9505e1dd4f2SIan Rogers        "MetricName": "CallRet"
9515e1dd4f2SIan Rogers    },
9525e1dd4f2SIan Rogers    {
9535e1dd4f2SIan Rogers        "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps",
9545e1dd4f2SIan Rogers        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
9555e1dd4f2SIan Rogers        "MetricGroup": "Bad;Branches",
9565e1dd4f2SIan Rogers        "MetricName": "Jump"
9575e1dd4f2SIan Rogers    },
9585e1dd4f2SIan Rogers    {
9595e1dd4f2SIan Rogers        "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)",
960a7c1aaa6SIan Rogers        "MetricExpr": "1 - (Cond_NT + Cond_TK + CallRet + Jump)",
9615e1dd4f2SIan Rogers        "MetricGroup": "Bad;Branches",
9625e1dd4f2SIan Rogers        "MetricName": "Other_Branches"
9635e1dd4f2SIan Rogers    },
9645e1dd4f2SIan Rogers    {
9655e1dd4f2SIan Rogers        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
966b9efd75bSJin Yao        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
9675e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBound;MemoryLat",
968b9efd75bSJin Yao        "MetricName": "Load_Miss_Real_Latency"
969b9efd75bSJin Yao    },
970b9efd75bSJin Yao    {
971b9efd75bSJin Yao        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand loads when there is at least one such miss. Per-Logical Processor)",
972b9efd75bSJin Yao        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
973a7c1aaa6SIan Rogers        "MetricGroup": "Mem;MemoryBW;MemoryBound",
974b9efd75bSJin Yao        "MetricName": "MLP"
975b9efd75bSJin Yao    },
976b9efd75bSJin Yao    {
977b9efd75bSJin Yao        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
978*69f685e0SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
979a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem",
980b9efd75bSJin Yao        "MetricName": "L1MPKI"
981b9efd75bSJin Yao    },
982b9efd75bSJin Yao    {
9835e1dd4f2SIan Rogers        "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
984*69f685e0SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
985a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem",
9865e1dd4f2SIan Rogers        "MetricName": "L1MPKI_Load"
9875e1dd4f2SIan Rogers    },
9885e1dd4f2SIan Rogers    {
989b9efd75bSJin Yao        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
990*69f685e0SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
991a7c1aaa6SIan Rogers        "MetricGroup": "Backend;CacheMisses;Mem",
992b9efd75bSJin Yao        "MetricName": "L2MPKI"
993b9efd75bSJin Yao    },
994b9efd75bSJin Yao    {
9955e1dd4f2SIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
996*69f685e0SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
997a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem;Offcore",
9985e1dd4f2SIan Rogers        "MetricName": "L2MPKI_All"
9995e1dd4f2SIan Rogers    },
10005e1dd4f2SIan Rogers    {
10015e1dd4f2SIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
1002*69f685e0SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
1003a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem",
10045e1dd4f2SIan Rogers        "MetricName": "L2MPKI_Load"
10055e1dd4f2SIan Rogers    },
10065e1dd4f2SIan Rogers    {
10075e1dd4f2SIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
1008*69f685e0SIan Rogers        "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
1009a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem",
10105e1dd4f2SIan Rogers        "MetricName": "L2HPKI_All"
10115e1dd4f2SIan Rogers    },
10125e1dd4f2SIan Rogers    {
10135e1dd4f2SIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
1014*69f685e0SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
1015a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem",
10165e1dd4f2SIan Rogers        "MetricName": "L2HPKI_Load"
10175e1dd4f2SIan Rogers    },
10185e1dd4f2SIan Rogers    {
1019b9efd75bSJin Yao        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
1020*69f685e0SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
1021a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem",
1022b9efd75bSJin Yao        "MetricName": "L3MPKI"
1023b9efd75bSJin Yao    },
1024b9efd75bSJin Yao    {
10255e1dd4f2SIan Rogers        "BriefDescription": "Fill Buffer (FB) hits per kilo instruction for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
1026*69f685e0SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
1027a7c1aaa6SIan Rogers        "MetricGroup": "CacheMisses;Mem",
10285e1dd4f2SIan Rogers        "MetricName": "FB_HPKI"
10295e1dd4f2SIan Rogers    },
10305e1dd4f2SIan Rogers    {
10315e1dd4f2SIan Rogers        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
10325e1dd4f2SIan Rogers        "MetricConstraint": "NO_NMI_WATCHDOG",
1033a7c1aaa6SIan Rogers        "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * CORE_CLKS)",
10345e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryTLB",
10355e1dd4f2SIan Rogers        "MetricName": "Page_Walks_Utilization"
10365e1dd4f2SIan Rogers    },
10375e1dd4f2SIan Rogers    {
10385e1dd4f2SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
1039*69f685e0SIan Rogers        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
10405e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW",
10415e1dd4f2SIan Rogers        "MetricName": "L1D_Cache_Fill_BW"
10425e1dd4f2SIan Rogers    },
10435e1dd4f2SIan Rogers    {
10445e1dd4f2SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
1045*69f685e0SIan Rogers        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
10465e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW",
10475e1dd4f2SIan Rogers        "MetricName": "L2_Cache_Fill_BW"
10485e1dd4f2SIan Rogers    },
10495e1dd4f2SIan Rogers    {
10505e1dd4f2SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
1051*69f685e0SIan Rogers        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
10525e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW",
10535e1dd4f2SIan Rogers        "MetricName": "L3_Cache_Fill_BW"
10545e1dd4f2SIan Rogers    },
10555e1dd4f2SIan Rogers    {
10565e1dd4f2SIan Rogers        "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
1057*69f685e0SIan Rogers        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
10585e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
10595e1dd4f2SIan Rogers        "MetricName": "L3_Cache_Access_BW"
10605e1dd4f2SIan Rogers    },
10615e1dd4f2SIan Rogers    {
10625e1dd4f2SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
1063a7c1aaa6SIan Rogers        "MetricExpr": "L1D_Cache_Fill_BW",
10645e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW",
10655e1dd4f2SIan Rogers        "MetricName": "L1D_Cache_Fill_BW_1T"
10665e1dd4f2SIan Rogers    },
10675e1dd4f2SIan Rogers    {
10685e1dd4f2SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
1069a7c1aaa6SIan Rogers        "MetricExpr": "L2_Cache_Fill_BW",
10705e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW",
10715e1dd4f2SIan Rogers        "MetricName": "L2_Cache_Fill_BW_1T"
10725e1dd4f2SIan Rogers    },
10735e1dd4f2SIan Rogers    {
10745e1dd4f2SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
1075a7c1aaa6SIan Rogers        "MetricExpr": "L3_Cache_Fill_BW",
10765e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW",
10775e1dd4f2SIan Rogers        "MetricName": "L3_Cache_Fill_BW_1T"
10785e1dd4f2SIan Rogers    },
10795e1dd4f2SIan Rogers    {
10805e1dd4f2SIan Rogers        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
1081a7c1aaa6SIan Rogers        "MetricExpr": "L3_Cache_Access_BW",
10825e1dd4f2SIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
10835e1dd4f2SIan Rogers        "MetricName": "L3_Cache_Access_BW_1T"
10845e1dd4f2SIan Rogers    },
10855e1dd4f2SIan Rogers    {
1086b9efd75bSJin Yao        "BriefDescription": "Average CPU Utilization",
1087*69f685e0SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
1088b9efd75bSJin Yao        "MetricGroup": "HPC;Summary",
1089b9efd75bSJin Yao        "MetricName": "CPU_Utilization"
1090b9efd75bSJin Yao    },
1091b9efd75bSJin Yao    {
1092b9efd75bSJin Yao        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
1093*69f685e0SIan Rogers        "MetricExpr": "Turbo_Utilization * TSC / 1e9 / duration_time",
1094a7c1aaa6SIan Rogers        "MetricGroup": "Power;Summary",
1095b9efd75bSJin Yao        "MetricName": "Average_Frequency"
1096b9efd75bSJin Yao    },
1097b9efd75bSJin Yao    {
1098b9efd75bSJin Yao        "BriefDescription": "Giga Floating Point Operations Per Second",
1099*69f685e0SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
11005e1dd4f2SIan Rogers        "MetricGroup": "Cor;Flops;HPC",
11015e1dd4f2SIan Rogers        "MetricName": "GFLOPs",
11025e1dd4f2SIan Rogers        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
1103b9efd75bSJin Yao    },
1104b9efd75bSJin Yao    {
1105b9efd75bSJin Yao        "BriefDescription": "Average Frequency Utilization relative to nominal frequency",
1106a7c1aaa6SIan Rogers        "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC",
1107b9efd75bSJin Yao        "MetricGroup": "Power",
1108b9efd75bSJin Yao        "MetricName": "Turbo_Utilization"
1109b9efd75bSJin Yao    },
1110b9efd75bSJin Yao    {
11115e1dd4f2SIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0",
1112a7c1aaa6SIan Rogers        "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / CORE_CLKS",
11135e1dd4f2SIan Rogers        "MetricGroup": "Power",
11145e1dd4f2SIan Rogers        "MetricName": "Power_License0_Utilization",
11155e1dd4f2SIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0.  This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes."
11165e1dd4f2SIan Rogers    },
11175e1dd4f2SIan Rogers    {
11185e1dd4f2SIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1",
1119a7c1aaa6SIan Rogers        "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / CORE_CLKS",
11205e1dd4f2SIan Rogers        "MetricGroup": "Power",
11215e1dd4f2SIan Rogers        "MetricName": "Power_License1_Utilization",
11225e1dd4f2SIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1.  This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions."
11235e1dd4f2SIan Rogers    },
11245e1dd4f2SIan Rogers    {
11255e1dd4f2SIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)",
1126a7c1aaa6SIan Rogers        "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / CORE_CLKS",
11275e1dd4f2SIan Rogers        "MetricGroup": "Power",
11285e1dd4f2SIan Rogers        "MetricName": "Power_License2_Utilization",
11295e1dd4f2SIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX).  This includes high current AVX 512-bit instructions."
11305e1dd4f2SIan Rogers    },
11315e1dd4f2SIan Rogers    {
1132b9efd75bSJin Yao        "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
1133*69f685e0SIan Rogers        "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)",
1134b9efd75bSJin Yao        "MetricGroup": "SMT",
1135b9efd75bSJin Yao        "MetricName": "SMT_2T_Utilization"
1136b9efd75bSJin Yao    },
1137b9efd75bSJin Yao    {
1138b9efd75bSJin Yao        "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
1139b9efd75bSJin Yao        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
1140b9efd75bSJin Yao        "MetricGroup": "OS",
1141b9efd75bSJin Yao        "MetricName": "Kernel_Utilization"
1142b9efd75bSJin Yao    },
1143b9efd75bSJin Yao    {
11445e1dd4f2SIan Rogers        "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
11455e1dd4f2SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
11465e1dd4f2SIan Rogers        "MetricGroup": "OS",
11475e1dd4f2SIan Rogers        "MetricName": "Kernel_CPI"
11485e1dd4f2SIan Rogers    },
11495e1dd4f2SIan Rogers    {
11505e1dd4f2SIan Rogers        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
1151*69f685e0SIan Rogers        "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3",
11525e1dd4f2SIan Rogers        "MetricGroup": "HPC;Mem;MemoryBW;SoC",
11535e1dd4f2SIan Rogers        "MetricName": "DRAM_BW_Use"
11545e1dd4f2SIan Rogers    },
11555e1dd4f2SIan Rogers    {
11565e1dd4f2SIan Rogers        "BriefDescription": "Average number of parallel requests to external memory. Accounts for all requests",
11575e1dd4f2SIan Rogers        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / arb@event\\=0x81\\,umask\\=0x1@",
11585e1dd4f2SIan Rogers        "MetricGroup": "Mem;SoC",
11595e1dd4f2SIan Rogers        "MetricName": "MEM_Parallel_Requests"
11605e1dd4f2SIan Rogers    },
11615e1dd4f2SIan Rogers    {
1162b9efd75bSJin Yao        "BriefDescription": "Instructions per Far Branch (Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
1163b9efd75bSJin Yao        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
1164b9efd75bSJin Yao        "MetricGroup": "Branches;OS",
1165b9efd75bSJin Yao        "MetricName": "IpFarBranch"
1166b9efd75bSJin Yao    },
1167b9efd75bSJin Yao    {
1168b9efd75bSJin Yao        "BriefDescription": "C6 residency percent per core",
1169*69f685e0SIan Rogers        "MetricExpr": "cstate_core@c6\\-residency@ / TSC",
1170b9efd75bSJin Yao        "MetricGroup": "Power",
1171*69f685e0SIan Rogers        "MetricName": "C6_Core_Residency",
1172*69f685e0SIan Rogers        "ScaleUnit": "100%"
1173b9efd75bSJin Yao    },
1174b9efd75bSJin Yao    {
1175b9efd75bSJin Yao        "BriefDescription": "C7 residency percent per core",
1176*69f685e0SIan Rogers        "MetricExpr": "cstate_core@c7\\-residency@ / TSC",
1177b9efd75bSJin Yao        "MetricGroup": "Power",
1178*69f685e0SIan Rogers        "MetricName": "C7_Core_Residency",
1179*69f685e0SIan Rogers        "ScaleUnit": "100%"
1180b9efd75bSJin Yao    },
1181b9efd75bSJin Yao    {
11825e1dd4f2SIan Rogers        "BriefDescription": "C2 residency percent per package",
1183*69f685e0SIan Rogers        "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC",
11845e1dd4f2SIan Rogers        "MetricGroup": "Power",
1185*69f685e0SIan Rogers        "MetricName": "C2_Pkg_Residency",
1186*69f685e0SIan Rogers        "ScaleUnit": "100%"
11875e1dd4f2SIan Rogers    },
11885e1dd4f2SIan Rogers    {
11895e1dd4f2SIan Rogers        "BriefDescription": "C3 residency percent per package",
1190*69f685e0SIan Rogers        "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC",
11915e1dd4f2SIan Rogers        "MetricGroup": "Power",
1192*69f685e0SIan Rogers        "MetricName": "C3_Pkg_Residency",
1193*69f685e0SIan Rogers        "ScaleUnit": "100%"
11945e1dd4f2SIan Rogers    },
11955e1dd4f2SIan Rogers    {
1196b9efd75bSJin Yao        "BriefDescription": "C6 residency percent per package",
1197*69f685e0SIan Rogers        "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC",
1198b9efd75bSJin Yao        "MetricGroup": "Power",
1199*69f685e0SIan Rogers        "MetricName": "C6_Pkg_Residency",
1200*69f685e0SIan Rogers        "ScaleUnit": "100%"
1201b9efd75bSJin Yao    },
1202b9efd75bSJin Yao    {
1203b9efd75bSJin Yao        "BriefDescription": "C7 residency percent per package",
1204*69f685e0SIan Rogers        "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC",
1205b9efd75bSJin Yao        "MetricGroup": "Power",
1206*69f685e0SIan Rogers        "MetricName": "C7_Pkg_Residency",
1207*69f685e0SIan Rogers        "ScaleUnit": "100%"
12085e1dd4f2SIan Rogers    },
12095e1dd4f2SIan Rogers    {
12105e1dd4f2SIan Rogers        "BriefDescription": "C8 residency percent per package",
1211*69f685e0SIan Rogers        "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC",
12125e1dd4f2SIan Rogers        "MetricGroup": "Power",
1213*69f685e0SIan Rogers        "MetricName": "C8_Pkg_Residency",
1214*69f685e0SIan Rogers        "ScaleUnit": "100%"
12155e1dd4f2SIan Rogers    },
12165e1dd4f2SIan Rogers    {
12175e1dd4f2SIan Rogers        "BriefDescription": "C9 residency percent per package",
1218*69f685e0SIan Rogers        "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC",
12195e1dd4f2SIan Rogers        "MetricGroup": "Power",
1220*69f685e0SIan Rogers        "MetricName": "C9_Pkg_Residency",
1221*69f685e0SIan Rogers        "ScaleUnit": "100%"
12225e1dd4f2SIan Rogers    },
12235e1dd4f2SIan Rogers    {
12245e1dd4f2SIan Rogers        "BriefDescription": "C10 residency percent per package",
1225*69f685e0SIan Rogers        "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC",
12265e1dd4f2SIan Rogers        "MetricGroup": "Power",
1227*69f685e0SIan Rogers        "MetricName": "C10_Pkg_Residency",
1228*69f685e0SIan Rogers        "ScaleUnit": "100%"
1229b9efd75bSJin Yao    }
1230b9efd75bSJin Yao]
1231