1cf979623SAndi Kleen[
2cf979623SAndi Kleen    {
31ab15f66SIan Rogers        "BriefDescription": "C2 residency percent per package",
41ab15f66SIan Rogers        "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC",
5cf979623SAndi Kleen        "MetricGroup": "Power",
61ab15f66SIan Rogers        "MetricName": "C2_Pkg_Residency",
71ab15f66SIan Rogers        "ScaleUnit": "100%"
8fec57a8eSIan Rogers    },
9fec57a8eSIan Rogers    {
1061ec07f5SHaiyan Song        "BriefDescription": "C3 residency percent per core",
11fec57a8eSIan Rogers        "MetricExpr": "cstate_core@c3\\-residency@ / TSC",
12cf979623SAndi Kleen        "MetricGroup": "Power",
13fec57a8eSIan Rogers        "MetricName": "C3_Core_Residency",
14fec57a8eSIan Rogers        "ScaleUnit": "100%"
15cf979623SAndi Kleen    },
16cf979623SAndi Kleen    {
1761ec07f5SHaiyan Song        "BriefDescription": "C3 residency percent per package",
18fec57a8eSIan Rogers        "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC",
19cf979623SAndi Kleen        "MetricGroup": "Power",
20fec57a8eSIan Rogers        "MetricName": "C3_Pkg_Residency",
21fec57a8eSIan Rogers        "ScaleUnit": "100%"
22cf979623SAndi Kleen    },
23cf979623SAndi Kleen    {
241ab15f66SIan Rogers        "BriefDescription": "C6 residency percent per core",
251ab15f66SIan Rogers        "MetricExpr": "cstate_core@c6\\-residency@ / TSC",
261ab15f66SIan Rogers        "MetricGroup": "Power",
271ab15f66SIan Rogers        "MetricName": "C6_Core_Residency",
281ab15f66SIan Rogers        "ScaleUnit": "100%"
291ab15f66SIan Rogers    },
301ab15f66SIan Rogers    {
3161ec07f5SHaiyan Song        "BriefDescription": "C6 residency percent per package",
32fec57a8eSIan Rogers        "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC",
33cf979623SAndi Kleen        "MetricGroup": "Power",
34fec57a8eSIan Rogers        "MetricName": "C6_Pkg_Residency",
35fec57a8eSIan Rogers        "ScaleUnit": "100%"
36cf979623SAndi Kleen    },
37cf979623SAndi Kleen    {
381ab15f66SIan Rogers        "BriefDescription": "C7 residency percent per core",
391ab15f66SIan Rogers        "MetricExpr": "cstate_core@c7\\-residency@ / TSC",
401ab15f66SIan Rogers        "MetricGroup": "Power",
411ab15f66SIan Rogers        "MetricName": "C7_Core_Residency",
421ab15f66SIan Rogers        "ScaleUnit": "100%"
431ab15f66SIan Rogers    },
441ab15f66SIan Rogers    {
4561ec07f5SHaiyan Song        "BriefDescription": "C7 residency percent per package",
46fec57a8eSIan Rogers        "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC",
47cf979623SAndi Kleen        "MetricGroup": "Power",
48fec57a8eSIan Rogers        "MetricName": "C7_Pkg_Residency",
49fec57a8eSIan Rogers        "ScaleUnit": "100%"
501ab15f66SIan Rogers    },
511ab15f66SIan Rogers    {
521ab15f66SIan Rogers        "BriefDescription": "Uncore frequency per die [GHz]",
531ab15f66SIan Rogers        "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9",
541ab15f66SIan Rogers        "MetricGroup": "SoC",
551ab15f66SIan Rogers        "MetricName": "UNCORE_FREQ"
561ab15f66SIan Rogers    },
571ab15f66SIan Rogers    {
581ab15f66SIan Rogers        "BriefDescription": "Percentage of cycles spent in System Management Interrupts.",
591ab15f66SIan Rogers        "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)",
601ab15f66SIan Rogers        "MetricGroup": "smi",
611ab15f66SIan Rogers        "MetricName": "smi_cycles",
621ab15f66SIan Rogers        "MetricThreshold": "smi_cycles > 0.1",
631ab15f66SIan Rogers        "ScaleUnit": "100%"
641ab15f66SIan Rogers    },
651ab15f66SIan Rogers    {
661ab15f66SIan Rogers        "BriefDescription": "Number of SMI interrupts.",
671ab15f66SIan Rogers        "MetricExpr": "msr@smi@",
681ab15f66SIan Rogers        "MetricGroup": "smi",
691ab15f66SIan Rogers        "MetricName": "smi_num",
701ab15f66SIan Rogers        "ScaleUnit": "1SMI#"
711ab15f66SIan Rogers    },
721ab15f66SIan Rogers    {
731ab15f66SIan Rogers        "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
741ab15f66SIan Rogers        "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks",
751ab15f66SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
761ab15f66SIan Rogers        "MetricName": "tma_4k_aliasing",
771ab15f66SIan Rogers        "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
781ab15f66SIan Rogers        "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False matches are possible; which incur a few cycles of load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
791ab15f66SIan Rogers        "ScaleUnit": "100%"
801ab15f66SIan Rogers    },
811ab15f66SIan Rogers    {
821ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
831ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
841ab15f66SIan Rogers        "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots",
851ab15f66SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
861ab15f66SIan Rogers        "MetricName": "tma_alu_op_utilization",
871ab15f66SIan Rogers        "MetricThreshold": "tma_alu_op_utilization > 0.6",
881ab15f66SIan Rogers        "ScaleUnit": "100%"
891ab15f66SIan Rogers    },
901ab15f66SIan Rogers    {
911ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
921ab15f66SIan Rogers        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots",
931ab15f66SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
941ab15f66SIan Rogers        "MetricName": "tma_assists",
951ab15f66SIan Rogers        "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
961ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY",
971ab15f66SIan Rogers        "ScaleUnit": "100%"
981ab15f66SIan Rogers    },
991ab15f66SIan Rogers    {
1001ab15f66SIan Rogers        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
1011ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1021ab15f66SIan Rogers        "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)",
1031ab15f66SIan Rogers        "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
1041ab15f66SIan Rogers        "MetricName": "tma_backend_bound",
1051ab15f66SIan Rogers        "MetricThreshold": "tma_backend_bound > 0.2",
106*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
1071ab15f66SIan Rogers        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
1081ab15f66SIan Rogers        "ScaleUnit": "100%"
1091ab15f66SIan Rogers    },
1101ab15f66SIan Rogers    {
1111ab15f66SIan Rogers        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
1121ab15f66SIan Rogers        "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots",
1131ab15f66SIan Rogers        "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
1141ab15f66SIan Rogers        "MetricName": "tma_bad_speculation",
1151ab15f66SIan Rogers        "MetricThreshold": "tma_bad_speculation > 0.15",
116*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
1171ab15f66SIan Rogers        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to mispredicted branches is categorized under the Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
1181ab15f66SIan Rogers        "ScaleUnit": "100%"
1191ab15f66SIan Rogers    },
1201ab15f66SIan Rogers    {
1211ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
1221ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1231ab15f66SIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * tma_bad_speculation",
1241ab15f66SIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
1251ab15f66SIan Rogers        "MetricName": "tma_branch_mispredicts",
1261ab15f66SIan Rogers        "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
127*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
1281ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
1291ab15f66SIan Rogers        "ScaleUnit": "100%"
1301ab15f66SIan Rogers    },
1311ab15f66SIan Rogers    {
1321ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
1331ab15f66SIan Rogers        "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks",
1341ab15f66SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group",
1351ab15f66SIan Rogers        "MetricName": "tma_branch_resteers",
1361ab15f66SIan Rogers        "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
1371ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from the corrected path; following all sorts of mispredicted branches. For example; branchy code with lots of mispredictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
1381ab15f66SIan Rogers        "ScaleUnit": "100%"
1391ab15f66SIan Rogers    },
1401ab15f66SIan Rogers    {
1411ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
1421ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
1431ab15f66SIan Rogers        "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
1441ab15f66SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
1451ab15f66SIan Rogers        "MetricName": "tma_cisc",
1461ab15f66SIan Rogers        "MetricThreshold": "tma_cisc > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
1471ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.",
1481ab15f66SIan Rogers        "ScaleUnit": "100%"
1491ab15f66SIan Rogers    },
1501ab15f66SIan Rogers    {
1511ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
1521ab15f66SIan Rogers        "MetricExpr": "MACHINE_CLEARS.COUNT * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
1531ab15f66SIan Rogers        "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC",
1541ab15f66SIan Rogers        "MetricName": "tma_clears_resteers",
1551ab15f66SIan Rogers        "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
1561ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Related metrics: tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches",
1571ab15f66SIan Rogers        "ScaleUnit": "100%"
1581ab15f66SIan Rogers    },
1591ab15f66SIan Rogers    {
1601ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
1611ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1621ab15f66SIan Rogers        "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_clks",
1631ab15f66SIan Rogers        "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
1641ab15f66SIan Rogers        "MetricName": "tma_contested_accesses",
1651ab15f66SIan Rogers        "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1661ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS. Related metrics: tma_data_sharing, tma_false_sharing, tma_machine_clears, tma_remote_cache",
1671ab15f66SIan Rogers        "ScaleUnit": "100%"
1681ab15f66SIan Rogers    },
1691ab15f66SIan Rogers    {
1701ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck",
1711ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1721ab15f66SIan Rogers        "MetricExpr": "tma_backend_bound - tma_memory_bound",
1731ab15f66SIan Rogers        "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
1741ab15f66SIan Rogers        "MetricName": "tma_core_bound",
1751ab15f66SIan Rogers        "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
176*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
1771ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
1781ab15f66SIan Rogers        "ScaleUnit": "100%"
1791ab15f66SIan Rogers    },
1801ab15f66SIan Rogers    {
1811ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
1821ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1831ab15f66SIan Rogers        "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks",
1841ab15f66SIan Rogers        "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
1851ab15f66SIan Rogers        "MetricName": "tma_data_sharing",
1861ab15f66SIan Rogers        "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1871ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS. Related metrics: tma_contested_accesses, tma_false_sharing, tma_machine_clears, tma_remote_cache",
1881ab15f66SIan Rogers        "ScaleUnit": "100%"
1891ab15f66SIan Rogers    },
1901ab15f66SIan Rogers    {
1911ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
1921ab15f66SIan Rogers        "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks",
1931ab15f66SIan Rogers        "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group",
1941ab15f66SIan Rogers        "MetricName": "tma_divider",
1951ab15f66SIan Rogers        "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
1961ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS",
1971ab15f66SIan Rogers        "ScaleUnit": "100%"
1981ab15f66SIan Rogers    },
1991ab15f66SIan Rogers    {
2001ab15f66SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
2011ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_SMT",
2021ab15f66SIan Rogers        "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks",
2031ab15f66SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
2041ab15f66SIan Rogers        "MetricName": "tma_dram_bound",
2051ab15f66SIan Rogers        "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
2061ab15f66SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS",
2071ab15f66SIan Rogers        "ScaleUnit": "100%"
2081ab15f66SIan Rogers    },
2091ab15f66SIan Rogers    {
2101ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
2111ab15f66SIan Rogers        "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2",
2121ab15f66SIan Rogers        "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
2131ab15f66SIan Rogers        "MetricName": "tma_dsb",
2141ab15f66SIan Rogers        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)",
2151ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
2161ab15f66SIan Rogers        "ScaleUnit": "100%"
2171ab15f66SIan Rogers    },
2181ab15f66SIan Rogers    {
2191ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
2201ab15f66SIan Rogers        "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks",
2211ab15f66SIan Rogers        "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
2221ab15f66SIan Rogers        "MetricName": "tma_dsb_switches",
2231ab15f66SIan Rogers        "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
2241ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
2251ab15f66SIan Rogers        "ScaleUnit": "100%"
2261ab15f66SIan Rogers    },
2271ab15f66SIan Rogers    {
2281ab15f66SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
2291ab15f66SIan Rogers        "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / tma_info_clks",
2301ab15f66SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
2311ab15f66SIan Rogers        "MetricName": "tma_dtlb_load",
2321ab15f66SIan Rogers        "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
2331ab15f66SIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store",
2341ab15f66SIan Rogers        "ScaleUnit": "100%"
2351ab15f66SIan Rogers    },
2361ab15f66SIan Rogers    {
2371ab15f66SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
2381ab15f66SIan Rogers        "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / tma_info_clks",
2391ab15f66SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
2401ab15f66SIan Rogers        "MetricName": "tma_dtlb_store",
2411ab15f66SIan Rogers        "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
2421ab15f66SIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load",
2431ab15f66SIan Rogers        "ScaleUnit": "100%"
2441ab15f66SIan Rogers    },
2451ab15f66SIan Rogers    {
2461ab15f66SIan Rogers        "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
2471ab15f66SIan Rogers        "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks",
2481ab15f66SIan Rogers        "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
2491ab15f66SIan Rogers        "MetricName": "tma_false_sharing",
2501ab15f66SIan Rogers        "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
2511ab15f66SIan Rogers        "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM. Related metrics: tma_contested_accesses, tma_data_sharing, tma_machine_clears, tma_remote_cache",
2521ab15f66SIan Rogers        "ScaleUnit": "100%"
2531ab15f66SIan Rogers    },
2541ab15f66SIan Rogers    {
2551ab15f66SIan Rogers        "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
2561ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
2571ab15f66SIan Rogers        "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks",
2581ab15f66SIan Rogers        "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
2591ab15f66SIan Rogers        "MetricName": "tma_fb_full",
2601ab15f66SIan Rogers        "MetricThreshold": "tma_fb_full > 0.3",
2611ab15f66SIan Rogers        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints at approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
2621ab15f66SIan Rogers        "ScaleUnit": "100%"
2631ab15f66SIan Rogers    },
2641ab15f66SIan Rogers    {
2651ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
2661ab15f66SIan Rogers        "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
2671ab15f66SIan Rogers        "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
2681ab15f66SIan Rogers        "MetricName": "tma_fetch_bandwidth",
2691ab15f66SIan Rogers        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
270*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
2711ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
2721ab15f66SIan Rogers        "ScaleUnit": "100%"
2731ab15f66SIan Rogers    },
2741ab15f66SIan Rogers    {
2751ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
2761ab15f66SIan Rogers        "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots",
2771ab15f66SIan Rogers        "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
2781ab15f66SIan Rogers        "MetricName": "tma_fetch_latency",
2791ab15f66SIan Rogers        "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
280*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
2811ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
2821ab15f66SIan Rogers        "ScaleUnit": "100%"
2831ab15f66SIan Rogers    },
2841ab15f66SIan Rogers    {
2851ab15f66SIan Rogers        "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
2861ab15f66SIan Rogers        "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
2871ab15f66SIan Rogers        "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
2881ab15f66SIan Rogers        "MetricName": "tma_fp_arith",
2891ab15f66SIan Rogers        "MetricThreshold": "tma_fp_arith > 0.2 & tma_light_operations > 0.6",
2901ab15f66SIan Rogers        "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
2911ab15f66SIan Rogers        "ScaleUnit": "100%"
2921ab15f66SIan Rogers    },
2931ab15f66SIan Rogers    {
2941ab15f66SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
2951ab15f66SIan Rogers        "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / UOPS_RETIRED.RETIRE_SLOTS",
2961ab15f66SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
2971ab15f66SIan Rogers        "MetricName": "tma_fp_scalar",
2981ab15f66SIan Rogers        "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
2991ab15f66SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3001ab15f66SIan Rogers        "ScaleUnit": "100%"
3011ab15f66SIan Rogers    },
3021ab15f66SIan Rogers    {
3031ab15f66SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
3041ab15f66SIan Rogers        "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ / UOPS_RETIRED.RETIRE_SLOTS",
3051ab15f66SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
3061ab15f66SIan Rogers        "MetricName": "tma_fp_vector",
3071ab15f66SIan Rogers        "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
3081ab15f66SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3091ab15f66SIan Rogers        "ScaleUnit": "100%"
3101ab15f66SIan Rogers    },
3111ab15f66SIan Rogers    {
3121ab15f66SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
3131ab15f66SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
3141ab15f66SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
3151ab15f66SIan Rogers        "MetricName": "tma_fp_vector_128b",
3161ab15f66SIan Rogers        "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
3171ab15f66SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3181ab15f66SIan Rogers        "ScaleUnit": "100%"
3191ab15f66SIan Rogers    },
3201ab15f66SIan Rogers    {
3211ab15f66SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
3221ab15f66SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
3231ab15f66SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
3241ab15f66SIan Rogers        "MetricName": "tma_fp_vector_256b",
3251ab15f66SIan Rogers        "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
3261ab15f66SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3271ab15f66SIan Rogers        "ScaleUnit": "100%"
3281ab15f66SIan Rogers    },
3291ab15f66SIan Rogers    {
3301ab15f66SIan Rogers        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
3311ab15f66SIan Rogers        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots",
3321ab15f66SIan Rogers        "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
3331ab15f66SIan Rogers        "MetricName": "tma_frontend_bound",
3341ab15f66SIan Rogers        "MetricThreshold": "tma_frontend_bound > 0.15",
335*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
3361ab15f66SIan Rogers        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
3371ab15f66SIan Rogers        "ScaleUnit": "100%"
3381ab15f66SIan Rogers    },
3391ab15f66SIan Rogers    {
3401ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences",
3411ab15f66SIan Rogers        "MetricExpr": "tma_microcode_sequencer",
3421ab15f66SIan Rogers        "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
3431ab15f66SIan Rogers        "MetricName": "tma_heavy_operations",
3441ab15f66SIan Rogers        "MetricThreshold": "tma_heavy_operations > 0.1",
345*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
3461ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
3471ab15f66SIan Rogers        "ScaleUnit": "100%"
3481ab15f66SIan Rogers    },
3491ab15f66SIan Rogers    {
3501ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
3511ab15f66SIan Rogers        "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks",
3521ab15f66SIan Rogers        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
3531ab15f66SIan Rogers        "MetricName": "tma_icache_misses",
3541ab15f66SIan Rogers        "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
3551ab15f66SIan Rogers        "ScaleUnit": "100%"
3561ab15f66SIan Rogers    },
3571ab15f66SIan Rogers    {
3581ab15f66SIan Rogers        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
3591ab15f66SIan Rogers        "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time",
3601ab15f66SIan Rogers        "MetricGroup": "Power;Summary",
3611ab15f66SIan Rogers        "MetricName": "tma_info_average_frequency"
3621ab15f66SIan Rogers    },
3631ab15f66SIan Rogers    {
3641ab15f66SIan Rogers        "BriefDescription": "Branch instructions per taken branch.",
3651ab15f66SIan Rogers        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
3661ab15f66SIan Rogers        "MetricGroup": "Branches;Fed;PGO",
3671ab15f66SIan Rogers        "MetricName": "tma_info_bptkbranch"
3681ab15f66SIan Rogers    },
3691ab15f66SIan Rogers    {
3701ab15f66SIan Rogers        "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
3711ab15f66SIan Rogers        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES",
3721ab15f66SIan Rogers        "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
3731ab15f66SIan Rogers        "MetricName": "tma_info_branch_misprediction_cost",
3741ab15f66SIan Rogers        "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers"
3751ab15f66SIan Rogers    },
3761ab15f66SIan Rogers    {
3771ab15f66SIan Rogers        "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
3781ab15f66SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
3791ab15f66SIan Rogers        "MetricGroup": "Pipeline",
3801ab15f66SIan Rogers        "MetricName": "tma_info_clks"
3811ab15f66SIan Rogers    },
3821ab15f66SIan Rogers    {
3831ab15f66SIan Rogers        "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
3841ab15f66SIan Rogers        "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))",
3851ab15f66SIan Rogers        "MetricGroup": "SMT",
3861ab15f66SIan Rogers        "MetricName": "tma_info_core_clks"
3871ab15f66SIan Rogers    },
3881ab15f66SIan Rogers    {
3891ab15f66SIan Rogers        "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
3901ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks",
3911ab15f66SIan Rogers        "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group",
3921ab15f66SIan Rogers        "MetricName": "tma_info_coreipc"
3931ab15f66SIan Rogers    },
3941ab15f66SIan Rogers    {
3951ab15f66SIan Rogers        "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
3961ab15f66SIan Rogers        "MetricExpr": "1 / tma_info_ipc",
3971ab15f66SIan Rogers        "MetricGroup": "Mem;Pipeline",
3981ab15f66SIan Rogers        "MetricName": "tma_info_cpi"
3991ab15f66SIan Rogers    },
4001ab15f66SIan Rogers    {
4011ab15f66SIan Rogers        "BriefDescription": "Average CPU Utilization",
4021ab15f66SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
4031ab15f66SIan Rogers        "MetricGroup": "HPC;Summary",
4041ab15f66SIan Rogers        "MetricName": "tma_info_cpu_utilization"
4051ab15f66SIan Rogers    },
4061ab15f66SIan Rogers    {
4071ab15f66SIan Rogers        "BriefDescription": "Average Parallel L2 cache miss data reads",
4081ab15f66SIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
4091ab15f66SIan Rogers        "MetricGroup": "Memory_BW;Offcore",
4101ab15f66SIan Rogers        "MetricName": "tma_info_data_l2_mlp"
4111ab15f66SIan Rogers    },
4121ab15f66SIan Rogers    {
4131ab15f66SIan Rogers        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
4141ab15f66SIan Rogers        "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
4151ab15f66SIan Rogers        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
4161ab15f66SIan Rogers        "MetricName": "tma_info_dram_bw_use",
4171ab15f66SIan Rogers        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
4181ab15f66SIan Rogers    },
4191ab15f66SIan Rogers    {
4201ab15f66SIan Rogers        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
4211ab15f66SIan Rogers        "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
4221ab15f66SIan Rogers        "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
4231ab15f66SIan Rogers        "MetricName": "tma_info_dsb_coverage",
4241ab15f66SIan Rogers        "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35",
4251ab15f66SIan Rogers        "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp"
4261ab15f66SIan Rogers    },
4271ab15f66SIan Rogers    {
4281ab15f66SIan Rogers        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
4291ab15f66SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
4301ab15f66SIan Rogers        "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
4311ab15f66SIan Rogers        "MetricName": "tma_info_execute"
4321ab15f66SIan Rogers    },
4331ab15f66SIan Rogers    {
4341ab15f66SIan Rogers        "BriefDescription": "The ratio of Executed- by Issued-Uops",
4351ab15f66SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY",
4361ab15f66SIan Rogers        "MetricGroup": "Cor;Pipeline",
4371ab15f66SIan Rogers        "MetricName": "tma_info_execute_per_issue",
4381ab15f66SIan Rogers        "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggests high rate of \"execute\" at rename stage."
4391ab15f66SIan Rogers    },
4401ab15f66SIan Rogers    {
4411ab15f66SIan Rogers        "BriefDescription": "Floating Point Operations Per Cycle",
4421ab15f66SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_clks",
4431ab15f66SIan Rogers        "MetricGroup": "Flops;Ret",
4441ab15f66SIan Rogers        "MetricName": "tma_info_flopc"
4451ab15f66SIan Rogers    },
4461ab15f66SIan Rogers    {
4471ab15f66SIan Rogers        "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
4481ab15f66SIan Rogers        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_clks)",
4491ab15f66SIan Rogers        "MetricGroup": "Cor;Flops;HPC",
4501ab15f66SIan Rogers        "MetricName": "tma_info_fp_arith_utilization",
4511ab15f66SIan Rogers        "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
4521ab15f66SIan Rogers    },
4531ab15f66SIan Rogers    {
4541ab15f66SIan Rogers        "BriefDescription": "Giga Floating Point Operations Per Second",
4551ab15f66SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
4561ab15f66SIan Rogers        "MetricGroup": "Cor;Flops;HPC",
4571ab15f66SIan Rogers        "MetricName": "tma_info_gflops",
4581ab15f66SIan Rogers        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
4591ab15f66SIan Rogers    },
4601ab15f66SIan Rogers    {
4611ab15f66SIan Rogers        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
4621ab15f66SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
4631ab15f66SIan Rogers        "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
4641ab15f66SIan Rogers        "MetricName": "tma_info_ilp"
4651ab15f66SIan Rogers    },
4661ab15f66SIan Rogers    {
4671ab15f66SIan Rogers        "BriefDescription": "Total number of retired Instructions",
4681ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY",
4691ab15f66SIan Rogers        "MetricGroup": "Summary;TmaL1;tma_L1_group",
4701ab15f66SIan Rogers        "MetricName": "tma_info_instructions",
4711ab15f66SIan Rogers        "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST"
4721ab15f66SIan Rogers    },
4731ab15f66SIan Rogers    {
4741ab15f66SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
4751ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)",
4761ab15f66SIan Rogers        "MetricGroup": "Flops;InsType",
4771ab15f66SIan Rogers        "MetricName": "tma_info_iparith",
4781ab15f66SIan Rogers        "MetricThreshold": "tma_info_iparith < 10",
4791ab15f66SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
4801ab15f66SIan Rogers    },
4811ab15f66SIan Rogers    {
4821ab15f66SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
4831ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
4841ab15f66SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
4851ab15f66SIan Rogers        "MetricName": "tma_info_iparith_avx128",
4861ab15f66SIan Rogers        "MetricThreshold": "tma_info_iparith_avx128 < 10",
4871ab15f66SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
4881ab15f66SIan Rogers    },
4891ab15f66SIan Rogers    {
4901ab15f66SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
4911ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
4921ab15f66SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
4931ab15f66SIan Rogers        "MetricName": "tma_info_iparith_avx256",
4941ab15f66SIan Rogers        "MetricThreshold": "tma_info_iparith_avx256 < 10",
4951ab15f66SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
4961ab15f66SIan Rogers    },
4971ab15f66SIan Rogers    {
4981ab15f66SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
4991ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
5001ab15f66SIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
5011ab15f66SIan Rogers        "MetricName": "tma_info_iparith_scalar_dp",
5021ab15f66SIan Rogers        "MetricThreshold": "tma_info_iparith_scalar_dp < 10",
5031ab15f66SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
5041ab15f66SIan Rogers    },
5051ab15f66SIan Rogers    {
5061ab15f66SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
5071ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
5081ab15f66SIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
5091ab15f66SIan Rogers        "MetricName": "tma_info_iparith_scalar_sp",
5101ab15f66SIan Rogers        "MetricThreshold": "tma_info_iparith_scalar_sp < 10",
5111ab15f66SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
5121ab15f66SIan Rogers    },
5131ab15f66SIan Rogers    {
5141ab15f66SIan Rogers        "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
5151ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
5161ab15f66SIan Rogers        "MetricGroup": "Branches;Fed;InsType",
5171ab15f66SIan Rogers        "MetricName": "tma_info_ipbranch",
5181ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipbranch < 8"
5191ab15f66SIan Rogers    },
5201ab15f66SIan Rogers    {
5211ab15f66SIan Rogers        "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
5221ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / tma_info_clks",
5231ab15f66SIan Rogers        "MetricGroup": "Ret;Summary",
5241ab15f66SIan Rogers        "MetricName": "tma_info_ipc"
5251ab15f66SIan Rogers    },
5261ab15f66SIan Rogers    {
5271ab15f66SIan Rogers        "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)",
5281ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
5291ab15f66SIan Rogers        "MetricGroup": "Branches;Fed;PGO",
5301ab15f66SIan Rogers        "MetricName": "tma_info_ipcall",
5311ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipcall < 200"
5321ab15f66SIan Rogers    },
5331ab15f66SIan Rogers    {
5341ab15f66SIan Rogers        "BriefDescription": "Instructions per Far Branch (Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
5351ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
5361ab15f66SIan Rogers        "MetricGroup": "Branches;OS",
5371ab15f66SIan Rogers        "MetricName": "tma_info_ipfarbranch",
5381ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipfarbranch < 1e6"
5391ab15f66SIan Rogers    },
5401ab15f66SIan Rogers    {
5411ab15f66SIan Rogers        "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
5421ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
5431ab15f66SIan Rogers        "MetricGroup": "Flops;InsType",
5441ab15f66SIan Rogers        "MetricName": "tma_info_ipflop",
5451ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipflop < 10"
5461ab15f66SIan Rogers    },
5471ab15f66SIan Rogers    {
5481ab15f66SIan Rogers        "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)",
5491ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
5501ab15f66SIan Rogers        "MetricGroup": "InsType",
5511ab15f66SIan Rogers        "MetricName": "tma_info_ipload",
5521ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipload < 3"
5531ab15f66SIan Rogers    },
5541ab15f66SIan Rogers    {
5551ab15f66SIan Rogers        "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
5561ab15f66SIan Rogers        "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
5571ab15f66SIan Rogers        "MetricGroup": "Bad;BrMispredicts",
5581ab15f66SIan Rogers        "MetricName": "tma_info_ipmisp_indirect",
5591ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipmisp_indirect < 1e3"
5601ab15f66SIan Rogers    },
5611ab15f66SIan Rogers    {
5621ab15f66SIan Rogers        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
5631ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
5641ab15f66SIan Rogers        "MetricGroup": "Bad;BadSpec;BrMispredicts",
5651ab15f66SIan Rogers        "MetricName": "tma_info_ipmispredict",
5661ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipmispredict < 200"
5671ab15f66SIan Rogers    },
5681ab15f66SIan Rogers    {
5691ab15f66SIan Rogers        "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
5701ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
5711ab15f66SIan Rogers        "MetricGroup": "InsType",
5721ab15f66SIan Rogers        "MetricName": "tma_info_ipstore",
5731ab15f66SIan Rogers        "MetricThreshold": "tma_info_ipstore < 8"
5741ab15f66SIan Rogers    },
5751ab15f66SIan Rogers    {
5761ab15f66SIan Rogers        "BriefDescription": "Instructions per taken branch",
5771ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
5781ab15f66SIan Rogers        "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB",
5791ab15f66SIan Rogers        "MetricName": "tma_info_iptb",
5801ab15f66SIan Rogers        "MetricThreshold": "tma_info_iptb < 9",
5811ab15f66SIan Rogers        "PublicDescription": "Instructions per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp"
5821ab15f66SIan Rogers    },
5831ab15f66SIan Rogers    {
5841ab15f66SIan Rogers        "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
5851ab15f66SIan Rogers        "MetricExpr": "tma_info_instructions / BACLEARS.ANY",
5861ab15f66SIan Rogers        "MetricGroup": "Fed",
5871ab15f66SIan Rogers        "MetricName": "tma_info_ipunknown_branch"
5881ab15f66SIan Rogers    },
5891ab15f66SIan Rogers    {
5901ab15f66SIan Rogers        "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
5911ab15f66SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
5921ab15f66SIan Rogers        "MetricGroup": "OS",
5931ab15f66SIan Rogers        "MetricName": "tma_info_kernel_cpi"
5941ab15f66SIan Rogers    },
5951ab15f66SIan Rogers    {
5961ab15f66SIan Rogers        "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
5971ab15f66SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
5981ab15f66SIan Rogers        "MetricGroup": "OS",
5991ab15f66SIan Rogers        "MetricName": "tma_info_kernel_utilization",
6001ab15f66SIan Rogers        "MetricThreshold": "tma_info_kernel_utilization > 0.05"
6011ab15f66SIan Rogers    },
6021ab15f66SIan Rogers    {
6031ab15f66SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
6041ab15f66SIan Rogers        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
6051ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW",
6061ab15f66SIan Rogers        "MetricName": "tma_info_l1d_cache_fill_bw"
6071ab15f66SIan Rogers    },
6081ab15f66SIan Rogers    {
6091ab15f66SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
6101ab15f66SIan Rogers        "MetricExpr": "tma_info_l1d_cache_fill_bw",
6111ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW",
6121ab15f66SIan Rogers        "MetricName": "tma_info_l1d_cache_fill_bw_1t"
6131ab15f66SIan Rogers    },
6141ab15f66SIan Rogers    {
6151ab15f66SIan Rogers        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
6161ab15f66SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
6171ab15f66SIan Rogers        "MetricGroup": "CacheMisses;Mem",
6181ab15f66SIan Rogers        "MetricName": "tma_info_l1mpki"
6191ab15f66SIan Rogers    },
6201ab15f66SIan Rogers    {
6211ab15f66SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
6221ab15f66SIan Rogers        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
6231ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW",
6241ab15f66SIan Rogers        "MetricName": "tma_info_l2_cache_fill_bw"
6251ab15f66SIan Rogers    },
6261ab15f66SIan Rogers    {
6271ab15f66SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
6281ab15f66SIan Rogers        "MetricExpr": "tma_info_l2_cache_fill_bw",
6291ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW",
6301ab15f66SIan Rogers        "MetricName": "tma_info_l2_cache_fill_bw_1t"
6311ab15f66SIan Rogers    },
6321ab15f66SIan Rogers    {
6331ab15f66SIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
6341ab15f66SIan Rogers        "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
6351ab15f66SIan Rogers        "MetricGroup": "CacheMisses;Mem",
6361ab15f66SIan Rogers        "MetricName": "tma_info_l2hpki_all"
6371ab15f66SIan Rogers    },
6381ab15f66SIan Rogers    {
6391ab15f66SIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
6401ab15f66SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
6411ab15f66SIan Rogers        "MetricGroup": "CacheMisses;Mem",
6421ab15f66SIan Rogers        "MetricName": "tma_info_l2hpki_load"
6431ab15f66SIan Rogers    },
6441ab15f66SIan Rogers    {
6451ab15f66SIan Rogers        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
6461ab15f66SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
6471ab15f66SIan Rogers        "MetricGroup": "Backend;CacheMisses;Mem",
6481ab15f66SIan Rogers        "MetricName": "tma_info_l2mpki"
6491ab15f66SIan Rogers    },
6501ab15f66SIan Rogers    {
6511ab15f66SIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
6521ab15f66SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
6531ab15f66SIan Rogers        "MetricGroup": "CacheMisses;Mem;Offcore",
6541ab15f66SIan Rogers        "MetricName": "tma_info_l2mpki_all"
6551ab15f66SIan Rogers    },
6561ab15f66SIan Rogers    {
6571ab15f66SIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
6581ab15f66SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
6591ab15f66SIan Rogers        "MetricGroup": "CacheMisses;Mem",
6601ab15f66SIan Rogers        "MetricName": "tma_info_l2mpki_load"
6611ab15f66SIan Rogers    },
6621ab15f66SIan Rogers    {
6631ab15f66SIan Rogers        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
6641ab15f66SIan Rogers        "MetricExpr": "0",
6651ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
6661ab15f66SIan Rogers        "MetricName": "tma_info_l3_cache_access_bw_1t"
6671ab15f66SIan Rogers    },
6681ab15f66SIan Rogers    {
6691ab15f66SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
6701ab15f66SIan Rogers        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
6711ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW",
6721ab15f66SIan Rogers        "MetricName": "tma_info_l3_cache_fill_bw"
6731ab15f66SIan Rogers    },
6741ab15f66SIan Rogers    {
6751ab15f66SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
6761ab15f66SIan Rogers        "MetricExpr": "tma_info_l3_cache_fill_bw",
6771ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW",
6781ab15f66SIan Rogers        "MetricName": "tma_info_l3_cache_fill_bw_1t"
6791ab15f66SIan Rogers    },
6801ab15f66SIan Rogers    {
6811ab15f66SIan Rogers        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
6821ab15f66SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
6831ab15f66SIan Rogers        "MetricGroup": "CacheMisses;Mem",
6841ab15f66SIan Rogers        "MetricName": "tma_info_l3mpki"
6851ab15f66SIan Rogers    },
6861ab15f66SIan Rogers    {
6871ab15f66SIan Rogers        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
6881ab15f66SIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
6891ab15f66SIan Rogers        "MetricGroup": "Memory_Lat;Offcore",
6901ab15f66SIan Rogers        "MetricName": "tma_info_load_l2_miss_latency"
6911ab15f66SIan Rogers    },
6921ab15f66SIan Rogers    {
6931ab15f66SIan Rogers        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
6941ab15f66SIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
6951ab15f66SIan Rogers        "MetricGroup": "Memory_BW;Offcore",
6961ab15f66SIan Rogers        "MetricName": "tma_info_load_l2_mlp"
6971ab15f66SIan Rogers    },
6981ab15f66SIan Rogers    {
6991ab15f66SIan Rogers        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
7001ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
7011ab15f66SIan Rogers        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
7021ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBound;MemoryLat",
7031ab15f66SIan Rogers        "MetricName": "tma_info_load_miss_real_latency"
7041ab15f66SIan Rogers    },
7051ab15f66SIan Rogers    {
7061ab15f66SIan Rogers        "BriefDescription": "Average number of parallel requests to external memory",
7071ab15f66SIan Rogers        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
7081ab15f66SIan Rogers        "MetricGroup": "Mem;SoC",
7091ab15f66SIan Rogers        "MetricName": "tma_info_mem_parallel_requests",
7101ab15f66SIan Rogers        "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
7111ab15f66SIan Rogers    },
7121ab15f66SIan Rogers    {
7131ab15f66SIan Rogers        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
7141ab15f66SIan Rogers        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
7151ab15f66SIan Rogers        "MetricGroup": "Mem;SoC",
7161ab15f66SIan Rogers        "MetricName": "tma_info_mem_request_latency"
7171ab15f66SIan Rogers    },
7181ab15f66SIan Rogers    {
7191ab15f66SIan Rogers        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss)",
7201ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
7211ab15f66SIan Rogers        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
7221ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryBW;MemoryBound",
7231ab15f66SIan Rogers        "MetricName": "tma_info_mlp",
7241ab15f66SIan Rogers        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
7251ab15f66SIan Rogers    },
7261ab15f66SIan Rogers    {
7271ab15f66SIan Rogers        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
7281ab15f66SIan Rogers        "MetricExpr": "(cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / tma_info_core_clks",
7291ab15f66SIan Rogers        "MetricGroup": "Mem;MemoryTLB",
7301ab15f66SIan Rogers        "MetricName": "tma_info_page_walks_utilization",
7311ab15f66SIan Rogers        "MetricThreshold": "tma_info_page_walks_utilization > 0.5"
7321ab15f66SIan Rogers    },
7331ab15f66SIan Rogers    {
7341ab15f66SIan Rogers        "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
7351ab15f66SIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
7361ab15f66SIan Rogers        "MetricGroup": "Pipeline;Ret",
7371ab15f66SIan Rogers        "MetricName": "tma_info_retire"
7381ab15f66SIan Rogers    },
7391ab15f66SIan Rogers    {
7401ab15f66SIan Rogers        "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
7411ab15f66SIan Rogers        "MetricExpr": "4 * tma_info_core_clks",
7421ab15f66SIan Rogers        "MetricGroup": "TmaL1;tma_L1_group",
7431ab15f66SIan Rogers        "MetricName": "tma_info_slots"
7441ab15f66SIan Rogers    },
7451ab15f66SIan Rogers    {
7461ab15f66SIan Rogers        "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
7471ab15f66SIan Rogers        "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
7481ab15f66SIan Rogers        "MetricGroup": "SMT",
7491ab15f66SIan Rogers        "MetricName": "tma_info_smt_2t_utilization"
7501ab15f66SIan Rogers    },
7511ab15f66SIan Rogers    {
7521ab15f66SIan Rogers        "BriefDescription": "Socket actual clocks when any core is active on that socket",
7531ab15f66SIan Rogers        "MetricExpr": "UNC_CLOCK.SOCKET",
7541ab15f66SIan Rogers        "MetricGroup": "SoC",
7551ab15f66SIan Rogers        "MetricName": "tma_info_socket_clks"
7561ab15f66SIan Rogers    },
7571ab15f66SIan Rogers    {
7581ab15f66SIan Rogers        "BriefDescription": "Average Frequency Utilization relative to nominal frequency",
7591ab15f66SIan Rogers        "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC",
7601ab15f66SIan Rogers        "MetricGroup": "Power",
7611ab15f66SIan Rogers        "MetricName": "tma_info_turbo_utilization"
7621ab15f66SIan Rogers    },
7631ab15f66SIan Rogers    {
7641ab15f66SIan Rogers        "BriefDescription": "Uops Per Instruction",
7651ab15f66SIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
7661ab15f66SIan Rogers        "MetricGroup": "Pipeline;Ret;Retire",
7671ab15f66SIan Rogers        "MetricName": "tma_info_uoppi",
7681ab15f66SIan Rogers        "MetricThreshold": "tma_info_uoppi > 1.05"
7691ab15f66SIan Rogers    },
7701ab15f66SIan Rogers    {
7711ab15f66SIan Rogers        "BriefDescription": "Uops per taken branch",
7721ab15f66SIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN",
7731ab15f66SIan Rogers        "MetricGroup": "Branches;Fed;FetchBW",
7741ab15f66SIan Rogers        "MetricName": "tma_info_uptb",
7751ab15f66SIan Rogers        "MetricThreshold": "tma_info_uptb < 6"
7761ab15f66SIan Rogers    },
7771ab15f66SIan Rogers    {
7781ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
7791ab15f66SIan Rogers        "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_clks",
7801ab15f66SIan Rogers        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
7811ab15f66SIan Rogers        "MetricName": "tma_itlb_misses",
7821ab15f66SIan Rogers        "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
7831ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
7841ab15f66SIan Rogers        "ScaleUnit": "100%"
7851ab15f66SIan Rogers    },
7861ab15f66SIan Rogers    {
7871ab15f66SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
7881ab15f66SIan Rogers        "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)",
7891ab15f66SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
7901ab15f66SIan Rogers        "MetricName": "tma_l1_bound",
7911ab15f66SIan Rogers        "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
7921ab15f66SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
7931ab15f66SIan Rogers        "ScaleUnit": "100%"
7941ab15f66SIan Rogers    },
7951ab15f66SIan Rogers    {
7961ab15f66SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
7971ab15f66SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks",
7981ab15f66SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
7991ab15f66SIan Rogers        "MetricName": "tma_l2_bound",
8001ab15f66SIan Rogers        "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
8011ab15f66SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
8021ab15f66SIan Rogers        "ScaleUnit": "100%"
8031ab15f66SIan Rogers    },
8041ab15f66SIan Rogers    {
8051ab15f66SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core",
8061ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_SMT",
8071ab15f66SIan Rogers        "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks",
8081ab15f66SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
8091ab15f66SIan Rogers        "MetricName": "tma_l3_bound",
8101ab15f66SIan Rogers        "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
8111ab15f66SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
8121ab15f66SIan Rogers        "ScaleUnit": "100%"
8131ab15f66SIan Rogers    },
8141ab15f66SIan Rogers    {
8151ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
8161ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
8171ab15f66SIan Rogers        "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks",
8181ab15f66SIan Rogers        "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
8191ab15f66SIan Rogers        "MetricName": "tma_l3_hit_latency",
8201ab15f66SIan Rogers        "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
8211ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
8221ab15f66SIan Rogers        "ScaleUnit": "100%"
8231ab15f66SIan Rogers    },
8241ab15f66SIan Rogers    {
8251ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
8261ab15f66SIan Rogers        "MetricExpr": "ILD_STALL.LCP / tma_info_clks",
8271ab15f66SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
8281ab15f66SIan Rogers        "MetricName": "tma_lcp",
8291ab15f66SIan Rogers        "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
8301ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb",
8311ab15f66SIan Rogers        "ScaleUnit": "100%"
8321ab15f66SIan Rogers    },
8331ab15f66SIan Rogers    {
8341ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
8351ab15f66SIan Rogers        "MetricExpr": "tma_retiring - tma_heavy_operations",
8361ab15f66SIan Rogers        "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
8371ab15f66SIan Rogers        "MetricName": "tma_light_operations",
8381ab15f66SIan Rogers        "MetricThreshold": "tma_light_operations > 0.6",
839*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
8401ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
8411ab15f66SIan Rogers        "ScaleUnit": "100%"
8421ab15f66SIan Rogers    },
8431ab15f66SIan Rogers    {
8441ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations",
8451ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
8461ab15f66SIan Rogers        "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)",
8471ab15f66SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
8481ab15f66SIan Rogers        "MetricName": "tma_load_op_utilization",
8491ab15f66SIan Rogers        "MetricThreshold": "tma_load_op_utilization > 0.6",
8501ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations. Sample with: UOPS_DISPATCHED.PORT_2_3",
8511ab15f66SIan Rogers        "ScaleUnit": "100%"
8521ab15f66SIan Rogers    },
8531ab15f66SIan Rogers    {
8541ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
8551ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
8561ab15f66SIan Rogers        "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks",
8571ab15f66SIan Rogers        "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group",
8581ab15f66SIan Rogers        "MetricName": "tma_lock_latency",
8591ab15f66SIan Rogers        "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
8601ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS. Related metrics: tma_store_latency",
8611ab15f66SIan Rogers        "ScaleUnit": "100%"
8621ab15f66SIan Rogers    },
8631ab15f66SIan Rogers    {
8641ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
8651ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
8661ab15f66SIan Rogers        "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts",
8671ab15f66SIan Rogers        "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
8681ab15f66SIan Rogers        "MetricName": "tma_machine_clears",
8691ab15f66SIan Rogers        "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
870*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
8711ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
8721ab15f66SIan Rogers        "ScaleUnit": "100%"
8731ab15f66SIan Rogers    },
8741ab15f66SIan Rogers    {
8751ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
8761ab15f66SIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks",
8771ab15f66SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
8781ab15f66SIan Rogers        "MetricName": "tma_mem_bandwidth",
8791ab15f66SIan Rogers        "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
8801ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full",
8811ab15f66SIan Rogers        "ScaleUnit": "100%"
8821ab15f66SIan Rogers    },
8831ab15f66SIan Rogers    {
8841ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
8851ab15f66SIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth",
8861ab15f66SIan Rogers        "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
8871ab15f66SIan Rogers        "MetricName": "tma_mem_latency",
8881ab15f66SIan Rogers        "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
8891ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
8901ab15f66SIan Rogers        "ScaleUnit": "100%"
8911ab15f66SIan Rogers    },
8921ab15f66SIan Rogers    {
8931ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
8941ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
8951ab15f66SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound",
8961ab15f66SIan Rogers        "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
8971ab15f66SIan Rogers        "MetricName": "tma_memory_bound",
8981ab15f66SIan Rogers        "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
899*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
9001ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
9011ab15f66SIan Rogers        "ScaleUnit": "100%"
9021ab15f66SIan Rogers    },
9031ab15f66SIan Rogers    {
9041ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
9051ab15f66SIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots",
9061ab15f66SIan Rogers        "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
9071ab15f66SIan Rogers        "MetricName": "tma_microcode_sequencer",
9081ab15f66SIan Rogers        "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
9091ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
9101ab15f66SIan Rogers        "ScaleUnit": "100%"
9111ab15f66SIan Rogers    },
9121ab15f66SIan Rogers    {
9131ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
9141ab15f66SIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
9151ab15f66SIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM",
9161ab15f66SIan Rogers        "MetricName": "tma_mispredicts_resteers",
9171ab15f66SIan Rogers        "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
9181ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost",
9191ab15f66SIan Rogers        "ScaleUnit": "100%"
9201ab15f66SIan Rogers    },
9211ab15f66SIan Rogers    {
9221ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
9231ab15f66SIan Rogers        "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2",
9241ab15f66SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
9251ab15f66SIan Rogers        "MetricName": "tma_mite",
9261ab15f66SIan Rogers        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)",
9271ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
9281ab15f66SIan Rogers        "ScaleUnit": "100%"
9291ab15f66SIan Rogers    },
9301ab15f66SIan Rogers    {
9311ab15f66SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
9321ab15f66SIan Rogers        "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks",
9331ab15f66SIan Rogers        "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
9341ab15f66SIan Rogers        "MetricName": "tma_ms_switches",
9351ab15f66SIan Rogers        "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
9361ab15f66SIan Rogers        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
9371ab15f66SIan Rogers        "ScaleUnit": "100%"
9381ab15f66SIan Rogers    },
9391ab15f66SIan Rogers    {
9401ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
9411ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks",
9421ab15f66SIan Rogers        "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
9431ab15f66SIan Rogers        "MetricName": "tma_port_0",
9441ab15f66SIan Rogers        "MetricThreshold": "tma_port_0 > 0.6",
9451ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
9461ab15f66SIan Rogers        "ScaleUnit": "100%"
9471ab15f66SIan Rogers    },
9481ab15f66SIan Rogers    {
9491ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)",
9501ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks",
9511ab15f66SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
9521ab15f66SIan Rogers        "MetricName": "tma_port_1",
9531ab15f66SIan Rogers        "MetricThreshold": "tma_port_1 > 0.6",
9541ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
9551ab15f66SIan Rogers        "ScaleUnit": "100%"
9561ab15f66SIan Rogers    },
9571ab15f66SIan Rogers    {
9581ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)",
9591ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks",
9601ab15f66SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
9611ab15f66SIan Rogers        "MetricName": "tma_port_2",
9621ab15f66SIan Rogers        "MetricThreshold": "tma_port_2 > 0.6",
9631ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads). Sample with: UOPS_DISPATCHED_PORT.PORT_2",
9641ab15f66SIan Rogers        "ScaleUnit": "100%"
9651ab15f66SIan Rogers    },
9661ab15f66SIan Rogers    {
9671ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)",
9681ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks",
9691ab15f66SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
9701ab15f66SIan Rogers        "MetricName": "tma_port_3",
9711ab15f66SIan Rogers        "MetricThreshold": "tma_port_3 > 0.6",
9721ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads). Sample with: UOPS_DISPATCHED_PORT.PORT_3",
9731ab15f66SIan Rogers        "ScaleUnit": "100%"
9741ab15f66SIan Rogers    },
9751ab15f66SIan Rogers    {
9761ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data)",
9771ab15f66SIan Rogers        "MetricExpr": "tma_store_op_utilization",
9781ab15f66SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_issueSpSt;tma_store_op_utilization_group",
9791ab15f66SIan Rogers        "MetricName": "tma_port_4",
9801ab15f66SIan Rogers        "MetricThreshold": "tma_port_4 > 0.6",
9811ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data). Sample with: UOPS_DISPATCHED_PORT.PORT_4. Related metrics: tma_split_stores",
9821ab15f66SIan Rogers        "ScaleUnit": "100%"
9831ab15f66SIan Rogers    },
9841ab15f66SIan Rogers    {
9851ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)",
9861ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks",
9871ab15f66SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
9881ab15f66SIan Rogers        "MetricName": "tma_port_5",
9891ab15f66SIan Rogers        "MetricThreshold": "tma_port_5 > 0.6",
9901ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
9911ab15f66SIan Rogers        "ScaleUnit": "100%"
9921ab15f66SIan Rogers    },
9931ab15f66SIan Rogers    {
9941ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
9951ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks",
9961ab15f66SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
9971ab15f66SIan Rogers        "MetricName": "tma_port_6",
9981ab15f66SIan Rogers        "MetricThreshold": "tma_port_6 > 0.6",
9991ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
10001ab15f66SIan Rogers        "ScaleUnit": "100%"
10011ab15f66SIan Rogers    },
10021ab15f66SIan Rogers    {
10031ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)",
10041ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks",
10051ab15f66SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group",
10061ab15f66SIan Rogers        "MetricName": "tma_port_7",
10071ab15f66SIan Rogers        "MetricThreshold": "tma_port_7 > 0.6",
10081ab15f66SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address). Sample with: UOPS_DISPATCHED_PORT.PORT_7",
10091ab15f66SIan Rogers        "ScaleUnit": "100%"
10101ab15f66SIan Rogers    },
10111ab15f66SIan Rogers    {
10121ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
10131ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
10141ab15f66SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks",
10151ab15f66SIan Rogers        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
10161ab15f66SIan Rogers        "MetricName": "tma_ports_utilization",
10171ab15f66SIan Rogers        "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
10181ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related).  Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
10191ab15f66SIan Rogers        "ScaleUnit": "100%"
10201ab15f66SIan Rogers    },
10211ab15f66SIan Rogers    {
10221ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
10231ab15f66SIan Rogers        "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)",
10241ab15f66SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
10251ab15f66SIan Rogers        "MetricName": "tma_ports_utilized_0",
10261ab15f66SIan Rogers        "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
10271ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
10281ab15f66SIan Rogers        "ScaleUnit": "100%"
10291ab15f66SIan Rogers    },
10301ab15f66SIan Rogers    {
10311ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
10321ab15f66SIan Rogers        "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_clks)",
10331ab15f66SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group",
10341ab15f66SIan Rogers        "MetricName": "tma_ports_utilized_1",
10351ab15f66SIan Rogers        "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
10361ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Related metrics: tma_l1_bound",
10371ab15f66SIan Rogers        "ScaleUnit": "100%"
10381ab15f66SIan Rogers    },
10391ab15f66SIan Rogers    {
10401ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
10411ab15f66SIan Rogers        "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks)",
10421ab15f66SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
10431ab15f66SIan Rogers        "MetricName": "tma_ports_utilized_2",
10441ab15f66SIan Rogers        "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
10451ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
10461ab15f66SIan Rogers        "ScaleUnit": "100%"
10471ab15f66SIan Rogers    },
10481ab15f66SIan Rogers    {
10491ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).",
10501ab15f66SIan Rogers        "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks",
10511ab15f66SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
10521ab15f66SIan Rogers        "MetricName": "tma_ports_utilized_3m",
10531ab15f66SIan Rogers        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
10541ab15f66SIan Rogers        "ScaleUnit": "100%"
10551ab15f66SIan Rogers    },
10561ab15f66SIan Rogers    {
10571ab15f66SIan Rogers        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
10581ab15f66SIan Rogers        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots",
10591ab15f66SIan Rogers        "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
10601ab15f66SIan Rogers        "MetricName": "tma_retiring",
10611ab15f66SIan Rogers        "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
1062*ccc66c60SIan Rogers        "MetricgroupNoGroup": "TopdownL1",
10631ab15f66SIan Rogers        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
10641ab15f66SIan Rogers        "ScaleUnit": "100%"
10651ab15f66SIan Rogers    },
10661ab15f66SIan Rogers    {
10671ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
10681ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
10691ab15f66SIan Rogers        "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks",
10701ab15f66SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
10711ab15f66SIan Rogers        "MetricName": "tma_split_loads",
10721ab15f66SIan Rogers        "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
10731ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS",
10741ab15f66SIan Rogers        "ScaleUnit": "100%"
10751ab15f66SIan Rogers    },
10761ab15f66SIan Rogers    {
10771ab15f66SIan Rogers        "BriefDescription": "This metric represents rate of split store accesses",
10781ab15f66SIan Rogers        "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks",
10791ab15f66SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
10801ab15f66SIan Rogers        "MetricName": "tma_split_stores",
10811ab15f66SIan Rogers        "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
10821ab15f66SIan Rogers        "PublicDescription": "This metric represents rate of split store accesses.  Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS. Related metrics: tma_port_4",
10831ab15f66SIan Rogers        "ScaleUnit": "100%"
10841ab15f66SIan Rogers    },
10851ab15f66SIan Rogers    {
10861ab15f66SIan Rogers        "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
10871ab15f66SIan Rogers        "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks",
10881ab15f66SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
10891ab15f66SIan Rogers        "MetricName": "tma_sq_full",
10901ab15f66SIan Rogers        "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
10911ab15f66SIan Rogers        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth",
10921ab15f66SIan Rogers        "ScaleUnit": "100%"
10931ab15f66SIan Rogers    },
10941ab15f66SIan Rogers    {
10951ab15f66SIan Rogers        "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write",
10961ab15f66SIan Rogers        "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks",
10971ab15f66SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
10981ab15f66SIan Rogers        "MetricName": "tma_store_bound",
10991ab15f66SIan Rogers        "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
11001ab15f66SIan Rogers        "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS",
11011ab15f66SIan Rogers        "ScaleUnit": "100%"
11021ab15f66SIan Rogers    },
11031ab15f66SIan Rogers    {
11041ab15f66SIan Rogers        "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
11051ab15f66SIan Rogers        "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks",
11061ab15f66SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
11071ab15f66SIan Rogers        "MetricName": "tma_store_fwd_blk",
11081ab15f66SIan Rogers        "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
11091ab15f66SIan Rogers        "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
11101ab15f66SIan Rogers        "ScaleUnit": "100%"
11111ab15f66SIan Rogers    },
11121ab15f66SIan Rogers    {
11131ab15f66SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
11141ab15f66SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
11151ab15f66SIan Rogers        "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks",
11161ab15f66SIan Rogers        "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group",
11171ab15f66SIan Rogers        "MetricName": "tma_store_latency",
11181ab15f66SIan Rogers        "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
11191ab15f66SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually have less impact on out-of-order core performance; however; holding resources for a longer time can lead to undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full). Related metrics: tma_fb_full, tma_lock_latency",
11201ab15f66SIan Rogers        "ScaleUnit": "100%"
11211ab15f66SIan Rogers    },
11221ab15f66SIan Rogers    {
11231ab15f66SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
11241ab15f66SIan Rogers        "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks",
11251ab15f66SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
11261ab15f66SIan Rogers        "MetricName": "tma_store_op_utilization",
11271ab15f66SIan Rogers        "MetricThreshold": "tma_store_op_utilization > 0.6",
11281ab15f66SIan Rogers        "ScaleUnit": "100%"
11291ab15f66SIan Rogers    },
11301ab15f66SIan Rogers    {
11311ab15f66SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
11321ab15f66SIan Rogers        "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers",
11331ab15f66SIan Rogers        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
11341ab15f66SIan Rogers        "MetricName": "tma_unknown_branches",
11351ab15f66SIan Rogers        "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
11361ab15f66SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
11371ab15f66SIan Rogers        "ScaleUnit": "100%"
11381ab15f66SIan Rogers    },
11391ab15f66SIan Rogers    {
11401ab15f66SIan Rogers        "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
11411ab15f66SIan Rogers        "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS",
11421ab15f66SIan Rogers        "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
11431ab15f66SIan Rogers        "MetricName": "tma_x87_use",
11441ab15f66SIan Rogers        "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
11451ab15f66SIan Rogers        "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
11461ab15f66SIan Rogers        "ScaleUnit": "100%"
1147cf979623SAndi Kleen    }
1148cf979623SAndi Kleen]
1149