17e74ece3SIan Rogers[
27e74ece3SIan Rogers    {
37e74ece3SIan Rogers        "BriefDescription": "C10 residency percent per package",
47e74ece3SIan Rogers        "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC",
57e74ece3SIan Rogers        "MetricGroup": "Power",
67e74ece3SIan Rogers        "MetricName": "C10_Pkg_Residency",
77e74ece3SIan Rogers        "ScaleUnit": "100%"
87e74ece3SIan Rogers    },
97e74ece3SIan Rogers    {
107e74ece3SIan Rogers        "BriefDescription": "C2 residency percent per package",
117e74ece3SIan Rogers        "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC",
127e74ece3SIan Rogers        "MetricGroup": "Power",
137e74ece3SIan Rogers        "MetricName": "C2_Pkg_Residency",
147e74ece3SIan Rogers        "ScaleUnit": "100%"
157e74ece3SIan Rogers    },
167e74ece3SIan Rogers    {
177e74ece3SIan Rogers        "BriefDescription": "C3 residency percent per package",
187e74ece3SIan Rogers        "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC",
197e74ece3SIan Rogers        "MetricGroup": "Power",
207e74ece3SIan Rogers        "MetricName": "C3_Pkg_Residency",
217e74ece3SIan Rogers        "ScaleUnit": "100%"
227e74ece3SIan Rogers    },
237e74ece3SIan Rogers    {
247e74ece3SIan Rogers        "BriefDescription": "C6 residency percent per core",
257e74ece3SIan Rogers        "MetricExpr": "cstate_core@c6\\-residency@ / TSC",
267e74ece3SIan Rogers        "MetricGroup": "Power",
277e74ece3SIan Rogers        "MetricName": "C6_Core_Residency",
287e74ece3SIan Rogers        "ScaleUnit": "100%"
297e74ece3SIan Rogers    },
307e74ece3SIan Rogers    {
317e74ece3SIan Rogers        "BriefDescription": "C6 residency percent per package",
327e74ece3SIan Rogers        "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC",
337e74ece3SIan Rogers        "MetricGroup": "Power",
347e74ece3SIan Rogers        "MetricName": "C6_Pkg_Residency",
357e74ece3SIan Rogers        "ScaleUnit": "100%"
367e74ece3SIan Rogers    },
377e74ece3SIan Rogers    {
387e74ece3SIan Rogers        "BriefDescription": "C7 residency percent per core",
397e74ece3SIan Rogers        "MetricExpr": "cstate_core@c7\\-residency@ / TSC",
407e74ece3SIan Rogers        "MetricGroup": "Power",
417e74ece3SIan Rogers        "MetricName": "C7_Core_Residency",
427e74ece3SIan Rogers        "ScaleUnit": "100%"
437e74ece3SIan Rogers    },
447e74ece3SIan Rogers    {
457e74ece3SIan Rogers        "BriefDescription": "C7 residency percent per package",
467e74ece3SIan Rogers        "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC",
477e74ece3SIan Rogers        "MetricGroup": "Power",
487e74ece3SIan Rogers        "MetricName": "C7_Pkg_Residency",
497e74ece3SIan Rogers        "ScaleUnit": "100%"
507e74ece3SIan Rogers    },
517e74ece3SIan Rogers    {
527e74ece3SIan Rogers        "BriefDescription": "C8 residency percent per package",
537e74ece3SIan Rogers        "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC",
547e74ece3SIan Rogers        "MetricGroup": "Power",
557e74ece3SIan Rogers        "MetricName": "C8_Pkg_Residency",
567e74ece3SIan Rogers        "ScaleUnit": "100%"
577e74ece3SIan Rogers    },
587e74ece3SIan Rogers    {
597e74ece3SIan Rogers        "BriefDescription": "C9 residency percent per package",
607e74ece3SIan Rogers        "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC",
617e74ece3SIan Rogers        "MetricGroup": "Power",
627e74ece3SIan Rogers        "MetricName": "C9_Pkg_Residency",
637e74ece3SIan Rogers        "ScaleUnit": "100%"
647e74ece3SIan Rogers    },
657e74ece3SIan Rogers    {
        "BriefDescription": "Uncore frequency per die [GHz]",
677e74ece3SIan Rogers        "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9",
687e74ece3SIan Rogers        "MetricGroup": "SoC",
697e74ece3SIan Rogers        "MetricName": "UNCORE_FREQ"
707e74ece3SIan Rogers    },
717e74ece3SIan Rogers    {
727e74ece3SIan Rogers        "BriefDescription": "Percentage of cycles spent in System Management Interrupts.",
737e74ece3SIan Rogers        "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)",
747e74ece3SIan Rogers        "MetricGroup": "smi",
757e74ece3SIan Rogers        "MetricName": "smi_cycles",
767e74ece3SIan Rogers        "MetricThreshold": "smi_cycles > 0.1",
777e74ece3SIan Rogers        "ScaleUnit": "100%"
787e74ece3SIan Rogers    },
797e74ece3SIan Rogers    {
807e74ece3SIan Rogers        "BriefDescription": "Number of SMI interrupts.",
817e74ece3SIan Rogers        "MetricExpr": "msr@smi@",
827e74ece3SIan Rogers        "MetricGroup": "smi",
837e74ece3SIan Rogers        "MetricName": "smi_num",
847e74ece3SIan Rogers        "ScaleUnit": "1SMI#"
857e74ece3SIan Rogers    },
867e74ece3SIan Rogers    {
877e74ece3SIan Rogers        "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
88*9a7d82c1SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
897e74ece3SIan Rogers        "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks",
907e74ece3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
917e74ece3SIan Rogers        "MetricName": "tma_4k_aliasing",
927e74ece3SIan Rogers        "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
        "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. A false match is possible, which incurs a load re-issue of a few cycles. However, the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
947e74ece3SIan Rogers        "ScaleUnit": "100%"
957e74ece3SIan Rogers    },
967e74ece3SIan Rogers    {
977e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
987e74ece3SIan Rogers        "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)",
997e74ece3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
1007e74ece3SIan Rogers        "MetricName": "tma_alu_op_utilization",
1017e74ece3SIan Rogers        "MetricThreshold": "tma_alu_op_utilization > 0.6",
1027e74ece3SIan Rogers        "ScaleUnit": "100%"
1037e74ece3SIan Rogers    },
1047e74ece3SIan Rogers    {
1057e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
1067e74ece3SIan Rogers        "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots",
1077e74ece3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
1087e74ece3SIan Rogers        "MetricName": "tma_assists",
1097e74ece3SIan Rogers        "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
1107e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: ASSISTS.ANY",
1117e74ece3SIan Rogers        "ScaleUnit": "100%"
1127e74ece3SIan Rogers    },
1137e74ece3SIan Rogers    {
1147e74ece3SIan Rogers        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
1157e74ece3SIan Rogers        "DefaultMetricgroupName": "TopdownL1",
1167e74ece3SIan Rogers        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots",
1177e74ece3SIan Rogers        "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
1187e74ece3SIan Rogers        "MetricName": "tma_backend_bound",
1197e74ece3SIan Rogers        "MetricThreshold": "tma_backend_bound > 0.2",
1207e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL1;Default",
1217e74ece3SIan Rogers        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
1227e74ece3SIan Rogers        "ScaleUnit": "100%"
1237e74ece3SIan Rogers    },
1247e74ece3SIan Rogers    {
1257e74ece3SIan Rogers        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
1267e74ece3SIan Rogers        "DefaultMetricgroupName": "TopdownL1",
1277e74ece3SIan Rogers        "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)",
1287e74ece3SIan Rogers        "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
1297e74ece3SIan Rogers        "MetricName": "tma_bad_speculation",
1307e74ece3SIan Rogers        "MetricThreshold": "tma_bad_speculation > 0.15",
1317e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL1;Default",
        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example, wasted work due to mispredicted branches is categorized under the Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
1337e74ece3SIan Rogers        "ScaleUnit": "100%"
1347e74ece3SIan Rogers    },
1357e74ece3SIan Rogers    {
1367e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.",
1377e74ece3SIan Rogers        "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)",
1387e74ece3SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
1397e74ece3SIan Rogers        "MetricName": "tma_branch_instructions",
1407e74ece3SIan Rogers        "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6",
1417e74ece3SIan Rogers        "ScaleUnit": "100%"
1427e74ece3SIan Rogers    },
1437e74ece3SIan Rogers    {
1447e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
1457e74ece3SIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * tma_bad_speculation",
1467e74ece3SIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
1477e74ece3SIan Rogers        "MetricName": "tma_branch_mispredicts",
1487e74ece3SIan Rogers        "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
1497e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
1507e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers",
1517e74ece3SIan Rogers        "ScaleUnit": "100%"
1527e74ece3SIan Rogers    },
1537e74ece3SIan Rogers    {
1547e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
1557e74ece3SIan Rogers        "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches",
1567e74ece3SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group",
1577e74ece3SIan Rogers        "MetricName": "tma_branch_resteers",
1587e74ece3SIan Rogers        "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from the corrected path, following all sorts of mispredicted branches. For example, branchy code with lots of mispredictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
1607e74ece3SIan Rogers        "ScaleUnit": "100%"
1617e74ece3SIan Rogers    },
1627e74ece3SIan Rogers    {
1637e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
1647e74ece3SIan Rogers        "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
1657e74ece3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
1667e74ece3SIan Rogers        "MetricName": "tma_cisc",
1677e74ece3SIan Rogers        "MetricThreshold": "tma_cisc > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
1687e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.",
1697e74ece3SIan Rogers        "ScaleUnit": "100%"
1707e74ece3SIan Rogers    },
1717e74ece3SIan Rogers    {
1727e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
1737e74ece3SIan Rogers        "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks",
1747e74ece3SIan Rogers        "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC",
1757e74ece3SIan Rogers        "MetricName": "tma_clears_resteers",
1767e74ece3SIan Rogers        "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
1777e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches",
1787e74ece3SIan Rogers        "ScaleUnit": "100%"
1797e74ece3SIan Rogers    },
1807e74ece3SIan Rogers    {
1817e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
1827e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
1837e74ece3SIan Rogers        "MetricExpr": "(29 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
1847e74ece3SIan Rogers        "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
1857e74ece3SIan Rogers        "MetricName": "tma_contested_accesses",
1867e74ece3SIan Rogers        "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
1877e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS. Related metrics: tma_data_sharing, tma_false_sharing, tma_machine_clears, tma_remote_cache",
1887e74ece3SIan Rogers        "ScaleUnit": "100%"
1897e74ece3SIan Rogers    },
1907e74ece3SIan Rogers    {
        "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck",
1927e74ece3SIan Rogers        "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)",
1937e74ece3SIan Rogers        "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
1947e74ece3SIan Rogers        "MetricName": "tma_core_bound",
1957e74ece3SIan Rogers        "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
1967e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
        "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were a bottleneck.  Shortage in hardware compute resources, or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource, certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
1987e74ece3SIan Rogers        "ScaleUnit": "100%"
1997e74ece3SIan Rogers    },
2007e74ece3SIan Rogers    {
2017e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
2027e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
2037e74ece3SIan Rogers        "MetricExpr": "23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
2047e74ece3SIan Rogers        "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
2057e74ece3SIan Rogers        "MetricName": "tma_data_sharing",
2067e74ece3SIan Rogers        "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
2077e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS. Related metrics: tma_contested_accesses, tma_false_sharing, tma_machine_clears, tma_remote_cache",
2087e74ece3SIan Rogers        "ScaleUnit": "100%"
2097e74ece3SIan Rogers    },
2107e74ece3SIan Rogers    {
2117e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder",
2127e74ece3SIan Rogers        "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
2137e74ece3SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
2147e74ece3SIan Rogers        "MetricName": "tma_decoder0_alone",
2157e74ece3SIan Rogers        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
2167e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
2177e74ece3SIan Rogers        "ScaleUnit": "100%"
2187e74ece3SIan Rogers    },
2197e74ece3SIan Rogers    {
2207e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
2217e74ece3SIan Rogers        "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks",
2227e74ece3SIan Rogers        "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group",
2237e74ece3SIan Rogers        "MetricName": "tma_divider",
2247e74ece3SIan Rogers        "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
2257e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
2267e74ece3SIan Rogers        "ScaleUnit": "100%"
2277e74ece3SIan Rogers    },
2287e74ece3SIan Rogers    {
2297e74ece3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
2307e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
2317e74ece3SIan Rogers        "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound",
2327e74ece3SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
2337e74ece3SIan Rogers        "MetricName": "tma_dram_bound",
2347e74ece3SIan Rogers        "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
2357e74ece3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
2367e74ece3SIan Rogers        "ScaleUnit": "100%"
2377e74ece3SIan Rogers    },
2387e74ece3SIan Rogers    {
2397e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
2407e74ece3SIan Rogers        "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
2417e74ece3SIan Rogers        "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
2427e74ece3SIan Rogers        "MetricName": "tma_dsb",
2437e74ece3SIan Rogers        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
2447e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
2457e74ece3SIan Rogers        "ScaleUnit": "100%"
2467e74ece3SIan Rogers    },
2477e74ece3SIan Rogers    {
2487e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
2497e74ece3SIan Rogers        "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks",
2507e74ece3SIan Rogers        "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
2517e74ece3SIan Rogers        "MetricName": "tma_dsb_switches",
2527e74ece3SIan Rogers        "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties, hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
2547e74ece3SIan Rogers        "ScaleUnit": "100%"
2557e74ece3SIan Rogers    },
2567e74ece3SIan Rogers    {
2577e74ece3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
2587e74ece3SIan Rogers        "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks",
2597e74ece3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
2607e74ece3SIan Rogers        "MetricName": "tma_dtlb_load",
2617e74ece3SIan Rogers        "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
2627e74ece3SIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
2637e74ece3SIan Rogers        "ScaleUnit": "100%"
2647e74ece3SIan Rogers    },
2657e74ece3SIan Rogers    {
2667e74ece3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
2677e74ece3SIan Rogers        "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks",
2687e74ece3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
2697e74ece3SIan Rogers        "MetricName": "tma_dtlb_store",
2707e74ece3SIan Rogers        "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
2717e74ece3SIan Rogers        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
2727e74ece3SIan Rogers        "ScaleUnit": "100%"
2737e74ece3SIan Rogers    },
2747e74ece3SIan Rogers    {
2757e74ece3SIan Rogers        "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
2767e74ece3SIan Rogers        "MetricExpr": "32.5 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
2777e74ece3SIan Rogers        "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
2787e74ece3SIan Rogers        "MetricName": "tma_false_sharing",
2797e74ece3SIan Rogers        "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
2807e74ece3SIan Rogers        "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM. Related metrics: tma_contested_accesses, tma_data_sharing, tma_machine_clears, tma_remote_cache",
2817e74ece3SIan Rogers        "ScaleUnit": "100%"
2827e74ece3SIan Rogers    },
2837e74ece3SIan Rogers    {
2847e74ece3SIan Rogers        "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
2857e74ece3SIan Rogers        "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks",
2867e74ece3SIan Rogers        "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
2877e74ece3SIan Rogers        "MetricName": "tma_fb_full",
2887e74ece3SIan Rogers        "MetricThreshold": "tma_fb_full > 0.3",
        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value, the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints at approaching bandwidth limits (to L2 cache, L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
2907e74ece3SIan Rogers        "ScaleUnit": "100%"
2917e74ece3SIan Rogers    },
2927e74ece3SIan Rogers    {
2937e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
2947e74ece3SIan Rogers        "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
2957e74ece3SIan Rogers        "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
2967e74ece3SIan Rogers        "MetricName": "tma_fetch_bandwidth",
2977e74ece3SIan Rogers        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35",
2987e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
2997e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
3007e74ece3SIan Rogers        "ScaleUnit": "100%"
3017e74ece3SIan Rogers    },
3027e74ece3SIan Rogers    {
3037e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
3047e74ece3SIan Rogers        "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_thread_slots",
3057e74ece3SIan Rogers        "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
3067e74ece3SIan Rogers        "MetricName": "tma_fetch_latency",
3077e74ece3SIan Rogers        "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
3087e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
3097e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
3107e74ece3SIan Rogers        "ScaleUnit": "100%"
3117e74ece3SIan Rogers    },
3127e74ece3SIan Rogers    {
3137e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops",
3147e74ece3SIan Rogers        "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer",
3157e74ece3SIan Rogers        "MetricGroup": "TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueD0",
3167e74ece3SIan Rogers        "MetricName": "tma_few_uops_instructions",
3177e74ece3SIan Rogers        "MetricThreshold": "tma_few_uops_instructions > 0.05 & tma_heavy_operations > 0.1",
3187e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions. Related metrics: tma_decoder0_alone",
3197e74ece3SIan Rogers        "ScaleUnit": "100%"
3207e74ece3SIan Rogers    },
3217e74ece3SIan Rogers    {
3227e74ece3SIan Rogers        "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
3237e74ece3SIan Rogers        "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
3247e74ece3SIan Rogers        "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
3257e74ece3SIan Rogers        "MetricName": "tma_fp_arith",
3267e74ece3SIan Rogers        "MetricThreshold": "tma_fp_arith > 0.2 & tma_light_operations > 0.6",
3277e74ece3SIan Rogers        "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
3287e74ece3SIan Rogers        "ScaleUnit": "100%"
3297e74ece3SIan Rogers    },
3307e74ece3SIan Rogers    {
3317e74ece3SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
3327e74ece3SIan Rogers        "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)",
3337e74ece3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
3347e74ece3SIan Rogers        "MetricName": "tma_fp_scalar",
3357e74ece3SIan Rogers        "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
3367e74ece3SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3377e74ece3SIan Rogers        "ScaleUnit": "100%"
3387e74ece3SIan Rogers    },
3397e74ece3SIan Rogers    {
3407e74ece3SIan Rogers        "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
3417e74ece3SIan Rogers        "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_thread_slots)",
3427e74ece3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
3437e74ece3SIan Rogers        "MetricName": "tma_fp_vector",
3447e74ece3SIan Rogers        "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
3457e74ece3SIan Rogers        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3467e74ece3SIan Rogers        "ScaleUnit": "100%"
3477e74ece3SIan Rogers    },
3487e74ece3SIan Rogers    {
3497e74ece3SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
3507e74ece3SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)",
3517e74ece3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
3527e74ece3SIan Rogers        "MetricName": "tma_fp_vector_128b",
3537e74ece3SIan Rogers        "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
3547e74ece3SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3557e74ece3SIan Rogers        "ScaleUnit": "100%"
3567e74ece3SIan Rogers    },
3577e74ece3SIan Rogers    {
3587e74ece3SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
3597e74ece3SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)",
3607e74ece3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
3617e74ece3SIan Rogers        "MetricName": "tma_fp_vector_256b",
3627e74ece3SIan Rogers        "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
3637e74ece3SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3647e74ece3SIan Rogers        "ScaleUnit": "100%"
3657e74ece3SIan Rogers    },
3667e74ece3SIan Rogers    {
3677e74ece3SIan Rogers        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors",
3687e74ece3SIan Rogers        "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)",
3697e74ece3SIan Rogers        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
3707e74ece3SIan Rogers        "MetricName": "tma_fp_vector_512b",
3717e74ece3SIan Rogers        "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
3727e74ece3SIan Rogers        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
3737e74ece3SIan Rogers        "ScaleUnit": "100%"
3747e74ece3SIan Rogers    },
3757e74ece3SIan Rogers    {
3767e74ece3SIan Rogers        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
3777e74ece3SIan Rogers        "DefaultMetricgroupName": "TopdownL1",
3787e74ece3SIan Rogers        "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots",
3797e74ece3SIan Rogers        "MetricGroup": "Default;PGO;TmaL1;TopdownL1;tma_L1_group",
3807e74ece3SIan Rogers        "MetricName": "tma_frontend_bound",
3817e74ece3SIan Rogers        "MetricThreshold": "tma_frontend_bound > 0.15",
3827e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL1;Default",
3837e74ece3SIan Rogers        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
3847e74ece3SIan Rogers        "ScaleUnit": "100%"
3857e74ece3SIan Rogers    },
3867e74ece3SIan Rogers    {
3877e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences",
3887e74ece3SIan Rogers        "MetricExpr": "tma_microcode_sequencer + tma_retiring * (UOPS_DECODED.DEC0 - cpu@UOPS_DECODED.DEC0\\,cmask\\=1@) / IDQ.MITE_UOPS",
3897e74ece3SIan Rogers        "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
3907e74ece3SIan Rogers        "MetricName": "tma_heavy_operations",
3917e74ece3SIan Rogers        "MetricThreshold": "tma_heavy_operations > 0.1",
3927e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
3937e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
3947e74ece3SIan Rogers        "ScaleUnit": "100%"
3957e74ece3SIan Rogers    },
3967e74ece3SIan Rogers    {
3977e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
3987e74ece3SIan Rogers        "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks",
3997e74ece3SIan Rogers        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
4007e74ece3SIan Rogers        "MetricName": "tma_icache_misses",
4017e74ece3SIan Rogers        "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
4027e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
4037e74ece3SIan Rogers        "ScaleUnit": "100%"
4047e74ece3SIan Rogers    },
4057e74ece3SIan Rogers    {
4067e74ece3SIan Rogers        "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
4077e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
4087e74ece3SIan Rogers        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
4097e74ece3SIan Rogers        "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
4107e74ece3SIan Rogers        "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
4117e74ece3SIan Rogers        "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
4127e74ece3SIan Rogers    },
4137e74ece3SIan Rogers    {
4147e74ece3SIan Rogers        "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).",
4157e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN",
4167e74ece3SIan Rogers        "MetricGroup": "Bad;BrMispredicts",
4177e74ece3SIan Rogers        "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken",
4187e74ece3SIan Rogers        "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200"
4197e74ece3SIan Rogers    },
4207e74ece3SIan Rogers    {
4217e74ece3SIan Rogers        "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).",
4227e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN",
4237e74ece3SIan Rogers        "MetricGroup": "Bad;BrMispredicts",
4247e74ece3SIan Rogers        "MetricName": "tma_info_bad_spec_ipmisp_cond_taken",
4257e74ece3SIan Rogers        "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200"
4267e74ece3SIan Rogers    },
4277e74ece3SIan Rogers    {
4287e74ece3SIan Rogers        "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
4297e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT",
4307e74ece3SIan Rogers        "MetricGroup": "Bad;BrMispredicts",
4317e74ece3SIan Rogers        "MetricName": "tma_info_bad_spec_ipmisp_indirect",
4327e74ece3SIan Rogers        "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
4337e74ece3SIan Rogers    },
4347e74ece3SIan Rogers    {
4357e74ece3SIan Rogers        "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).",
4367e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET",
4377e74ece3SIan Rogers        "MetricGroup": "Bad;BrMispredicts",
4387e74ece3SIan Rogers        "MetricName": "tma_info_bad_spec_ipmisp_ret",
4397e74ece3SIan Rogers        "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500"
4407e74ece3SIan Rogers    },
4417e74ece3SIan Rogers    {
4427e74ece3SIan Rogers        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
4437e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
4447e74ece3SIan Rogers        "MetricGroup": "Bad;BadSpec;BrMispredicts",
4457e74ece3SIan Rogers        "MetricName": "tma_info_bad_spec_ipmispredict",
4467e74ece3SIan Rogers        "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
4477e74ece3SIan Rogers    },
4487e74ece3SIan Rogers    {
4497e74ece3SIan Rogers        "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
4507e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
4517e74ece3SIan Rogers        "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
4527e74ece3SIan Rogers        "MetricGroup": "Cor;SMT",
4537e74ece3SIan Rogers        "MetricName": "tma_info_botlnk_l0_core_bound_likely",
4547e74ece3SIan Rogers        "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5"
4557e74ece3SIan Rogers    },
4567e74ece3SIan Rogers    {
4577e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck",
4587e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
4597e74ece3SIan Rogers        "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))",
4607e74ece3SIan Rogers        "MetricGroup": "DSBmiss;Fed;tma_issueFB",
4617e74ece3SIan Rogers        "MetricName": "tma_info_botlnk_l2_dsb_misses",
4627e74ece3SIan Rogers        "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10",
4637e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp"
4647e74ece3SIan Rogers    },
4657e74ece3SIan Rogers    {
4667e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck",
467*9a7d82c1SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
4687e74ece3SIan Rogers        "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
4697e74ece3SIan Rogers        "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL",
4707e74ece3SIan Rogers        "MetricName": "tma_info_botlnk_l2_ic_misses",
4717e74ece3SIan Rogers        "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5",
4727e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck."
4737e74ece3SIan Rogers    },
4747e74ece3SIan Rogers    {
4757e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
4767e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
4777e74ece3SIan Rogers        "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
4787e74ece3SIan Rogers        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
4797e74ece3SIan Rogers        "MetricName": "tma_info_bottleneck_big_code",
4807e74ece3SIan Rogers        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
4817e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
4827e74ece3SIan Rogers    },
4837e74ece3SIan Rogers    {
4847e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
4857e74ece3SIan Rogers        "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
4867e74ece3SIan Rogers        "MetricGroup": "Ret;tma_issueBC",
4877e74ece3SIan Rogers        "MetricName": "tma_info_bottleneck_branching_overhead",
4887e74ece3SIan Rogers        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
4897e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
4907e74ece3SIan Rogers    },
4917e74ece3SIan Rogers    {
4927e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
4937e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
4947e74ece3SIan Rogers        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
4957e74ece3SIan Rogers        "MetricGroup": "Fed;FetchBW;Frontend",
4967e74ece3SIan Rogers        "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
4977e74ece3SIan Rogers        "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
4987e74ece3SIan Rogers    },
4997e74ece3SIan Rogers    {
5007e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
501*9a7d82c1SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
5027e74ece3SIan Rogers        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
5037e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
5047e74ece3SIan Rogers        "MetricName": "tma_info_bottleneck_memory_bandwidth",
5057e74ece3SIan Rogers        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
5067e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
5077e74ece3SIan Rogers    },
5087e74ece3SIan Rogers    {
5097e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
5107e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
5117e74ece3SIan Rogers        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
5127e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
5137e74ece3SIan Rogers        "MetricName": "tma_info_bottleneck_memory_data_tlbs",
5147e74ece3SIan Rogers        "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
5157e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
5167e74ece3SIan Rogers    },
5177e74ece3SIan Rogers    {
5187e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
5197e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
5207e74ece3SIan Rogers        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
5217e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
5227e74ece3SIan Rogers        "MetricName": "tma_info_bottleneck_memory_latency",
5237e74ece3SIan Rogers        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
5247e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
5257e74ece3SIan Rogers    },
5267e74ece3SIan Rogers    {
5277e74ece3SIan Rogers        "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
5287e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
5297e74ece3SIan Rogers        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
5307e74ece3SIan Rogers        "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
5317e74ece3SIan Rogers        "MetricName": "tma_info_bottleneck_mispredictions",
5327e74ece3SIan Rogers        "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
5337e74ece3SIan Rogers        "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
5347e74ece3SIan Rogers    },
5357e74ece3SIan Rogers    {
5367e74ece3SIan Rogers        "BriefDescription": "Fraction of branches that are CALL or RET",
5377e74ece3SIan Rogers        "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
5387e74ece3SIan Rogers        "MetricGroup": "Bad;Branches",
5397e74ece3SIan Rogers        "MetricName": "tma_info_branches_callret"
5407e74ece3SIan Rogers    },
5417e74ece3SIan Rogers    {
5427e74ece3SIan Rogers        "BriefDescription": "Fraction of branches that are non-taken conditionals",
5437e74ece3SIan Rogers        "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES",
5447e74ece3SIan Rogers        "MetricGroup": "Bad;Branches;CodeGen;PGO",
5457e74ece3SIan Rogers        "MetricName": "tma_info_branches_cond_nt"
5467e74ece3SIan Rogers    },
5477e74ece3SIan Rogers    {
5487e74ece3SIan Rogers        "BriefDescription": "Fraction of branches that are taken conditionals",
5497e74ece3SIan Rogers        "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES",
5507e74ece3SIan Rogers        "MetricGroup": "Bad;Branches;CodeGen;PGO",
5517e74ece3SIan Rogers        "MetricName": "tma_info_branches_cond_tk"
5527e74ece3SIan Rogers    },
5537e74ece3SIan Rogers    {
5547e74ece3SIan Rogers        "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps",
5557e74ece3SIan Rogers        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
5567e74ece3SIan Rogers        "MetricGroup": "Bad;Branches",
5577e74ece3SIan Rogers        "MetricName": "tma_info_branches_jump"
5587e74ece3SIan Rogers    },
5597e74ece3SIan Rogers    {
5607e74ece3SIan Rogers        "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)",
5617e74ece3SIan Rogers        "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)",
5627e74ece3SIan Rogers        "MetricGroup": "Bad;Branches",
5637e74ece3SIan Rogers        "MetricName": "tma_info_branches_other_branches"
5647e74ece3SIan Rogers    },
5657e74ece3SIan Rogers    {
5667e74ece3SIan Rogers        "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
5677e74ece3SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
5687e74ece3SIan Rogers        "MetricGroup": "SMT",
5697e74ece3SIan Rogers        "MetricName": "tma_info_core_core_clks"
5707e74ece3SIan Rogers    },
5717e74ece3SIan Rogers    {
5727e74ece3SIan Rogers        "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
5737e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks",
5747e74ece3SIan Rogers        "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group",
5757e74ece3SIan Rogers        "MetricName": "tma_info_core_coreipc"
5767e74ece3SIan Rogers    },
5777e74ece3SIan Rogers    {
5787e74ece3SIan Rogers        "BriefDescription": "Floating Point Operations Per Cycle",
5797e74ece3SIan Rogers        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
5807e74ece3SIan Rogers        "MetricGroup": "Flops;Ret",
5817e74ece3SIan Rogers        "MetricName": "tma_info_core_flopc"
5827e74ece3SIan Rogers    },
5837e74ece3SIan Rogers    {
5847e74ece3SIan Rogers        "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
5857e74ece3SIan Rogers        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)",
5867e74ece3SIan Rogers        "MetricGroup": "Cor;Flops;HPC",
5877e74ece3SIan Rogers        "MetricName": "tma_info_core_fp_arith_utilization",
5887e74ece3SIan Rogers        "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
5897e74ece3SIan Rogers    },
5907e74ece3SIan Rogers    {
5917e74ece3SIan Rogers        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
5927e74ece3SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
5937e74ece3SIan Rogers        "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
5947e74ece3SIan Rogers        "MetricName": "tma_info_core_ilp"
5957e74ece3SIan Rogers    },
5967e74ece3SIan Rogers    {
5977e74ece3SIan Rogers        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
5987e74ece3SIan Rogers        "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY",
5997e74ece3SIan Rogers        "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
6007e74ece3SIan Rogers        "MetricName": "tma_info_frontend_dsb_coverage",
6017e74ece3SIan Rogers        "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 5 > 0.35",
6027e74ece3SIan Rogers        "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp"
6037e74ece3SIan Rogers    },
6047e74ece3SIan Rogers    {
6057e74ece3SIan Rogers        "BriefDescription": "Average number of cycles of a switch from the DSB fetch unit to the MITE fetch unit - see DSB_Switches tree node for details.",
6067e74ece3SIan Rogers        "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@",
6077e74ece3SIan Rogers        "MetricGroup": "DSBmiss",
6087e74ece3SIan Rogers        "MetricName": "tma_info_frontend_dsb_switch_cost"
6097e74ece3SIan Rogers    },
6107e74ece3SIan Rogers    {
6117e74ece3SIan Rogers        "BriefDescription": "Average number of Uops issued by front-end when it issued something",
6127e74ece3SIan Rogers        "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@",
6137e74ece3SIan Rogers        "MetricGroup": "Fed;FetchBW",
6147e74ece3SIan Rogers        "MetricName": "tma_info_frontend_fetch_upc"
6157e74ece3SIan Rogers    },
6167e74ece3SIan Rogers    {
6177e74ece3SIan Rogers        "BriefDescription": "Average Latency for L1 instruction cache misses",
6187e74ece3SIan Rogers        "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@",
6197e74ece3SIan Rogers        "MetricGroup": "Fed;FetchLat;IcMiss",
6207e74ece3SIan Rogers        "MetricName": "tma_info_frontend_icache_miss_latency"
6217e74ece3SIan Rogers    },
6227e74ece3SIan Rogers    {
6237e74ece3SIan Rogers        "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)",
6247e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS",
6257e74ece3SIan Rogers        "MetricGroup": "DSBmiss;Fed",
6267e74ece3SIan Rogers        "MetricName": "tma_info_frontend_ipdsb_miss_ret",
6277e74ece3SIan Rogers        "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50"
6287e74ece3SIan Rogers    },
6297e74ece3SIan Rogers    {
6307e74ece3SIan Rogers        "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
6317e74ece3SIan Rogers        "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY",
6327e74ece3SIan Rogers        "MetricGroup": "Fed",
6337e74ece3SIan Rogers        "MetricName": "tma_info_frontend_ipunknown_branch"
6347e74ece3SIan Rogers    },
6357e74ece3SIan Rogers    {
6367e74ece3SIan Rogers        "BriefDescription": "L2 cache true code cacheline misses per kilo instruction",
6377e74ece3SIan Rogers        "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY",
6387e74ece3SIan Rogers        "MetricGroup": "IcMiss",
6397e74ece3SIan Rogers        "MetricName": "tma_info_frontend_l2mpki_code"
6407e74ece3SIan Rogers    },
6417e74ece3SIan Rogers    {
6427e74ece3SIan Rogers        "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction",
6437e74ece3SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY",
6447e74ece3SIan Rogers        "MetricGroup": "IcMiss",
6457e74ece3SIan Rogers        "MetricName": "tma_info_frontend_l2mpki_code_all"
6467e74ece3SIan Rogers    },
6477e74ece3SIan Rogers    {
6487e74ece3SIan Rogers        "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)",
6497e74ece3SIan Rogers        "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY",
6507e74ece3SIan Rogers        "MetricGroup": "Fed;LSD",
6517e74ece3SIan Rogers        "MetricName": "tma_info_frontend_lsd_coverage"
6527e74ece3SIan Rogers    },
6537e74ece3SIan Rogers    {
6547e74ece3SIan Rogers        "BriefDescription": "Branch instructions per taken branch.",
6557e74ece3SIan Rogers        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
6567e74ece3SIan Rogers        "MetricGroup": "Branches;Fed;PGO",
6577e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_bptkbranch"
6587e74ece3SIan Rogers    },
6597e74ece3SIan Rogers    {
6607e74ece3SIan Rogers        "BriefDescription": "Total number of retired Instructions",
6617e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY",
6627e74ece3SIan Rogers        "MetricGroup": "Summary;TmaL1;tma_L1_group",
6637e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_instructions",
6647e74ece3SIan Rogers        "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST"
6657e74ece3SIan Rogers    },
6667e74ece3SIan Rogers    {
6677e74ece3SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
6687e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)",
6697e74ece3SIan Rogers        "MetricGroup": "Flops;InsType",
6707e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_iparith",
6717e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith < 10",
6727e74ece3SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
6737e74ece3SIan Rogers    },
6747e74ece3SIan Rogers    {
6757e74ece3SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
6767e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
6777e74ece3SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
6787e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_iparith_avx128",
6797e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
6807e74ece3SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
6817e74ece3SIan Rogers    },
6827e74ece3SIan Rogers    {
6837e74ece3SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
6847e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
6857e74ece3SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
6867e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_iparith_avx256",
6877e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
6887e74ece3SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
6897e74ece3SIan Rogers    },
6907e74ece3SIan Rogers    {
6917e74ece3SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
6927e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
6937e74ece3SIan Rogers        "MetricGroup": "Flops;FpVector;InsType",
6947e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_iparith_avx512",
6957e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
6967e74ece3SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
6977e74ece3SIan Rogers    },
6987e74ece3SIan Rogers    {
6997e74ece3SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
7007e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
7017e74ece3SIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
7027e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
7037e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
7047e74ece3SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
7057e74ece3SIan Rogers    },
7067e74ece3SIan Rogers    {
7077e74ece3SIan Rogers        "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
7087e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
7097e74ece3SIan Rogers        "MetricGroup": "Flops;FpScalar;InsType",
7107e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
7117e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
7127e74ece3SIan Rogers        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
7137e74ece3SIan Rogers    },
7147e74ece3SIan Rogers    {
7157e74ece3SIan Rogers        "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
7167e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
7177e74ece3SIan Rogers        "MetricGroup": "Branches;Fed;InsType",
7187e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_ipbranch",
7197e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipbranch < 8"
7207e74ece3SIan Rogers    },
7217e74ece3SIan Rogers    {
7227e74ece3SIan Rogers        "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)",
7237e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
7247e74ece3SIan Rogers        "MetricGroup": "Branches;Fed;PGO",
7257e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_ipcall",
7267e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipcall < 200"
7277e74ece3SIan Rogers    },
7287e74ece3SIan Rogers    {
7297e74ece3SIan Rogers        "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
7307e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
7317e74ece3SIan Rogers        "MetricGroup": "Flops;InsType",
7327e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_ipflop",
7337e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
7347e74ece3SIan Rogers    },
7357e74ece3SIan Rogers    {
7367e74ece3SIan Rogers        "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)",
7377e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS",
7387e74ece3SIan Rogers        "MetricGroup": "InsType",
7397e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_ipload",
7407e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipload < 3"
7417e74ece3SIan Rogers    },
7427e74ece3SIan Rogers    {
7437e74ece3SIan Rogers        "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
7447e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
7457e74ece3SIan Rogers        "MetricGroup": "InsType",
7467e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_ipstore",
7477e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipstore < 8"
7487e74ece3SIan Rogers    },
7497e74ece3SIan Rogers    {
7507e74ece3SIan Rogers        "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)",
7517e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@",
7527e74ece3SIan Rogers        "MetricGroup": "Prefetches",
7537e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_ipswpf",
7547e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_ipswpf < 100"
7557e74ece3SIan Rogers    },
7567e74ece3SIan Rogers    {
7577e74ece3SIan Rogers        "BriefDescription": "Instructions per taken branch",
7587e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
7597e74ece3SIan Rogers        "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB",
7607e74ece3SIan Rogers        "MetricName": "tma_info_inst_mix_iptb",
7617e74ece3SIan Rogers        "MetricThreshold": "tma_info_inst_mix_iptb < 11",
7627e74ece3SIan Rogers        "PublicDescription": "Instructions per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp"
7637e74ece3SIan Rogers    },
7647e74ece3SIan Rogers    {
7657e74ece3SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
7667e74ece3SIan Rogers        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
7677e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW",
7687e74ece3SIan Rogers        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
7697e74ece3SIan Rogers    },
7707e74ece3SIan Rogers    {
7717e74ece3SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
7727e74ece3SIan Rogers        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
7737e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW",
7747e74ece3SIan Rogers        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
7757e74ece3SIan Rogers    },
7767e74ece3SIan Rogers    {
7777e74ece3SIan Rogers        "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
7787e74ece3SIan Rogers        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
7797e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
7807e74ece3SIan Rogers        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
7817e74ece3SIan Rogers    },
7827e74ece3SIan Rogers    {
7837e74ece3SIan Rogers        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
7847e74ece3SIan Rogers        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
7857e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW",
7867e74ece3SIan Rogers        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
7877e74ece3SIan Rogers    },
7887e74ece3SIan Rogers    {
7897e74ece3SIan Rogers        "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
7907e74ece3SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
7917e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem",
7927e74ece3SIan Rogers        "MetricName": "tma_info_memory_fb_hpki"
7937e74ece3SIan Rogers    },
7947e74ece3SIan Rogers    {
7957e74ece3SIan Rogers        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
7967e74ece3SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
7977e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem",
7987e74ece3SIan Rogers        "MetricName": "tma_info_memory_l1mpki"
7997e74ece3SIan Rogers    },
8007e74ece3SIan Rogers    {
8017e74ece3SIan Rogers        "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
8027e74ece3SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
8037e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem",
8047e74ece3SIan Rogers        "MetricName": "tma_info_memory_l1mpki_load"
8057e74ece3SIan Rogers    },
8067e74ece3SIan Rogers    {
8077e74ece3SIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
8087e74ece3SIan Rogers        "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
8097e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem",
8107e74ece3SIan Rogers        "MetricName": "tma_info_memory_l2hpki_all"
8117e74ece3SIan Rogers    },
8127e74ece3SIan Rogers    {
8137e74ece3SIan Rogers        "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
8147e74ece3SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
8157e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem",
8167e74ece3SIan Rogers        "MetricName": "tma_info_memory_l2hpki_load"
8177e74ece3SIan Rogers    },
8187e74ece3SIan Rogers    {
8197e74ece3SIan Rogers        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
8207e74ece3SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
8217e74ece3SIan Rogers        "MetricGroup": "Backend;CacheMisses;Mem",
8227e74ece3SIan Rogers        "MetricName": "tma_info_memory_l2mpki"
8237e74ece3SIan Rogers    },
8247e74ece3SIan Rogers    {
8257e74ece3SIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
8267e74ece3SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
8277e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem;Offcore",
8287e74ece3SIan Rogers        "MetricName": "tma_info_memory_l2mpki_all"
8297e74ece3SIan Rogers    },
8307e74ece3SIan Rogers    {
8317e74ece3SIan Rogers        "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
8327e74ece3SIan Rogers        "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
8337e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem",
8347e74ece3SIan Rogers        "MetricName": "tma_info_memory_l2mpki_load"
8357e74ece3SIan Rogers    },
8367e74ece3SIan Rogers    {
8377e74ece3SIan Rogers        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
8387e74ece3SIan Rogers        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
8397e74ece3SIan Rogers        "MetricGroup": "CacheMisses;Mem",
8407e74ece3SIan Rogers        "MetricName": "tma_info_memory_l3mpki"
8417e74ece3SIan Rogers    },
8427e74ece3SIan Rogers    {
8437e74ece3SIan Rogers        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
8447e74ece3SIan Rogers        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
8457e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBound;MemoryLat",
8467e74ece3SIan Rogers        "MetricName": "tma_info_memory_load_miss_real_latency"
8477e74ece3SIan Rogers    },
8487e74ece3SIan Rogers    {
8497e74ece3SIan Rogers        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand loads when there is at least one such miss)",
8507e74ece3SIan Rogers        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
8517e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW;MemoryBound",
8527e74ece3SIan Rogers        "MetricName": "tma_info_memory_mlp",
8537e74ece3SIan Rogers        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand loads when there is at least one such miss). Per-Logical Processor"
8547e74ece3SIan Rogers    },
8557e74ece3SIan Rogers    {
8567e74ece3SIan Rogers        "BriefDescription": "Average Parallel L2 cache miss data reads",
8577e74ece3SIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
8587e74ece3SIan Rogers        "MetricGroup": "Memory_BW;Offcore",
8597e74ece3SIan Rogers        "MetricName": "tma_info_memory_oro_data_l2_mlp"
8607e74ece3SIan Rogers    },
8617e74ece3SIan Rogers    {
8627e74ece3SIan Rogers        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
8637e74ece3SIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
8647e74ece3SIan Rogers        "MetricGroup": "Memory_Lat;Offcore",
8657e74ece3SIan Rogers        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
8667e74ece3SIan Rogers    },
8677e74ece3SIan Rogers    {
8687e74ece3SIan Rogers        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
8697e74ece3SIan Rogers        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
8707e74ece3SIan Rogers        "MetricGroup": "Memory_BW;Offcore",
8717e74ece3SIan Rogers        "MetricName": "tma_info_memory_oro_load_l2_mlp"
8727e74ece3SIan Rogers    },
8737e74ece3SIan Rogers    {
8747e74ece3SIan Rogers        "BriefDescription": "Average Latency for L3 cache miss demand Loads",
8757e74ece3SIan Rogers        "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
8767e74ece3SIan Rogers        "MetricGroup": "Memory_Lat;Offcore",
8777e74ece3SIan Rogers        "MetricName": "tma_info_memory_oro_load_l3_miss_latency"
8787e74ece3SIan Rogers    },
8797e74ece3SIan Rogers    {
8807e74ece3SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
8817e74ece3SIan Rogers        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
8827e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW",
8837e74ece3SIan Rogers        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
8847e74ece3SIan Rogers    },
8857e74ece3SIan Rogers    {
8867e74ece3SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
8877e74ece3SIan Rogers        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
8887e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW",
8897e74ece3SIan Rogers        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
8907e74ece3SIan Rogers    },
8917e74ece3SIan Rogers    {
8927e74ece3SIan Rogers        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
8937e74ece3SIan Rogers        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
8947e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW;Offcore",
8957e74ece3SIan Rogers        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
8967e74ece3SIan Rogers    },
8977e74ece3SIan Rogers    {
8987e74ece3SIan Rogers        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
8997e74ece3SIan Rogers        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
9007e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW",
9017e74ece3SIan Rogers        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
9027e74ece3SIan Rogers    },
9037e74ece3SIan Rogers    {
9047e74ece3SIan Rogers        "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
9057e74ece3SIan Rogers        "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
9067e74ece3SIan Rogers        "MetricGroup": "Fed;MemoryTLB",
9077e74ece3SIan Rogers        "MetricName": "tma_info_memory_tlb_code_stlb_mpki"
9087e74ece3SIan Rogers    },
9097e74ece3SIan Rogers    {
9107e74ece3SIan Rogers        "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
9117e74ece3SIan Rogers        "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
9127e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryTLB",
9137e74ece3SIan Rogers        "MetricName": "tma_info_memory_tlb_load_stlb_mpki"
9147e74ece3SIan Rogers    },
9157e74ece3SIan Rogers    {
9167e74ece3SIan Rogers        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
9177e74ece3SIan Rogers        "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_core_clks)",
9187e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryTLB",
9197e74ece3SIan Rogers        "MetricName": "tma_info_memory_tlb_page_walks_utilization",
9207e74ece3SIan Rogers        "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
9217e74ece3SIan Rogers    },
9227e74ece3SIan Rogers    {
9237e74ece3SIan Rogers        "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
9247e74ece3SIan Rogers        "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
9257e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryTLB",
9267e74ece3SIan Rogers        "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
9277e74ece3SIan Rogers    },
9287e74ece3SIan Rogers    {
9297e74ece3SIan Rogers        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
9307e74ece3SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
9317e74ece3SIan Rogers        "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
9327e74ece3SIan Rogers        "MetricName": "tma_info_pipeline_execute"
9337e74ece3SIan Rogers    },
9347e74ece3SIan Rogers    {
9357e74ece3SIan Rogers        "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
9367e74ece3SIan Rogers        "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
9377e74ece3SIan Rogers        "MetricGroup": "Pipeline;Ret",
9387e74ece3SIan Rogers        "MetricName": "tma_info_pipeline_retire"
9397e74ece3SIan Rogers    },
9407e74ece3SIan Rogers    {
9417e74ece3SIan Rogers        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
9427e74ece3SIan Rogers        "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
9437e74ece3SIan Rogers        "MetricGroup": "Power;Summary",
9447e74ece3SIan Rogers        "MetricName": "tma_info_system_average_frequency"
9457e74ece3SIan Rogers    },
9467e74ece3SIan Rogers    {
9477e74ece3SIan Rogers        "BriefDescription": "Average CPU Utilization",
9487e74ece3SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
9497e74ece3SIan Rogers        "MetricGroup": "HPC;Summary",
9507e74ece3SIan Rogers        "MetricName": "tma_info_system_cpu_utilization"
9517e74ece3SIan Rogers    },
9527e74ece3SIan Rogers    {
9537e74ece3SIan Rogers        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
9547e74ece3SIan Rogers        "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
9557e74ece3SIan Rogers        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
9567e74ece3SIan Rogers        "MetricName": "tma_info_system_dram_bw_use",
9577e74ece3SIan Rogers        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
9587e74ece3SIan Rogers    },
9597e74ece3SIan Rogers    {
9607e74ece3SIan Rogers        "BriefDescription": "Giga Floating Point Operations Per Second",
9617e74ece3SIan Rogers        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
9627e74ece3SIan Rogers        "MetricGroup": "Cor;Flops;HPC",
9637e74ece3SIan Rogers        "MetricName": "tma_info_system_gflops",
9647e74ece3SIan Rogers        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
9657e74ece3SIan Rogers    },
9667e74ece3SIan Rogers    {
9677e74ece3SIan Rogers        "BriefDescription": "Instructions per Far Branch (Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
9687e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
9697e74ece3SIan Rogers        "MetricGroup": "Branches;OS",
9707e74ece3SIan Rogers        "MetricName": "tma_info_system_ipfarbranch",
9717e74ece3SIan Rogers        "MetricThreshold": "tma_info_system_ipfarbranch < 1e6"
9727e74ece3SIan Rogers    },
9737e74ece3SIan Rogers    {
9747e74ece3SIan Rogers        "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
9757e74ece3SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
9767e74ece3SIan Rogers        "MetricGroup": "OS",
9777e74ece3SIan Rogers        "MetricName": "tma_info_system_kernel_cpi"
9787e74ece3SIan Rogers    },
9797e74ece3SIan Rogers    {
9807e74ece3SIan Rogers        "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
9817e74ece3SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
9827e74ece3SIan Rogers        "MetricGroup": "OS",
9837e74ece3SIan Rogers        "MetricName": "tma_info_system_kernel_utilization",
9847e74ece3SIan Rogers        "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
9857e74ece3SIan Rogers    },
9867e74ece3SIan Rogers    {
9877e74ece3SIan Rogers        "BriefDescription": "Average number of parallel data read requests to external memory",
9887e74ece3SIan Rogers        "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / cpu@UNC_ARB_DAT_OCCUPANCY.RD\\,cmask\\=1@",
9897e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryBW;SoC",
9907e74ece3SIan Rogers        "MetricName": "tma_info_system_mem_parallel_reads",
9917e74ece3SIan Rogers        "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches"
9927e74ece3SIan Rogers    },
9937e74ece3SIan Rogers    {
9947e74ece3SIan Rogers        "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)",
9957e74ece3SIan Rogers        "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD",
9967e74ece3SIan Rogers        "MetricGroup": "Mem;MemoryLat;SoC",
9977e74ece3SIan Rogers        "MetricName": "tma_info_system_mem_read_latency",
9987e74ece3SIan Rogers        "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)"
9997e74ece3SIan Rogers    },
10007e74ece3SIan Rogers    {
10017e74ece3SIan Rogers        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
10027e74ece3SIan Rogers        "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.ALL",
10037e74ece3SIan Rogers        "MetricGroup": "Mem;SoC",
10047e74ece3SIan Rogers        "MetricName": "tma_info_system_mem_request_latency"
10057e74ece3SIan Rogers    },
10067e74ece3SIan Rogers    {
10077e74ece3SIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0",
10087e74ece3SIan Rogers        "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks",
10097e74ece3SIan Rogers        "MetricGroup": "Power",
10107e74ece3SIan Rogers        "MetricName": "tma_info_system_power_license0_utilization",
10117e74ece3SIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0.  This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes."
10127e74ece3SIan Rogers    },
10137e74ece3SIan Rogers    {
10147e74ece3SIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1",
10157e74ece3SIan Rogers        "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks",
10167e74ece3SIan Rogers        "MetricGroup": "Power",
10177e74ece3SIan Rogers        "MetricName": "tma_info_system_power_license1_utilization",
10187e74ece3SIan Rogers        "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5",
10197e74ece3SIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1.  This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions."
10207e74ece3SIan Rogers    },
10217e74ece3SIan Rogers    {
10227e74ece3SIan Rogers        "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)",
10237e74ece3SIan Rogers        "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks",
10247e74ece3SIan Rogers        "MetricGroup": "Power",
10257e74ece3SIan Rogers        "MetricName": "tma_info_system_power_license2_utilization",
10267e74ece3SIan Rogers        "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5",
10277e74ece3SIan Rogers        "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX).  This includes high current AVX 512-bit instructions."
10287e74ece3SIan Rogers    },
10297e74ece3SIan Rogers    {
10307e74ece3SIan Rogers        "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
10317e74ece3SIan Rogers        "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)",
10327e74ece3SIan Rogers        "MetricGroup": "SMT",
10337e74ece3SIan Rogers        "MetricName": "tma_info_system_smt_2t_utilization"
10347e74ece3SIan Rogers    },
10357e74ece3SIan Rogers    {
10367e74ece3SIan Rogers        "BriefDescription": "Socket actual clocks when any core is active on that socket",
10377e74ece3SIan Rogers        "MetricExpr": "UNC_CLOCK.SOCKET",
10387e74ece3SIan Rogers        "MetricGroup": "SoC",
10397e74ece3SIan Rogers        "MetricName": "tma_info_system_socket_clks"
10407e74ece3SIan Rogers    },
10417e74ece3SIan Rogers    {
10427e74ece3SIan Rogers        "BriefDescription": "Average Frequency Utilization relative to nominal frequency",
10437e74ece3SIan Rogers        "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC",
10447e74ece3SIan Rogers        "MetricGroup": "Power",
10457e74ece3SIan Rogers        "MetricName": "tma_info_system_turbo_utilization"
10467e74ece3SIan Rogers    },
10477e74ece3SIan Rogers    {
10487e74ece3SIan Rogers        "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active",
10497e74ece3SIan Rogers        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
10507e74ece3SIan Rogers        "MetricGroup": "Pipeline",
10517e74ece3SIan Rogers        "MetricName": "tma_info_thread_clks"
10527e74ece3SIan Rogers    },
10537e74ece3SIan Rogers    {
10547e74ece3SIan Rogers        "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
10557e74ece3SIan Rogers        "MetricExpr": "1 / tma_info_thread_ipc",
10567e74ece3SIan Rogers        "MetricGroup": "Mem;Pipeline",
10577e74ece3SIan Rogers        "MetricName": "tma_info_thread_cpi"
10587e74ece3SIan Rogers    },
10597e74ece3SIan Rogers    {
10607e74ece3SIan Rogers        "BriefDescription": "The ratio of Executed- by Issued-Uops",
10617e74ece3SIan Rogers        "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY",
10627e74ece3SIan Rogers        "MetricGroup": "Cor;Pipeline",
10637e74ece3SIan Rogers        "MetricName": "tma_info_thread_execute_per_issue",
10647e74ece3SIan Rogers        "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggests high rate of \"execute\" at rename stage."
10657e74ece3SIan Rogers    },
10667e74ece3SIan Rogers    {
10677e74ece3SIan Rogers        "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
10687e74ece3SIan Rogers        "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks",
10697e74ece3SIan Rogers        "MetricGroup": "Ret;Summary",
10707e74ece3SIan Rogers        "MetricName": "tma_info_thread_ipc"
10717e74ece3SIan Rogers    },
10727e74ece3SIan Rogers    {
10737e74ece3SIan Rogers        "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
10747e74ece3SIan Rogers        "MetricExpr": "TOPDOWN.SLOTS",
10757e74ece3SIan Rogers        "MetricGroup": "TmaL1;tma_L1_group",
10767e74ece3SIan Rogers        "MetricName": "tma_info_thread_slots"
10777e74ece3SIan Rogers    },
10787e74ece3SIan Rogers    {
10797e74ece3SIan Rogers        "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor",
10807e74ece3SIan Rogers        "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)",
10817e74ece3SIan Rogers        "MetricGroup": "SMT;TmaL1;tma_L1_group",
10827e74ece3SIan Rogers        "MetricName": "tma_info_thread_slots_utilization"
10837e74ece3SIan Rogers    },
10847e74ece3SIan Rogers    {
10857e74ece3SIan Rogers        "BriefDescription": "Uops Per Instruction",
10867e74ece3SIan Rogers        "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY",
10877e74ece3SIan Rogers        "MetricGroup": "Pipeline;Ret;Retire",
10887e74ece3SIan Rogers        "MetricName": "tma_info_thread_uoppi",
10897e74ece3SIan Rogers        "MetricThreshold": "tma_info_thread_uoppi > 1.05"
10907e74ece3SIan Rogers    },
10917e74ece3SIan Rogers    {
10927e74ece3SIan Rogers        "BriefDescription": "Uops per taken branch",
10937e74ece3SIan Rogers        "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN",
10947e74ece3SIan Rogers        "MetricGroup": "Branches;Fed;FetchBW",
10957e74ece3SIan Rogers        "MetricName": "tma_info_thread_uptb",
10967e74ece3SIan Rogers        "MetricThreshold": "tma_info_thread_uptb < 7.5"
10977e74ece3SIan Rogers    },
10987e74ece3SIan Rogers    {
10997e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
11007e74ece3SIan Rogers        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
11017e74ece3SIan Rogers        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
11027e74ece3SIan Rogers        "MetricName": "tma_itlb_misses",
11037e74ece3SIan Rogers        "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
11047e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
11057e74ece3SIan Rogers        "ScaleUnit": "100%"
11067e74ece3SIan Rogers    },
11077e74ece3SIan Rogers    {
11087e74ece3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
11097e74ece3SIan Rogers        "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
11107e74ece3SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
11117e74ece3SIan Rogers        "MetricName": "tma_l1_bound",
11127e74ece3SIan Rogers        "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
11137e74ece3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
11147e74ece3SIan Rogers        "ScaleUnit": "100%"
11157e74ece3SIan Rogers    },
11167e74ece3SIan Rogers    {
11177e74ece3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
11187e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
11197e74ece3SIan Rogers        "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
11207e74ece3SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
11217e74ece3SIan Rogers        "MetricName": "tma_l2_bound",
11227e74ece3SIan Rogers        "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
11237e74ece3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
11247e74ece3SIan Rogers        "ScaleUnit": "100%"
11257e74ece3SIan Rogers    },
11267e74ece3SIan Rogers    {
11277e74ece3SIan Rogers        "BriefDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core",
1128*9a7d82c1SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
11297e74ece3SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
11307e74ece3SIan Rogers        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
11317e74ece3SIan Rogers        "MetricName": "tma_l3_bound",
11327e74ece3SIan Rogers        "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
11337e74ece3SIan Rogers        "PublicDescription": "This metric estimates how often the CPU was stalled due to load accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
11347e74ece3SIan Rogers        "ScaleUnit": "100%"
11357e74ece3SIan Rogers    },
11367e74ece3SIan Rogers    {
11377e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
11387e74ece3SIan Rogers        "MetricExpr": "9 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
11397e74ece3SIan Rogers        "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
11407e74ece3SIan Rogers        "MetricName": "tma_l3_hit_latency",
11417e74ece3SIan Rogers        "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
11427e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
11437e74ece3SIan Rogers        "ScaleUnit": "100%"
11447e74ece3SIan Rogers    },
11457e74ece3SIan Rogers    {
11467e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
11477e74ece3SIan Rogers        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
11487e74ece3SIan Rogers        "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
11497e74ece3SIan Rogers        "MetricName": "tma_lcp",
11507e74ece3SIan Rogers        "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
11517e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb",
11527e74ece3SIan Rogers        "ScaleUnit": "100%"
11537e74ece3SIan Rogers    },
11547e74ece3SIan Rogers    {
11557e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
11567e74ece3SIan Rogers        "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)",
11577e74ece3SIan Rogers        "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
11587e74ece3SIan Rogers        "MetricName": "tma_light_operations",
11597e74ece3SIan Rogers        "MetricThreshold": "tma_light_operations > 0.6",
11607e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
11617e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
11627e74ece3SIan Rogers        "ScaleUnit": "100%"
11637e74ece3SIan Rogers    },
11647e74ece3SIan Rogers    {
11657e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations",
11667e74ece3SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_core_clks)",
11677e74ece3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
11687e74ece3SIan Rogers        "MetricName": "tma_load_op_utilization",
11697e74ece3SIan Rogers        "MetricThreshold": "tma_load_op_utilization > 0.6",
11707e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations. Sample with: UOPS_DISPATCHED.PORT_2_3",
11717e74ece3SIan Rogers        "ScaleUnit": "100%"
11727e74ece3SIan Rogers    },
11737e74ece3SIan Rogers    {
11747e74ece3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)",
11757e74ece3SIan Rogers        "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss",
11767e74ece3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group",
11777e74ece3SIan Rogers        "MetricName": "tma_load_stlb_hit",
11787e74ece3SIan Rogers        "MetricThreshold": "tma_load_stlb_hit > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
11797e74ece3SIan Rogers        "ScaleUnit": "100%"
11807e74ece3SIan Rogers    },
11817e74ece3SIan Rogers    {
11827e74ece3SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk",
11837e74ece3SIan Rogers        "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks",
11847e74ece3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group",
11857e74ece3SIan Rogers        "MetricName": "tma_load_stlb_miss",
11867e74ece3SIan Rogers        "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
11877e74ece3SIan Rogers        "ScaleUnit": "100%"
11887e74ece3SIan Rogers    },
11897e74ece3SIan Rogers    {
11907e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
11917e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
11927e74ece3SIan Rogers        "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks",
11937e74ece3SIan Rogers        "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group",
11947e74ece3SIan Rogers        "MetricName": "tma_lock_latency",
11957e74ece3SIan Rogers        "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
11967e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS. Related metrics: tma_store_latency",
11977e74ece3SIan Rogers        "ScaleUnit": "100%"
11987e74ece3SIan Rogers    },
11997e74ece3SIan Rogers    {
12007e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit",
12017e74ece3SIan Rogers        "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2",
12027e74ece3SIan Rogers        "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
12037e74ece3SIan Rogers        "MetricName": "tma_lsd",
12047e74ece3SIan Rogers        "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
12057e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit.  LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.",
12067e74ece3SIan Rogers        "ScaleUnit": "100%"
12077e74ece3SIan Rogers    },
12087e74ece3SIan Rogers    {
12097e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
12107e74ece3SIan Rogers        "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)",
12117e74ece3SIan Rogers        "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
12127e74ece3SIan Rogers        "MetricName": "tma_machine_clears",
12137e74ece3SIan Rogers        "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
12147e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
12157e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
12167e74ece3SIan Rogers        "ScaleUnit": "100%"
12177e74ece3SIan Rogers    },
12187e74ece3SIan Rogers    {
12197e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
12207e74ece3SIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
12217e74ece3SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
12227e74ece3SIan Rogers        "MetricName": "tma_mem_bandwidth",
12237e74ece3SIan Rogers        "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
12247e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
12257e74ece3SIan Rogers        "ScaleUnit": "100%"
12267e74ece3SIan Rogers    },
12277e74ece3SIan Rogers    {
12287e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
12297e74ece3SIan Rogers        "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
12307e74ece3SIan Rogers        "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
12317e74ece3SIan Rogers        "MetricName": "tma_mem_latency",
12327e74ece3SIan Rogers        "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
12337e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
12347e74ece3SIan Rogers        "ScaleUnit": "100%"
12357e74ece3SIan Rogers    },
12367e74ece3SIan Rogers    {
12377e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
12387e74ece3SIan Rogers        "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * tma_backend_bound",
12397e74ece3SIan Rogers        "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
12407e74ece3SIan Rogers        "MetricName": "tma_memory_bound",
12417e74ece3SIan Rogers        "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
12427e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL2",
12437e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
12447e74ece3SIan Rogers        "ScaleUnit": "100%"
12457e74ece3SIan Rogers    },
12467e74ece3SIan Rogers    {
12477e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses",
12487e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
12497e74ece3SIan Rogers        "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY",
12507e74ece3SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
12517e74ece3SIan Rogers        "MetricName": "tma_memory_operations",
12527e74ece3SIan Rogers        "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6",
12537e74ece3SIan Rogers        "ScaleUnit": "100%"
12547e74ece3SIan Rogers    },
12557e74ece3SIan Rogers    {
12567e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
12577e74ece3SIan Rogers        "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
12587e74ece3SIan Rogers        "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
12597e74ece3SIan Rogers        "MetricName": "tma_microcode_sequencer",
12607e74ece3SIan Rogers        "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
12617e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
12627e74ece3SIan Rogers        "ScaleUnit": "100%"
12637e74ece3SIan Rogers    },
12647e74ece3SIan Rogers    {
12657e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
12667e74ece3SIan Rogers        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks",
12677e74ece3SIan Rogers        "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM",
12687e74ece3SIan Rogers        "MetricName": "tma_mispredicts_resteers",
12697e74ece3SIan Rogers        "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
12707e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions",
12717e74ece3SIan Rogers        "ScaleUnit": "100%"
12727e74ece3SIan Rogers    },
12737e74ece3SIan Rogers    {
12747e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
12757e74ece3SIan Rogers        "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2",
12767e74ece3SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
12777e74ece3SIan Rogers        "MetricName": "tma_mite",
12787e74ece3SIan Rogers        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
12797e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
12807e74ece3SIan Rogers        "ScaleUnit": "100%"
12817e74ece3SIan Rogers    },
12827e74ece3SIan Rogers    {
12837e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline",
12847e74ece3SIan Rogers        "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks",
12857e74ece3SIan Rogers        "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group",
12867e74ece3SIan Rogers        "MetricName": "tma_mite_4wide",
12877e74ece3SIan Rogers        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
12887e74ece3SIan Rogers        "ScaleUnit": "100%"
12897e74ece3SIan Rogers    },
12907e74ece3SIan Rogers    {
12917e74ece3SIan Rogers        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
12927e74ece3SIan Rogers        "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
12937e74ece3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
12947e74ece3SIan Rogers        "MetricName": "tma_mixing_vectors",
12957e74ece3SIan Rogers        "MetricThreshold": "tma_mixing_vectors > 0.05",
12967e74ece3SIan Rogers        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
12977e74ece3SIan Rogers        "ScaleUnit": "100%"
12987e74ece3SIan Rogers    },
12997e74ece3SIan Rogers    {
13007e74ece3SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
13017e74ece3SIan Rogers        "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks",
13027e74ece3SIan Rogers        "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
13037e74ece3SIan Rogers        "MetricName": "tma_ms_switches",
13047e74ece3SIan Rogers        "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
13057e74ece3SIan Rogers        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
13067e74ece3SIan Rogers        "ScaleUnit": "100%"
13077e74ece3SIan Rogers    },
13087e74ece3SIan Rogers    {
13097e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
13107e74ece3SIan Rogers        "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)",
13117e74ece3SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
13127e74ece3SIan Rogers        "MetricName": "tma_nop_instructions",
13137e74ece3SIan Rogers        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
13147e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
13157e74ece3SIan Rogers        "ScaleUnit": "100%"
13167e74ece3SIan Rogers    },
13177e74ece3SIan Rogers    {
13187e74ece3SIan Rogers        "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
13197e74ece3SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS",
13207e74ece3SIan Rogers        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))",
13217e74ece3SIan Rogers        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
13227e74ece3SIan Rogers        "MetricName": "tma_other_light_ops",
13237e74ece3SIan Rogers        "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
13247e74ece3SIan Rogers        "PublicDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting",
13257e74ece3SIan Rogers        "ScaleUnit": "100%"
13267e74ece3SIan Rogers    },
13277e74ece3SIan Rogers    {
13287e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
13297e74ece3SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks",
13307e74ece3SIan Rogers        "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
13317e74ece3SIan Rogers        "MetricName": "tma_port_0",
13327e74ece3SIan Rogers        "MetricThreshold": "tma_port_0 > 0.6",
13337e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
13347e74ece3SIan Rogers        "ScaleUnit": "100%"
13357e74ece3SIan Rogers    },
13367e74ece3SIan Rogers    {
13377e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)",
13387e74ece3SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks",
13397e74ece3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
13407e74ece3SIan Rogers        "MetricName": "tma_port_1",
13417e74ece3SIan Rogers        "MetricThreshold": "tma_port_1 > 0.6",
13427e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
13437e74ece3SIan Rogers        "ScaleUnit": "100%"
13447e74ece3SIan Rogers    },
13457e74ece3SIan Rogers    {
13467e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)",
13477e74ece3SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_core_clks",
13487e74ece3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
13497e74ece3SIan Rogers        "MetricName": "tma_port_5",
13507e74ece3SIan Rogers        "MetricThreshold": "tma_port_5 > 0.6",
13517e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
13527e74ece3SIan Rogers        "ScaleUnit": "100%"
13537e74ece3SIan Rogers    },
13547e74ece3SIan Rogers    {
13557e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
13567e74ece3SIan Rogers        "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks",
13577e74ece3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
13587e74ece3SIan Rogers        "MetricName": "tma_port_6",
13597e74ece3SIan Rogers        "MetricThreshold": "tma_port_6 > 0.6",
13607e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
13617e74ece3SIan Rogers        "ScaleUnit": "100%"
13627e74ece3SIan Rogers    },
13637e74ece3SIan Rogers    {
13647e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
13657e74ece3SIan Rogers        "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
13667e74ece3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
13677e74ece3SIan Rogers        "MetricName": "tma_ports_utilization",
13687e74ece3SIan Rogers        "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
13697e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related).  Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
13707e74ece3SIan Rogers        "ScaleUnit": "100%"
13717e74ece3SIan Rogers    },
13727e74ece3SIan Rogers    {
13737e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
13747e74ece3SIan Rogers        "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
13757e74ece3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
13767e74ece3SIan Rogers        "MetricName": "tma_ports_utilized_0",
13777e74ece3SIan Rogers        "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
13787e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
13797e74ece3SIan Rogers        "ScaleUnit": "100%"
13807e74ece3SIan Rogers    },
13817e74ece3SIan Rogers    {
13827e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
13837e74ece3SIan Rogers        "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks",
13847e74ece3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group",
13857e74ece3SIan Rogers        "MetricName": "tma_ports_utilized_1",
13867e74ece3SIan Rogers        "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
13877e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to an L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Sample with: EXE_ACTIVITY.1_PORTS_UTIL. Related metrics: tma_l1_bound",
13887e74ece3SIan Rogers        "ScaleUnit": "100%"
13897e74ece3SIan Rogers    },
13907e74ece3SIan Rogers    {
13917e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
13927e74ece3SIan Rogers        "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks",
13937e74ece3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
13947e74ece3SIan Rogers        "MetricName": "tma_ports_utilized_2",
13957e74ece3SIan Rogers        "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
13967e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
13977e74ece3SIan Rogers        "ScaleUnit": "100%"
13987e74ece3SIan Rogers    },
13997e74ece3SIan Rogers    {
14007e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
14017e74ece3SIan Rogers        "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks",
14027e74ece3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
14037e74ece3SIan Rogers        "MetricName": "tma_ports_utilized_3m",
14047e74ece3SIan Rogers        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
14057e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
14067e74ece3SIan Rogers        "ScaleUnit": "100%"
14077e74ece3SIan Rogers    },
14087e74ece3SIan Rogers    {
14097e74ece3SIan Rogers        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
14107e74ece3SIan Rogers        "DefaultMetricgroupName": "TopdownL1",
14117e74ece3SIan Rogers        "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots",
14127e74ece3SIan Rogers        "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
14137e74ece3SIan Rogers        "MetricName": "tma_retiring",
14147e74ece3SIan Rogers        "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
14157e74ece3SIan Rogers        "MetricgroupNoGroup": "TopdownL1;Default",
14167e74ece3SIan Rogers        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessarily mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
14177e74ece3SIan Rogers        "ScaleUnit": "100%"
14187e74ece3SIan Rogers    },
14197e74ece3SIan Rogers    {
14207e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
14217e74ece3SIan Rogers        "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks",
14227e74ece3SIan Rogers        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
14237e74ece3SIan Rogers        "MetricName": "tma_serializing_operation",
14247e74ece3SIan Rogers        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
14257e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
14267e74ece3SIan Rogers        "ScaleUnit": "100%"
14277e74ece3SIan Rogers    },
14287e74ece3SIan Rogers    {
14297e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
14307e74ece3SIan Rogers        "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks",
14317e74ece3SIan Rogers        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
14327e74ece3SIan Rogers        "MetricName": "tma_slow_pause",
14337e74ece3SIan Rogers        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
14347e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
14357e74ece3SIan Rogers        "ScaleUnit": "100%"
14367e74ece3SIan Rogers    },
14377e74ece3SIan Rogers    {
14387e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - loads that cross a 64-byte cache line boundary",
14397e74ece3SIan Rogers        "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks",
14407e74ece3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
14417e74ece3SIan Rogers        "MetricName": "tma_split_loads",
14427e74ece3SIan Rogers        "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
14437e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - loads that cross a 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS",
14447e74ece3SIan Rogers        "ScaleUnit": "100%"
14457e74ece3SIan Rogers    },
14467e74ece3SIan Rogers    {
14477e74ece3SIan Rogers        "BriefDescription": "This metric represents rate of split store accesses",
1448*9a7d82c1SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
14497e74ece3SIan Rogers        "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks",
14507e74ece3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
14517e74ece3SIan Rogers        "MetricName": "tma_split_stores",
14527e74ece3SIan Rogers        "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
14537e74ece3SIan Rogers        "PublicDescription": "This metric represents rate of split store accesses.  Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS. Related metrics: tma_port_4",
14547e74ece3SIan Rogers        "ScaleUnit": "100%"
14557e74ece3SIan Rogers    },
14567e74ece3SIan Rogers    {
14577e74ece3SIan Rogers        "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
14587e74ece3SIan Rogers        "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_thread_clks",
14597e74ece3SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
14607e74ece3SIan Rogers        "MetricName": "tma_sq_full",
14617e74ece3SIan Rogers        "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
14627e74ece3SIan Rogers        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
14637e74ece3SIan Rogers        "ScaleUnit": "100%"
14647e74ece3SIan Rogers    },
14657e74ece3SIan Rogers    {
14667e74ece3SIan Rogers        "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write",
14677e74ece3SIan Rogers        "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks",
14687e74ece3SIan Rogers        "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
14697e74ece3SIan Rogers        "MetricName": "tma_store_bound",
14707e74ece3SIan Rogers        "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
14717e74ece3SIan Rogers        "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO stores issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS",
14727e74ece3SIan Rogers        "ScaleUnit": "100%"
14737e74ece3SIan Rogers    },
14747e74ece3SIan Rogers    {
14757e74ece3SIan Rogers        "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
1476*9a7d82c1SIan Rogers        "MetricConstraint": "NO_GROUP_EVENTS_NMI",
14777e74ece3SIan Rogers        "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks",
14787e74ece3SIan Rogers        "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
14797e74ece3SIan Rogers        "MetricName": "tma_store_fwd_blk",
14807e74ece3SIan Rogers        "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
14817e74ece3SIan Rogers        "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
14827e74ece3SIan Rogers        "ScaleUnit": "100%"
14837e74ece3SIan Rogers    },
14847e74ece3SIan Rogers    {
14857e74ece3SIan Rogers        "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
14867e74ece3SIan Rogers        "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks",
14877e74ece3SIan Rogers        "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group",
14887e74ece3SIan Rogers        "MetricName": "tma_store_latency",
14897e74ece3SIan Rogers        "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
14907e74ece3SIan Rogers        "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually have less impact on out-of-order core performance; however; holding resources for a longer time can lead to undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full). Related metrics: tma_fb_full, tma_lock_latency",
14917e74ece3SIan Rogers        "ScaleUnit": "100%"
14927e74ece3SIan Rogers    },
14937e74ece3SIan Rogers    {
14947e74ece3SIan Rogers        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
14957e74ece3SIan Rogers        "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)",
14967e74ece3SIan Rogers        "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
14977e74ece3SIan Rogers        "MetricName": "tma_store_op_utilization",
14987e74ece3SIan Rogers        "MetricThreshold": "tma_store_op_utilization > 0.6",
14997e74ece3SIan Rogers        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations. Sample with: UOPS_DISPATCHED.PORT_7_8",
15007e74ece3SIan Rogers        "ScaleUnit": "100%"
15017e74ece3SIan Rogers    },
15027e74ece3SIan Rogers    {
15037e74ece3SIan Rogers        "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)",
15047e74ece3SIan Rogers        "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss",
15057e74ece3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group",
15067e74ece3SIan Rogers        "MetricName": "tma_store_stlb_hit",
15077e74ece3SIan Rogers        "MetricThreshold": "tma_store_stlb_hit > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
15087e74ece3SIan Rogers        "ScaleUnit": "100%"
15097e74ece3SIan Rogers    },
15107e74ece3SIan Rogers    {
15117e74ece3SIan Rogers        "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk",
15127e74ece3SIan Rogers        "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks",
15137e74ece3SIan Rogers        "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group",
15147e74ece3SIan Rogers        "MetricName": "tma_store_stlb_miss",
15157e74ece3SIan Rogers        "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
15167e74ece3SIan Rogers        "ScaleUnit": "100%"
15177e74ece3SIan Rogers    },
15187e74ece3SIan Rogers    {
15197e74ece3SIan Rogers        "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming stores optimize out a read request required by RFO stores",
15207e74ece3SIan Rogers        "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks",
15217e74ece3SIan Rogers        "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group",
15227e74ece3SIan Rogers        "MetricName": "tma_streaming_stores",
15237e74ece3SIan Rogers        "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
15247e74ece3SIan Rogers        "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming stores optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE. Related metrics: tma_fb_full",
15257e74ece3SIan Rogers        "ScaleUnit": "100%"
15267e74ece3SIan Rogers    },
15277e74ece3SIan Rogers    {
15287e74ece3SIan Rogers        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
15297e74ece3SIan Rogers        "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks",
15307e74ece3SIan Rogers        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
15317e74ece3SIan Rogers        "MetricName": "tma_unknown_branches",
15327e74ece3SIan Rogers        "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
15337e74ece3SIan Rogers        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
15347e74ece3SIan Rogers        "ScaleUnit": "100%"
15357e74ece3SIan Rogers    },
15367e74ece3SIan Rogers    {
15377e74ece3SIan Rogers        "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
15387e74ece3SIan Rogers        "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD",
15397e74ece3SIan Rogers        "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
15407e74ece3SIan Rogers        "MetricName": "tma_x87_use",
15417e74ece3SIan Rogers        "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
15427e74ece3SIan Rogers        "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
15437e74ece3SIan Rogers        "ScaleUnit": "100%"
15447e74ece3SIan Rogers    },
15457e74ece3SIan Rogers    {
15467e74ece3SIan Rogers        "BriefDescription": "Percentage of cycles in aborted transactions.",
15477e74ece3SIan Rogers        "MetricExpr": "(max(cycles\\-t - cycles\\-ct, 0) / cycles if has_event(cycles\\-t) else 0)",
15487e74ece3SIan Rogers        "MetricGroup": "transaction",
15497e74ece3SIan Rogers        "MetricName": "tsx_aborted_cycles",
15507e74ece3SIan Rogers        "ScaleUnit": "100%"
15517e74ece3SIan Rogers    },
15527e74ece3SIan Rogers    {
15537e74ece3SIan Rogers        "BriefDescription": "Number of cycles within a transaction divided by the number of elisions.",
15547e74ece3SIan Rogers        "MetricExpr": "(cycles\\-t / el\\-start if has_event(cycles\\-t) else 0)",
15557e74ece3SIan Rogers        "MetricGroup": "transaction",
15567e74ece3SIan Rogers        "MetricName": "tsx_cycles_per_elision",
15577e74ece3SIan Rogers        "ScaleUnit": "1cycles / elision"
15587e74ece3SIan Rogers    },
15597e74ece3SIan Rogers    {
15607e74ece3SIan Rogers        "BriefDescription": "Number of cycles within a transaction divided by the number of transactions.",
15617e74ece3SIan Rogers        "MetricExpr": "(cycles\\-t / tx\\-start if has_event(cycles\\-t) else 0)",
15627e74ece3SIan Rogers        "MetricGroup": "transaction",
15637e74ece3SIan Rogers        "MetricName": "tsx_cycles_per_transaction",
15647e74ece3SIan Rogers        "ScaleUnit": "1cycles / transaction"
15657e74ece3SIan Rogers    },
15667e74ece3SIan Rogers    {
15677e74ece3SIan Rogers        "BriefDescription": "Percentage of cycles within a transaction region.",
15687e74ece3SIan Rogers        "MetricExpr": "(cycles\\-t / cycles if has_event(cycles\\-t) else 0)",
15697e74ece3SIan Rogers        "MetricGroup": "transaction",
15707e74ece3SIan Rogers        "MetricName": "tsx_transactional_cycles",
15717e74ece3SIan Rogers        "ScaleUnit": "100%"
15727e74ece3SIan Rogers    }
15737e74ece3SIan Rogers]
1574