#!/usr/bin/env drgn
#
# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.

desc = """
This is a drgn script to monitor workqueues. For more info on drgn, visit
https://github.com/osandov/drgn.

  total    Total number of work items executed by the workqueue.

  infl     The number of currently in-flight work items.

  CPUtime  Total CPU time consumed by the workqueue in seconds. This is
           sampled from scheduler ticks and only provides ballpark
           measurement. "nohz_full=" CPUs are excluded from measurement.

  CPUitsv  The number of times a concurrency-managed work item hogged CPU
           longer than the threshold (workqueue.cpu_intensive_thresh_us)
           and got excluded from concurrency management to avoid stalling
           other work items.

  CMW/RPR  For per-cpu workqueues, the number of concurrency-management
           wake-ups while executing a work item of the workqueue. For
           unbound workqueues, the number of times a worker was repatriated
           to its affinity scope after being migrated to an off-scope CPU by
           the scheduler.

  mayday   The number of times the rescuer was requested while waiting for
           new worker creation.

  rescued  The number of work items executed by the rescuer.
"""

import sys
import signal
import os
import re
import time
import json

import drgn
from drgn.helpers.linux.list import list_for_each_entry,list_empty
from drgn.helpers.linux.cpumask import for_each_possible_cpu

import argparse
parser = argparse.ArgumentParser(description=desc,
                                 formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('workqueue', metavar='REGEX', nargs='*',
                    help='Target workqueue name patterns (all if empty)')
parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
                    help='Monitoring interval (0 to print once and exit)')
parser.add_argument('-j', '--json', action='store_true',
                    help='Output in json')
args = parser.parse_args()

def err(s):
    """Print the message `s` to stderr and terminate with exit status 1."""
    print(s, file=sys.stderr, flush=True)
    sys.exit(1)

# NOTE: `prog` is not defined in this file; it is expected to be supplied by
# the drgn launcher named in the shebang line.
workqueues              = prog['workqueues']

WQ_UNBOUND              = prog['WQ_UNBOUND']
WQ_MEM_RECLAIM          = prog['WQ_MEM_RECLAIM']

PWQ_STAT_STARTED        = prog['PWQ_STAT_STARTED']      # work items started execution
PWQ_STAT_COMPLETED      = prog['PWQ_STAT_COMPLETED']    # work items completed execution
PWQ_STAT_CPU_TIME       = prog['PWQ_STAT_CPU_TIME']     # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE  = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP      = prog['PWQ_STAT_CM_WAKEUP']    # concurrency-management worker wakeups
PWQ_STAT_REPATRIATED    = prog['PWQ_STAT_REPATRIATED']  # unbound workers brought back into scope
PWQ_STAT_MAYDAY         = prog['PWQ_STAT_MAYDAY']       # maydays to rescuer
PWQ_STAT_RESCUED        = prog['PWQ_STAT_RESCUED']      # linked work items executed by rescuer
PWQ_NR_STATS            = prog['PWQ_NR_STATS']

class WqStats:
    """Counters of one workqueue, summed over all of its pool_workqueues.

    A snapshot is taken at construction time; create a new instance to
    sample again.
    """

    def __init__(self, wq):
        """Read the name, flags and per-pwq stats of the workqueue `wq`."""
        self.name = wq.name.string_().decode()
        # Coerce to plain bools so the values can always be serialized as
        # JSON regardless of what the comparison on a drgn object yields.
        self.unbound = bool(wq.flags & WQ_UNBOUND != 0)
        self.mem_reclaim = bool(wq.flags & WQ_MEM_RECLAIM != 0)

        self.stats = [0] * PWQ_NR_STATS
        for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
            for i in range(PWQ_NR_STATS):
                self.stats[i] += int(pwq.stats[i])

    def dict(self, now):
        """Return the snapshot as a flat dict, tagged with timestamp `now`.

        All values are plain Python types (str, bool, int and whatever was
        passed in as `now`), so the result is directly JSON-serializable.
        """
        return { 'timestamp'            : now,
                 'name'                 : self.name,
                 'unbound'              : self.unbound,
                 'mem_reclaim'          : self.mem_reclaim,
                 'started'              : self.stats[PWQ_STAT_STARTED],
                 'completed'            : self.stats[PWQ_STAT_COMPLETED],
                 'cpu_time'             : self.stats[PWQ_STAT_CPU_TIME],
                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                 'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                 'repatriated'          : self.stats[PWQ_STAT_REPATRIATED],
                 'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                 'rescued'              : self.stats[PWQ_STAT_RESCUED], }

    @staticmethod
    def table_header_str():
        """Return the column header line matching table_row_str()."""
        return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
            f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}'

    def table_row_str(self):
        """Return one table line for this workqueue.

        Columns which do not apply to the workqueue type are shown as '-':
        CPUitsv is only filled in for per-cpu workqueues, mayday/rescued only
        for WQ_MEM_RECLAIM ones. CMW/RPR shows repatriations for unbound
        workqueues and concurrency-management wakeups otherwise.
        """
        cpu_intensive = '-'
        cmw_rpr = '-'
        mayday = '-'
        rescued = '-'

        if self.unbound:
            cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED])
        else:
            cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
            cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP])

        if self.mem_reclaim:
            mayday = str(self.stats[PWQ_STAT_MAYDAY])
            rescued = str(self.stats[PWQ_STAT_RESCUED])

        # Long names keep their tail. The in-flight count is clamped at zero
        # because the started/completed counters are read separately and may
        # be momentarily inconsistent. CPU time is scaled by 1e6 for display
        # (presumably microseconds -> seconds, per the help text).
        out = f'{self.name[-24:]:24} ' \
              f'{self.stats[PWQ_STAT_STARTED]:8} ' \
              f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
              f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
              f'{cpu_intensive:>7} ' \
              f'{cmw_rpr:>7} ' \
              f'{mayday:>7} ' \
              f'{rescued:>7} '
        # NOTE(review): the row always ends in a space, so stripping ':' is
        # effectively a no-op; kept to leave the table output unchanged.
        return out.rstrip(':')

# Set by the SIGINT handler; polled by the monitoring loop in main().
exit_req = False

def sigint_handler(signr, frame):
    """Request a clean exit of the monitoring loop.

    Only a flag is set, so a sample that is being printed is never cut
    short. Because the handler does not raise, an in-progress time.sleep()
    is resumed and the loop exits once the current interval has elapsed.
    """
    global exit_req
    exit_req = True

def main():
    """Print workqueue statistics once or periodically until interrupted."""
    # handle args
    table_fmt = not args.json
    interval = args.interval

    # Multiple patterns are OR-ed together; no pattern selects everything.
    re_str = '|'.join(args.workqueue)
    filter_re = re.compile(re_str) if re_str else None

    # monitoring loop
    signal.signal(signal.SIGINT, sigint_handler)

    while not exit_req:
        now = time.time()

        if table_fmt:
            print()
            print(WqStats.table_header_str())

        for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
            stats = WqStats(wq)
            if filter_re and not filter_re.search(stats.name):
                continue
            if table_fmt:
                print(stats.table_row_str())
            else:
                # One JSON object per line. Printing the dict directly would
                # emit a Python repr (single quotes, True/False), which JSON
                # parsers reject.
                print(json.dumps(stats.dict(now)))

        # Flush once per sample so that consumers reading from a pipe see
        # each interval as it is produced instead of when the buffer fills.
        sys.stdout.flush()

        if interval == 0:
            break
        time.sleep(interval)

if __name__ == "__main__":
    main()