xref: /openbmc/linux/include/linux/psi_types.h (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _LINUX_PSI_TYPES_H
3 #define _LINUX_PSI_TYPES_H
4 
5 #include <linux/kthread.h>
6 #include <linux/seqlock.h>
7 #include <linux/types.h>
8 #include <linux/kref.h>
9 #include <linux/wait.h>
10 
11 #ifdef CONFIG_PSI
12 
13 /* Tracked task states: one psi_group_cpu.tasks[] slot per value; the value is also the bit position of the matching TSK_* mask */
14 enum psi_task_count {
15 	NR_IOWAIT,
16 	NR_MEMSTALL,
17 	NR_RUNNING,
18 	/*
19 	 * For IO and CPU stalls the presence of running/oncpu tasks
20 	 * in the domain means a partial rather than a full stall.
21 	 * For memory it's not so simple because of page reclaimers:
22 	 * they are running/oncpu while representing a stall. To tell
23 	 * whether a domain has productivity left or not, we need to
24 	 * distinguish between regular running (i.e. productive)
25 	 * threads and memstall ones.
26 	 */
27 	NR_MEMSTALL_RUNNING,
28 	NR_PSI_TASK_COUNTS = 4,	/* number of counters above; TSK_ONCPU is defined as this bit */
29 };
30 
31 /* Task state bitmasks: one bit per enum psi_task_count value */
32 #define TSK_IOWAIT	(1 << NR_IOWAIT)
33 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
34 #define TSK_RUNNING	(1 << NR_RUNNING)
35 #define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
36 
37 /* Only one task can be scheduled, so there is no corresponding task count; this takes the first bit past the counted states */
38 #define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)
39 
40 /* Resources that workloads could be stalled on */
41 enum psi_res {
42 	PSI_IO,
43 	PSI_MEM,
44 	PSI_CPU,
45 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
46 	PSI_IRQ,
47 #endif
48 	NR_PSI_RESOURCES,	/* number of resources above; defined as 0 instead when CONFIG_PSI is off */
49 };
50 
51 /*
52  * Pressure states for each resource (IRQ only has a FULL state):
53  *
54  * SOME: Stalled tasks & working tasks
55  * FULL: Stalled tasks & no working tasks
56  */
57 enum psi_states {
58 	PSI_IO_SOME,
59 	PSI_IO_FULL,
60 	PSI_MEM_SOME,
61 	PSI_MEM_FULL,
62 	PSI_CPU_SOME,
63 	PSI_CPU_FULL,
64 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
65 	PSI_IRQ_FULL,
66 #endif
67 	/* Only per-CPU, to weigh the CPU in the global average: */
68 	PSI_NONIDLE,
69 	NR_PSI_STATES,	/* PSI_NONIDLE is last, so arrays sized NR_PSI_STATES - 1 leave it out */
70 };
71 
72 /* Use one bit in the state mask to track TSK_ONCPU: the first bit past the enum psi_states values */
73 #define PSI_ONCPU	(1 << NR_PSI_STATES)
74 
75 /* Flag whether to re-arm avgs_work, see details in get_recent_times(); defined as the bit right after PSI_ONCPU */
76 #define PSI_STATE_RESCHEDULE	(1 << (NR_PSI_STATES + 1))
77 
77 enum psi_aggregators {
78 	PSI_AVGS = 0,	/* aggregator for unprivileged triggers (see psi_trigger.aggregator) */
79 	PSI_POLL,	/* aggregator for RT polling triggers (see psi_trigger.aggregator) */
80 	NR_PSI_AGGREGATORS,	/* first dimension of psi_group_cpu.times_prev[][] and psi_group.total[][] */
81 };
83 
83 struct psi_group_cpu {
84 	/* 1st cacheline updated by the scheduler */
85 
86 	/* Aggregator needs to know of concurrent changes */
87 	seqcount_t seq ____cacheline_aligned_in_smp;
88 
89 	/* States of the tasks belonging to this group: one counter per enum psi_task_count value */
90 	unsigned int tasks[NR_PSI_TASK_COUNTS];
91 
92 	/* Aggregate pressure state derived from the tasks; PSI_ONCPU is tracked as an extra bit in this mask */
93 	u32 state_mask;
94 
95 	/* Period time sampling buckets for each state of interest (ns), one per enum psi_states value */
96 	u32 times[NR_PSI_STATES];
97 
98 	/* Time of last task change in this group (rq_clock) */
99 	u64 state_start;
100 
101 	/* 2nd cacheline updated by the aggregator */
102 
103 	/* Delta detection against the sampling buckets, kept separately per aggregator (enum psi_aggregators) */
104 	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
105 			____cacheline_aligned_in_smp;
106 };
108 
108 /* PSI growth tracking window; embedded in struct psi_trigger as 'win' */
109 struct psi_window {
110 	/* Window size in ns */
111 	u64 size;
112 
113 	/* Start time of the current window in ns */
114 	u64 start_time;
115 
116 	/* Value at the start of the window */
117 	u64 start_value;
118 
119 	/* Value growth in the previous window */
120 	u64 prev_growth;
121 };
123 
123 struct psi_trigger {
124 	/* PSI state being monitored by the trigger */
125 	enum psi_states state;
126 
127 	/* User-specified threshold in ns */
128 	u64 threshold;
129 
130 	/* List node inside triggers list */
131 	struct list_head node;
132 
133 	/* Backpointer needed during trigger destruction */
134 	struct psi_group *group;
135 
136 	/* Wait queue for polling */
137 	wait_queue_head_t event_wait;
138 
139 	/* Kernfs file for cgroup triggers */
140 	struct kernfs_open_file *of;
141 
142 	/* Pending event flag */
143 	int event;
144 
145 	/* Tracking window */
146 	struct psi_window win;
147 
148 	/*
149 	 * Time last event was generated. Used for rate-limiting
150 	 * events to one per window
151 	 */
152 	u64 last_event_time;
153 
154 	/* Deferred event(s) from previous ratelimit window */
155 	bool pending_event;
156 
157 	/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT (see enum psi_aggregators) */
158 	enum psi_aggregators aggregator;
159 };
161 
161 struct psi_group {
162 	struct psi_group *parent;
163 	bool enabled;
164 
165 	/* Protects data used by the aggregator */
166 	struct mutex avgs_lock;
167 
168 	/* Per-cpu task state & time tracking */
169 	struct psi_group_cpu __percpu *pcpu;
170 
171 	/* Running pressure averages; arrays sized NR_PSI_STATES - 1 here leave out the last state, PSI_NONIDLE */
172 	u64 avg_total[NR_PSI_STATES - 1];
173 	u64 avg_last_update;
174 	u64 avg_next_update;
175 
176 	/* Aggregator work control */
177 	struct delayed_work avgs_work;
178 
179 	/* Unprivileged triggers against N*PSI_FREQ windows */
180 	struct list_head avg_triggers;
181 	u32 avg_nr_triggers[NR_PSI_STATES - 1];
182 
183 	/* Total stall times (kept per aggregator) and sampled pressure averages (3 per state; NOTE(review): presumably three averaging windows - confirm) */
184 	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
185 	unsigned long avg[NR_PSI_STATES - 1][3];
186 
187 	/* Monitor RT polling work control */
188 	struct task_struct __rcu *rtpoll_task;
189 	struct timer_list rtpoll_timer;
190 	wait_queue_head_t rtpoll_wait;
191 	atomic_t rtpoll_wakeup;
192 	atomic_t rtpoll_scheduled;
193 
194 	/* Protects data used by the monitor */
195 	struct mutex rtpoll_trigger_lock;
196 
197 	/* Configured RT polling triggers */
198 	struct list_head rtpoll_triggers;
199 	u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
200 	u32 rtpoll_states;
201 	u64 rtpoll_min_period;
202 
203 	/* Total stall times at the start of RT polling monitor activation */
204 	u64 rtpoll_total[NR_PSI_STATES - 1];
205 	u64 rtpoll_next_update;
206 	u64 rtpoll_until;
207 };
209 
210 #else /* CONFIG_PSI */
211 
212 #define NR_PSI_RESOURCES	0	/* replaces the enum psi_res count when CONFIG_PSI is off */
213 
214 struct psi_group { };	/* empty stand-in for the type when CONFIG_PSI is off */
215 
216 #endif /* CONFIG_PSI */
217 
218 #endif /* _LINUX_PSI_TYPES_H */
219