// SPDX-License-Identifier: GPL-2.0
/*
 * Pressure stall information for CPU, memory and IO
 *
 * Copyright (c) 2018 Facebook, Inc.
 * Author: Johannes Weiner <hannes@cmpxchg.org>
 *
 * Polling support by Suren Baghdasaryan <surenb@google.com>
 * Copyright (c) 2018 Google, Inc.
 *
 * When CPU, memory and IO are contended, tasks experience delays that
 * reduce throughput and introduce latencies into the workload. Memory
 * and IO contention, in addition, can cause a full loss of forward
 * progress in which the CPU goes idle.
 *
 * This code aggregates individual task delays into resource pressure
 * metrics that indicate problems with both workload health and
 * resource utilization.
 *
 * Model
 *
 * The time in which a task can execute on a CPU is our baseline for
 * productivity. Pressure expresses the amount of time in which this
 * potential cannot be realized due to resource contention.
 *
 * This concept of productivity has two components: the workload and
 * the CPU. To measure the impact of pressure on both, we define two
 * contention states for a resource: SOME and FULL.
 *
 * In the SOME state of a given resource, one or more tasks are
 * delayed on that resource. This affects the workload's ability to
 * perform work, but the CPU may still be executing other tasks.
 *
 * In the FULL state of a given resource, all non-idle tasks are
 * delayed on that resource such that nobody is advancing and the CPU
 * goes idle. This leaves both workload and CPU unproductive.
 *
 *      SOME = nr_delayed_tasks != 0
 *      FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
 *
 * What it means for a task to be productive is defined differently
 * for each resource. For IO, productive means a running task. For
 * memory, productive means a running task that isn't a reclaimer. For
 * CPU, productive means an oncpu task.
 *
 * Naturally, the FULL state doesn't exist for the CPU resource at the
 * system level, but it does exist at the cgroup level. At the cgroup
 * level, FULL means all non-idle tasks in the cgroup are delayed on
 * the CPU resource which is being used by others outside of the
 * cgroup or throttled by the cgroup cpu.max configuration.
 *
 * The percentage of wallclock time spent in those compound stall
 * states gives pressure numbers between 0 and 100 for each resource,
 * where the SOME percentage indicates workload slowdowns and the FULL
 * percentage indicates reduced CPU utilization:
 *
 *      %SOME = time(SOME) / period
 *      %FULL = time(FULL) / period
 *
 * Multiple CPUs
 *
 * The more tasks and available CPUs there are, the more work can be
 * performed concurrently. This means that the potential that can go
 * unrealized due to resource contention *also* scales with non-idle
 * tasks and CPUs.
 *
 * Consider a scenario where 257 number crunching tasks are trying to
 * run concurrently on 256 CPUs. If we simply aggregated the task
 * states, we would have to conclude a CPU SOME pressure number of
 * 100%, since *somebody* is waiting on a runqueue at all
 * times. However, that is clearly not the amount of contention the
 * workload is experiencing: only one out of 256 possible execution
 * threads will be contended at any given time, or about 0.4%.
 *
 * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
 * given time *one* of the tasks is delayed due to a lack of memory.
 * Again, looking purely at the task state would yield a memory FULL
 * pressure number of 0%, since *somebody* is always making forward
 * progress. But again this wouldn't capture the amount of execution
 * potential lost, which is 1 out of 4 CPUs, or 25%.
 *
 * To calculate wasted potential (pressure) with multiple processors,
 * we have to base our calculation on the number of non-idle tasks in
 * conjunction with the number of available CPUs, which is the number
 * of potential execution threads. SOME becomes then the proportion of
 * delayed tasks to possible threads, and FULL is the share of possible
 * threads that are unproductive due to delays:
 *
 *      threads = min(nr_nonidle_tasks, nr_cpus)
 *         SOME = min(nr_delayed_tasks / threads, 1)
 *         FULL = (threads - min(nr_productive_tasks, threads)) / threads
 *
 * For the 257 number crunchers on 256 CPUs, this yields:
 *
 *      threads = min(257, 256)
 *         SOME = min(1 / 256, 1)             =  0.4%
 *         FULL = (256 - min(256, 256)) / 256 =  0%
 *
 * For the 1 out of 4 memory-delayed tasks, this yields:
 *
 *      threads = min(4, 4)
 *         SOME = min(1 / 4, 1)               = 25%
 *         FULL = (4 - min(3, 4)) / 4         = 25%
 *
 * [ Substitute nr_cpus with 1, and you can see that it's a natural
 *   extension of the single-CPU model. ]
 *
 * Implementation
 *
 * To assess the precise time spent in each such state, we would have
 * to freeze the system on task changes and start/stop the state
 * clocks accordingly. Obviously that doesn't scale in practice.
 *
 * Because the scheduler aims to distribute the compute load evenly
 * among the available CPUs, we can track task state locally to each
 * CPU and, at much lower frequency, extrapolate the global state for
 * the cumulative stall times and the running averages.
 *
 * For each runqueue, we track:
 *
 *         tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
 *         tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
 *      tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
 *
 * and then periodically aggregate:
 *
 *      tNONIDLE = sum(tNONIDLE[i])
 *
 *         tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
 *         tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
 *
 *         %SOME = tSOME / period
 *         %FULL = tFULL / period
 *
 * This gives us an approximation of pressure that is practical
 * cost-wise, yet way more sensitive and accurate than periodic
 * sampling of the aggregate task states would be.
 */

static int psi_bug __read_mostly;

DEFINE_STATIC_KEY_FALSE(psi_disabled);
static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);

#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
#else
static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
        return kstrtobool(str, &psi_enable) == 0;
}
__setup("psi=", setup_psi);

/* Running averages - we need to be higher-res than loadavg */
#define PSI_FREQ        (2*HZ+1)        /* 2 sec intervals */
#define EXP_10s         1677            /* 1/exp(2s/10s) as fixed-point */
#define EXP_60s         1981            /* 1/exp(2s/60s) */
#define EXP_300s        2034            /* 1/exp(2s/300s) */
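
/*
 * The EXP_* values above are 1/exp(sampling period / averaging window)
 * in the calc_load() fixed-point format (FIXED_1 == 2048): for example,
 * 2048 / exp(2s/10s) ~= 1677 for the 10s series.
 */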
161eb414681SJohannes Weiner
1620e94682bSSuren Baghdasaryan /* PSI trigger definitions */
1630e94682bSSuren Baghdasaryan #define WINDOW_MAX_US 10000000 /* Max window size is 10s */
1640e94682bSSuren Baghdasaryan #define UPDATES_PER_WINDOW 10 /* 10 updates per window */
1650e94682bSSuren Baghdasaryan
166eb414681SJohannes Weiner /* Sampling frequency in nanoseconds */
167eb414681SJohannes Weiner static u64 psi_period __read_mostly;
168eb414681SJohannes Weiner
169eb414681SJohannes Weiner /* System-level pressure and stall tracking */
170eb414681SJohannes Weiner static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
171df5ba5beSDan Schatzberg struct psi_group psi_system = {
172eb414681SJohannes Weiner .pcpu = &system_group_pcpu,
173eb414681SJohannes Weiner };
174eb414681SJohannes Weiner
175bcc78db6SSuren Baghdasaryan static void psi_avgs_work(struct work_struct *work);
176eb414681SJohannes Weiner
1778f91efd8SZhaoyang Huang static void poll_timer_fn(struct timer_list *t);
1788f91efd8SZhaoyang Huang
group_init(struct psi_group * group)179eb414681SJohannes Weiner static void group_init(struct psi_group *group)
180eb414681SJohannes Weiner {
181eb414681SJohannes Weiner int cpu;
182eb414681SJohannes Weiner
18334f26a15SChengming Zhou group->enabled = true;
184eb414681SJohannes Weiner for_each_possible_cpu(cpu)
185eb414681SJohannes Weiner seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
1863dfbe25cSJohannes Weiner group->avg_last_update = sched_clock();
1873dfbe25cSJohannes Weiner group->avg_next_update = group->avg_last_update + psi_period;
188bcc78db6SSuren Baghdasaryan mutex_init(&group->avgs_lock);
189d82caa27SDomenico Cerasuolo
190d82caa27SDomenico Cerasuolo /* Init avg trigger-related members */
191d82caa27SDomenico Cerasuolo INIT_LIST_HEAD(&group->avg_triggers);
192d82caa27SDomenico Cerasuolo memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
193d82caa27SDomenico Cerasuolo INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
194d82caa27SDomenico Cerasuolo
195d82caa27SDomenico Cerasuolo /* Init rtpoll trigger-related members */
19665457b74SDomenico Cerasuolo atomic_set(&group->rtpoll_scheduled, 0);
19765457b74SDomenico Cerasuolo mutex_init(&group->rtpoll_trigger_lock);
19865457b74SDomenico Cerasuolo INIT_LIST_HEAD(&group->rtpoll_triggers);
19965457b74SDomenico Cerasuolo group->rtpoll_min_period = U32_MAX;
20065457b74SDomenico Cerasuolo group->rtpoll_next_update = ULLONG_MAX;
20165457b74SDomenico Cerasuolo init_waitqueue_head(&group->rtpoll_wait);
20265457b74SDomenico Cerasuolo timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
20365457b74SDomenico Cerasuolo rcu_assign_pointer(group->rtpoll_task, NULL);
204eb414681SJohannes Weiner }
205eb414681SJohannes Weiner
psi_init(void)206eb414681SJohannes Weiner void __init psi_init(void)
207eb414681SJohannes Weiner {
208e0c27447SJohannes Weiner if (!psi_enable) {
209e0c27447SJohannes Weiner static_branch_enable(&psi_disabled);
210e2ad8ab0SChengming Zhou static_branch_disable(&psi_cgroups_enabled);
211eb414681SJohannes Weiner return;
212e0c27447SJohannes Weiner }
213eb414681SJohannes Weiner
2143958e2d0SSuren Baghdasaryan if (!cgroup_psi_enabled())
2153958e2d0SSuren Baghdasaryan static_branch_disable(&psi_cgroups_enabled);
2163958e2d0SSuren Baghdasaryan
217eb414681SJohannes Weiner psi_period = jiffies_to_nsecs(PSI_FREQ);
218eb414681SJohannes Weiner group_init(&psi_system);
219eb414681SJohannes Weiner }
220eb414681SJohannes Weiner
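/*
 * Evaluate one resource state against the current task counts.
 * @tasks is one CPU's groupc->tasks[] array, and @oncpu says whether a
 * task of this group is currently running on that CPU, which is what
 * distinguishes CPU SOME from CPU FULL.
 */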
static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
        switch (state) {
        case PSI_IO_SOME:
                return unlikely(tasks[NR_IOWAIT]);
        case PSI_IO_FULL:
                return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
        case PSI_MEM_SOME:
                return unlikely(tasks[NR_MEMSTALL]);
        case PSI_MEM_FULL:
                return unlikely(tasks[NR_MEMSTALL] &&
                        tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
        case PSI_CPU_SOME:
                return unlikely(tasks[NR_RUNNING] > oncpu);
        case PSI_CPU_FULL:
                return unlikely(tasks[NR_RUNNING] && !oncpu);
        case PSI_NONIDLE:
                return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                        tasks[NR_RUNNING];
        default:
                return false;
        }
}

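/*
 * Snapshot one CPU's state under the seqcount and turn the cumulative
 * per-state times into deltas since this aggregator's previous pass.
 * Currently active states are folded in so that long-lasting states
 * keep showing up in every sample. The states that changed, plus a
 * PSI_STATE_RESCHEDULE hint for avgs_work, are reported through
 * @pchanged_states.
 */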
static void get_recent_times(struct psi_group *group, int cpu,
                             enum psi_aggregators aggregator, u32 *times,
                             u32 *pchanged_states)
{
        struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
        int current_cpu = raw_smp_processor_id();
        unsigned int tasks[NR_PSI_TASK_COUNTS];
        u64 now, state_start;
        enum psi_states s;
        unsigned int seq;
        u32 state_mask;

        *pchanged_states = 0;

        /* Snapshot a coherent view of the CPU state */
        do {
                seq = read_seqcount_begin(&groupc->seq);
                now = cpu_clock(cpu);
                memcpy(times, groupc->times, sizeof(groupc->times));
                state_mask = groupc->state_mask;
                state_start = groupc->state_start;
                if (cpu == current_cpu)
                        memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
        } while (read_seqcount_retry(&groupc->seq, seq));

        /* Calculate state time deltas against the previous snapshot */
        for (s = 0; s < NR_PSI_STATES; s++) {
                u32 delta;
                /*
                 * In addition to already concluded states, we also
                 * incorporate currently active states on the CPU,
                 * since states may last for many sampling periods.
                 *
                 * This way we keep our delta sampling buckets small
                 * (u32) and our reported pressure close to what's
                 * actually happening.
                 */
                if (state_mask & (1 << s))
                        times[s] += now - state_start;

                delta = times[s] - groupc->times_prev[aggregator][s];
                groupc->times_prev[aggregator][s] = times[s];

                times[s] = delta;
                if (delta)
                        *pchanged_states |= (1 << s);
        }

        /*
         * When collect_percpu_times() is called from avgs_work, we don't
         * want to re-arm avgs_work when all CPUs are IDLE. But the current
         * CPU running this avgs_work is never IDLE, so judging by its
         * PSI_NONIDLE delta alone, avgs_work could never be shut off.
         * So for the current CPU, we need to re-arm avgs_work only when
         * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0); for other
         * CPUs we can just check the PSI_NONIDLE delta.
         */
        if (current_work() == &group->avgs_work.work) {
                bool reschedule;

                if (cpu == current_cpu)
                        reschedule = tasks[NR_RUNNING] +
                                     tasks[NR_IOWAIT] +
                                     tasks[NR_MEMSTALL] > 1;
                else
                        reschedule = *pchanged_states & (1 << PSI_NONIDLE);

                if (reschedule)
                        *pchanged_states |= PSI_STATE_RESCHEDULE;
        }
}

static void calc_avgs(unsigned long avg[3], int missed_periods,
                      u64 time, u64 period)
{
        unsigned long pct;

        /* Fill in zeroes for periods of no activity */
        if (missed_periods) {
                avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
                avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
                avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
        }

        /* Sample the most recent active period */
        pct = div_u64(time * 100, period);
        pct *= FIXED_1;
        avg[0] = calc_load(avg[0], EXP_10s, pct);
        avg[1] = calc_load(avg[1], EXP_60s, pct);
        avg[2] = calc_load(avg[2], EXP_300s, pct);
}

static void collect_percpu_times(struct psi_group *group,
                                 enum psi_aggregators aggregator,
                                 u32 *pchanged_states)
{
        u64 deltas[NR_PSI_STATES - 1] = { 0, };
        unsigned long nonidle_total = 0;
        u32 changed_states = 0;
        int cpu;
        int s;

        /*
         * Collect the per-cpu time buckets and average them into a
         * single time sample that is normalized to wallclock time.
         *
         * For averaging, each CPU is weighted by its non-idle time in
         * the sampling period. This eliminates artifacts from uneven
         * loading, or even entirely idle CPUs.
         */
        for_each_possible_cpu(cpu) {
                u32 times[NR_PSI_STATES];
                u32 nonidle;
                u32 cpu_changed_states;

                get_recent_times(group, cpu, aggregator, times,
                                 &cpu_changed_states);
                changed_states |= cpu_changed_states;

                nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
                nonidle_total += nonidle;

                for (s = 0; s < PSI_NONIDLE; s++)
                        deltas[s] += (u64)times[s] * nonidle;
        }

        /*
         * Integrate the sample into the running statistics that are
         * reported to userspace: the cumulative stall times and the
         * decaying averages.
         *
         * Pressure percentages are sampled at PSI_FREQ. We might be
         * called more often when the user polls more frequently than
         * that; we might be called less often when there is no task
         * activity, thus no data, and clock ticks are sporadic. The
         * below handles both.
         */

        /* total= */
        for (s = 0; s < NR_PSI_STATES - 1; s++)
                group->total[aggregator][s] +=
                        div_u64(deltas[s], max(nonidle_total, 1UL));

        if (pchanged_states)
                *pchanged_states = changed_states;
}

/* Trigger tracking window manipulations */
static void window_reset(struct psi_window *win, u64 now, u64 value,
                         u64 prev_growth)
{
        win->start_time = now;
        win->start_value = value;
        win->prev_growth = prev_growth;
}

/*
 * PSI growth tracking window update and growth calculation routine.
 *
 * This approximates a sliding tracking window by interpolating
 * partially elapsed windows using historical growth data from the
 * previous intervals. This minimizes memory requirements (by not storing
 * all the intermediate values in the previous window) and simplifies
 * the calculations. It works well because the PSI signal changes only
 * in the positive direction and, over relatively small window sizes,
 * the growth is close to linear.
 */
static u64 window_update(struct psi_window *win, u64 now, u64 value)
{
        u64 elapsed;
        u64 growth;

        elapsed = now - win->start_time;
        growth = value - win->start_value;
        /*
         * After each tracking window passes win->start_value and
         * win->start_time get reset and win->prev_growth stores
         * the average per-window growth of the previous window.
         * win->prev_growth is then used to interpolate additional
         * growth from the previous window assuming it was linear.
         */
        if (elapsed > win->size)
                window_reset(win, now, value, growth);
        else {
                u32 remaining;

                remaining = win->size - elapsed;
                growth += div64_u64(win->prev_growth * remaining, win->size);
        }

        return growth;
}

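/*
 * Walk the triggers of the given aggregator (PSI_AVGS or PSI_POLL),
 * compute each trigger's growth over its tracking window and generate
 * an event when its threshold is breached, rate-limited to one event
 * per window. *update_total is set when new stall activity was seen,
 * telling the caller to refresh its snapshot of the totals. Returns
 * now + rtpoll_min_period, which only the rtpoll worker uses.
 */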
static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
                           enum psi_aggregators aggregator)
{
        struct psi_trigger *t;
        u64 *total = group->total[aggregator];
        struct list_head *triggers;
        u64 *aggregator_total;
        *update_total = false;

        if (aggregator == PSI_AVGS) {
                triggers = &group->avg_triggers;
                aggregator_total = group->avg_total;
        } else {
                triggers = &group->rtpoll_triggers;
                aggregator_total = group->rtpoll_total;
        }

        /*
         * On subsequent updates, calculate growth deltas and let
         * watchers know when their specified thresholds are exceeded.
         */
        list_for_each_entry(t, triggers, node) {
                u64 growth;
                bool new_stall;

                new_stall = aggregator_total[t->state] != total[t->state];

                /* Check for stall activity or a previous threshold breach */
                if (!new_stall && !t->pending_event)
                        continue;
                /*
                 * Check for new stall activity, as well as deferred
                 * events that occurred in the last window after the
                 * trigger had already fired (we want to ratelimit
                 * events without dropping any).
                 */
                if (new_stall) {
                        /*
                         * Multiple triggers might be looking at the same state,
                         * remember to update the aggregator's totals once we've
                         * been through all of them. Also remember to extend the
                         * polling time if we see new stall activity.
                         */
                        *update_total = true;

                        /* Calculate growth since last update */
                        growth = window_update(&t->win, now, total[t->state]);
                        if (!t->pending_event) {
                                if (growth < t->threshold)
                                        continue;

                                t->pending_event = true;
                        }
                }
                /* Limit event signaling to once per window */
                if (now < t->last_event_time + t->win.size)
                        continue;

                /* Generate an event */
                if (cmpxchg(&t->event, 0, 1) == 0) {
                        if (t->of)
                                kernfs_notify(t->of->kn);
                        else
                                wake_up_interruptible(&t->event_wait);
                }
                t->last_event_time = now;
                /* Reset threshold breach flag once event got generated */
                t->pending_event = false;
        }

        return now + group->rtpoll_min_period;
}

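/*
 * Fold the latest per-cpu sample into the running averages and return
 * the time the next averaging period is due. Clock drift is avoided by
 * scheduling in fixed psi_period intervals while sampling over the
 * actually elapsed time.
 */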
static u64 update_averages(struct psi_group *group, u64 now)
{
        unsigned long missed_periods = 0;
        u64 expires, period;
        u64 avg_next_update;
        int s;

        /* avgX= */
        expires = group->avg_next_update;
        if (now - expires >= psi_period)
                missed_periods = div_u64(now - expires, psi_period);

        /*
         * The periodic clock tick can get delayed for various
         * reasons, especially on loaded systems. To avoid clock
         * drift, we schedule the clock in fixed psi_period intervals.
         * But the deltas we sample out of the per-cpu buckets above
         * are based on the actual time elapsing between clock ticks.
         */
        avg_next_update = expires + ((1 + missed_periods) * psi_period);
        period = now - (group->avg_last_update + (missed_periods * psi_period));
        group->avg_last_update = now;

        for (s = 0; s < NR_PSI_STATES - 1; s++) {
                u32 sample;

                sample = group->total[PSI_AVGS][s] - group->avg_total[s];
                /*
                 * Due to the lockless sampling of the time buckets,
                 * recorded time deltas can slip into the next period,
                 * which under full pressure can result in samples in
                 * excess of the period length.
                 *
                 * We don't want to report non-sensical pressures in
                 * excess of 100%, nor do we want to drop such events
                 * on the floor. Instead we punt any overage into the
                 * future until pressure subsides. By doing this we
                 * don't underreport the occurring pressure curve, we
                 * just report it delayed by one period length.
                 *
                 * The error isn't cumulative. As soon as another
                 * delta slips from a period P to P+1, by definition
                 * it frees up its time T in P.
                 */
                if (sample > period)
                        sample = period;
                group->avg_total[s] += sample;
                calc_avgs(group->avg[s], missed_periods, sample, period);
        }

        return avg_next_update;
}

static void psi_avgs_work(struct work_struct *work)
{
        struct delayed_work *dwork;
        struct psi_group *group;
        u32 changed_states;
        bool update_total;
        u64 now;

        dwork = to_delayed_work(work);
        group = container_of(dwork, struct psi_group, avgs_work);

        mutex_lock(&group->avgs_lock);

        now = sched_clock();

        collect_percpu_times(group, PSI_AVGS, &changed_states);
        /*
         * If there is task activity, periodically fold the per-cpu
         * times and feed samples into the running averages. If things
         * are idle and there is no data to process, stop the clock.
         * Once restarted, we'll catch up the running averages in one
         * go - see calc_avgs() and missed_periods.
         */
        if (now >= group->avg_next_update) {
                update_triggers(group, now, &update_total, PSI_AVGS);
                group->avg_next_update = update_averages(group, now);
        }

        if (changed_states & PSI_STATE_RESCHEDULE) {
                schedule_delayed_work(dwork, nsecs_to_jiffies(
                                group->avg_next_update - now) + 1);
        }

        mutex_unlock(&group->avgs_lock);
}

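/*
 * Reset the tracking windows of all rtpoll triggers and snapshot the
 * current totals when (re)entering the polling window.
 */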
static void init_rtpoll_triggers(struct psi_group *group, u64 now)
{
        struct psi_trigger *t;

        list_for_each_entry(t, &group->rtpoll_triggers, node)
                window_reset(&t->win, now,
                                group->total[PSI_POLL][t->state], 0);
        memcpy(group->rtpoll_total, group->total[PSI_POLL],
                   sizeof(group->rtpoll_total));
        group->rtpoll_next_update = now + group->rtpoll_min_period;
}

/* Schedule polling if it's not already scheduled or forced. */
static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
                                     bool force)
{
        struct task_struct *task;

        /*
         * atomic_xchg should be called even when !force to provide a
         * full memory barrier (see the comment inside psi_rtpoll_work).
         */
        if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force)
                return;

        rcu_read_lock();

        task = rcu_dereference(group->rtpoll_task);
        /*
         * kworker might be NULL in case psi_trigger_destroy races with
         * psi_task_change (hotpath) which can't use locks
         */
        if (likely(task))
                mod_timer(&group->rtpoll_timer, jiffies + delay);
        else
                atomic_set(&group->rtpoll_scheduled, 0);

        rcu_read_unlock();
}

static void psi_rtpoll_work(struct psi_group *group)
{
        bool force_reschedule = false;
        u32 changed_states;
        bool update_total;
        u64 now;

        mutex_lock(&group->rtpoll_trigger_lock);

        now = sched_clock();

        if (now > group->rtpoll_until) {
                /*
                 * We are either about to start or might stop polling if no
                 * state change was recorded. Resetting poll_scheduled leaves
                 * a small window for psi_group_change to sneak in and schedule
                 * an immediate poll_work before we get to rescheduling. One
                 * potential extra wakeup at the end of the polling window
                 * should be negligible and polling_next_update still keeps
                 * updates correctly on schedule.
                 */
                atomic_set(&group->rtpoll_scheduled, 0);
                /*
                 * A task change can race with the poll worker that is supposed to
                 * report on it. To avoid missing events, ensure ordering between
                 * poll_scheduled and the task state accesses, such that if the poll
                 * worker misses the state update, the task change is guaranteed to
                 * reschedule the poll worker:
                 *
                 * poll worker:
                 *   atomic_set(poll_scheduled, 0)
                 *   smp_mb()
                 *   LOAD states
                 *
                 * task change:
                 *   STORE states
                 *   if atomic_xchg(poll_scheduled, 1) == 0:
                 *     schedule poll worker
                 *
                 * The atomic_xchg() implies a full barrier.
                 */
                smp_mb();
        } else {
                /* Polling window is not over, keep rescheduling */
                force_reschedule = true;
        }

        collect_percpu_times(group, PSI_POLL, &changed_states);

        if (changed_states & group->rtpoll_states) {
                /* Initialize trigger windows when entering polling mode */
                if (now > group->rtpoll_until)
                        init_rtpoll_triggers(group, now);

                /*
                 * Keep the monitor active for at least the duration of the
                 * minimum tracking window as long as monitor states are
                 * changing.
                 */
                group->rtpoll_until = now +
                        group->rtpoll_min_period * UPDATES_PER_WINDOW;
        }

        if (now > group->rtpoll_until) {
                group->rtpoll_next_update = ULLONG_MAX;
                goto out;
        }

        if (now >= group->rtpoll_next_update) {
                group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
                if (update_total)
                        memcpy(group->rtpoll_total, group->total[PSI_POLL],
                                   sizeof(group->rtpoll_total));
        }

        psi_schedule_rtpoll_work(group,
                nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
                force_reschedule);

out:
        mutex_unlock(&group->rtpoll_trigger_lock);
}

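/*
 * RT kthread serving the rtpoll triggers: sleeps until poll_timer_fn
 * wakes it up, then runs psi_rtpoll_work() to evaluate the triggers.
 */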
static int psi_rtpoll_worker(void *data)
{
        struct psi_group *group = (struct psi_group *)data;

        sched_set_fifo_low(current);

        while (true) {
                wait_event_interruptible(group->rtpoll_wait,
                                atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) ||
                                kthread_should_stop());
                if (kthread_should_stop())
                        break;

                psi_rtpoll_work(group);
        }
        return 0;
}

static void poll_timer_fn(struct timer_list *t)
{
        struct psi_group *group = from_timer(group, t, rtpoll_timer);

        atomic_set(&group->rtpoll_wakeup, 1);
        wake_up_interruptible(&group->rtpoll_wait);
}

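/*
 * Account the time since groupc->state_start to every state currently
 * active in the state mask, and restart the state clock.
 */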
static void record_times(struct psi_group_cpu *groupc, u64 now)
{
        u32 delta;

        delta = now - groupc->state_start;
        groupc->state_start = now;

        if (groupc->state_mask & (1 << PSI_IO_SOME)) {
                groupc->times[PSI_IO_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_IO_FULL))
                        groupc->times[PSI_IO_FULL] += delta;
        }

        if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
                groupc->times[PSI_MEM_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_MEM_FULL))
                        groupc->times[PSI_MEM_FULL] += delta;
        }

        if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
                groupc->times[PSI_CPU_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_CPU_FULL))
                        groupc->times[PSI_CPU_FULL] += delta;
        }

        if (groupc->state_mask & (1 << PSI_NONIDLE))
                groupc->times[PSI_NONIDLE] += delta;
}

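/*
 * Apply a task state change (@clear/@set, including the TSK_ONCPU flag)
 * to one group's per-cpu state: update the task counts, recompute the
 * state mask, account the time spent in the previous states, and kick
 * the rtpoll and averaging machinery when needed.
 */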
static void psi_group_change(struct psi_group *group, int cpu,
                             unsigned int clear, unsigned int set,
                             bool wake_clock)
{
        struct psi_group_cpu *groupc;
        unsigned int t, m;
        enum psi_states s;
        u32 state_mask;
        u64 now;

        lockdep_assert_rq_held(cpu_rq(cpu));
        groupc = per_cpu_ptr(group->pcpu, cpu);

        /*
         * First we update the task counts according to the state
         * change requested through the @clear and @set bits.
         *
         * Then, if cgroup PSI stats accounting is enabled, we
         * assess the aggregate resource states this CPU's tasks
         * have been in since the last change, and account any
         * SOME and FULL time these may have resulted in.
         */
        write_seqcount_begin(&groupc->seq);
        now = cpu_clock(cpu);

        /*
         * Start with TSK_ONCPU, which doesn't have a corresponding
         * task count - it's just a boolean flag directly encoded in
         * the state mask. Clear, set, or carry the current state if
         * no changes are requested.
         */
        if (unlikely(clear & TSK_ONCPU)) {
                state_mask = 0;
                clear &= ~TSK_ONCPU;
        } else if (unlikely(set & TSK_ONCPU)) {
                state_mask = PSI_ONCPU;
                set &= ~TSK_ONCPU;
        } else {
                state_mask = groupc->state_mask & PSI_ONCPU;
        }

        /*
         * The rest of the state mask is calculated based on the task
         * counts. Update those first, then construct the mask.
         */
        for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                if (!(m & (1 << t)))
                        continue;
                if (groupc->tasks[t]) {
                        groupc->tasks[t]--;
                } else if (!psi_bug) {
                        printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                                        cpu, t, groupc->tasks[0],
                                        groupc->tasks[1], groupc->tasks[2],
                                        groupc->tasks[3], clear, set);
                        psi_bug = 1;
                }
        }

        for (t = 0; set; set &= ~(1 << t), t++)
                if (set & (1 << t))
                        groupc->tasks[t]++;

        if (!group->enabled) {
                /*
                 * On the first group change after disabling PSI, conclude
                 * the current state and flush its time. This is unlikely
                 * to matter to the user, but aggregation (get_recent_times)
                 * may have already incorporated the live state into times_prev;
                 * avoid a delta sample underflow when PSI is later re-enabled.
                 */
                if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
                        record_times(groupc, now);

                groupc->state_mask = state_mask;

                write_seqcount_end(&groupc->seq);
                return;
        }

        for (s = 0; s < NR_PSI_STATES; s++) {
                if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
                        state_mask |= (1 << s);
        }

        /*
         * Since we care about lost potential, a memstall is FULL
         * when there are no other working tasks, but also when
         * the CPU is actively reclaiming and nothing productive
         * could run even if it were runnable. So when the current
         * task in a cgroup is in_memstall, the corresponding groupc
         * on that cpu is in PSI_MEM_FULL state.
         */
        if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
                state_mask |= (1 << PSI_MEM_FULL);

        record_times(groupc, now);

        groupc->state_mask = state_mask;

        write_seqcount_end(&groupc->seq);

        if (state_mask & group->rtpoll_states)
                psi_schedule_rtpoll_work(group, 1, false);

        if (wake_clock && !delayed_work_pending(&group->avgs_work))
                schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}

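/*
 * Return the deepest psi_group a task belongs to: its cgroup's group
 * when cgroup accounting is enabled, otherwise the system-wide group.
 * Callers walk ->parent to propagate changes up the hierarchy.
 */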
task_psi_group(struct task_struct * task)887dc86aba7SChengming Zhou static inline struct psi_group *task_psi_group(struct task_struct *task)
8882ce7135aSJohannes Weiner {
8892ce7135aSJohannes Weiner #ifdef CONFIG_CGROUPS
890dc86aba7SChengming Zhou if (static_branch_likely(&psi_cgroups_enabled))
891dc86aba7SChengming Zhou return cgroup_psi(task_dfl_cgroup(task));
8922ce7135aSJohannes Weiner #endif
8932ce7135aSJohannes Weiner return &psi_system;
8942ce7135aSJohannes Weiner }
8952ce7135aSJohannes Weiner
psi_flags_change(struct task_struct * task,int clear,int set)89636b238d5SJohannes Weiner static void psi_flags_change(struct task_struct *task, int clear, int set)
89736b238d5SJohannes Weiner {
89836b238d5SJohannes Weiner if (((task->psi_flags & set) ||
89936b238d5SJohannes Weiner (task->psi_flags & clear) != clear) &&
90036b238d5SJohannes Weiner !psi_bug) {
90136b238d5SJohannes Weiner printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
90236b238d5SJohannes Weiner task->pid, task->comm, task_cpu(task),
90336b238d5SJohannes Weiner task->psi_flags, clear, set);
90436b238d5SJohannes Weiner psi_bug = 1;
90536b238d5SJohannes Weiner }
90636b238d5SJohannes Weiner
90736b238d5SJohannes Weiner task->psi_flags &= ~clear;
90836b238d5SJohannes Weiner task->psi_flags |= set;
90936b238d5SJohannes Weiner }
91036b238d5SJohannes Weiner
psi_task_change(struct task_struct * task,int clear,int set)911eb414681SJohannes Weiner void psi_task_change(struct task_struct *task, int clear, int set)
912eb414681SJohannes Weiner {
913eb414681SJohannes Weiner int cpu = task_cpu(task);
9142ce7135aSJohannes Weiner struct psi_group *group;
915eb414681SJohannes Weiner
916eb414681SJohannes Weiner if (!task->pid)
917eb414681SJohannes Weiner return;
918eb414681SJohannes Weiner
91936b238d5SJohannes Weiner psi_flags_change(task, clear, set);
920eb414681SJohannes Weiner
921dc86aba7SChengming Zhou group = task_psi_group(task);
922dc86aba7SChengming Zhou do {
923*1f997b1dSJohannes Weiner psi_group_change(group, cpu, clear, set, true);
924dc86aba7SChengming Zhou } while ((group = group->parent));
92536b238d5SJohannes Weiner }
9260e94682bSSuren Baghdasaryan
psi_task_switch(struct task_struct * prev,struct task_struct * next,bool sleep)92736b238d5SJohannes Weiner void psi_task_switch(struct task_struct *prev, struct task_struct *next,
92836b238d5SJohannes Weiner bool sleep)
92936b238d5SJohannes Weiner {
93036b238d5SJohannes Weiner struct psi_group *group, *common = NULL;
93136b238d5SJohannes Weiner int cpu = task_cpu(prev);
9320e94682bSSuren Baghdasaryan
93336b238d5SJohannes Weiner if (next->pid) {
93436b238d5SJohannes Weiner psi_flags_change(next, 0, TSK_ONCPU);
93536b238d5SJohannes Weiner /*
93665176f59SChengming Zhou * Set TSK_ONCPU on @next's cgroups. If @next shares any
93765176f59SChengming Zhou * ancestors with @prev, those will already have @prev's
93865176f59SChengming Zhou * TSK_ONCPU bit set, and we can stop the iteration there.
93936b238d5SJohannes Weiner */
940dc86aba7SChengming Zhou group = task_psi_group(next);
941dc86aba7SChengming Zhou do {
94271dbdde7SJohannes Weiner if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
94371dbdde7SJohannes Weiner PSI_ONCPU) {
94436b238d5SJohannes Weiner common = group;
94536b238d5SJohannes Weiner break;
94636b238d5SJohannes Weiner }
94736b238d5SJohannes Weiner
948*1f997b1dSJohannes Weiner psi_group_change(group, cpu, 0, TSK_ONCPU, true);
949dc86aba7SChengming Zhou } while ((group = group->parent));
95036b238d5SJohannes Weiner }
95136b238d5SJohannes Weiner
95236b238d5SJohannes Weiner if (prev->pid) {
9534117cebfSChengming Zhou int clear = TSK_ONCPU, set = 0;
954c530a3c7SChengming Zhou bool wake_clock = true;
9554117cebfSChengming Zhou
9564117cebfSChengming Zhou /*
957cb0e52b7SBrian Chen * When we're going to sleep, psi_dequeue() lets us
958cb0e52b7SBrian Chen * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
959cb0e52b7SBrian Chen * TSK_IOWAIT here, where we can combine it with
960cb0e52b7SBrian Chen * TSK_ONCPU and save walking common ancestors twice.
9614117cebfSChengming Zhou */
9624117cebfSChengming Zhou if (sleep) {
9634117cebfSChengming Zhou clear |= TSK_RUNNING;
964cb0e52b7SBrian Chen if (prev->in_memstall)
965cb0e52b7SBrian Chen clear |= TSK_MEMSTALL_RUNNING;
9664117cebfSChengming Zhou if (prev->in_iowait)
9674117cebfSChengming Zhou set |= TSK_IOWAIT;
968c530a3c7SChengming Zhou
969c530a3c7SChengming Zhou /*
970c530a3c7SChengming Zhou * Periodic aggregation shuts off if there is a period of no
971c530a3c7SChengming Zhou * task changes, so we wake it back up if necessary. However,
972c530a3c7SChengming Zhou * don't do this if the task change is the aggregation worker
973c530a3c7SChengming Zhou * itself going to sleep, or we'll ping-pong forever.
974c530a3c7SChengming Zhou */
975c530a3c7SChengming Zhou if (unlikely((prev->flags & PF_WQ_WORKER) &&
976c530a3c7SChengming Zhou wq_worker_last_func(prev) == psi_avgs_work))
977c530a3c7SChengming Zhou wake_clock = false;
9784117cebfSChengming Zhou }
9794117cebfSChengming Zhou
9804117cebfSChengming Zhou psi_flags_change(prev, clear, set);
98136b238d5SJohannes Weiner
982dc86aba7SChengming Zhou group = task_psi_group(prev);
983dc86aba7SChengming Zhou do {
984dc86aba7SChengming Zhou if (group == common)
985dc86aba7SChengming Zhou break;
986*1f997b1dSJohannes Weiner psi_group_change(group, cpu, clear, set, wake_clock);
987dc86aba7SChengming Zhou } while ((group = group->parent));
9884117cebfSChengming Zhou
9894117cebfSChengming Zhou /*
99065176f59SChengming Zhou * TSK_ONCPU is handled up to the common ancestor. If there are
99165176f59SChengming Zhou * any other differences between the two tasks (e.g. prev goes
99265176f59SChengming Zhou * to sleep, or only one task is in memstall), finish propagating
99365176f59SChengming Zhou * those differences all the way up to the root.
9944117cebfSChengming Zhou */
99565176f59SChengming Zhou if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
9964117cebfSChengming Zhou clear &= ~TSK_ONCPU;
997dc86aba7SChengming Zhou for (; group; group = group->parent)
998*1f997b1dSJohannes Weiner psi_group_change(group, cpu, clear, set, wake_clock);
9994117cebfSChengming Zhou }
10001b69ac6bSJohannes Weiner }
1001eb414681SJohannes Weiner }
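/*
 * Illustrative walk (hypothetical hierarchy, not from this file): if
 * prev runs in cgroup /A/B and next runs in /A/C, the @next loop sets
 * TSK_ONCPU on C and stops at A, because A still carries PSI_ONCPU on
 * behalf of prev. The @prev loop then clears TSK_ONCPU on B only, and
 * any remaining flag differences (sleep, memstall) are propagated from
 * the common ancestor A up to the root.
 */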
1002eb414681SJohannes Weiner
100352b1364bSChengming Zhou #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1004448a2500SJohn Stultz void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
100552b1364bSChengming Zhou {
1006448a2500SJohn Stultz int cpu = task_cpu(curr);
100752b1364bSChengming Zhou struct psi_group *group;
100852b1364bSChengming Zhou struct psi_group_cpu *groupc;
1009448a2500SJohn Stultz s64 delta;
1010*1f997b1dSJohannes Weiner u64 irq;
101152b1364bSChengming Zhou
1012448a2500SJohn Stultz if (!curr->pid)
1013448a2500SJohn Stultz return;
1014448a2500SJohn Stultz
1015448a2500SJohn Stultz lockdep_assert_rq_held(rq);
1016448a2500SJohn Stultz group = task_psi_group(curr);
1017448a2500SJohn Stultz if (prev && task_psi_group(prev) == group)
101852b1364bSChengming Zhou return;
101952b1364bSChengming Zhou
1020448a2500SJohn Stultz irq = irq_time_read(cpu);
1021448a2500SJohn Stultz delta = (s64)(irq - rq->psi_irq_time);
1022448a2500SJohn Stultz if (delta < 0)
1023448a2500SJohn Stultz return;
1024448a2500SJohn Stultz rq->psi_irq_time = irq;
102552b1364bSChengming Zhou
1026dc86aba7SChengming Zhou do {
1027*1f997b1dSJohannes Weiner u64 now;
1028*1f997b1dSJohannes Weiner
102934f26a15SChengming Zhou if (!group->enabled)
103034f26a15SChengming Zhou continue;
103134f26a15SChengming Zhou
103252b1364bSChengming Zhou groupc = per_cpu_ptr(group->pcpu, cpu);
103352b1364bSChengming Zhou
103452b1364bSChengming Zhou write_seqcount_begin(&groupc->seq);
1035*1f997b1dSJohannes Weiner now = cpu_clock(cpu);
103652b1364bSChengming Zhou
103752b1364bSChengming Zhou record_times(groupc, now);
103852b1364bSChengming Zhou groupc->times[PSI_IRQ_FULL] += delta;
103952b1364bSChengming Zhou
104052b1364bSChengming Zhou write_seqcount_end(&groupc->seq);
104152b1364bSChengming Zhou
104265457b74SDomenico Cerasuolo if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
104365457b74SDomenico Cerasuolo psi_schedule_rtpoll_work(group, 1, false);
1044dc86aba7SChengming Zhou } while ((group = group->parent));
104552b1364bSChengming Zhou }
104652b1364bSChengming Zhou #endif
104752b1364bSChengming Zhou
1048eb414681SJohannes Weiner /**
1049eb414681SJohannes Weiner * psi_memstall_enter - mark the beginning of a memory stall section
1050eb414681SJohannes Weiner * @flags: flags to handle nested sections
1051eb414681SJohannes Weiner *
1052eb414681SJohannes Weiner * Marks the calling task as being stalled due to a lack of memory,
1053eb414681SJohannes Weiner * such as waiting for a refault or performing reclaim.
1054eb414681SJohannes Weiner */
1055eb414681SJohannes Weiner void psi_memstall_enter(unsigned long *flags)
1056eb414681SJohannes Weiner {
1057eb414681SJohannes Weiner struct rq_flags rf;
1058eb414681SJohannes Weiner struct rq *rq;
1059eb414681SJohannes Weiner
1060e0c27447SJohannes Weiner if (static_branch_likely(&psi_disabled))
1061eb414681SJohannes Weiner return;
1062eb414681SJohannes Weiner
10631066d1b6SYafang Shao *flags = current->in_memstall;
1064eb414681SJohannes Weiner if (*flags)
1065eb414681SJohannes Weiner return;
1066eb414681SJohannes Weiner /*
10671066d1b6SYafang Shao * in_memstall setting & accounting needs to be atomic wrt
1068eb414681SJohannes Weiner * changes to the task's scheduling state, otherwise we can
1069eb414681SJohannes Weiner * race with CPU migration.
1070eb414681SJohannes Weiner */
1071eb414681SJohannes Weiner rq = this_rq_lock_irq(&rf);
1072eb414681SJohannes Weiner
10731066d1b6SYafang Shao current->in_memstall = 1;
1074cb0e52b7SBrian Chen psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
1075eb414681SJohannes Weiner
1076eb414681SJohannes Weiner rq_unlock_irq(rq, &rf);
1077eb414681SJohannes Weiner }
1078527eb453SChristoph Hellwig EXPORT_SYMBOL_GPL(psi_memstall_enter);
1079eb414681SJohannes Weiner
1080eb414681SJohannes Weiner /**
1081eb414681SJohannes Weiner * psi_memstall_leave - mark the end of a memory stall section
1082eb414681SJohannes Weiner * @flags: flags to handle nested memdelay sections
1083eb414681SJohannes Weiner *
1084eb414681SJohannes Weiner * Marks the calling task as no longer stalled due to lack of memory.
1085eb414681SJohannes Weiner */
1086eb414681SJohannes Weiner void psi_memstall_leave(unsigned long *flags)
1087eb414681SJohannes Weiner {
1088eb414681SJohannes Weiner struct rq_flags rf;
1089eb414681SJohannes Weiner struct rq *rq;
1090eb414681SJohannes Weiner
1091e0c27447SJohannes Weiner if (static_branch_likely(&psi_disabled))
1092eb414681SJohannes Weiner return;
1093eb414681SJohannes Weiner
1094eb414681SJohannes Weiner if (*flags)
1095eb414681SJohannes Weiner return;
1096eb414681SJohannes Weiner /*
10971066d1b6SYafang Shao * in_memstall clearing & accounting needs to be atomic wrt
1098eb414681SJohannes Weiner * changes to the task's scheduling state, otherwise we could
1099eb414681SJohannes Weiner * race with CPU migration.
1100eb414681SJohannes Weiner */
1101eb414681SJohannes Weiner rq = this_rq_lock_irq(&rf);
1102eb414681SJohannes Weiner
11031066d1b6SYafang Shao current->in_memstall = 0;
1104cb0e52b7SBrian Chen psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
1105eb414681SJohannes Weiner
1106eb414681SJohannes Weiner rq_unlock_irq(rq, &rf);
1107eb414681SJohannes Weiner }
1108527eb453SChristoph Hellwig EXPORT_SYMBOL_GPL(psi_memstall_leave);
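/*
 * Illustrative usage sketch (not taken from this file): callers that
 * are about to block on memory bracket the blocking section with a
 * local flag word, which makes nested stall sections safe:
 *
 *	unsigned long pflags;
 *
 *	psi_memstall_enter(&pflags);
 *	... wait for a refault or perform reclaim ...
 *	psi_memstall_leave(&pflags);
 */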
1109eb414681SJohannes Weiner
11102ce7135aSJohannes Weiner #ifdef CONFIG_CGROUPS
11112ce7135aSJohannes Weiner int psi_cgroup_alloc(struct cgroup *cgroup)
11122ce7135aSJohannes Weiner {
1113e2ad8ab0SChengming Zhou if (!static_branch_likely(&psi_cgroups_enabled))
11142ce7135aSJohannes Weiner return 0;
11152ce7135aSJohannes Weiner
11162b97cf76SHao Jia cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
11175f69a657SChen Wandun if (!cgroup->psi)
11182ce7135aSJohannes Weiner return -ENOMEM;
11195f69a657SChen Wandun
11205f69a657SChen Wandun cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
11215f69a657SChen Wandun if (!cgroup->psi->pcpu) {
11225f69a657SChen Wandun kfree(cgroup->psi);
11235f69a657SChen Wandun return -ENOMEM;
11245f69a657SChen Wandun }
11255f69a657SChen Wandun group_init(cgroup->psi);
1126dc86aba7SChengming Zhou cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
11272ce7135aSJohannes Weiner return 0;
11282ce7135aSJohannes Weiner }
11292ce7135aSJohannes Weiner
11302ce7135aSJohannes Weiner void psi_cgroup_free(struct cgroup *cgroup)
11312ce7135aSJohannes Weiner {
1132e2ad8ab0SChengming Zhou if (!static_branch_likely(&psi_cgroups_enabled))
11332ce7135aSJohannes Weiner return;
11342ce7135aSJohannes Weiner
11355f69a657SChen Wandun cancel_delayed_work_sync(&cgroup->psi->avgs_work);
11365f69a657SChen Wandun free_percpu(cgroup->psi->pcpu);
11370e94682bSSuren Baghdasaryan /* All triggers must be removed by now */
113865457b74SDomenico Cerasuolo WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
11395f69a657SChen Wandun kfree(cgroup->psi);
11402ce7135aSJohannes Weiner }
11412ce7135aSJohannes Weiner
11422ce7135aSJohannes Weiner /**
11432ce7135aSJohannes Weiner * cgroup_move_task - move task to a different cgroup
11442ce7135aSJohannes Weiner * @task: the task
11452ce7135aSJohannes Weiner * @to: the target css_set
11462ce7135aSJohannes Weiner *
11472ce7135aSJohannes Weiner * Move task to a new cgroup and safely migrate its associated stall
11482ce7135aSJohannes Weiner * state between the different groups.
11492ce7135aSJohannes Weiner *
11502ce7135aSJohannes Weiner * This function acquires the task's rq lock to lock out concurrent
11512ce7135aSJohannes Weiner * changes to the task's scheduling state and - in case the task is
11522ce7135aSJohannes Weiner * running - concurrent changes to its stall state.
11532ce7135aSJohannes Weiner */
11542ce7135aSJohannes Weiner void cgroup_move_task(struct task_struct *task, struct css_set *to)
11552ce7135aSJohannes Weiner {
1156d583d360SJohannes Weiner unsigned int task_flags;
11572ce7135aSJohannes Weiner struct rq_flags rf;
11582ce7135aSJohannes Weiner struct rq *rq;
11592ce7135aSJohannes Weiner
1160e2ad8ab0SChengming Zhou if (!static_branch_likely(&psi_cgroups_enabled)) {
11618fcb2312SOlof Johansson /*
11628fcb2312SOlof Johansson * Lame to do this here, but the scheduler cannot be locked
11638fcb2312SOlof Johansson * from the outside, so we move cgroups from inside sched/.
11648fcb2312SOlof Johansson */
11658fcb2312SOlof Johansson rcu_assign_pointer(task->cgroups, to);
11668fcb2312SOlof Johansson return;
11678fcb2312SOlof Johansson }
11688fcb2312SOlof Johansson
11692ce7135aSJohannes Weiner rq = task_rq_lock(task, &rf);
11702ce7135aSJohannes Weiner
1171d583d360SJohannes Weiner /*
1172d583d360SJohannes Weiner * We may race with schedule() dropping the rq lock between
1173d583d360SJohannes Weiner * deactivating prev and switching to next. Because the psi
1174d583d360SJohannes Weiner * updates from the deactivation are deferred to the switch
1175d583d360SJohannes Weiner * callback to save cgroup tree updates, the task's scheduling
1176d583d360SJohannes Weiner * state here is not coherent with its psi state:
1177d583d360SJohannes Weiner *
1178d583d360SJohannes Weiner * schedule() cgroup_move_task()
1179d583d360SJohannes Weiner * rq_lock()
1180d583d360SJohannes Weiner * deactivate_task()
1181d583d360SJohannes Weiner * p->on_rq = 0
1182d583d360SJohannes Weiner * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
1183d583d360SJohannes Weiner * pick_next_task()
1184d583d360SJohannes Weiner * rq_unlock()
1185d583d360SJohannes Weiner * rq_lock()
1186d583d360SJohannes Weiner * psi_task_change() // old cgroup
1187d583d360SJohannes Weiner * task->cgroups = to
1188d583d360SJohannes Weiner * psi_task_change() // new cgroup
1189d583d360SJohannes Weiner * rq_unlock()
1190d583d360SJohannes Weiner * rq_lock()
1191d583d360SJohannes Weiner * psi_sched_switch() // does deferred updates in new cgroup
1192d583d360SJohannes Weiner *
1193d583d360SJohannes Weiner * Don't rely on the scheduling state. Use psi_flags instead.
1194d583d360SJohannes Weiner */
1195d583d360SJohannes Weiner task_flags = task->psi_flags;
11962ce7135aSJohannes Weiner
11972ce7135aSJohannes Weiner if (task_flags)
11982ce7135aSJohannes Weiner psi_task_change(task, task_flags, 0);
11992ce7135aSJohannes Weiner
12008fcb2312SOlof Johansson /* See comment above */
12012ce7135aSJohannes Weiner rcu_assign_pointer(task->cgroups, to);
12022ce7135aSJohannes Weiner
12032ce7135aSJohannes Weiner if (task_flags)
12042ce7135aSJohannes Weiner psi_task_change(task, 0, task_flags);
12052ce7135aSJohannes Weiner
12062ce7135aSJohannes Weiner task_rq_unlock(rq, task, &rf);
12072ce7135aSJohannes Weiner }
120834f26a15SChengming Zhou
120934f26a15SChengming Zhou void psi_cgroup_restart(struct psi_group *group)
121034f26a15SChengming Zhou {
121134f26a15SChengming Zhou int cpu;
121234f26a15SChengming Zhou
121334f26a15SChengming Zhou /*
121434f26a15SChengming Zhou * After psi_group->enabled is cleared, per-cpu task accounting
121534f26a15SChengming Zhou * in each psi_group_cpu keeps running; only the test_state()
121634f26a15SChengming Zhou * loop, record_times() and the averaging worker are stopped.
121734f26a15SChengming Zhou * See psi_group_change() for details.
121834f26a15SChengming Zhou *
121934f26a15SChengming Zhou * When cgroup PSI is being disabled, there is nothing to sync
122034f26a15SChengming Zhou * here: the cgroup pressure files are hidden, and the percpu
122134f26a15SChengming Zhou * psi_group_cpu sees !psi_group->enabled and only does task accounting.
122234f26a15SChengming Zhou *
122334f26a15SChengming Zhou * When cgroup PSI is re-enabled, use psi_group_change() to get
122434f26a15SChengming Zhou * the correct state mask from the test_state() loop over tasks[],
122534f26a15SChengming Zhou * and restart groupc->state_start from now. Use .clear = .set = 0
122634f26a15SChengming Zhou * here since no task state really changed.
122734f26a15SChengming Zhou */
122834f26a15SChengming Zhou if (!group->enabled)
122934f26a15SChengming Zhou return;
123034f26a15SChengming Zhou
123134f26a15SChengming Zhou for_each_possible_cpu(cpu) {
123234f26a15SChengming Zhou struct rq *rq = cpu_rq(cpu);
123334f26a15SChengming Zhou struct rq_flags rf;
123434f26a15SChengming Zhou
123534f26a15SChengming Zhou rq_lock_irq(rq, &rf);
1236*1f997b1dSJohannes Weiner psi_group_change(group, cpu, 0, 0, true);
123734f26a15SChengming Zhou rq_unlock_irq(rq, &rf);
123834f26a15SChengming Zhou }
123934f26a15SChengming Zhou }
12402ce7135aSJohannes Weiner #endif /* CONFIG_CGROUPS */
12412ce7135aSJohannes Weiner
12422ce7135aSJohannes Weiner int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
1243eb414681SJohannes Weiner {
124452b1364bSChengming Zhou bool only_full = false;
1245eb414681SJohannes Weiner int full;
12467fc70a39SSuren Baghdasaryan u64 now;
1247eb414681SJohannes Weiner
1248e0c27447SJohannes Weiner if (static_branch_likely(&psi_disabled))
1249eb414681SJohannes Weiner return -EOPNOTSUPP;
1250eb414681SJohannes Weiner
12517fc70a39SSuren Baghdasaryan /* Update averages before reporting them */
12527fc70a39SSuren Baghdasaryan mutex_lock(&group->avgs_lock);
12537fc70a39SSuren Baghdasaryan now = sched_clock();
12540e94682bSSuren Baghdasaryan collect_percpu_times(group, PSI_AVGS, NULL);
12557fc70a39SSuren Baghdasaryan if (now >= group->avg_next_update)
12567fc70a39SSuren Baghdasaryan group->avg_next_update = update_averages(group, now);
12577fc70a39SSuren Baghdasaryan mutex_unlock(&group->avgs_lock);
1258eb414681SJohannes Weiner
125952b1364bSChengming Zhou #ifdef CONFIG_IRQ_TIME_ACCOUNTING
126052b1364bSChengming Zhou only_full = res == PSI_IRQ;
126152b1364bSChengming Zhou #endif
126252b1364bSChengming Zhou
126352b1364bSChengming Zhou for (full = 0; full < 2 - only_full; full++) {
1264890d550dSChengming Zhou unsigned long avg[3] = { 0, };
1265890d550dSChengming Zhou u64 total = 0;
1266eb414681SJohannes Weiner int w;
1267eb414681SJohannes Weiner
1268890d550dSChengming Zhou /* CPU FULL is undefined at the system level */
1269890d550dSChengming Zhou if (!(group == &psi_system && res == PSI_CPU && full)) {
1270eb414681SJohannes Weiner for (w = 0; w < 3; w++)
1271eb414681SJohannes Weiner avg[w] = group->avg[res * 2 + full][w];
12720e94682bSSuren Baghdasaryan total = div_u64(group->total[PSI_AVGS][res * 2 + full],
12730e94682bSSuren Baghdasaryan NSEC_PER_USEC);
1274890d550dSChengming Zhou }
1275eb414681SJohannes Weiner
1276eb414681SJohannes Weiner seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
127752b1364bSChengming Zhou full || only_full ? "full" : "some",
1278eb414681SJohannes Weiner LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
1279eb414681SJohannes Weiner LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
1280eb414681SJohannes Weiner LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
1281eb414681SJohannes Weiner total);
1282eb414681SJohannes Weiner }
1283eb414681SJohannes Weiner
1284eb414681SJohannes Weiner return 0;
1285eb414681SJohannes Weiner }
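/*
 * Sample of the output produced above (values are illustrative), as
 * read from /proc/pressure/memory or a cgroup's memory.pressure file:
 *
 *	some avg10=0.22 avg60=0.17 avg300=1.11 total=927586
 *	full avg10=0.16 avg60=0.15 avg300=1.08 total=861442
 */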
1286eb414681SJohannes Weiner
1287aff03707SSuren Baghdasaryan struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
1288aff03707SSuren Baghdasaryan enum psi_res res, struct file *file,
1289aff03707SSuren Baghdasaryan struct kernfs_open_file *of)
12900e94682bSSuren Baghdasaryan {
12910e94682bSSuren Baghdasaryan struct psi_trigger *t;
12920e94682bSSuren Baghdasaryan enum psi_states state;
12930e94682bSSuren Baghdasaryan u32 threshold_us;
1294d82caa27SDomenico Cerasuolo bool privileged;
12950e94682bSSuren Baghdasaryan u32 window_us;
12960e94682bSSuren Baghdasaryan
12970e94682bSSuren Baghdasaryan if (static_branch_likely(&psi_disabled))
12980e94682bSSuren Baghdasaryan return ERR_PTR(-EOPNOTSUPP);
12990e94682bSSuren Baghdasaryan
1300d82caa27SDomenico Cerasuolo /*
1301d82caa27SDomenico Cerasuolo * Checking the privilege here on file->f_cred implies that a privileged user
1302d82caa27SDomenico Cerasuolo * could open the file and delegate the write to an unprivileged one.
1303d82caa27SDomenico Cerasuolo */
1304d82caa27SDomenico Cerasuolo privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
1305d82caa27SDomenico Cerasuolo
13060e94682bSSuren Baghdasaryan if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
13070e94682bSSuren Baghdasaryan state = PSI_IO_SOME + res * 2;
13080e94682bSSuren Baghdasaryan else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
13090e94682bSSuren Baghdasaryan state = PSI_IO_FULL + res * 2;
13100e94682bSSuren Baghdasaryan else
13110e94682bSSuren Baghdasaryan return ERR_PTR(-EINVAL);
13120e94682bSSuren Baghdasaryan
131352b1364bSChengming Zhou #ifdef CONFIG_IRQ_TIME_ACCOUNTING
131452b1364bSChengming Zhou if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
131552b1364bSChengming Zhou return ERR_PTR(-EINVAL);
131652b1364bSChengming Zhou #endif
131752b1364bSChengming Zhou
13180e94682bSSuren Baghdasaryan if (state >= PSI_NONIDLE)
13190e94682bSSuren Baghdasaryan return ERR_PTR(-EINVAL);
13200e94682bSSuren Baghdasaryan
1321519fabc7SSuren Baghdasaryan if (window_us == 0 || window_us > WINDOW_MAX_US)
13220e94682bSSuren Baghdasaryan return ERR_PTR(-EINVAL);
13230e94682bSSuren Baghdasaryan
1324d82caa27SDomenico Cerasuolo /*
1325d82caa27SDomenico Cerasuolo * Unprivileged users can only use window sizes that are multiples of 2s,
1326d82caa27SDomenico Cerasuolo * so that the averages aggregation work is used and no RT threads need to be spawned.
1327d82caa27SDomenico Cerasuolo */
1328d82caa27SDomenico Cerasuolo if (!privileged && window_us % 2000000)
1329d82caa27SDomenico Cerasuolo return ERR_PTR(-EINVAL);
1330d82caa27SDomenico Cerasuolo
13310e94682bSSuren Baghdasaryan /* Check threshold */
13320e94682bSSuren Baghdasaryan if (threshold_us == 0 || threshold_us > window_us)
13330e94682bSSuren Baghdasaryan return ERR_PTR(-EINVAL);
13340e94682bSSuren Baghdasaryan
13350e94682bSSuren Baghdasaryan t = kmalloc(sizeof(*t), GFP_KERNEL);
13360e94682bSSuren Baghdasaryan if (!t)
13370e94682bSSuren Baghdasaryan return ERR_PTR(-ENOMEM);
13380e94682bSSuren Baghdasaryan
13390e94682bSSuren Baghdasaryan t->group = group;
13400e94682bSSuren Baghdasaryan t->state = state;
13410e94682bSSuren Baghdasaryan t->threshold = threshold_us * NSEC_PER_USEC;
13420e94682bSSuren Baghdasaryan t->win.size = window_us * NSEC_PER_USEC;
1343915a087eSHailong Liu window_reset(&t->win, sched_clock(),
1344915a087eSHailong Liu group->total[PSI_POLL][t->state], 0);
13450e94682bSSuren Baghdasaryan
13460e94682bSSuren Baghdasaryan t->event = 0;
13470e94682bSSuren Baghdasaryan t->last_event_time = 0;
1348aff03707SSuren Baghdasaryan t->of = of;
1349aff03707SSuren Baghdasaryan if (!of)
13500e94682bSSuren Baghdasaryan init_waitqueue_head(&t->event_wait);
1351e6df4eadSZhaoyang Huang t->pending_event = false;
1352d82caa27SDomenico Cerasuolo t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
13530e94682bSSuren Baghdasaryan
1354d82caa27SDomenico Cerasuolo if (privileged) {
135565457b74SDomenico Cerasuolo mutex_lock(&group->rtpoll_trigger_lock);
13560e94682bSSuren Baghdasaryan
135765457b74SDomenico Cerasuolo if (!rcu_access_pointer(group->rtpoll_task)) {
1358461daba0SSuren Baghdasaryan struct task_struct *task;
13590e94682bSSuren Baghdasaryan
136065457b74SDomenico Cerasuolo task = kthread_create(psi_rtpoll_worker, group, "psimon");
1361461daba0SSuren Baghdasaryan if (IS_ERR(task)) {
13620e94682bSSuren Baghdasaryan kfree(t);
136365457b74SDomenico Cerasuolo mutex_unlock(&group->rtpoll_trigger_lock);
1364461daba0SSuren Baghdasaryan return ERR_CAST(task);
13650e94682bSSuren Baghdasaryan }
136665457b74SDomenico Cerasuolo atomic_set(&group->rtpoll_wakeup, 0);
1367461daba0SSuren Baghdasaryan wake_up_process(task);
136865457b74SDomenico Cerasuolo rcu_assign_pointer(group->rtpoll_task, task);
13690e94682bSSuren Baghdasaryan }
13700e94682bSSuren Baghdasaryan
137165457b74SDomenico Cerasuolo list_add(&t->node, &group->rtpoll_triggers);
137265457b74SDomenico Cerasuolo group->rtpoll_min_period = min(group->rtpoll_min_period,
13730e94682bSSuren Baghdasaryan div_u64(t->win.size, UPDATES_PER_WINDOW));
137465457b74SDomenico Cerasuolo group->rtpoll_nr_triggers[t->state]++;
137565457b74SDomenico Cerasuolo group->rtpoll_states |= (1 << t->state);
13760e94682bSSuren Baghdasaryan
137765457b74SDomenico Cerasuolo mutex_unlock(&group->rtpoll_trigger_lock);
1378d82caa27SDomenico Cerasuolo } else {
1379d82caa27SDomenico Cerasuolo mutex_lock(&group->avgs_lock);
13800e94682bSSuren Baghdasaryan
1381d82caa27SDomenico Cerasuolo list_add(&t->node, &group->avg_triggers);
1382d82caa27SDomenico Cerasuolo group->avg_nr_triggers[t->state]++;
1383d82caa27SDomenico Cerasuolo
1384d82caa27SDomenico Cerasuolo mutex_unlock(&group->avgs_lock);
1385d82caa27SDomenico Cerasuolo }
13860e94682bSSuren Baghdasaryan return t;
13870e94682bSSuren Baghdasaryan }
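/*
 * Trigger string sketch (example values only): userspace writes
 * "<some|full> <threshold_us> <window_us>" to a pressure file, e.g.
 *
 *	some 150000 1000000
 *
 * to request a notification whenever more than 150ms of SOME stall
 * time accumulates within any 1s window. Privileged writers get the
 * rtpoll worker; unprivileged writers are limited to windows that are
 * multiples of 2s, serviced by the averages work.
 */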
13880e94682bSSuren Baghdasaryan
1389a06247c6SSuren Baghdasaryan void psi_trigger_destroy(struct psi_trigger *t)
13900e94682bSSuren Baghdasaryan {
1391a06247c6SSuren Baghdasaryan struct psi_group *group;
1392461daba0SSuren Baghdasaryan struct task_struct *task_to_destroy = NULL;
13930e94682bSSuren Baghdasaryan
1394a06247c6SSuren Baghdasaryan /*
1395a06247c6SSuren Baghdasaryan * We do not check psi_disabled since it might have been disabled after
1396a06247c6SSuren Baghdasaryan * the trigger got created.
1397a06247c6SSuren Baghdasaryan */
1398a06247c6SSuren Baghdasaryan if (!t)
13990e94682bSSuren Baghdasaryan return;
14000e94682bSSuren Baghdasaryan
1401a06247c6SSuren Baghdasaryan group = t->group;
14020e94682bSSuren Baghdasaryan /*
1403c2dbe32dSMunehisa Kamata * Wake up waiters to stop polling and clear the queue to prevent it from
1404c2dbe32dSMunehisa Kamata * being accessed later. This can happen if the cgroup is deleted from under a
1405c2dbe32dSMunehisa Kamata * polling process.
14060e94682bSSuren Baghdasaryan */
1407aff03707SSuren Baghdasaryan if (t->of)
1408aff03707SSuren Baghdasaryan kernfs_notify(t->of->kn);
1409aff03707SSuren Baghdasaryan else
1410aff03707SSuren Baghdasaryan wake_up_interruptible(&t->event_wait);
14110e94682bSSuren Baghdasaryan
1412d82caa27SDomenico Cerasuolo if (t->aggregator == PSI_AVGS) {
1413d82caa27SDomenico Cerasuolo mutex_lock(&group->avgs_lock);
1414d82caa27SDomenico Cerasuolo if (!list_empty(&t->node)) {
1415d82caa27SDomenico Cerasuolo list_del(&t->node);
1416d82caa27SDomenico Cerasuolo group->avg_nr_triggers[t->state]--;
1417d82caa27SDomenico Cerasuolo }
1418d82caa27SDomenico Cerasuolo mutex_unlock(&group->avgs_lock);
1419d82caa27SDomenico Cerasuolo } else {
142065457b74SDomenico Cerasuolo mutex_lock(&group->rtpoll_trigger_lock);
14210e94682bSSuren Baghdasaryan if (!list_empty(&t->node)) {
14220e94682bSSuren Baghdasaryan struct psi_trigger *tmp;
14230e94682bSSuren Baghdasaryan u64 period = ULLONG_MAX;
14240e94682bSSuren Baghdasaryan
14250e94682bSSuren Baghdasaryan list_del(&t->node);
142665457b74SDomenico Cerasuolo group->rtpoll_nr_triggers[t->state]--;
142765457b74SDomenico Cerasuolo if (!group->rtpoll_nr_triggers[t->state])
142865457b74SDomenico Cerasuolo group->rtpoll_states &= ~(1 << t->state);
1429e2a1f85bSYang Yang /*
1430e2a1f85bSYang Yang * Reset min update period for the remaining triggers
1431e2a1f85bSYang Yang * iff the trigger being destroyed had the min window size.
1432e2a1f85bSYang Yang */
1433e2a1f85bSYang Yang if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
143465457b74SDomenico Cerasuolo list_for_each_entry(tmp, &group->rtpoll_triggers, node)
14350e94682bSSuren Baghdasaryan period = min(period, div_u64(tmp->win.size,
14360e94682bSSuren Baghdasaryan UPDATES_PER_WINDOW));
143765457b74SDomenico Cerasuolo group->rtpoll_min_period = period;
1438e2a1f85bSYang Yang }
143965457b74SDomenico Cerasuolo /* Destroy rtpoll_task when the last trigger is destroyed */
144065457b74SDomenico Cerasuolo if (group->rtpoll_states == 0) {
144165457b74SDomenico Cerasuolo group->rtpoll_until = 0;
1442461daba0SSuren Baghdasaryan task_to_destroy = rcu_dereference_protected(
144365457b74SDomenico Cerasuolo group->rtpoll_task,
144465457b74SDomenico Cerasuolo lockdep_is_held(&group->rtpoll_trigger_lock));
144565457b74SDomenico Cerasuolo rcu_assign_pointer(group->rtpoll_task, NULL);
144665457b74SDomenico Cerasuolo del_timer(&group->rtpoll_timer);
14470e94682bSSuren Baghdasaryan }
14480e94682bSSuren Baghdasaryan }
144965457b74SDomenico Cerasuolo mutex_unlock(&group->rtpoll_trigger_lock);
1450d82caa27SDomenico Cerasuolo }
14510e94682bSSuren Baghdasaryan
14520e94682bSSuren Baghdasaryan /*
145365457b74SDomenico Cerasuolo * Wait for the RCU read-side critical section in psi_schedule_rtpoll_work()
1454a06247c6SSuren Baghdasaryan * to complete before destroying the trigger and optionally the
145565457b74SDomenico Cerasuolo * rtpoll_task.
14560e94682bSSuren Baghdasaryan */
14570e94682bSSuren Baghdasaryan synchronize_rcu();
14580e94682bSSuren Baghdasaryan /*
145965457b74SDomenico Cerasuolo * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
146065457b74SDomenico Cerasuolo * a deadlock while waiting for psi_rtpoll_work to acquire
146165457b74SDomenico Cerasuolo * rtpoll_trigger_lock.
14620e94682bSSuren Baghdasaryan */
1463461daba0SSuren Baghdasaryan if (task_to_destroy) {
14647b2b55daSJason Xing /*
14657b2b55daSJason Xing * After the RCU grace period has expired, the worker
146665457b74SDomenico Cerasuolo * can no longer be found through group->rtpoll_task.
14677b2b55daSJason Xing */
1468461daba0SSuren Baghdasaryan kthread_stop(task_to_destroy);
146965457b74SDomenico Cerasuolo atomic_set(&group->rtpoll_scheduled, 0);
14700e94682bSSuren Baghdasaryan }
14710e94682bSSuren Baghdasaryan kfree(t);
14720e94682bSSuren Baghdasaryan }
14730e94682bSSuren Baghdasaryan
14740e94682bSSuren Baghdasaryan __poll_t psi_trigger_poll(void **trigger_ptr,
14750e94682bSSuren Baghdasaryan struct file *file, poll_table *wait)
14760e94682bSSuren Baghdasaryan {
14770e94682bSSuren Baghdasaryan __poll_t ret = DEFAULT_POLLMASK;
14780e94682bSSuren Baghdasaryan struct psi_trigger *t;
14790e94682bSSuren Baghdasaryan
14800e94682bSSuren Baghdasaryan if (static_branch_likely(&psi_disabled))
14810e94682bSSuren Baghdasaryan return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
14820e94682bSSuren Baghdasaryan
1483a06247c6SSuren Baghdasaryan t = smp_load_acquire(trigger_ptr);
1484a06247c6SSuren Baghdasaryan if (!t)
14850e94682bSSuren Baghdasaryan return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
14860e94682bSSuren Baghdasaryan
1487aff03707SSuren Baghdasaryan if (t->of)
1488aff03707SSuren Baghdasaryan kernfs_generic_poll(t->of, wait);
1489aff03707SSuren Baghdasaryan else
14900e94682bSSuren Baghdasaryan poll_wait(file, &t->event_wait, wait);
14910e94682bSSuren Baghdasaryan
14920e94682bSSuren Baghdasaryan if (cmpxchg(&t->event, 1, 0) == 1)
14930e94682bSSuren Baghdasaryan ret |= EPOLLPRI;
14940e94682bSSuren Baghdasaryan
14950e94682bSSuren Baghdasaryan return ret;
14960e94682bSSuren Baghdasaryan }
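/*
 * Minimal userspace sketch (assumes the /proc/pressure files created
 * below; handle_stall() is a placeholder for the consumer's reaction):
 * register a trigger, then poll() for POLLPRI events.
 *
 *	struct pollfd fds = {
 *		.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK),
 *		.events = POLLPRI,
 *	};
 *
 *	write(fds.fd, "some 150000 1000000", 20);
 *	while (poll(&fds, 1, -1) > 0) {
 *		if (fds.revents & POLLERR)
 *			break;		// monitored group went away
 *		if (fds.revents & POLLPRI)
 *			handle_stall();	// threshold crossed within the window
 *	}
 */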
14970e94682bSSuren Baghdasaryan
14985102bb1cSSuren Baghdasaryan #ifdef CONFIG_PROC_FS
14995102bb1cSSuren Baghdasaryan static int psi_io_show(struct seq_file *m, void *v)
15005102bb1cSSuren Baghdasaryan {
15015102bb1cSSuren Baghdasaryan return psi_show(m, &psi_system, PSI_IO);
15025102bb1cSSuren Baghdasaryan }
15035102bb1cSSuren Baghdasaryan
15045102bb1cSSuren Baghdasaryan static int psi_memory_show(struct seq_file *m, void *v)
15055102bb1cSSuren Baghdasaryan {
15065102bb1cSSuren Baghdasaryan return psi_show(m, &psi_system, PSI_MEM);
15075102bb1cSSuren Baghdasaryan }
15085102bb1cSSuren Baghdasaryan
15095102bb1cSSuren Baghdasaryan static int psi_cpu_show(struct seq_file *m, void *v)
15105102bb1cSSuren Baghdasaryan {
15115102bb1cSSuren Baghdasaryan return psi_show(m, &psi_system, PSI_CPU);
15125102bb1cSSuren Baghdasaryan }
15135102bb1cSSuren Baghdasaryan
15145102bb1cSSuren Baghdasaryan static int psi_io_open(struct inode *inode, struct file *file)
15155102bb1cSSuren Baghdasaryan {
1516d82caa27SDomenico Cerasuolo return single_open(file, psi_io_show, NULL);
15175102bb1cSSuren Baghdasaryan }
15185102bb1cSSuren Baghdasaryan
15195102bb1cSSuren Baghdasaryan static int psi_memory_open(struct inode *inode, struct file *file)
15205102bb1cSSuren Baghdasaryan {
1521d82caa27SDomenico Cerasuolo return single_open(file, psi_memory_show, NULL);
15225102bb1cSSuren Baghdasaryan }
15235102bb1cSSuren Baghdasaryan
15245102bb1cSSuren Baghdasaryan static int psi_cpu_open(struct inode *inode, struct file *file)
15255102bb1cSSuren Baghdasaryan {
1526d82caa27SDomenico Cerasuolo return single_open(file, psi_cpu_show, NULL);
15275102bb1cSSuren Baghdasaryan }
15285102bb1cSSuren Baghdasaryan
15290e94682bSSuren Baghdasaryan static ssize_t psi_write(struct file *file, const char __user *user_buf,
15300e94682bSSuren Baghdasaryan size_t nbytes, enum psi_res res)
15310e94682bSSuren Baghdasaryan {
15320e94682bSSuren Baghdasaryan char buf[32];
15330e94682bSSuren Baghdasaryan size_t buf_size;
15340e94682bSSuren Baghdasaryan struct seq_file *seq;
15350e94682bSSuren Baghdasaryan struct psi_trigger *new;
15360e94682bSSuren Baghdasaryan
15370e94682bSSuren Baghdasaryan if (static_branch_likely(&psi_disabled))
15380e94682bSSuren Baghdasaryan return -EOPNOTSUPP;
15390e94682bSSuren Baghdasaryan
15406fcca0faSSuren Baghdasaryan if (!nbytes)
15416fcca0faSSuren Baghdasaryan return -EINVAL;
15426fcca0faSSuren Baghdasaryan
15434adcdceaSMiles Chen buf_size = min(nbytes, sizeof(buf));
15440e94682bSSuren Baghdasaryan if (copy_from_user(buf, user_buf, buf_size))
15450e94682bSSuren Baghdasaryan return -EFAULT;
15460e94682bSSuren Baghdasaryan
15470e94682bSSuren Baghdasaryan buf[buf_size - 1] = '\0';
15480e94682bSSuren Baghdasaryan
15490e94682bSSuren Baghdasaryan seq = file->private_data;
1550a06247c6SSuren Baghdasaryan
15510e94682bSSuren Baghdasaryan /* Take seq->lock to protect seq->private from concurrent writes */
15520e94682bSSuren Baghdasaryan mutex_lock(&seq->lock);
1553a06247c6SSuren Baghdasaryan
1554a06247c6SSuren Baghdasaryan /* Allow only one trigger per file descriptor */
1555a06247c6SSuren Baghdasaryan if (seq->private) {
1556a06247c6SSuren Baghdasaryan mutex_unlock(&seq->lock);
1557a06247c6SSuren Baghdasaryan return -EBUSY;
1558a06247c6SSuren Baghdasaryan }
1559a06247c6SSuren Baghdasaryan
1560aff03707SSuren Baghdasaryan new = psi_trigger_create(&psi_system, buf, res, file, NULL);
1561a06247c6SSuren Baghdasaryan if (IS_ERR(new)) {
1562a06247c6SSuren Baghdasaryan mutex_unlock(&seq->lock);
1563a06247c6SSuren Baghdasaryan return PTR_ERR(new);
1564a06247c6SSuren Baghdasaryan }
1565a06247c6SSuren Baghdasaryan
1566a06247c6SSuren Baghdasaryan smp_store_release(&seq->private, new);
15670e94682bSSuren Baghdasaryan mutex_unlock(&seq->lock);
15680e94682bSSuren Baghdasaryan
15690e94682bSSuren Baghdasaryan return nbytes;
15700e94682bSSuren Baghdasaryan }
15710e94682bSSuren Baghdasaryan
15720e94682bSSuren Baghdasaryan static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
15730e94682bSSuren Baghdasaryan size_t nbytes, loff_t *ppos)
15740e94682bSSuren Baghdasaryan {
15750e94682bSSuren Baghdasaryan return psi_write(file, user_buf, nbytes, PSI_IO);
15760e94682bSSuren Baghdasaryan }
15770e94682bSSuren Baghdasaryan
15780e94682bSSuren Baghdasaryan static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
15790e94682bSSuren Baghdasaryan size_t nbytes, loff_t *ppos)
15800e94682bSSuren Baghdasaryan {
15810e94682bSSuren Baghdasaryan return psi_write(file, user_buf, nbytes, PSI_MEM);
15820e94682bSSuren Baghdasaryan }
15830e94682bSSuren Baghdasaryan
15840e94682bSSuren Baghdasaryan static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
15850e94682bSSuren Baghdasaryan size_t nbytes, loff_t *ppos)
15860e94682bSSuren Baghdasaryan {
15870e94682bSSuren Baghdasaryan return psi_write(file, user_buf, nbytes, PSI_CPU);
15880e94682bSSuren Baghdasaryan }
15890e94682bSSuren Baghdasaryan
15900e94682bSSuren Baghdasaryan static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
15910e94682bSSuren Baghdasaryan {
15920e94682bSSuren Baghdasaryan struct seq_file *seq = file->private_data;
15930e94682bSSuren Baghdasaryan
15940e94682bSSuren Baghdasaryan return psi_trigger_poll(&seq->private, file, wait);
15950e94682bSSuren Baghdasaryan }
15960e94682bSSuren Baghdasaryan
15970e94682bSSuren Baghdasaryan static int psi_fop_release(struct inode *inode, struct file *file)
15980e94682bSSuren Baghdasaryan {
15990e94682bSSuren Baghdasaryan struct seq_file *seq = file->private_data;
16000e94682bSSuren Baghdasaryan
1601a06247c6SSuren Baghdasaryan psi_trigger_destroy(seq->private);
16020e94682bSSuren Baghdasaryan return single_release(inode, file);
16030e94682bSSuren Baghdasaryan }
16040e94682bSSuren Baghdasaryan
160597a32539SAlexey Dobriyan static const struct proc_ops psi_io_proc_ops = {
160697a32539SAlexey Dobriyan .proc_open = psi_io_open,
160797a32539SAlexey Dobriyan .proc_read = seq_read,
160897a32539SAlexey Dobriyan .proc_lseek = seq_lseek,
160997a32539SAlexey Dobriyan .proc_write = psi_io_write,
161097a32539SAlexey Dobriyan .proc_poll = psi_fop_poll,
161197a32539SAlexey Dobriyan .proc_release = psi_fop_release,
1612eb414681SJohannes Weiner };
1613eb414681SJohannes Weiner
161497a32539SAlexey Dobriyan static const struct proc_ops psi_memory_proc_ops = {
161597a32539SAlexey Dobriyan .proc_open = psi_memory_open,
161697a32539SAlexey Dobriyan .proc_read = seq_read,
161797a32539SAlexey Dobriyan .proc_lseek = seq_lseek,
161897a32539SAlexey Dobriyan .proc_write = psi_memory_write,
161997a32539SAlexey Dobriyan .proc_poll = psi_fop_poll,
162097a32539SAlexey Dobriyan .proc_release = psi_fop_release,
1621eb414681SJohannes Weiner };
1622eb414681SJohannes Weiner
162397a32539SAlexey Dobriyan static const struct proc_ops psi_cpu_proc_ops = {
162497a32539SAlexey Dobriyan .proc_open = psi_cpu_open,
162597a32539SAlexey Dobriyan .proc_read = seq_read,
162697a32539SAlexey Dobriyan .proc_lseek = seq_lseek,
162797a32539SAlexey Dobriyan .proc_write = psi_cpu_write,
162897a32539SAlexey Dobriyan .proc_poll = psi_fop_poll,
162997a32539SAlexey Dobriyan .proc_release = psi_fop_release,
1630eb414681SJohannes Weiner };
1631eb414681SJohannes Weiner
163252b1364bSChengming Zhou #ifdef CONFIG_IRQ_TIME_ACCOUNTING
163352b1364bSChengming Zhou static int psi_irq_show(struct seq_file *m, void *v)
163452b1364bSChengming Zhou {
163552b1364bSChengming Zhou return psi_show(m, &psi_system, PSI_IRQ);
163652b1364bSChengming Zhou }
163752b1364bSChengming Zhou
163852b1364bSChengming Zhou static int psi_irq_open(struct inode *inode, struct file *file)
163952b1364bSChengming Zhou {
1640d82caa27SDomenico Cerasuolo return single_open(file, psi_irq_show, NULL);
164152b1364bSChengming Zhou }
164252b1364bSChengming Zhou
164352b1364bSChengming Zhou static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
164452b1364bSChengming Zhou size_t nbytes, loff_t *ppos)
164552b1364bSChengming Zhou {
164652b1364bSChengming Zhou return psi_write(file, user_buf, nbytes, PSI_IRQ);
164752b1364bSChengming Zhou }
164852b1364bSChengming Zhou
164952b1364bSChengming Zhou static const struct proc_ops psi_irq_proc_ops = {
165052b1364bSChengming Zhou .proc_open = psi_irq_open,
165152b1364bSChengming Zhou .proc_read = seq_read,
165252b1364bSChengming Zhou .proc_lseek = seq_lseek,
165352b1364bSChengming Zhou .proc_write = psi_irq_write,
165452b1364bSChengming Zhou .proc_poll = psi_fop_poll,
165552b1364bSChengming Zhou .proc_release = psi_fop_release,
165652b1364bSChengming Zhou };
165752b1364bSChengming Zhou #endif
165852b1364bSChengming Zhou
1659eb414681SJohannes Weiner static int __init psi_proc_init(void)
1660eb414681SJohannes Weiner {
16613d817689SWang Long if (psi_enable) {
1662eb414681SJohannes Weiner proc_mkdir("pressure", NULL);
16636db12ee0SJosh Hunt proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
16646db12ee0SJosh Hunt proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
16656db12ee0SJosh Hunt proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
166652b1364bSChengming Zhou #ifdef CONFIG_IRQ_TIME_ACCOUNTING
166752b1364bSChengming Zhou proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
166852b1364bSChengming Zhou #endif
16693d817689SWang Long }
1670eb414681SJohannes Weiner return 0;
1671eb414681SJohannes Weiner }
1672eb414681SJohannes Weiner module_init(psi_proc_init);
16735102bb1cSSuren Baghdasaryan
16745102bb1cSSuren Baghdasaryan #endif /* CONFIG_PROC_FS */
1675