// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup *parent;
        unsigned long flags;

        /* nothing to do for root */
        if (!cgroup_parent(cgrp))
                return;

        /*
         * Speculative already-on-list test. This may race leading to
         * temporary inaccuracies, which is fine.
         *
         * Because @parent's updated_children is terminated with @parent
         * instead of NULL, we can tell whether @cgrp is on the list by
         * testing the next pointer for NULL.
         */
        if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
                return;

        raw_spin_lock_irqsave(cpu_lock, flags);

        /* put @cgrp and all ancestors on the corresponding updated lists */
        for (parent = cgroup_parent(cgrp); parent;
             cgrp = parent, parent = cgroup_parent(cgrp)) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);

                /*
                 * Both additions and removals are bottom-up. If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;
        }

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}
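/*
 * Illustrative sketch only (not part of the kernel tree): a controller built
 * on rstat typically bumps its own per-cpu counters and then marks the cgroup
 * updated, mirroring the cputime accounting helpers further down in this
 * file.  struct example_cpu_stat, example_pcpu_stat() and example_charge()
 * are hypothetical names used purely for illustration.
 *
 *	static void example_charge(struct cgroup *cgrp, u64 bytes)
 *	{
 *		struct example_cpu_stat *cs = get_cpu_ptr(example_pcpu_stat(cgrp));
 *
 *		cs->bytes += bytes;
 *		cgroup_rstat_updated(cgrp, smp_processor_id());
 *		put_cpu_ptr(cs);
 *	}
 *
 * The accumulated per-cpu values are only folded into global counters when
 * somebody flushes, via the controller's css_rstat_flush() callback.
 */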
/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
 * the traversal and %NULL return indicates the end. During traversal,
 * each returned cgroup is unlinked from the tree. Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
{
        struct cgroup_rstat_cpu *rstatc;

        if (pos == root)
                return NULL;

        /*
         * We're going to walk down to the first leaf and visit/remove it.
         * We can pick any unvisited node as the starting point.
         */
        if (!pos)
                pos = root;
        else
                pos = cgroup_parent(pos);

        /* walk down to the first leaf */
        while (true) {
                rstatc = cgroup_rstat_cpu(pos, cpu);
                if (rstatc->updated_children == pos)
                        break;
                pos = rstatc->updated_children;
        }

        /*
         * Unlink @pos from the tree. As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         * However, due to the way we traverse, @pos will be the first
         * child in most cases. The only exception is @root.
         */
        if (rstatc->updated_next) {
                struct cgroup *parent = cgroup_parent(pos);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
                struct cgroup_rstat_cpu *nrstatc;
                struct cgroup **nextp;

                nextp = &prstatc->updated_children;
                while (true) {
                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        if (*nextp == pos)
                                break;

                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }

                *nextp = rstatc->updated_next;
                rstatc->updated_next = NULL;

                return pos;
        }

        /* only happens for @root */
        return NULL;
}

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
                                                       cpu);
                struct cgroup *pos = NULL;

                raw_spin_lock(cpu_lock);
                while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
                        struct cgroup_subsys_state *css;

                        cgroup_base_stat_flush(pos, cpu);

                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();
                }
                raw_spin_unlock(cpu_lock);

                /* if @may_sleep, play nice and yield if necessary */
                if (may_sleep && (need_resched() ||
                                  spin_needbreak(&cgroup_rstat_lock))) {
                        spin_unlock_irq(&cgroup_rstat_lock);
                        if (!cond_resched())
                                cpu_relax();
                        spin_lock_irq(&cgroup_rstat_lock);
                }
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
        might_sleep();

        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
        spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
        unsigned long flags;

        spin_lock_irqsave(&cgroup_rstat_lock, flags);
        cgroup_rstat_flush_locked(cgrp, false);
        spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
        __releases(&cgroup_rstat_lock)
{
        spin_unlock_irq(&cgroup_rstat_lock);
}
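/*
 * Illustrative usage sketch only: a reader that needs a consistent snapshot
 * flushes with the lock held, reads the flushed state, then releases.
 * cgroup_base_stat_cputime_show() below follows exactly this pattern;
 * read_example() here is a hypothetical name.
 *
 *	static void read_example(struct cgroup *cgrp)
 *	{
 *		u64 usage;
 *
 *		cgroup_rstat_flush_hold(cgrp);
 *		usage = cgrp->bstat.cputime.sum_exec_runtime;
 *		cgroup_rstat_flush_release();
 *	}
 */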
int cgroup_rstat_init(struct cgroup *cgrp)
{
        int cpu;

        /* the root cgrp has rstat_cpu preallocated */
        if (!cgrp->rstat_cpu) {
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
                if (!cgrp->rstat_cpu)
                        return -ENOMEM;
        }

        /* ->updated_children list is self terminated */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                rstatc->updated_children = cgrp;
                u64_stats_init(&rstatc->bsync);
        }

        return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
                    WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));

        BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup_base_stat cur, delta;
        unsigned seq;

        /* fetch the current per-cpu values */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                cur.cputime = rstatc->bstat.cputime;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate percpu delta to global */
        delta = cur;
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);

        /* propagate global delta to parent */
        if (parent) {
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);
        }
}
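/*
 * Worked example for the flush above (numbers illustrative only): if this
 * CPU's bstat.cputime.utime reads 130 and last_bstat recorded 100 at the
 * previous flush, the per-cpu delta of 30 is added to cgrp->bstat and
 * last_bstat advances to 130.  The same subtraction against cgrp->last_bstat
 * then yields the cgroup-level delta that is folded into the parent, so each
 * increment is counted exactly once at every level of the hierarchy.
 */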
/*
 * The account_begin/end helpers pin the caller to a CPU via get_cpu_ptr()
 * and bracket the update with a u64_stats write section so that readers on
 * 32bit architectures see consistent 64bit values.
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = get_cpu_ptr(cgrp->rstat_cpu);
        u64_stats_update_begin(&rstatc->bsync);
        return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc)
{
        u64_stats_update_end(&rstatc->bsync);
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp);

        switch (index) {
        case CPUTIME_USER:
        case CPUTIME_NICE:
                rstatc->bstat.cputime.utime += delta_exec;
                break;
        case CPUTIME_SYSTEM:
        case CPUTIME_IRQ:
        case CPUTIME_SOFTIRQ:
                rstatc->bstat.cputime.stime += delta_exec;
                break;
        default:
                break;
        }

        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

/*
 * Compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct task_cputime *cputime)
{
        int i;

        cputime->stime = 0;
        cputime->utime = 0;
        cputime->sum_exec_runtime = 0;
        for_each_possible_cpu(i) {
                struct kernel_cpustat kcpustat;
                u64 *cpustat = kcpustat.cpustat;
                u64 user = 0;
                u64 sys = 0;

                kcpustat_cpu_fetch(&kcpustat, i);

                user += cpustat[CPUTIME_USER];
                user += cpustat[CPUTIME_NICE];
                cputime->utime += user;

                sys += cpustat[CPUTIME_SYSTEM];
                sys += cpustat[CPUTIME_IRQ];
                sys += cpustat[CPUTIME_SOFTIRQ];
                cputime->stime += sys;

                cputime->sum_exec_runtime += user;
                cputime->sum_exec_runtime += sys;
                cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
                cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
                cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
        }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        u64 usage, utime, stime;
        struct task_cputime cputime;

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
                cgroup_rstat_flush_release();
        } else {
                root_cgroup_cputime(&cputime);
                usage = cputime.sum_exec_runtime;
                utime = cputime.utime;
                stime = cputime.stime;
        }

        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);
}
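/*
 * For reference, the output produced by cgroup_base_stat_cputime_show()
 * appears at the top of a cgroup's cpu.stat file and looks like the
 * following (values illustrative only):
 *
 *	usage_usec 153836
 *	user_usec 103748
 *	system_usec 50088
 */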