// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

/*
 * Taken by every flush entry point in this file; cgroup_rstat_flush_locked()
 * asserts that it is held.
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
/*
 * One lock per cpu.  Taken around insertion into (cgroup_rstat_updated())
 * and removal from (cgroup_rstat_flush_locked()) that cpu's updated lists.
 */
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Return @cgrp's per-cpu rstat state for @cpu. */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 *
 * Takes the per-cpu rstat lock of @cpu with interrupts saved/disabled
 * unless the lockless fast-path test finds @cgrp already linked.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test.  This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up.  If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

		/* push @cgrp onto the head of @parent's updated_children */
		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		/* repeat one level up */
		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;

	/* @root was handed out by the previous call, so the walk is over */
	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it.  We
	 * can pick whatever unvisited node as the starting point.
	 */
	if (!pos)
		pos = root;
	else
		pos = cgroup_parent(pos);

	/*
	 * walk down to the first leaf; a node whose updated_children points
	 * back at itself has no updated children left
	 */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases.  The only exception is @root.
	 */
	if (rstatc->updated_next) {
		struct cgroup *parent = cgroup_parent(pos);

		if (parent) {
			struct cgroup_rstat_cpu *prstatc;
			struct cgroup **nextp;

			prstatc = cgroup_rstat_cpu(parent, cpu);
			nextp = &prstatc->updated_children;
			while (true) {
				struct cgroup_rstat_cpu *nrstatc;

				nrstatc = cgroup_rstat_cpu(*nextp, cpu);
				if (*nextp == pos)
					break;
				/* hitting the terminator means @pos was not on the list */
				WARN_ON_ONCE(*nextp == parent);
				nextp = &nrstatc->updated_next;
			}
			*nextp = rstatc->updated_next;
		}

		/* NULL updated_next is what marks @pos as off the list */
		rstatc->updated_next = NULL;
		return pos;
	}

	/* only happens for @root */
	return NULL;
}

/*
 * Flush worker shared by the cgroup_rstat_flush*() entry points; see
 * cgroup_rstat_flush().
 *
 * Must be entered with cgroup_rstat_lock held.  For each possible cpu, the
 * updated cgroups under @cgrp are popped one by one under that cpu's lock;
 * each gets its base stats flushed and then every css on its rstat_css_list
 * is flushed through ->css_rstat_flush().
 *
 * When @may_sleep is true, cgroup_rstat_lock may be dropped and re-taken
 * (with the _irq variants) between cpus, so the caller must be in a context
 * where that is allowed.
 */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;

		raw_spin_lock(cpu_lock);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock(cpu_lock);

		/* if @may_sleep, play nice and yield if necessary */
		if (may_sleep && (need_resched() ||
				  spin_needbreak(&cgroup_rstat_lock))) {
			spin_unlock_irq(&cgroup_rstat_lock);
			/* nothing to reschedule to: just relax the cpu briefly */
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.
* After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	/* may_sleep == true: the worker may drop and re-take the lock */
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp, true);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
	unsigned long flags;

	/* may_sleep == false: the worker keeps the lock for the whole flush */
	spin_lock_irqsave(&cgroup_rstat_lock, flags);
	cgroup_rstat_flush_locked(cgrp, false);
	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
230 */ 231 void cgroup_rstat_flush_hold(struct cgroup *cgrp) 232 __acquires(&cgroup_rstat_lock) 233 { 234 might_sleep(); 235 spin_lock_irq(&cgroup_rstat_lock); 236 cgroup_rstat_flush_locked(cgrp, true); 237 } 238 239 /** 240 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() 241 */ 242 void cgroup_rstat_flush_release(void) 243 __releases(&cgroup_rstat_lock) 244 { 245 spin_unlock_irq(&cgroup_rstat_lock); 246 } 247 248 int cgroup_rstat_init(struct cgroup *cgrp) 249 { 250 int cpu; 251 252 /* the root cgrp has rstat_cpu preallocated */ 253 if (!cgrp->rstat_cpu) { 254 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); 255 if (!cgrp->rstat_cpu) 256 return -ENOMEM; 257 } 258 259 /* ->updated_children list is self terminated */ 260 for_each_possible_cpu(cpu) { 261 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 262 263 rstatc->updated_children = cgrp; 264 u64_stats_init(&rstatc->bsync); 265 } 266 267 return 0; 268 } 269 270 void cgroup_rstat_exit(struct cgroup *cgrp) 271 { 272 int cpu; 273 274 cgroup_rstat_flush(cgrp); 275 276 /* sanity check */ 277 for_each_possible_cpu(cpu) { 278 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 279 280 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || 281 WARN_ON_ONCE(rstatc->updated_next)) 282 return; 283 } 284 285 free_percpu(cgrp->rstat_cpu); 286 cgrp->rstat_cpu = NULL; 287 } 288 289 void __init cgroup_rstat_boot(void) 290 { 291 int cpu; 292 293 for_each_possible_cpu(cpu) 294 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); 295 } 296 297 /* 298 * Functions for cgroup basic resource statistics implemented on top of 299 * rstat. 
300 */ 301 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, 302 struct cgroup_base_stat *src_bstat) 303 { 304 dst_bstat->cputime.utime += src_bstat->cputime.utime; 305 dst_bstat->cputime.stime += src_bstat->cputime.stime; 306 dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; 307 } 308 309 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, 310 struct cgroup_base_stat *src_bstat) 311 { 312 dst_bstat->cputime.utime -= src_bstat->cputime.utime; 313 dst_bstat->cputime.stime -= src_bstat->cputime.stime; 314 dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; 315 } 316 317 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) 318 { 319 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 320 struct cgroup *parent = cgroup_parent(cgrp); 321 struct cgroup_base_stat cur, delta; 322 unsigned seq; 323 324 /* Root-level stats are sourced from system-wide CPU stats */ 325 if (!parent) 326 return; 327 328 /* fetch the current per-cpu values */ 329 do { 330 seq = __u64_stats_fetch_begin(&rstatc->bsync); 331 cur.cputime = rstatc->bstat.cputime; 332 } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); 333 334 /* propagate percpu delta to global */ 335 delta = cur; 336 cgroup_base_stat_sub(&delta, &rstatc->last_bstat); 337 cgroup_base_stat_add(&cgrp->bstat, &delta); 338 cgroup_base_stat_add(&rstatc->last_bstat, &delta); 339 340 /* propagate global delta to parent (unless that's root) */ 341 if (cgroup_parent(parent)) { 342 delta = cgrp->bstat; 343 cgroup_base_stat_sub(&delta, &cgrp->last_bstat); 344 cgroup_base_stat_add(&parent->bstat, &delta); 345 cgroup_base_stat_add(&cgrp->last_bstat, &delta); 346 } 347 } 348 349 static struct cgroup_rstat_cpu * 350 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) 351 { 352 struct cgroup_rstat_cpu *rstatc; 353 354 rstatc = get_cpu_ptr(cgrp->rstat_cpu); 355 u64_stats_update_begin(&rstatc->bsync); 356 return rstatc; 357 } 358 
359 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, 360 struct cgroup_rstat_cpu *rstatc) 361 { 362 u64_stats_update_end(&rstatc->bsync); 363 cgroup_rstat_updated(cgrp, smp_processor_id()); 364 put_cpu_ptr(rstatc); 365 } 366 367 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) 368 { 369 struct cgroup_rstat_cpu *rstatc; 370 371 rstatc = cgroup_base_stat_cputime_account_begin(cgrp); 372 rstatc->bstat.cputime.sum_exec_runtime += delta_exec; 373 cgroup_base_stat_cputime_account_end(cgrp, rstatc); 374 } 375 376 void __cgroup_account_cputime_field(struct cgroup *cgrp, 377 enum cpu_usage_stat index, u64 delta_exec) 378 { 379 struct cgroup_rstat_cpu *rstatc; 380 381 rstatc = cgroup_base_stat_cputime_account_begin(cgrp); 382 383 switch (index) { 384 case CPUTIME_USER: 385 case CPUTIME_NICE: 386 rstatc->bstat.cputime.utime += delta_exec; 387 break; 388 case CPUTIME_SYSTEM: 389 case CPUTIME_IRQ: 390 case CPUTIME_SOFTIRQ: 391 rstatc->bstat.cputime.stime += delta_exec; 392 break; 393 default: 394 break; 395 } 396 397 cgroup_base_stat_cputime_account_end(cgrp, rstatc); 398 } 399 400 /* 401 * compute the cputime for the root cgroup by getting the per cpu data 402 * at a global level, then categorizing the fields in a manner consistent 403 * with how it is done by __cgroup_account_cputime_field for each bit of 404 * cpu time attributed to a cgroup. 
405 */ 406 static void root_cgroup_cputime(struct task_cputime *cputime) 407 { 408 int i; 409 410 cputime->stime = 0; 411 cputime->utime = 0; 412 cputime->sum_exec_runtime = 0; 413 for_each_possible_cpu(i) { 414 struct kernel_cpustat kcpustat; 415 u64 *cpustat = kcpustat.cpustat; 416 u64 user = 0; 417 u64 sys = 0; 418 419 kcpustat_cpu_fetch(&kcpustat, i); 420 421 user += cpustat[CPUTIME_USER]; 422 user += cpustat[CPUTIME_NICE]; 423 cputime->utime += user; 424 425 sys += cpustat[CPUTIME_SYSTEM]; 426 sys += cpustat[CPUTIME_IRQ]; 427 sys += cpustat[CPUTIME_SOFTIRQ]; 428 cputime->stime += sys; 429 430 cputime->sum_exec_runtime += user; 431 cputime->sum_exec_runtime += sys; 432 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; 433 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; 434 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; 435 } 436 } 437 438 void cgroup_base_stat_cputime_show(struct seq_file *seq) 439 { 440 struct cgroup *cgrp = seq_css(seq)->cgroup; 441 u64 usage, utime, stime; 442 struct task_cputime cputime; 443 444 if (cgroup_parent(cgrp)) { 445 cgroup_rstat_flush_hold(cgrp); 446 usage = cgrp->bstat.cputime.sum_exec_runtime; 447 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, 448 &utime, &stime); 449 cgroup_rstat_flush_release(); 450 } else { 451 root_cgroup_cputime(&cputime); 452 usage = cputime.sum_exec_runtime; 453 utime = cputime.utime; 454 stime = cputime.stime; 455 } 456 457 do_div(usage, NSEC_PER_USEC); 458 do_div(utime, NSEC_PER_USEC); 459 do_div(stime, NSEC_PER_USEC); 460 461 seq_printf(seq, "usage_usec %llu\n" 462 "user_usec %llu\n" 463 "system_usec %llu\n", 464 usage, utime, stime); 465 } 466