#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	struct cgroup *parent;
	unsigned long flags;

	/* nothing to do for root */
	if (!cgroup_parent(cgrp))
		return;

	/*
	 * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
	 * see NULL updated_next or they see our updated stat.
	 */
	smp_mb();

	/*
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	for (parent = cgroup_parent(cgrp); parent;
	     cgrp = parent, parent = cgroup_parent(cgrp)) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
EXPORT_SYMBOL_GPL(cgroup_rstat_updated);

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
 * the traversal and %NULL return indicates the end. During traversal,
 * each returned cgroup is unlinked from the tree. Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;

	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it. We
	 * can pick whatever unvisited node as the starting point.
	 */
	if (!pos)
		pos = root;
	else
		pos = cgroup_parent(pos);

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	if (rstatc->updated_next) {
		struct cgroup *parent = cgroup_parent(pos);
		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
		struct cgroup_rstat_cpu *nrstatc;
		struct cgroup **nextp;

		nextp = &prstatc->updated_children;
		while (true) {
			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			if (*nextp == pos)
				break;

			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}

		*nextp = rstatc->updated_next;
		rstatc->updated_next = NULL;

		/*
		 * Paired with the one in cgroup_rstat_updated().
		 * Either they see NULL updated_next or we see their
		 * updated stat.
		 */
		smp_mb();

		return pos;
	}

	/* only happens for @root */
	return NULL;
}

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;

		raw_spin_lock(cpu_lock);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock(cpu_lock);

		/* if @may_sleep, play nice and yield if necessary */
		if (may_sleep && (need_resched() ||
				  spin_needbreak(&cgroup_rstat_lock))) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp, true);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
	unsigned long flags;

	spin_lock_irqsave(&cgroup_rstat_lock, flags);
	cgroup_rstat_flush_locked(cgrp, false);
	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}

int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));

	BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
					struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
	struct task_cputime cputime;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		cputime = rstatc->bstat.cputime;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* calculate the delta to propagate */
	delta.cputime.utime = cputime.utime - last_cputime->utime;
	delta.cputime.stime = cputime.stime - last_cputime->stime;
	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
					 last_cputime->sum_exec_runtime;
	*last_cputime = cputime;

	/* transfer the pending stat into delta */
	cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
	memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));

	/* propagate delta into the global stat and the parent's pending */
	cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
	if (parent)
		cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
}

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	u64_stats_update_begin(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc)
{
	u64_stats_update_end(&rstatc->bsync);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;

	if (!cgroup_parent(cgrp))
		return;

	cgroup_rstat_flush_hold(cgrp);
	usage = cgrp->bstat.cputime.sum_exec_runtime;
	cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
	cgroup_rstat_flush_release();

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);
}
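
/*
 * Illustrative sketch (not part of the kernel source): how a controller
 * would typically hook into the rstat machinery above.  Only
 * cgroup_rstat_updated() and the ->css_rstat_flush() callback invoked from
 * cgroup_rstat_flush_locked() are real interfaces; the my_* names and
 * fields below are hypothetical.
 *
 *	// hot path: bump a per-cpu counter, then flag this cgroup/cpu as
 *	// having pending stats, mirroring the account_begin/end pattern
 *	static void my_charge(struct cgroup_subsys_state *css, u64 bytes)
 *	{
 *		struct my_css *mc = container_of(css, struct my_css, css);
 *		struct my_css_pcpu *pc = get_cpu_ptr(mc->pcpu);
 *
 *		pc->bytes += bytes;
 *		cgroup_rstat_updated(css->cgroup, smp_processor_id());
 *		put_cpu_ptr(mc->pcpu);
 *	}
 *
 *	// ->css_rstat_flush() callback: fold the per-cpu delta into the
 *	// global counter, analogous to cgroup_base_stat_flush().  A real
 *	// controller would protect 64bit counters with u64_stats_sync the
 *	// way ->bsync is used above.
 *	static void my_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 *	{
 *		struct my_css *mc = container_of(css, struct my_css, css);
 *		struct my_css_pcpu *pc = per_cpu_ptr(mc->pcpu, cpu);
 *		u64 delta = pc->bytes - pc->last_bytes;
 *
 *		pc->last_bytes = pc->bytes;
 *		mc->total_bytes += delta;
 *	}
 */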