1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) 4 * 5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * 7 * Interactivity improvements by Mike Galbraith 8 * (C) 2007 Mike Galbraith <efault@gmx.de> 9 * 10 * Various enhancements by Dmitry Adamushko. 11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> 12 * 13 * Group scheduling enhancements by Srivatsa Vaddagiri 14 * Copyright IBM Corporation, 2007 15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> 16 * 17 * Scaled math optimizations by Thomas Gleixner 18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> 19 * 20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 22 */ 23 24 #include <linux/sched/mm.h> 25 #include <linux/sched/topology.h> 26 27 #include <linux/latencytop.h> 28 #include <linux/cpumask.h> 29 #include <linux/cpuidle.h> 30 #include <linux/slab.h> 31 #include <linux/profile.h> 32 #include <linux/interrupt.h> 33 #include <linux/mempolicy.h> 34 #include <linux/migrate.h> 35 #include <linux/task_work.h> 36 #include <linux/sched/isolation.h> 37 38 #include <trace/events/sched.h> 39 40 #include "sched.h" 41 42 /* 43 * Targeted preemption latency for CPU-bound tasks: 44 * 45 * NOTE: this latency value is not the same as the concept of 46 * 'timeslice length' - timeslices in CFS are of variable length 47 * and have no persistent notion like in traditional, time-slice 48 * based scheduling concepts. 49 * 50 * (to see the precise effective timeslice length of your workload, 51 * run vmstat and monitor the context-switches (cs) field) 52 * 53 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) 54 */ 55 unsigned int sysctl_sched_latency = 6000000ULL; 56 unsigned int normalized_sysctl_sched_latency = 6000000ULL; 57 58 /* 59 * The initial- and re-scaling of tunables is configurable 60 * 61 * Options are: 62 * 63 * SCHED_TUNABLESCALING_NONE - unscaled, always *1 64 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) 65 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus 66 * 67 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) 68 */ 69 enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; 70 71 /* 72 * Minimal preemption granularity for CPU-bound tasks: 73 * 74 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) 75 */ 76 unsigned int sysctl_sched_min_granularity = 750000ULL; 77 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 78 79 /* 80 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity 81 */ 82 static unsigned int sched_nr_latency = 8; 83 84 /* 85 * After fork, child runs first. If set to 0 (default) then 86 * parent will (try to) run first. 87 */ 88 unsigned int sysctl_sched_child_runs_first __read_mostly; 89 90 /* 91 * SCHED_OTHER wake-up granularity. 92 * 93 * This option delays the preemption effects of decoupled workloads 94 * and reduces their over-scheduling. Synchronous workloads will still 95 * have immediate wakeup/sleep latencies. 96 * 97 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 98 */ 99 unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 100 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; 101 102 const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 103 104 #ifdef CONFIG_SMP 105 /* 106 * For asym packing, by default the lower numbered cpu has higher priority. 
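 * (With the default __weak implementation just below returning -cpu, CPU 0 gets
 * priority 0 and CPU 3 gets -3, so lower-numbered CPUs are preferred.)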
107 */ 108 int __weak arch_asym_cpu_priority(int cpu) 109 { 110 return -cpu; 111 } 112 #endif 113 114 #ifdef CONFIG_CFS_BANDWIDTH 115 /* 116 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool 117 * each time a cfs_rq requests quota. 118 * 119 * Note: in the case that the slice exceeds the runtime remaining (either due 120 * to consumption or the quota being specified to be smaller than the slice) 121 * we will always only issue the remaining available time. 122 * 123 * (default: 5 msec, units: microseconds) 124 */ 125 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 126 #endif 127 128 /* 129 * The margin used when comparing utilization with CPU capacity: 130 * util * margin < capacity * 1024 131 * 132 * (default: ~20%) 133 */ 134 unsigned int capacity_margin = 1280; 135 136 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 137 { 138 lw->weight += inc; 139 lw->inv_weight = 0; 140 } 141 142 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 143 { 144 lw->weight -= dec; 145 lw->inv_weight = 0; 146 } 147 148 static inline void update_load_set(struct load_weight *lw, unsigned long w) 149 { 150 lw->weight = w; 151 lw->inv_weight = 0; 152 } 153 154 /* 155 * Increase the granularity value when there are more CPUs, 156 * because with more CPUs the 'effective latency' as visible 157 * to users decreases. But the relationship is not linear, 158 * so pick a second-best guess by going with the log2 of the 159 * number of CPUs. 160 * 161 * This idea comes from the SD scheduler of Con Kolivas: 162 */ 163 static unsigned int get_update_sysctl_factor(void) 164 { 165 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); 166 unsigned int factor; 167 168 switch (sysctl_sched_tunable_scaling) { 169 case SCHED_TUNABLESCALING_NONE: 170 factor = 1; 171 break; 172 case SCHED_TUNABLESCALING_LINEAR: 173 factor = cpus; 174 break; 175 case SCHED_TUNABLESCALING_LOG: 176 default: 177 factor = 1 + ilog2(cpus); 178 break; 179 } 180 181 return factor; 182 } 183 184 static void update_sysctl(void) 185 { 186 unsigned int factor = get_update_sysctl_factor(); 187 188 #define SET_SYSCTL(name) \ 189 (sysctl_##name = (factor) * normalized_sysctl_##name) 190 SET_SYSCTL(sched_min_granularity); 191 SET_SYSCTL(sched_latency); 192 SET_SYSCTL(sched_wakeup_granularity); 193 #undef SET_SYSCTL 194 } 195 196 void sched_init_granularity(void) 197 { 198 update_sysctl(); 199 } 200 201 #define WMULT_CONST (~0U) 202 #define WMULT_SHIFT 32 203 204 static void __update_inv_weight(struct load_weight *lw) 205 { 206 unsigned long w; 207 208 if (likely(lw->inv_weight)) 209 return; 210 211 w = scale_load_down(lw->weight); 212 213 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 214 lw->inv_weight = 1; 215 else if (unlikely(!w)) 216 lw->inv_weight = WMULT_CONST; 217 else 218 lw->inv_weight = WMULT_CONST / w; 219 } 220 221 /* 222 * delta_exec * weight / lw.weight 223 * OR 224 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 225 * 226 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case 227 * we're guaranteed shift stays positive because inv_weight is guaranteed to 228 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. 229 * 230 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus 231 * weight/lw.weight <= 1, and therefore our shift will also be positive. 
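 * (Approximate worked example: for a nice-0 entity, scale_load_down() gives a
 * weight of 1024; against a runqueue whose scaled weight is 2048, inv_weight is
 * roughly 2^32/2048 = 2^21, fact becomes roughly 1024 * 2^21 = 2^31, shift stays
 * at 32, and mul_u64_u32_shr() returns about delta_exec/2, i.e. half the
 * runqueue weight, as expected.)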
232 */ 233 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) 234 { 235 u64 fact = scale_load_down(weight); 236 int shift = WMULT_SHIFT; 237 238 __update_inv_weight(lw); 239 240 if (unlikely(fact >> 32)) { 241 while (fact >> 32) { 242 fact >>= 1; 243 shift--; 244 } 245 } 246 247 /* hint to use a 32x32->64 mul */ 248 fact = (u64)(u32)fact * lw->inv_weight; 249 250 while (fact >> 32) { 251 fact >>= 1; 252 shift--; 253 } 254 255 return mul_u64_u32_shr(delta_exec, fact, shift); 256 } 257 258 259 const struct sched_class fair_sched_class; 260 261 /************************************************************** 262 * CFS operations on generic schedulable entities: 263 */ 264 265 #ifdef CONFIG_FAIR_GROUP_SCHED 266 267 /* cpu runqueue to which this cfs_rq is attached */ 268 static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 269 { 270 return cfs_rq->rq; 271 } 272 273 /* An entity is a task if it doesn't "own" a runqueue */ 274 #define entity_is_task(se) (!se->my_q) 275 276 static inline struct task_struct *task_of(struct sched_entity *se) 277 { 278 SCHED_WARN_ON(!entity_is_task(se)); 279 return container_of(se, struct task_struct, se); 280 } 281 282 /* Walk up scheduling entities hierarchy */ 283 #define for_each_sched_entity(se) \ 284 for (; se; se = se->parent) 285 286 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) 287 { 288 return p->se.cfs_rq; 289 } 290 291 /* runqueue on which this entity is (to be) queued */ 292 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) 293 { 294 return se->cfs_rq; 295 } 296 297 /* runqueue "owned" by this group */ 298 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) 299 { 300 return grp->my_q; 301 } 302 303 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 304 { 305 if (!cfs_rq->on_list) { 306 struct rq *rq = rq_of(cfs_rq); 307 int cpu = cpu_of(rq); 308 /* 309 * Ensure we either appear before our parent (if already 310 * enqueued) or force our parent to appear after us when it is 311 * enqueued. The fact that we always enqueue bottom-up 312 * reduces this to two cases and a special case for the root 313 * cfs_rq. Furthermore, it also means that we will always reset 314 * tmp_alone_branch either when the branch is connected 315 * to a tree or when we reach the beg of the tree 316 */ 317 if (cfs_rq->tg->parent && 318 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { 319 /* 320 * If parent is already on the list, we add the child 321 * just before. Thanks to circular linked property of 322 * the list, this means to put the child at the tail 323 * of the list that starts by parent. 324 */ 325 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, 326 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); 327 /* 328 * The branch is now connected to its tree so we can 329 * reset tmp_alone_branch to the beginning of the 330 * list. 331 */ 332 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; 333 } else if (!cfs_rq->tg->parent) { 334 /* 335 * cfs rq without parent should be put 336 * at the tail of the list. 337 */ 338 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, 339 &rq->leaf_cfs_rq_list); 340 /* 341 * We have reach the beg of a tree so we can reset 342 * tmp_alone_branch to the beginning of the list. 343 */ 344 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; 345 } else { 346 /* 347 * The parent has not already been added so we want to 348 * make sure that it will be put after us. 349 * tmp_alone_branch points to the beg of the branch 350 * where we will add parent. 
351 */ 352 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, 353 rq->tmp_alone_branch); 354 /* 355 * update tmp_alone_branch to points to the new beg 356 * of the branch 357 */ 358 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; 359 } 360 361 cfs_rq->on_list = 1; 362 } 363 } 364 365 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) 366 { 367 if (cfs_rq->on_list) { 368 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 369 cfs_rq->on_list = 0; 370 } 371 } 372 373 /* Iterate thr' all leaf cfs_rq's on a runqueue */ 374 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ 375 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ 376 leaf_cfs_rq_list) 377 378 /* Do the two (enqueued) entities belong to the same group ? */ 379 static inline struct cfs_rq * 380 is_same_group(struct sched_entity *se, struct sched_entity *pse) 381 { 382 if (se->cfs_rq == pse->cfs_rq) 383 return se->cfs_rq; 384 385 return NULL; 386 } 387 388 static inline struct sched_entity *parent_entity(struct sched_entity *se) 389 { 390 return se->parent; 391 } 392 393 static void 394 find_matching_se(struct sched_entity **se, struct sched_entity **pse) 395 { 396 int se_depth, pse_depth; 397 398 /* 399 * preemption test can be made between sibling entities who are in the 400 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 401 * both tasks until we find their ancestors who are siblings of common 402 * parent. 403 */ 404 405 /* First walk up until both entities are at same depth */ 406 se_depth = (*se)->depth; 407 pse_depth = (*pse)->depth; 408 409 while (se_depth > pse_depth) { 410 se_depth--; 411 *se = parent_entity(*se); 412 } 413 414 while (pse_depth > se_depth) { 415 pse_depth--; 416 *pse = parent_entity(*pse); 417 } 418 419 while (!is_same_group(*se, *pse)) { 420 *se = parent_entity(*se); 421 *pse = parent_entity(*pse); 422 } 423 } 424 425 #else /* !CONFIG_FAIR_GROUP_SCHED */ 426 427 static inline struct task_struct *task_of(struct sched_entity *se) 428 { 429 return container_of(se, struct task_struct, se); 430 } 431 432 static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 433 { 434 return container_of(cfs_rq, struct rq, cfs); 435 } 436 437 #define entity_is_task(se) 1 438 439 #define for_each_sched_entity(se) \ 440 for (; se; se = NULL) 441 442 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) 443 { 444 return &task_rq(p)->cfs; 445 } 446 447 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) 448 { 449 struct task_struct *p = task_of(se); 450 struct rq *rq = task_rq(p); 451 452 return &rq->cfs; 453 } 454 455 /* runqueue "owned" by this group */ 456 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) 457 { 458 return NULL; 459 } 460 461 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 462 { 463 } 464 465 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) 466 { 467 } 468 469 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ 470 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) 471 472 static inline struct sched_entity *parent_entity(struct sched_entity *se) 473 { 474 return NULL; 475 } 476 477 static inline void 478 find_matching_se(struct sched_entity **se, struct sched_entity **pse) 479 { 480 } 481 482 #endif /* CONFIG_FAIR_GROUP_SCHED */ 483 484 static __always_inline 485 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); 486 487 /************************************************************** 488 * Scheduling class tree data structure manipulation methods: 489 */ 490 491 static inline u64 
max_vruntime(u64 max_vruntime, u64 vruntime) 492 { 493 s64 delta = (s64)(vruntime - max_vruntime); 494 if (delta > 0) 495 max_vruntime = vruntime; 496 497 return max_vruntime; 498 } 499 500 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) 501 { 502 s64 delta = (s64)(vruntime - min_vruntime); 503 if (delta < 0) 504 min_vruntime = vruntime; 505 506 return min_vruntime; 507 } 508 509 static inline int entity_before(struct sched_entity *a, 510 struct sched_entity *b) 511 { 512 return (s64)(a->vruntime - b->vruntime) < 0; 513 } 514 515 static void update_min_vruntime(struct cfs_rq *cfs_rq) 516 { 517 struct sched_entity *curr = cfs_rq->curr; 518 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); 519 520 u64 vruntime = cfs_rq->min_vruntime; 521 522 if (curr) { 523 if (curr->on_rq) 524 vruntime = curr->vruntime; 525 else 526 curr = NULL; 527 } 528 529 if (leftmost) { /* non-empty tree */ 530 struct sched_entity *se; 531 se = rb_entry(leftmost, struct sched_entity, run_node); 532 533 if (!curr) 534 vruntime = se->vruntime; 535 else 536 vruntime = min_vruntime(vruntime, se->vruntime); 537 } 538 539 /* ensure we never gain time by being placed backwards. */ 540 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 541 #ifndef CONFIG_64BIT 542 smp_wmb(); 543 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 544 #endif 545 } 546 547 /* 548 * Enqueue an entity into the rb-tree: 549 */ 550 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 551 { 552 struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; 553 struct rb_node *parent = NULL; 554 struct sched_entity *entry; 555 bool leftmost = true; 556 557 /* 558 * Find the right place in the rbtree: 559 */ 560 while (*link) { 561 parent = *link; 562 entry = rb_entry(parent, struct sched_entity, run_node); 563 /* 564 * We dont care about collisions. Nodes with 565 * the same key stay together. 
566 */ 567 if (entity_before(se, entry)) { 568 link = &parent->rb_left; 569 } else { 570 link = &parent->rb_right; 571 leftmost = false; 572 } 573 } 574 575 rb_link_node(&se->run_node, parent, link); 576 rb_insert_color_cached(&se->run_node, 577 &cfs_rq->tasks_timeline, leftmost); 578 } 579 580 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 581 { 582 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); 583 } 584 585 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 586 { 587 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); 588 589 if (!left) 590 return NULL; 591 592 return rb_entry(left, struct sched_entity, run_node); 593 } 594 595 static struct sched_entity *__pick_next_entity(struct sched_entity *se) 596 { 597 struct rb_node *next = rb_next(&se->run_node); 598 599 if (!next) 600 return NULL; 601 602 return rb_entry(next, struct sched_entity, run_node); 603 } 604 605 #ifdef CONFIG_SCHED_DEBUG 606 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 607 { 608 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); 609 610 if (!last) 611 return NULL; 612 613 return rb_entry(last, struct sched_entity, run_node); 614 } 615 616 /************************************************************** 617 * Scheduling class statistics methods: 618 */ 619 620 int sched_proc_update_handler(struct ctl_table *table, int write, 621 void __user *buffer, size_t *lenp, 622 loff_t *ppos) 623 { 624 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 625 unsigned int factor = get_update_sysctl_factor(); 626 627 if (ret || !write) 628 return ret; 629 630 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 631 sysctl_sched_min_granularity); 632 633 #define WRT_SYSCTL(name) \ 634 (normalized_sysctl_##name = sysctl_##name / (factor)) 635 WRT_SYSCTL(sched_min_granularity); 636 WRT_SYSCTL(sched_latency); 637 WRT_SYSCTL(sched_wakeup_granularity); 638 #undef WRT_SYSCTL 639 640 return 0; 641 } 642 #endif 643 644 /* 645 * delta /= w 646 */ 647 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) 648 { 649 if (unlikely(se->load.weight != NICE_0_LOAD)) 650 delta = __calc_delta(delta, NICE_0_LOAD, &se->load); 651 652 return delta; 653 } 654 655 /* 656 * The idea is to set a period in which each task runs once. 657 * 658 * When there are too many tasks (sched_nr_latency) we have to stretch 659 * this period because otherwise the slices get too small. 660 * 661 * p = (nr <= nl) ? l : l*nr/nl 662 */ 663 static u64 __sched_period(unsigned long nr_running) 664 { 665 if (unlikely(nr_running > sched_nr_latency)) 666 return nr_running * sysctl_sched_min_granularity; 667 else 668 return sysctl_sched_latency; 669 } 670 671 /* 672 * We calculate the wall-time slice from the period by taking a part 673 * proportional to the weight. 674 * 675 * s = p*P[w/rw] 676 */ 677 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 678 { 679 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); 680 681 for_each_sched_entity(se) { 682 struct load_weight *load; 683 struct load_weight lw; 684 685 cfs_rq = cfs_rq_of(se); 686 load = &cfs_rq->load; 687 688 if (unlikely(!se->on_rq)) { 689 lw = cfs_rq->load; 690 691 update_load_add(&lw, se->load.weight); 692 load = &lw; 693 } 694 slice = __calc_delta(slice, se->load.weight, load); 695 } 696 return slice; 697 } 698 699 /* 700 * We calculate the vruntime slice of a to-be-inserted task. 
701 * 702 * vs = s/w 703 */ 704 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) 705 { 706 return calc_delta_fair(sched_slice(cfs_rq, se), se); 707 } 708 709 #ifdef CONFIG_SMP 710 711 #include "sched-pelt.h" 712 713 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); 714 static unsigned long task_h_load(struct task_struct *p); 715 716 /* Give new sched_entity start runnable values to heavy its load in infant time */ 717 void init_entity_runnable_average(struct sched_entity *se) 718 { 719 struct sched_avg *sa = &se->avg; 720 721 memset(sa, 0, sizeof(*sa)); 722 723 /* 724 * Tasks are intialized with full load to be seen as heavy tasks until 725 * they get a chance to stabilize to their real load level. 726 * Group entities are intialized with zero load to reflect the fact that 727 * nothing has been attached to the task group yet. 728 */ 729 if (entity_is_task(se)) 730 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); 731 732 se->runnable_weight = se->load.weight; 733 734 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 735 } 736 737 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); 738 static void attach_entity_cfs_rq(struct sched_entity *se); 739 740 /* 741 * With new tasks being created, their initial util_avgs are extrapolated 742 * based on the cfs_rq's current util_avg: 743 * 744 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight 745 * 746 * However, in many cases, the above util_avg does not give a desired 747 * value. Moreover, the sum of the util_avgs may be divergent, such 748 * as when the series is a harmonic series. 749 * 750 * To solve this problem, we also cap the util_avg of successive tasks to 751 * only 1/2 of the left utilization budget: 752 * 753 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n 754 * 755 * where n denotes the nth task. 756 * 757 * For example, a simplest series from the beginning would be like: 758 * 759 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... 760 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... 761 * 762 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) 763 * if util_avg > util_avg_cap. 764 */ 765 void post_init_entity_util_avg(struct sched_entity *se) 766 { 767 struct cfs_rq *cfs_rq = cfs_rq_of(se); 768 struct sched_avg *sa = &se->avg; 769 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 770 771 if (cap > 0) { 772 if (cfs_rq->avg.util_avg != 0) { 773 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; 774 sa->util_avg /= (cfs_rq->avg.load_avg + 1); 775 776 if (sa->util_avg > cap) 777 sa->util_avg = cap; 778 } else { 779 sa->util_avg = cap; 780 } 781 } 782 783 if (entity_is_task(se)) { 784 struct task_struct *p = task_of(se); 785 if (p->sched_class != &fair_sched_class) { 786 /* 787 * For !fair tasks do: 788 * 789 update_cfs_rq_load_avg(now, cfs_rq); 790 attach_entity_load_avg(cfs_rq, se); 791 switched_from_fair(rq, p); 792 * 793 * such that the next switched_to_fair() has the 794 * expected state. 
795 */ 796 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); 797 return; 798 } 799 } 800 801 attach_entity_cfs_rq(se); 802 } 803 804 #else /* !CONFIG_SMP */ 805 void init_entity_runnable_average(struct sched_entity *se) 806 { 807 } 808 void post_init_entity_util_avg(struct sched_entity *se) 809 { 810 } 811 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 812 { 813 } 814 #endif /* CONFIG_SMP */ 815 816 /* 817 * Update the current task's runtime statistics. 818 */ 819 static void update_curr(struct cfs_rq *cfs_rq) 820 { 821 struct sched_entity *curr = cfs_rq->curr; 822 u64 now = rq_clock_task(rq_of(cfs_rq)); 823 u64 delta_exec; 824 825 if (unlikely(!curr)) 826 return; 827 828 delta_exec = now - curr->exec_start; 829 if (unlikely((s64)delta_exec <= 0)) 830 return; 831 832 curr->exec_start = now; 833 834 schedstat_set(curr->statistics.exec_max, 835 max(delta_exec, curr->statistics.exec_max)); 836 837 curr->sum_exec_runtime += delta_exec; 838 schedstat_add(cfs_rq->exec_clock, delta_exec); 839 840 curr->vruntime += calc_delta_fair(delta_exec, curr); 841 update_min_vruntime(cfs_rq); 842 843 if (entity_is_task(curr)) { 844 struct task_struct *curtask = task_of(curr); 845 846 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); 847 cgroup_account_cputime(curtask, delta_exec); 848 account_group_exec_runtime(curtask, delta_exec); 849 } 850 851 account_cfs_rq_runtime(cfs_rq, delta_exec); 852 } 853 854 static void update_curr_fair(struct rq *rq) 855 { 856 update_curr(cfs_rq_of(&rq->curr->se)); 857 } 858 859 static inline void 860 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 861 { 862 u64 wait_start, prev_wait_start; 863 864 if (!schedstat_enabled()) 865 return; 866 867 wait_start = rq_clock(rq_of(cfs_rq)); 868 prev_wait_start = schedstat_val(se->statistics.wait_start); 869 870 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && 871 likely(wait_start > prev_wait_start)) 872 wait_start -= prev_wait_start; 873 874 schedstat_set(se->statistics.wait_start, wait_start); 875 } 876 877 static inline void 878 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 879 { 880 struct task_struct *p; 881 u64 delta; 882 883 if (!schedstat_enabled()) 884 return; 885 886 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); 887 888 if (entity_is_task(se)) { 889 p = task_of(se); 890 if (task_on_rq_migrating(p)) { 891 /* 892 * Preserve migrating task's wait time so wait_start 893 * time stamp can be adjusted to accumulate wait time 894 * prior to migration. 
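 * (I.e. on dequeue the wait accumulated so far is stashed in wait_start;
 * update_stats_wait_start() on the destination runqueue then subtracts that
 * stash from the new clock, so the wait measured after the migration also
 * includes the time spent waiting before it.)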
895 */ 896 schedstat_set(se->statistics.wait_start, delta); 897 return; 898 } 899 trace_sched_stat_wait(p, delta); 900 } 901 902 schedstat_set(se->statistics.wait_max, 903 max(schedstat_val(se->statistics.wait_max), delta)); 904 schedstat_inc(se->statistics.wait_count); 905 schedstat_add(se->statistics.wait_sum, delta); 906 schedstat_set(se->statistics.wait_start, 0); 907 } 908 909 static inline void 910 update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 911 { 912 struct task_struct *tsk = NULL; 913 u64 sleep_start, block_start; 914 915 if (!schedstat_enabled()) 916 return; 917 918 sleep_start = schedstat_val(se->statistics.sleep_start); 919 block_start = schedstat_val(se->statistics.block_start); 920 921 if (entity_is_task(se)) 922 tsk = task_of(se); 923 924 if (sleep_start) { 925 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; 926 927 if ((s64)delta < 0) 928 delta = 0; 929 930 if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) 931 schedstat_set(se->statistics.sleep_max, delta); 932 933 schedstat_set(se->statistics.sleep_start, 0); 934 schedstat_add(se->statistics.sum_sleep_runtime, delta); 935 936 if (tsk) { 937 account_scheduler_latency(tsk, delta >> 10, 1); 938 trace_sched_stat_sleep(tsk, delta); 939 } 940 } 941 if (block_start) { 942 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; 943 944 if ((s64)delta < 0) 945 delta = 0; 946 947 if (unlikely(delta > schedstat_val(se->statistics.block_max))) 948 schedstat_set(se->statistics.block_max, delta); 949 950 schedstat_set(se->statistics.block_start, 0); 951 schedstat_add(se->statistics.sum_sleep_runtime, delta); 952 953 if (tsk) { 954 if (tsk->in_iowait) { 955 schedstat_add(se->statistics.iowait_sum, delta); 956 schedstat_inc(se->statistics.iowait_count); 957 trace_sched_stat_iowait(tsk, delta); 958 } 959 960 trace_sched_stat_blocked(tsk, delta); 961 962 /* 963 * Blocking time is in units of nanosecs, so shift by 964 * 20 to get a milliseconds-range estimation of the 965 * amount of time that the task spent sleeping: 966 */ 967 if (unlikely(prof_on == SLEEP_PROFILING)) { 968 profile_hits(SLEEP_PROFILING, 969 (void *)get_wchan(tsk), 970 delta >> 20); 971 } 972 account_scheduler_latency(tsk, delta >> 10, 0); 973 } 974 } 975 } 976 977 /* 978 * Task is being enqueued - update stats: 979 */ 980 static inline void 981 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 982 { 983 if (!schedstat_enabled()) 984 return; 985 986 /* 987 * Are we enqueueing a waiting task? 
(for current tasks 988 * a dequeue/enqueue event is a NOP) 989 */ 990 if (se != cfs_rq->curr) 991 update_stats_wait_start(cfs_rq, se); 992 993 if (flags & ENQUEUE_WAKEUP) 994 update_stats_enqueue_sleeper(cfs_rq, se); 995 } 996 997 static inline void 998 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 999 { 1000 1001 if (!schedstat_enabled()) 1002 return; 1003 1004 /* 1005 * Mark the end of the wait period if dequeueing a 1006 * waiting task: 1007 */ 1008 if (se != cfs_rq->curr) 1009 update_stats_wait_end(cfs_rq, se); 1010 1011 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { 1012 struct task_struct *tsk = task_of(se); 1013 1014 if (tsk->state & TASK_INTERRUPTIBLE) 1015 schedstat_set(se->statistics.sleep_start, 1016 rq_clock(rq_of(cfs_rq))); 1017 if (tsk->state & TASK_UNINTERRUPTIBLE) 1018 schedstat_set(se->statistics.block_start, 1019 rq_clock(rq_of(cfs_rq))); 1020 } 1021 } 1022 1023 /* 1024 * We are picking a new current task - update its stats: 1025 */ 1026 static inline void 1027 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 1028 { 1029 /* 1030 * We are starting a new run period: 1031 */ 1032 se->exec_start = rq_clock_task(rq_of(cfs_rq)); 1033 } 1034 1035 /************************************************** 1036 * Scheduling class queueing methods: 1037 */ 1038 1039 #ifdef CONFIG_NUMA_BALANCING 1040 /* 1041 * Approximate time to scan a full NUMA task in ms. The task scan period is 1042 * calculated based on the tasks virtual memory size and 1043 * numa_balancing_scan_size. 1044 */ 1045 unsigned int sysctl_numa_balancing_scan_period_min = 1000; 1046 unsigned int sysctl_numa_balancing_scan_period_max = 60000; 1047 1048 /* Portion of address space to scan in MB */ 1049 unsigned int sysctl_numa_balancing_scan_size = 256; 1050 1051 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 1052 unsigned int sysctl_numa_balancing_scan_delay = 1000; 1053 1054 struct numa_group { 1055 atomic_t refcount; 1056 1057 spinlock_t lock; /* nr_tasks, tasks */ 1058 int nr_tasks; 1059 pid_t gid; 1060 int active_nodes; 1061 1062 struct rcu_head rcu; 1063 unsigned long total_faults; 1064 unsigned long max_faults_cpu; 1065 /* 1066 * Faults_cpu is used to decide whether memory should move 1067 * towards the CPU. As a consequence, these stats are weighted 1068 * more by CPU use than by memory faults. 1069 */ 1070 unsigned long *faults_cpu; 1071 unsigned long faults[0]; 1072 }; 1073 1074 static inline unsigned long group_faults_priv(struct numa_group *ng); 1075 static inline unsigned long group_faults_shared(struct numa_group *ng); 1076 1077 static unsigned int task_nr_scan_windows(struct task_struct *p) 1078 { 1079 unsigned long rss = 0; 1080 unsigned long nr_scan_pages; 1081 1082 /* 1083 * Calculations based on RSS as non-present and empty pages are skipped 1084 * by the PTE scanner and NUMA hinting faults should be trapped based 1085 * on resident pages 1086 */ 1087 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); 1088 rss = get_mm_rss(p->mm); 1089 if (!rss) 1090 rss = nr_scan_pages; 1091 1092 rss = round_up(rss, nr_scan_pages); 1093 return rss / nr_scan_pages; 1094 } 1095 1096 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. 
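 * (Illustrative default numbers: with scan_size = 256MB this caps the number of
 * windows at 2560/256 = 10 per second, so task_scan_min() below has a floor of
 * 1000/10 = 100ms; a task whose RSS spans four scan windows then gets
 * max(100, 1000/4) = 250ms.)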
*/ 1097 #define MAX_SCAN_WINDOW 2560 1098 1099 static unsigned int task_scan_min(struct task_struct *p) 1100 { 1101 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); 1102 unsigned int scan, floor; 1103 unsigned int windows = 1; 1104 1105 if (scan_size < MAX_SCAN_WINDOW) 1106 windows = MAX_SCAN_WINDOW / scan_size; 1107 floor = 1000 / windows; 1108 1109 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 1110 return max_t(unsigned int, floor, scan); 1111 } 1112 1113 static unsigned int task_scan_start(struct task_struct *p) 1114 { 1115 unsigned long smin = task_scan_min(p); 1116 unsigned long period = smin; 1117 1118 /* Scale the maximum scan period with the amount of shared memory. */ 1119 if (p->numa_group) { 1120 struct numa_group *ng = p->numa_group; 1121 unsigned long shared = group_faults_shared(ng); 1122 unsigned long private = group_faults_priv(ng); 1123 1124 period *= atomic_read(&ng->refcount); 1125 period *= shared + 1; 1126 period /= private + shared + 1; 1127 } 1128 1129 return max(smin, period); 1130 } 1131 1132 static unsigned int task_scan_max(struct task_struct *p) 1133 { 1134 unsigned long smin = task_scan_min(p); 1135 unsigned long smax; 1136 1137 /* Watch for min being lower than max due to floor calculations */ 1138 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); 1139 1140 /* Scale the maximum scan period with the amount of shared memory. */ 1141 if (p->numa_group) { 1142 struct numa_group *ng = p->numa_group; 1143 unsigned long shared = group_faults_shared(ng); 1144 unsigned long private = group_faults_priv(ng); 1145 unsigned long period = smax; 1146 1147 period *= atomic_read(&ng->refcount); 1148 period *= shared + 1; 1149 period /= private + shared + 1; 1150 1151 smax = max(smax, period); 1152 } 1153 1154 return max(smin, smax); 1155 } 1156 1157 static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 1158 { 1159 rq->nr_numa_running += (p->numa_preferred_nid != -1); 1160 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); 1161 } 1162 1163 static void account_numa_dequeue(struct rq *rq, struct task_struct *p) 1164 { 1165 rq->nr_numa_running -= (p->numa_preferred_nid != -1); 1166 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); 1167 } 1168 1169 /* Shared or private faults. */ 1170 #define NR_NUMA_HINT_FAULT_TYPES 2 1171 1172 /* Memory and CPU locality */ 1173 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) 1174 1175 /* Averaged statistics, and temporary buffers. */ 1176 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) 1177 1178 pid_t task_numa_group_id(struct task_struct *p) 1179 { 1180 return p->numa_group ? p->numa_group->gid : 0; 1181 } 1182 1183 /* 1184 * The averaged statistics, shared & private, memory & cpu, 1185 * occupy the first half of the array. The second half of the 1186 * array is for current counters, which are averaged into the 1187 * first set by task_numa_placement. 
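 * (For instance, assuming nr_node_ids == 2 for illustration, task_faults_idx()
 * below lays the array out as: NUMA_MEM in slots 0-3 (node 0 shared/private,
 * node 1 shared/private), NUMA_CPU in slots 4-7, and the NUMA_MEMBUF/NUMA_CPUBUF
 * scratch counters in slots 8-15.)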
1188 */ 1189 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) 1190 { 1191 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; 1192 } 1193 1194 static inline unsigned long task_faults(struct task_struct *p, int nid) 1195 { 1196 if (!p->numa_faults) 1197 return 0; 1198 1199 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + 1200 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; 1201 } 1202 1203 static inline unsigned long group_faults(struct task_struct *p, int nid) 1204 { 1205 if (!p->numa_group) 1206 return 0; 1207 1208 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + 1209 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; 1210 } 1211 1212 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 1213 { 1214 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + 1215 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; 1216 } 1217 1218 static inline unsigned long group_faults_priv(struct numa_group *ng) 1219 { 1220 unsigned long faults = 0; 1221 int node; 1222 1223 for_each_online_node(node) { 1224 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 1225 } 1226 1227 return faults; 1228 } 1229 1230 static inline unsigned long group_faults_shared(struct numa_group *ng) 1231 { 1232 unsigned long faults = 0; 1233 int node; 1234 1235 for_each_online_node(node) { 1236 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; 1237 } 1238 1239 return faults; 1240 } 1241 1242 /* 1243 * A node triggering more than 1/3 as many NUMA faults as the maximum is 1244 * considered part of a numa group's pseudo-interleaving set. Migrations 1245 * between these nodes are slowed down, to allow things to settle down. 1246 */ 1247 #define ACTIVE_NODE_FRACTION 3 1248 1249 static bool numa_is_active_node(int nid, struct numa_group *ng) 1250 { 1251 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; 1252 } 1253 1254 /* Handle placement on systems where not all nodes are directly connected. */ 1255 static unsigned long score_nearby_nodes(struct task_struct *p, int nid, 1256 int maxdist, bool task) 1257 { 1258 unsigned long score = 0; 1259 int node; 1260 1261 /* 1262 * All nodes are directly connected, and the same distance 1263 * from each other. No need for fancy placement algorithms. 1264 */ 1265 if (sched_numa_topology_type == NUMA_DIRECT) 1266 return 0; 1267 1268 /* 1269 * This code is called for each node, introducing N^2 complexity, 1270 * which should be ok given the number of nodes rarely exceeds 8. 1271 */ 1272 for_each_online_node(node) { 1273 unsigned long faults; 1274 int dist = node_distance(nid, node); 1275 1276 /* 1277 * The furthest away nodes in the system are not interesting 1278 * for placement; nid was already counted. 1279 */ 1280 if (dist == sched_max_numa_distance || node == nid) 1281 continue; 1282 1283 /* 1284 * On systems with a backplane NUMA topology, compare groups 1285 * of nodes, and move tasks towards the group with the most 1286 * memory accesses. When comparing two nodes at distance 1287 * "hoplimit", only nodes closer by than "hoplimit" are part 1288 * of each group. Skip other nodes. 1289 */ 1290 if (sched_numa_topology_type == NUMA_BACKPLANE && 1291 dist > maxdist) 1292 continue; 1293 1294 /* Add up the faults from nearby nodes. */ 1295 if (task) 1296 faults = task_faults(p, node); 1297 else 1298 faults = group_faults(p, node); 1299 1300 /* 1301 * On systems with a glueless mesh NUMA topology, there are 1302 * no fixed "groups of nodes". 
Instead, nodes that are not 1303 * directly connected bounce traffic through intermediate 1304 * nodes; a numa_group can occupy any set of nodes. 1305 * The further away a node is, the less the faults count. 1306 * This seems to result in good task placement. 1307 */ 1308 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { 1309 faults *= (sched_max_numa_distance - dist); 1310 faults /= (sched_max_numa_distance - LOCAL_DISTANCE); 1311 } 1312 1313 score += faults; 1314 } 1315 1316 return score; 1317 } 1318 1319 /* 1320 * These return the fraction of accesses done by a particular task, or 1321 * task group, on a particular numa node. The group weight is given a 1322 * larger multiplier, in order to group tasks together that are almost 1323 * evenly spread out between numa nodes. 1324 */ 1325 static inline unsigned long task_weight(struct task_struct *p, int nid, 1326 int dist) 1327 { 1328 unsigned long faults, total_faults; 1329 1330 if (!p->numa_faults) 1331 return 0; 1332 1333 total_faults = p->total_numa_faults; 1334 1335 if (!total_faults) 1336 return 0; 1337 1338 faults = task_faults(p, nid); 1339 faults += score_nearby_nodes(p, nid, dist, true); 1340 1341 return 1000 * faults / total_faults; 1342 } 1343 1344 static inline unsigned long group_weight(struct task_struct *p, int nid, 1345 int dist) 1346 { 1347 unsigned long faults, total_faults; 1348 1349 if (!p->numa_group) 1350 return 0; 1351 1352 total_faults = p->numa_group->total_faults; 1353 1354 if (!total_faults) 1355 return 0; 1356 1357 faults = group_faults(p, nid); 1358 faults += score_nearby_nodes(p, nid, dist, false); 1359 1360 return 1000 * faults / total_faults; 1361 } 1362 1363 bool should_numa_migrate_memory(struct task_struct *p, struct page * page, 1364 int src_nid, int dst_cpu) 1365 { 1366 struct numa_group *ng = p->numa_group; 1367 int dst_nid = cpu_to_node(dst_cpu); 1368 int last_cpupid, this_cpupid; 1369 1370 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); 1371 1372 /* 1373 * Multi-stage node selection is used in conjunction with a periodic 1374 * migration fault to build a temporal task<->page relation. By using 1375 * a two-stage filter we remove short/unlikely relations. 1376 * 1377 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate 1378 * a task's usage of a particular page (n_p) per total usage of this 1379 * page (n_t) (in a given time-span) to a probability. 1380 * 1381 * Our periodic faults will sample this probability and getting the 1382 * same result twice in a row, given these samples are fully 1383 * independent, is then given by P(n)^2, provided our sample period 1384 * is sufficiently short compared to the usage pattern. 1385 * 1386 * This quadric squishes small probabilities, making it less likely we 1387 * act on an unlikely task<->page relation. 1388 */ 1389 last_cpupid = page_cpupid_xchg_last(page, this_cpupid); 1390 if (!cpupid_pid_unset(last_cpupid) && 1391 cpupid_to_nid(last_cpupid) != dst_nid) 1392 return false; 1393 1394 /* Always allow migrate on private faults */ 1395 if (cpupid_match_pid(p, last_cpupid)) 1396 return true; 1397 1398 /* A shared fault, but p->numa_group has not been set up yet. */ 1399 if (!ng) 1400 return true; 1401 1402 /* 1403 * Destination node is much more heavily used than the source 1404 * node? Allow migration. 
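 * (Illustrative numbers: with ACTIVE_NODE_FRACTION == 3, group CPU faults of
 * 400 on the destination node versus 100 on the source node satisfy
 * 400 > 100 * 3, so the page is allowed to follow the group.)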
1405 */ 1406 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * 1407 ACTIVE_NODE_FRACTION) 1408 return true; 1409 1410 /* 1411 * Distribute memory according to CPU & memory use on each node, 1412 * with 3/4 hysteresis to avoid unnecessary memory migrations: 1413 * 1414 * faults_cpu(dst) 3 faults_cpu(src) 1415 * --------------- * - > --------------- 1416 * faults_mem(dst) 4 faults_mem(src) 1417 */ 1418 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > 1419 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1420 } 1421 1422 static unsigned long weighted_cpuload(struct rq *rq); 1423 static unsigned long source_load(int cpu, int type); 1424 static unsigned long target_load(int cpu, int type); 1425 static unsigned long capacity_of(int cpu); 1426 1427 /* Cached statistics for all CPUs within a node */ 1428 struct numa_stats { 1429 unsigned long nr_running; 1430 unsigned long load; 1431 1432 /* Total compute capacity of CPUs on a node */ 1433 unsigned long compute_capacity; 1434 1435 /* Approximate capacity in terms of runnable tasks on a node */ 1436 unsigned long task_capacity; 1437 int has_free_capacity; 1438 }; 1439 1440 /* 1441 * XXX borrowed from update_sg_lb_stats 1442 */ 1443 static void update_numa_stats(struct numa_stats *ns, int nid) 1444 { 1445 int smt, cpu, cpus = 0; 1446 unsigned long capacity; 1447 1448 memset(ns, 0, sizeof(*ns)); 1449 for_each_cpu(cpu, cpumask_of_node(nid)) { 1450 struct rq *rq = cpu_rq(cpu); 1451 1452 ns->nr_running += rq->nr_running; 1453 ns->load += weighted_cpuload(rq); 1454 ns->compute_capacity += capacity_of(cpu); 1455 1456 cpus++; 1457 } 1458 1459 /* 1460 * If we raced with hotplug and there are no CPUs left in our mask 1461 * the @ns structure is NULL'ed and task_numa_compare() will 1462 * not find this node attractive. 1463 * 1464 * We'll either bail at !has_free_capacity, or we'll detect a huge 1465 * imbalance and bail there. 1466 */ 1467 if (!cpus) 1468 return; 1469 1470 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ 1471 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); 1472 capacity = cpus / smt; /* cores */ 1473 1474 ns->task_capacity = min_t(unsigned, capacity, 1475 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); 1476 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1477 } 1478 1479 struct task_numa_env { 1480 struct task_struct *p; 1481 1482 int src_cpu, src_nid; 1483 int dst_cpu, dst_nid; 1484 1485 struct numa_stats src_stats, dst_stats; 1486 1487 int imbalance_pct; 1488 int dist; 1489 1490 struct task_struct *best_task; 1491 long best_imp; 1492 int best_cpu; 1493 }; 1494 1495 static void task_numa_assign(struct task_numa_env *env, 1496 struct task_struct *p, long imp) 1497 { 1498 if (env->best_task) 1499 put_task_struct(env->best_task); 1500 if (p) 1501 get_task_struct(p); 1502 1503 env->best_task = p; 1504 env->best_imp = imp; 1505 env->best_cpu = env->dst_cpu; 1506 } 1507 1508 static bool load_too_imbalanced(long src_load, long dst_load, 1509 struct task_numa_env *env) 1510 { 1511 long imb, old_imb; 1512 long orig_src_load, orig_dst_load; 1513 long src_capacity, dst_capacity; 1514 1515 /* 1516 * The load is corrected for the CPU capacity available on each node. 
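 * (A hedged numeric illustration with equal node capacities and an
 * imbalance_pct of 112: loads of 100 vs 110 stay within the allowance, since
 * 110 * 100 <= 100 * 112, while 100 vs 120 exceed it and are then compared
 * against the pre-existing imbalance.)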
1517 * 1518 * src_load dst_load 1519 * ------------ vs --------- 1520 * src_capacity dst_capacity 1521 */ 1522 src_capacity = env->src_stats.compute_capacity; 1523 dst_capacity = env->dst_stats.compute_capacity; 1524 1525 /* We care about the slope of the imbalance, not the direction. */ 1526 if (dst_load < src_load) 1527 swap(dst_load, src_load); 1528 1529 /* Is the difference below the threshold? */ 1530 imb = dst_load * src_capacity * 100 - 1531 src_load * dst_capacity * env->imbalance_pct; 1532 if (imb <= 0) 1533 return false; 1534 1535 /* 1536 * The imbalance is above the allowed threshold. 1537 * Compare it with the old imbalance. 1538 */ 1539 orig_src_load = env->src_stats.load; 1540 orig_dst_load = env->dst_stats.load; 1541 1542 if (orig_dst_load < orig_src_load) 1543 swap(orig_dst_load, orig_src_load); 1544 1545 old_imb = orig_dst_load * src_capacity * 100 - 1546 orig_src_load * dst_capacity * env->imbalance_pct; 1547 1548 /* Would this change make things worse? */ 1549 return (imb > old_imb); 1550 } 1551 1552 /* 1553 * This checks if the overall compute and NUMA accesses of the system would 1554 * be improved if the source tasks was migrated to the target dst_cpu taking 1555 * into account that it might be best if task running on the dst_cpu should 1556 * be exchanged with the source task 1557 */ 1558 static void task_numa_compare(struct task_numa_env *env, 1559 long taskimp, long groupimp) 1560 { 1561 struct rq *src_rq = cpu_rq(env->src_cpu); 1562 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1563 struct task_struct *cur; 1564 long src_load, dst_load; 1565 long load; 1566 long imp = env->p->numa_group ? groupimp : taskimp; 1567 long moveimp = imp; 1568 int dist = env->dist; 1569 1570 rcu_read_lock(); 1571 cur = task_rcu_dereference(&dst_rq->curr); 1572 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) 1573 cur = NULL; 1574 1575 /* 1576 * Because we have preemption enabled we can get migrated around and 1577 * end try selecting ourselves (current == env->p) as a swap candidate. 1578 */ 1579 if (cur == env->p) 1580 goto unlock; 1581 1582 /* 1583 * "imp" is the fault differential for the source task between the 1584 * source and destination node. Calculate the total differential for 1585 * the source task and potential destination task. The more negative 1586 * the value is, the more rmeote accesses that would be expected to 1587 * be incurred if the tasks were swapped. 1588 */ 1589 if (cur) { 1590 /* Skip this swap candidate if cannot move to the source cpu */ 1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1592 goto unlock; 1593 1594 /* 1595 * If dst and source tasks are in the same NUMA group, or not 1596 * in any group then look only at task weights. 1597 */ 1598 if (cur->numa_group == env->p->numa_group) { 1599 imp = taskimp + task_weight(cur, env->src_nid, dist) - 1600 task_weight(cur, env->dst_nid, dist); 1601 /* 1602 * Add some hysteresis to prevent swapping the 1603 * tasks within a group over tiny differences. 1604 */ 1605 if (cur->numa_group) 1606 imp -= imp/16; 1607 } else { 1608 /* 1609 * Compare the group weights. If a task is all by 1610 * itself (not part of a group), use the task weight 1611 * instead. 
1612 */ 1613 if (cur->numa_group) 1614 imp += group_weight(cur, env->src_nid, dist) - 1615 group_weight(cur, env->dst_nid, dist); 1616 else 1617 imp += task_weight(cur, env->src_nid, dist) - 1618 task_weight(cur, env->dst_nid, dist); 1619 } 1620 } 1621 1622 if (imp <= env->best_imp && moveimp <= env->best_imp) 1623 goto unlock; 1624 1625 if (!cur) { 1626 /* Is there capacity at our destination? */ 1627 if (env->src_stats.nr_running <= env->src_stats.task_capacity && 1628 !env->dst_stats.has_free_capacity) 1629 goto unlock; 1630 1631 goto balance; 1632 } 1633 1634 /* Balance doesn't matter much if we're running a task per cpu */ 1635 if (imp > env->best_imp && src_rq->nr_running == 1 && 1636 dst_rq->nr_running == 1) 1637 goto assign; 1638 1639 /* 1640 * In the overloaded case, try and keep the load balanced. 1641 */ 1642 balance: 1643 load = task_h_load(env->p); 1644 dst_load = env->dst_stats.load + load; 1645 src_load = env->src_stats.load - load; 1646 1647 if (moveimp > imp && moveimp > env->best_imp) { 1648 /* 1649 * If the improvement from just moving env->p direction is 1650 * better than swapping tasks around, check if a move is 1651 * possible. Store a slightly smaller score than moveimp, 1652 * so an actually idle CPU will win. 1653 */ 1654 if (!load_too_imbalanced(src_load, dst_load, env)) { 1655 imp = moveimp - 1; 1656 cur = NULL; 1657 goto assign; 1658 } 1659 } 1660 1661 if (imp <= env->best_imp) 1662 goto unlock; 1663 1664 if (cur) { 1665 load = task_h_load(cur); 1666 dst_load -= load; 1667 src_load += load; 1668 } 1669 1670 if (load_too_imbalanced(src_load, dst_load, env)) 1671 goto unlock; 1672 1673 /* 1674 * One idle CPU per node is evaluated for a task numa move. 1675 * Call select_idle_sibling to maybe find a better one. 1676 */ 1677 if (!cur) { 1678 /* 1679 * select_idle_siblings() uses an per-cpu cpumask that 1680 * can be used from IRQ context. 1681 */ 1682 local_irq_disable(); 1683 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, 1684 env->dst_cpu); 1685 local_irq_enable(); 1686 } 1687 1688 assign: 1689 task_numa_assign(env, cur, imp); 1690 unlock: 1691 rcu_read_unlock(); 1692 } 1693 1694 static void task_numa_find_cpu(struct task_numa_env *env, 1695 long taskimp, long groupimp) 1696 { 1697 int cpu; 1698 1699 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1700 /* Skip this CPU if the source task cannot migrate */ 1701 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1702 continue; 1703 1704 env->dst_cpu = cpu; 1705 task_numa_compare(env, taskimp, groupimp); 1706 } 1707 } 1708 1709 /* Only move tasks to a NUMA node less busy than the current node. */ 1710 static bool numa_has_capacity(struct task_numa_env *env) 1711 { 1712 struct numa_stats *src = &env->src_stats; 1713 struct numa_stats *dst = &env->dst_stats; 1714 1715 if (src->has_free_capacity && !dst->has_free_capacity) 1716 return false; 1717 1718 /* 1719 * Only consider a task move if the source has a higher load 1720 * than the destination, corrected for CPU capacity on each node. 
1721 * 1722 * src->load dst->load 1723 * --------------------- vs --------------------- 1724 * src->compute_capacity dst->compute_capacity 1725 */ 1726 if (src->load * dst->compute_capacity * env->imbalance_pct > 1727 1728 dst->load * src->compute_capacity * 100) 1729 return true; 1730 1731 return false; 1732 } 1733 1734 static int task_numa_migrate(struct task_struct *p) 1735 { 1736 struct task_numa_env env = { 1737 .p = p, 1738 1739 .src_cpu = task_cpu(p), 1740 .src_nid = task_node(p), 1741 1742 .imbalance_pct = 112, 1743 1744 .best_task = NULL, 1745 .best_imp = 0, 1746 .best_cpu = -1, 1747 }; 1748 struct sched_domain *sd; 1749 unsigned long taskweight, groupweight; 1750 int nid, ret, dist; 1751 long taskimp, groupimp; 1752 1753 /* 1754 * Pick the lowest SD_NUMA domain, as that would have the smallest 1755 * imbalance and would be the first to start moving tasks about. 1756 * 1757 * And we want to avoid any moving of tasks about, as that would create 1758 * random movement of tasks -- counter the numa conditions we're trying 1759 * to satisfy here. 1760 */ 1761 rcu_read_lock(); 1762 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); 1763 if (sd) 1764 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; 1765 rcu_read_unlock(); 1766 1767 /* 1768 * Cpusets can break the scheduler domain tree into smaller 1769 * balance domains, some of which do not cross NUMA boundaries. 1770 * Tasks that are "trapped" in such domains cannot be migrated 1771 * elsewhere, so there is no point in (re)trying. 1772 */ 1773 if (unlikely(!sd)) { 1774 p->numa_preferred_nid = task_node(p); 1775 return -EINVAL; 1776 } 1777 1778 env.dst_nid = p->numa_preferred_nid; 1779 dist = env.dist = node_distance(env.src_nid, env.dst_nid); 1780 taskweight = task_weight(p, env.src_nid, dist); 1781 groupweight = group_weight(p, env.src_nid, dist); 1782 update_numa_stats(&env.src_stats, env.src_nid); 1783 taskimp = task_weight(p, env.dst_nid, dist) - taskweight; 1784 groupimp = group_weight(p, env.dst_nid, dist) - groupweight; 1785 update_numa_stats(&env.dst_stats, env.dst_nid); 1786 1787 /* Try to find a spot on the preferred nid. */ 1788 if (numa_has_capacity(&env)) 1789 task_numa_find_cpu(&env, taskimp, groupimp); 1790 1791 /* 1792 * Look at other nodes in these cases: 1793 * - there is no space available on the preferred_nid 1794 * - the task is part of a numa_group that is interleaved across 1795 * multiple NUMA nodes; in order to better consolidate the group, 1796 * we need to check other locations. 
1797 */ 1798 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { 1799 for_each_online_node(nid) { 1800 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1801 continue; 1802 1803 dist = node_distance(env.src_nid, env.dst_nid); 1804 if (sched_numa_topology_type == NUMA_BACKPLANE && 1805 dist != env.dist) { 1806 taskweight = task_weight(p, env.src_nid, dist); 1807 groupweight = group_weight(p, env.src_nid, dist); 1808 } 1809 1810 /* Only consider nodes where both task and groups benefit */ 1811 taskimp = task_weight(p, nid, dist) - taskweight; 1812 groupimp = group_weight(p, nid, dist) - groupweight; 1813 if (taskimp < 0 && groupimp < 0) 1814 continue; 1815 1816 env.dist = dist; 1817 env.dst_nid = nid; 1818 update_numa_stats(&env.dst_stats, env.dst_nid); 1819 if (numa_has_capacity(&env)) 1820 task_numa_find_cpu(&env, taskimp, groupimp); 1821 } 1822 } 1823 1824 /* 1825 * If the task is part of a workload that spans multiple NUMA nodes, 1826 * and is migrating into one of the workload's active nodes, remember 1827 * this node as the task's preferred numa node, so the workload can 1828 * settle down. 1829 * A task that migrated to a second choice node will be better off 1830 * trying for a better one later. Do not set the preferred node here. 1831 */ 1832 if (p->numa_group) { 1833 struct numa_group *ng = p->numa_group; 1834 1835 if (env.best_cpu == -1) 1836 nid = env.src_nid; 1837 else 1838 nid = env.dst_nid; 1839 1840 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) 1841 sched_setnuma(p, env.dst_nid); 1842 } 1843 1844 /* No better CPU than the current one was found. */ 1845 if (env.best_cpu == -1) 1846 return -EAGAIN; 1847 1848 /* 1849 * Reset the scan period if the task is being rescheduled on an 1850 * alternative node to recheck if the tasks is now properly placed. 1851 */ 1852 p->numa_scan_period = task_scan_start(p); 1853 1854 if (env.best_task == NULL) { 1855 ret = migrate_task_to(p, env.best_cpu); 1856 if (ret != 0) 1857 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); 1858 return ret; 1859 } 1860 1861 ret = migrate_swap(p, env.best_task); 1862 if (ret != 0) 1863 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); 1864 put_task_struct(env.best_task); 1865 return ret; 1866 } 1867 1868 /* Attempt to migrate a task to a CPU on the preferred node. */ 1869 static void numa_migrate_preferred(struct task_struct *p) 1870 { 1871 unsigned long interval = HZ; 1872 1873 /* This task has no NUMA fault statistics yet */ 1874 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1875 return; 1876 1877 /* Periodically retry migrating the task to the preferred node */ 1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 1879 p->numa_migrate_retry = jiffies + interval; 1880 1881 /* Success if task is already running on preferred CPU */ 1882 if (task_node(p) == p->numa_preferred_nid) 1883 return; 1884 1885 /* Otherwise, try migrate to a CPU on the preferred node */ 1886 task_numa_migrate(p); 1887 } 1888 1889 /* 1890 * Find out how many nodes on the workload is actively running on. Do this by 1891 * tracking the nodes from which NUMA hinting faults are triggered. This can 1892 * be different from the set of nodes where the workload's memory is currently 1893 * located. 
1894 */ 1895 static void numa_group_count_active_nodes(struct numa_group *numa_group) 1896 { 1897 unsigned long faults, max_faults = 0; 1898 int nid, active_nodes = 0; 1899 1900 for_each_online_node(nid) { 1901 faults = group_faults_cpu(numa_group, nid); 1902 if (faults > max_faults) 1903 max_faults = faults; 1904 } 1905 1906 for_each_online_node(nid) { 1907 faults = group_faults_cpu(numa_group, nid); 1908 if (faults * ACTIVE_NODE_FRACTION > max_faults) 1909 active_nodes++; 1910 } 1911 1912 numa_group->max_faults_cpu = max_faults; 1913 numa_group->active_nodes = active_nodes; 1914 } 1915 1916 /* 1917 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1918 * increments. The more local the fault statistics are, the higher the scan 1919 * period will be for the next scan window. If local/(local+remote) ratio is 1920 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) 1921 * the scan period will decrease. Aim for 70% local accesses. 1922 */ 1923 #define NUMA_PERIOD_SLOTS 10 1924 #define NUMA_PERIOD_THRESHOLD 7 1925 1926 /* 1927 * Increase the scan period (slow down scanning) if the majority of 1928 * our memory is already on our local node, or if the majority of 1929 * the page accesses are shared with other processes. 1930 * Otherwise, decrease the scan period. 1931 */ 1932 static void update_task_scan_period(struct task_struct *p, 1933 unsigned long shared, unsigned long private) 1934 { 1935 unsigned int period_slot; 1936 int lr_ratio, ps_ratio; 1937 int diff; 1938 1939 unsigned long remote = p->numa_faults_locality[0]; 1940 unsigned long local = p->numa_faults_locality[1]; 1941 1942 /* 1943 * If there were no record hinting faults then either the task is 1944 * completely idle or all activity is areas that are not of interest 1945 * to automatic numa balancing. Related to that, if there were failed 1946 * migration then it implies we are migrating too quickly or the local 1947 * node is overloaded. In either case, scan slower 1948 */ 1949 if (local + shared == 0 || p->numa_faults_locality[2]) { 1950 p->numa_scan_period = min(p->numa_scan_period_max, 1951 p->numa_scan_period << 1); 1952 1953 p->mm->numa_next_scan = jiffies + 1954 msecs_to_jiffies(p->numa_scan_period); 1955 1956 return; 1957 } 1958 1959 /* 1960 * Prepare to scale scan period relative to the current period. 1961 * == NUMA_PERIOD_THRESHOLD scan period stays the same 1962 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) 1963 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) 1964 */ 1965 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); 1966 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); 1967 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); 1968 1969 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { 1970 /* 1971 * Most memory accesses are local. There is no need to 1972 * do fast NUMA scanning, since memory is already local. 1973 */ 1974 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; 1975 if (!slot) 1976 slot = 1; 1977 diff = slot * period_slot; 1978 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { 1979 /* 1980 * Most memory accesses are shared with other tasks. 1981 * There is no point in continuing fast NUMA scanning, 1982 * since other tasks may just move the memory elsewhere. 
1983 */ 1984 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; 1985 if (!slot) 1986 slot = 1; 1987 diff = slot * period_slot; 1988 } else { 1989 /* 1990 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, 1991 * yet they are not on the local NUMA node. Speed up 1992 * NUMA scanning to get the memory moved over. 1993 */ 1994 int ratio = max(lr_ratio, ps_ratio); 1995 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; 1996 } 1997 1998 p->numa_scan_period = clamp(p->numa_scan_period + diff, 1999 task_scan_min(p), task_scan_max(p)); 2000 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2001 } 2002 2003 /* 2004 * Get the fraction of time the task has been running since the last 2005 * NUMA placement cycle. The scheduler keeps similar statistics, but 2006 * decays those on a 32ms period, which is orders of magnitude off 2007 * from the dozens-of-seconds NUMA balancing period. Use the scheduler 2008 * stats only if the task is so new there are no NUMA statistics yet. 2009 */ 2010 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) 2011 { 2012 u64 runtime, delta, now; 2013 /* Use the start of this time slice to avoid calculations. */ 2014 now = p->se.exec_start; 2015 runtime = p->se.sum_exec_runtime; 2016 2017 if (p->last_task_numa_placement) { 2018 delta = runtime - p->last_sum_exec_runtime; 2019 *period = now - p->last_task_numa_placement; 2020 } else { 2021 delta = p->se.avg.load_sum; 2022 *period = LOAD_AVG_MAX; 2023 } 2024 2025 p->last_sum_exec_runtime = runtime; 2026 p->last_task_numa_placement = now; 2027 2028 return delta; 2029 } 2030 2031 /* 2032 * Determine the preferred nid for a task in a numa_group. This needs to 2033 * be done in a way that produces consistent results with group_weight, 2034 * otherwise workloads might not converge. 2035 */ 2036 static int preferred_group_nid(struct task_struct *p, int nid) 2037 { 2038 nodemask_t nodes; 2039 int dist; 2040 2041 /* Direct connections between all NUMA nodes. */ 2042 if (sched_numa_topology_type == NUMA_DIRECT) 2043 return nid; 2044 2045 /* 2046 * On a system with glueless mesh NUMA topology, group_weight 2047 * scores nodes according to the number of NUMA hinting faults on 2048 * both the node itself, and on nearby nodes. 2049 */ 2050 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { 2051 unsigned long score, max_score = 0; 2052 int node, max_node = nid; 2053 2054 dist = sched_max_numa_distance; 2055 2056 for_each_online_node(node) { 2057 score = group_weight(p, node, dist); 2058 if (score > max_score) { 2059 max_score = score; 2060 max_node = node; 2061 } 2062 } 2063 return max_node; 2064 } 2065 2066 /* 2067 * Finding the preferred nid in a system with NUMA backplane 2068 * interconnect topology is more involved. The goal is to locate 2069 * tasks from numa_groups near each other in the system, and 2070 * untangle workloads from different sides of the system. This requires 2071 * searching down the hierarchy of node groups, recursively searching 2072 * inside the highest scoring group of nodes. The nodemask tricks 2073 * keep the complexity of the search down. 2074 */ 2075 nodes = node_online_map; 2076 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { 2077 unsigned long max_faults = 0; 2078 nodemask_t max_group = NODE_MASK_NONE; 2079 int a, b; 2080 2081 /* Are there nodes at this distance from each other? 
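 * (e.g. on a hypothetical backplane box with node distances of 10/20/40,
 * the dist==40 pass groups nodes into the two halves of the machine and
 * keeps the half with the most faults; the dist==20 pass then narrows
 * that half down to a single node.)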
*/ 2082 if (!find_numa_distance(dist)) 2083 continue; 2084 2085 for_each_node_mask(a, nodes) { 2086 unsigned long faults = 0; 2087 nodemask_t this_group; 2088 nodes_clear(this_group); 2089 2090 /* Sum group's NUMA faults; includes a==b case. */ 2091 for_each_node_mask(b, nodes) { 2092 if (node_distance(a, b) < dist) { 2093 faults += group_faults(p, b); 2094 node_set(b, this_group); 2095 node_clear(b, nodes); 2096 } 2097 } 2098 2099 /* Remember the top group. */ 2100 if (faults > max_faults) { 2101 max_faults = faults; 2102 max_group = this_group; 2103 /* 2104 * subtle: at the smallest distance there is 2105 * just one node left in each "group", the 2106 * winner is the preferred nid. 2107 */ 2108 nid = a; 2109 } 2110 } 2111 /* Next round, evaluate the nodes within max_group. */ 2112 if (!max_faults) 2113 break; 2114 nodes = max_group; 2115 } 2116 return nid; 2117 } 2118 2119 static void task_numa_placement(struct task_struct *p) 2120 { 2121 int seq, nid, max_nid = -1, max_group_nid = -1; 2122 unsigned long max_faults = 0, max_group_faults = 0; 2123 unsigned long fault_types[2] = { 0, 0 }; 2124 unsigned long total_faults; 2125 u64 runtime, period; 2126 spinlock_t *group_lock = NULL; 2127 2128 /* 2129 * The p->mm->numa_scan_seq field gets updated without 2130 * exclusive access. Use READ_ONCE() here to ensure 2131 * that the field is read in a single access: 2132 */ 2133 seq = READ_ONCE(p->mm->numa_scan_seq); 2134 if (p->numa_scan_seq == seq) 2135 return; 2136 p->numa_scan_seq = seq; 2137 p->numa_scan_period_max = task_scan_max(p); 2138 2139 total_faults = p->numa_faults_locality[0] + 2140 p->numa_faults_locality[1]; 2141 runtime = numa_get_avg_runtime(p, &period); 2142 2143 /* If the task is part of a group prevent parallel updates to group stats */ 2144 if (p->numa_group) { 2145 group_lock = &p->numa_group->lock; 2146 spin_lock_irq(group_lock); 2147 } 2148 2149 /* Find the node with the highest number of faults */ 2150 for_each_online_node(nid) { 2151 /* Keep track of the offsets in numa_faults array */ 2152 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; 2153 unsigned long faults = 0, group_faults = 0; 2154 int priv; 2155 2156 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 2157 long diff, f_diff, f_weight; 2158 2159 mem_idx = task_faults_idx(NUMA_MEM, nid, priv); 2160 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); 2161 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); 2162 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); 2163 2164 /* Decay existing window, copy faults since last scan */ 2165 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; 2166 fault_types[priv] += p->numa_faults[membuf_idx]; 2167 p->numa_faults[membuf_idx] = 0; 2168 2169 /* 2170 * Normalize the faults_from, so all tasks in a group 2171 * count according to CPU use, instead of by the raw 2172 * number of faults. Tasks with little runtime have 2173 * little over-all impact on throughput, and thus their 2174 * faults are less important. 
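 * (The f_weight computed below is the task's share of CPU time over the
 * last placement interval in <<16 fixed point; it then scales the raw
 * NUMA_CPUBUF fault counts relative to total_faults, so the cpu-fault
 * statistics end up weighted by CPU use.)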
2175 */ 2176 f_weight = div64_u64(runtime << 16, period + 1); 2177 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / 2178 (total_faults + 1); 2179 f_diff = f_weight - p->numa_faults[cpu_idx] / 2; 2180 p->numa_faults[cpubuf_idx] = 0; 2181 2182 p->numa_faults[mem_idx] += diff; 2183 p->numa_faults[cpu_idx] += f_diff; 2184 faults += p->numa_faults[mem_idx]; 2185 p->total_numa_faults += diff; 2186 if (p->numa_group) { 2187 /* 2188 * safe because we can only change our own group 2189 * 2190 * mem_idx represents the offset for a given 2191 * nid and priv in a specific region because it 2192 * is at the beginning of the numa_faults array. 2193 */ 2194 p->numa_group->faults[mem_idx] += diff; 2195 p->numa_group->faults_cpu[mem_idx] += f_diff; 2196 p->numa_group->total_faults += diff; 2197 group_faults += p->numa_group->faults[mem_idx]; 2198 } 2199 } 2200 2201 if (faults > max_faults) { 2202 max_faults = faults; 2203 max_nid = nid; 2204 } 2205 2206 if (group_faults > max_group_faults) { 2207 max_group_faults = group_faults; 2208 max_group_nid = nid; 2209 } 2210 } 2211 2212 update_task_scan_period(p, fault_types[0], fault_types[1]); 2213 2214 if (p->numa_group) { 2215 numa_group_count_active_nodes(p->numa_group); 2216 spin_unlock_irq(group_lock); 2217 max_nid = preferred_group_nid(p, max_group_nid); 2218 } 2219 2220 if (max_faults) { 2221 /* Set the new preferred node */ 2222 if (max_nid != p->numa_preferred_nid) 2223 sched_setnuma(p, max_nid); 2224 2225 if (task_node(p) != p->numa_preferred_nid) 2226 numa_migrate_preferred(p); 2227 } 2228 } 2229 2230 static inline int get_numa_group(struct numa_group *grp) 2231 { 2232 return atomic_inc_not_zero(&grp->refcount); 2233 } 2234 2235 static inline void put_numa_group(struct numa_group *grp) 2236 { 2237 if (atomic_dec_and_test(&grp->refcount)) 2238 kfree_rcu(grp, rcu); 2239 } 2240 2241 static void task_numa_group(struct task_struct *p, int cpupid, int flags, 2242 int *priv) 2243 { 2244 struct numa_group *grp, *my_grp; 2245 struct task_struct *tsk; 2246 bool join = false; 2247 int cpu = cpupid_to_cpu(cpupid); 2248 int i; 2249 2250 if (unlikely(!p->numa_group)) { 2251 unsigned int size = sizeof(struct numa_group) + 2252 4*nr_node_ids*sizeof(unsigned long); 2253 2254 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 2255 if (!grp) 2256 return; 2257 2258 atomic_set(&grp->refcount, 1); 2259 grp->active_nodes = 1; 2260 grp->max_faults_cpu = 0; 2261 spin_lock_init(&grp->lock); 2262 grp->gid = p->pid; 2263 /* Second half of the array tracks nids where faults happen */ 2264 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 2265 nr_node_ids; 2266 2267 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2268 grp->faults[i] = p->numa_faults[i]; 2269 2270 grp->total_faults = p->total_numa_faults; 2271 2272 grp->nr_tasks++; 2273 rcu_assign_pointer(p->numa_group, grp); 2274 } 2275 2276 rcu_read_lock(); 2277 tsk = READ_ONCE(cpu_rq(cpu)->curr); 2278 2279 if (!cpupid_match_pid(tsk, cpupid)) 2280 goto no_join; 2281 2282 grp = rcu_dereference(tsk->numa_group); 2283 if (!grp) 2284 goto no_join; 2285 2286 my_grp = p->numa_group; 2287 if (grp == my_grp) 2288 goto no_join; 2289 2290 /* 2291 * Only join the other group if its bigger; if we're the bigger group, 2292 * the other task will join us. 2293 */ 2294 if (my_grp->nr_tasks > grp->nr_tasks) 2295 goto no_join; 2296 2297 /* 2298 * Tie-break on the grp address. 2299 */ 2300 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) 2301 goto no_join; 2302 2303 /* Always join threads in the same process. 
*/ 2304 if (tsk->mm == current->mm) 2305 join = true; 2306 2307 /* Simple filter to avoid false positives due to PID collisions */ 2308 if (flags & TNF_SHARED) 2309 join = true; 2310 2311 /* Update priv based on whether false sharing was detected */ 2312 *priv = !join; 2313 2314 if (join && !get_numa_group(grp)) 2315 goto no_join; 2316 2317 rcu_read_unlock(); 2318 2319 if (!join) 2320 return; 2321 2322 BUG_ON(irqs_disabled()); 2323 double_lock_irq(&my_grp->lock, &grp->lock); 2324 2325 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 2326 my_grp->faults[i] -= p->numa_faults[i]; 2327 grp->faults[i] += p->numa_faults[i]; 2328 } 2329 my_grp->total_faults -= p->total_numa_faults; 2330 grp->total_faults += p->total_numa_faults; 2331 2332 my_grp->nr_tasks--; 2333 grp->nr_tasks++; 2334 2335 spin_unlock(&my_grp->lock); 2336 spin_unlock_irq(&grp->lock); 2337 2338 rcu_assign_pointer(p->numa_group, grp); 2339 2340 put_numa_group(my_grp); 2341 return; 2342 2343 no_join: 2344 rcu_read_unlock(); 2345 return; 2346 } 2347 2348 void task_numa_free(struct task_struct *p) 2349 { 2350 struct numa_group *grp = p->numa_group; 2351 void *numa_faults = p->numa_faults; 2352 unsigned long flags; 2353 int i; 2354 2355 if (grp) { 2356 spin_lock_irqsave(&grp->lock, flags); 2357 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2358 grp->faults[i] -= p->numa_faults[i]; 2359 grp->total_faults -= p->total_numa_faults; 2360 2361 grp->nr_tasks--; 2362 spin_unlock_irqrestore(&grp->lock, flags); 2363 RCU_INIT_POINTER(p->numa_group, NULL); 2364 put_numa_group(grp); 2365 } 2366 2367 p->numa_faults = NULL; 2368 kfree(numa_faults); 2369 } 2370 2371 /* 2372 * Got a PROT_NONE fault for a page on @node. 2373 */ 2374 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) 2375 { 2376 struct task_struct *p = current; 2377 bool migrated = flags & TNF_MIGRATED; 2378 int cpu_node = task_node(current); 2379 int local = !!(flags & TNF_FAULT_LOCAL); 2380 struct numa_group *ng; 2381 int priv; 2382 2383 if (!static_branch_likely(&sched_numa_balancing)) 2384 return; 2385 2386 /* for example, ksmd faulting in a user's mm */ 2387 if (!p->mm) 2388 return; 2389 2390 /* Allocate buffer to track faults on a per-node basis */ 2391 if (unlikely(!p->numa_faults)) { 2392 int size = sizeof(*p->numa_faults) * 2393 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 2394 2395 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 2396 if (!p->numa_faults) 2397 return; 2398 2399 p->total_numa_faults = 0; 2400 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2401 } 2402 2403 /* 2404 * First accesses are treated as private, otherwise consider accesses 2405 * to be private if the accessing pid has not changed 2406 */ 2407 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { 2408 priv = 1; 2409 } else { 2410 priv = cpupid_match_pid(p, last_cpupid); 2411 if (!priv && !(flags & TNF_NO_GROUP)) 2412 task_numa_group(p, last_cpupid, flags, &priv); 2413 } 2414 2415 /* 2416 * If a workload spans multiple NUMA nodes, a shared fault that 2417 * occurs wholly within the set of nodes that the workload is 2418 * actively using should be counted as local. This allows the 2419 * scan rate to slow down when a workload has settled down. 
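 * (Concretely, such a fault is accounted to numa_faults_locality[1] below,
 * the same bucket update_task_scan_period() reads back as "local".)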
2420 */ 2421 ng = p->numa_group; 2422 if (!priv && !local && ng && ng->active_nodes > 1 && 2423 numa_is_active_node(cpu_node, ng) && 2424 numa_is_active_node(mem_node, ng)) 2425 local = 1; 2426 2427 task_numa_placement(p); 2428 2429 /* 2430 * Retry task to preferred node migration periodically, in case it 2431 * previously failed, or the scheduler moved us. 2432 */ 2433 if (time_after(jiffies, p->numa_migrate_retry)) 2434 numa_migrate_preferred(p); 2435 2436 if (migrated) 2437 p->numa_pages_migrated += pages; 2438 if (flags & TNF_MIGRATE_FAIL) 2439 p->numa_faults_locality[2] += pages; 2440 2441 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; 2442 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; 2443 p->numa_faults_locality[local] += pages; 2444 } 2445 2446 static void reset_ptenuma_scan(struct task_struct *p) 2447 { 2448 /* 2449 * We only did a read acquisition of the mmap sem, so 2450 * p->mm->numa_scan_seq is written to without exclusive access 2451 * and the update is not guaranteed to be atomic. That's not 2452 * much of an issue though, since this is just used for 2453 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not 2454 * expensive, to avoid any form of compiler optimizations: 2455 */ 2456 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); 2457 p->mm->numa_scan_offset = 0; 2458 } 2459 2460 /* 2461 * The expensive part of numa migration is done from task_work context. 2462 * Triggered from task_tick_numa(). 2463 */ 2464 void task_numa_work(struct callback_head *work) 2465 { 2466 unsigned long migrate, next_scan, now = jiffies; 2467 struct task_struct *p = current; 2468 struct mm_struct *mm = p->mm; 2469 u64 runtime = p->se.sum_exec_runtime; 2470 struct vm_area_struct *vma; 2471 unsigned long start, end; 2472 unsigned long nr_pte_updates = 0; 2473 long pages, virtpages; 2474 2475 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); 2476 2477 work->next = work; /* protect against double add */ 2478 /* 2479 * Who cares about NUMA placement when they're dying. 2480 * 2481 * NOTE: make sure not to dereference p->mm before this check, 2482 * exit_task_work() happens _after_ exit_mm() so we could be called 2483 * without p->mm even though we still had it when we enqueued this 2484 * work. 2485 */ 2486 if (p->flags & PF_EXITING) 2487 return; 2488 2489 if (!mm->numa_next_scan) { 2490 mm->numa_next_scan = now + 2491 msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 2492 } 2493 2494 /* 2495 * Enforce maximal scan/migration frequency. 2496 */ 2497 migrate = mm->numa_next_scan; 2498 if (time_before(now, migrate)) 2499 return; 2500 2501 if (p->numa_scan_period == 0) { 2502 p->numa_scan_period_max = task_scan_max(p); 2503 p->numa_scan_period = task_scan_start(p); 2504 } 2505 2506 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 2507 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 2508 return; 2509 2510 /* 2511 * Delay this task enough that another task of this mm will likely win 2512 * the next time around.
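 * (node_stamp is compared against sum_exec_runtime in task_tick_numa(), so
 * bumping it by two ticks pushes this thread's next scan slightly into the
 * future and gives sibling threads a chance to pick up the work instead.)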
2513 */ 2514 p->node_stamp += 2 * TICK_NSEC; 2515 2516 start = mm->numa_scan_offset; 2517 pages = sysctl_numa_balancing_scan_size; 2518 pages <<= 20 - PAGE_SHIFT; /* MB in pages */ 2519 virtpages = pages * 8; /* Scan up to this much virtual space */ 2520 if (!pages) 2521 return; 2522 2523 2524 if (!down_read_trylock(&mm->mmap_sem)) 2525 return; 2526 vma = find_vma(mm, start); 2527 if (!vma) { 2528 reset_ptenuma_scan(p); 2529 start = 0; 2530 vma = mm->mmap; 2531 } 2532 for (; vma; vma = vma->vm_next) { 2533 if (!vma_migratable(vma) || !vma_policy_mof(vma) || 2534 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { 2535 continue; 2536 } 2537 2538 /* 2539 * Shared library pages mapped by multiple processes are not 2540 * migrated as it is expected they are cache replicated. Avoid 2541 * hinting faults in read-only file-backed mappings or the vdso 2542 * as migrating the pages will be of marginal benefit. 2543 */ 2544 if (!vma->vm_mm || 2545 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) 2546 continue; 2547 2548 /* 2549 * Skip inaccessible VMAs to avoid any confusion between 2550 * PROT_NONE and NUMA hinting ptes 2551 */ 2552 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) 2553 continue; 2554 2555 do { 2556 start = max(start, vma->vm_start); 2557 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 2558 end = min(end, vma->vm_end); 2559 nr_pte_updates = change_prot_numa(vma, start, end); 2560 2561 /* 2562 * Try to scan sysctl_numa_balancing_scan_size worth of 2563 * hpages that have at least one present PTE that 2564 * is not already pte-numa. If the VMA contains 2565 * areas that are unused or already full of prot_numa 2566 * PTEs, scan up to virtpages, to skip through those 2567 * areas faster. 2568 */ 2569 if (nr_pte_updates) 2570 pages -= (end - start) >> PAGE_SHIFT; 2571 virtpages -= (end - start) >> PAGE_SHIFT; 2572 2573 start = end; 2574 if (pages <= 0 || virtpages <= 0) 2575 goto out; 2576 2577 cond_resched(); 2578 } while (end != vma->vm_end); 2579 } 2580 2581 out: 2582 /* 2583 * It is possible to reach the end of the VMA list but the last few 2584 * VMAs are not guaranteed to be vma_migratable. If they are not, we 2585 * would find the !migratable VMA on the next scan but not reset the 2586 * scanner to the start so check it now. 2587 */ 2588 if (vma) 2589 mm->numa_scan_offset = start; 2590 else 2591 reset_ptenuma_scan(p); 2592 up_read(&mm->mmap_sem); 2593 2594 /* 2595 * Make sure tasks use at least 32x as much time to run other code 2596 * than they used here, to limit NUMA PTE scanning overhead to 3% max. 2597 * Usually update_task_scan_period slows down scanning enough; on an 2598 * overloaded system we need to limit overhead on a per task basis. 2599 */ 2600 if (unlikely(p->se.sum_exec_runtime != runtime)) { 2601 u64 diff = p->se.sum_exec_runtime - runtime; 2602 p->node_stamp += 32 * diff; 2603 } 2604 } 2605 2606 /* 2607 * Drive the periodic memory faults. 2608 */ 2609 void task_tick_numa(struct rq *rq, struct task_struct *curr) 2610 { 2611 struct callback_head *work = &curr->numa_work; 2612 u64 period, now; 2613 2614 /* 2615 * We don't care about NUMA placement if we don't have memory. 2616 */ 2617 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) 2618 return; 2619 2620 /* 2621 * Using runtime rather than walltime has the dual advantage that 2622 * we (mostly) drive the selection from busy threads and that the 2623 * task needs to have done some actual work before we bother with 2624 * NUMA placement.
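 * As a rough example: with a numa_scan_period of 1000ms the task has to
 * accumulate about one second of CPU time past its node_stamp before
 * task_numa_work() is queued again, so mostly-idle threads are left alone.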
2625 */ 2626 now = curr->se.sum_exec_runtime; 2627 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; 2628 2629 if (now > curr->node_stamp + period) { 2630 if (!curr->node_stamp) 2631 curr->numa_scan_period = task_scan_start(curr); 2632 curr->node_stamp += period; 2633 2634 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 2635 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 2636 task_work_add(curr, work, true); 2637 } 2638 } 2639 } 2640 2641 #else 2642 static void task_tick_numa(struct rq *rq, struct task_struct *curr) 2643 { 2644 } 2645 2646 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) 2647 { 2648 } 2649 2650 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) 2651 { 2652 } 2653 2654 #endif /* CONFIG_NUMA_BALANCING */ 2655 2656 static void 2657 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2658 { 2659 update_load_add(&cfs_rq->load, se->load.weight); 2660 if (!parent_entity(se)) 2661 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 2662 #ifdef CONFIG_SMP 2663 if (entity_is_task(se)) { 2664 struct rq *rq = rq_of(cfs_rq); 2665 2666 account_numa_enqueue(rq, task_of(se)); 2667 list_add(&se->group_node, &rq->cfs_tasks); 2668 } 2669 #endif 2670 cfs_rq->nr_running++; 2671 } 2672 2673 static void 2674 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2675 { 2676 update_load_sub(&cfs_rq->load, se->load.weight); 2677 if (!parent_entity(se)) 2678 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 2679 #ifdef CONFIG_SMP 2680 if (entity_is_task(se)) { 2681 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 2682 list_del_init(&se->group_node); 2683 } 2684 #endif 2685 cfs_rq->nr_running--; 2686 } 2687 2688 /* 2689 * Signed add and clamp on underflow. 2690 * 2691 * Explicitly do a load-store to ensure the intermediate value never hits 2692 * memory. This allows lockless observations without ever seeing the negative 2693 * values. 2694 */ 2695 #define add_positive(_ptr, _val) do { \ 2696 typeof(_ptr) ptr = (_ptr); \ 2697 typeof(_val) val = (_val); \ 2698 typeof(*ptr) res, var = READ_ONCE(*ptr); \ 2699 \ 2700 res = var + val; \ 2701 \ 2702 if (val < 0 && res > var) \ 2703 res = 0; \ 2704 \ 2705 WRITE_ONCE(*ptr, res); \ 2706 } while (0) 2707 2708 /* 2709 * Unsigned subtract and clamp on underflow. 2710 * 2711 * Explicitly do a load-store to ensure the intermediate value never hits 2712 * memory. This allows lockless observations without ever seeing the negative 2713 * values. 2714 */ 2715 #define sub_positive(_ptr, _val) do { \ 2716 typeof(_ptr) ptr = (_ptr); \ 2717 typeof(*ptr) val = (_val); \ 2718 typeof(*ptr) res, var = READ_ONCE(*ptr); \ 2719 res = var - val; \ 2720 if (res > var) \ 2721 res = 0; \ 2722 WRITE_ONCE(*ptr, res); \ 2723 } while (0) 2724 2725 #ifdef CONFIG_SMP 2726 /* 2727 * XXX we want to get rid of these helpers and use the full load resolution. 
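 * (What they do today, roughly: with increased load resolution on 64-bit,
 * weights are stored scaled up by scale_load(), e.g. a nice-0 weight of 1024
 * becomes 1024 << SCHED_FIXEDPOINT_SHIFT; se_weight()/se_runnable() strip
 * that extra resolution so the PELT sums below stay within their bounds.)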
2728 */ 2729 static inline long se_weight(struct sched_entity *se) 2730 { 2731 return scale_load_down(se->load.weight); 2732 } 2733 2734 static inline long se_runnable(struct sched_entity *se) 2735 { 2736 return scale_load_down(se->runnable_weight); 2737 } 2738 2739 static inline void 2740 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2741 { 2742 cfs_rq->runnable_weight += se->runnable_weight; 2743 2744 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; 2745 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; 2746 } 2747 2748 static inline void 2749 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2750 { 2751 cfs_rq->runnable_weight -= se->runnable_weight; 2752 2753 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); 2754 sub_positive(&cfs_rq->avg.runnable_load_sum, 2755 se_runnable(se) * se->avg.runnable_load_sum); 2756 } 2757 2758 static inline void 2759 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2760 { 2761 cfs_rq->avg.load_avg += se->avg.load_avg; 2762 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; 2763 } 2764 2765 static inline void 2766 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2767 { 2768 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 2769 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); 2770 } 2771 #else 2772 static inline void 2773 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2774 static inline void 2775 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2776 static inline void 2777 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2778 static inline void 2779 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2780 #endif 2781 2782 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 2783 unsigned long weight, unsigned long runnable) 2784 { 2785 if (se->on_rq) { 2786 /* commit outstanding execution time */ 2787 if (cfs_rq->curr == se) 2788 update_curr(cfs_rq); 2789 account_entity_dequeue(cfs_rq, se); 2790 dequeue_runnable_load_avg(cfs_rq, se); 2791 } 2792 dequeue_load_avg(cfs_rq, se); 2793 2794 se->runnable_weight = runnable; 2795 update_load_set(&se->load, weight); 2796 2797 #ifdef CONFIG_SMP 2798 do { 2799 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; 2800 2801 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 2802 se->avg.runnable_load_avg = 2803 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); 2804 } while (0); 2805 #endif 2806 2807 enqueue_load_avg(cfs_rq, se); 2808 if (se->on_rq) { 2809 account_entity_enqueue(cfs_rq, se); 2810 enqueue_runnable_load_avg(cfs_rq, se); 2811 } 2812 } 2813 2814 void reweight_task(struct task_struct *p, int prio) 2815 { 2816 struct sched_entity *se = &p->se; 2817 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2818 struct load_weight *load = &se->load; 2819 unsigned long weight = scale_load(sched_prio_to_weight[prio]); 2820 2821 reweight_entity(cfs_rq, se, weight, weight); 2822 load->inv_weight = sched_prio_to_wmult[prio]; 2823 } 2824 2825 #ifdef CONFIG_FAIR_GROUP_SCHED 2826 # ifdef CONFIG_SMP 2827 /* 2828 * All this does is approximate the hierarchical proportion which includes that 2829 * global sum we all love to hate. 2830 * 2831 * That is, the weight of a group entity, is the proportional share of the 2832 * group weight based on the group runqueue weights. 
That is: 2833 * 2834 * tg->weight * grq->load.weight 2835 * ge->load.weight = ----------------------------- (1) 2836 * \Sum grq->load.weight 2837 * 2838 * Now, because computing that sum is prohibitively expensive to compute (been 2839 * there, done that) we approximate it with this average stuff. The average 2840 * moves slower and therefore the approximation is cheaper and more stable. 2841 * 2842 * So instead of the above, we substitute: 2843 * 2844 * grq->load.weight -> grq->avg.load_avg (2) 2845 * 2846 * which yields the following: 2847 * 2848 * tg->weight * grq->avg.load_avg 2849 * ge->load.weight = ------------------------------ (3) 2850 * tg->load_avg 2851 * 2852 * Where: tg->load_avg ~= \Sum grq->avg.load_avg 2853 * 2854 * That is shares_avg, and it is right (given the approximation (2)). 2855 * 2856 * The problem with it is that because the average is slow -- it was designed 2857 * to be exactly that of course -- this leads to transients in boundary 2858 * conditions. In specific, the case where the group was idle and we start the 2859 * one task. It takes time for our CPU's grq->avg.load_avg to build up, 2860 * yielding bad latency etc.. 2861 * 2862 * Now, in that special case (1) reduces to: 2863 * 2864 * tg->weight * grq->load.weight 2865 * ge->load.weight = ----------------------------- = tg->weight (4) 2866 * grp->load.weight 2867 * 2868 * That is, the sum collapses because all other CPUs are idle; the UP scenario. 2869 * 2870 * So what we do is modify our approximation (3) to approach (4) in the (near) 2871 * UP case, like: 2872 * 2873 * ge->load.weight = 2874 * 2875 * tg->weight * grq->load.weight 2876 * --------------------------------------------------- (5) 2877 * tg->load_avg - grq->avg.load_avg + grq->load.weight 2878 * 2879 * But because grq->load.weight can drop to 0, resulting in a divide by zero, 2880 * we need to use grq->avg.load_avg as its lower bound, which then gives: 2881 * 2882 * 2883 * tg->weight * grq->load.weight 2884 * ge->load.weight = ----------------------------- (6) 2885 * tg_load_avg' 2886 * 2887 * Where: 2888 * 2889 * tg_load_avg' = tg->load_avg - grq->avg.load_avg + 2890 * max(grq->load.weight, grq->avg.load_avg) 2891 * 2892 * And that is shares_weight and is icky. In the (near) UP case it approaches 2893 * (4) while in the normal case it approaches (3). It consistently 2894 * overestimates the ge->load.weight and therefore: 2895 * 2896 * \Sum ge->load.weight >= tg->weight 2897 * 2898 * hence icky! 2899 */ 2900 static long calc_group_shares(struct cfs_rq *cfs_rq) 2901 { 2902 long tg_weight, tg_shares, load, shares; 2903 struct task_group *tg = cfs_rq->tg; 2904 2905 tg_shares = READ_ONCE(tg->shares); 2906 2907 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); 2908 2909 tg_weight = atomic_long_read(&tg->load_avg); 2910 2911 /* Ensure tg_weight >= load */ 2912 tg_weight -= cfs_rq->tg_load_avg_contrib; 2913 tg_weight += load; 2914 2915 shares = (tg_shares * load); 2916 if (tg_weight) 2917 shares /= tg_weight; 2918 2919 /* 2920 * MIN_SHARES has to be unscaled here to support per-CPU partitioning 2921 * of a group with small tg->shares value. It is a floor value which is 2922 * assigned as a minimum load.weight to the sched_entity representing 2923 * the group on a CPU. 2924 * 2925 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 2926 * on an 8-core system with 8 tasks each runnable on one CPU shares has 2927 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. 
In 2928 * case no task is runnable on a CPU MIN_SHARES=2 should be returned 2929 * instead of 0. 2930 */ 2931 return clamp_t(long, shares, MIN_SHARES, tg_shares); 2932 } 2933 2934 /* 2935 * This calculates the effective runnable weight for a group entity based on 2936 * the group entity weight calculated above. 2937 * 2938 * Because of the above approximation (2), our group entity weight is 2939 * an load_avg based ratio (3). This means that it includes blocked load and 2940 * does not represent the runnable weight. 2941 * 2942 * Approximate the group entity's runnable weight per ratio from the group 2943 * runqueue: 2944 * 2945 * grq->avg.runnable_load_avg 2946 * ge->runnable_weight = ge->load.weight * -------------------------- (7) 2947 * grq->avg.load_avg 2948 * 2949 * However, analogous to above, since the avg numbers are slow, this leads to 2950 * transients in the from-idle case. Instead we use: 2951 * 2952 * ge->runnable_weight = ge->load.weight * 2953 * 2954 * max(grq->avg.runnable_load_avg, grq->runnable_weight) 2955 * ----------------------------------------------------- (8) 2956 * max(grq->avg.load_avg, grq->load.weight) 2957 * 2958 * Where these max() serve both to use the 'instant' values to fix the slow 2959 * from-idle and avoid the /0 on to-idle, similar to (6). 2960 */ 2961 static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) 2962 { 2963 long runnable, load_avg; 2964 2965 load_avg = max(cfs_rq->avg.load_avg, 2966 scale_load_down(cfs_rq->load.weight)); 2967 2968 runnable = max(cfs_rq->avg.runnable_load_avg, 2969 scale_load_down(cfs_rq->runnable_weight)); 2970 2971 runnable *= shares; 2972 if (load_avg) 2973 runnable /= load_avg; 2974 2975 return clamp_t(long, runnable, MIN_SHARES, shares); 2976 } 2977 # endif /* CONFIG_SMP */ 2978 2979 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2980 2981 /* 2982 * Recomputes the group entity based on the current state of its group 2983 * runqueue. 2984 */ 2985 static void update_cfs_group(struct sched_entity *se) 2986 { 2987 struct cfs_rq *gcfs_rq = group_cfs_rq(se); 2988 long shares, runnable; 2989 2990 if (!gcfs_rq) 2991 return; 2992 2993 if (throttled_hierarchy(gcfs_rq)) 2994 return; 2995 2996 #ifndef CONFIG_SMP 2997 runnable = shares = READ_ONCE(gcfs_rq->tg->shares); 2998 2999 if (likely(se->load.weight == shares)) 3000 return; 3001 #else 3002 shares = calc_group_shares(gcfs_rq); 3003 runnable = calc_group_runnable(gcfs_rq, shares); 3004 #endif 3005 3006 reweight_entity(cfs_rq_of(se), se, shares, runnable); 3007 } 3008 3009 #else /* CONFIG_FAIR_GROUP_SCHED */ 3010 static inline void update_cfs_group(struct sched_entity *se) 3011 { 3012 } 3013 #endif /* CONFIG_FAIR_GROUP_SCHED */ 3014 3015 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 3016 { 3017 struct rq *rq = rq_of(cfs_rq); 3018 3019 if (&rq->cfs == cfs_rq) { 3020 /* 3021 * There are a few boundary cases this might miss but it should 3022 * get called often enough that that should (hopefully) not be 3023 * a real problem -- added to that it only calls on the local 3024 * CPU, so if we enqueue remotely we'll miss an update, but 3025 * the next tick/schedule should update. 3026 * 3027 * It will not get called when we go idle, because the idle 3028 * thread is a different class (!fair), nor will the utilization 3029 * number include things like RT tasks. 3030 * 3031 * As is, the util number is not freq-invariant (we'd have to 3032 * implement arch_scale_freq_capacity() for that). 3033 * 3034 * See cpu_util(). 
3035 */ 3036 cpufreq_update_util(rq, 0); 3037 } 3038 } 3039 3040 #ifdef CONFIG_SMP 3041 /* 3042 * Approximate: 3043 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) 3044 */ 3045 static u64 decay_load(u64 val, u64 n) 3046 { 3047 unsigned int local_n; 3048 3049 if (unlikely(n > LOAD_AVG_PERIOD * 63)) 3050 return 0; 3051 3052 /* after bounds checking we can collapse to 32-bit */ 3053 local_n = n; 3054 3055 /* 3056 * As y^PERIOD = 1/2, we can combine 3057 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) 3058 * With a look-up table which covers y^n (n<PERIOD) 3059 * 3060 * To achieve constant time decay_load. 3061 */ 3062 if (unlikely(local_n >= LOAD_AVG_PERIOD)) { 3063 val >>= local_n / LOAD_AVG_PERIOD; 3064 local_n %= LOAD_AVG_PERIOD; 3065 } 3066 3067 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); 3068 return val; 3069 } 3070 3071 static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) 3072 { 3073 u32 c1, c2, c3 = d3; /* y^0 == 1 */ 3074 3075 /* 3076 * c1 = d1 y^p 3077 */ 3078 c1 = decay_load((u64)d1, periods); 3079 3080 /* 3081 * p-1 3082 * c2 = 1024 \Sum y^n 3083 * n=1 3084 * 3085 * inf inf 3086 * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) 3087 * n=0 n=p 3088 */ 3089 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; 3090 3091 return c1 + c2 + c3; 3092 } 3093 3094 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) 3095 3096 /* 3097 * Accumulate the three separate parts of the sum; d1 the remainder 3098 * of the last (incomplete) period, d2 the span of full periods and d3 3099 * the remainder of the (incomplete) current period. 3100 * 3101 * d1 d2 d3 3102 * ^ ^ ^ 3103 * | | | 3104 * |<->|<----------------->|<--->| 3105 * ... |---x---|------| ... |------|-----x (now) 3106 * 3107 * p-1 3108 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 3109 * n=1 3110 * 3111 * = u y^p + (Step 1) 3112 * 3113 * p-1 3114 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) 3115 * n=1 3116 */ 3117 static __always_inline u32 3118 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, 3119 unsigned long load, unsigned long runnable, int running) 3120 { 3121 unsigned long scale_freq, scale_cpu; 3122 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ 3123 u64 periods; 3124 3125 scale_freq = arch_scale_freq_capacity(NULL, cpu); 3126 scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 3127 3128 delta += sa->period_contrib; 3129 periods = delta / 1024; /* A period is 1024us (~1ms) */ 3130 3131 /* 3132 * Step 1: decay old *_sum if we crossed period boundaries. 3133 */ 3134 if (periods) { 3135 sa->load_sum = decay_load(sa->load_sum, periods); 3136 sa->runnable_load_sum = 3137 decay_load(sa->runnable_load_sum, periods); 3138 sa->util_sum = decay_load((u64)(sa->util_sum), periods); 3139 3140 /* 3141 * Step 2 3142 */ 3143 delta %= 1024; 3144 contrib = __accumulate_pelt_segments(periods, 3145 1024 - sa->period_contrib, delta); 3146 } 3147 sa->period_contrib = delta; 3148 3149 contrib = cap_scale(contrib, scale_freq); 3150 if (load) 3151 sa->load_sum += load * contrib; 3152 if (runnable) 3153 sa->runnable_load_sum += runnable * contrib; 3154 if (running) 3155 sa->util_sum += contrib * scale_cpu; 3156 3157 return periods; 3158 } 3159 3160 /* 3161 * We can represent the historical contribution to runnable average as the 3162 * coefficients of a geometric series. To do this we sub-divide our runnable 3163 * history into segments of approximately 1ms (1024us); label the segment that 3164 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. 
3165 * 3166 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... 3167 * p0 p1 p2 3168 * (now) (~1ms ago) (~2ms ago) 3169 * 3170 * Let u_i denote the fraction of p_i that the entity was runnable. 3171 * 3172 * We then designate the fractions u_i as our coefficients, yielding the 3173 * following representation of historical load: 3174 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... 3175 * 3176 * We choose y based on the width of a reasonable scheduling period, fixing: 3177 * y^32 = 0.5 3178 * 3179 * This means that the contribution to load ~32ms ago (u_32) will be weighted 3180 * approximately half as much as the contribution to load within the last ms 3181 * (u_0). 3182 * 3183 * When a period "rolls over" and we have new u_0`, multiplying the previous 3184 * sum again by y is sufficient to update: 3185 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) 3186 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 3187 */ 3188 static __always_inline int 3189 ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, 3190 unsigned long load, unsigned long runnable, int running) 3191 { 3192 u64 delta; 3193 3194 delta = now - sa->last_update_time; 3195 /* 3196 * This should only happen when time goes backwards, which it 3197 * unfortunately does during sched clock init when we swap over to TSC. 3198 */ 3199 if ((s64)delta < 0) { 3200 sa->last_update_time = now; 3201 return 0; 3202 } 3203 3204 /* 3205 * Use 1024ns as the unit of measurement since it's a reasonable 3206 * approximation of 1us and fast to compute. 3207 */ 3208 delta >>= 10; 3209 if (!delta) 3210 return 0; 3211 3212 sa->last_update_time += delta << 10; 3213 3214 /* 3215 * running is a subset of runnable (weight) so running can't be set if 3216 * runnable is clear. But there are some corner cases where the current 3217 * se has been already dequeued but cfs_rq->curr still points to it. 3218 * This means that weight will be 0 but not running for a sched_entity 3219 * but also for a cfs_rq if the latter becomes idle. As an example, 3220 * this happens during idle_balance() which calls 3221 * update_blocked_averages(). 3222 */ 3223 if (!load) 3224 runnable = running = 0; 3225 3226 /* 3227 * Now we know we crossed measurement unit boundaries. The *_avg 3228 * accrues by two steps: 3229 * 3230 * Step 1: accumulate *_sum since last_update_time. If we haven't 3231 * crossed period boundaries, finish. 3232 */ 3233 if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) 3234 return 0; 3235 3236 return 1; 3237 } 3238 3239 static __always_inline void 3240 ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) 3241 { 3242 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; 3243 3244 /* 3245 * Step 2: update *_avg.
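 * Note: the divider, LOAD_AVG_MAX - 1024 + sa->period_contrib, is (close to)
 * the largest value an always-runnable entity's *_sum can have reached for
 * the current partial period, which is what bounds load_avg by the weight
 * passed in and util_avg by the CPU capacity.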
3246 */ 3247 sa->load_avg = div_u64(load * sa->load_sum, divider); 3248 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); 3249 sa->util_avg = sa->util_sum / divider; 3250 } 3251 3252 /* 3253 * sched_entity: 3254 * 3255 * task: 3256 * se_runnable() == se_weight() 3257 * 3258 * group: [ see update_cfs_group() ] 3259 * se_weight() = tg->weight * grq->load_avg / tg->load_avg 3260 * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg 3261 * 3262 * load_sum := runnable_sum 3263 * load_avg = se_weight(se) * runnable_avg 3264 * 3265 * runnable_load_sum := runnable_sum 3266 * runnable_load_avg = se_runnable(se) * runnable_avg 3267 * 3268 * XXX collapse load_sum and runnable_load_sum 3269 * 3270 * cfq_rs: 3271 * 3272 * load_sum = \Sum se_weight(se) * se->avg.load_sum 3273 * load_avg = \Sum se->avg.load_avg 3274 * 3275 * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum 3276 * runnable_load_avg = \Sum se->avg.runable_load_avg 3277 */ 3278 3279 static int 3280 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) 3281 { 3282 if (entity_is_task(se)) 3283 se->runnable_weight = se->load.weight; 3284 3285 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { 3286 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 3287 return 1; 3288 } 3289 3290 return 0; 3291 } 3292 3293 static int 3294 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) 3295 { 3296 if (entity_is_task(se)) 3297 se->runnable_weight = se->load.weight; 3298 3299 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, 3300 cfs_rq->curr == se)) { 3301 3302 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 3303 return 1; 3304 } 3305 3306 return 0; 3307 } 3308 3309 static int 3310 __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) 3311 { 3312 if (___update_load_sum(now, cpu, &cfs_rq->avg, 3313 scale_load_down(cfs_rq->load.weight), 3314 scale_load_down(cfs_rq->runnable_weight), 3315 cfs_rq->curr != NULL)) { 3316 3317 ___update_load_avg(&cfs_rq->avg, 1, 1); 3318 return 1; 3319 } 3320 3321 return 0; 3322 } 3323 3324 #ifdef CONFIG_FAIR_GROUP_SCHED 3325 /** 3326 * update_tg_load_avg - update the tg's load avg 3327 * @cfs_rq: the cfs_rq whose avg changed 3328 * @force: update regardless of how small the difference 3329 * 3330 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. 3331 * However, because tg->load_avg is a global value there are performance 3332 * considerations. 3333 * 3334 * In order to avoid having to look at the other cfs_rq's, we use a 3335 * differential update where we store the last value we propagated. This in 3336 * turn allows skipping updates if the differential is 'small'. 3337 * 3338 * Updating tg's load_avg is necessary before update_cfs_share(). 3339 */ 3340 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 3341 { 3342 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; 3343 3344 /* 3345 * No need to update load_avg for root_task_group as it is not used. 3346 */ 3347 if (cfs_rq->tg == &root_task_group) 3348 return; 3349 3350 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { 3351 atomic_long_add(delta, &cfs_rq->tg->load_avg); 3352 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; 3353 } 3354 } 3355 3356 /* 3357 * Called within set_task_rq() right before setting a task's cpu. 
The 3358 * caller only guarantees p->pi_lock is held; no other assumptions, 3359 * including the state of rq->lock, should be made. 3360 */ 3361 void set_task_rq_fair(struct sched_entity *se, 3362 struct cfs_rq *prev, struct cfs_rq *next) 3363 { 3364 u64 p_last_update_time; 3365 u64 n_last_update_time; 3366 3367 if (!sched_feat(ATTACH_AGE_LOAD)) 3368 return; 3369 3370 /* 3371 * We are supposed to update the task to "current" time, then its up to 3372 * date and ready to go to new CPU/cfs_rq. But we have difficulty in 3373 * getting what current time is, so simply throw away the out-of-date 3374 * time. This will result in the wakee task is less decayed, but giving 3375 * the wakee more load sounds not bad. 3376 */ 3377 if (!(se->avg.last_update_time && prev)) 3378 return; 3379 3380 #ifndef CONFIG_64BIT 3381 { 3382 u64 p_last_update_time_copy; 3383 u64 n_last_update_time_copy; 3384 3385 do { 3386 p_last_update_time_copy = prev->load_last_update_time_copy; 3387 n_last_update_time_copy = next->load_last_update_time_copy; 3388 3389 smp_rmb(); 3390 3391 p_last_update_time = prev->avg.last_update_time; 3392 n_last_update_time = next->avg.last_update_time; 3393 3394 } while (p_last_update_time != p_last_update_time_copy || 3395 n_last_update_time != n_last_update_time_copy); 3396 } 3397 #else 3398 p_last_update_time = prev->avg.last_update_time; 3399 n_last_update_time = next->avg.last_update_time; 3400 #endif 3401 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); 3402 se->avg.last_update_time = n_last_update_time; 3403 } 3404 3405 3406 /* 3407 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to 3408 * propagate its contribution. The key to this propagation is the invariant 3409 * that for each group: 3410 * 3411 * ge->avg == grq->avg (1) 3412 * 3413 * _IFF_ we look at the pure running and runnable sums. Because they 3414 * represent the very same entity, just at different points in the hierarchy. 3415 * 3416 * 3417 * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and 3418 * simply copies the running sum over. 3419 * 3420 * However, update_tg_cfs_runnable() is more complex. So we have: 3421 * 3422 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) 3423 * 3424 * And since, like util, the runnable part should be directly transferable, 3425 * the following would _appear_ to be the straight forward approach: 3426 * 3427 * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) 3428 * 3429 * And per (1) we have: 3430 * 3431 * ge->avg.running_avg == grq->avg.running_avg 3432 * 3433 * Which gives: 3434 * 3435 * ge->load.weight * grq->avg.load_avg 3436 * ge->avg.load_avg = ----------------------------------- (4) 3437 * grq->load.weight 3438 * 3439 * Except that is wrong! 3440 * 3441 * Because while for entities historical weight is not important and we 3442 * really only care about our future and therefore can consider a pure 3443 * runnable sum, runqueues can NOT do this. 3444 * 3445 * We specifically want runqueues to have a load_avg that includes 3446 * historical weights. Those represent the blocked load, the load we expect 3447 * to (shortly) return to us. This only works by keeping the weights as 3448 * integral part of the sum. We therefore cannot decompose as per (3). 3449 * 3450 * OK, so what then? 
3451 * 3452 * 3453 * Another way to look at things is: 3454 * 3455 * grq->avg.load_avg = \Sum se->avg.load_avg 3456 * 3457 * Therefore, per (2): 3458 * 3459 * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg 3460 * 3461 * And the very thing we're propagating is a change in that sum (someone 3462 * joined/left). So we can easily know the runnable change, which would be, per 3463 * (2) the already tracked se->load_avg divided by the corresponding 3464 * se->weight. 3465 * 3466 * Basically (4) but in differential form: 3467 * 3468 * d(runnable_avg) += se->avg.load_avg / se->load.weight 3469 * (5) 3470 * ge->avg.load_avg += ge->load.weight * d(runnable_avg) 3471 */ 3472 3473 static inline void 3474 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3475 { 3476 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; 3477 3478 /* Nothing to update */ 3479 if (!delta) 3480 return; 3481 3482 /* Set new sched_entity's utilization */ 3483 se->avg.util_avg = gcfs_rq->avg.util_avg; 3484 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; 3485 3486 /* Update parent cfs_rq utilization */ 3487 add_positive(&cfs_rq->avg.util_avg, delta); 3488 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; 3489 } 3490 3491 static inline void 3492 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3493 { 3494 long runnable_sum = gcfs_rq->prop_runnable_sum; 3495 long runnable_load_avg, load_avg; 3496 s64 runnable_load_sum, load_sum; 3497 3498 if (!runnable_sum) 3499 return; 3500 3501 gcfs_rq->prop_runnable_sum = 0; 3502 3503 load_sum = (s64)se_weight(se) * runnable_sum; 3504 load_avg = div_s64(load_sum, LOAD_AVG_MAX); 3505 3506 add_positive(&se->avg.load_sum, runnable_sum); 3507 add_positive(&se->avg.load_avg, load_avg); 3508 3509 add_positive(&cfs_rq->avg.load_avg, load_avg); 3510 add_positive(&cfs_rq->avg.load_sum, load_sum); 3511 3512 runnable_load_sum = (s64)se_runnable(se) * runnable_sum; 3513 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); 3514 3515 add_positive(&se->avg.runnable_load_sum, runnable_sum); 3516 add_positive(&se->avg.runnable_load_avg, runnable_load_avg); 3517 3518 if (se->on_rq) { 3519 add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); 3520 add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); 3521 } 3522 } 3523 3524 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) 3525 { 3526 cfs_rq->propagate = 1; 3527 cfs_rq->prop_runnable_sum += runnable_sum; 3528 } 3529 3530 /* Update task and its cfs_rq load average */ 3531 static inline int propagate_entity_load_avg(struct sched_entity *se) 3532 { 3533 struct cfs_rq *cfs_rq, *gcfs_rq; 3534 3535 if (entity_is_task(se)) 3536 return 0; 3537 3538 gcfs_rq = group_cfs_rq(se); 3539 if (!gcfs_rq->propagate) 3540 return 0; 3541 3542 gcfs_rq->propagate = 0; 3543 3544 cfs_rq = cfs_rq_of(se); 3545 3546 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); 3547 3548 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3549 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3550 3551 return 1; 3552 } 3553 3554 /* 3555 * Check if we need to update the load and the utilization of a blocked 3556 * group_entity: 3557 */ 3558 static inline bool skip_blocked_update(struct sched_entity *se) 3559 { 3560 struct cfs_rq *gcfs_rq = group_cfs_rq(se); 3561 3562 /* 3563 * If sched_entity still have not zero load or utilization, we have to 3564 * decay it: 3565 */ 3566 if (se->avg.load_avg || se->avg.util_avg) 3567 return 
false; 3568 3569 /* 3570 * If there is a pending propagation, we have to update the load and 3571 * the utilization of the sched_entity: 3572 */ 3573 if (gcfs_rq->propagate) 3574 return false; 3575 3576 /* 3577 * Otherwise, the load and the utilization of the sched_entity is 3578 * already zero and there is no pending propagation, so it will be a 3579 * waste of time to try to decay it: 3580 */ 3581 return true; 3582 } 3583 3584 #else /* CONFIG_FAIR_GROUP_SCHED */ 3585 3586 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 3587 3588 static inline int propagate_entity_load_avg(struct sched_entity *se) 3589 { 3590 return 0; 3591 } 3592 3593 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} 3594 3595 #endif /* CONFIG_FAIR_GROUP_SCHED */ 3596 3597 /** 3598 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages 3599 * @now: current time, as per cfs_rq_clock_task() 3600 * @cfs_rq: cfs_rq to update 3601 * 3602 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) 3603 * avg. The immediate corollary is that all (fair) tasks must be attached, see 3604 * post_init_entity_util_avg(). 3605 * 3606 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3607 * 3608 * Returns true if the load decayed or we removed load. 3609 * 3610 * Since both these conditions indicate a changed cfs_rq->avg.load we should 3611 * call update_tg_load_avg() when this function returns true. 3612 */ 3613 static inline int 3614 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 3615 { 3616 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; 3617 struct sched_avg *sa = &cfs_rq->avg; 3618 int decayed = 0; 3619 3620 if (cfs_rq->removed.nr) { 3621 unsigned long r; 3622 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; 3623 3624 raw_spin_lock(&cfs_rq->removed.lock); 3625 swap(cfs_rq->removed.util_avg, removed_util); 3626 swap(cfs_rq->removed.load_avg, removed_load); 3627 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); 3628 cfs_rq->removed.nr = 0; 3629 raw_spin_unlock(&cfs_rq->removed.lock); 3630 3631 r = removed_load; 3632 sub_positive(&sa->load_avg, r); 3633 sub_positive(&sa->load_sum, r * divider); 3634 3635 r = removed_util; 3636 sub_positive(&sa->util_avg, r); 3637 sub_positive(&sa->util_sum, r * divider); 3638 3639 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); 3640 3641 decayed = 1; 3642 } 3643 3644 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); 3645 3646 #ifndef CONFIG_64BIT 3647 smp_wmb(); 3648 cfs_rq->load_last_update_time_copy = sa->last_update_time; 3649 #endif 3650 3651 if (decayed) 3652 cfs_rq_util_change(cfs_rq); 3653 3654 return decayed; 3655 } 3656 3657 /** 3658 * attach_entity_load_avg - attach this entity to its cfs_rq load avg 3659 * @cfs_rq: cfs_rq to attach to 3660 * @se: sched_entity to attach 3661 * 3662 * Must call update_cfs_rq_load_avg() before this, since we rely on 3663 * cfs_rq->avg.last_update_time being current. 3664 */ 3665 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3666 { 3667 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; 3668 3669 /* 3670 * When we attach the @se to the @cfs_rq, we must align the decay 3671 * window because without that, really weird and wonderful things can 3672 * happen. 
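 * One way to picture it (rough sketch): PELT chops time into 1024us periods
 * and period_contrib records how far into the current period the sums have
 * advanced. If we took the cfs_rq's last_update_time but kept the entity's
 * old period_contrib, the next update would account a partial period that
 * neither side actually measured, skewing the decayed sums.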
3673 * 3674 * XXX illustrate 3675 */ 3676 se->avg.last_update_time = cfs_rq->avg.last_update_time; 3677 se->avg.period_contrib = cfs_rq->avg.period_contrib; 3678 3679 /* 3680 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new 3681 * period_contrib. This isn't strictly correct, but since we're 3682 * entirely outside of the PELT hierarchy, nobody cares if we truncate 3683 * _sum a little. 3684 */ 3685 se->avg.util_sum = se->avg.util_avg * divider; 3686 3687 se->avg.load_sum = divider; 3688 if (se_weight(se)) { 3689 se->avg.load_sum = 3690 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); 3691 } 3692 3693 se->avg.runnable_load_sum = se->avg.load_sum; 3694 3695 enqueue_load_avg(cfs_rq, se); 3696 cfs_rq->avg.util_avg += se->avg.util_avg; 3697 cfs_rq->avg.util_sum += se->avg.util_sum; 3698 3699 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3700 3701 cfs_rq_util_change(cfs_rq); 3702 } 3703 3704 /** 3705 * detach_entity_load_avg - detach this entity from its cfs_rq load avg 3706 * @cfs_rq: cfs_rq to detach from 3707 * @se: sched_entity to detach 3708 * 3709 * Must call update_cfs_rq_load_avg() before this, since we rely on 3710 * cfs_rq->avg.last_update_time being current. 3711 */ 3712 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3713 { 3714 dequeue_load_avg(cfs_rq, se); 3715 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 3716 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3717 3718 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3719 3720 cfs_rq_util_change(cfs_rq); 3721 } 3722 3723 /* 3724 * Optional action to be done while updating the load average 3725 */ 3726 #define UPDATE_TG 0x1 3727 #define SKIP_AGE_LOAD 0x2 3728 #define DO_ATTACH 0x4 3729 3730 /* Update task and its cfs_rq load average */ 3731 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 3732 { 3733 u64 now = cfs_rq_clock_task(cfs_rq); 3734 struct rq *rq = rq_of(cfs_rq); 3735 int cpu = cpu_of(rq); 3736 int decayed; 3737 3738 /* 3739 * Track task load average for carrying it to new CPU after migrated, and 3740 * track group sched_entity load average for task_h_load calc in migration 3741 */ 3742 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) 3743 __update_load_avg_se(now, cpu, cfs_rq, se); 3744 3745 decayed = update_cfs_rq_load_avg(now, cfs_rq); 3746 decayed |= propagate_entity_load_avg(se); 3747 3748 if (!se->avg.last_update_time && (flags & DO_ATTACH)) { 3749 3750 attach_entity_load_avg(cfs_rq, se); 3751 update_tg_load_avg(cfs_rq, 0); 3752 3753 } else if (decayed && (flags & UPDATE_TG)) 3754 update_tg_load_avg(cfs_rq, 0); 3755 } 3756 3757 #ifndef CONFIG_64BIT 3758 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) 3759 { 3760 u64 last_update_time_copy; 3761 u64 last_update_time; 3762 3763 do { 3764 last_update_time_copy = cfs_rq->load_last_update_time_copy; 3765 smp_rmb(); 3766 last_update_time = cfs_rq->avg.last_update_time; 3767 } while (last_update_time != last_update_time_copy); 3768 3769 return last_update_time; 3770 } 3771 #else 3772 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) 3773 { 3774 return cfs_rq->avg.last_update_time; 3775 } 3776 #endif 3777 3778 /* 3779 * Synchronize entity load avg of dequeued entity without locking 3780 * the previous rq. 
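 * (It reads the cfs_rq's last_update_time via cfs_rq_last_update_time(),
 * which on 32-bit relies on the _copy field and smp_rmb() above to obtain
 * a consistent 64-bit value.)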
3781 */ 3782 void sync_entity_load_avg(struct sched_entity *se) 3783 { 3784 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3785 u64 last_update_time; 3786 3787 last_update_time = cfs_rq_last_update_time(cfs_rq); 3788 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); 3789 } 3790 3791 /* 3792 * Task first catches up with cfs_rq, and then subtract 3793 * itself from the cfs_rq (task must be off the queue now). 3794 */ 3795 void remove_entity_load_avg(struct sched_entity *se) 3796 { 3797 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3798 unsigned long flags; 3799 3800 /* 3801 * tasks cannot exit without having gone through wake_up_new_task() -> 3802 * post_init_entity_util_avg() which will have added things to the 3803 * cfs_rq, so we can remove unconditionally. 3804 * 3805 * Similarly for groups, they will have passed through 3806 * post_init_entity_util_avg() before unregister_sched_fair_group() 3807 * calls this. 3808 */ 3809 3810 sync_entity_load_avg(se); 3811 3812 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); 3813 ++cfs_rq->removed.nr; 3814 cfs_rq->removed.util_avg += se->avg.util_avg; 3815 cfs_rq->removed.load_avg += se->avg.load_avg; 3816 cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ 3817 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); 3818 } 3819 3820 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) 3821 { 3822 return cfs_rq->avg.runnable_load_avg; 3823 } 3824 3825 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) 3826 { 3827 return cfs_rq->avg.load_avg; 3828 } 3829 3830 static int idle_balance(struct rq *this_rq, struct rq_flags *rf); 3831 3832 #else /* CONFIG_SMP */ 3833 3834 static inline int 3835 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 3836 { 3837 return 0; 3838 } 3839 3840 #define UPDATE_TG 0x0 3841 #define SKIP_AGE_LOAD 0x0 3842 #define DO_ATTACH 0x0 3843 3844 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 3845 { 3846 cfs_rq_util_change(cfs_rq); 3847 } 3848 3849 static inline void remove_entity_load_avg(struct sched_entity *se) {} 3850 3851 static inline void 3852 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 3853 static inline void 3854 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 3855 3856 static inline int idle_balance(struct rq *rq, struct rq_flags *rf) 3857 { 3858 return 0; 3859 } 3860 3861 #endif /* CONFIG_SMP */ 3862 3863 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 3864 { 3865 #ifdef CONFIG_SCHED_DEBUG 3866 s64 d = se->vruntime - cfs_rq->min_vruntime; 3867 3868 if (d < 0) 3869 d = -d; 3870 3871 if (d > 3*sysctl_sched_latency) 3872 schedstat_inc(cfs_rq->nr_spread_over); 3873 #endif 3874 } 3875 3876 static void 3877 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) 3878 { 3879 u64 vruntime = cfs_rq->min_vruntime; 3880 3881 /* 3882 * The 'current' period is already promised to the current tasks, 3883 * however the extra weight of the new task will slow them down a 3884 * little, place the new task so that it fits in the slot that 3885 * stays open at the end. 3886 */ 3887 if (initial && sched_feat(START_DEBIT)) 3888 vruntime += sched_vslice(cfs_rq, se); 3889 3890 /* sleeps up to a single latency don't count. 
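 * (With, say, a 6ms sched_latency and GENTLE_FAIR_SLEEPERS enabled, a waking
 * entity is placed at most 3ms of vruntime below min_vruntime, i.e. its sleep
 * credit is capped rather than proportional to how long it slept.)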
*/ 3891 if (!initial) { 3892 unsigned long thresh = sysctl_sched_latency; 3893 3894 /* 3895 * Halve their sleep time's effect, to allow 3896 * for a gentler effect of sleepers: 3897 */ 3898 if (sched_feat(GENTLE_FAIR_SLEEPERS)) 3899 thresh >>= 1; 3900 3901 vruntime -= thresh; 3902 } 3903 3904 /* ensure we never gain time by being placed backwards. */ 3905 se->vruntime = max_vruntime(se->vruntime, vruntime); 3906 } 3907 3908 static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 3909 3910 static inline void check_schedstat_required(void) 3911 { 3912 #ifdef CONFIG_SCHEDSTATS 3913 if (schedstat_enabled()) 3914 return; 3915 3916 /* Force schedstat enabled if a dependent tracepoint is active */ 3917 if (trace_sched_stat_wait_enabled() || 3918 trace_sched_stat_sleep_enabled() || 3919 trace_sched_stat_iowait_enabled() || 3920 trace_sched_stat_blocked_enabled() || 3921 trace_sched_stat_runtime_enabled()) { 3922 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " 3923 "stat_blocked and stat_runtime require the " 3924 "kernel parameter schedstats=enable or " 3925 "kernel.sched_schedstats=1\n"); 3926 } 3927 #endif 3928 } 3929 3930 3931 /* 3932 * MIGRATION 3933 * 3934 * dequeue 3935 * update_curr() 3936 * update_min_vruntime() 3937 * vruntime -= min_vruntime 3938 * 3939 * enqueue 3940 * update_curr() 3941 * update_min_vruntime() 3942 * vruntime += min_vruntime 3943 * 3944 * this way the vruntime transition between RQs is done when both 3945 * min_vruntime are up-to-date. 3946 * 3947 * WAKEUP (remote) 3948 * 3949 * ->migrate_task_rq_fair() (p->state == TASK_WAKING) 3950 * vruntime -= min_vruntime 3951 * 3952 * enqueue 3953 * update_curr() 3954 * update_min_vruntime() 3955 * vruntime += min_vruntime 3956 * 3957 * this way we don't have the most up-to-date min_vruntime on the originating 3958 * CPU and an up-to-date min_vruntime on the destination CPU. 3959 */ 3960 3961 static void 3962 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 3963 { 3964 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); 3965 bool curr = cfs_rq->curr == se; 3966 3967 /* 3968 * If we're the current task, we must renormalise before calling 3969 * update_curr(). 3970 */ 3971 if (renorm && curr) 3972 se->vruntime += cfs_rq->min_vruntime; 3973 3974 update_curr(cfs_rq); 3975 3976 /* 3977 * Otherwise, renormalise after, such that we're placed at the current 3978 * moment in time, instead of some random moment in the past. Being 3979 * placed in the past could significantly boost this task to the 3980 * fairness detriment of existing tasks. 3981 */ 3982 if (renorm && !curr) 3983 se->vruntime += cfs_rq->min_vruntime; 3984 3985 /* 3986 * When enqueuing a sched_entity, we must: 3987 * - Update loads to have both entity and cfs_rq synced with now. 
3988 * - Add its load to cfs_rq->runnable_avg 3989 * - For group_entity, update its weight to reflect the new share of 3990 * its group cfs_rq 3991 * - Add its new weight to cfs_rq->load.weight 3992 */ 3993 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); 3994 update_cfs_group(se); 3995 enqueue_runnable_load_avg(cfs_rq, se); 3996 account_entity_enqueue(cfs_rq, se); 3997 3998 if (flags & ENQUEUE_WAKEUP) 3999 place_entity(cfs_rq, se, 0); 4000 4001 check_schedstat_required(); 4002 update_stats_enqueue(cfs_rq, se, flags); 4003 check_spread(cfs_rq, se); 4004 if (!curr) 4005 __enqueue_entity(cfs_rq, se); 4006 se->on_rq = 1; 4007 4008 if (cfs_rq->nr_running == 1) { 4009 list_add_leaf_cfs_rq(cfs_rq); 4010 check_enqueue_throttle(cfs_rq); 4011 } 4012 } 4013 4014 static void __clear_buddies_last(struct sched_entity *se) 4015 { 4016 for_each_sched_entity(se) { 4017 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4018 if (cfs_rq->last != se) 4019 break; 4020 4021 cfs_rq->last = NULL; 4022 } 4023 } 4024 4025 static void __clear_buddies_next(struct sched_entity *se) 4026 { 4027 for_each_sched_entity(se) { 4028 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4029 if (cfs_rq->next != se) 4030 break; 4031 4032 cfs_rq->next = NULL; 4033 } 4034 } 4035 4036 static void __clear_buddies_skip(struct sched_entity *se) 4037 { 4038 for_each_sched_entity(se) { 4039 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4040 if (cfs_rq->skip != se) 4041 break; 4042 4043 cfs_rq->skip = NULL; 4044 } 4045 } 4046 4047 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 4048 { 4049 if (cfs_rq->last == se) 4050 __clear_buddies_last(se); 4051 4052 if (cfs_rq->next == se) 4053 __clear_buddies_next(se); 4054 4055 if (cfs_rq->skip == se) 4056 __clear_buddies_skip(se); 4057 } 4058 4059 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 4060 4061 static void 4062 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 4063 { 4064 /* 4065 * Update run-time statistics of the 'current'. 4066 */ 4067 update_curr(cfs_rq); 4068 4069 /* 4070 * When dequeuing a sched_entity, we must: 4071 * - Update loads to have both entity and cfs_rq synced with now. 4072 * - Substract its load from the cfs_rq->runnable_avg. 4073 * - Substract its previous weight from cfs_rq->load.weight. 4074 * - For group entity, update its weight to reflect the new share 4075 * of its group cfs_rq. 4076 */ 4077 update_load_avg(cfs_rq, se, UPDATE_TG); 4078 dequeue_runnable_load_avg(cfs_rq, se); 4079 4080 update_stats_dequeue(cfs_rq, se, flags); 4081 4082 clear_buddies(cfs_rq, se); 4083 4084 if (se != cfs_rq->curr) 4085 __dequeue_entity(cfs_rq, se); 4086 se->on_rq = 0; 4087 account_entity_dequeue(cfs_rq, se); 4088 4089 /* 4090 * Normalize after update_curr(); which will also have moved 4091 * min_vruntime if @se is the one holding it back. But before doing 4092 * update_min_vruntime() again, which will discount @se's position and 4093 * can move min_vruntime forward still more. 4094 */ 4095 if (!(flags & DEQUEUE_SLEEP)) 4096 se->vruntime -= cfs_rq->min_vruntime; 4097 4098 /* return excess runtime on last dequeue */ 4099 return_cfs_rq_runtime(cfs_rq); 4100 4101 update_cfs_group(se); 4102 4103 /* 4104 * Now advance min_vruntime if @se was the entity holding it back, 4105 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be 4106 * put back on, and if we advance min_vruntime, we'll be placed back 4107 * further than we started -- ie. we'll be penalized. 
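 *
 * (DEQUEUE_SAVE without DEQUEUE_MOVE is the "temporarily off, coming right
 * back" case, e.g. an attribute change that will re-enqueue the task on
 * the same cfs_rq; it is the only case in which the min_vruntime update
 * below is skipped.)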
4108 */ 4109 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) 4110 update_min_vruntime(cfs_rq); 4111 } 4112 4113 /* 4114 * Preempt the current task with a newly woken task if needed: 4115 */ 4116 static void 4117 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 4118 { 4119 unsigned long ideal_runtime, delta_exec; 4120 struct sched_entity *se; 4121 s64 delta; 4122 4123 ideal_runtime = sched_slice(cfs_rq, curr); 4124 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 4125 if (delta_exec > ideal_runtime) { 4126 resched_curr(rq_of(cfs_rq)); 4127 /* 4128 * The current task ran long enough, ensure it doesn't get 4129 * re-elected due to buddy favours. 4130 */ 4131 clear_buddies(cfs_rq, curr); 4132 return; 4133 } 4134 4135 /* 4136 * Ensure that a task that missed wakeup preemption by a 4137 * narrow margin doesn't have to wait for a full slice. 4138 * This also mitigates buddy induced latencies under load. 4139 */ 4140 if (delta_exec < sysctl_sched_min_granularity) 4141 return; 4142 4143 se = __pick_first_entity(cfs_rq); 4144 delta = curr->vruntime - se->vruntime; 4145 4146 if (delta < 0) 4147 return; 4148 4149 if (delta > ideal_runtime) 4150 resched_curr(rq_of(cfs_rq)); 4151 } 4152 4153 static void 4154 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 4155 { 4156 /* 'current' is not kept within the tree. */ 4157 if (se->on_rq) { 4158 /* 4159 * Any task has to be enqueued before it gets to execute on 4160 * a CPU. So account for the time it spent waiting on the 4161 * runqueue. 4162 */ 4163 update_stats_wait_end(cfs_rq, se); 4164 __dequeue_entity(cfs_rq, se); 4165 update_load_avg(cfs_rq, se, UPDATE_TG); 4166 } 4167 4168 update_stats_curr_start(cfs_rq, se); 4169 cfs_rq->curr = se; 4170 4171 /* 4172 * Track our maximum slice length, if the CPU's load is at 4173 * least twice that of our own weight (i.e. don't track it 4174 * when there are only lesser-weight tasks around): 4175 */ 4176 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 4177 schedstat_set(se->statistics.slice_max, 4178 max((u64)schedstat_val(se->statistics.slice_max), 4179 se->sum_exec_runtime - se->prev_sum_exec_runtime)); 4180 } 4181 4182 se->prev_sum_exec_runtime = se->sum_exec_runtime; 4183 } 4184 4185 static int 4186 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 4187 4188 /* 4189 * Pick the next process, keeping these things in mind, in this order: 4190 * 1) keep things fair between processes/task groups 4191 * 2) pick the "next" process, since someone really wants that to run 4192 * 3) pick the "last" process, for cache locality 4193 * 4) do not run the "skip" process, if something else is available 4194 */ 4195 static struct sched_entity * 4196 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) 4197 { 4198 struct sched_entity *left = __pick_first_entity(cfs_rq); 4199 struct sched_entity *se; 4200 4201 /* 4202 * If curr is set we have to see if it's left of the leftmost entity 4203 * still in the tree, provided there was anything in the tree at all. 4204 */ 4205 if (!left || (curr && entity_before(curr, left))) 4206 left = curr; 4207 4208 se = left; /* ideally we run the leftmost entity */ 4209 4210 /* 4211 * Avoid running the skip buddy, if running something else can 4212 * be done without getting too unfair.
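 *
 * (The skip buddy is typically set via yield_task_fair(); "too unfair" is
 * judged by wakeup_preempt_entity(), i.e. an alternative is only chosen if
 * it does not trail the leftmost entity's vruntime by more than the
 * weight-scaled wakeup granularity.)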
4213 */ 4214 if (cfs_rq->skip == se) { 4215 struct sched_entity *second; 4216 4217 if (se == curr) { 4218 second = __pick_first_entity(cfs_rq); 4219 } else { 4220 second = __pick_next_entity(se); 4221 if (!second || (curr && entity_before(curr, second))) 4222 second = curr; 4223 } 4224 4225 if (second && wakeup_preempt_entity(second, left) < 1) 4226 se = second; 4227 } 4228 4229 /* 4230 * Prefer last buddy, try to return the CPU to a preempted task. 4231 */ 4232 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 4233 se = cfs_rq->last; 4234 4235 /* 4236 * Someone really wants this to run. If it's not unfair, run it. 4237 */ 4238 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 4239 se = cfs_rq->next; 4240 4241 clear_buddies(cfs_rq, se); 4242 4243 return se; 4244 } 4245 4246 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 4247 4248 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 4249 { 4250 /* 4251 * If still on the runqueue then deactivate_task() 4252 * was not called and update_curr() has to be done: 4253 */ 4254 if (prev->on_rq) 4255 update_curr(cfs_rq); 4256 4257 /* throttle cfs_rqs exceeding runtime */ 4258 check_cfs_rq_runtime(cfs_rq); 4259 4260 check_spread(cfs_rq, prev); 4261 4262 if (prev->on_rq) { 4263 update_stats_wait_start(cfs_rq, prev); 4264 /* Put 'current' back into the tree. */ 4265 __enqueue_entity(cfs_rq, prev); 4266 /* in !on_rq case, update occurred at dequeue */ 4267 update_load_avg(cfs_rq, prev, 0); 4268 } 4269 cfs_rq->curr = NULL; 4270 } 4271 4272 static void 4273 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) 4274 { 4275 /* 4276 * Update run-time statistics of the 'current'. 4277 */ 4278 update_curr(cfs_rq); 4279 4280 /* 4281 * Ensure that runnable average is periodically updated. 4282 */ 4283 update_load_avg(cfs_rq, curr, UPDATE_TG); 4284 update_cfs_group(curr); 4285 4286 #ifdef CONFIG_SCHED_HRTICK 4287 /* 4288 * queued ticks are scheduled to match the slice, so don't bother 4289 * validating it and just reschedule. 4290 */ 4291 if (queued) { 4292 resched_curr(rq_of(cfs_rq)); 4293 return; 4294 } 4295 /* 4296 * don't let the period tick interfere with the hrtick preemption 4297 */ 4298 if (!sched_feat(DOUBLE_TICK) && 4299 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) 4300 return; 4301 #endif 4302 4303 if (cfs_rq->nr_running > 1) 4304 check_preempt_tick(cfs_rq, curr); 4305 } 4306 4307 4308 /************************************************** 4309 * CFS bandwidth control machinery 4310 */ 4311 4312 #ifdef CONFIG_CFS_BANDWIDTH 4313 4314 #ifdef HAVE_JUMP_LABEL 4315 static struct static_key __cfs_bandwidth_used; 4316 4317 static inline bool cfs_bandwidth_used(void) 4318 { 4319 return static_key_false(&__cfs_bandwidth_used); 4320 } 4321 4322 void cfs_bandwidth_usage_inc(void) 4323 { 4324 static_key_slow_inc(&__cfs_bandwidth_used); 4325 } 4326 4327 void cfs_bandwidth_usage_dec(void) 4328 { 4329 static_key_slow_dec(&__cfs_bandwidth_used); 4330 } 4331 #else /* HAVE_JUMP_LABEL */ 4332 static bool cfs_bandwidth_used(void) 4333 { 4334 return true; 4335 } 4336 4337 void cfs_bandwidth_usage_inc(void) {} 4338 void cfs_bandwidth_usage_dec(void) {} 4339 #endif /* HAVE_JUMP_LABEL */ 4340 4341 /* 4342 * default period for cfs group bandwidth. 
4343 * default: 0.1s, units: nanoseconds 4344 */ 4345 static inline u64 default_cfs_period(void) 4346 { 4347 return 100000000ULL; 4348 } 4349 4350 static inline u64 sched_cfs_bandwidth_slice(void) 4351 { 4352 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; 4353 } 4354 4355 /* 4356 * Replenish runtime according to assigned quota and update expiration time. 4357 * We use sched_clock_cpu directly instead of rq->clock to avoid adding 4358 * additional synchronization around rq->lock. 4359 * 4360 * requires cfs_b->lock 4361 */ 4362 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 4363 { 4364 u64 now; 4365 4366 if (cfs_b->quota == RUNTIME_INF) 4367 return; 4368 4369 now = sched_clock_cpu(smp_processor_id()); 4370 cfs_b->runtime = cfs_b->quota; 4371 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 4372 } 4373 4374 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 4375 { 4376 return &tg->cfs_bandwidth; 4377 } 4378 4379 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */ 4380 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 4381 { 4382 if (unlikely(cfs_rq->throttle_count)) 4383 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; 4384 4385 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; 4386 } 4387 4388 /* returns 0 on failure to allocate runtime */ 4389 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4390 { 4391 struct task_group *tg = cfs_rq->tg; 4392 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 4393 u64 amount = 0, min_amount, expires; 4394 4395 /* note: this is a positive sum as runtime_remaining <= 0 */ 4396 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; 4397 4398 raw_spin_lock(&cfs_b->lock); 4399 if (cfs_b->quota == RUNTIME_INF) 4400 amount = min_amount; 4401 else { 4402 start_cfs_bandwidth(cfs_b); 4403 4404 if (cfs_b->runtime > 0) { 4405 amount = min(cfs_b->runtime, min_amount); 4406 cfs_b->runtime -= amount; 4407 cfs_b->idle = 0; 4408 } 4409 } 4410 expires = cfs_b->runtime_expires; 4411 raw_spin_unlock(&cfs_b->lock); 4412 4413 cfs_rq->runtime_remaining += amount; 4414 /* 4415 * we may have advanced our local expiration to account for allowed 4416 * spread between our sched_clock and the one on which runtime was 4417 * issued. 4418 */ 4419 if ((s64)(expires - cfs_rq->runtime_expires) > 0) 4420 cfs_rq->runtime_expires = expires; 4421 4422 return cfs_rq->runtime_remaining > 0; 4423 } 4424 4425 /* 4426 * Note: This depends on the synchronization provided by sched_clock and the 4427 * fact that rq->clock snapshots this value. 4428 */ 4429 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4430 { 4431 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 4432 4433 /* if the deadline is ahead of our clock, nothing to do */ 4434 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) 4435 return; 4436 4437 if (cfs_rq->runtime_remaining < 0) 4438 return; 4439 4440 /* 4441 * If the local deadline has passed we have to consider the 4442 * possibility that our sched_clock is 'fast' and the global deadline 4443 * has not truly expired. 4444 * 4445 * Fortunately we can determine whether this is the case by checking 4446 * whether the global deadline has advanced. It is valid to compare 4447 * cfs_b->runtime_expires without any locks since we only care about 4448 * exact equality, so a partial write will still work.
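 *
 * Concretely: if rq_clock() has passed our locally cached runtime_expires
 * but cfs_b->runtime_expires still holds the same value we copied at
 * assignment time, the period has not really ended and the difference is
 * just clock drift, so the local deadline is nudged forward by TICK_NSEC;
 * if the global value has moved on, the period genuinely expired and the
 * remaining local runtime is discarded.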
4449 */ 4450 4451 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { 4452 /* extend local deadline, drift is bounded above by 2 ticks */ 4453 cfs_rq->runtime_expires += TICK_NSEC; 4454 } else { 4455 /* global deadline is ahead, expiration has passed */ 4456 cfs_rq->runtime_remaining = 0; 4457 } 4458 } 4459 4460 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) 4461 { 4462 /* dock delta_exec before expiring quota (as it could span periods) */ 4463 cfs_rq->runtime_remaining -= delta_exec; 4464 expire_cfs_rq_runtime(cfs_rq); 4465 4466 if (likely(cfs_rq->runtime_remaining > 0)) 4467 return; 4468 4469 /* 4470 * if we're unable to extend our runtime we resched so that the active 4471 * hierarchy can be throttled 4472 */ 4473 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) 4474 resched_curr(rq_of(cfs_rq)); 4475 } 4476 4477 static __always_inline 4478 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) 4479 { 4480 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 4481 return; 4482 4483 __account_cfs_rq_runtime(cfs_rq, delta_exec); 4484 } 4485 4486 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 4487 { 4488 return cfs_bandwidth_used() && cfs_rq->throttled; 4489 } 4490 4491 /* check whether cfs_rq, or any parent, is throttled */ 4492 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 4493 { 4494 return cfs_bandwidth_used() && cfs_rq->throttle_count; 4495 } 4496 4497 /* 4498 * Ensure that neither of the group entities corresponding to src_cpu or 4499 * dest_cpu are members of a throttled hierarchy when performing group 4500 * load-balance operations. 4501 */ 4502 static inline int throttled_lb_pair(struct task_group *tg, 4503 int src_cpu, int dest_cpu) 4504 { 4505 struct cfs_rq *src_cfs_rq, *dest_cfs_rq; 4506 4507 src_cfs_rq = tg->cfs_rq[src_cpu]; 4508 dest_cfs_rq = tg->cfs_rq[dest_cpu]; 4509 4510 return throttled_hierarchy(src_cfs_rq) || 4511 throttled_hierarchy(dest_cfs_rq); 4512 } 4513 4514 /* updated child weight may affect parent so we have to do this bottom up */ 4515 static int tg_unthrottle_up(struct task_group *tg, void *data) 4516 { 4517 struct rq *rq = data; 4518 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 4519 4520 cfs_rq->throttle_count--; 4521 if (!cfs_rq->throttle_count) { 4522 /* adjust cfs_rq_clock_task() */ 4523 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - 4524 cfs_rq->throttled_clock_task; 4525 } 4526 4527 return 0; 4528 } 4529 4530 static int tg_throttle_down(struct task_group *tg, void *data) 4531 { 4532 struct rq *rq = data; 4533 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 4534 4535 /* group is entering throttled state, stop time */ 4536 if (!cfs_rq->throttle_count) 4537 cfs_rq->throttled_clock_task = rq_clock_task(rq); 4538 cfs_rq->throttle_count++; 4539 4540 return 0; 4541 } 4542 4543 static void throttle_cfs_rq(struct cfs_rq *cfs_rq) 4544 { 4545 struct rq *rq = rq_of(cfs_rq); 4546 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 4547 struct sched_entity *se; 4548 long task_delta, dequeue = 1; 4549 bool empty; 4550 4551 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 4552 4553 /* freeze hierarchy runnable averages while throttled */ 4554 rcu_read_lock(); 4555 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 4556 rcu_read_unlock(); 4557 4558 task_delta = cfs_rq->h_nr_running; 4559 for_each_sched_entity(se) { 4560 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 4561 /* throttled entity or throttle-on-deactivate */ 4562 if (!se->on_rq) 4563 break; 4564 4565 
if (dequeue) 4566 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); 4567 qcfs_rq->h_nr_running -= task_delta; 4568 4569 if (qcfs_rq->load.weight) 4570 dequeue = 0; 4571 } 4572 4573 if (!se) 4574 sub_nr_running(rq, task_delta); 4575 4576 cfs_rq->throttled = 1; 4577 cfs_rq->throttled_clock = rq_clock(rq); 4578 raw_spin_lock(&cfs_b->lock); 4579 empty = list_empty(&cfs_b->throttled_cfs_rq); 4580 4581 /* 4582 * Add to the _head_ of the list, so that an already-started 4583 * distribute_cfs_runtime will not see us 4584 */ 4585 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 4586 4587 /* 4588 * If we're the first throttled task, make sure the bandwidth 4589 * timer is running. 4590 */ 4591 if (empty) 4592 start_cfs_bandwidth(cfs_b); 4593 4594 raw_spin_unlock(&cfs_b->lock); 4595 } 4596 4597 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 4598 { 4599 struct rq *rq = rq_of(cfs_rq); 4600 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 4601 struct sched_entity *se; 4602 int enqueue = 1; 4603 long task_delta; 4604 4605 se = cfs_rq->tg->se[cpu_of(rq)]; 4606 4607 cfs_rq->throttled = 0; 4608 4609 update_rq_clock(rq); 4610 4611 raw_spin_lock(&cfs_b->lock); 4612 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; 4613 list_del_rcu(&cfs_rq->throttled_list); 4614 raw_spin_unlock(&cfs_b->lock); 4615 4616 /* update hierarchical throttle state */ 4617 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); 4618 4619 if (!cfs_rq->load.weight) 4620 return; 4621 4622 task_delta = cfs_rq->h_nr_running; 4623 for_each_sched_entity(se) { 4624 if (se->on_rq) 4625 enqueue = 0; 4626 4627 cfs_rq = cfs_rq_of(se); 4628 if (enqueue) 4629 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); 4630 cfs_rq->h_nr_running += task_delta; 4631 4632 if (cfs_rq_throttled(cfs_rq)) 4633 break; 4634 } 4635 4636 if (!se) 4637 add_nr_running(rq, task_delta); 4638 4639 /* determine whether we need to wake up potentially idle cpu */ 4640 if (rq->curr == rq->idle && rq->cfs.nr_running) 4641 resched_curr(rq); 4642 } 4643 4644 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, 4645 u64 remaining, u64 expires) 4646 { 4647 struct cfs_rq *cfs_rq; 4648 u64 runtime; 4649 u64 starting_runtime = remaining; 4650 4651 rcu_read_lock(); 4652 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 4653 throttled_list) { 4654 struct rq *rq = rq_of(cfs_rq); 4655 struct rq_flags rf; 4656 4657 rq_lock(rq, &rf); 4658 if (!cfs_rq_throttled(cfs_rq)) 4659 goto next; 4660 4661 runtime = -cfs_rq->runtime_remaining + 1; 4662 if (runtime > remaining) 4663 runtime = remaining; 4664 remaining -= runtime; 4665 4666 cfs_rq->runtime_remaining += runtime; 4667 cfs_rq->runtime_expires = expires; 4668 4669 /* we check whether we're throttled above */ 4670 if (cfs_rq->runtime_remaining > 0) 4671 unthrottle_cfs_rq(cfs_rq); 4672 4673 next: 4674 rq_unlock(rq, &rf); 4675 4676 if (!remaining) 4677 break; 4678 } 4679 rcu_read_unlock(); 4680 4681 return starting_runtime - remaining; 4682 } 4683 4684 /* 4685 * Responsible for refilling a task_group's bandwidth and unthrottling its 4686 * cfs_rqs as appropriate. If there has been no activity within the last 4687 * period the timer is deactivated until scheduling resumes; cfs_b->idle is 4688 * used to track this state. 
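 *
 * For example (illustrative numbers), with quota = 20ms and the default
 * 100ms period, each period-timer expiry refills cfs_b->runtime to 20ms
 * and hands it back out to throttled cfs_rqs via distribute_cfs_runtime();
 * if an entire period goes by with nothing throttled and no cfs_rq pulling
 * fresh runtime, cfs_b->idle lets the timer lapse until bandwidth is
 * requested again.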
4689 */ 4690 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) 4691 { 4692 u64 runtime, runtime_expires; 4693 int throttled; 4694 4695 /* no need to continue the timer with no bandwidth constraint */ 4696 if (cfs_b->quota == RUNTIME_INF) 4697 goto out_deactivate; 4698 4699 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 4700 cfs_b->nr_periods += overrun; 4701 4702 /* 4703 * idle depends on !throttled (for the case of a large deficit), and if 4704 * we're going inactive then everything else can be deferred 4705 */ 4706 if (cfs_b->idle && !throttled) 4707 goto out_deactivate; 4708 4709 __refill_cfs_bandwidth_runtime(cfs_b); 4710 4711 if (!throttled) { 4712 /* mark as potentially idle for the upcoming period */ 4713 cfs_b->idle = 1; 4714 return 0; 4715 } 4716 4717 /* account preceding periods in which throttling occurred */ 4718 cfs_b->nr_throttled += overrun; 4719 4720 runtime_expires = cfs_b->runtime_expires; 4721 4722 /* 4723 * This check is repeated as we are holding onto the new bandwidth while 4724 * we unthrottle. This can potentially race with an unthrottled group 4725 * trying to acquire new bandwidth from the global pool. This can result 4726 * in us over-using our runtime if it is all used during this loop, but 4727 * only by limited amounts in that extreme case. 4728 */ 4729 while (throttled && cfs_b->runtime > 0) { 4730 runtime = cfs_b->runtime; 4731 raw_spin_unlock(&cfs_b->lock); 4732 /* we can't nest cfs_b->lock while distributing bandwidth */ 4733 runtime = distribute_cfs_runtime(cfs_b, runtime, 4734 runtime_expires); 4735 raw_spin_lock(&cfs_b->lock); 4736 4737 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 4738 4739 cfs_b->runtime -= min(runtime, cfs_b->runtime); 4740 } 4741 4742 /* 4743 * While we are ensured activity in the period following an 4744 * unthrottle, this also covers the case in which the new bandwidth is 4745 * insufficient to cover the existing bandwidth deficit. (Forcing the 4746 * timer to remain active while there are any throttled entities.) 4747 */ 4748 cfs_b->idle = 0; 4749 4750 return 0; 4751 4752 out_deactivate: 4753 return 1; 4754 } 4755 4756 /* a cfs_rq won't donate quota below this amount */ 4757 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; 4758 /* minimum remaining period time to redistribute slack quota */ 4759 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; 4760 /* how long we wait to gather additional slack before distributing */ 4761 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 4762 4763 /* 4764 * Are we near the end of the current quota period? 4765 * 4766 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the 4767 * hrtimer base being cleared by hrtimer_start. In the case of 4768 * migrate_hrtimers, base is never cleared, so we are fine. 4769 */ 4770 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 4771 { 4772 struct hrtimer *refresh_timer = &cfs_b->period_timer; 4773 u64 remaining; 4774 4775 /* if the call-back is running a quota refresh is already occurring */ 4776 if (hrtimer_callback_running(refresh_timer)) 4777 return 1; 4778 4779 /* is a quota refresh about to occur? 
*/ 4780 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); 4781 if (remaining < min_expire) 4782 return 1; 4783 4784 return 0; 4785 } 4786 4787 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) 4788 { 4789 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; 4790 4791 /* if there's a quota refresh soon don't bother with slack */ 4792 if (runtime_refresh_within(cfs_b, min_left)) 4793 return; 4794 4795 hrtimer_start(&cfs_b->slack_timer, 4796 ns_to_ktime(cfs_bandwidth_slack_period), 4797 HRTIMER_MODE_REL); 4798 } 4799 4800 /* we know any runtime found here is valid as update_curr() precedes return */ 4801 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4802 { 4803 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 4804 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; 4805 4806 if (slack_runtime <= 0) 4807 return; 4808 4809 raw_spin_lock(&cfs_b->lock); 4810 if (cfs_b->quota != RUNTIME_INF && 4811 cfs_rq->runtime_expires == cfs_b->runtime_expires) { 4812 cfs_b->runtime += slack_runtime; 4813 4814 /* we are under rq->lock, defer unthrottling using a timer */ 4815 if (cfs_b->runtime > sched_cfs_bandwidth_slice() && 4816 !list_empty(&cfs_b->throttled_cfs_rq)) 4817 start_cfs_slack_bandwidth(cfs_b); 4818 } 4819 raw_spin_unlock(&cfs_b->lock); 4820 4821 /* even if it's not valid for return we don't want to try again */ 4822 cfs_rq->runtime_remaining -= slack_runtime; 4823 } 4824 4825 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4826 { 4827 if (!cfs_bandwidth_used()) 4828 return; 4829 4830 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 4831 return; 4832 4833 __return_cfs_rq_runtime(cfs_rq); 4834 } 4835 4836 /* 4837 * This is done with a timer (instead of inline with bandwidth return) since 4838 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. 4839 */ 4840 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) 4841 { 4842 u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); 4843 u64 expires; 4844 4845 /* confirm we're still not at a refresh boundary */ 4846 raw_spin_lock(&cfs_b->lock); 4847 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { 4848 raw_spin_unlock(&cfs_b->lock); 4849 return; 4850 } 4851 4852 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) 4853 runtime = cfs_b->runtime; 4854 4855 expires = cfs_b->runtime_expires; 4856 raw_spin_unlock(&cfs_b->lock); 4857 4858 if (!runtime) 4859 return; 4860 4861 runtime = distribute_cfs_runtime(cfs_b, runtime, expires); 4862 4863 raw_spin_lock(&cfs_b->lock); 4864 if (expires == cfs_b->runtime_expires) 4865 cfs_b->runtime -= min(runtime, cfs_b->runtime); 4866 raw_spin_unlock(&cfs_b->lock); 4867 } 4868 4869 /* 4870 * When a group wakes up we want to make sure that its quota is not already 4871 * expired/exceeded, otherwise it may be allowed to steal additional ticks of 4872 * runtime as update_curr() throttling cannot trigger until it's on-rq.
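 *
 * The enqueue path below handles this with a zero-delta
 * account_cfs_rq_runtime() pass: it expires any stale local runtime and
 * tries to acquire fresh quota, and if none is available the group is
 * throttled immediately instead of running on leftovers from a previous
 * period.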
4873 */ 4874 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 4875 { 4876 if (!cfs_bandwidth_used()) 4877 return; 4878 4879 /* an active group must be handled by the update_curr()->put() path */ 4880 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 4881 return; 4882 4883 /* ensure the group is not already throttled */ 4884 if (cfs_rq_throttled(cfs_rq)) 4885 return; 4886 4887 /* update runtime allocation */ 4888 account_cfs_rq_runtime(cfs_rq, 0); 4889 if (cfs_rq->runtime_remaining <= 0) 4890 throttle_cfs_rq(cfs_rq); 4891 } 4892 4893 static void sync_throttle(struct task_group *tg, int cpu) 4894 { 4895 struct cfs_rq *pcfs_rq, *cfs_rq; 4896 4897 if (!cfs_bandwidth_used()) 4898 return; 4899 4900 if (!tg->parent) 4901 return; 4902 4903 cfs_rq = tg->cfs_rq[cpu]; 4904 pcfs_rq = tg->parent->cfs_rq[cpu]; 4905 4906 cfs_rq->throttle_count = pcfs_rq->throttle_count; 4907 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); 4908 } 4909 4910 /* conditionally throttle active cfs_rq's from put_prev_entity() */ 4911 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4912 { 4913 if (!cfs_bandwidth_used()) 4914 return false; 4915 4916 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 4917 return false; 4918 4919 /* 4920 * it's possible for a throttled entity to be forced into a running 4921 * state (e.g. set_curr_task), in this case we're finished. 4922 */ 4923 if (cfs_rq_throttled(cfs_rq)) 4924 return true; 4925 4926 throttle_cfs_rq(cfs_rq); 4927 return true; 4928 } 4929 4930 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 4931 { 4932 struct cfs_bandwidth *cfs_b = 4933 container_of(timer, struct cfs_bandwidth, slack_timer); 4934 4935 do_sched_cfs_slack_timer(cfs_b); 4936 4937 return HRTIMER_NORESTART; 4938 } 4939 4940 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) 4941 { 4942 struct cfs_bandwidth *cfs_b = 4943 container_of(timer, struct cfs_bandwidth, period_timer); 4944 int overrun; 4945 int idle = 0; 4946 4947 raw_spin_lock(&cfs_b->lock); 4948 for (;;) { 4949 overrun = hrtimer_forward_now(timer, cfs_b->period); 4950 if (!overrun) 4951 break; 4952 4953 idle = do_sched_cfs_period_timer(cfs_b, overrun); 4954 } 4955 if (idle) 4956 cfs_b->period_active = 0; 4957 raw_spin_unlock(&cfs_b->lock); 4958 4959 return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; 4960 } 4961 4962 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4963 { 4964 raw_spin_lock_init(&cfs_b->lock); 4965 cfs_b->runtime = 0; 4966 cfs_b->quota = RUNTIME_INF; 4967 cfs_b->period = ns_to_ktime(default_cfs_period()); 4968 4969 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); 4970 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); 4971 cfs_b->period_timer.function = sched_cfs_period_timer; 4972 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4973 cfs_b->slack_timer.function = sched_cfs_slack_timer; 4974 } 4975 4976 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4977 { 4978 cfs_rq->runtime_enabled = 0; 4979 INIT_LIST_HEAD(&cfs_rq->throttled_list); 4980 } 4981 4982 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4983 { 4984 lockdep_assert_held(&cfs_b->lock); 4985 4986 if (!cfs_b->period_active) { 4987 cfs_b->period_active = 1; 4988 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); 4989 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); 4990 } 4991 } 4992 4993 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4994 { 4995 /* init_cfs_bandwidth() was not called */ 4996 if (!cfs_b->throttled_cfs_rq.next) 4997 return; 4998 4999 hrtimer_cancel(&cfs_b->period_timer); 5000 hrtimer_cancel(&cfs_b->slack_timer); 5001 } 5002 5003 /* 5004 * Both these cpu hotplug callbacks race against unregister_fair_sched_group() 5005 * 5006 * The race is harmless, since modifying bandwidth settings of unhooked group 5007 * bits doesn't do much. 5008 */ 5009 5010 /* cpu online calback */ 5011 static void __maybe_unused update_runtime_enabled(struct rq *rq) 5012 { 5013 struct task_group *tg; 5014 5015 lockdep_assert_held(&rq->lock); 5016 5017 rcu_read_lock(); 5018 list_for_each_entry_rcu(tg, &task_groups, list) { 5019 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 5020 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5021 5022 raw_spin_lock(&cfs_b->lock); 5023 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; 5024 raw_spin_unlock(&cfs_b->lock); 5025 } 5026 rcu_read_unlock(); 5027 } 5028 5029 /* cpu offline callback */ 5030 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 5031 { 5032 struct task_group *tg; 5033 5034 lockdep_assert_held(&rq->lock); 5035 5036 rcu_read_lock(); 5037 list_for_each_entry_rcu(tg, &task_groups, list) { 5038 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5039 5040 if (!cfs_rq->runtime_enabled) 5041 continue; 5042 5043 /* 5044 * clock_task is not advancing so we just need to make sure 5045 * there's some valid quota amount 5046 */ 5047 cfs_rq->runtime_remaining = 1; 5048 /* 5049 * Offline rq is schedulable till cpu is completely disabled 5050 * in take_cpu_down(), so we prevent new cfs throttling here. 
5051 */ 5052 cfs_rq->runtime_enabled = 0; 5053 5054 if (cfs_rq_throttled(cfs_rq)) 5055 unthrottle_cfs_rq(cfs_rq); 5056 } 5057 rcu_read_unlock(); 5058 } 5059 5060 #else /* CONFIG_CFS_BANDWIDTH */ 5061 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 5062 { 5063 return rq_clock_task(rq_of(cfs_rq)); 5064 } 5065 5066 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 5067 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } 5068 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 5069 static inline void sync_throttle(struct task_group *tg, int cpu) {} 5070 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 5071 5072 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 5073 { 5074 return 0; 5075 } 5076 5077 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 5078 { 5079 return 0; 5080 } 5081 5082 static inline int throttled_lb_pair(struct task_group *tg, 5083 int src_cpu, int dest_cpu) 5084 { 5085 return 0; 5086 } 5087 5088 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 5089 5090 #ifdef CONFIG_FAIR_GROUP_SCHED 5091 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 5092 #endif 5093 5094 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 5095 { 5096 return NULL; 5097 } 5098 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 5099 static inline void update_runtime_enabled(struct rq *rq) {} 5100 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} 5101 5102 #endif /* CONFIG_CFS_BANDWIDTH */ 5103 5104 /************************************************** 5105 * CFS operations on tasks: 5106 */ 5107 5108 #ifdef CONFIG_SCHED_HRTICK 5109 static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 5110 { 5111 struct sched_entity *se = &p->se; 5112 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5113 5114 SCHED_WARN_ON(task_rq(p) != rq); 5115 5116 if (rq->cfs.h_nr_running > 1) { 5117 u64 slice = sched_slice(cfs_rq, se); 5118 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 5119 s64 delta = slice - ran; 5120 5121 if (delta < 0) { 5122 if (rq->curr == p) 5123 resched_curr(rq); 5124 return; 5125 } 5126 hrtick_start(rq, delta); 5127 } 5128 } 5129 5130 /* 5131 * called from enqueue/dequeue and updates the hrtick when the 5132 * current task is from our class and nr_running is low enough 5133 * to matter. 5134 */ 5135 static void hrtick_update(struct rq *rq) 5136 { 5137 struct task_struct *curr = rq->curr; 5138 5139 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) 5140 return; 5141 5142 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 5143 hrtick_start_fair(rq, curr); 5144 } 5145 #else /* !CONFIG_SCHED_HRTICK */ 5146 static inline void 5147 hrtick_start_fair(struct rq *rq, struct task_struct *p) 5148 { 5149 } 5150 5151 static inline void hrtick_update(struct rq *rq) 5152 { 5153 } 5154 #endif 5155 5156 /* 5157 * The enqueue_task method is called before nr_running is 5158 * increased. Here we update the fair scheduling stats and 5159 * then put the task into the rbtree: 5160 */ 5161 static void 5162 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) 5163 { 5164 struct cfs_rq *cfs_rq; 5165 struct sched_entity *se = &p->se; 5166 5167 /* 5168 * If in_iowait is set, the code below may not trigger any cpufreq 5169 * utilization updates, so do it here explicitly with the IOWAIT flag 5170 * passed. 
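 *
 * (The SCHED_CPUFREQ_IOWAIT hint is what allows a cpufreq governor such as
 * schedutil to temporarily boost frequency for tasks waking from I/O wait,
 * where raw utilization alone would under-estimate the capacity needed.)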
5171 */ 5172 if (p->in_iowait) 5173 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); 5174 5175 for_each_sched_entity(se) { 5176 if (se->on_rq) 5177 break; 5178 cfs_rq = cfs_rq_of(se); 5179 enqueue_entity(cfs_rq, se, flags); 5180 5181 /* 5182 * end evaluation on encountering a throttled cfs_rq 5183 * 5184 * note: in the case of encountering a throttled cfs_rq we will 5185 * post the final h_nr_running increment below. 5186 */ 5187 if (cfs_rq_throttled(cfs_rq)) 5188 break; 5189 cfs_rq->h_nr_running++; 5190 5191 flags = ENQUEUE_WAKEUP; 5192 } 5193 5194 for_each_sched_entity(se) { 5195 cfs_rq = cfs_rq_of(se); 5196 cfs_rq->h_nr_running++; 5197 5198 if (cfs_rq_throttled(cfs_rq)) 5199 break; 5200 5201 update_load_avg(cfs_rq, se, UPDATE_TG); 5202 update_cfs_group(se); 5203 } 5204 5205 if (!se) 5206 add_nr_running(rq, 1); 5207 5208 hrtick_update(rq); 5209 } 5210 5211 static void set_next_buddy(struct sched_entity *se); 5212 5213 /* 5214 * The dequeue_task method is called before nr_running is 5215 * decreased. We remove the task from the rbtree and 5216 * update the fair scheduling stats: 5217 */ 5218 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 5219 { 5220 struct cfs_rq *cfs_rq; 5221 struct sched_entity *se = &p->se; 5222 int task_sleep = flags & DEQUEUE_SLEEP; 5223 5224 for_each_sched_entity(se) { 5225 cfs_rq = cfs_rq_of(se); 5226 dequeue_entity(cfs_rq, se, flags); 5227 5228 /* 5229 * end evaluation on encountering a throttled cfs_rq 5230 * 5231 * note: in the case of encountering a throttled cfs_rq we will 5232 * post the final h_nr_running decrement below. 5233 */ 5234 if (cfs_rq_throttled(cfs_rq)) 5235 break; 5236 cfs_rq->h_nr_running--; 5237 5238 /* Don't dequeue parent if it has other entities besides us */ 5239 if (cfs_rq->load.weight) { 5240 /* Avoid re-evaluating load for this entity: */ 5241 se = parent_entity(se); 5242 /* 5243 * Bias pick_next to pick a task from this cfs_rq, as 5244 * p is sleeping when it is within its sched_slice. 5245 */ 5246 if (task_sleep && se && !throttled_hierarchy(cfs_rq)) 5247 set_next_buddy(se); 5248 break; 5249 } 5250 flags |= DEQUEUE_SLEEP; 5251 } 5252 5253 for_each_sched_entity(se) { 5254 cfs_rq = cfs_rq_of(se); 5255 cfs_rq->h_nr_running--; 5256 5257 if (cfs_rq_throttled(cfs_rq)) 5258 break; 5259 5260 update_load_avg(cfs_rq, se, UPDATE_TG); 5261 update_cfs_group(se); 5262 } 5263 5264 if (!se) 5265 sub_nr_running(rq, 1); 5266 5267 hrtick_update(rq); 5268 } 5269 5270 #ifdef CONFIG_SMP 5271 5272 /* Working cpumask for: load_balance, load_balance_newidle. */ 5273 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); 5274 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); 5275 5276 #ifdef CONFIG_NO_HZ_COMMON 5277 /* 5278 * per rq 'load' arrray crap; XXX kill this. 5279 */ 5280 5281 /* 5282 * The exact cpuload calculated at every tick would be: 5283 * 5284 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5285 * 5286 * If a cpu misses updates for n ticks (as it was idle) and update gets 5287 * called on the n+1-th tick when cpu may be busy, then we have: 5288 * 5289 * load_n = (1 - 1/2^i)^n * load_0 5290 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5291 * 5292 * decay_load_missed() below does efficient calculation of 5293 * 5294 * load' = (1 - 1/2^i)^n * load 5295 * 5296 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. 5297 * This allows us to precompute the above in said factors, thereby allowing the 5298 * reduction of an arbitrary n in O(log_2 n) steps. 
(See also 5299 * fixed_power_int()) 5300 * 5301 * The calculation is approximated on a 128 point scale. 5302 */ 5303 #define DEGRADE_SHIFT 7 5304 5305 static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 5306 static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 5307 { 0, 0, 0, 0, 0, 0, 0, 0 }, 5308 { 64, 32, 8, 0, 0, 0, 0, 0 }, 5309 { 96, 72, 40, 12, 1, 0, 0, 0 }, 5310 { 112, 98, 75, 43, 15, 1, 0, 0 }, 5311 { 120, 112, 98, 76, 45, 16, 2, 0 } 5312 }; 5313 5314 /* 5315 * Update cpu_load for any missed ticks, due to tickless idle. The backlog 5316 * would be when CPU is idle and so we just decay the old load without 5317 * adding any new load. 5318 */ 5319 static unsigned long 5320 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 5321 { 5322 int j = 0; 5323 5324 if (!missed_updates) 5325 return load; 5326 5327 if (missed_updates >= degrade_zero_ticks[idx]) 5328 return 0; 5329 5330 if (idx == 1) 5331 return load >> missed_updates; 5332 5333 while (missed_updates) { 5334 if (missed_updates % 2) 5335 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 5336 5337 missed_updates >>= 1; 5338 j++; 5339 } 5340 return load; 5341 } 5342 #endif /* CONFIG_NO_HZ_COMMON */ 5343 5344 /** 5345 * __cpu_load_update - update the rq->cpu_load[] statistics 5346 * @this_rq: The rq to update statistics for 5347 * @this_load: The current load 5348 * @pending_updates: The number of missed updates 5349 * 5350 * Update rq->cpu_load[] statistics. This function is usually called every 5351 * scheduler tick (TICK_NSEC). 5352 * 5353 * This function computes a decaying average: 5354 * 5355 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load 5356 * 5357 * Because of NOHZ it might not get called on every tick which gives need for 5358 * the @pending_updates argument. 5359 * 5360 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 5361 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load 5362 * = A * (A * load[i]_n-2 + B) + B 5363 * = A * (A * (A * load[i]_n-3 + B) + B) + B 5364 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B 5365 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B 5366 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B 5367 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load 5368 * 5369 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as 5370 * any change in load would have resulted in the tick being turned back on. 5371 * 5372 * For regular NOHZ, this reduces to: 5373 * 5374 * load[i]_n = (1 - 1/2^i)^n * load[i]_0 5375 * 5376 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra 5377 * term. 
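 *
 * Worked example for the table above: at idx 2 the per-tick factor is
 * A = 1 - 1/2^2 = 3/4. Missing 3 ticks decomposes as 3 = 2 + 1, so
 * decay_load_missed() multiplies by degrade_factor[2][0] = 96/128 (= A)
 * and degrade_factor[2][1] = 72/128 (= A^2); the combined factor is
 * 6912/16384 = 0.421875, exactly (3/4)^3, computed in O(log n)
 * multiplications rather than one per missed tick.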
5378 */ 5379 static void cpu_load_update(struct rq *this_rq, unsigned long this_load, 5380 unsigned long pending_updates) 5381 { 5382 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; 5383 int i, scale; 5384 5385 this_rq->nr_load_updates++; 5386 5387 /* Update our load: */ 5388 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 5389 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 5390 unsigned long old_load, new_load; 5391 5392 /* scale is effectively 1 << i now, and >> i divides by scale */ 5393 5394 old_load = this_rq->cpu_load[i]; 5395 #ifdef CONFIG_NO_HZ_COMMON 5396 old_load = decay_load_missed(old_load, pending_updates - 1, i); 5397 if (tickless_load) { 5398 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); 5399 /* 5400 * old_load can never be a negative value because a 5401 * decayed tickless_load cannot be greater than the 5402 * original tickless_load. 5403 */ 5404 old_load += tickless_load; 5405 } 5406 #endif 5407 new_load = this_load; 5408 /* 5409 * Round up the averaging division if load is increasing. This 5410 * prevents us from getting stuck on 9 if the load is 10, for 5411 * example. 5412 */ 5413 if (new_load > old_load) 5414 new_load += scale - 1; 5415 5416 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 5417 } 5418 5419 sched_avg_update(this_rq); 5420 } 5421 5422 /* Used instead of source_load when we know the type == 0 */ 5423 static unsigned long weighted_cpuload(struct rq *rq) 5424 { 5425 return cfs_rq_runnable_load_avg(&rq->cfs); 5426 } 5427 5428 #ifdef CONFIG_NO_HZ_COMMON 5429 /* 5430 * There is no sane way to deal with nohz on smp when using jiffies because the 5431 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 5432 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5433 * 5434 * Therefore we need to avoid the delta approach from the regular tick when 5435 * possible since that would seriously skew the load calculation. This is why we 5436 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on 5437 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle 5438 * loop exit, nohz_idle_balance, nohz full exit...) 5439 * 5440 * This means we might still be one tick off for nohz periods. 5441 */ 5442 5443 static void cpu_load_update_nohz(struct rq *this_rq, 5444 unsigned long curr_jiffies, 5445 unsigned long load) 5446 { 5447 unsigned long pending_updates; 5448 5449 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 5450 if (pending_updates) { 5451 this_rq->last_load_update_tick = curr_jiffies; 5452 /* 5453 * In the regular NOHZ case, we were idle, this means load 0. 5454 * In the NOHZ_FULL case, we were non-idle, we should consider 5455 * its weighted load. 5456 */ 5457 cpu_load_update(this_rq, load, pending_updates); 5458 } 5459 } 5460 5461 /* 5462 * Called from nohz_idle_balance() to update the load ratings before doing the 5463 * idle balance. 5464 */ 5465 static void cpu_load_update_idle(struct rq *this_rq) 5466 { 5467 /* 5468 * bail if there's load or we're actually up-to-date. 5469 */ 5470 if (weighted_cpuload(this_rq)) 5471 return; 5472 5473 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); 5474 } 5475 5476 /* 5477 * Record CPU load on nohz entry so we know the tickless load to account 5478 * on nohz exit. 
cpu_load[0] happens then to be updated more frequently 5479 * than other cpu_load[idx] but it should be fine as cpu_load readers 5480 * shouldn't rely into synchronized cpu_load[*] updates. 5481 */ 5482 void cpu_load_update_nohz_start(void) 5483 { 5484 struct rq *this_rq = this_rq(); 5485 5486 /* 5487 * This is all lockless but should be fine. If weighted_cpuload changes 5488 * concurrently we'll exit nohz. And cpu_load write can race with 5489 * cpu_load_update_idle() but both updater would be writing the same. 5490 */ 5491 this_rq->cpu_load[0] = weighted_cpuload(this_rq); 5492 } 5493 5494 /* 5495 * Account the tickless load in the end of a nohz frame. 5496 */ 5497 void cpu_load_update_nohz_stop(void) 5498 { 5499 unsigned long curr_jiffies = READ_ONCE(jiffies); 5500 struct rq *this_rq = this_rq(); 5501 unsigned long load; 5502 struct rq_flags rf; 5503 5504 if (curr_jiffies == this_rq->last_load_update_tick) 5505 return; 5506 5507 load = weighted_cpuload(this_rq); 5508 rq_lock(this_rq, &rf); 5509 update_rq_clock(this_rq); 5510 cpu_load_update_nohz(this_rq, curr_jiffies, load); 5511 rq_unlock(this_rq, &rf); 5512 } 5513 #else /* !CONFIG_NO_HZ_COMMON */ 5514 static inline void cpu_load_update_nohz(struct rq *this_rq, 5515 unsigned long curr_jiffies, 5516 unsigned long load) { } 5517 #endif /* CONFIG_NO_HZ_COMMON */ 5518 5519 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) 5520 { 5521 #ifdef CONFIG_NO_HZ_COMMON 5522 /* See the mess around cpu_load_update_nohz(). */ 5523 this_rq->last_load_update_tick = READ_ONCE(jiffies); 5524 #endif 5525 cpu_load_update(this_rq, load, 1); 5526 } 5527 5528 /* 5529 * Called from scheduler_tick() 5530 */ 5531 void cpu_load_update_active(struct rq *this_rq) 5532 { 5533 unsigned long load = weighted_cpuload(this_rq); 5534 5535 if (tick_nohz_tick_stopped()) 5536 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); 5537 else 5538 cpu_load_update_periodic(this_rq, load); 5539 } 5540 5541 /* 5542 * Return a low guess at the load of a migration-source cpu weighted 5543 * according to the scheduling class and "nice" value. 5544 * 5545 * We want to under-estimate the load of migration sources, to 5546 * balance conservatively. 5547 */ 5548 static unsigned long source_load(int cpu, int type) 5549 { 5550 struct rq *rq = cpu_rq(cpu); 5551 unsigned long total = weighted_cpuload(rq); 5552 5553 if (type == 0 || !sched_feat(LB_BIAS)) 5554 return total; 5555 5556 return min(rq->cpu_load[type-1], total); 5557 } 5558 5559 /* 5560 * Return a high guess at the load of a migration-target cpu weighted 5561 * according to the scheduling class and "nice" value. 
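 *
 * We want to over-estimate the load of migration targets, so that the
 * balancer errs on the side of leaving tasks where they are.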
5562 */ 5563 static unsigned long target_load(int cpu, int type) 5564 { 5565 struct rq *rq = cpu_rq(cpu); 5566 unsigned long total = weighted_cpuload(rq); 5567 5568 if (type == 0 || !sched_feat(LB_BIAS)) 5569 return total; 5570 5571 return max(rq->cpu_load[type-1], total); 5572 } 5573 5574 static unsigned long capacity_of(int cpu) 5575 { 5576 return cpu_rq(cpu)->cpu_capacity; 5577 } 5578 5579 static unsigned long capacity_orig_of(int cpu) 5580 { 5581 return cpu_rq(cpu)->cpu_capacity_orig; 5582 } 5583 5584 static unsigned long cpu_avg_load_per_task(int cpu) 5585 { 5586 struct rq *rq = cpu_rq(cpu); 5587 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 5588 unsigned long load_avg = weighted_cpuload(rq); 5589 5590 if (nr_running) 5591 return load_avg / nr_running; 5592 5593 return 0; 5594 } 5595 5596 static void record_wakee(struct task_struct *p) 5597 { 5598 /* 5599 * Only decay a single time; tasks that have less than 1 wakeup per 5600 * jiffy will not have built up many flips. 5601 */ 5602 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { 5603 current->wakee_flips >>= 1; 5604 current->wakee_flip_decay_ts = jiffies; 5605 } 5606 5607 if (current->last_wakee != p) { 5608 current->last_wakee = p; 5609 current->wakee_flips++; 5610 } 5611 } 5612 5613 /* 5614 * Detect M:N waker/wakee relationships via a switching-frequency heuristic. 5615 * 5616 * A waker of many should wake a different task than the one last awakened 5617 * at a frequency roughly N times higher than one of its wakees. 5618 * 5619 * In order to determine whether we should let the load spread vs consolidating 5620 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one 5621 * partner, and a factor of llc_size higher frequency in the other. 5622 * 5623 * With both conditions met, we can be relatively sure that the relationship is 5624 * non-monogamous, with partner count exceeding socket size. 5625 * 5626 * Waker/wakee being client/server, worker/dispatcher, interrupt source or 5627 * whatever is irrelevant; the spread criterion is simply that the apparent 5628 * partner count exceeds the socket size. 5629 */ 5630 static int wake_wide(struct task_struct *p) 5631 { 5632 unsigned int master = current->wakee_flips; 5633 unsigned int slave = p->wakee_flips; 5634 int factor = this_cpu_read(sd_llc_size); 5635 5636 if (master < slave) 5637 swap(master, slave); 5638 if (slave < factor || master < slave * factor) 5639 return 0; 5640 return 1; 5641 } 5642 5643 /* 5644 * The purpose of wake_affine() is to quickly determine on which CPU we can run 5645 * soonest. For the purpose of speed we only consider the waking and previous 5646 * CPU. 5647 * 5648 * wake_affine_idle() - only considers 'now', it checks whether the waking CPU 5649 * is (or will be) idle. 5650 * 5651 * wake_affine_weight() - considers the weight to reflect the average 5652 * scheduling latency of the CPUs. This seems to work 5653 * for the overloaded case.
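 *
 * Schematically, wake_affine_weight() below compares
 *
 *   (target_load(this_cpu) - sync_waker_load + task_load) * capacity(prev_cpu)
 *     vs.
 *   (source_load(prev_cpu) - task_load) * capacity(this_cpu)
 *
 * (with WA_BIAS additionally scaling the prev_cpu side up by roughly half
 * of imbalance_pct, which slightly favours the affine move) and picks the
 * waking CPU only when its side comes out lower or equal.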
5654 */ 5655 5656 static bool 5657 wake_affine_idle(struct sched_domain *sd, struct task_struct *p, 5658 int this_cpu, int prev_cpu, int sync) 5659 { 5660 if (idle_cpu(this_cpu)) 5661 return true; 5662 5663 if (sync && cpu_rq(this_cpu)->nr_running == 1) 5664 return true; 5665 5666 return false; 5667 } 5668 5669 static bool 5670 wake_affine_weight(struct sched_domain *sd, struct task_struct *p, 5671 int this_cpu, int prev_cpu, int sync) 5672 { 5673 s64 this_eff_load, prev_eff_load; 5674 unsigned long task_load; 5675 5676 this_eff_load = target_load(this_cpu, sd->wake_idx); 5677 prev_eff_load = source_load(prev_cpu, sd->wake_idx); 5678 5679 if (sync) { 5680 unsigned long current_load = task_h_load(current); 5681 5682 if (current_load > this_eff_load) 5683 return true; 5684 5685 this_eff_load -= current_load; 5686 } 5687 5688 task_load = task_h_load(p); 5689 5690 this_eff_load += task_load; 5691 if (sched_feat(WA_BIAS)) 5692 this_eff_load *= 100; 5693 this_eff_load *= capacity_of(prev_cpu); 5694 5695 prev_eff_load -= task_load; 5696 if (sched_feat(WA_BIAS)) 5697 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5698 prev_eff_load *= capacity_of(this_cpu); 5699 5700 return this_eff_load <= prev_eff_load; 5701 } 5702 5703 static int wake_affine(struct sched_domain *sd, struct task_struct *p, 5704 int prev_cpu, int sync) 5705 { 5706 int this_cpu = smp_processor_id(); 5707 bool affine = false; 5708 5709 if (sched_feat(WA_IDLE) && !affine) 5710 affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); 5711 5712 if (sched_feat(WA_WEIGHT) && !affine) 5713 affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); 5714 5715 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); 5716 if (affine) { 5717 schedstat_inc(sd->ttwu_move_affine); 5718 schedstat_inc(p->se.statistics.nr_wakeups_affine); 5719 } 5720 5721 return affine; 5722 } 5723 5724 static inline int task_util(struct task_struct *p); 5725 static int cpu_util_wake(int cpu, struct task_struct *p); 5726 5727 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) 5728 { 5729 return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); 5730 } 5731 5732 /* 5733 * find_idlest_group finds and returns the least busy CPU group within the 5734 * domain. 5735 * 5736 * Assumes p is allowed on at least one CPU in sd. 
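 *
 * Two criteria are tracked while walking the groups: the group with the
 * most spare capacity (preferred for tasks that fit, see the task_util()/2
 * cut-off below) and the group with the smallest runnable load, with
 * avg_load only used as a tie-breaker when the runnable loads are within
 * the imbalance margin.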
5737 */ 5738 static struct sched_group * 5739 find_idlest_group(struct sched_domain *sd, struct task_struct *p, 5740 int this_cpu, int sd_flag) 5741 { 5742 struct sched_group *idlest = NULL, *group = sd->groups; 5743 struct sched_group *most_spare_sg = NULL; 5744 unsigned long min_runnable_load = ULONG_MAX; 5745 unsigned long this_runnable_load = ULONG_MAX; 5746 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; 5747 unsigned long most_spare = 0, this_spare = 0; 5748 int load_idx = sd->forkexec_idx; 5749 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; 5750 unsigned long imbalance = scale_load_down(NICE_0_LOAD) * 5751 (sd->imbalance_pct-100) / 100; 5752 5753 if (sd_flag & SD_BALANCE_WAKE) 5754 load_idx = sd->wake_idx; 5755 5756 do { 5757 unsigned long load, avg_load, runnable_load; 5758 unsigned long spare_cap, max_spare_cap; 5759 int local_group; 5760 int i; 5761 5762 /* Skip over this group if it has no CPUs allowed */ 5763 if (!cpumask_intersects(sched_group_span(group), 5764 &p->cpus_allowed)) 5765 continue; 5766 5767 local_group = cpumask_test_cpu(this_cpu, 5768 sched_group_span(group)); 5769 5770 /* 5771 * Tally up the load of all CPUs in the group and find 5772 * the group containing the CPU with most spare capacity. 5773 */ 5774 avg_load = 0; 5775 runnable_load = 0; 5776 max_spare_cap = 0; 5777 5778 for_each_cpu(i, sched_group_span(group)) { 5779 /* Bias balancing toward cpus of our domain */ 5780 if (local_group) 5781 load = source_load(i, load_idx); 5782 else 5783 load = target_load(i, load_idx); 5784 5785 runnable_load += load; 5786 5787 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); 5788 5789 spare_cap = capacity_spare_wake(i, p); 5790 5791 if (spare_cap > max_spare_cap) 5792 max_spare_cap = spare_cap; 5793 } 5794 5795 /* Adjust by relative CPU capacity of the group */ 5796 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / 5797 group->sgc->capacity; 5798 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / 5799 group->sgc->capacity; 5800 5801 if (local_group) { 5802 this_runnable_load = runnable_load; 5803 this_avg_load = avg_load; 5804 this_spare = max_spare_cap; 5805 } else { 5806 if (min_runnable_load > (runnable_load + imbalance)) { 5807 /* 5808 * The runnable load is significantly smaller 5809 * so we can pick this new cpu 5810 */ 5811 min_runnable_load = runnable_load; 5812 min_avg_load = avg_load; 5813 idlest = group; 5814 } else if ((runnable_load < (min_runnable_load + imbalance)) && 5815 (100*min_avg_load > imbalance_scale*avg_load)) { 5816 /* 5817 * The runnable loads are close so take the 5818 * blocked load into account through avg_load. 5819 */ 5820 min_avg_load = avg_load; 5821 idlest = group; 5822 } 5823 5824 if (most_spare < max_spare_cap) { 5825 most_spare = max_spare_cap; 5826 most_spare_sg = group; 5827 } 5828 } 5829 } while (group = group->next, group != sd->groups); 5830 5831 /* 5832 * The cross-over point between using spare capacity or least load 5833 * is too conservative for high utilization tasks on partially 5834 * utilized systems if we require spare_capacity > task_util(p), 5835 * so we allow for some task stuffing by using 5836 * spare_capacity > task_util(p)/2. 5837 * 5838 * Spare capacity can't be used for fork because the utilization has 5839 * not been set yet, we must first select a rq to compute the initial 5840 * utilization. 
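 *
 * E.g. for a task with task_util() of 400, a group already counts as
 * having enough spare capacity once more than 200 units are free, rather
 * than requiring a full 400; this is the "task stuffing" allowance
 * described above.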
5841 */ 5842 if (sd_flag & SD_BALANCE_FORK) 5843 goto skip_spare; 5844 5845 if (this_spare > task_util(p) / 2 && 5846 imbalance_scale*this_spare > 100*most_spare) 5847 return NULL; 5848 5849 if (most_spare > task_util(p) / 2) 5850 return most_spare_sg; 5851 5852 skip_spare: 5853 if (!idlest) 5854 return NULL; 5855 5856 if (min_runnable_load > (this_runnable_load + imbalance)) 5857 return NULL; 5858 5859 if ((this_runnable_load < (min_runnable_load + imbalance)) && 5860 (100*this_avg_load < imbalance_scale*min_avg_load)) 5861 return NULL; 5862 5863 return idlest; 5864 } 5865 5866 /* 5867 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 5868 */ 5869 static int 5870 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 5871 { 5872 unsigned long load, min_load = ULONG_MAX; 5873 unsigned int min_exit_latency = UINT_MAX; 5874 u64 latest_idle_timestamp = 0; 5875 int least_loaded_cpu = this_cpu; 5876 int shallowest_idle_cpu = -1; 5877 int i; 5878 5879 /* Check if we have any choice: */ 5880 if (group->group_weight == 1) 5881 return cpumask_first(sched_group_span(group)); 5882 5883 /* Traverse only the allowed CPUs */ 5884 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { 5885 if (idle_cpu(i)) { 5886 struct rq *rq = cpu_rq(i); 5887 struct cpuidle_state *idle = idle_get_state(rq); 5888 if (idle && idle->exit_latency < min_exit_latency) { 5889 /* 5890 * We give priority to a CPU whose idle state 5891 * has the smallest exit latency irrespective 5892 * of any idle timestamp. 5893 */ 5894 min_exit_latency = idle->exit_latency; 5895 latest_idle_timestamp = rq->idle_stamp; 5896 shallowest_idle_cpu = i; 5897 } else if ((!idle || idle->exit_latency == min_exit_latency) && 5898 rq->idle_stamp > latest_idle_timestamp) { 5899 /* 5900 * If equal or no active idle state, then 5901 * the most recently idled CPU might have 5902 * a warmer cache. 5903 */ 5904 latest_idle_timestamp = rq->idle_stamp; 5905 shallowest_idle_cpu = i; 5906 } 5907 } else if (shallowest_idle_cpu == -1) { 5908 load = weighted_cpuload(cpu_rq(i)); 5909 if (load < min_load || (load == min_load && i == this_cpu)) { 5910 min_load = load; 5911 least_loaded_cpu = i; 5912 } 5913 } 5914 } 5915 5916 return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; 5917 } 5918 5919 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, 5920 int cpu, int prev_cpu, int sd_flag) 5921 { 5922 int new_cpu = cpu; 5923 5924 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) 5925 return prev_cpu; 5926 5927 while (sd) { 5928 struct sched_group *group; 5929 struct sched_domain *tmp; 5930 int weight; 5931 5932 if (!(sd->flags & sd_flag)) { 5933 sd = sd->child; 5934 continue; 5935 } 5936 5937 group = find_idlest_group(sd, p, cpu, sd_flag); 5938 if (!group) { 5939 sd = sd->child; 5940 continue; 5941 } 5942 5943 new_cpu = find_idlest_group_cpu(group, p, cpu); 5944 if (new_cpu == cpu) { 5945 /* Now try balancing at a lower domain level of cpu */ 5946 sd = sd->child; 5947 continue; 5948 } 5949 5950 /* Now try balancing at a lower domain level of new_cpu */ 5951 cpu = new_cpu; 5952 weight = sd->span_weight; 5953 sd = NULL; 5954 for_each_domain(cpu, tmp) { 5955 if (weight <= tmp->span_weight) 5956 break; 5957 if (tmp->flags & sd_flag) 5958 sd = tmp; 5959 } 5960 /* while loop will break here if sd == NULL */ 5961 } 5962 5963 return new_cpu; 5964 } 5965 5966 #ifdef CONFIG_SCHED_SMT 5967 5968 static inline void set_idle_cores(int cpu, int val) 5969 { 5970 struct sched_domain_shared *sds; 5971 5972 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 5973 if (sds) 5974 WRITE_ONCE(sds->has_idle_cores, val); 5975 } 5976 5977 static inline bool test_idle_cores(int cpu, bool def) 5978 { 5979 struct sched_domain_shared *sds; 5980 5981 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 5982 if (sds) 5983 return READ_ONCE(sds->has_idle_cores); 5984 5985 return def; 5986 } 5987 5988 /* 5989 * Scans the local SMT mask to see if the entire core is idle, and records this 5990 * information in sd_llc_shared->has_idle_cores. 5991 * 5992 * Since SMT siblings share all cache levels, inspecting this limited remote 5993 * state should be fairly cheap. 5994 */ 5995 void __update_idle_core(struct rq *rq) 5996 { 5997 int core = cpu_of(rq); 5998 int cpu; 5999 6000 rcu_read_lock(); 6001 if (test_idle_cores(core, true)) 6002 goto unlock; 6003 6004 for_each_cpu(cpu, cpu_smt_mask(core)) { 6005 if (cpu == core) 6006 continue; 6007 6008 if (!idle_cpu(cpu)) 6009 goto unlock; 6010 } 6011 6012 set_idle_cores(core, 1); 6013 unlock: 6014 rcu_read_unlock(); 6015 } 6016 6017 /* 6018 * Scan the entire LLC domain for idle cores; this dynamically switches off if 6019 * there are no idle cores left in the system; tracked through 6020 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. 6021 */ 6022 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) 6023 { 6024 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); 6025 int core, cpu; 6026 6027 if (!static_branch_likely(&sched_smt_present)) 6028 return -1; 6029 6030 if (!test_idle_cores(target, false)) 6031 return -1; 6032 6033 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); 6034 6035 for_each_cpu_wrap(core, cpus, target) { 6036 bool idle = true; 6037 6038 for_each_cpu(cpu, cpu_smt_mask(core)) { 6039 cpumask_clear_cpu(cpu, cpus); 6040 if (!idle_cpu(cpu)) 6041 idle = false; 6042 } 6043 6044 if (idle) 6045 return core; 6046 } 6047 6048 /* 6049 * Failed to find an idle core; stop looking for one. 6050 */ 6051 set_idle_cores(target, 0); 6052 6053 return -1; 6054 } 6055 6056 /* 6057 * Scan the local SMT mask for idle CPUs. 
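 *
 * This is the narrowest and last of the three scans driven by
 * select_idle_sibling(): only the SMT siblings of @target are
 * inspected, and only after the idle-core scan (select_idle_core())
 * and the LLC-wide scan (select_idle_cpu()) have come up empty.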
6058 */ 6059 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 6060 { 6061 int cpu; 6062 6063 if (!static_branch_likely(&sched_smt_present)) 6064 return -1; 6065 6066 for_each_cpu(cpu, cpu_smt_mask(target)) { 6067 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6068 continue; 6069 if (idle_cpu(cpu)) 6070 return cpu; 6071 } 6072 6073 return -1; 6074 } 6075 6076 #else /* CONFIG_SCHED_SMT */ 6077 6078 static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) 6079 { 6080 return -1; 6081 } 6082 6083 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 6084 { 6085 return -1; 6086 } 6087 6088 #endif /* CONFIG_SCHED_SMT */ 6089 6090 /* 6091 * Scan the LLC domain for idle CPUs; this is dynamically regulated by 6092 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the 6093 * average idle time for this rq (as found in rq->avg_idle). 6094 */ 6095 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) 6096 { 6097 struct sched_domain *this_sd; 6098 u64 avg_cost, avg_idle; 6099 u64 time, cost; 6100 s64 delta; 6101 int cpu, nr = INT_MAX; 6102 6103 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); 6104 if (!this_sd) 6105 return -1; 6106 6107 /* 6108 * Due to large variance we need a large fuzz factor; hackbench in 6109 * particularly is sensitive here. 6110 */ 6111 avg_idle = this_rq()->avg_idle / 512; 6112 avg_cost = this_sd->avg_scan_cost + 1; 6113 6114 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) 6115 return -1; 6116 6117 if (sched_feat(SIS_PROP)) { 6118 u64 span_avg = sd->span_weight * avg_idle; 6119 if (span_avg > 4*avg_cost) 6120 nr = div_u64(span_avg, avg_cost); 6121 else 6122 nr = 4; 6123 } 6124 6125 time = local_clock(); 6126 6127 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { 6128 if (!--nr) 6129 return -1; 6130 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6131 continue; 6132 if (idle_cpu(cpu)) 6133 break; 6134 } 6135 6136 time = local_clock() - time; 6137 cost = this_sd->avg_scan_cost; 6138 delta = (s64)(time - cost) / 8; 6139 this_sd->avg_scan_cost += delta; 6140 6141 return cpu; 6142 } 6143 6144 /* 6145 * Try and locate an idle core/thread in the LLC cache domain. 6146 */ 6147 static int select_idle_sibling(struct task_struct *p, int prev, int target) 6148 { 6149 struct sched_domain *sd; 6150 int i; 6151 6152 if (idle_cpu(target)) 6153 return target; 6154 6155 /* 6156 * If the previous cpu is cache affine and idle, don't be stupid. 6157 */ 6158 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) 6159 return prev; 6160 6161 sd = rcu_dereference(per_cpu(sd_llc, target)); 6162 if (!sd) 6163 return target; 6164 6165 i = select_idle_core(p, sd, target); 6166 if ((unsigned)i < nr_cpumask_bits) 6167 return i; 6168 6169 i = select_idle_cpu(p, sd, target); 6170 if ((unsigned)i < nr_cpumask_bits) 6171 return i; 6172 6173 i = select_idle_smt(p, sd, target); 6174 if ((unsigned)i < nr_cpumask_bits) 6175 return i; 6176 6177 return target; 6178 } 6179 6180 /* 6181 * cpu_util returns the amount of capacity of a CPU that is used by CFS 6182 * tasks. The unit of the return value must be the one of capacity so we can 6183 * compare the utilization with the capacity of the CPU that is available for 6184 * CFS task (ie cpu_capacity). 6185 * 6186 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the 6187 * recent utilization of currently non-runnable tasks on a CPU. 
It represents 6188 * the amount of utilization of a CPU in the range [0..capacity_orig] where 6189 * capacity_orig is the cpu_capacity available at the highest frequency 6190 * (arch_scale_freq_capacity()). 6191 * The utilization of a CPU converges towards a sum equal to or less than the 6192 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is 6193 * the running time on this CPU scaled by capacity_curr. 6194 * 6195 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even 6196 * higher than capacity_orig because of unfortunate rounding in 6197 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until 6198 * the average stabilizes with the new running time. We need to check that the 6199 * utilization stays within the range of [0..capacity_orig] and cap it if 6200 * necessary. Without utilization capping, a group could be seen as overloaded 6201 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of 6202 * available capacity. We allow utilization to overshoot capacity_curr (but not 6203 * capacity_orig) as it useful for predicting the capacity required after task 6204 * migrations (scheduler-driven DVFS). 6205 */ 6206 static int cpu_util(int cpu) 6207 { 6208 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; 6209 unsigned long capacity = capacity_orig_of(cpu); 6210 6211 return (util >= capacity) ? capacity : util; 6212 } 6213 6214 static inline int task_util(struct task_struct *p) 6215 { 6216 return p->se.avg.util_avg; 6217 } 6218 6219 /* 6220 * cpu_util_wake: Compute cpu utilization with any contributions from 6221 * the waking task p removed. 6222 */ 6223 static int cpu_util_wake(int cpu, struct task_struct *p) 6224 { 6225 unsigned long util, capacity; 6226 6227 /* Task has no contribution or is new */ 6228 if (cpu != task_cpu(p) || !p->se.avg.last_update_time) 6229 return cpu_util(cpu); 6230 6231 capacity = capacity_orig_of(cpu); 6232 util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); 6233 6234 return (util >= capacity) ? capacity : util; 6235 } 6236 6237 /* 6238 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the 6239 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. 6240 * 6241 * In that case WAKE_AFFINE doesn't make sense and we'll let 6242 * BALANCE_WAKE sort things out. 6243 */ 6244 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) 6245 { 6246 long min_cap, max_cap; 6247 6248 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); 6249 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; 6250 6251 /* Minimum capacity is close to max, no need to abort wake_affine */ 6252 if (max_cap - min_cap < max_cap >> 3) 6253 return 0; 6254 6255 /* Bring task utilization in sync with prev_cpu */ 6256 sync_entity_load_avg(&p->se); 6257 6258 return min_cap * 1024 < task_util(p) * capacity_margin; 6259 } 6260 6261 /* 6262 * select_task_rq_fair: Select target runqueue for the waking task in domains 6263 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6264 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6265 * 6266 * Balances load by selecting the idlest cpu in the idlest group, or under 6267 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. 6268 * 6269 * Returns the target cpu number. 6270 * 6271 * preempt must be disabled. 
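 *
 * In rough terms the flow below is: when no domain with @sd_flag set
 * is found (the common case for plain wake-ups), select_idle_sibling()
 * looks for an idle CPU near the target, and wake_affine() (or
 * cpu == prev_cpu) decides whether that target is the waker's cpu or
 * prev_cpu; otherwise find_idlest_cpu() walks down the domain
 * hierarchy, which is the usual path for fork and exec balancing.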
6272 */ 6273 static int 6274 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) 6275 { 6276 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 6277 int cpu = smp_processor_id(); 6278 int new_cpu = prev_cpu; 6279 int want_affine = 0; 6280 int sync = wake_flags & WF_SYNC; 6281 6282 if (sd_flag & SD_BALANCE_WAKE) { 6283 record_wakee(p); 6284 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) 6285 && cpumask_test_cpu(cpu, &p->cpus_allowed); 6286 } 6287 6288 rcu_read_lock(); 6289 for_each_domain(cpu, tmp) { 6290 if (!(tmp->flags & SD_LOAD_BALANCE)) 6291 break; 6292 6293 /* 6294 * If both cpu and prev_cpu are part of this domain, 6295 * cpu is a valid SD_WAKE_AFFINE target. 6296 */ 6297 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 6298 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 6299 affine_sd = tmp; 6300 break; 6301 } 6302 6303 if (tmp->flags & sd_flag) 6304 sd = tmp; 6305 else if (!want_affine) 6306 break; 6307 } 6308 6309 if (affine_sd) { 6310 sd = NULL; /* Prefer wake_affine over balance flags */ 6311 if (cpu == prev_cpu) 6312 goto pick_cpu; 6313 6314 if (wake_affine(affine_sd, p, prev_cpu, sync)) 6315 new_cpu = cpu; 6316 } 6317 6318 if (sd && !(sd_flag & SD_BALANCE_FORK)) { 6319 /* 6320 * We're going to need the task's util for capacity_spare_wake 6321 * in find_idlest_group. Sync it up to prev_cpu's 6322 * last_update_time. 6323 */ 6324 sync_entity_load_avg(&p->se); 6325 } 6326 6327 if (!sd) { 6328 pick_cpu: 6329 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 6330 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); 6331 6332 } else { 6333 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); 6334 } 6335 rcu_read_unlock(); 6336 6337 return new_cpu; 6338 } 6339 6340 static void detach_entity_cfs_rq(struct sched_entity *se); 6341 6342 /* 6343 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6344 * cfs_rq_of(p) references at time of call are still valid and identify the 6345 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6346 */ 6347 static void migrate_task_rq_fair(struct task_struct *p) 6348 { 6349 /* 6350 * As blocked tasks retain absolute vruntime the migration needs to 6351 * deal with this by subtracting the old and adding the new 6352 * min_vruntime -- the latter is done by enqueue_entity() when placing 6353 * the task on the new runqueue. 6354 */ 6355 if (p->state == TASK_WAKING) { 6356 struct sched_entity *se = &p->se; 6357 struct cfs_rq *cfs_rq = cfs_rq_of(se); 6358 u64 min_vruntime; 6359 6360 #ifndef CONFIG_64BIT 6361 u64 min_vruntime_copy; 6362 6363 do { 6364 min_vruntime_copy = cfs_rq->min_vruntime_copy; 6365 smp_rmb(); 6366 min_vruntime = cfs_rq->min_vruntime; 6367 } while (min_vruntime != min_vruntime_copy); 6368 #else 6369 min_vruntime = cfs_rq->min_vruntime; 6370 #endif 6371 6372 se->vruntime -= min_vruntime; 6373 } 6374 6375 if (p->on_rq == TASK_ON_RQ_MIGRATING) { 6376 /* 6377 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' 6378 * rq->lock and can modify state directly. 6379 */ 6380 lockdep_assert_held(&task_rq(p)->lock); 6381 detach_entity_cfs_rq(&p->se); 6382 6383 } else { 6384 /* 6385 * We are supposed to update the task to "current" time, then 6386 * its up to date and ready to go to new CPU/cfs_rq. But we 6387 * have difficulty in getting what current time is, so simply 6388 * throw away the out-of-date time. This will result in the 6389 * wakee task is less decayed, but giving the wakee more load 6390 * sounds not bad. 
6391 */ 6392 remove_entity_load_avg(&p->se); 6393 } 6394 6395 /* Tell new CPU we are migrated */ 6396 p->se.avg.last_update_time = 0; 6397 6398 /* We have migrated, no longer consider this task hot */ 6399 p->se.exec_start = 0; 6400 } 6401 6402 static void task_dead_fair(struct task_struct *p) 6403 { 6404 remove_entity_load_avg(&p->se); 6405 } 6406 #endif /* CONFIG_SMP */ 6407 6408 static unsigned long 6409 wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 6410 { 6411 unsigned long gran = sysctl_sched_wakeup_granularity; 6412 6413 /* 6414 * Since its curr running now, convert the gran from real-time 6415 * to virtual-time in his units. 6416 * 6417 * By using 'se' instead of 'curr' we penalize light tasks, so 6418 * they get preempted easier. That is, if 'se' < 'curr' then 6419 * the resulting gran will be larger, therefore penalizing the 6420 * lighter, if otoh 'se' > 'curr' then the resulting gran will 6421 * be smaller, again penalizing the lighter task. 6422 * 6423 * This is especially important for buddies when the leftmost 6424 * task is higher priority than the buddy. 6425 */ 6426 return calc_delta_fair(gran, se); 6427 } 6428 6429 /* 6430 * Should 'se' preempt 'curr'. 6431 * 6432 * |s1 6433 * |s2 6434 * |s3 6435 * g 6436 * |<--->|c 6437 * 6438 * w(c, s1) = -1 6439 * w(c, s2) = 0 6440 * w(c, s3) = 1 6441 * 6442 */ 6443 static int 6444 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) 6445 { 6446 s64 gran, vdiff = curr->vruntime - se->vruntime; 6447 6448 if (vdiff <= 0) 6449 return -1; 6450 6451 gran = wakeup_gran(curr, se); 6452 if (vdiff > gran) 6453 return 1; 6454 6455 return 0; 6456 } 6457 6458 static void set_last_buddy(struct sched_entity *se) 6459 { 6460 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) 6461 return; 6462 6463 for_each_sched_entity(se) { 6464 if (SCHED_WARN_ON(!se->on_rq)) 6465 return; 6466 cfs_rq_of(se)->last = se; 6467 } 6468 } 6469 6470 static void set_next_buddy(struct sched_entity *se) 6471 { 6472 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) 6473 return; 6474 6475 for_each_sched_entity(se) { 6476 if (SCHED_WARN_ON(!se->on_rq)) 6477 return; 6478 cfs_rq_of(se)->next = se; 6479 } 6480 } 6481 6482 static void set_skip_buddy(struct sched_entity *se) 6483 { 6484 for_each_sched_entity(se) 6485 cfs_rq_of(se)->skip = se; 6486 } 6487 6488 /* 6489 * Preempt the current task with a newly woken task if needed: 6490 */ 6491 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 6492 { 6493 struct task_struct *curr = rq->curr; 6494 struct sched_entity *se = &curr->se, *pse = &p->se; 6495 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 6496 int scale = cfs_rq->nr_running >= sched_nr_latency; 6497 int next_buddy_marked = 0; 6498 6499 if (unlikely(se == pse)) 6500 return; 6501 6502 /* 6503 * This is possible from callers such as attach_tasks(), in which we 6504 * unconditionally check_prempt_curr() after an enqueue (which may have 6505 * lead to a throttle). This both saves work and prevents false 6506 * next-buddy nomination below. 6507 */ 6508 if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) 6509 return; 6510 6511 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 6512 set_next_buddy(pse); 6513 next_buddy_marked = 1; 6514 } 6515 6516 /* 6517 * We can come here with TIF_NEED_RESCHED already set from new task 6518 * wake up path. 6519 * 6520 * Note: this also catches the edge-case of curr being in a throttled 6521 * group (e.g. 
via set_curr_task), since update_curr() (in the 6522 * enqueue of curr) will have resulted in resched being set. This 6523 * prevents us from potentially nominating it as a false LAST_BUDDY 6524 * below. 6525 */ 6526 if (test_tsk_need_resched(curr)) 6527 return; 6528 6529 /* Idle tasks are by definition preempted by non-idle tasks. */ 6530 if (unlikely(curr->policy == SCHED_IDLE) && 6531 likely(p->policy != SCHED_IDLE)) 6532 goto preempt; 6533 6534 /* 6535 * Batch and idle tasks do not preempt non-idle tasks (their preemption 6536 * is driven by the tick): 6537 */ 6538 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) 6539 return; 6540 6541 find_matching_se(&se, &pse); 6542 update_curr(cfs_rq_of(se)); 6543 BUG_ON(!pse); 6544 if (wakeup_preempt_entity(se, pse) == 1) { 6545 /* 6546 * Bias pick_next to pick the sched entity that is 6547 * triggering this preemption. 6548 */ 6549 if (!next_buddy_marked) 6550 set_next_buddy(pse); 6551 goto preempt; 6552 } 6553 6554 return; 6555 6556 preempt: 6557 resched_curr(rq); 6558 /* 6559 * Only set the backward buddy when the current task is still 6560 * on the rq. This can happen when a wakeup gets interleaved 6561 * with schedule on the ->pre_schedule() or idle_balance() 6562 * point, either of which can * drop the rq lock. 6563 * 6564 * Also, during early boot the idle thread is in the fair class, 6565 * for obvious reasons its a bad idea to schedule back to it. 6566 */ 6567 if (unlikely(!se->on_rq || curr == rq->idle)) 6568 return; 6569 6570 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 6571 set_last_buddy(se); 6572 } 6573 6574 static struct task_struct * 6575 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 6576 { 6577 struct cfs_rq *cfs_rq = &rq->cfs; 6578 struct sched_entity *se; 6579 struct task_struct *p; 6580 int new_tasks; 6581 6582 again: 6583 if (!cfs_rq->nr_running) 6584 goto idle; 6585 6586 #ifdef CONFIG_FAIR_GROUP_SCHED 6587 if (prev->sched_class != &fair_sched_class) 6588 goto simple; 6589 6590 /* 6591 * Because of the set_next_buddy() in dequeue_task_fair() it is rather 6592 * likely that a next task is from the same cgroup as the current. 6593 * 6594 * Therefore attempt to avoid putting and setting the entire cgroup 6595 * hierarchy, only change the part that actually changes. 6596 */ 6597 6598 do { 6599 struct sched_entity *curr = cfs_rq->curr; 6600 6601 /* 6602 * Since we got here without doing put_prev_entity() we also 6603 * have to consider cfs_rq->curr. If it is still a runnable 6604 * entity, update_curr() will update its vruntime, otherwise 6605 * forget we've ever seen it. 6606 */ 6607 if (curr) { 6608 if (curr->on_rq) 6609 update_curr(cfs_rq); 6610 else 6611 curr = NULL; 6612 6613 /* 6614 * This call to check_cfs_rq_runtime() will do the 6615 * throttle and dequeue its entity in the parent(s). 6616 * Therefore the nr_running test will indeed 6617 * be correct. 6618 */ 6619 if (unlikely(check_cfs_rq_runtime(cfs_rq))) { 6620 cfs_rq = &rq->cfs; 6621 6622 if (!cfs_rq->nr_running) 6623 goto idle; 6624 6625 goto simple; 6626 } 6627 } 6628 6629 se = pick_next_entity(cfs_rq, curr); 6630 cfs_rq = group_cfs_rq(se); 6631 } while (cfs_rq); 6632 6633 p = task_of(se); 6634 6635 /* 6636 * Since we haven't yet done put_prev_entity and if the selected task 6637 * is a different task than we started out with, try and touch the 6638 * least amount of cfs_rqs. 
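 *
 * As an illustration (hypothetical hierarchy): if prev ran in group
 * /A/B and the picked task sits in /A/C, only the entities along the
 * diverging B and C branches are put_prev_entity()'d and
 * set_next_entity()'d; the entity of /A itself, and anything above
 * it, is left untouched.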
6639 */ 6640 if (prev != p) { 6641 struct sched_entity *pse = &prev->se; 6642 6643 while (!(cfs_rq = is_same_group(se, pse))) { 6644 int se_depth = se->depth; 6645 int pse_depth = pse->depth; 6646 6647 if (se_depth <= pse_depth) { 6648 put_prev_entity(cfs_rq_of(pse), pse); 6649 pse = parent_entity(pse); 6650 } 6651 if (se_depth >= pse_depth) { 6652 set_next_entity(cfs_rq_of(se), se); 6653 se = parent_entity(se); 6654 } 6655 } 6656 6657 put_prev_entity(cfs_rq, pse); 6658 set_next_entity(cfs_rq, se); 6659 } 6660 6661 goto done; 6662 simple: 6663 #endif 6664 6665 put_prev_task(rq, prev); 6666 6667 do { 6668 se = pick_next_entity(cfs_rq, NULL); 6669 set_next_entity(cfs_rq, se); 6670 cfs_rq = group_cfs_rq(se); 6671 } while (cfs_rq); 6672 6673 p = task_of(se); 6674 6675 done: __maybe_unused 6676 #ifdef CONFIG_SMP 6677 /* 6678 * Move the next running task to the front of 6679 * the list, so our cfs_tasks list becomes MRU 6680 * one. 6681 */ 6682 list_move(&p->se.group_node, &rq->cfs_tasks); 6683 #endif 6684 6685 if (hrtick_enabled(rq)) 6686 hrtick_start_fair(rq, p); 6687 6688 return p; 6689 6690 idle: 6691 new_tasks = idle_balance(rq, rf); 6692 6693 /* 6694 * Because idle_balance() releases (and re-acquires) rq->lock, it is 6695 * possible for any higher priority task to appear. In that case we 6696 * must re-start the pick_next_entity() loop. 6697 */ 6698 if (new_tasks < 0) 6699 return RETRY_TASK; 6700 6701 if (new_tasks > 0) 6702 goto again; 6703 6704 return NULL; 6705 } 6706 6707 /* 6708 * Account for a descheduled task: 6709 */ 6710 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) 6711 { 6712 struct sched_entity *se = &prev->se; 6713 struct cfs_rq *cfs_rq; 6714 6715 for_each_sched_entity(se) { 6716 cfs_rq = cfs_rq_of(se); 6717 put_prev_entity(cfs_rq, se); 6718 } 6719 } 6720 6721 /* 6722 * sched_yield() is very simple 6723 * 6724 * The magic of dealing with the ->skip buddy is in pick_next_entity. 6725 */ 6726 static void yield_task_fair(struct rq *rq) 6727 { 6728 struct task_struct *curr = rq->curr; 6729 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 6730 struct sched_entity *se = &curr->se; 6731 6732 /* 6733 * Are we the only task in the tree? 6734 */ 6735 if (unlikely(rq->nr_running == 1)) 6736 return; 6737 6738 clear_buddies(cfs_rq, se); 6739 6740 if (curr->policy != SCHED_BATCH) { 6741 update_rq_clock(rq); 6742 /* 6743 * Update run-time statistics of the 'current'. 6744 */ 6745 update_curr(cfs_rq); 6746 /* 6747 * Tell update_rq_clock() that we've just updated, 6748 * so we don't do microscopic update in schedule() 6749 * and double the fastpath cost. 6750 */ 6751 rq_clock_skip_update(rq, true); 6752 } 6753 6754 set_skip_buddy(se); 6755 } 6756 6757 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) 6758 { 6759 struct sched_entity *se = &p->se; 6760 6761 /* throttled hierarchies are not runnable */ 6762 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) 6763 return false; 6764 6765 /* Tell the scheduler that we'd really like pse to run next. */ 6766 set_next_buddy(se); 6767 6768 yield_task_fair(rq); 6769 6770 return true; 6771 } 6772 6773 #ifdef CONFIG_SMP 6774 /************************************************** 6775 * Fair scheduling class load-balancing methods. 6776 * 6777 * BASICS 6778 * 6779 * The purpose of load-balancing is to achieve the same basic fairness the 6780 * per-cpu scheduler provides, namely provide a proportional amount of compute 6781 * time to each task. 
This is expressed in the following equation: 6782 * 6783 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 6784 * 6785 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight 6786 * W_i,0 is defined as: 6787 * 6788 * W_i,0 = \Sum_j w_i,j (2) 6789 * 6790 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight 6791 * is derived from the nice value as per sched_prio_to_weight[]. 6792 * 6793 * The weight average is an exponential decay average of the instantaneous 6794 * weight: 6795 * 6796 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 6797 * 6798 * C_i is the compute capacity of cpu i, typically it is the 6799 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 6800 * can also include other factors [XXX]. 6801 * 6802 * To achieve this balance we define a measure of imbalance which follows 6803 * directly from (1): 6804 * 6805 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) 6806 * 6807 * We them move tasks around to minimize the imbalance. In the continuous 6808 * function space it is obvious this converges, in the discrete case we get 6809 * a few fun cases generally called infeasible weight scenarios. 6810 * 6811 * [XXX expand on: 6812 * - infeasible weights; 6813 * - local vs global optima in the discrete case. ] 6814 * 6815 * 6816 * SCHED DOMAINS 6817 * 6818 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 6819 * for all i,j solution, we create a tree of cpus that follows the hardware 6820 * topology where each level pairs two lower groups (or better). This results 6821 * in O(log n) layers. Furthermore we reduce the number of cpus going up the 6822 * tree to only the first of the previous level and we decrease the frequency 6823 * of load-balance at each level inv. proportional to the number of cpus in 6824 * the groups. 6825 * 6826 * This yields: 6827 * 6828 * log_2 n 1 n 6829 * \Sum { --- * --- * 2^i } = O(n) (5) 6830 * i = 0 2^i 2^i 6831 * `- size of each group 6832 * | | `- number of cpus doing load-balance 6833 * | `- freq 6834 * `- sum over all levels 6835 * 6836 * Coupled with a limit on how many tasks we can migrate every balance pass, 6837 * this makes (5) the runtime complexity of the balancer. 6838 * 6839 * An important property here is that each CPU is still (indirectly) connected 6840 * to every other cpu in at most O(log n) steps: 6841 * 6842 * The adjacency matrix of the resulting graph is given by: 6843 * 6844 * log_2 n 6845 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) 6846 * k = 0 6847 * 6848 * And you'll find that: 6849 * 6850 * A^(log_2 n)_i,j != 0 for all i,j (7) 6851 * 6852 * Showing there's indeed a path between every cpu in at most O(log n) steps. 6853 * The task movement gives a factor of O(m), giving a convergence complexity 6854 * of: 6855 * 6856 * O(nm log n), n := nr_cpus, m := nr_tasks (8) 6857 * 6858 * 6859 * WORK CONSERVING 6860 * 6861 * In order to avoid CPUs going idle while there's still work to do, new idle 6862 * balancing is more aggressive and has the newly idle cpu iterate up the domain 6863 * tree itself instead of relying on other CPUs to bring it work. 6864 * 6865 * This adds some complexity to both (5) and (8) but it reduces the total idle 6866 * time. 6867 * 6868 * [XXX more?] 
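 *
 * (As a quick sanity check of (5), with an assumed n = 8: the
 * per-level terms are 8 + 4 + 2 + 1 = 15, i.e. bounded by 2n and
 * hence O(n).)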
6869 * 6870 * 6871 * CGROUPS 6872 * 6873 * Cgroups make a horror show out of (2), instead of a simple sum we get: 6874 * 6875 * s_k,i 6876 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) 6877 * S_k 6878 * 6879 * Where 6880 * 6881 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 6882 * 6883 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. 6884 * 6885 * The big problem is S_k, its a global sum needed to compute a local (W_i) 6886 * property. 6887 * 6888 * [XXX write more on how we solve this.. _after_ merging pjt's patches that 6889 * rewrite all of this once again.] 6890 */ 6891 6892 static unsigned long __read_mostly max_load_balance_interval = HZ/10; 6893 6894 enum fbq_type { regular, remote, all }; 6895 6896 #define LBF_ALL_PINNED 0x01 6897 #define LBF_NEED_BREAK 0x02 6898 #define LBF_DST_PINNED 0x04 6899 #define LBF_SOME_PINNED 0x08 6900 6901 struct lb_env { 6902 struct sched_domain *sd; 6903 6904 struct rq *src_rq; 6905 int src_cpu; 6906 6907 int dst_cpu; 6908 struct rq *dst_rq; 6909 6910 struct cpumask *dst_grpmask; 6911 int new_dst_cpu; 6912 enum cpu_idle_type idle; 6913 long imbalance; 6914 /* The set of CPUs under consideration for load-balancing */ 6915 struct cpumask *cpus; 6916 6917 unsigned int flags; 6918 6919 unsigned int loop; 6920 unsigned int loop_break; 6921 unsigned int loop_max; 6922 6923 enum fbq_type fbq_type; 6924 struct list_head tasks; 6925 }; 6926 6927 /* 6928 * Is this task likely cache-hot: 6929 */ 6930 static int task_hot(struct task_struct *p, struct lb_env *env) 6931 { 6932 s64 delta; 6933 6934 lockdep_assert_held(&env->src_rq->lock); 6935 6936 if (p->sched_class != &fair_sched_class) 6937 return 0; 6938 6939 if (unlikely(p->policy == SCHED_IDLE)) 6940 return 0; 6941 6942 /* 6943 * Buddy candidates are cache hot: 6944 */ 6945 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && 6946 (&p->se == cfs_rq_of(&p->se)->next || 6947 &p->se == cfs_rq_of(&p->se)->last)) 6948 return 1; 6949 6950 if (sysctl_sched_migration_cost == -1) 6951 return 1; 6952 if (sysctl_sched_migration_cost == 0) 6953 return 0; 6954 6955 delta = rq_clock_task(env->src_rq) - p->se.exec_start; 6956 6957 return delta < (s64)sysctl_sched_migration_cost; 6958 } 6959 6960 #ifdef CONFIG_NUMA_BALANCING 6961 /* 6962 * Returns 1, if task migration degrades locality 6963 * Returns 0, if task migration improves locality i.e migration preferred. 6964 * Returns -1, if task migration is not affected by locality. 6965 */ 6966 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 6967 { 6968 struct numa_group *numa_group = rcu_dereference(p->numa_group); 6969 unsigned long src_faults, dst_faults; 6970 int src_nid, dst_nid; 6971 6972 if (!static_branch_likely(&sched_numa_balancing)) 6973 return -1; 6974 6975 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 6976 return -1; 6977 6978 src_nid = cpu_to_node(env->src_cpu); 6979 dst_nid = cpu_to_node(env->dst_cpu); 6980 6981 if (src_nid == dst_nid) 6982 return -1; 6983 6984 /* Migrating away from the preferred node is always bad. */ 6985 if (src_nid == p->numa_preferred_nid) { 6986 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) 6987 return 1; 6988 else 6989 return -1; 6990 } 6991 6992 /* Encourage migration to the preferred node. */ 6993 if (dst_nid == p->numa_preferred_nid) 6994 return 0; 6995 6996 /* Leaving a core idle is often worse than degrading locality. 
*/ 6997 if (env->idle != CPU_NOT_IDLE) 6998 return -1; 6999 7000 if (numa_group) { 7001 src_faults = group_faults(p, src_nid); 7002 dst_faults = group_faults(p, dst_nid); 7003 } else { 7004 src_faults = task_faults(p, src_nid); 7005 dst_faults = task_faults(p, dst_nid); 7006 } 7007 7008 return dst_faults < src_faults; 7009 } 7010 7011 #else 7012 static inline int migrate_degrades_locality(struct task_struct *p, 7013 struct lb_env *env) 7014 { 7015 return -1; 7016 } 7017 #endif 7018 7019 /* 7020 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 7021 */ 7022 static 7023 int can_migrate_task(struct task_struct *p, struct lb_env *env) 7024 { 7025 int tsk_cache_hot; 7026 7027 lockdep_assert_held(&env->src_rq->lock); 7028 7029 /* 7030 * We do not migrate tasks that are: 7031 * 1) throttled_lb_pair, or 7032 * 2) cannot be migrated to this CPU due to cpus_allowed, or 7033 * 3) running (obviously), or 7034 * 4) are cache-hot on their current CPU. 7035 */ 7036 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 7037 return 0; 7038 7039 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { 7040 int cpu; 7041 7042 schedstat_inc(p->se.statistics.nr_failed_migrations_affine); 7043 7044 env->flags |= LBF_SOME_PINNED; 7045 7046 /* 7047 * Remember if this task can be migrated to any other cpu in 7048 * our sched_group. We may want to revisit it if we couldn't 7049 * meet load balance goals by pulling other tasks on src_cpu. 7050 * 7051 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have 7052 * already computed one in current iteration. 7053 */ 7054 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 7055 return 0; 7056 7057 /* Prevent to re-select dst_cpu via env's cpus */ 7058 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7059 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7060 env->flags |= LBF_DST_PINNED; 7061 env->new_dst_cpu = cpu; 7062 break; 7063 } 7064 } 7065 7066 return 0; 7067 } 7068 7069 /* Record that we found atleast one task that could run on dst_cpu */ 7070 env->flags &= ~LBF_ALL_PINNED; 7071 7072 if (task_running(env->src_rq, p)) { 7073 schedstat_inc(p->se.statistics.nr_failed_migrations_running); 7074 return 0; 7075 } 7076 7077 /* 7078 * Aggressive migration if: 7079 * 1) destination numa is preferred 7080 * 2) task is cache cold, or 7081 * 3) too many balance attempts have failed. 7082 */ 7083 tsk_cache_hot = migrate_degrades_locality(p, env); 7084 if (tsk_cache_hot == -1) 7085 tsk_cache_hot = task_hot(p, env); 7086 7087 if (tsk_cache_hot <= 0 || 7088 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 7089 if (tsk_cache_hot == 1) { 7090 schedstat_inc(env->sd->lb_hot_gained[env->idle]); 7091 schedstat_inc(p->se.statistics.nr_forced_migrations); 7092 } 7093 return 1; 7094 } 7095 7096 schedstat_inc(p->se.statistics.nr_failed_migrations_hot); 7097 return 0; 7098 } 7099 7100 /* 7101 * detach_task() -- detach the task for the migration specified in env 7102 */ 7103 static void detach_task(struct task_struct *p, struct lb_env *env) 7104 { 7105 lockdep_assert_held(&env->src_rq->lock); 7106 7107 p->on_rq = TASK_ON_RQ_MIGRATING; 7108 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); 7109 set_task_cpu(p, env->dst_cpu); 7110 } 7111 7112 /* 7113 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as 7114 * part of active balancing operations within "domain". 7115 * 7116 * Returns a task if successful and NULL otherwise. 
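 *
 * The counterpart is attach_one_task() further below, which puts the
 * returned task on its destination rq.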
7117 */ 7118 static struct task_struct *detach_one_task(struct lb_env *env) 7119 { 7120 struct task_struct *p; 7121 7122 lockdep_assert_held(&env->src_rq->lock); 7123 7124 list_for_each_entry_reverse(p, 7125 &env->src_rq->cfs_tasks, se.group_node) { 7126 if (!can_migrate_task(p, env)) 7127 continue; 7128 7129 detach_task(p, env); 7130 7131 /* 7132 * Right now, this is only the second place where 7133 * lb_gained[env->idle] is updated (other is detach_tasks) 7134 * so we can safely collect stats here rather than 7135 * inside detach_tasks(). 7136 */ 7137 schedstat_inc(env->sd->lb_gained[env->idle]); 7138 return p; 7139 } 7140 return NULL; 7141 } 7142 7143 static const unsigned int sched_nr_migrate_break = 32; 7144 7145 /* 7146 * detach_tasks() -- tries to detach up to imbalance weighted load from 7147 * busiest_rq, as part of a balancing operation within domain "sd". 7148 * 7149 * Returns number of detached tasks if successful and 0 otherwise. 7150 */ 7151 static int detach_tasks(struct lb_env *env) 7152 { 7153 struct list_head *tasks = &env->src_rq->cfs_tasks; 7154 struct task_struct *p; 7155 unsigned long load; 7156 int detached = 0; 7157 7158 lockdep_assert_held(&env->src_rq->lock); 7159 7160 if (env->imbalance <= 0) 7161 return 0; 7162 7163 while (!list_empty(tasks)) { 7164 /* 7165 * We don't want to steal all, otherwise we may be treated likewise, 7166 * which could at worst lead to a livelock crash. 7167 */ 7168 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) 7169 break; 7170 7171 p = list_last_entry(tasks, struct task_struct, se.group_node); 7172 7173 env->loop++; 7174 /* We've more or less seen every task there is, call it quits */ 7175 if (env->loop > env->loop_max) 7176 break; 7177 7178 /* take a breather every nr_migrate tasks */ 7179 if (env->loop > env->loop_break) { 7180 env->loop_break += sched_nr_migrate_break; 7181 env->flags |= LBF_NEED_BREAK; 7182 break; 7183 } 7184 7185 if (!can_migrate_task(p, env)) 7186 goto next; 7187 7188 load = task_h_load(p); 7189 7190 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) 7191 goto next; 7192 7193 if ((load / 2) > env->imbalance) 7194 goto next; 7195 7196 detach_task(p, env); 7197 list_add(&p->se.group_node, &env->tasks); 7198 7199 detached++; 7200 env->imbalance -= load; 7201 7202 #ifdef CONFIG_PREEMPT 7203 /* 7204 * NEWIDLE balancing is a source of latency, so preemptible 7205 * kernels will stop after the first task is detached to minimize 7206 * the critical section. 7207 */ 7208 if (env->idle == CPU_NEWLY_IDLE) 7209 break; 7210 #endif 7211 7212 /* 7213 * We only want to steal up to the prescribed amount of 7214 * weighted load. 7215 */ 7216 if (env->imbalance <= 0) 7217 break; 7218 7219 continue; 7220 next: 7221 list_move(&p->se.group_node, tasks); 7222 } 7223 7224 /* 7225 * Right now, this is one of only two places we collect this stat 7226 * so we can safely collect detach_one_task() stats here rather 7227 * than inside detach_one_task(). 7228 */ 7229 schedstat_add(env->sd->lb_gained[env->idle], detached); 7230 7231 return detached; 7232 } 7233 7234 /* 7235 * attach_task() -- attach the task detached by detach_task() to its new rq. 
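 *
 * This undoes detach_task(): the task is activated on @rq, marked
 * TASK_ON_RQ_QUEUED again and, via check_preempt_curr(), gets a
 * chance to preempt whatever @rq is currently running.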
7236 */ 7237 static void attach_task(struct rq *rq, struct task_struct *p) 7238 { 7239 lockdep_assert_held(&rq->lock); 7240 7241 BUG_ON(task_rq(p) != rq); 7242 activate_task(rq, p, ENQUEUE_NOCLOCK); 7243 p->on_rq = TASK_ON_RQ_QUEUED; 7244 check_preempt_curr(rq, p, 0); 7245 } 7246 7247 /* 7248 * attach_one_task() -- attaches the task returned from detach_one_task() to 7249 * its new rq. 7250 */ 7251 static void attach_one_task(struct rq *rq, struct task_struct *p) 7252 { 7253 struct rq_flags rf; 7254 7255 rq_lock(rq, &rf); 7256 update_rq_clock(rq); 7257 attach_task(rq, p); 7258 rq_unlock(rq, &rf); 7259 } 7260 7261 /* 7262 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their 7263 * new rq. 7264 */ 7265 static void attach_tasks(struct lb_env *env) 7266 { 7267 struct list_head *tasks = &env->tasks; 7268 struct task_struct *p; 7269 struct rq_flags rf; 7270 7271 rq_lock(env->dst_rq, &rf); 7272 update_rq_clock(env->dst_rq); 7273 7274 while (!list_empty(tasks)) { 7275 p = list_first_entry(tasks, struct task_struct, se.group_node); 7276 list_del_init(&p->se.group_node); 7277 7278 attach_task(env->dst_rq, p); 7279 } 7280 7281 rq_unlock(env->dst_rq, &rf); 7282 } 7283 7284 #ifdef CONFIG_FAIR_GROUP_SCHED 7285 7286 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7287 { 7288 if (cfs_rq->load.weight) 7289 return false; 7290 7291 if (cfs_rq->avg.load_sum) 7292 return false; 7293 7294 if (cfs_rq->avg.util_sum) 7295 return false; 7296 7297 if (cfs_rq->avg.runnable_load_sum) 7298 return false; 7299 7300 return true; 7301 } 7302 7303 static void update_blocked_averages(int cpu) 7304 { 7305 struct rq *rq = cpu_rq(cpu); 7306 struct cfs_rq *cfs_rq, *pos; 7307 struct rq_flags rf; 7308 7309 rq_lock_irqsave(rq, &rf); 7310 update_rq_clock(rq); 7311 7312 /* 7313 * Iterates the task_group tree in a bottom up fashion, see 7314 * list_add_leaf_cfs_rq() for details. 7315 */ 7316 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { 7317 struct sched_entity *se; 7318 7319 /* throttled entities do not contribute to load */ 7320 if (throttled_hierarchy(cfs_rq)) 7321 continue; 7322 7323 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) 7324 update_tg_load_avg(cfs_rq, 0); 7325 7326 /* Propagate pending load changes to the parent, if any: */ 7327 se = cfs_rq->tg->se[cpu]; 7328 if (se && !skip_blocked_update(se)) 7329 update_load_avg(cfs_rq_of(se), se, 0); 7330 7331 /* 7332 * There can be a lot of idle CPU cgroups. Don't let fully 7333 * decayed cfs_rqs linger on the list. 7334 */ 7335 if (cfs_rq_is_decayed(cfs_rq)) 7336 list_del_leaf_cfs_rq(cfs_rq); 7337 } 7338 rq_unlock_irqrestore(rq, &rf); 7339 } 7340 7341 /* 7342 * Compute the hierarchical load factor for cfs_rq and all its ascendants. 7343 * This needs to be done in a top-down fashion because the load of a child 7344 * group is a fraction of its parents load. 
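 *
 * Informally, each level scales the parent's h_load by the share the
 * group entity contributes to the parent's load:
 *
 *   h_load(group) ~= h_load(parent) * se->avg.load_avg /
 *                    (cfs_rq_load_avg(parent) + 1)
 *
 * so task_h_load() below ends up being the task's load_avg scaled by
 * the product of its ancestors' shares.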
7345 */ 7346 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) 7347 { 7348 struct rq *rq = rq_of(cfs_rq); 7349 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; 7350 unsigned long now = jiffies; 7351 unsigned long load; 7352 7353 if (cfs_rq->last_h_load_update == now) 7354 return; 7355 7356 cfs_rq->h_load_next = NULL; 7357 for_each_sched_entity(se) { 7358 cfs_rq = cfs_rq_of(se); 7359 cfs_rq->h_load_next = se; 7360 if (cfs_rq->last_h_load_update == now) 7361 break; 7362 } 7363 7364 if (!se) { 7365 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); 7366 cfs_rq->last_h_load_update = now; 7367 } 7368 7369 while ((se = cfs_rq->h_load_next) != NULL) { 7370 load = cfs_rq->h_load; 7371 load = div64_ul(load * se->avg.load_avg, 7372 cfs_rq_load_avg(cfs_rq) + 1); 7373 cfs_rq = group_cfs_rq(se); 7374 cfs_rq->h_load = load; 7375 cfs_rq->last_h_load_update = now; 7376 } 7377 } 7378 7379 static unsigned long task_h_load(struct task_struct *p) 7380 { 7381 struct cfs_rq *cfs_rq = task_cfs_rq(p); 7382 7383 update_cfs_rq_h_load(cfs_rq); 7384 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, 7385 cfs_rq_load_avg(cfs_rq) + 1); 7386 } 7387 #else 7388 static inline void update_blocked_averages(int cpu) 7389 { 7390 struct rq *rq = cpu_rq(cpu); 7391 struct cfs_rq *cfs_rq = &rq->cfs; 7392 struct rq_flags rf; 7393 7394 rq_lock_irqsave(rq, &rf); 7395 update_rq_clock(rq); 7396 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); 7397 rq_unlock_irqrestore(rq, &rf); 7398 } 7399 7400 static unsigned long task_h_load(struct task_struct *p) 7401 { 7402 return p->se.avg.load_avg; 7403 } 7404 #endif 7405 7406 /********** Helpers for find_busiest_group ************************/ 7407 7408 enum group_type { 7409 group_other = 0, 7410 group_imbalanced, 7411 group_overloaded, 7412 }; 7413 7414 /* 7415 * sg_lb_stats - stats of a sched_group required for load_balancing 7416 */ 7417 struct sg_lb_stats { 7418 unsigned long avg_load; /*Avg load across the CPUs of the group */ 7419 unsigned long group_load; /* Total load over the CPUs of the group */ 7420 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 7421 unsigned long load_per_task; 7422 unsigned long group_capacity; 7423 unsigned long group_util; /* Total utilization of the group */ 7424 unsigned int sum_nr_running; /* Nr tasks running in the group */ 7425 unsigned int idle_cpus; 7426 unsigned int group_weight; 7427 enum group_type group_type; 7428 int group_no_capacity; 7429 #ifdef CONFIG_NUMA_BALANCING 7430 unsigned int nr_numa_running; 7431 unsigned int nr_preferred_running; 7432 #endif 7433 }; 7434 7435 /* 7436 * sd_lb_stats - Structure to store the statistics of a sched_domain 7437 * during load balancing. 7438 */ 7439 struct sd_lb_stats { 7440 struct sched_group *busiest; /* Busiest group in this sd */ 7441 struct sched_group *local; /* Local group in this sd */ 7442 unsigned long total_running; 7443 unsigned long total_load; /* Total load of all groups in sd */ 7444 unsigned long total_capacity; /* Total capacity of all groups in sd */ 7445 unsigned long avg_load; /* Average load across all groups in sd */ 7446 7447 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ 7448 struct sg_lb_stats local_stat; /* Statistics of the local group */ 7449 }; 7450 7451 static inline void init_sd_lb_stats(struct sd_lb_stats *sds) 7452 { 7453 /* 7454 * Skimp on the clearing to avoid duplicate work. We can avoid clearing 7455 * local_stat because update_sg_lb_stats() does a full clear/assignment. 
7456 * We must however clear busiest_stat::avg_load because 7457 * update_sd_pick_busiest() reads this before assignment. 7458 */ 7459 *sds = (struct sd_lb_stats){ 7460 .busiest = NULL, 7461 .local = NULL, 7462 .total_running = 0UL, 7463 .total_load = 0UL, 7464 .total_capacity = 0UL, 7465 .busiest_stat = { 7466 .avg_load = 0UL, 7467 .sum_nr_running = 0, 7468 .group_type = group_other, 7469 }, 7470 }; 7471 } 7472 7473 /** 7474 * get_sd_load_idx - Obtain the load index for a given sched domain. 7475 * @sd: The sched_domain whose load_idx is to be obtained. 7476 * @idle: The idle status of the CPU for whose sd load_idx is obtained. 7477 * 7478 * Return: The load index. 7479 */ 7480 static inline int get_sd_load_idx(struct sched_domain *sd, 7481 enum cpu_idle_type idle) 7482 { 7483 int load_idx; 7484 7485 switch (idle) { 7486 case CPU_NOT_IDLE: 7487 load_idx = sd->busy_idx; 7488 break; 7489 7490 case CPU_NEWLY_IDLE: 7491 load_idx = sd->newidle_idx; 7492 break; 7493 default: 7494 load_idx = sd->idle_idx; 7495 break; 7496 } 7497 7498 return load_idx; 7499 } 7500 7501 static unsigned long scale_rt_capacity(int cpu) 7502 { 7503 struct rq *rq = cpu_rq(cpu); 7504 u64 total, used, age_stamp, avg; 7505 s64 delta; 7506 7507 /* 7508 * Since we're reading these variables without serialization make sure 7509 * we read them once before doing sanity checks on them. 7510 */ 7511 age_stamp = READ_ONCE(rq->age_stamp); 7512 avg = READ_ONCE(rq->rt_avg); 7513 delta = __rq_clock_broken(rq) - age_stamp; 7514 7515 if (unlikely(delta < 0)) 7516 delta = 0; 7517 7518 total = sched_avg_period() + delta; 7519 7520 used = div_u64(avg, total); 7521 7522 if (likely(used < SCHED_CAPACITY_SCALE)) 7523 return SCHED_CAPACITY_SCALE - used; 7524 7525 return 1; 7526 } 7527 7528 static void update_cpu_capacity(struct sched_domain *sd, int cpu) 7529 { 7530 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); 7531 struct sched_group *sdg = sd->groups; 7532 7533 cpu_rq(cpu)->cpu_capacity_orig = capacity; 7534 7535 capacity *= scale_rt_capacity(cpu); 7536 capacity >>= SCHED_CAPACITY_SHIFT; 7537 7538 if (!capacity) 7539 capacity = 1; 7540 7541 cpu_rq(cpu)->cpu_capacity = capacity; 7542 sdg->sgc->capacity = capacity; 7543 sdg->sgc->min_capacity = capacity; 7544 } 7545 7546 void update_group_capacity(struct sched_domain *sd, int cpu) 7547 { 7548 struct sched_domain *child = sd->child; 7549 struct sched_group *group, *sdg = sd->groups; 7550 unsigned long capacity, min_capacity; 7551 unsigned long interval; 7552 7553 interval = msecs_to_jiffies(sd->balance_interval); 7554 interval = clamp(interval, 1UL, max_load_balance_interval); 7555 sdg->sgc->next_update = jiffies + interval; 7556 7557 if (!child) { 7558 update_cpu_capacity(sd, cpu); 7559 return; 7560 } 7561 7562 capacity = 0; 7563 min_capacity = ULONG_MAX; 7564 7565 if (child->flags & SD_OVERLAP) { 7566 /* 7567 * SD_OVERLAP domains cannot assume that child groups 7568 * span the current group. 7569 */ 7570 7571 for_each_cpu(cpu, sched_group_span(sdg)) { 7572 struct sched_group_capacity *sgc; 7573 struct rq *rq = cpu_rq(cpu); 7574 7575 /* 7576 * build_sched_domains() -> init_sched_groups_capacity() 7577 * gets here before we've attached the domains to the 7578 * runqueues. 7579 * 7580 * Use capacity_of(), which is set irrespective of domains 7581 * in update_cpu_capacity(). 7582 * 7583 * This avoids capacity from being 0 and 7584 * causing divide-by-zero issues on boot. 
7585 */ 7586 if (unlikely(!rq->sd)) { 7587 capacity += capacity_of(cpu); 7588 } else { 7589 sgc = rq->sd->groups->sgc; 7590 capacity += sgc->capacity; 7591 } 7592 7593 min_capacity = min(capacity, min_capacity); 7594 } 7595 } else { 7596 /* 7597 * !SD_OVERLAP domains can assume that child groups 7598 * span the current group. 7599 */ 7600 7601 group = child->groups; 7602 do { 7603 struct sched_group_capacity *sgc = group->sgc; 7604 7605 capacity += sgc->capacity; 7606 min_capacity = min(sgc->min_capacity, min_capacity); 7607 group = group->next; 7608 } while (group != child->groups); 7609 } 7610 7611 sdg->sgc->capacity = capacity; 7612 sdg->sgc->min_capacity = min_capacity; 7613 } 7614 7615 /* 7616 * Check whether the capacity of the rq has been noticeably reduced by side 7617 * activity. The imbalance_pct is used for the threshold. 7618 * Return true is the capacity is reduced 7619 */ 7620 static inline int 7621 check_cpu_capacity(struct rq *rq, struct sched_domain *sd) 7622 { 7623 return ((rq->cpu_capacity * sd->imbalance_pct) < 7624 (rq->cpu_capacity_orig * 100)); 7625 } 7626 7627 /* 7628 * Group imbalance indicates (and tries to solve) the problem where balancing 7629 * groups is inadequate due to ->cpus_allowed constraints. 7630 * 7631 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a 7632 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 7633 * Something like: 7634 * 7635 * { 0 1 2 3 } { 4 5 6 7 } 7636 * * * * * 7637 * 7638 * If we were to balance group-wise we'd place two tasks in the first group and 7639 * two tasks in the second group. Clearly this is undesired as it will overload 7640 * cpu 3 and leave one of the cpus in the second group unused. 7641 * 7642 * The current solution to this issue is detecting the skew in the first group 7643 * by noticing the lower domain failed to reach balance and had difficulty 7644 * moving tasks due to affinity constraints. 7645 * 7646 * When this is so detected; this group becomes a candidate for busiest; see 7647 * update_sd_pick_busiest(). And calculate_imbalance() and 7648 * find_busiest_group() avoid some of the usual balance conditions to allow it 7649 * to create an effective group imbalance. 7650 * 7651 * This is a somewhat tricky proposition since the next run might not find the 7652 * group imbalance and decide the groups need to be balanced again. A most 7653 * subtle and fragile situation. 7654 */ 7655 7656 static inline int sg_imbalanced(struct sched_group *group) 7657 { 7658 return group->sgc->imbalance; 7659 } 7660 7661 /* 7662 * group_has_capacity returns true if the group has spare capacity that could 7663 * be used by some tasks. 7664 * We consider that a group has spare capacity if the * number of task is 7665 * smaller than the number of CPUs or if the utilization is lower than the 7666 * available capacity for CFS tasks. 7667 * For the latter, we use a threshold to stabilize the state, to take into 7668 * account the variance of the tasks' load and to return true if the available 7669 * capacity in meaningful for the load balancer. 7670 * As an example, an available capacity of 1% can appear but it doesn't make 7671 * any benefit for the load balance. 
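 *
 * E.g. with an assumed imbalance_pct of 125, the utilization test
 * below only reports spare capacity while group_util stays under
 * roughly 80% of group_capacity (100/125).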
7672 */ 7673 static inline bool 7674 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) 7675 { 7676 if (sgs->sum_nr_running < sgs->group_weight) 7677 return true; 7678 7679 if ((sgs->group_capacity * 100) > 7680 (sgs->group_util * env->sd->imbalance_pct)) 7681 return true; 7682 7683 return false; 7684 } 7685 7686 /* 7687 * group_is_overloaded returns true if the group has more tasks than it can 7688 * handle. 7689 * group_is_overloaded is not equals to !group_has_capacity because a group 7690 * with the exact right number of tasks, has no more spare capacity but is not 7691 * overloaded so both group_has_capacity and group_is_overloaded return 7692 * false. 7693 */ 7694 static inline bool 7695 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) 7696 { 7697 if (sgs->sum_nr_running <= sgs->group_weight) 7698 return false; 7699 7700 if ((sgs->group_capacity * 100) < 7701 (sgs->group_util * env->sd->imbalance_pct)) 7702 return true; 7703 7704 return false; 7705 } 7706 7707 /* 7708 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller 7709 * per-CPU capacity than sched_group ref. 7710 */ 7711 static inline bool 7712 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) 7713 { 7714 return sg->sgc->min_capacity * capacity_margin < 7715 ref->sgc->min_capacity * 1024; 7716 } 7717 7718 static inline enum 7719 group_type group_classify(struct sched_group *group, 7720 struct sg_lb_stats *sgs) 7721 { 7722 if (sgs->group_no_capacity) 7723 return group_overloaded; 7724 7725 if (sg_imbalanced(group)) 7726 return group_imbalanced; 7727 7728 return group_other; 7729 } 7730 7731 /** 7732 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 7733 * @env: The load balancing environment. 7734 * @group: sched_group whose statistics are to be updated. 7735 * @load_idx: Load index of sched_domain of this_cpu for load calc. 7736 * @local_group: Does group contain this_cpu. 7737 * @sgs: variable to hold the statistics for this group. 7738 * @overload: Indicate more than one runnable task for any CPU. 
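 *
 * The statistics gathered here feed group_classify() and
 * update_sd_pick_busiest() when update_sd_lb_stats() walks the groups
 * of the domain.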
7739 */ 7740 static inline void update_sg_lb_stats(struct lb_env *env, 7741 struct sched_group *group, int load_idx, 7742 int local_group, struct sg_lb_stats *sgs, 7743 bool *overload) 7744 { 7745 unsigned long load; 7746 int i, nr_running; 7747 7748 memset(sgs, 0, sizeof(*sgs)); 7749 7750 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 7751 struct rq *rq = cpu_rq(i); 7752 7753 /* Bias balancing toward cpus of our domain */ 7754 if (local_group) 7755 load = target_load(i, load_idx); 7756 else 7757 load = source_load(i, load_idx); 7758 7759 sgs->group_load += load; 7760 sgs->group_util += cpu_util(i); 7761 sgs->sum_nr_running += rq->cfs.h_nr_running; 7762 7763 nr_running = rq->nr_running; 7764 if (nr_running > 1) 7765 *overload = true; 7766 7767 #ifdef CONFIG_NUMA_BALANCING 7768 sgs->nr_numa_running += rq->nr_numa_running; 7769 sgs->nr_preferred_running += rq->nr_preferred_running; 7770 #endif 7771 sgs->sum_weighted_load += weighted_cpuload(rq); 7772 /* 7773 * No need to call idle_cpu() if nr_running is not 0 7774 */ 7775 if (!nr_running && idle_cpu(i)) 7776 sgs->idle_cpus++; 7777 } 7778 7779 /* Adjust by relative CPU capacity of the group */ 7780 sgs->group_capacity = group->sgc->capacity; 7781 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; 7782 7783 if (sgs->sum_nr_running) 7784 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 7785 7786 sgs->group_weight = group->group_weight; 7787 7788 sgs->group_no_capacity = group_is_overloaded(env, sgs); 7789 sgs->group_type = group_classify(group, sgs); 7790 } 7791 7792 /** 7793 * update_sd_pick_busiest - return 1 on busiest group 7794 * @env: The load balancing environment. 7795 * @sds: sched_domain statistics 7796 * @sg: sched_group candidate to be checked for being the busiest 7797 * @sgs: sched_group statistics 7798 * 7799 * Determine if @sg is a busier group than the previously selected 7800 * busiest group. 7801 * 7802 * Return: %true if @sg is a busier group than the previously selected 7803 * busiest group. %false otherwise. 7804 */ 7805 static bool update_sd_pick_busiest(struct lb_env *env, 7806 struct sd_lb_stats *sds, 7807 struct sched_group *sg, 7808 struct sg_lb_stats *sgs) 7809 { 7810 struct sg_lb_stats *busiest = &sds->busiest_stat; 7811 7812 if (sgs->group_type > busiest->group_type) 7813 return true; 7814 7815 if (sgs->group_type < busiest->group_type) 7816 return false; 7817 7818 if (sgs->avg_load <= busiest->avg_load) 7819 return false; 7820 7821 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) 7822 goto asym_packing; 7823 7824 /* 7825 * Candidate sg has no more than one task per CPU and 7826 * has higher per-CPU capacity. Migrating tasks to less 7827 * capable CPUs may harm throughput. Maximize throughput, 7828 * power/energy consequences are not considered. 7829 */ 7830 if (sgs->sum_nr_running <= sgs->group_weight && 7831 group_smaller_cpu_capacity(sds->local, sg)) 7832 return false; 7833 7834 asym_packing: 7835 /* This is the busiest node in its class. */ 7836 if (!(env->sd->flags & SD_ASYM_PACKING)) 7837 return true; 7838 7839 /* No ASYM_PACKING if target cpu is already busy */ 7840 if (env->idle == CPU_NOT_IDLE) 7841 return true; 7842 /* 7843 * ASYM_PACKING needs to move all the work to the highest 7844 * prority CPUs in the group, therefore mark all groups 7845 * of lower priority than ourself as busy. 
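 *
 * For example, under the lower-number-wins convention described for
 * check_asym_packing() below (assumed here): with dst_cpu == 0 and a
 * candidate group whose asym_prefer_cpu is 2, the group is reported
 * as busiest so that its work can be packed onto CPU 0.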
7846 */ 7847 if (sgs->sum_nr_running && 7848 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { 7849 if (!sds->busiest) 7850 return true; 7851 7852 /* Prefer to move from lowest priority cpu's work */ 7853 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, 7854 sg->asym_prefer_cpu)) 7855 return true; 7856 } 7857 7858 return false; 7859 } 7860 7861 #ifdef CONFIG_NUMA_BALANCING 7862 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) 7863 { 7864 if (sgs->sum_nr_running > sgs->nr_numa_running) 7865 return regular; 7866 if (sgs->sum_nr_running > sgs->nr_preferred_running) 7867 return remote; 7868 return all; 7869 } 7870 7871 static inline enum fbq_type fbq_classify_rq(struct rq *rq) 7872 { 7873 if (rq->nr_running > rq->nr_numa_running) 7874 return regular; 7875 if (rq->nr_running > rq->nr_preferred_running) 7876 return remote; 7877 return all; 7878 } 7879 #else 7880 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) 7881 { 7882 return all; 7883 } 7884 7885 static inline enum fbq_type fbq_classify_rq(struct rq *rq) 7886 { 7887 return regular; 7888 } 7889 #endif /* CONFIG_NUMA_BALANCING */ 7890 7891 /** 7892 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 7893 * @env: The load balancing environment. 7894 * @sds: variable to hold the statistics for this sched_domain. 7895 */ 7896 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) 7897 { 7898 struct sched_domain *child = env->sd->child; 7899 struct sched_group *sg = env->sd->groups; 7900 struct sg_lb_stats *local = &sds->local_stat; 7901 struct sg_lb_stats tmp_sgs; 7902 int load_idx, prefer_sibling = 0; 7903 bool overload = false; 7904 7905 if (child && child->flags & SD_PREFER_SIBLING) 7906 prefer_sibling = 1; 7907 7908 load_idx = get_sd_load_idx(env->sd, env->idle); 7909 7910 do { 7911 struct sg_lb_stats *sgs = &tmp_sgs; 7912 int local_group; 7913 7914 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); 7915 if (local_group) { 7916 sds->local = sg; 7917 sgs = local; 7918 7919 if (env->idle != CPU_NEWLY_IDLE || 7920 time_after_eq(jiffies, sg->sgc->next_update)) 7921 update_group_capacity(env->sd, env->dst_cpu); 7922 } 7923 7924 update_sg_lb_stats(env, sg, load_idx, local_group, sgs, 7925 &overload); 7926 7927 if (local_group) 7928 goto next_group; 7929 7930 /* 7931 * In case the child domain prefers tasks go to siblings 7932 * first, lower the sg capacity so that we'll try 7933 * and move all the excess tasks away. We lower the capacity 7934 * of a group only if the local group has the capacity to fit 7935 * these excess tasks. The extra check prevents the case where 7936 * you always pull from the heaviest group when it is already 7937 * under-utilized (possible with a large weight task outweighs 7938 * the tasks on the system). 
7939 */ 7940 if (prefer_sibling && sds->local && 7941 group_has_capacity(env, local) && 7942 (sgs->sum_nr_running > local->sum_nr_running + 1)) { 7943 sgs->group_no_capacity = 1; 7944 sgs->group_type = group_classify(sg, sgs); 7945 } 7946 7947 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 7948 sds->busiest = sg; 7949 sds->busiest_stat = *sgs; 7950 } 7951 7952 next_group: 7953 /* Now, start updating sd_lb_stats */ 7954 sds->total_running += sgs->sum_nr_running; 7955 sds->total_load += sgs->group_load; 7956 sds->total_capacity += sgs->group_capacity; 7957 7958 sg = sg->next; 7959 } while (sg != env->sd->groups); 7960 7961 if (env->sd->flags & SD_NUMA) 7962 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 7963 7964 if (!env->sd->parent) { 7965 /* update overload indicator if we are at root domain */ 7966 if (env->dst_rq->rd->overload != overload) 7967 env->dst_rq->rd->overload = overload; 7968 } 7969 } 7970 7971 /** 7972 * check_asym_packing - Check to see if the group is packed into the 7973 * sched domain. 7974 * 7975 * This is primarily intended to be used at the sibling level. Some 7976 * cores like POWER7 prefer to use lower numbered SMT threads. In the 7977 * case of POWER7, it can move to lower SMT modes only when higher 7978 * threads are idle. When in lower SMT modes, the threads will 7979 * perform better since they share less core resources. Hence when we 7980 * have idle threads, we want them to be the higher ones. 7981 * 7982 * This packing function is run on idle threads. It checks to see if 7983 * the busiest CPU in this domain (core in the P7 case) has a higher 7984 * CPU number than the packing function is being run on. Here we are 7985 * assuming a lower CPU number will be equivalent to a lower SMT thread 7986 * number. 7987 * 7988 * Return: 1 when packing is required and a task should be moved to 7989 * this CPU. The amount of the imbalance is returned in env->imbalance. 7990 * 7991 * @env: The load balancing environment. 7992 * @sds: Statistics of the sched_domain which is to be packed 7993 */ 7994 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) 7995 { 7996 int busiest_cpu; 7997 7998 if (!(env->sd->flags & SD_ASYM_PACKING)) 7999 return 0; 8000 8001 if (env->idle == CPU_NOT_IDLE) 8002 return 0; 8003 8004 if (!sds->busiest) 8005 return 0; 8006 8007 busiest_cpu = sds->busiest->asym_prefer_cpu; 8008 if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) 8009 return 0; 8010 8011 env->imbalance = DIV_ROUND_CLOSEST( 8012 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, 8013 SCHED_CAPACITY_SCALE); 8014 8015 return 1; 8016 } 8017 8018 /** 8019 * fix_small_imbalance - Calculate the minor imbalance that exists 8020 * amongst the groups of a sched_domain, during 8021 * load balancing. 8022 * @env: The load balancing environment. 8023 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
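 *
 * Rough sketch (hypothetical figures): when the computed imbalance is
 * smaller than one task's load, this helper checks whether moving a
 * single task of busiest->load_per_task would still increase the total
 * capacity in use (capa_move > capa_now); if so, env->imbalance is
 * bumped to exactly one task's worth of load so that at least one task
 * gets migrated.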
8024 */ 8025 static inline 8026 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 8027 { 8028 unsigned long tmp, capa_now = 0, capa_move = 0; 8029 unsigned int imbn = 2; 8030 unsigned long scaled_busy_load_per_task; 8031 struct sg_lb_stats *local, *busiest; 8032 8033 local = &sds->local_stat; 8034 busiest = &sds->busiest_stat; 8035 8036 if (!local->sum_nr_running) 8037 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); 8038 else if (busiest->load_per_task > local->load_per_task) 8039 imbn = 1; 8040 8041 scaled_busy_load_per_task = 8042 (busiest->load_per_task * SCHED_CAPACITY_SCALE) / 8043 busiest->group_capacity; 8044 8045 if (busiest->avg_load + scaled_busy_load_per_task >= 8046 local->avg_load + (scaled_busy_load_per_task * imbn)) { 8047 env->imbalance = busiest->load_per_task; 8048 return; 8049 } 8050 8051 /* 8052 * OK, we don't have enough imbalance to justify moving tasks, 8053 * however we may be able to increase total CPU capacity used by 8054 * moving them. 8055 */ 8056 8057 capa_now += busiest->group_capacity * 8058 min(busiest->load_per_task, busiest->avg_load); 8059 capa_now += local->group_capacity * 8060 min(local->load_per_task, local->avg_load); 8061 capa_now /= SCHED_CAPACITY_SCALE; 8062 8063 /* Amount of load we'd subtract */ 8064 if (busiest->avg_load > scaled_busy_load_per_task) { 8065 capa_move += busiest->group_capacity * 8066 min(busiest->load_per_task, 8067 busiest->avg_load - scaled_busy_load_per_task); 8068 } 8069 8070 /* Amount of load we'd add */ 8071 if (busiest->avg_load * busiest->group_capacity < 8072 busiest->load_per_task * SCHED_CAPACITY_SCALE) { 8073 tmp = (busiest->avg_load * busiest->group_capacity) / 8074 local->group_capacity; 8075 } else { 8076 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / 8077 local->group_capacity; 8078 } 8079 capa_move += local->group_capacity * 8080 min(local->load_per_task, local->avg_load + tmp); 8081 capa_move /= SCHED_CAPACITY_SCALE; 8082 8083 /* Move if we gain throughput */ 8084 if (capa_move > capa_now) 8085 env->imbalance = busiest->load_per_task; 8086 } 8087 8088 /** 8089 * calculate_imbalance - Calculate the amount of imbalance present within the 8090 * groups of a given sched_domain during load balance. 8091 * @env: load balance environment 8092 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 8093 */ 8094 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 8095 { 8096 unsigned long max_pull, load_above_capacity = ~0UL; 8097 struct sg_lb_stats *local, *busiest; 8098 8099 local = &sds->local_stat; 8100 busiest = &sds->busiest_stat; 8101 8102 if (busiest->group_type == group_imbalanced) { 8103 /* 8104 * In the group_imb case we cannot rely on group-wide averages 8105 * to ensure cpu-load equilibrium, look at wider averages. XXX 8106 */ 8107 busiest->load_per_task = 8108 min(busiest->load_per_task, sds->avg_load); 8109 } 8110 8111 /* 8112 * Avg load of busiest sg can be less and avg load of local sg can 8113 * be greater than avg load across all sgs of sd because avg load 8114 * factors in sg capacity and sgs with smaller group_type are 8115 * skipped when updating the busiest sg: 8116 */ 8117 if (busiest->avg_load <= sds->avg_load || 8118 local->avg_load >= sds->avg_load) { 8119 env->imbalance = 0; 8120 return fix_small_imbalance(env, sds); 8121 } 8122 8123 /* 8124 * If there aren't any idle cpus, avoid creating some. 
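 *
 * Worked example (illustrative numbers only): a busiest group of two
 * CPUs (group_capacity = 2048) running three tasks gives
 * load_above_capacity = 3 * SCHED_CAPACITY_SCALE - 2048 = 1024, which
 * after scaling by NICE_0_LOAD / group_capacity becomes roughly 512,
 * i.e. max_pull below gets capped at about half of one nice-0 task's
 * load.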
8125 */ 8126 if (busiest->group_type == group_overloaded && 8127 local->group_type == group_overloaded) { 8128 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; 8129 if (load_above_capacity > busiest->group_capacity) { 8130 load_above_capacity -= busiest->group_capacity; 8131 load_above_capacity *= scale_load_down(NICE_0_LOAD); 8132 load_above_capacity /= busiest->group_capacity; 8133 } else 8134 load_above_capacity = ~0UL; 8135 } 8136 8137 /* 8138 * We're trying to get all the cpus to the average_load, so we don't 8139 * want to push ourselves above the average load, nor do we wish to 8140 * reduce the max loaded cpu below the average load. At the same time, 8141 * we also don't want to reduce the group load below the group 8142 * capacity. Thus we look for the minimum possible imbalance. 8143 */ 8144 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); 8145 8146 /* How much load to actually move to equalise the imbalance */ 8147 env->imbalance = min( 8148 max_pull * busiest->group_capacity, 8149 (sds->avg_load - local->avg_load) * local->group_capacity 8150 ) / SCHED_CAPACITY_SCALE; 8151 8152 /* 8153 * If *imbalance is less than the average load per runnable task, 8154 * there is no guarantee that any tasks will be moved, so we'll have 8155 * a think about bumping its value to force at least one task to be 8156 * moved. 8157 */ 8158 if (env->imbalance < busiest->load_per_task) 8159 return fix_small_imbalance(env, sds); 8160 } 8161 8162 /******* find_busiest_group() helpers end here *********************/ 8163 8164 /** 8165 * find_busiest_group - Returns the busiest group within the sched_domain 8166 * if there is an imbalance. 8167 * 8168 * Also calculates the amount of weighted load which should be moved 8169 * to restore balance. 8170 * 8171 * @env: The load balancing environment. 8172 * 8173 * Return: - The busiest group if imbalance exists. 8174 */ 8175 static struct sched_group *find_busiest_group(struct lb_env *env) 8176 { 8177 struct sg_lb_stats *local, *busiest; 8178 struct sd_lb_stats sds; 8179 8180 init_sd_lb_stats(&sds); 8181 8182 /* 8183 * Compute the various statistics relevant for load balancing at 8184 * this level. 8185 */ 8186 update_sd_lb_stats(env, &sds); 8187 local = &sds.local_stat; 8188 busiest = &sds.busiest_stat; 8189 8190 /* ASYM feature bypasses nice load balance check */ 8191 if (check_asym_packing(env, &sds)) 8192 return sds.busiest; 8193 8194 /* There is no busy sibling group to pull tasks from */ 8195 if (!sds.busiest || busiest->sum_nr_running == 0) 8196 goto out_balanced; 8197 8198 /* XXX broken for overlapping NUMA groups */ 8199 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) 8200 / sds.total_capacity; 8201 8202 /* 8203 * If the busiest group is imbalanced the below checks don't 8204 * work because they assume all things are equal, which typically 8205 * isn't true due to cpus_allowed constraints and the like. 8206 */ 8207 if (busiest->group_type == group_imbalanced) 8208 goto force_balance; 8209 8210 /* 8211 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group 8212 * capacities from resulting in underutilization due to avg_load. 8213 */ 8214 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && 8215 busiest->group_no_capacity) 8216 goto force_balance; 8217 8218 /* 8219 * If the local group is busier than the selected busiest group 8220 * don't try and pull any tasks.
8221 */ 8222 if (local->avg_load >= busiest->avg_load) 8223 goto out_balanced; 8224 8225 /* 8226 * Don't pull any tasks if this group is already above the domain 8227 * average load. 8228 */ 8229 if (local->avg_load >= sds.avg_load) 8230 goto out_balanced; 8231 8232 if (env->idle == CPU_IDLE) { 8233 /* 8234 * This cpu is idle. If the busiest group is not overloaded 8235 * and there is no imbalance between this and busiest group 8236 * wrt idle cpus, it is balanced. The imbalance becomes 8237 * significant if the diff is greater than 1 otherwise we 8238 * might end up to just move the imbalance on another group 8239 */ 8240 if ((busiest->group_type != group_overloaded) && 8241 (local->idle_cpus <= (busiest->idle_cpus + 1))) 8242 goto out_balanced; 8243 } else { 8244 /* 8245 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 8246 * imbalance_pct to be conservative. 8247 */ 8248 if (100 * busiest->avg_load <= 8249 env->sd->imbalance_pct * local->avg_load) 8250 goto out_balanced; 8251 } 8252 8253 force_balance: 8254 /* Looks like there is an imbalance. Compute it */ 8255 calculate_imbalance(env, &sds); 8256 return sds.busiest; 8257 8258 out_balanced: 8259 env->imbalance = 0; 8260 return NULL; 8261 } 8262 8263 /* 8264 * find_busiest_queue - find the busiest runqueue among the cpus in group. 8265 */ 8266 static struct rq *find_busiest_queue(struct lb_env *env, 8267 struct sched_group *group) 8268 { 8269 struct rq *busiest = NULL, *rq; 8270 unsigned long busiest_load = 0, busiest_capacity = 1; 8271 int i; 8272 8273 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8274 unsigned long capacity, wl; 8275 enum fbq_type rt; 8276 8277 rq = cpu_rq(i); 8278 rt = fbq_classify_rq(rq); 8279 8280 /* 8281 * We classify groups/runqueues into three groups: 8282 * - regular: there are !numa tasks 8283 * - remote: there are numa tasks that run on the 'wrong' node 8284 * - all: there is no distinction 8285 * 8286 * In order to avoid migrating ideally placed numa tasks, 8287 * ignore those when there's better options. 8288 * 8289 * If we ignore the actual busiest queue to migrate another 8290 * task, the next balance pass can still reduce the busiest 8291 * queue by moving tasks around inside the node. 8292 * 8293 * If we cannot move enough load due to this classification 8294 * the next pass will adjust the group classification and 8295 * allow migration of more tasks. 8296 * 8297 * Both cases only affect the total convergence complexity. 8298 */ 8299 if (rt > env->fbq_type) 8300 continue; 8301 8302 capacity = capacity_of(i); 8303 8304 wl = weighted_cpuload(rq); 8305 8306 /* 8307 * When comparing with imbalance, use weighted_cpuload() 8308 * which is not scaled with the cpu capacity. 8309 */ 8310 8311 if (rq->nr_running == 1 && wl > env->imbalance && 8312 !check_cpu_capacity(rq, env->sd)) 8313 continue; 8314 8315 /* 8316 * For the load comparisons with the other cpu's, consider 8317 * the weighted_cpuload() scaled with the cpu capacity, so 8318 * that the load can be moved away from the cpu that is 8319 * potentially running at a lower capacity. 8320 * 8321 * Thus we're looking for max(wl_i / capacity_i), crosswise 8322 * multiplication to rid ourselves of the division works out 8323 * to: wl_i * capacity_j > wl_j * capacity_i; where j is 8324 * our previous maximum. 
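 *
 * (Illustrative figures: a CPU with wl = 600 and capacity = 512 beats a
 * previous maximum of wl = 800 at capacity = 1024, because
 * 600 * 1024 = 614400 > 800 * 512 = 409600 -- the smaller CPU is
 * relatively more loaded even though its raw load is lower.)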
8325 */ 8326 if (wl * busiest_capacity > busiest_load * capacity) { 8327 busiest_load = wl; 8328 busiest_capacity = capacity; 8329 busiest = rq; 8330 } 8331 } 8332 8333 return busiest; 8334 } 8335 8336 /* 8337 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but 8338 * so long as it is large enough. 8339 */ 8340 #define MAX_PINNED_INTERVAL 512 8341 8342 static int need_active_balance(struct lb_env *env) 8343 { 8344 struct sched_domain *sd = env->sd; 8345 8346 if (env->idle == CPU_NEWLY_IDLE) { 8347 8348 /* 8349 * ASYM_PACKING needs to force migrate tasks from busy but 8350 * lower priority CPUs in order to pack all tasks in the 8351 * highest priority CPUs. 8352 */ 8353 if ((sd->flags & SD_ASYM_PACKING) && 8354 sched_asym_prefer(env->dst_cpu, env->src_cpu)) 8355 return 1; 8356 } 8357 8358 /* 8359 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 8360 * It's worth migrating the task if the src_cpu's capacity is reduced 8361 * because of other sched_class or IRQs if more capacity stays 8362 * available on dst_cpu. 8363 */ 8364 if ((env->idle != CPU_NOT_IDLE) && 8365 (env->src_rq->cfs.h_nr_running == 1)) { 8366 if ((check_cpu_capacity(env->src_rq, sd)) && 8367 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) 8368 return 1; 8369 } 8370 8371 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 8372 } 8373 8374 static int active_load_balance_cpu_stop(void *data); 8375 8376 static int should_we_balance(struct lb_env *env) 8377 { 8378 struct sched_group *sg = env->sd->groups; 8379 int cpu, balance_cpu = -1; 8380 8381 /* 8382 * Ensure the balancing environment is consistent; can happen 8383 * when the softirq triggers 'during' hotplug. 8384 */ 8385 if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) 8386 return 0; 8387 8388 /* 8389 * In the newly idle case, we will allow all the cpu's 8390 * to do the newly idle load balance. 8391 */ 8392 if (env->idle == CPU_NEWLY_IDLE) 8393 return 1; 8394 8395 /* Try to find first idle cpu */ 8396 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { 8397 if (!idle_cpu(cpu)) 8398 continue; 8399 8400 balance_cpu = cpu; 8401 break; 8402 } 8403 8404 if (balance_cpu == -1) 8405 balance_cpu = group_balance_cpu(sg); 8406 8407 /* 8408 * First idle cpu or the first cpu(busiest) in this sched group 8409 * is eligible for doing load balancing at this and above domains. 8410 */ 8411 return balance_cpu == env->dst_cpu; 8412 } 8413 8414 /* 8415 * Check this_cpu to ensure it is balanced within domain. Attempt to move 8416 * tasks if there is an imbalance. 
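 *
 * In rough outline (see the body below): should_we_balance() filters out
 * CPUs that are not responsible for balancing this domain,
 * find_busiest_group() and find_busiest_queue() pick a source,
 * detach_tasks()/attach_tasks() move the load, and active balancing via
 * active_load_balance_cpu_stop() is the fallback when nothing could be
 * moved.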
8417 */ 8418 static int load_balance(int this_cpu, struct rq *this_rq, 8419 struct sched_domain *sd, enum cpu_idle_type idle, 8420 int *continue_balancing) 8421 { 8422 int ld_moved, cur_ld_moved, active_balance = 0; 8423 struct sched_domain *sd_parent = sd->parent; 8424 struct sched_group *group; 8425 struct rq *busiest; 8426 struct rq_flags rf; 8427 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); 8428 8429 struct lb_env env = { 8430 .sd = sd, 8431 .dst_cpu = this_cpu, 8432 .dst_rq = this_rq, 8433 .dst_grpmask = sched_group_span(sd->groups), 8434 .idle = idle, 8435 .loop_break = sched_nr_migrate_break, 8436 .cpus = cpus, 8437 .fbq_type = all, 8438 .tasks = LIST_HEAD_INIT(env.tasks), 8439 }; 8440 8441 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); 8442 8443 schedstat_inc(sd->lb_count[idle]); 8444 8445 redo: 8446 if (!should_we_balance(&env)) { 8447 *continue_balancing = 0; 8448 goto out_balanced; 8449 } 8450 8451 group = find_busiest_group(&env); 8452 if (!group) { 8453 schedstat_inc(sd->lb_nobusyg[idle]); 8454 goto out_balanced; 8455 } 8456 8457 busiest = find_busiest_queue(&env, group); 8458 if (!busiest) { 8459 schedstat_inc(sd->lb_nobusyq[idle]); 8460 goto out_balanced; 8461 } 8462 8463 BUG_ON(busiest == env.dst_rq); 8464 8465 schedstat_add(sd->lb_imbalance[idle], env.imbalance); 8466 8467 env.src_cpu = busiest->cpu; 8468 env.src_rq = busiest; 8469 8470 ld_moved = 0; 8471 if (busiest->nr_running > 1) { 8472 /* 8473 * Attempt to move tasks. If find_busiest_group has found 8474 * an imbalance but busiest->nr_running <= 1, the group is 8475 * still unbalanced. ld_moved simply stays zero, so it is 8476 * correctly treated as an imbalance. 8477 */ 8478 env.flags |= LBF_ALL_PINNED; 8479 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 8480 8481 more_balance: 8482 rq_lock_irqsave(busiest, &rf); 8483 update_rq_clock(busiest); 8484 8485 /* 8486 * cur_ld_moved - load moved in current iteration 8487 * ld_moved - cumulative load moved across iterations 8488 */ 8489 cur_ld_moved = detach_tasks(&env); 8490 8491 /* 8492 * We've detached some tasks from busiest_rq. Every 8493 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely 8494 * unlock busiest->lock, and we are able to be sure 8495 * that nobody can manipulate the tasks in parallel. 8496 * See task_rq_lock() family for the details. 8497 */ 8498 8499 rq_unlock(busiest, &rf); 8500 8501 if (cur_ld_moved) { 8502 attach_tasks(&env); 8503 ld_moved += cur_ld_moved; 8504 } 8505 8506 local_irq_restore(rf.flags); 8507 8508 if (env.flags & LBF_NEED_BREAK) { 8509 env.flags &= ~LBF_NEED_BREAK; 8510 goto more_balance; 8511 } 8512 8513 /* 8514 * Revisit (affine) tasks on src_cpu that couldn't be moved to 8515 * us and move them to an alternate dst_cpu in our sched_group 8516 * where they can run. The upper limit on how many times we 8517 * iterate on the same src_cpu is dependent on the number of cpus in our 8518 * sched_group. 8519 * 8520 * This changes load balance semantics a bit on who can move 8521 * load to a given_cpu. In addition to the given_cpu itself 8522 * (or an ilb_cpu acting on its behalf where given_cpu is 8523 * nohz-idle), we now have balance_cpu in a position to move 8524 * load to given_cpu. In rare situations, this may cause 8525 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding 8526 * _independently_ and at the _same_ time to move some load to 8527 * given_cpu) causing excess load to be moved to given_cpu.
8528 * This however should not happen so much in practice and 8529 * moreover subsequent load balance cycles should correct the 8530 * excess load moved. 8531 */ 8532 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 8533 8534 /* Prevent to re-select dst_cpu via env's cpus */ 8535 cpumask_clear_cpu(env.dst_cpu, env.cpus); 8536 8537 env.dst_rq = cpu_rq(env.new_dst_cpu); 8538 env.dst_cpu = env.new_dst_cpu; 8539 env.flags &= ~LBF_DST_PINNED; 8540 env.loop = 0; 8541 env.loop_break = sched_nr_migrate_break; 8542 8543 /* 8544 * Go back to "more_balance" rather than "redo" since we 8545 * need to continue with same src_cpu. 8546 */ 8547 goto more_balance; 8548 } 8549 8550 /* 8551 * We failed to reach balance because of affinity. 8552 */ 8553 if (sd_parent) { 8554 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 8555 8556 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) 8557 *group_imbalance = 1; 8558 } 8559 8560 /* All tasks on this runqueue were pinned by CPU affinity */ 8561 if (unlikely(env.flags & LBF_ALL_PINNED)) { 8562 cpumask_clear_cpu(cpu_of(busiest), cpus); 8563 /* 8564 * Attempting to continue load balancing at the current 8565 * sched_domain level only makes sense if there are 8566 * active CPUs remaining as possible busiest CPUs to 8567 * pull load from which are not contained within the 8568 * destination group that is receiving any migrated 8569 * load. 8570 */ 8571 if (!cpumask_subset(cpus, env.dst_grpmask)) { 8572 env.loop = 0; 8573 env.loop_break = sched_nr_migrate_break; 8574 goto redo; 8575 } 8576 goto out_all_pinned; 8577 } 8578 } 8579 8580 if (!ld_moved) { 8581 schedstat_inc(sd->lb_failed[idle]); 8582 /* 8583 * Increment the failure counter only on periodic balance. 8584 * We do not want newidle balance, which can be very 8585 * frequent, pollute the failure counter causing 8586 * excessive cache_hot migrations and active balances. 8587 */ 8588 if (idle != CPU_NEWLY_IDLE) 8589 sd->nr_balance_failed++; 8590 8591 if (need_active_balance(&env)) { 8592 unsigned long flags; 8593 8594 raw_spin_lock_irqsave(&busiest->lock, flags); 8595 8596 /* don't kick the active_load_balance_cpu_stop, 8597 * if the curr task on busiest cpu can't be 8598 * moved to this_cpu 8599 */ 8600 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8601 raw_spin_unlock_irqrestore(&busiest->lock, 8602 flags); 8603 env.flags |= LBF_ALL_PINNED; 8604 goto out_one_pinned; 8605 } 8606 8607 /* 8608 * ->active_balance synchronizes accesses to 8609 * ->active_balance_work. Once set, it's cleared 8610 * only after active load balance is finished. 8611 */ 8612 if (!busiest->active_balance) { 8613 busiest->active_balance = 1; 8614 busiest->push_cpu = this_cpu; 8615 active_balance = 1; 8616 } 8617 raw_spin_unlock_irqrestore(&busiest->lock, flags); 8618 8619 if (active_balance) { 8620 stop_one_cpu_nowait(cpu_of(busiest), 8621 active_load_balance_cpu_stop, busiest, 8622 &busiest->active_balance_work); 8623 } 8624 8625 /* We've kicked active balancing, force task migration. */ 8626 sd->nr_balance_failed = sd->cache_nice_tries+1; 8627 } 8628 } else 8629 sd->nr_balance_failed = 0; 8630 8631 if (likely(!active_balance)) { 8632 /* We were unbalanced, so reset the balancing interval */ 8633 sd->balance_interval = sd->min_interval; 8634 } else { 8635 /* 8636 * If we've begun active balancing, start to back off. This 8637 * case may not be covered by the all_pinned logic if there 8638 * is only 1 task on the busy runqueue (because we don't call 8639 * detach_tasks). 
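 *
 * (Hypothetical illustration: with min_interval = 8ms and
 * max_interval = 128ms, repeated active-balance rounds double the
 * interval 8 -> 16 -> 32 ... up to the 128ms cap, while a later pass
 * that balances without resorting to active balancing resets it back
 * to 8ms.)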
8640 */ 8641 if (sd->balance_interval < sd->max_interval) 8642 sd->balance_interval *= 2; 8643 } 8644 8645 goto out; 8646 8647 out_balanced: 8648 /* 8649 * We reach balance although we may have faced some affinity 8650 * constraints. Clear the imbalance flag if it was set. 8651 */ 8652 if (sd_parent) { 8653 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 8654 8655 if (*group_imbalance) 8656 *group_imbalance = 0; 8657 } 8658 8659 out_all_pinned: 8660 /* 8661 * We reach balance because all tasks are pinned at this level so 8662 * we can't migrate them. Let the imbalance flag set so parent level 8663 * can try to migrate them. 8664 */ 8665 schedstat_inc(sd->lb_balanced[idle]); 8666 8667 sd->nr_balance_failed = 0; 8668 8669 out_one_pinned: 8670 /* tune up the balancing interval */ 8671 if (((env.flags & LBF_ALL_PINNED) && 8672 sd->balance_interval < MAX_PINNED_INTERVAL) || 8673 (sd->balance_interval < sd->max_interval)) 8674 sd->balance_interval *= 2; 8675 8676 ld_moved = 0; 8677 out: 8678 return ld_moved; 8679 } 8680 8681 static inline unsigned long 8682 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) 8683 { 8684 unsigned long interval = sd->balance_interval; 8685 8686 if (cpu_busy) 8687 interval *= sd->busy_factor; 8688 8689 /* scale ms to jiffies */ 8690 interval = msecs_to_jiffies(interval); 8691 interval = clamp(interval, 1UL, max_load_balance_interval); 8692 8693 return interval; 8694 } 8695 8696 static inline void 8697 update_next_balance(struct sched_domain *sd, unsigned long *next_balance) 8698 { 8699 unsigned long interval, next; 8700 8701 /* used by idle balance, so cpu_busy = 0 */ 8702 interval = get_sd_balance_interval(sd, 0); 8703 next = sd->last_balance + interval; 8704 8705 if (time_after(*next_balance, next)) 8706 *next_balance = next; 8707 } 8708 8709 /* 8710 * idle_balance is called by schedule() if this_cpu is about to become 8711 * idle. Attempts to pull tasks from other CPUs. 8712 */ 8713 static int idle_balance(struct rq *this_rq, struct rq_flags *rf) 8714 { 8715 unsigned long next_balance = jiffies + HZ; 8716 int this_cpu = this_rq->cpu; 8717 struct sched_domain *sd; 8718 int pulled_task = 0; 8719 u64 curr_cost = 0; 8720 8721 /* 8722 * We must set idle_stamp _before_ calling idle_balance(), such that we 8723 * measure the duration of idle_balance() as idle time. 8724 */ 8725 this_rq->idle_stamp = rq_clock(this_rq); 8726 8727 /* 8728 * Do not pull tasks towards !active CPUs... 8729 */ 8730 if (!cpu_active(this_cpu)) 8731 return 0; 8732 8733 /* 8734 * This is OK, because current is on_cpu, which avoids it being picked 8735 * for load-balance and preemption/IRQs are still disabled avoiding 8736 * further scheduler activity on it and we're being very careful to 8737 * re-start the picking loop. 
8738 */ 8739 rq_unpin_lock(this_rq, rf); 8740 8741 if (this_rq->avg_idle < sysctl_sched_migration_cost || 8742 !this_rq->rd->overload) { 8743 rcu_read_lock(); 8744 sd = rcu_dereference_check_sched_domain(this_rq->sd); 8745 if (sd) 8746 update_next_balance(sd, &next_balance); 8747 rcu_read_unlock(); 8748 8749 goto out; 8750 } 8751 8752 raw_spin_unlock(&this_rq->lock); 8753 8754 update_blocked_averages(this_cpu); 8755 rcu_read_lock(); 8756 for_each_domain(this_cpu, sd) { 8757 int continue_balancing = 1; 8758 u64 t0, domain_cost; 8759 8760 if (!(sd->flags & SD_LOAD_BALANCE)) 8761 continue; 8762 8763 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { 8764 update_next_balance(sd, &next_balance); 8765 break; 8766 } 8767 8768 if (sd->flags & SD_BALANCE_NEWIDLE) { 8769 t0 = sched_clock_cpu(this_cpu); 8770 8771 pulled_task = load_balance(this_cpu, this_rq, 8772 sd, CPU_NEWLY_IDLE, 8773 &continue_balancing); 8774 8775 domain_cost = sched_clock_cpu(this_cpu) - t0; 8776 if (domain_cost > sd->max_newidle_lb_cost) 8777 sd->max_newidle_lb_cost = domain_cost; 8778 8779 curr_cost += domain_cost; 8780 } 8781 8782 update_next_balance(sd, &next_balance); 8783 8784 /* 8785 * Stop searching for tasks to pull if there are 8786 * now runnable tasks on this rq. 8787 */ 8788 if (pulled_task || this_rq->nr_running > 0) 8789 break; 8790 } 8791 rcu_read_unlock(); 8792 8793 raw_spin_lock(&this_rq->lock); 8794 8795 if (curr_cost > this_rq->max_idle_balance_cost) 8796 this_rq->max_idle_balance_cost = curr_cost; 8797 8798 /* 8799 * While browsing the domains, we released the rq lock, a task could 8800 * have been enqueued in the meantime. Since we're not going idle, 8801 * pretend we pulled a task. 8802 */ 8803 if (this_rq->cfs.h_nr_running && !pulled_task) 8804 pulled_task = 1; 8805 8806 out: 8807 /* Move the next balance forward */ 8808 if (time_after(this_rq->next_balance, next_balance)) 8809 this_rq->next_balance = next_balance; 8810 8811 /* Is there a task of a high priority class? */ 8812 if (this_rq->nr_running != this_rq->cfs.h_nr_running) 8813 pulled_task = -1; 8814 8815 if (pulled_task) 8816 this_rq->idle_stamp = 0; 8817 8818 rq_repin_lock(this_rq, rf); 8819 8820 return pulled_task; 8821 } 8822 8823 /* 8824 * active_load_balance_cpu_stop is run by cpu stopper. It pushes 8825 * running tasks off the busiest CPU onto idle CPUs. It requires at 8826 * least 1 task to be running on each physical CPU where possible, and 8827 * avoids physical / logical imbalances. 8828 */ 8829 static int active_load_balance_cpu_stop(void *data) 8830 { 8831 struct rq *busiest_rq = data; 8832 int busiest_cpu = cpu_of(busiest_rq); 8833 int target_cpu = busiest_rq->push_cpu; 8834 struct rq *target_rq = cpu_rq(target_cpu); 8835 struct sched_domain *sd; 8836 struct task_struct *p = NULL; 8837 struct rq_flags rf; 8838 8839 rq_lock_irq(busiest_rq, &rf); 8840 /* 8841 * Between queueing the stop-work and running it is a hole in which 8842 * CPUs can become inactive. We should not move tasks from or to 8843 * inactive CPUs. 8844 */ 8845 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 8846 goto out_unlock; 8847 8848 /* make sure the requested cpu hasn't gone down in the meantime */ 8849 if (unlikely(busiest_cpu != smp_processor_id() || 8850 !busiest_rq->active_balance)) 8851 goto out_unlock; 8852 8853 /* Is there any task to move? */ 8854 if (busiest_rq->nr_running <= 1) 8855 goto out_unlock; 8856 8857 /* 8858 * This condition is "impossible", if it occurs 8859 * we need to fix it. 
Originally reported by 8860 * Bjorn Helgaas on a 128-cpu setup. 8861 */ 8862 BUG_ON(busiest_rq == target_rq); 8863 8864 /* Search for an sd spanning us and the target CPU. */ 8865 rcu_read_lock(); 8866 for_each_domain(target_cpu, sd) { 8867 if ((sd->flags & SD_LOAD_BALANCE) && 8868 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 8869 break; 8870 } 8871 8872 if (likely(sd)) { 8873 struct lb_env env = { 8874 .sd = sd, 8875 .dst_cpu = target_cpu, 8876 .dst_rq = target_rq, 8877 .src_cpu = busiest_rq->cpu, 8878 .src_rq = busiest_rq, 8879 .idle = CPU_IDLE, 8880 /* 8881 * can_migrate_task() doesn't need to compute new_dst_cpu 8882 * for active balancing. Since we have CPU_IDLE, but no 8883 * @dst_grpmask we need to make that test go away with lying 8884 * about DST_PINNED. 8885 */ 8886 .flags = LBF_DST_PINNED, 8887 }; 8888 8889 schedstat_inc(sd->alb_count); 8890 update_rq_clock(busiest_rq); 8891 8892 p = detach_one_task(&env); 8893 if (p) { 8894 schedstat_inc(sd->alb_pushed); 8895 /* Active balancing done, reset the failure counter. */ 8896 sd->nr_balance_failed = 0; 8897 } else { 8898 schedstat_inc(sd->alb_failed); 8899 } 8900 } 8901 rcu_read_unlock(); 8902 out_unlock: 8903 busiest_rq->active_balance = 0; 8904 rq_unlock(busiest_rq, &rf); 8905 8906 if (p) 8907 attach_one_task(target_rq, p); 8908 8909 local_irq_enable(); 8910 8911 return 0; 8912 } 8913 8914 static inline int on_null_domain(struct rq *rq) 8915 { 8916 return unlikely(!rcu_dereference_sched(rq->sd)); 8917 } 8918 8919 #ifdef CONFIG_NO_HZ_COMMON 8920 /* 8921 * idle load balancing details 8922 * - When one of the busy CPUs notices that idle rebalancing may be 8923 * needed, it will kick the idle load balancer, which then does idle 8924 * load balancing for all the idle CPUs. 8925 */ 8926 static struct { 8927 cpumask_var_t idle_cpus_mask; 8928 atomic_t nr_cpus; 8929 unsigned long next_balance; /* in jiffy units */ 8930 } nohz ____cacheline_aligned; 8931 8932 static inline int find_new_ilb(void) 8933 { 8934 int ilb = cpumask_first(nohz.idle_cpus_mask); 8935 8936 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 8937 return ilb; 8938 8939 return nr_cpu_ids; 8940 } 8941 8942 /* 8943 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 8944 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle 8945 * CPU (if there is one). 8946 */ 8947 static void nohz_balancer_kick(void) 8948 { 8949 int ilb_cpu; 8950 8951 nohz.next_balance++; 8952 8953 ilb_cpu = find_new_ilb(); 8954 8955 if (ilb_cpu >= nr_cpu_ids) 8956 return; 8957 8958 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) 8959 return; 8960 /* 8961 * Use smp_send_reschedule() instead of resched_cpu(). 8962 * This way we generate a sched IPI on the target cpu which 8963 * is idle. And the softirq performing nohz idle load balance 8964 * will be run before returning from the IPI. 8965 */ 8966 smp_send_reschedule(ilb_cpu); 8967 return; 8968 } 8969 8970 void nohz_balance_exit_idle(unsigned int cpu) 8971 { 8972 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 8973 /* 8974 * Completely isolated CPUs don't ever set their bit in nohz.idle_cpus_mask, so we must test it.
8975 */ 8976 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { 8977 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 8978 atomic_dec(&nohz.nr_cpus); 8979 } 8980 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 8981 } 8982 } 8983 8984 static inline void set_cpu_sd_state_busy(void) 8985 { 8986 struct sched_domain *sd; 8987 int cpu = smp_processor_id(); 8988 8989 rcu_read_lock(); 8990 sd = rcu_dereference(per_cpu(sd_llc, cpu)); 8991 8992 if (!sd || !sd->nohz_idle) 8993 goto unlock; 8994 sd->nohz_idle = 0; 8995 8996 atomic_inc(&sd->shared->nr_busy_cpus); 8997 unlock: 8998 rcu_read_unlock(); 8999 } 9000 9001 void set_cpu_sd_state_idle(void) 9002 { 9003 struct sched_domain *sd; 9004 int cpu = smp_processor_id(); 9005 9006 rcu_read_lock(); 9007 sd = rcu_dereference(per_cpu(sd_llc, cpu)); 9008 9009 if (!sd || sd->nohz_idle) 9010 goto unlock; 9011 sd->nohz_idle = 1; 9012 9013 atomic_dec(&sd->shared->nr_busy_cpus); 9014 unlock: 9015 rcu_read_unlock(); 9016 } 9017 9018 /* 9019 * This routine will record that the cpu is going idle with tick stopped. 9020 * This info will be used in performing idle load balancing in the future. 9021 */ 9022 void nohz_balance_enter_idle(int cpu) 9023 { 9024 /* 9025 * If this cpu is going down, then nothing needs to be done. 9026 */ 9027 if (!cpu_active(cpu)) 9028 return; 9029 9030 /* Spare idle load balancing on CPUs that don't want to be disturbed: */ 9031 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) 9032 return; 9033 9034 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 9035 return; 9036 9037 /* 9038 * If we're a completely isolated CPU, we don't play. 9039 */ 9040 if (on_null_domain(cpu_rq(cpu))) 9041 return; 9042 9043 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 9044 atomic_inc(&nohz.nr_cpus); 9045 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 9046 } 9047 #endif 9048 9049 static DEFINE_SPINLOCK(balancing); 9050 9051 /* 9052 * Scale the max load_balance interval with the number of CPUs in the system. 9053 * This trades load-balance latency on larger machines for less cross talk. 9054 */ 9055 void update_max_interval(void) 9056 { 9057 max_load_balance_interval = HZ*num_online_cpus()/10; 9058 } 9059 9060 /* 9061 * It checks each scheduling domain to see if it is due to be balanced, 9062 * and initiates a balancing operation if so. 9063 * 9064 * Balancing parameters are set up in init_sched_domains. 9065 */ 9066 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) 9067 { 9068 int continue_balancing = 1; 9069 int cpu = rq->cpu; 9070 unsigned long interval; 9071 struct sched_domain *sd; 9072 /* Earliest time when we have to do rebalance again */ 9073 unsigned long next_balance = jiffies + 60*HZ; 9074 int update_next_balance = 0; 9075 int need_serialize, need_decay = 0; 9076 u64 max_cost = 0; 9077 9078 update_blocked_averages(cpu); 9079 9080 rcu_read_lock(); 9081 for_each_domain(cpu, sd) { 9082 /* 9083 * Decay the newidle max times here because this is a regular 9084 * visit to all the domains. Decay ~1% per second. 9085 */ 9086 if (time_after(jiffies, sd->next_decay_max_lb_cost)) { 9087 sd->max_newidle_lb_cost = 9088 (sd->max_newidle_lb_cost * 253) / 256; 9089 sd->next_decay_max_lb_cost = jiffies + HZ; 9090 need_decay = 1; 9091 } 9092 max_cost += sd->max_newidle_lb_cost; 9093 9094 if (!(sd->flags & SD_LOAD_BALANCE)) 9095 continue; 9096 9097 /* 9098 * Stop the load balance at this level. There is another 9099 * CPU in our sched group which is doing load balancing more 9100 * actively. 
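 *
 * (continue_balancing is cleared by load_balance() when
 * should_we_balance() decides another CPU in the group is the
 * designated balancer; we still walk the remaining domains when
 * need_decay is set so that the max_newidle_lb_cost decay above keeps
 * happening.)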
9101 */ 9102 if (!continue_balancing) { 9103 if (need_decay) 9104 continue; 9105 break; 9106 } 9107 9108 interval = get_sd_balance_interval(sd, idle != CPU_IDLE); 9109 9110 need_serialize = sd->flags & SD_SERIALIZE; 9111 if (need_serialize) { 9112 if (!spin_trylock(&balancing)) 9113 goto out; 9114 } 9115 9116 if (time_after_eq(jiffies, sd->last_balance + interval)) { 9117 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 9118 /* 9119 * The LBF_DST_PINNED logic could have changed 9120 * env->dst_cpu, so we can't know our idle 9121 * state even if we migrated tasks. Update it. 9122 */ 9123 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 9124 } 9125 sd->last_balance = jiffies; 9126 interval = get_sd_balance_interval(sd, idle != CPU_IDLE); 9127 } 9128 if (need_serialize) 9129 spin_unlock(&balancing); 9130 out: 9131 if (time_after(next_balance, sd->last_balance + interval)) { 9132 next_balance = sd->last_balance + interval; 9133 update_next_balance = 1; 9134 } 9135 } 9136 if (need_decay) { 9137 /* 9138 * Ensure the rq-wide value also decays but keep it at a 9139 * reasonable floor to avoid funnies with rq->avg_idle. 9140 */ 9141 rq->max_idle_balance_cost = 9142 max((u64)sysctl_sched_migration_cost, max_cost); 9143 } 9144 rcu_read_unlock(); 9145 9146 /* 9147 * next_balance will be updated only when there is a need. 9148 * When the cpu is attached to null domain for ex, it will not be 9149 * updated. 9150 */ 9151 if (likely(update_next_balance)) { 9152 rq->next_balance = next_balance; 9153 9154 #ifdef CONFIG_NO_HZ_COMMON 9155 /* 9156 * If this CPU has been elected to perform the nohz idle 9157 * balance. Other idle CPUs have already rebalanced with 9158 * nohz_idle_balance() and nohz.next_balance has been 9159 * updated accordingly. This CPU is now running the idle load 9160 * balance for itself and we need to update the 9161 * nohz.next_balance accordingly. 9162 */ 9163 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) 9164 nohz.next_balance = rq->next_balance; 9165 #endif 9166 } 9167 } 9168 9169 #ifdef CONFIG_NO_HZ_COMMON 9170 /* 9171 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 9172 * rebalancing for all the cpus for whom scheduler ticks are stopped. 9173 */ 9174 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 9175 { 9176 int this_cpu = this_rq->cpu; 9177 struct rq *rq; 9178 int balance_cpu; 9179 /* Earliest time when we have to do rebalance again */ 9180 unsigned long next_balance = jiffies + 60*HZ; 9181 int update_next_balance = 0; 9182 9183 if (idle != CPU_IDLE || 9184 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) 9185 goto end; 9186 9187 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 9188 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) 9189 continue; 9190 9191 /* 9192 * If this cpu gets work to do, stop the load balancing 9193 * work being done for other cpus. Next load 9194 * balancing owner will pick it up. 9195 */ 9196 if (need_resched()) 9197 break; 9198 9199 rq = cpu_rq(balance_cpu); 9200 9201 /* 9202 * If time for next balance is due, 9203 * do the balance. 
9204 */ 9205 if (time_after_eq(jiffies, rq->next_balance)) { 9206 struct rq_flags rf; 9207 9208 rq_lock_irq(rq, &rf); 9209 update_rq_clock(rq); 9210 cpu_load_update_idle(rq); 9211 rq_unlock_irq(rq, &rf); 9212 9213 rebalance_domains(rq, CPU_IDLE); 9214 } 9215 9216 if (time_after(next_balance, rq->next_balance)) { 9217 next_balance = rq->next_balance; 9218 update_next_balance = 1; 9219 } 9220 } 9221 9222 /* 9223 * next_balance will be updated only when there is a need. 9224 * When the CPU is attached to null domain for ex, it will not be 9225 * updated. 9226 */ 9227 if (likely(update_next_balance)) 9228 nohz.next_balance = next_balance; 9229 end: 9230 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); 9231 } 9232 9233 /* 9234 * Current heuristic for kicking the idle load balancer in the presence 9235 * of an idle cpu in the system. 9236 * - This rq has more than one task. 9237 * - This rq has at least one CFS task and the capacity of the CPU is 9238 * significantly reduced because of RT tasks or IRQs. 9239 * - At parent of LLC scheduler domain level, this cpu's scheduler group has 9240 * multiple busy cpu. 9241 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 9242 * domain span are idle. 9243 */ 9244 static inline bool nohz_kick_needed(struct rq *rq) 9245 { 9246 unsigned long now = jiffies; 9247 struct sched_domain_shared *sds; 9248 struct sched_domain *sd; 9249 int nr_busy, i, cpu = rq->cpu; 9250 bool kick = false; 9251 9252 if (unlikely(rq->idle_balance)) 9253 return false; 9254 9255 /* 9256 * We may be recently in ticked or tickless idle mode. At the first 9257 * busy tick after returning from idle, we will update the busy stats. 9258 */ 9259 set_cpu_sd_state_busy(); 9260 nohz_balance_exit_idle(cpu); 9261 9262 /* 9263 * None are in tickless mode and hence no need for NOHZ idle load 9264 * balancing. 9265 */ 9266 if (likely(!atomic_read(&nohz.nr_cpus))) 9267 return false; 9268 9269 if (time_before(now, nohz.next_balance)) 9270 return false; 9271 9272 if (rq->nr_running >= 2) 9273 return true; 9274 9275 rcu_read_lock(); 9276 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 9277 if (sds) { 9278 /* 9279 * XXX: write a coherent comment on why we do this. 9280 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com 9281 */ 9282 nr_busy = atomic_read(&sds->nr_busy_cpus); 9283 if (nr_busy > 1) { 9284 kick = true; 9285 goto unlock; 9286 } 9287 9288 } 9289 9290 sd = rcu_dereference(rq->sd); 9291 if (sd) { 9292 if ((rq->cfs.h_nr_running >= 1) && 9293 check_cpu_capacity(rq, sd)) { 9294 kick = true; 9295 goto unlock; 9296 } 9297 } 9298 9299 sd = rcu_dereference(per_cpu(sd_asym, cpu)); 9300 if (sd) { 9301 for_each_cpu(i, sched_domain_span(sd)) { 9302 if (i == cpu || 9303 !cpumask_test_cpu(i, nohz.idle_cpus_mask)) 9304 continue; 9305 9306 if (sched_asym_prefer(i, cpu)) { 9307 kick = true; 9308 goto unlock; 9309 } 9310 } 9311 } 9312 unlock: 9313 rcu_read_unlock(); 9314 return kick; 9315 } 9316 #else 9317 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } 9318 #endif 9319 9320 /* 9321 * run_rebalance_domains is triggered when needed from the scheduler tick. 9322 * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 9323 */ 9324 static __latent_entropy void run_rebalance_domains(struct softirq_action *h) 9325 { 9326 struct rq *this_rq = this_rq(); 9327 enum cpu_idle_type idle = this_rq->idle_balance ? 
9328 CPU_IDLE : CPU_NOT_IDLE; 9329 9330 /* 9331 * If this cpu has a pending nohz_balance_kick, then do the 9332 * balancing on behalf of the other idle cpus whose ticks are 9333 * stopped. Do nohz_idle_balance *before* rebalance_domains to 9334 * give the idle cpus a chance to load balance. Else we may 9335 * load balance only within the local sched_domain hierarchy 9336 * and abort nohz_idle_balance altogether if we pull some load. 9337 */ 9338 nohz_idle_balance(this_rq, idle); 9339 rebalance_domains(this_rq, idle); 9340 } 9341 9342 /* 9343 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 9344 */ 9345 void trigger_load_balance(struct rq *rq) 9346 { 9347 /* Don't need to rebalance while attached to NULL domain */ 9348 if (unlikely(on_null_domain(rq))) 9349 return; 9350 9351 if (time_after_eq(jiffies, rq->next_balance)) 9352 raise_softirq(SCHED_SOFTIRQ); 9353 #ifdef CONFIG_NO_HZ_COMMON 9354 if (nohz_kick_needed(rq)) 9355 nohz_balancer_kick(); 9356 #endif 9357 } 9358 9359 static void rq_online_fair(struct rq *rq) 9360 { 9361 update_sysctl(); 9362 9363 update_runtime_enabled(rq); 9364 } 9365 9366 static void rq_offline_fair(struct rq *rq) 9367 { 9368 update_sysctl(); 9369 9370 /* Ensure any throttled groups are reachable by pick_next_task */ 9371 unthrottle_offline_cfs_rqs(rq); 9372 } 9373 9374 #endif /* CONFIG_SMP */ 9375 9376 /* 9377 * scheduler tick hitting a task of our scheduling class: 9378 */ 9379 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 9380 { 9381 struct cfs_rq *cfs_rq; 9382 struct sched_entity *se = &curr->se; 9383 9384 for_each_sched_entity(se) { 9385 cfs_rq = cfs_rq_of(se); 9386 entity_tick(cfs_rq, se, queued); 9387 } 9388 9389 if (static_branch_unlikely(&sched_numa_balancing)) 9390 task_tick_numa(rq, curr); 9391 } 9392 9393 /* 9394 * called on fork with the child task as argument from the parent's context 9395 * - child not yet on the tasklist 9396 * - preemption disabled 9397 */ 9398 static void task_fork_fair(struct task_struct *p) 9399 { 9400 struct cfs_rq *cfs_rq; 9401 struct sched_entity *se = &p->se, *curr; 9402 struct rq *rq = this_rq(); 9403 struct rq_flags rf; 9404 9405 rq_lock(rq, &rf); 9406 update_rq_clock(rq); 9407 9408 cfs_rq = task_cfs_rq(current); 9409 curr = cfs_rq->curr; 9410 if (curr) { 9411 update_curr(cfs_rq); 9412 se->vruntime = curr->vruntime; 9413 } 9414 place_entity(cfs_rq, se, 1); 9415 9416 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { 9417 /* 9418 * Upon rescheduling, sched_class::put_prev_task() will place 9419 * 'current' within the tree based on its new key value. 9420 */ 9421 swap(curr->vruntime, se->vruntime); 9422 resched_curr(rq); 9423 } 9424 9425 se->vruntime -= cfs_rq->min_vruntime; 9426 rq_unlock(rq, &rf); 9427 } 9428 9429 /* 9430 * Priority of the task has changed. Check to see if we preempt 9431 * the current task. 
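 *
 * (Illustrative: renicing the currently running task from -5 to 0 raises
 * p->prio above oldprio and forces a resched; renicing some other queued
 * task instead goes through check_preempt_curr() to see whether it
 * should now preempt the running task.)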
9432 */ 9433 static void 9434 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 9435 { 9436 if (!task_on_rq_queued(p)) 9437 return; 9438 9439 /* 9440 * Reschedule if we are currently running on this runqueue and 9441 * our priority decreased, or if we are not currently running on 9442 * this runqueue and our priority is higher than the current's 9443 */ 9444 if (rq->curr == p) { 9445 if (p->prio > oldprio) 9446 resched_curr(rq); 9447 } else 9448 check_preempt_curr(rq, p, 0); 9449 } 9450 9451 static inline bool vruntime_normalized(struct task_struct *p) 9452 { 9453 struct sched_entity *se = &p->se; 9454 9455 /* 9456 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, 9457 * the dequeue_entity(.flags=0) will already have normalized the 9458 * vruntime. 9459 */ 9460 if (p->on_rq) 9461 return true; 9462 9463 /* 9464 * When !on_rq, vruntime of the task has usually NOT been normalized. 9465 * But there are some cases where it has already been normalized: 9466 * 9467 * - A forked child which is waiting for being woken up by 9468 * wake_up_new_task(). 9469 * - A task which has been woken up by try_to_wake_up() and 9470 * waiting for actually being woken up by sched_ttwu_pending(). 9471 */ 9472 if (!se->sum_exec_runtime || p->state == TASK_WAKING) 9473 return true; 9474 9475 return false; 9476 } 9477 9478 #ifdef CONFIG_FAIR_GROUP_SCHED 9479 /* 9480 * Propagate the changes of the sched_entity across the tg tree to make it 9481 * visible to the root 9482 */ 9483 static void propagate_entity_cfs_rq(struct sched_entity *se) 9484 { 9485 struct cfs_rq *cfs_rq; 9486 9487 /* Start to propagate at parent */ 9488 se = se->parent; 9489 9490 for_each_sched_entity(se) { 9491 cfs_rq = cfs_rq_of(se); 9492 9493 if (cfs_rq_throttled(cfs_rq)) 9494 break; 9495 9496 update_load_avg(cfs_rq, se, UPDATE_TG); 9497 } 9498 } 9499 #else 9500 static void propagate_entity_cfs_rq(struct sched_entity *se) { } 9501 #endif 9502 9503 static void detach_entity_cfs_rq(struct sched_entity *se) 9504 { 9505 struct cfs_rq *cfs_rq = cfs_rq_of(se); 9506 9507 /* Catch up with the cfs_rq and remove our load when we leave */ 9508 update_load_avg(cfs_rq, se, 0); 9509 detach_entity_load_avg(cfs_rq, se); 9510 update_tg_load_avg(cfs_rq, false); 9511 propagate_entity_cfs_rq(se); 9512 } 9513 9514 static void attach_entity_cfs_rq(struct sched_entity *se) 9515 { 9516 struct cfs_rq *cfs_rq = cfs_rq_of(se); 9517 9518 #ifdef CONFIG_FAIR_GROUP_SCHED 9519 /* 9520 * Since the real-depth could have been changed (only FAIR 9521 * class maintain depth value), reset depth properly. 9522 */ 9523 se->depth = se->parent ? se->parent->depth + 1 : 0; 9524 #endif 9525 9526 /* Synchronize entity with its cfs_rq */ 9527 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); 9528 attach_entity_load_avg(cfs_rq, se); 9529 update_tg_load_avg(cfs_rq, false); 9530 propagate_entity_cfs_rq(se); 9531 } 9532 9533 static void detach_task_cfs_rq(struct task_struct *p) 9534 { 9535 struct sched_entity *se = &p->se; 9536 struct cfs_rq *cfs_rq = cfs_rq_of(se); 9537 9538 if (!vruntime_normalized(p)) { 9539 /* 9540 * Fix up our vruntime so that the current sleep doesn't 9541 * cause 'unlimited' sleep bonus. 
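 *
 * (In other words: place_entity() below re-anchors se->vruntime near the
 * cfs_rq's min_vruntime before it is made relative, so a task that
 * re-attaches much later does not come back with an arbitrarily small
 * vruntime and hog the CPU.)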
9542 */ 9543 place_entity(cfs_rq, se, 0); 9544 se->vruntime -= cfs_rq->min_vruntime; 9545 } 9546 9547 detach_entity_cfs_rq(se); 9548 } 9549 9550 static void attach_task_cfs_rq(struct task_struct *p) 9551 { 9552 struct sched_entity *se = &p->se; 9553 struct cfs_rq *cfs_rq = cfs_rq_of(se); 9554 9555 attach_entity_cfs_rq(se); 9556 9557 if (!vruntime_normalized(p)) 9558 se->vruntime += cfs_rq->min_vruntime; 9559 } 9560 9561 static void switched_from_fair(struct rq *rq, struct task_struct *p) 9562 { 9563 detach_task_cfs_rq(p); 9564 } 9565 9566 static void switched_to_fair(struct rq *rq, struct task_struct *p) 9567 { 9568 attach_task_cfs_rq(p); 9569 9570 if (task_on_rq_queued(p)) { 9571 /* 9572 * We were most likely switched from sched_rt, so 9573 * kick off the schedule if running, otherwise just see 9574 * if we can still preempt the current task. 9575 */ 9576 if (rq->curr == p) 9577 resched_curr(rq); 9578 else 9579 check_preempt_curr(rq, p, 0); 9580 } 9581 } 9582 9583 /* Account for a task changing its policy or group. 9584 * 9585 * This routine is mostly called to set cfs_rq->curr field when a task 9586 * migrates between groups/classes. 9587 */ 9588 static void set_curr_task_fair(struct rq *rq) 9589 { 9590 struct sched_entity *se = &rq->curr->se; 9591 9592 for_each_sched_entity(se) { 9593 struct cfs_rq *cfs_rq = cfs_rq_of(se); 9594 9595 set_next_entity(cfs_rq, se); 9596 /* ensure bandwidth has been allocated on our new cfs_rq */ 9597 account_cfs_rq_runtime(cfs_rq, 0); 9598 } 9599 } 9600 9601 void init_cfs_rq(struct cfs_rq *cfs_rq) 9602 { 9603 cfs_rq->tasks_timeline = RB_ROOT_CACHED; 9604 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 9605 #ifndef CONFIG_64BIT 9606 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 9607 #endif 9608 #ifdef CONFIG_SMP 9609 raw_spin_lock_init(&cfs_rq->removed.lock); 9610 #endif 9611 } 9612 9613 #ifdef CONFIG_FAIR_GROUP_SCHED 9614 static void task_set_group_fair(struct task_struct *p) 9615 { 9616 struct sched_entity *se = &p->se; 9617 9618 set_task_rq(p, task_cpu(p)); 9619 se->depth = se->parent ? 
se->parent->depth + 1 : 0; 9620 } 9621 9622 static void task_move_group_fair(struct task_struct *p) 9623 { 9624 detach_task_cfs_rq(p); 9625 set_task_rq(p, task_cpu(p)); 9626 9627 #ifdef CONFIG_SMP 9628 /* Tell se's cfs_rq has been changed -- migrated */ 9629 p->se.avg.last_update_time = 0; 9630 #endif 9631 attach_task_cfs_rq(p); 9632 } 9633 9634 static void task_change_group_fair(struct task_struct *p, int type) 9635 { 9636 switch (type) { 9637 case TASK_SET_GROUP: 9638 task_set_group_fair(p); 9639 break; 9640 9641 case TASK_MOVE_GROUP: 9642 task_move_group_fair(p); 9643 break; 9644 } 9645 } 9646 9647 void free_fair_sched_group(struct task_group *tg) 9648 { 9649 int i; 9650 9651 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); 9652 9653 for_each_possible_cpu(i) { 9654 if (tg->cfs_rq) 9655 kfree(tg->cfs_rq[i]); 9656 if (tg->se) 9657 kfree(tg->se[i]); 9658 } 9659 9660 kfree(tg->cfs_rq); 9661 kfree(tg->se); 9662 } 9663 9664 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9665 { 9666 struct sched_entity *se; 9667 struct cfs_rq *cfs_rq; 9668 int i; 9669 9670 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 9671 if (!tg->cfs_rq) 9672 goto err; 9673 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 9674 if (!tg->se) 9675 goto err; 9676 9677 tg->shares = NICE_0_LOAD; 9678 9679 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 9680 9681 for_each_possible_cpu(i) { 9682 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 9683 GFP_KERNEL, cpu_to_node(i)); 9684 if (!cfs_rq) 9685 goto err; 9686 9687 se = kzalloc_node(sizeof(struct sched_entity), 9688 GFP_KERNEL, cpu_to_node(i)); 9689 if (!se) 9690 goto err_free_rq; 9691 9692 init_cfs_rq(cfs_rq); 9693 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 9694 init_entity_runnable_average(se); 9695 } 9696 9697 return 1; 9698 9699 err_free_rq: 9700 kfree(cfs_rq); 9701 err: 9702 return 0; 9703 } 9704 9705 void online_fair_sched_group(struct task_group *tg) 9706 { 9707 struct sched_entity *se; 9708 struct rq *rq; 9709 int i; 9710 9711 for_each_possible_cpu(i) { 9712 rq = cpu_rq(i); 9713 se = tg->se[i]; 9714 9715 raw_spin_lock_irq(&rq->lock); 9716 update_rq_clock(rq); 9717 attach_entity_cfs_rq(se); 9718 sync_throttle(tg, i); 9719 raw_spin_unlock_irq(&rq->lock); 9720 } 9721 } 9722 9723 void unregister_fair_sched_group(struct task_group *tg) 9724 { 9725 unsigned long flags; 9726 struct rq *rq; 9727 int cpu; 9728 9729 for_each_possible_cpu(cpu) { 9730 if (tg->se[cpu]) 9731 remove_entity_load_avg(tg->se[cpu]); 9732 9733 /* 9734 * Only empty task groups can be destroyed; so we can speculatively 9735 * check on_list without danger of it being re-added. 
9736 */ 9737 if (!tg->cfs_rq[cpu]->on_list) 9738 continue; 9739 9740 rq = cpu_rq(cpu); 9741 9742 raw_spin_lock_irqsave(&rq->lock, flags); 9743 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 9744 raw_spin_unlock_irqrestore(&rq->lock, flags); 9745 } 9746 } 9747 9748 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 9749 struct sched_entity *se, int cpu, 9750 struct sched_entity *parent) 9751 { 9752 struct rq *rq = cpu_rq(cpu); 9753 9754 cfs_rq->tg = tg; 9755 cfs_rq->rq = rq; 9756 init_cfs_rq_runtime(cfs_rq); 9757 9758 tg->cfs_rq[cpu] = cfs_rq; 9759 tg->se[cpu] = se; 9760 9761 /* se could be NULL for root_task_group */ 9762 if (!se) 9763 return; 9764 9765 if (!parent) { 9766 se->cfs_rq = &rq->cfs; 9767 se->depth = 0; 9768 } else { 9769 se->cfs_rq = parent->my_q; 9770 se->depth = parent->depth + 1; 9771 } 9772 9773 se->my_q = cfs_rq; 9774 /* guarantee group entities always have weight */ 9775 update_load_set(&se->load, NICE_0_LOAD); 9776 se->parent = parent; 9777 } 9778 9779 static DEFINE_MUTEX(shares_mutex); 9780 9781 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 9782 { 9783 int i; 9784 9785 /* 9786 * We can't change the weight of the root cgroup. 9787 */ 9788 if (!tg->se[0]) 9789 return -EINVAL; 9790 9791 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); 9792 9793 mutex_lock(&shares_mutex); 9794 if (tg->shares == shares) 9795 goto done; 9796 9797 tg->shares = shares; 9798 for_each_possible_cpu(i) { 9799 struct rq *rq = cpu_rq(i); 9800 struct sched_entity *se = tg->se[i]; 9801 struct rq_flags rf; 9802 9803 /* Propagate contribution to hierarchy */ 9804 rq_lock_irqsave(rq, &rf); 9805 update_rq_clock(rq); 9806 for_each_sched_entity(se) { 9807 update_load_avg(cfs_rq_of(se), se, UPDATE_TG); 9808 update_cfs_group(se); 9809 } 9810 rq_unlock_irqrestore(rq, &rf); 9811 } 9812 9813 done: 9814 mutex_unlock(&shares_mutex); 9815 return 0; 9816 } 9817 #else /* CONFIG_FAIR_GROUP_SCHED */ 9818 9819 void free_fair_sched_group(struct task_group *tg) { } 9820 9821 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9822 { 9823 return 1; 9824 } 9825 9826 void online_fair_sched_group(struct task_group *tg) { } 9827 9828 void unregister_fair_sched_group(struct task_group *tg) { } 9829 9830 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9831 9832 9833 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 9834 { 9835 struct sched_entity *se = &task->se; 9836 unsigned int rr_interval = 0; 9837 9838 /* 9839 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 9840 * idle runqueue: 9841 */ 9842 if (rq->cfs.load.weight) 9843 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); 9844 9845 return rr_interval; 9846 } 9847 9848 /* 9849 * All the scheduling class methods: 9850 */ 9851 const struct sched_class fair_sched_class = { 9852 .next = &idle_sched_class, 9853 .enqueue_task = enqueue_task_fair, 9854 .dequeue_task = dequeue_task_fair, 9855 .yield_task = yield_task_fair, 9856 .yield_to_task = yield_to_task_fair, 9857 9858 .check_preempt_curr = check_preempt_wakeup, 9859 9860 .pick_next_task = pick_next_task_fair, 9861 .put_prev_task = put_prev_task_fair, 9862 9863 #ifdef CONFIG_SMP 9864 .select_task_rq = select_task_rq_fair, 9865 .migrate_task_rq = migrate_task_rq_fair, 9866 9867 .rq_online = rq_online_fair, 9868 .rq_offline = rq_offline_fair, 9869 9870 .task_dead = task_dead_fair, 9871 .set_cpus_allowed = set_cpus_allowed_common, 9872 #endif 9873 9874 .set_curr_task = set_curr_task_fair, 9875 
.task_tick = task_tick_fair, 9876 .task_fork = task_fork_fair, 9877 9878 .prio_changed = prio_changed_fair, 9879 .switched_from = switched_from_fair, 9880 .switched_to = switched_to_fair, 9881 9882 .get_rr_interval = get_rr_interval_fair, 9883 9884 .update_curr = update_curr_fair, 9885 9886 #ifdef CONFIG_FAIR_GROUP_SCHED 9887 .task_change_group = task_change_group_fair, 9888 #endif 9889 }; 9890 9891 #ifdef CONFIG_SCHED_DEBUG 9892 void print_cfs_stats(struct seq_file *m, int cpu) 9893 { 9894 struct cfs_rq *cfs_rq, *pos; 9895 9896 rcu_read_lock(); 9897 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) 9898 print_cfs_rq(m, cpu, cfs_rq); 9899 rcu_read_unlock(); 9900 } 9901 9902 #ifdef CONFIG_NUMA_BALANCING 9903 void show_numa_stats(struct task_struct *p, struct seq_file *m) 9904 { 9905 int node; 9906 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; 9907 9908 for_each_online_node(node) { 9909 if (p->numa_faults) { 9910 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; 9911 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; 9912 } 9913 if (p->numa_group) { 9914 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], 9915 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; 9916 } 9917 print_numa_stats(m, node, tsf, tpf, gsf, gpf); 9918 } 9919 } 9920 #endif /* CONFIG_NUMA_BALANCING */ 9921 #endif /* CONFIG_SCHED_DEBUG */ 9922 9923 __init void init_sched_fair_class(void) 9924 { 9925 #ifdef CONFIG_SMP 9926 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 9927 9928 #ifdef CONFIG_NO_HZ_COMMON 9929 nohz.next_balance = jiffies; 9930 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 9931 #endif 9932 #endif /* SMP */ 9933 9934 } 9935