xref: /openbmc/linux/kernel/sched/fair.c (revision f0702555)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
21  */
22 
23 #include <linux/sched.h>
24 #include <linux/latencytop.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 
34 #include <trace/events/sched.h>
35 
36 #include "sched.h"
37 
38 /*
39  * Targeted preemption latency for CPU-bound tasks:
40  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
41  *
42  * NOTE: this latency value is not the same as the concept of
43  * 'timeslice length' - timeslices in CFS are of variable length
44  * and have no persistent notion like in traditional, time-slice
45  * and there is no persistent notion of a timeslice as in traditional,
46  * time-slice based scheduling concepts.
47  * (to see the precise effective timeslice length of your workload,
48  *  run vmstat and monitor the context-switches (cs) field)
49  */
50 unsigned int sysctl_sched_latency = 6000000ULL;
51 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
52 
53 /*
54  * The initial- and re-scaling of tunables is configurable
55  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
56  *
57  * Options are:
58  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
59  * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
60  * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
61  */
62 enum sched_tunable_scaling sysctl_sched_tunable_scaling
63 	= SCHED_TUNABLESCALING_LOG;
64 
65 /*
66  * Minimal preemption granularity for CPU-bound tasks:
67  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
68  */
69 unsigned int sysctl_sched_min_granularity = 750000ULL;
70 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
71 
72 /*
73  * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
74  */
75 static unsigned int sched_nr_latency = 8;
76 
77 /*
78  * After fork, child runs first. If set to 0 (default) then
79  * parent will (try to) run first.
80  */
81 unsigned int sysctl_sched_child_runs_first __read_mostly;
82 
83 /*
84  * SCHED_OTHER wake-up granularity.
85  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
86  *
87  * This option delays the preemption effects of decoupled workloads
88  * and reduces their over-scheduling. Synchronous workloads will still
89  * have immediate wakeup/sleep latencies.
90  */
91 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
92 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
93 
94 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
95 
96 /*
97  * The exponential sliding window over which load is averaged for shares
98  * distribution.
99  * (default: 10msec)
100  */
101 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
102 
103 #ifdef CONFIG_CFS_BANDWIDTH
104 /*
105  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
106  * each time a cfs_rq requests quota.
107  *
108  * Note: in the case that the slice exceeds the runtime remaining (either due
109  * to consumption or the quota being specified to be smaller than the slice)
110  * we will always only issue the remaining available time.
111  *
112  * default: 5 msec, units: microseconds
113   */
114 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115 #endif
116 
117 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118 {
119 	lw->weight += inc;
120 	lw->inv_weight = 0;
121 }
122 
123 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
124 {
125 	lw->weight -= dec;
126 	lw->inv_weight = 0;
127 }
128 
129 static inline void update_load_set(struct load_weight *lw, unsigned long w)
130 {
131 	lw->weight = w;
132 	lw->inv_weight = 0;
133 }
134 
135 /*
136  * Increase the granularity value when there are more CPUs,
137  * because with more CPUs the 'effective latency' as visible
138  * to users decreases. But the relationship is not linear,
139  * so pick a second-best guess by going with the log2 of the
140  * number of CPUs.
141  *
142  * This idea comes from the SD scheduler of Con Kolivas:
143  */
144 static unsigned int get_update_sysctl_factor(void)
145 {
146 	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 	unsigned int factor;
148 
149 	switch (sysctl_sched_tunable_scaling) {
150 	case SCHED_TUNABLESCALING_NONE:
151 		factor = 1;
152 		break;
153 	case SCHED_TUNABLESCALING_LINEAR:
154 		factor = cpus;
155 		break;
156 	case SCHED_TUNABLESCALING_LOG:
157 	default:
158 		factor = 1 + ilog2(cpus);
159 		break;
160 	}
161 
162 	return factor;
163 }
164 
165 static void update_sysctl(void)
166 {
167 	unsigned int factor = get_update_sysctl_factor();
168 
169 #define SET_SYSCTL(name) \
170 	(sysctl_##name = (factor) * normalized_sysctl_##name)
171 	SET_SYSCTL(sched_min_granularity);
172 	SET_SYSCTL(sched_latency);
173 	SET_SYSCTL(sched_wakeup_granularity);
174 #undef SET_SYSCTL
175 }
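
/*
 * Worked example (editorial note, not in the original file): with the
 * default SCHED_TUNABLESCALING_LOG policy on a box with 8 or more online
 * CPUs, cpus is clamped to 8 and factor = 1 + ilog2(8) = 4, so:
 *
 *	sysctl_sched_min_granularity    = 4 *  750000 ns =  3 ms
 *	sysctl_sched_latency            = 4 * 6000000 ns = 24 ms
 *	sysctl_sched_wakeup_granularity = 4 * 1000000 ns =  4 ms
 */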
176 
177 void sched_init_granularity(void)
178 {
179 	update_sysctl();
180 }
181 
182 #define WMULT_CONST	(~0U)
183 #define WMULT_SHIFT	32
184 
185 static void __update_inv_weight(struct load_weight *lw)
186 {
187 	unsigned long w;
188 
189 	if (likely(lw->inv_weight))
190 		return;
191 
192 	w = scale_load_down(lw->weight);
193 
194 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
195 		lw->inv_weight = 1;
196 	else if (unlikely(!w))
197 		lw->inv_weight = WMULT_CONST;
198 	else
199 		lw->inv_weight = WMULT_CONST / w;
200 }
201 
202 /*
203  * delta_exec * weight / lw.weight
204  *   OR
205  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
206  *
207  * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
208  * we're guaranteed shift stays positive because inv_weight is guaranteed to
209  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
210  *
211  * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
212  * weight/lw.weight <= 1, and therefore our shift will also be positive.
213  */
214 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
215 {
216 	u64 fact = scale_load_down(weight);
217 	int shift = WMULT_SHIFT;
218 
219 	__update_inv_weight(lw);
220 
221 	if (unlikely(fact >> 32)) {
222 		while (fact >> 32) {
223 			fact >>= 1;
224 			shift--;
225 		}
226 	}
227 
228 	/* hint to use a 32x32->64 mul */
229 	fact = (u64)(u32)fact * lw->inv_weight;
230 
231 	while (fact >> 32) {
232 		fact >>= 1;
233 		shift--;
234 	}
235 
236 	return mul_u64_u32_shr(delta_exec, fact, shift);
237 }
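
/*
 * Editorial sketch, not part of this file: a stand-alone user-space model
 * of the inverse-weight arithmetic above, assuming scale_load_down() is a
 * no-op and delta_exec is small enough that the final 64-bit multiply
 * cannot overflow. calc_delta_model() is a hypothetical name.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* delta_exec * weight / lw_weight, done as multiply + shift like __calc_delta() */
static uint64_t calc_delta_model(uint64_t delta_exec, uint32_t weight,
				 uint32_t lw_weight)
{
	uint32_t inv = 0xffffffffu / lw_weight;	/* WMULT_CONST / w */
	uint64_t fact = (uint64_t)weight * inv;
	int shift = 32;				/* WMULT_SHIFT */

	while (fact >> 32) {			/* keep fact within 32 bits */
		fact >>= 1;
		shift--;
	}
	return (delta_exec * fact) >> shift;
}

int main(void)
{
	/* a nice-0 entity (weight 1024) on a runqueue of total weight 3072 */
	/* gets about a third of the wall time: 6 ms -> ~2 ms */
	printf("%llu\n", (unsigned long long)calc_delta_model(6000000, 1024, 3072));
	return 0;
}
#endif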
238 
239 
240 const struct sched_class fair_sched_class;
241 
242 /**************************************************************
243  * CFS operations on generic schedulable entities:
244  */
245 
246 #ifdef CONFIG_FAIR_GROUP_SCHED
247 
248 /* cpu runqueue to which this cfs_rq is attached */
249 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
250 {
251 	return cfs_rq->rq;
252 }
253 
254 /* An entity is a task if it doesn't "own" a runqueue */
255 #define entity_is_task(se)	(!se->my_q)
256 
257 static inline struct task_struct *task_of(struct sched_entity *se)
258 {
259 #ifdef CONFIG_SCHED_DEBUG
260 	WARN_ON_ONCE(!entity_is_task(se));
261 #endif
262 	return container_of(se, struct task_struct, se);
263 }
264 
265 /* Walk up scheduling entities hierarchy */
266 #define for_each_sched_entity(se) \
267 		for (; se; se = se->parent)
268 
269 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
270 {
271 	return p->se.cfs_rq;
272 }
273 
274 /* runqueue on which this entity is (to be) queued */
275 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
276 {
277 	return se->cfs_rq;
278 }
279 
280 /* runqueue "owned" by this group */
281 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
282 {
283 	return grp->my_q;
284 }
285 
286 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287 {
288 	if (!cfs_rq->on_list) {
289 		/*
290 		 * Ensure we either appear before our parent (if already
291 		 * enqueued) or force our parent to appear after us when it is
292 		 * enqueued.  The fact that we always enqueue bottom-up
293 		 * reduces this to two cases.
294 		 */
295 		if (cfs_rq->tg->parent &&
296 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
297 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
298 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
299 		} else {
300 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
301 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
302 		}
303 
304 		cfs_rq->on_list = 1;
305 	}
306 }
307 
308 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
309 {
310 	if (cfs_rq->on_list) {
311 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
312 		cfs_rq->on_list = 0;
313 	}
314 }
315 
316 /* Iterate through all leaf cfs_rq's on a runqueue */
317 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
318 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
319 
320 /* Do the two (enqueued) entities belong to the same group? */
321 static inline struct cfs_rq *
322 is_same_group(struct sched_entity *se, struct sched_entity *pse)
323 {
324 	if (se->cfs_rq == pse->cfs_rq)
325 		return se->cfs_rq;
326 
327 	return NULL;
328 }
329 
330 static inline struct sched_entity *parent_entity(struct sched_entity *se)
331 {
332 	return se->parent;
333 }
334 
335 static void
336 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
337 {
338 	int se_depth, pse_depth;
339 
340 	/*
341 	 * A preemption test can only be made between sibling entities that are
342 	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
343 	 * hierarchy of both tasks until we find ancestors that are siblings
344 	 * under a common parent.
345 	 */
346 
347 	/* First walk up until both entities are at same depth */
348 	se_depth = (*se)->depth;
349 	pse_depth = (*pse)->depth;
350 
351 	while (se_depth > pse_depth) {
352 		se_depth--;
353 		*se = parent_entity(*se);
354 	}
355 
356 	while (pse_depth > se_depth) {
357 		pse_depth--;
358 		*pse = parent_entity(*pse);
359 	}
360 
361 	while (!is_same_group(*se, *pse)) {
362 		*se = parent_entity(*se);
363 		*pse = parent_entity(*pse);
364 	}
365 }
366 
367 #else	/* !CONFIG_FAIR_GROUP_SCHED */
368 
369 static inline struct task_struct *task_of(struct sched_entity *se)
370 {
371 	return container_of(se, struct task_struct, se);
372 }
373 
374 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
375 {
376 	return container_of(cfs_rq, struct rq, cfs);
377 }
378 
379 #define entity_is_task(se)	1
380 
381 #define for_each_sched_entity(se) \
382 		for (; se; se = NULL)
383 
384 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
385 {
386 	return &task_rq(p)->cfs;
387 }
388 
389 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
390 {
391 	struct task_struct *p = task_of(se);
392 	struct rq *rq = task_rq(p);
393 
394 	return &rq->cfs;
395 }
396 
397 /* runqueue "owned" by this group */
398 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
399 {
400 	return NULL;
401 }
402 
403 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
404 {
405 }
406 
407 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
408 {
409 }
410 
411 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
412 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
413 
414 static inline struct sched_entity *parent_entity(struct sched_entity *se)
415 {
416 	return NULL;
417 }
418 
419 static inline void
420 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
421 {
422 }
423 
424 #endif	/* CONFIG_FAIR_GROUP_SCHED */
425 
426 static __always_inline
427 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
428 
429 /**************************************************************
430  * Scheduling class tree data structure manipulation methods:
431  */
432 
433 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
434 {
435 	s64 delta = (s64)(vruntime - max_vruntime);
436 	if (delta > 0)
437 		max_vruntime = vruntime;
438 
439 	return max_vruntime;
440 }
441 
442 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
443 {
444 	s64 delta = (s64)(vruntime - min_vruntime);
445 	if (delta < 0)
446 		min_vruntime = vruntime;
447 
448 	return min_vruntime;
449 }
450 
451 static inline int entity_before(struct sched_entity *a,
452 				struct sched_entity *b)
453 {
454 	return (s64)(a->vruntime - b->vruntime) < 0;
455 }
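
/*
 * Editorial sketch, not part of this file: the signed-difference test above
 * stays correct even when vruntime wraps around U64_MAX, which a plain
 * "a < b" would not. A user-space check with hypothetical names:
 */
#if 0
#include <assert.h>
#include <stdint.h>

static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;	/* same test as entity_before() */
}

int main(void)
{
	uint64_t a = UINT64_MAX - 5;	/* just before the wrap */
	uint64_t b = 10;		/* just after the wrap */

	assert(before(a, b));		/* a is still "earlier" than b */
	assert(!(a < b));		/* the naive comparison gets it wrong */
	return 0;
}
#endif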
456 
457 static void update_min_vruntime(struct cfs_rq *cfs_rq)
458 {
459 	u64 vruntime = cfs_rq->min_vruntime;
460 
461 	if (cfs_rq->curr)
462 		vruntime = cfs_rq->curr->vruntime;
463 
464 	if (cfs_rq->rb_leftmost) {
465 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 						   struct sched_entity,
467 						   run_node);
468 
469 		if (!cfs_rq->curr)
470 			vruntime = se->vruntime;
471 		else
472 			vruntime = min_vruntime(vruntime, se->vruntime);
473 	}
474 
475 	/* ensure we never gain time by being placed backwards. */
476 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477 #ifndef CONFIG_64BIT
478 	smp_wmb();
479 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480 #endif
481 }
482 
483 /*
484  * Enqueue an entity into the rb-tree:
485  */
486 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487 {
488 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 	struct rb_node *parent = NULL;
490 	struct sched_entity *entry;
491 	int leftmost = 1;
492 
493 	/*
494 	 * Find the right place in the rbtree:
495 	 */
496 	while (*link) {
497 		parent = *link;
498 		entry = rb_entry(parent, struct sched_entity, run_node);
499 		/*
500 		 * We don't care about collisions. Nodes with
501 		 * the same key stay together.
502 		 */
503 		if (entity_before(se, entry)) {
504 			link = &parent->rb_left;
505 		} else {
506 			link = &parent->rb_right;
507 			leftmost = 0;
508 		}
509 	}
510 
511 	/*
512 	 * Maintain a cache of leftmost tree entries (it is frequently
513 	 * used):
514 	 */
515 	if (leftmost)
516 		cfs_rq->rb_leftmost = &se->run_node;
517 
518 	rb_link_node(&se->run_node, parent, link);
519 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520 }
521 
522 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523 {
524 	if (cfs_rq->rb_leftmost == &se->run_node) {
525 		struct rb_node *next_node;
526 
527 		next_node = rb_next(&se->run_node);
528 		cfs_rq->rb_leftmost = next_node;
529 	}
530 
531 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532 }
533 
534 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535 {
536 	struct rb_node *left = cfs_rq->rb_leftmost;
537 
538 	if (!left)
539 		return NULL;
540 
541 	return rb_entry(left, struct sched_entity, run_node);
542 }
543 
544 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545 {
546 	struct rb_node *next = rb_next(&se->run_node);
547 
548 	if (!next)
549 		return NULL;
550 
551 	return rb_entry(next, struct sched_entity, run_node);
552 }
553 
554 #ifdef CONFIG_SCHED_DEBUG
555 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556 {
557 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558 
559 	if (!last)
560 		return NULL;
561 
562 	return rb_entry(last, struct sched_entity, run_node);
563 }
564 
565 /**************************************************************
566  * Scheduling class statistics methods:
567  */
568 
569 int sched_proc_update_handler(struct ctl_table *table, int write,
570 		void __user *buffer, size_t *lenp,
571 		loff_t *ppos)
572 {
573 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 	unsigned int factor = get_update_sysctl_factor();
575 
576 	if (ret || !write)
577 		return ret;
578 
579 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 					sysctl_sched_min_granularity);
581 
582 #define WRT_SYSCTL(name) \
583 	(normalized_sysctl_##name = sysctl_##name / (factor))
584 	WRT_SYSCTL(sched_min_granularity);
585 	WRT_SYSCTL(sched_latency);
586 	WRT_SYSCTL(sched_wakeup_granularity);
587 #undef WRT_SYSCTL
588 
589 	return 0;
590 }
591 #endif
592 
593 /*
594  * delta /= w
595  */
596 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
597 {
598 	if (unlikely(se->load.weight != NICE_0_LOAD))
599 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
600 
601 	return delta;
602 }
603 
604 /*
605  * The idea is to set a period in which each task runs once.
606  *
607  * When there are too many tasks (more than sched_nr_latency) we have to stretch
608  * this period because otherwise the slices get too small.
609  *
610  * p = (nr <= nl) ? l : l*nr/nl
611  */
612 static u64 __sched_period(unsigned long nr_running)
613 {
614 	if (unlikely(nr_running > sched_nr_latency))
615 		return nr_running * sysctl_sched_min_granularity;
616 	else
617 		return sysctl_sched_latency;
618 }
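
/*
 * Worked example (editorial note, not in the original file), using the
 * default tunables: latency 6 ms, min granularity 0.75 ms, sched_nr_latency 8:
 *
 *	 4 runnable tasks -> period = 6 ms
 *	 8 runnable tasks -> period = 6 ms
 *	12 runnable tasks -> period = 12 * 0.75 ms = 9 ms
 */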
619 
620 /*
621  * We calculate the wall-time slice from the period by taking a part
622  * proportional to the weight.
623  *
624  * s = p*P[w/rw]
625  */
626 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
627 {
628 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
629 
630 	for_each_sched_entity(se) {
631 		struct load_weight *load;
632 		struct load_weight lw;
633 
634 		cfs_rq = cfs_rq_of(se);
635 		load = &cfs_rq->load;
636 
637 		if (unlikely(!se->on_rq)) {
638 			lw = cfs_rq->load;
639 
640 			update_load_add(&lw, se->load.weight);
641 			load = &lw;
642 		}
643 		slice = __calc_delta(slice, se->load.weight, load);
644 	}
645 	return slice;
646 }
647 
648 /*
649  * We calculate the vruntime slice of a to-be-inserted task.
650  *
651  * vs = s/w
652  */
653 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
654 {
655 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
656 }
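
/*
 * Worked example (editorial note, not in the original file): three runnable
 * nice-0 tasks (weight 1024 each) on one cfs_rq with the default 6 ms period:
 *
 *	slice  = 6 ms * 1024 / 3072 = 2 ms of wall time each
 *	vslice = 2 ms * 1024 / 1024 = 2 ms of vruntime
 *
 * If one task were reniced to -5 (weight ~3121), its slice would grow to
 * about 6 ms * 3121 / 5169 ~= 3.6 ms, while its vslice would shrink to
 * roughly 3.6 ms * 1024 / 3121 ~= 1.2 ms.
 */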
657 
658 #ifdef CONFIG_SMP
659 static int select_idle_sibling(struct task_struct *p, int cpu);
660 static unsigned long task_h_load(struct task_struct *p);
661 
662 /*
663  * We choose a half-life close to 1 scheduling period.
664  * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
665  * dependent on this value.
666  */
667 #define LOAD_AVG_PERIOD 32
668 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
669 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
670 
671 /* Give a new sched_entity its starting runnable values so that its load looks heavy while the task is young */
672 void init_entity_runnable_average(struct sched_entity *se)
673 {
674 	struct sched_avg *sa = &se->avg;
675 
676 	sa->last_update_time = 0;
677 	/*
678 	 * sched_avg's period_contrib should be strictly less than 1024, so
679 	 * we give it 1023 to make sure it is almost a period (1024us), and
680 	 * will definitely be updated (after enqueue).
681 	 */
682 	sa->period_contrib = 1023;
683 	sa->load_avg = scale_load_down(se->load.weight);
684 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685 	/*
686 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
687 	 */
688 	sa->util_avg = 0;
689 	sa->util_sum = 0;
690 	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
691 }
692 
693 /*
694  * With new tasks being created, their initial util_avgs are extrapolated
695  * based on the cfs_rq's current util_avg:
696  *
697  *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
698  *
699  * However, in many cases, the above util_avg does not give a desired
700  * value. Moreover, the sum of the util_avgs may be divergent, such
701  * as when the series is a harmonic series.
702  *
703  * To solve this problem, we also cap the util_avg of successive tasks to
704  * only 1/2 of the remaining utilization budget:
705  *
706  *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
707  *
708  * where n denotes the nth task.
709  *
710  * For example, the simplest series from the beginning would be:
711  *
712  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
713  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
714  *
715  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
716  * if util_avg > util_avg_cap.
717  */
718 void post_init_entity_util_avg(struct sched_entity *se)
719 {
720 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
721 	struct sched_avg *sa = &se->avg;
722 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
723 
724 	if (cap > 0) {
725 		if (cfs_rq->avg.util_avg != 0) {
726 			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
727 			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
728 
729 			if (sa->util_avg > cap)
730 				sa->util_avg = cap;
731 		} else {
732 			sa->util_avg = cap;
733 		}
734 		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
735 	}
736 }
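
/*
 * Worked example (editorial note, not in the original file): if the parent
 * cfs_rq already shows util_avg = 512 and load_avg = 1023, a new nice-0
 * entity (weight 1024) gets the extrapolated value 512 * 1024 / (1023 + 1)
 * = 512, which exceeds the cap (1024 - 512) / 2 = 256, so the new task
 * starts out with util_avg = 256.
 */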
737 
738 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
739 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
740 #else
741 void init_entity_runnable_average(struct sched_entity *se)
742 {
743 }
744 void post_init_entity_util_avg(struct sched_entity *se)
745 {
746 }
747 #endif
748 
749 /*
750  * Update the current task's runtime statistics.
751  */
752 static void update_curr(struct cfs_rq *cfs_rq)
753 {
754 	struct sched_entity *curr = cfs_rq->curr;
755 	u64 now = rq_clock_task(rq_of(cfs_rq));
756 	u64 delta_exec;
757 
758 	if (unlikely(!curr))
759 		return;
760 
761 	delta_exec = now - curr->exec_start;
762 	if (unlikely((s64)delta_exec <= 0))
763 		return;
764 
765 	curr->exec_start = now;
766 
767 	schedstat_set(curr->statistics.exec_max,
768 		      max(delta_exec, curr->statistics.exec_max));
769 
770 	curr->sum_exec_runtime += delta_exec;
771 	schedstat_add(cfs_rq, exec_clock, delta_exec);
772 
773 	curr->vruntime += calc_delta_fair(delta_exec, curr);
774 	update_min_vruntime(cfs_rq);
775 
776 	if (entity_is_task(curr)) {
777 		struct task_struct *curtask = task_of(curr);
778 
779 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
780 		cpuacct_charge(curtask, delta_exec);
781 		account_group_exec_runtime(curtask, delta_exec);
782 	}
783 
784 	account_cfs_rq_runtime(cfs_rq, delta_exec);
785 }
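
/*
 * Worked example (editorial note, not in the original file): over a 10 ms
 * stretch of CPU time, a nice-0 task (weight 1024) advances its vruntime by
 * the full 10 ms, while a task of weight 2048 advances by only
 * 10 ms * 1024 / 2048 = 5 ms, so the heavier task keeps being picked until
 * the weighted runtimes even out.
 */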
786 
787 static void update_curr_fair(struct rq *rq)
788 {
789 	update_curr(cfs_rq_of(&rq->curr->se));
790 }
791 
792 #ifdef CONFIG_SCHEDSTATS
793 static inline void
794 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
795 {
796 	u64 wait_start = rq_clock(rq_of(cfs_rq));
797 
798 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
799 	    likely(wait_start > se->statistics.wait_start))
800 		wait_start -= se->statistics.wait_start;
801 
802 	se->statistics.wait_start = wait_start;
803 }
804 
805 static void
806 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
807 {
808 	struct task_struct *p;
809 	u64 delta;
810 
811 	delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
812 
813 	if (entity_is_task(se)) {
814 		p = task_of(se);
815 		if (task_on_rq_migrating(p)) {
816 			/*
817 			 * Preserve migrating task's wait time so wait_start
818 			 * time stamp can be adjusted to accumulate wait time
819 			 * prior to migration.
820 			 */
821 			se->statistics.wait_start = delta;
822 			return;
823 		}
824 		trace_sched_stat_wait(p, delta);
825 	}
826 
827 	se->statistics.wait_max = max(se->statistics.wait_max, delta);
828 	se->statistics.wait_count++;
829 	se->statistics.wait_sum += delta;
830 	se->statistics.wait_start = 0;
831 }
832 
833 /*
834  * Task is being enqueued - update stats:
835  */
836 static inline void
837 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
838 {
839 	/*
840 	 * Are we enqueueing a waiting task? (for current tasks
841 	 * a dequeue/enqueue event is a NOP)
842 	 */
843 	if (se != cfs_rq->curr)
844 		update_stats_wait_start(cfs_rq, se);
845 }
846 
847 static inline void
848 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
849 {
850 	/*
851 	 * Mark the end of the wait period if dequeueing a
852 	 * waiting task:
853 	 */
854 	if (se != cfs_rq->curr)
855 		update_stats_wait_end(cfs_rq, se);
856 
857 	if (flags & DEQUEUE_SLEEP) {
858 		if (entity_is_task(se)) {
859 			struct task_struct *tsk = task_of(se);
860 
861 			if (tsk->state & TASK_INTERRUPTIBLE)
862 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
863 			if (tsk->state & TASK_UNINTERRUPTIBLE)
864 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
865 		}
866 	}
867 
868 }
869 #else
870 static inline void
871 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
872 {
873 }
874 
875 static inline void
876 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
877 {
878 }
879 
880 static inline void
881 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
882 {
883 }
884 
885 static inline void
886 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
887 {
888 }
889 #endif
890 
891 /*
892  * We are picking a new current task - update its stats:
893  */
894 static inline void
895 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
896 {
897 	/*
898 	 * We are starting a new run period:
899 	 */
900 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
901 }
902 
903 /**************************************************
904  * Scheduling class queueing methods:
905  */
906 
907 #ifdef CONFIG_NUMA_BALANCING
908 /*
909  * Approximate time to scan a full NUMA task in ms. The task scan period is
910  * calculated based on the task's virtual memory size and
911  * numa_balancing_scan_size.
912  */
913 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
914 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
915 
916 /* Portion of address space to scan in MB */
917 unsigned int sysctl_numa_balancing_scan_size = 256;
918 
919 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
920 unsigned int sysctl_numa_balancing_scan_delay = 1000;
921 
922 static unsigned int task_nr_scan_windows(struct task_struct *p)
923 {
924 	unsigned long rss = 0;
925 	unsigned long nr_scan_pages;
926 
927 	/*
928 	 * Calculations are based on RSS, as non-present and empty pages are
929 	 * skipped by the PTE scanner and NUMA hinting faults should be trapped
930 	 * based on resident pages.
931 	 */
932 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
933 	rss = get_mm_rss(p->mm);
934 	if (!rss)
935 		rss = nr_scan_pages;
936 
937 	rss = round_up(rss, nr_scan_pages);
938 	return rss / nr_scan_pages;
939 }
940 
941 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
942 #define MAX_SCAN_WINDOW 2560
943 
944 static unsigned int task_scan_min(struct task_struct *p)
945 {
946 	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
947 	unsigned int scan, floor;
948 	unsigned int windows = 1;
949 
950 	if (scan_size < MAX_SCAN_WINDOW)
951 		windows = MAX_SCAN_WINDOW / scan_size;
952 	floor = 1000 / windows;
953 
954 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
955 	return max_t(unsigned int, floor, scan);
956 }
957 
958 static unsigned int task_scan_max(struct task_struct *p)
959 {
960 	unsigned int smin = task_scan_min(p);
961 	unsigned int smax;
962 
963 	/* Watch for min being lower than max due to floor calculations */
964 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
965 	return max(smin, smax);
966 }
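
/*
 * Worked example (editorial note, not in the original file): with the
 * defaults above (scan size 256 MB, period min 1000 ms, period max 60000 ms)
 * and a task whose RSS is 1 GB:
 *
 *	task_nr_scan_windows() = 1 GB / 256 MB = 4
 *	task_scan_min()  = max(100, 1000 / 4)  =   250 ms
 *	task_scan_max()  = max(250, 60000 / 4) = 15000 ms
 *
 * The 100 ms floor comes from scanning at most 2560 / 256 = 10 windows per
 * second, so this task's scan period is adapted between 250 ms and 15 s.
 */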
967 
968 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
969 {
970 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
971 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
972 }
973 
974 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
975 {
976 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
977 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
978 }
979 
980 struct numa_group {
981 	atomic_t refcount;
982 
983 	spinlock_t lock; /* nr_tasks, tasks */
984 	int nr_tasks;
985 	pid_t gid;
986 	int active_nodes;
987 
988 	struct rcu_head rcu;
989 	unsigned long total_faults;
990 	unsigned long max_faults_cpu;
991 	/*
992 	 * Faults_cpu is used to decide whether memory should move
993 	 * towards the CPU. As a consequence, these stats are weighted
994 	 * more by CPU use than by memory faults.
995 	 */
996 	unsigned long *faults_cpu;
997 	unsigned long faults[0];
998 };
999 
1000 /* Shared or private faults. */
1001 #define NR_NUMA_HINT_FAULT_TYPES 2
1002 
1003 /* Memory and CPU locality */
1004 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1005 
1006 /* Averaged statistics, and temporary buffers. */
1007 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1008 
1009 pid_t task_numa_group_id(struct task_struct *p)
1010 {
1011 	return p->numa_group ? p->numa_group->gid : 0;
1012 }
1013 
1014 /*
1015  * The averaged statistics, shared & private, memory & cpu,
1016  * occupy the first half of the array. The second half of the
1017  * array is for current counters, which are averaged into the
1018  * first set by task_numa_placement.
1019  */
1020 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1021 {
1022 	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1023 }
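
/*
 * Worked example (editorial note, not in the original file): on a machine
 * with two NUMA nodes (nr_node_ids == 2), and with the usual NUMA_MEM == 0,
 * NUMA_CPU == 1 ordering, the first half of the faults array looks like:
 *
 *	[0..1] NUMA_MEM, node 0, {shared, private}
 *	[2..3] NUMA_MEM, node 1, {shared, private}
 *	[4..5] NUMA_CPU, node 0, {shared, private}
 *	[6..7] NUMA_CPU, node 1, {shared, private}
 *
 * followed by a second copy of these slots for the NUMA_MEMBUF/NUMA_CPUBUF
 * temporary counters mentioned above.
 */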
1024 
1025 static inline unsigned long task_faults(struct task_struct *p, int nid)
1026 {
1027 	if (!p->numa_faults)
1028 		return 0;
1029 
1030 	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1031 		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1032 }
1033 
1034 static inline unsigned long group_faults(struct task_struct *p, int nid)
1035 {
1036 	if (!p->numa_group)
1037 		return 0;
1038 
1039 	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1040 		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1041 }
1042 
1043 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1044 {
1045 	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1046 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1047 }
1048 
1049 /*
1050  * A node triggering more than 1/3 as many NUMA faults as the maximum is
1051  * considered part of a numa group's pseudo-interleaving set. Migrations
1052  * between these nodes are slowed down, to allow things to settle down.
1053  */
1054 #define ACTIVE_NODE_FRACTION 3
1055 
1056 static bool numa_is_active_node(int nid, struct numa_group *ng)
1057 {
1058 	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1059 }
1060 
1061 /* Handle placement on systems where not all nodes are directly connected. */
1062 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1063 					int maxdist, bool task)
1064 {
1065 	unsigned long score = 0;
1066 	int node;
1067 
1068 	/*
1069 	 * All nodes are directly connected, and the same distance
1070 	 * from each other. No need for fancy placement algorithms.
1071 	 */
1072 	if (sched_numa_topology_type == NUMA_DIRECT)
1073 		return 0;
1074 
1075 	/*
1076 	 * This code is called for each node, introducing N^2 complexity,
1077 	 * which should be ok given the number of nodes rarely exceeds 8.
1078 	 */
1079 	for_each_online_node(node) {
1080 		unsigned long faults;
1081 		int dist = node_distance(nid, node);
1082 
1083 		/*
1084 		 * The furthest away nodes in the system are not interesting
1085 		 * for placement; nid was already counted.
1086 		 */
1087 		if (dist == sched_max_numa_distance || node == nid)
1088 			continue;
1089 
1090 		/*
1091 		 * On systems with a backplane NUMA topology, compare groups
1092 		 * of nodes, and move tasks towards the group with the most
1093 		 * memory accesses. When comparing two nodes at distance
1094 		 * "hoplimit", only nodes closer by than "hoplimit" are part
1095 		 * of each group. Skip other nodes.
1096 		 */
1097 		if (sched_numa_topology_type == NUMA_BACKPLANE &&
1098 					dist > maxdist)
1099 			continue;
1100 
1101 		/* Add up the faults from nearby nodes. */
1102 		if (task)
1103 			faults = task_faults(p, node);
1104 		else
1105 			faults = group_faults(p, node);
1106 
1107 		/*
1108 		 * On systems with a glueless mesh NUMA topology, there are
1109 		 * no fixed "groups of nodes". Instead, nodes that are not
1110 		 * directly connected bounce traffic through intermediate
1111 		 * nodes; a numa_group can occupy any set of nodes.
1112 		 * The further away a node is, the less the faults count.
1113 		 * This seems to result in good task placement.
1114 		 */
1115 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1116 			faults *= (sched_max_numa_distance - dist);
1117 			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1118 		}
1119 
1120 		score += faults;
1121 	}
1122 
1123 	return score;
1124 }
1125 
1126 /*
1127  * These return the fraction of accesses done by a particular task, or
1128  * task group, on a particular numa node.  The group weight is given a
1129  * larger multiplier, in order to group tasks together that are almost
1130  * evenly spread out between numa nodes.
1131  */
1132 static inline unsigned long task_weight(struct task_struct *p, int nid,
1133 					int dist)
1134 {
1135 	unsigned long faults, total_faults;
1136 
1137 	if (!p->numa_faults)
1138 		return 0;
1139 
1140 	total_faults = p->total_numa_faults;
1141 
1142 	if (!total_faults)
1143 		return 0;
1144 
1145 	faults = task_faults(p, nid);
1146 	faults += score_nearby_nodes(p, nid, dist, true);
1147 
1148 	return 1000 * faults / total_faults;
1149 }
1150 
1151 static inline unsigned long group_weight(struct task_struct *p, int nid,
1152 					 int dist)
1153 {
1154 	unsigned long faults, total_faults;
1155 
1156 	if (!p->numa_group)
1157 		return 0;
1158 
1159 	total_faults = p->numa_group->total_faults;
1160 
1161 	if (!total_faults)
1162 		return 0;
1163 
1164 	faults = group_faults(p, nid);
1165 	faults += score_nearby_nodes(p, nid, dist, false);
1166 
1167 	return 1000 * faults / total_faults;
1168 }
1169 
1170 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1171 				int src_nid, int dst_cpu)
1172 {
1173 	struct numa_group *ng = p->numa_group;
1174 	int dst_nid = cpu_to_node(dst_cpu);
1175 	int last_cpupid, this_cpupid;
1176 
1177 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1178 
1179 	/*
1180 	 * Multi-stage node selection is used in conjunction with a periodic
1181 	 * migration fault to build a temporal task<->page relation. By using
1182 	 * a two-stage filter we remove short/unlikely relations.
1183 	 *
1184 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1185 	 * a task's usage of a particular page (n_p) per total usage of this
1186 	 * page (n_t) (in a given time-span) to a probability.
1187 	 *
1188 	 * Our periodic faults will sample this probability and getting the
1189 	 * same result twice in a row, given these samples are fully
1190 	 * independent, is then given by P(n)^2, provided our sample period
1191 	 * is sufficiently short compared to the usage pattern.
1192 	 *
1193 	 * This quadratic squishes small probabilities, making it less likely we
1194 	 * act on an unlikely task<->page relation.
1195 	 */
1196 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1197 	if (!cpupid_pid_unset(last_cpupid) &&
1198 				cpupid_to_nid(last_cpupid) != dst_nid)
1199 		return false;
1200 
1201 	/* Always allow migrate on private faults */
1202 	if (cpupid_match_pid(p, last_cpupid))
1203 		return true;
1204 
1205 	/* A shared fault, but p->numa_group has not been set up yet. */
1206 	if (!ng)
1207 		return true;
1208 
1209 	/*
1210 	 * Destination node is much more heavily used than the source
1211 	 * node? Allow migration.
1212 	 */
1213 	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1214 					ACTIVE_NODE_FRACTION)
1215 		return true;
1216 
1217 	/*
1218 	 * Distribute memory according to CPU & memory use on each node,
1219 	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1220 	 *
1221 	 * faults_cpu(dst)   3   faults_cpu(src)
1222 	 * --------------- * - > ---------------
1223 	 * faults_mem(dst)   4   faults_mem(src)
1224 	 */
1225 	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1226 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1227 }
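
/*
 * Worked example (editorial note, not in the original file) for the 3/4
 * test above: with faults_cpu(dst) = 40, faults_mem(dst) = 50,
 * faults_cpu(src) = 60 and faults_mem(src) = 100, the left-hand side is
 * 40 * 100 * 3 = 12000 and the right-hand side is 60 * 50 * 4 = 12000, so
 * the page is *not* migrated; the destination node's CPU-to-memory fault
 * ratio must exceed 4/3 of the source node's before migration is allowed.
 */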
1228 
1229 static unsigned long weighted_cpuload(const int cpu);
1230 static unsigned long source_load(int cpu, int type);
1231 static unsigned long target_load(int cpu, int type);
1232 static unsigned long capacity_of(int cpu);
1233 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1234 
1235 /* Cached statistics for all CPUs within a node */
1236 struct numa_stats {
1237 	unsigned long nr_running;
1238 	unsigned long load;
1239 
1240 	/* Total compute capacity of CPUs on a node */
1241 	unsigned long compute_capacity;
1242 
1243 	/* Approximate capacity in terms of runnable tasks on a node */
1244 	unsigned long task_capacity;
1245 	int has_free_capacity;
1246 };
1247 
1248 /*
1249  * XXX borrowed from update_sg_lb_stats
1250  */
1251 static void update_numa_stats(struct numa_stats *ns, int nid)
1252 {
1253 	int smt, cpu, cpus = 0;
1254 	unsigned long capacity;
1255 
1256 	memset(ns, 0, sizeof(*ns));
1257 	for_each_cpu(cpu, cpumask_of_node(nid)) {
1258 		struct rq *rq = cpu_rq(cpu);
1259 
1260 		ns->nr_running += rq->nr_running;
1261 		ns->load += weighted_cpuload(cpu);
1262 		ns->compute_capacity += capacity_of(cpu);
1263 
1264 		cpus++;
1265 	}
1266 
1267 	/*
1268 	 * If we raced with hotplug and there are no CPUs left in our mask
1269 	 * the @ns structure stays zero-filled and task_numa_compare() will
1270 	 * not find this node attractive.
1271 	 *
1272 	 * We'll either bail at !has_free_capacity, or we'll detect a huge
1273 	 * imbalance and bail there.
1274 	 */
1275 	if (!cpus)
1276 		return;
1277 
1278 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1279 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1280 	capacity = cpus / smt; /* cores */
1281 
1282 	ns->task_capacity = min_t(unsigned, capacity,
1283 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1284 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1285 }
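
/*
 * Worked example (editorial note, not in the original file): a node with
 * 8 logical CPUs, each 2-way SMT sibling reporting capacity_of() of,
 * say, 589:
 *
 *	compute_capacity = 8 * 589 = 4712
 *	smt              = DIV_ROUND_UP(1024 * 8, 4712) = 2
 *	capacity         = 8 / 2 = 4 cores
 *	task_capacity    = min(4, DIV_ROUND_CLOSEST(4712, 1024)) = 4
 *
 * so the node is regarded as having free capacity while fewer than 4 tasks
 * run on it.
 */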
1286 
1287 struct task_numa_env {
1288 	struct task_struct *p;
1289 
1290 	int src_cpu, src_nid;
1291 	int dst_cpu, dst_nid;
1292 
1293 	struct numa_stats src_stats, dst_stats;
1294 
1295 	int imbalance_pct;
1296 	int dist;
1297 
1298 	struct task_struct *best_task;
1299 	long best_imp;
1300 	int best_cpu;
1301 };
1302 
1303 static void task_numa_assign(struct task_numa_env *env,
1304 			     struct task_struct *p, long imp)
1305 {
1306 	if (env->best_task)
1307 		put_task_struct(env->best_task);
1308 
1309 	env->best_task = p;
1310 	env->best_imp = imp;
1311 	env->best_cpu = env->dst_cpu;
1312 }
1313 
1314 static bool load_too_imbalanced(long src_load, long dst_load,
1315 				struct task_numa_env *env)
1316 {
1317 	long imb, old_imb;
1318 	long orig_src_load, orig_dst_load;
1319 	long src_capacity, dst_capacity;
1320 
1321 	/*
1322 	 * The load is corrected for the CPU capacity available on each node.
1323 	 *
1324 	 * src_load        dst_load
1325 	 * ------------ vs ---------
1326 	 * src_capacity    dst_capacity
1327 	 */
1328 	src_capacity = env->src_stats.compute_capacity;
1329 	dst_capacity = env->dst_stats.compute_capacity;
1330 
1331 	/* We care about the slope of the imbalance, not the direction. */
1332 	if (dst_load < src_load)
1333 		swap(dst_load, src_load);
1334 
1335 	/* Is the difference below the threshold? */
1336 	imb = dst_load * src_capacity * 100 -
1337 	      src_load * dst_capacity * env->imbalance_pct;
1338 	if (imb <= 0)
1339 		return false;
1340 
1341 	/*
1342 	 * The imbalance is above the allowed threshold.
1343 	 * Compare it with the old imbalance.
1344 	 */
1345 	orig_src_load = env->src_stats.load;
1346 	orig_dst_load = env->dst_stats.load;
1347 
1348 	if (orig_dst_load < orig_src_load)
1349 		swap(orig_dst_load, orig_src_load);
1350 
1351 	old_imb = orig_dst_load * src_capacity * 100 -
1352 		  orig_src_load * dst_capacity * env->imbalance_pct;
1353 
1354 	/* Would this change make things worse? */
1355 	return (imb > old_imb);
1356 }
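
/*
 * Worked example (editorial note, not in the original file): with
 * imbalance_pct = 112 and equal compute capacities of 1024 on both nodes,
 * src_load = 1000 and dst_load = 1100 give
 *
 *	imb = 1100 * 1024 * 100 - 1000 * 1024 * 112 < 0
 *
 * so the move is considered balanced enough. At dst_load = 1200 the
 * difference turns positive and the move is accepted only if it does not
 * make the pre-existing imbalance (old_imb) any worse.
 */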
1357 
1358 /*
1359  * This checks if the overall compute and NUMA accesses of the system would
1360  * be improved if the source task was migrated to the target dst_cpu, taking
1361  * into account that it might be best if the task running on the dst_cpu is
1362  * exchanged with the source task.
1363  */
1364 static void task_numa_compare(struct task_numa_env *env,
1365 			      long taskimp, long groupimp)
1366 {
1367 	struct rq *src_rq = cpu_rq(env->src_cpu);
1368 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1369 	struct task_struct *cur;
1370 	long src_load, dst_load;
1371 	long load;
1372 	long imp = env->p->numa_group ? groupimp : taskimp;
1373 	long moveimp = imp;
1374 	int dist = env->dist;
1375 	bool assigned = false;
1376 
1377 	rcu_read_lock();
1378 
1379 	raw_spin_lock_irq(&dst_rq->lock);
1380 	cur = dst_rq->curr;
1381 	/*
1382 	 * No need to move the exiting task or idle task.
1383 	 */
1384 	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1385 		cur = NULL;
1386 	else {
1387 		/*
1388 		 * The task_struct must be protected here to protect the
1389 		 * p->numa_faults access in the task_weight since the
1390 		 * numa_faults could already be freed in the following path:
1391 		 * finish_task_switch()
1392 		 *     --> put_task_struct()
1393 		 *         --> __put_task_struct()
1394 		 *             --> task_numa_free()
1395 		 */
1396 		get_task_struct(cur);
1397 	}
1398 
1399 	raw_spin_unlock_irq(&dst_rq->lock);
1400 
1401 	/*
1402 	 * Because we have preemption enabled we can get migrated around and
1403 	 * end up trying to select ourselves (current == env->p) as a swap candidate.
1404 	 */
1405 	if (cur == env->p)
1406 		goto unlock;
1407 
1408 	/*
1409 	 * "imp" is the fault differential for the source task between the
1410 	 * source and destination node. Calculate the total differential for
1411 	 * the source task and potential destination task. The more negative
1412 	 * the value is, the more remote accesses would be expected to
1413 	 * be incurred if the tasks were swapped.
1414 	 */
1415 	if (cur) {
1416 		/* Skip this swap candidate if cannot move to the source cpu */
1417 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1418 			goto unlock;
1419 
1420 		/*
1421 		 * If dst and source tasks are in the same NUMA group, or not
1422 		 * in any group, then look only at task weights.
1423 		 */
1424 		if (cur->numa_group == env->p->numa_group) {
1425 			imp = taskimp + task_weight(cur, env->src_nid, dist) -
1426 			      task_weight(cur, env->dst_nid, dist);
1427 			/*
1428 			 * Add some hysteresis to prevent swapping the
1429 			 * tasks within a group over tiny differences.
1430 			 */
1431 			if (cur->numa_group)
1432 				imp -= imp/16;
1433 		} else {
1434 			/*
1435 			 * Compare the group weights. If a task is all by
1436 			 * itself (not part of a group), use the task weight
1437 			 * instead.
1438 			 */
1439 			if (cur->numa_group)
1440 				imp += group_weight(cur, env->src_nid, dist) -
1441 				       group_weight(cur, env->dst_nid, dist);
1442 			else
1443 				imp += task_weight(cur, env->src_nid, dist) -
1444 				       task_weight(cur, env->dst_nid, dist);
1445 		}
1446 	}
1447 
1448 	if (imp <= env->best_imp && moveimp <= env->best_imp)
1449 		goto unlock;
1450 
1451 	if (!cur) {
1452 		/* Is there capacity at our destination? */
1453 		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1454 		    !env->dst_stats.has_free_capacity)
1455 			goto unlock;
1456 
1457 		goto balance;
1458 	}
1459 
1460 	/* Balance doesn't matter much if we're running a task per cpu */
1461 	if (imp > env->best_imp && src_rq->nr_running == 1 &&
1462 			dst_rq->nr_running == 1)
1463 		goto assign;
1464 
1465 	/*
1466 	 * In the overloaded case, try and keep the load balanced.
1467 	 */
1468 balance:
1469 	load = task_h_load(env->p);
1470 	dst_load = env->dst_stats.load + load;
1471 	src_load = env->src_stats.load - load;
1472 
1473 	if (moveimp > imp && moveimp > env->best_imp) {
1474 		/*
1475 		 * If the improvement from just moving env->p in this direction is
1476 		 * better than swapping tasks around, check if a move is
1477 		 * possible. Store a slightly smaller score than moveimp,
1478 		 * so an actually idle CPU will win.
1479 		 */
1480 		if (!load_too_imbalanced(src_load, dst_load, env)) {
1481 			imp = moveimp - 1;
1482 			put_task_struct(cur);
1483 			cur = NULL;
1484 			goto assign;
1485 		}
1486 	}
1487 
1488 	if (imp <= env->best_imp)
1489 		goto unlock;
1490 
1491 	if (cur) {
1492 		load = task_h_load(cur);
1493 		dst_load -= load;
1494 		src_load += load;
1495 	}
1496 
1497 	if (load_too_imbalanced(src_load, dst_load, env))
1498 		goto unlock;
1499 
1500 	/*
1501 	 * One idle CPU per node is evaluated for a task numa move.
1502 	 * Call select_idle_sibling to maybe find a better one.
1503 	 */
1504 	if (!cur)
1505 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1506 
1507 assign:
1508 	assigned = true;
1509 	task_numa_assign(env, cur, imp);
1510 unlock:
1511 	rcu_read_unlock();
1512 	/*
1513 	 * If dst_rq->curr was never assigned via task_numa_assign(), the extra
1514 	 * reference taken above to protect its task_struct is no longer needed.
1515 	 */
1516 	if (cur && !assigned)
1517 		put_task_struct(cur);
1518 }
1519 
1520 static void task_numa_find_cpu(struct task_numa_env *env,
1521 				long taskimp, long groupimp)
1522 {
1523 	int cpu;
1524 
1525 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1526 		/* Skip this CPU if the source task cannot migrate */
1527 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1528 			continue;
1529 
1530 		env->dst_cpu = cpu;
1531 		task_numa_compare(env, taskimp, groupimp);
1532 	}
1533 }
1534 
1535 /* Only move tasks to a NUMA node less busy than the current node. */
1536 static bool numa_has_capacity(struct task_numa_env *env)
1537 {
1538 	struct numa_stats *src = &env->src_stats;
1539 	struct numa_stats *dst = &env->dst_stats;
1540 
1541 	if (src->has_free_capacity && !dst->has_free_capacity)
1542 		return false;
1543 
1544 	/*
1545 	 * Only consider a task move if the source has a higher load
1546 	 * than the destination, corrected for CPU capacity on each node.
1547 	 *
1548 	 *      src->load                dst->load
1549 	 * --------------------- vs ---------------------
1550 	 * src->compute_capacity    dst->compute_capacity
1551 	 */
1552 	if (src->load * dst->compute_capacity * env->imbalance_pct >
1553 
1554 	    dst->load * src->compute_capacity * 100)
1555 		return true;
1556 
1557 	return false;
1558 }
1559 
1560 static int task_numa_migrate(struct task_struct *p)
1561 {
1562 	struct task_numa_env env = {
1563 		.p = p,
1564 
1565 		.src_cpu = task_cpu(p),
1566 		.src_nid = task_node(p),
1567 
1568 		.imbalance_pct = 112,
1569 
1570 		.best_task = NULL,
1571 		.best_imp = 0,
1572 		.best_cpu = -1,
1573 	};
1574 	struct sched_domain *sd;
1575 	unsigned long taskweight, groupweight;
1576 	int nid, ret, dist;
1577 	long taskimp, groupimp;
1578 
1579 	/*
1580 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1581 	 * imbalance and would be the first to start moving tasks about.
1582 	 *
1583 	 * And we want to avoid any moving of tasks about, as that would create
1584 	 * random movement of tasks -- countering the numa conditions we're trying
1585 	 * to satisfy here.
1586 	 */
1587 	rcu_read_lock();
1588 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1589 	if (sd)
1590 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1591 	rcu_read_unlock();
1592 
1593 	/*
1594 	 * Cpusets can break the scheduler domain tree into smaller
1595 	 * balance domains, some of which do not cross NUMA boundaries.
1596 	 * Tasks that are "trapped" in such domains cannot be migrated
1597 	 * elsewhere, so there is no point in (re)trying.
1598 	 */
1599 	if (unlikely(!sd)) {
1600 		p->numa_preferred_nid = task_node(p);
1601 		return -EINVAL;
1602 	}
1603 
1604 	env.dst_nid = p->numa_preferred_nid;
1605 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1606 	taskweight = task_weight(p, env.src_nid, dist);
1607 	groupweight = group_weight(p, env.src_nid, dist);
1608 	update_numa_stats(&env.src_stats, env.src_nid);
1609 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1610 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1611 	update_numa_stats(&env.dst_stats, env.dst_nid);
1612 
1613 	/* Try to find a spot on the preferred nid. */
1614 	if (numa_has_capacity(&env))
1615 		task_numa_find_cpu(&env, taskimp, groupimp);
1616 
1617 	/*
1618 	 * Look at other nodes in these cases:
1619 	 * - there is no space available on the preferred_nid
1620 	 * - the task is part of a numa_group that is interleaved across
1621 	 *   multiple NUMA nodes; in order to better consolidate the group,
1622 	 *   we need to check other locations.
1623 	 */
1624 	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1625 		for_each_online_node(nid) {
1626 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1627 				continue;
1628 
1629 			dist = node_distance(env.src_nid, env.dst_nid);
1630 			if (sched_numa_topology_type == NUMA_BACKPLANE &&
1631 						dist != env.dist) {
1632 				taskweight = task_weight(p, env.src_nid, dist);
1633 				groupweight = group_weight(p, env.src_nid, dist);
1634 			}
1635 
1636 			/* Only consider nodes where both task and groups benefit */
1637 			taskimp = task_weight(p, nid, dist) - taskweight;
1638 			groupimp = group_weight(p, nid, dist) - groupweight;
1639 			if (taskimp < 0 && groupimp < 0)
1640 				continue;
1641 
1642 			env.dist = dist;
1643 			env.dst_nid = nid;
1644 			update_numa_stats(&env.dst_stats, env.dst_nid);
1645 			if (numa_has_capacity(&env))
1646 				task_numa_find_cpu(&env, taskimp, groupimp);
1647 		}
1648 	}
1649 
1650 	/*
1651 	 * If the task is part of a workload that spans multiple NUMA nodes,
1652 	 * and is migrating into one of the workload's active nodes, remember
1653 	 * this node as the task's preferred numa node, so the workload can
1654 	 * settle down.
1655 	 * A task that migrated to a second choice node will be better off
1656 	 * trying for a better one later. Do not set the preferred node here.
1657 	 */
1658 	if (p->numa_group) {
1659 		struct numa_group *ng = p->numa_group;
1660 
1661 		if (env.best_cpu == -1)
1662 			nid = env.src_nid;
1663 		else
1664 			nid = env.dst_nid;
1665 
1666 		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1667 			sched_setnuma(p, env.dst_nid);
1668 	}
1669 
1670 	/* No better CPU than the current one was found. */
1671 	if (env.best_cpu == -1)
1672 		return -EAGAIN;
1673 
1674 	/*
1675 	 * Reset the scan period if the task is being rescheduled on an
1676 	 * alternative node to recheck if the task is now properly placed.
1677 	 */
1678 	p->numa_scan_period = task_scan_min(p);
1679 
1680 	if (env.best_task == NULL) {
1681 		ret = migrate_task_to(p, env.best_cpu);
1682 		if (ret != 0)
1683 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1684 		return ret;
1685 	}
1686 
1687 	ret = migrate_swap(p, env.best_task);
1688 	if (ret != 0)
1689 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1690 	put_task_struct(env.best_task);
1691 	return ret;
1692 }
1693 
1694 /* Attempt to migrate a task to a CPU on the preferred node. */
1695 static void numa_migrate_preferred(struct task_struct *p)
1696 {
1697 	unsigned long interval = HZ;
1698 
1699 	/* This task has no NUMA fault statistics yet */
1700 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1701 		return;
1702 
1703 	/* Periodically retry migrating the task to the preferred node */
1704 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1705 	p->numa_migrate_retry = jiffies + interval;
1706 
1707 	/* Success if task is already running on preferred CPU */
1708 	if (task_node(p) == p->numa_preferred_nid)
1709 		return;
1710 
1711 	/* Otherwise, try migrate to a CPU on the preferred node */
1712 	task_numa_migrate(p);
1713 }
1714 
1715 /*
1716  * Find out how many nodes the workload is actively running on. Do this by
1717  * tracking the nodes from which NUMA hinting faults are triggered. This can
1718  * be different from the set of nodes where the workload's memory is currently
1719  * located.
1720  */
1721 static void numa_group_count_active_nodes(struct numa_group *numa_group)
1722 {
1723 	unsigned long faults, max_faults = 0;
1724 	int nid, active_nodes = 0;
1725 
1726 	for_each_online_node(nid) {
1727 		faults = group_faults_cpu(numa_group, nid);
1728 		if (faults > max_faults)
1729 			max_faults = faults;
1730 	}
1731 
1732 	for_each_online_node(nid) {
1733 		faults = group_faults_cpu(numa_group, nid);
1734 		if (faults * ACTIVE_NODE_FRACTION > max_faults)
1735 			active_nodes++;
1736 	}
1737 
1738 	numa_group->max_faults_cpu = max_faults;
1739 	numa_group->active_nodes = active_nodes;
1740 }
1741 
1742 /*
1743  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1744  * increments. The more local the fault statistics are, the higher the scan
1745  * period will be for the next scan window. If local/(local+remote) ratio is
1746  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1747  * the scan period will decrease. Aim for 70% local accesses.
1748  */
1749 #define NUMA_PERIOD_SLOTS 10
1750 #define NUMA_PERIOD_THRESHOLD 7
1751 
1752 /*
1753  * Increase the scan period (slow down scanning) if the majority of
1754  * our memory is already on our local node, or if the majority of
1755  * the page accesses are shared with other processes.
1756  * Otherwise, decrease the scan period.
1757  */
1758 static void update_task_scan_period(struct task_struct *p,
1759 			unsigned long shared, unsigned long private)
1760 {
1761 	unsigned int period_slot;
1762 	int ratio;
1763 	int diff;
1764 
1765 	unsigned long remote = p->numa_faults_locality[0];
1766 	unsigned long local = p->numa_faults_locality[1];
1767 
1768 	/*
1769 	 * If there were no recorded hinting faults then either the task is
1770 	 * completely idle or all activity is in areas that are not of interest
1771 	 * to automatic numa balancing. Related to that, if there were failed
1772 	 * migrations then it implies we are migrating too quickly or the local
1773 	 * node is overloaded. In either case, scan slower.
1774 	 */
1775 	if (local + shared == 0 || p->numa_faults_locality[2]) {
1776 		p->numa_scan_period = min(p->numa_scan_period_max,
1777 			p->numa_scan_period << 1);
1778 
1779 		p->mm->numa_next_scan = jiffies +
1780 			msecs_to_jiffies(p->numa_scan_period);
1781 
1782 		return;
1783 	}
1784 
1785 	/*
1786 	 * Prepare to scale scan period relative to the current period.
1787 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1788 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1789 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1790 	 */
1791 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1792 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1793 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1794 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1795 		if (!slot)
1796 			slot = 1;
1797 		diff = slot * period_slot;
1798 	} else {
1799 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1800 
1801 		/*
1802 		 * Scale scan rate increases based on sharing. There is an
1803 		 * inverse relationship between the degree of sharing and
1804 		 * the adjustment made to the scanning period. Broadly
1805 		 * speaking the intent is that there is little point
1806 		 * scanning faster if shared accesses dominate as it may
1807 		 * simply bounce migrations uselessly
1808 		 */
1809 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1810 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1811 	}
1812 
1813 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1814 			task_scan_min(p), task_scan_max(p));
1815 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1816 }
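/*
 * Illustrative example (a simplified sketch, not used by the kernel): how the
 * slot arithmetic above maps a locality ratio onto a scan period change,
 * assuming a hypothetical current period of 1000 (in the same units as
 * p->numa_scan_period). All values below are local to the example.
 */
#if 0
static int example_scan_period_diff(void)
{
	unsigned int cur_period = 1000;			/* hypothetical period */
	unsigned long local = 900, remote = 100;	/* 90% local faults */
	unsigned int period_slot;
	int ratio, slot;

	period_slot = DIV_ROUND_UP(cur_period, NUMA_PERIOD_SLOTS);	/* 100 */
	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);		/* 9 */

	/* ratio >= NUMA_PERIOD_THRESHOLD (7): scanning slows down */
	slot = ratio - NUMA_PERIOD_THRESHOLD;				/* 2 */
	if (!slot)
		slot = 1;
	return slot * period_slot;	/* +200 added to the scan period */
}
#endif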
1817 
1818 /*
1819  * Get the fraction of time the task has been running since the last
1820  * NUMA placement cycle. The scheduler keeps similar statistics, but
1821  * decays those on a 32ms period, which is orders of magnitude off
1822  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1823  * stats only if the task is so new there are no NUMA statistics yet.
1824  */
1825 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1826 {
1827 	u64 runtime, delta, now;
1828 	/* Use the start of this time slice to avoid calculations. */
1829 	now = p->se.exec_start;
1830 	runtime = p->se.sum_exec_runtime;
1831 
1832 	if (p->last_task_numa_placement) {
1833 		delta = runtime - p->last_sum_exec_runtime;
1834 		*period = now - p->last_task_numa_placement;
1835 	} else {
1836 		delta = p->se.avg.load_sum / p->se.load.weight;
1837 		*period = LOAD_AVG_MAX;
1838 	}
1839 
1840 	p->last_sum_exec_runtime = runtime;
1841 	p->last_task_numa_placement = now;
1842 
1843 	return delta;
1844 }
1845 
1846 /*
1847  * Determine the preferred nid for a task in a numa_group. This needs to
1848  * be done in a way that produces consistent results with group_weight,
1849  * otherwise workloads might not converge.
1850  */
1851 static int preferred_group_nid(struct task_struct *p, int nid)
1852 {
1853 	nodemask_t nodes;
1854 	int dist;
1855 
1856 	/* Direct connections between all NUMA nodes. */
1857 	if (sched_numa_topology_type == NUMA_DIRECT)
1858 		return nid;
1859 
1860 	/*
1861 	 * On a system with glueless mesh NUMA topology, group_weight
1862 	 * scores nodes according to the number of NUMA hinting faults on
1863 	 * both the node itself, and on nearby nodes.
1864 	 */
1865 	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1866 		unsigned long score, max_score = 0;
1867 		int node, max_node = nid;
1868 
1869 		dist = sched_max_numa_distance;
1870 
1871 		for_each_online_node(node) {
1872 			score = group_weight(p, node, dist);
1873 			if (score > max_score) {
1874 				max_score = score;
1875 				max_node = node;
1876 			}
1877 		}
1878 		return max_node;
1879 	}
1880 
1881 	/*
1882 	 * Finding the preferred nid in a system with NUMA backplane
1883 	 * interconnect topology is more involved. The goal is to locate
1884 	 * tasks from numa_groups near each other in the system, and
1885 	 * untangle workloads from different sides of the system. This requires
1886 	 * searching down the hierarchy of node groups, recursively searching
1887 	 * inside the highest scoring group of nodes. The nodemask tricks
1888 	 * keep the complexity of the search down.
1889 	 */
1890 	nodes = node_online_map;
1891 	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1892 		unsigned long max_faults = 0;
1893 		nodemask_t max_group = NODE_MASK_NONE;
1894 		int a, b;
1895 
1896 		/* Are there nodes at this distance from each other? */
1897 		if (!find_numa_distance(dist))
1898 			continue;
1899 
1900 		for_each_node_mask(a, nodes) {
1901 			unsigned long faults = 0;
1902 			nodemask_t this_group;
1903 			nodes_clear(this_group);
1904 
1905 			/* Sum group's NUMA faults; includes a==b case. */
1906 			for_each_node_mask(b, nodes) {
1907 				if (node_distance(a, b) < dist) {
1908 					faults += group_faults(p, b);
1909 					node_set(b, this_group);
1910 					node_clear(b, nodes);
1911 				}
1912 			}
1913 
1914 			/* Remember the top group. */
1915 			if (faults > max_faults) {
1916 				max_faults = faults;
1917 				max_group = this_group;
1918 				/*
1919 				 * subtle: at the smallest distance there is
1920 				 * just one node left in each "group", the
1921 				 * winner is the preferred nid.
1922 				 */
1923 				nid = a;
1924 			}
1925 		}
1926 		/* Next round, evaluate the nodes within max_group. */
1927 		if (!max_faults)
1928 			break;
1929 		nodes = max_group;
1930 	}
1931 	return nid;
1932 }
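/*
 * Illustrative note (a simplified description of the search above): on a
 * hypothetical 8-node backplane system the first pass partitions the online
 * nodes into groups whose members are closer than sched_max_numa_distance,
 * keeps the group with the most faults, and then repeats with ever smaller
 * distances inside that group; once only single-node "groups" remain, the
 * winner of the last pass becomes the preferred nid.
 */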
1933 
1934 static void task_numa_placement(struct task_struct *p)
1935 {
1936 	int seq, nid, max_nid = -1, max_group_nid = -1;
1937 	unsigned long max_faults = 0, max_group_faults = 0;
1938 	unsigned long fault_types[2] = { 0, 0 };
1939 	unsigned long total_faults;
1940 	u64 runtime, period;
1941 	spinlock_t *group_lock = NULL;
1942 
1943 	/*
1944 	 * The p->mm->numa_scan_seq field gets updated without
1945 	 * exclusive access. Use READ_ONCE() here to ensure
1946 	 * that the field is read in a single access:
1947 	 */
1948 	seq = READ_ONCE(p->mm->numa_scan_seq);
1949 	if (p->numa_scan_seq == seq)
1950 		return;
1951 	p->numa_scan_seq = seq;
1952 	p->numa_scan_period_max = task_scan_max(p);
1953 
1954 	total_faults = p->numa_faults_locality[0] +
1955 		       p->numa_faults_locality[1];
1956 	runtime = numa_get_avg_runtime(p, &period);
1957 
1958 	/* If the task is part of a group prevent parallel updates to group stats */
1959 	if (p->numa_group) {
1960 		group_lock = &p->numa_group->lock;
1961 		spin_lock_irq(group_lock);
1962 	}
1963 
1964 	/* Find the node with the highest number of faults */
1965 	for_each_online_node(nid) {
1966 		/* Keep track of the offsets in numa_faults array */
1967 		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1968 		unsigned long faults = 0, group_faults = 0;
1969 		int priv;
1970 
1971 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1972 			long diff, f_diff, f_weight;
1973 
1974 			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1975 			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1976 			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1977 			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1978 
1979 			/* Decay existing window, copy faults since last scan */
1980 			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1981 			fault_types[priv] += p->numa_faults[membuf_idx];
1982 			p->numa_faults[membuf_idx] = 0;
1983 
1984 			/*
1985 			 * Normalize the faults_from, so all tasks in a group
1986 			 * count according to CPU use, instead of by the raw
1987 			 * number of faults. Tasks with little runtime have
1988 			 * little over-all impact on throughput, and thus their
1989 			 * faults are less important.
1990 			 */
1991 			f_weight = div64_u64(runtime << 16, period + 1);
1992 			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1993 				   (total_faults + 1);
1994 			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1995 			p->numa_faults[cpubuf_idx] = 0;
1996 
1997 			p->numa_faults[mem_idx] += diff;
1998 			p->numa_faults[cpu_idx] += f_diff;
1999 			faults += p->numa_faults[mem_idx];
2000 			p->total_numa_faults += diff;
2001 			if (p->numa_group) {
2002 				/*
2003 				 * safe because we can only change our own group
2004 				 *
2005 				 * mem_idx represents the offset for a given
2006 				 * nid and priv in a specific region because it
2007 				 * is at the beginning of the numa_faults array.
2008 				 */
2009 				p->numa_group->faults[mem_idx] += diff;
2010 				p->numa_group->faults_cpu[mem_idx] += f_diff;
2011 				p->numa_group->total_faults += diff;
2012 				group_faults += p->numa_group->faults[mem_idx];
2013 			}
2014 		}
2015 
2016 		if (faults > max_faults) {
2017 			max_faults = faults;
2018 			max_nid = nid;
2019 		}
2020 
2021 		if (group_faults > max_group_faults) {
2022 			max_group_faults = group_faults;
2023 			max_group_nid = nid;
2024 		}
2025 	}
2026 
2027 	update_task_scan_period(p, fault_types[0], fault_types[1]);
2028 
2029 	if (p->numa_group) {
2030 		numa_group_count_active_nodes(p->numa_group);
2031 		spin_unlock_irq(group_lock);
2032 		max_nid = preferred_group_nid(p, max_group_nid);
2033 	}
2034 
2035 	if (max_faults) {
2036 		/* Set the new preferred node */
2037 		if (max_nid != p->numa_preferred_nid)
2038 			sched_setnuma(p, max_nid);
2039 
2040 		if (task_node(p) != p->numa_preferred_nid)
2041 			numa_migrate_preferred(p);
2042 	}
2043 }
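/*
 * Illustrative example (a simplified sketch, not used by the kernel): the
 * per-node fault statistic updated above behaves like an exponentially
 * decaying counter -- each placement cycle keeps half of the previous value
 * and adds the faults recorded since the last scan.
 */
#if 0
static unsigned long example_decayed_faults(unsigned long old_faults,
					    unsigned long new_faults)
{
	/* mirrors: diff = faults_buf - faults / 2; faults += diff */
	return old_faults / 2 + new_faults;
}
/* e.g. old=100, new=40 gives 90; with no new faults the count halves each cycle */
#endif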
2044 
2045 static inline int get_numa_group(struct numa_group *grp)
2046 {
2047 	return atomic_inc_not_zero(&grp->refcount);
2048 }
2049 
2050 static inline void put_numa_group(struct numa_group *grp)
2051 {
2052 	if (atomic_dec_and_test(&grp->refcount))
2053 		kfree_rcu(grp, rcu);
2054 }
2055 
2056 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2057 			int *priv)
2058 {
2059 	struct numa_group *grp, *my_grp;
2060 	struct task_struct *tsk;
2061 	bool join = false;
2062 	int cpu = cpupid_to_cpu(cpupid);
2063 	int i;
2064 
2065 	if (unlikely(!p->numa_group)) {
2066 		unsigned int size = sizeof(struct numa_group) +
2067 				    4*nr_node_ids*sizeof(unsigned long);
2068 
2069 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2070 		if (!grp)
2071 			return;
2072 
2073 		atomic_set(&grp->refcount, 1);
2074 		grp->active_nodes = 1;
2075 		grp->max_faults_cpu = 0;
2076 		spin_lock_init(&grp->lock);
2077 		grp->gid = p->pid;
2078 		/* Second half of the array tracks nids where faults happen */
2079 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2080 						nr_node_ids;
2081 
2082 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2083 			grp->faults[i] = p->numa_faults[i];
2084 
2085 		grp->total_faults = p->total_numa_faults;
2086 
2087 		grp->nr_tasks++;
2088 		rcu_assign_pointer(p->numa_group, grp);
2089 	}
2090 
2091 	rcu_read_lock();
2092 	tsk = READ_ONCE(cpu_rq(cpu)->curr);
2093 
2094 	if (!cpupid_match_pid(tsk, cpupid))
2095 		goto no_join;
2096 
2097 	grp = rcu_dereference(tsk->numa_group);
2098 	if (!grp)
2099 		goto no_join;
2100 
2101 	my_grp = p->numa_group;
2102 	if (grp == my_grp)
2103 		goto no_join;
2104 
2105 	/*
2106 	 * Only join the other group if it's bigger; if we're the bigger group,
2107 	 * the other task will join us.
2108 	 */
2109 	if (my_grp->nr_tasks > grp->nr_tasks)
2110 		goto no_join;
2111 
2112 	/*
2113 	 * Tie-break on the grp address.
2114 	 */
2115 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2116 		goto no_join;
2117 
2118 	/* Always join threads in the same process. */
2119 	if (tsk->mm == current->mm)
2120 		join = true;
2121 
2122 	/* Simple filter to avoid false positives due to PID collisions */
2123 	if (flags & TNF_SHARED)
2124 		join = true;
2125 
2126 	/* Update priv based on whether false sharing was detected */
2127 	*priv = !join;
2128 
2129 	if (join && !get_numa_group(grp))
2130 		goto no_join;
2131 
2132 	rcu_read_unlock();
2133 
2134 	if (!join)
2135 		return;
2136 
2137 	BUG_ON(irqs_disabled());
2138 	double_lock_irq(&my_grp->lock, &grp->lock);
2139 
2140 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2141 		my_grp->faults[i] -= p->numa_faults[i];
2142 		grp->faults[i] += p->numa_faults[i];
2143 	}
2144 	my_grp->total_faults -= p->total_numa_faults;
2145 	grp->total_faults += p->total_numa_faults;
2146 
2147 	my_grp->nr_tasks--;
2148 	grp->nr_tasks++;
2149 
2150 	spin_unlock(&my_grp->lock);
2151 	spin_unlock_irq(&grp->lock);
2152 
2153 	rcu_assign_pointer(p->numa_group, grp);
2154 
2155 	put_numa_group(my_grp);
2156 	return;
2157 
2158 no_join:
2159 	rcu_read_unlock();
2160 	return;
2161 }
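/*
 * Illustrative note (a layout summary inferred from the code above): the
 * numa_group fault array consists of two consecutive regions of
 * NR_NUMA_HINT_FAULT_TYPES * nr_node_ids counters each -- faults[] indexed
 * by the memory node, followed by faults_cpu[] indexed by the node of the
 * faulting CPU -- which is why task_numa_placement() can reuse the same
 * mem_idx offset for both grp->faults[] and grp->faults_cpu[].
 */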
2162 
2163 void task_numa_free(struct task_struct *p)
2164 {
2165 	struct numa_group *grp = p->numa_group;
2166 	void *numa_faults = p->numa_faults;
2167 	unsigned long flags;
2168 	int i;
2169 
2170 	if (grp) {
2171 		spin_lock_irqsave(&grp->lock, flags);
2172 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2173 			grp->faults[i] -= p->numa_faults[i];
2174 		grp->total_faults -= p->total_numa_faults;
2175 
2176 		grp->nr_tasks--;
2177 		spin_unlock_irqrestore(&grp->lock, flags);
2178 		RCU_INIT_POINTER(p->numa_group, NULL);
2179 		put_numa_group(grp);
2180 	}
2181 
2182 	p->numa_faults = NULL;
2183 	kfree(numa_faults);
2184 }
2185 
2186 /*
2187  * Got a PROT_NONE fault for a page on @node.
2188  */
2189 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2190 {
2191 	struct task_struct *p = current;
2192 	bool migrated = flags & TNF_MIGRATED;
2193 	int cpu_node = task_node(current);
2194 	int local = !!(flags & TNF_FAULT_LOCAL);
2195 	struct numa_group *ng;
2196 	int priv;
2197 
2198 	if (!static_branch_likely(&sched_numa_balancing))
2199 		return;
2200 
2201 	/* for example, ksmd faulting in a user's mm */
2202 	if (!p->mm)
2203 		return;
2204 
2205 	/* Allocate buffer to track faults on a per-node basis */
2206 	if (unlikely(!p->numa_faults)) {
2207 		int size = sizeof(*p->numa_faults) *
2208 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2209 
2210 		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2211 		if (!p->numa_faults)
2212 			return;
2213 
2214 		p->total_numa_faults = 0;
2215 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2216 	}
2217 
2218 	/*
2219 	 * First accesses are treated as private, otherwise consider accesses
2220 	 * to be private if the accessing pid has not changed
2221 	 */
2222 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2223 		priv = 1;
2224 	} else {
2225 		priv = cpupid_match_pid(p, last_cpupid);
2226 		if (!priv && !(flags & TNF_NO_GROUP))
2227 			task_numa_group(p, last_cpupid, flags, &priv);
2228 	}
2229 
2230 	/*
2231 	 * If a workload spans multiple NUMA nodes, a shared fault that
2232 	 * occurs wholly within the set of nodes that the workload is
2233 	 * actively using should be counted as local. This allows the
2234 	 * scan rate to slow down when a workload has settled down.
2235 	 */
2236 	ng = p->numa_group;
2237 	if (!priv && !local && ng && ng->active_nodes > 1 &&
2238 				numa_is_active_node(cpu_node, ng) &&
2239 				numa_is_active_node(mem_node, ng))
2240 		local = 1;
2241 
2242 	task_numa_placement(p);
2243 
2244 	/*
2245 	 * Retry migrating the task to its preferred node periodically, in case
2246 	 * it previously failed, or the scheduler moved us.
2247 	 */
2248 	if (time_after(jiffies, p->numa_migrate_retry))
2249 		numa_migrate_preferred(p);
2250 
2251 	if (migrated)
2252 		p->numa_pages_migrated += pages;
2253 	if (flags & TNF_MIGRATE_FAIL)
2254 		p->numa_faults_locality[2] += pages;
2255 
2256 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2257 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2258 	p->numa_faults_locality[local] += pages;
2259 }
2260 
2261 static void reset_ptenuma_scan(struct task_struct *p)
2262 {
2263 	/*
2264 	 * We only did a read acquisition of the mmap sem, so
2265 	 * p->mm->numa_scan_seq is written to without exclusive access
2266 	 * and the update is not guaranteed to be atomic. That's not
2267 	 * much of an issue though, since this is just used for
2268 	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2269 	 * expensive, to avoid any form of compiler optimizations:
2270 	 */
2271 	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2272 	p->mm->numa_scan_offset = 0;
2273 }
2274 
2275 /*
2276  * The expensive part of numa migration is done from task_work context.
2277  * Triggered from task_tick_numa().
2278  */
2279 void task_numa_work(struct callback_head *work)
2280 {
2281 	unsigned long migrate, next_scan, now = jiffies;
2282 	struct task_struct *p = current;
2283 	struct mm_struct *mm = p->mm;
2284 	u64 runtime = p->se.sum_exec_runtime;
2285 	struct vm_area_struct *vma;
2286 	unsigned long start, end;
2287 	unsigned long nr_pte_updates = 0;
2288 	long pages, virtpages;
2289 
2290 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2291 
2292 	work->next = work; /* protect against double add */
2293 	/*
2294 	 * Who cares about NUMA placement when they're dying.
2295 	 *
2296 	 * NOTE: make sure not to dereference p->mm before this check,
2297 	 * exit_task_work() happens _after_ exit_mm() so we could be called
2298 	 * without p->mm even though we still had it when we enqueued this
2299 	 * work.
2300 	 */
2301 	if (p->flags & PF_EXITING)
2302 		return;
2303 
2304 	if (!mm->numa_next_scan) {
2305 		mm->numa_next_scan = now +
2306 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2307 	}
2308 
2309 	/*
2310 	 * Enforce maximal scan/migration frequency..
2311 	 */
2312 	migrate = mm->numa_next_scan;
2313 	if (time_before(now, migrate))
2314 		return;
2315 
2316 	if (p->numa_scan_period == 0) {
2317 		p->numa_scan_period_max = task_scan_max(p);
2318 		p->numa_scan_period = task_scan_min(p);
2319 	}
2320 
2321 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2322 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2323 		return;
2324 
2325 	/*
2326 	 * Delay this task enough that another task of this mm will likely win
2327 	 * the next time around.
2328 	 */
2329 	p->node_stamp += 2 * TICK_NSEC;
2330 
2331 	start = mm->numa_scan_offset;
2332 	pages = sysctl_numa_balancing_scan_size;
2333 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2334 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
2335 	if (!pages)
2336 		return;
2337 
2338 
2339 	down_read(&mm->mmap_sem);
2340 	vma = find_vma(mm, start);
2341 	if (!vma) {
2342 		reset_ptenuma_scan(p);
2343 		start = 0;
2344 		vma = mm->mmap;
2345 	}
2346 	for (; vma; vma = vma->vm_next) {
2347 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2348 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2349 			continue;
2350 		}
2351 
2352 		/*
2353 		 * Shared library pages mapped by multiple processes are not
2354 		 * migrated as it is expected they are cache replicated. Avoid
2355 		 * hinting faults in read-only file-backed mappings or the vdso
2356 		 * as migrating the pages will be of marginal benefit.
2357 		 */
2358 		if (!vma->vm_mm ||
2359 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2360 			continue;
2361 
2362 		/*
2363 		 * Skip inaccessible VMAs to avoid any confusion between
2364 		 * PROT_NONE and NUMA hinting ptes
2365 		 */
2366 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2367 			continue;
2368 
2369 		do {
2370 			start = max(start, vma->vm_start);
2371 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2372 			end = min(end, vma->vm_end);
2373 			nr_pte_updates = change_prot_numa(vma, start, end);
2374 
2375 			/*
2376 			 * Try to scan sysctl_numa_balancing_scan_size worth of
2377 			 * hpages that have at least one present PTE that
2378 			 * is not already pte-numa. If the VMA contains
2379 			 * areas that are unused or already full of prot_numa
2380 			 * PTEs, scan up to virtpages, to skip through those
2381 			 * areas faster.
2382 			 */
2383 			if (nr_pte_updates)
2384 				pages -= (end - start) >> PAGE_SHIFT;
2385 			virtpages -= (end - start) >> PAGE_SHIFT;
2386 
2387 			start = end;
2388 			if (pages <= 0 || virtpages <= 0)
2389 				goto out;
2390 
2391 			cond_resched();
2392 		} while (end != vma->vm_end);
2393 	}
2394 
2395 out:
2396 	/*
2397 	 * It is possible to reach the end of the VMA list but the last few
2398 	 * VMAs are not guaranteed to be migratable. If they are not, we
2399 	 * would find the !migratable VMA on the next scan but not reset the
2400 	 * scanner to the start so check it now.
2401 	 */
2402 	if (vma)
2403 		mm->numa_scan_offset = start;
2404 	else
2405 		reset_ptenuma_scan(p);
2406 	up_read(&mm->mmap_sem);
2407 
2408 	/*
2409 	 * Make sure tasks use at least 32x as much time to run other code
2410 	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2411 	 * Usually update_task_scan_period slows down scanning enough; on an
2412 	 * overloaded system we need to limit overhead on a per task basis.
2413 	 */
2414 	if (unlikely(p->se.sum_exec_runtime != runtime)) {
2415 		u64 diff = p->se.sum_exec_runtime - runtime;
2416 		p->node_stamp += 32 * diff;
2417 	}
2418 }
2419 
2420 /*
2421  * Drive the periodic memory faults..
2422  */
2423 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2424 {
2425 	struct callback_head *work = &curr->numa_work;
2426 	u64 period, now;
2427 
2428 	/*
2429 	 * We don't care about NUMA placement if we don't have memory.
2430 	 */
2431 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2432 		return;
2433 
2434 	/*
2435 	 * Using runtime rather than walltime has the dual advantage that
2436 	 * we (mostly) drive the selection from busy threads and that the
2437 	 * task needs to have done some actual work before we bother with
2438 	 * NUMA placement.
2439 	 */
2440 	now = curr->se.sum_exec_runtime;
2441 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2442 
2443 	if (now > curr->node_stamp + period) {
2444 		if (!curr->node_stamp)
2445 			curr->numa_scan_period = task_scan_min(curr);
2446 		curr->node_stamp += period;
2447 
2448 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2449 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2450 			task_work_add(curr, work, true);
2451 		}
2452 	}
2453 }
2454 #else
2455 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2456 {
2457 }
2458 
2459 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2460 {
2461 }
2462 
2463 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2464 {
2465 }
2466 #endif /* CONFIG_NUMA_BALANCING */
2467 
2468 static void
2469 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2470 {
2471 	update_load_add(&cfs_rq->load, se->load.weight);
2472 	if (!parent_entity(se))
2473 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2474 #ifdef CONFIG_SMP
2475 	if (entity_is_task(se)) {
2476 		struct rq *rq = rq_of(cfs_rq);
2477 
2478 		account_numa_enqueue(rq, task_of(se));
2479 		list_add(&se->group_node, &rq->cfs_tasks);
2480 	}
2481 #endif
2482 	cfs_rq->nr_running++;
2483 }
2484 
2485 static void
2486 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2487 {
2488 	update_load_sub(&cfs_rq->load, se->load.weight);
2489 	if (!parent_entity(se))
2490 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2491 #ifdef CONFIG_SMP
2492 	if (entity_is_task(se)) {
2493 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2494 		list_del_init(&se->group_node);
2495 	}
2496 #endif
2497 	cfs_rq->nr_running--;
2498 }
2499 
2500 #ifdef CONFIG_FAIR_GROUP_SCHED
2501 # ifdef CONFIG_SMP
2502 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2503 {
2504 	long tg_weight;
2505 
2506 	/*
2507 	 * Use this CPU's real-time load instead of the last load contribution
2508 	 * as the updating of the contribution is delayed, and we will use
2509 	 * the real-time load to calc the share. See update_tg_load_avg().
2510 	 */
2511 	tg_weight = atomic_long_read(&tg->load_avg);
2512 	tg_weight -= cfs_rq->tg_load_avg_contrib;
2513 	tg_weight += cfs_rq->load.weight;
2514 
2515 	return tg_weight;
2516 }
2517 
2518 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2519 {
2520 	long tg_weight, load, shares;
2521 
2522 	tg_weight = calc_tg_weight(tg, cfs_rq);
2523 	load = cfs_rq->load.weight;
2524 
2525 	shares = (tg->shares * load);
2526 	if (tg_weight)
2527 		shares /= tg_weight;
2528 
2529 	if (shares < MIN_SHARES)
2530 		shares = MIN_SHARES;
2531 	if (shares > tg->shares)
2532 		shares = tg->shares;
2533 
2534 	return shares;
2535 }
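/*
 * Illustrative example (a simplified sketch with hypothetical numbers, not
 * used by the kernel): rough values for the shares calculation above.
 */
#if 0
static long example_calc_shares(void)
{
	long tg_shares = 1024;		/* tg->shares */
	long cpu_load  = 512;		/* this cfs_rq's load.weight */
	long tg_weight = 2048;		/* approximate group-wide weight */
	long shares = tg_shares * cpu_load / tg_weight;		/* 256 */

	/* clamped as in calc_cfs_shares() */
	return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
#endif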
2536 # else /* CONFIG_SMP */
2537 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2538 {
2539 	return tg->shares;
2540 }
2541 # endif /* CONFIG_SMP */
2542 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2543 			    unsigned long weight)
2544 {
2545 	if (se->on_rq) {
2546 		/* commit outstanding execution time */
2547 		if (cfs_rq->curr == se)
2548 			update_curr(cfs_rq);
2549 		account_entity_dequeue(cfs_rq, se);
2550 	}
2551 
2552 	update_load_set(&se->load, weight);
2553 
2554 	if (se->on_rq)
2555 		account_entity_enqueue(cfs_rq, se);
2556 }
2557 
2558 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2559 
2560 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2561 {
2562 	struct task_group *tg;
2563 	struct sched_entity *se;
2564 	long shares;
2565 
2566 	tg = cfs_rq->tg;
2567 	se = tg->se[cpu_of(rq_of(cfs_rq))];
2568 	if (!se || throttled_hierarchy(cfs_rq))
2569 		return;
2570 #ifndef CONFIG_SMP
2571 	if (likely(se->load.weight == tg->shares))
2572 		return;
2573 #endif
2574 	shares = calc_cfs_shares(cfs_rq, tg);
2575 
2576 	reweight_entity(cfs_rq_of(se), se, shares);
2577 }
2578 #else /* CONFIG_FAIR_GROUP_SCHED */
2579 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2580 {
2581 }
2582 #endif /* CONFIG_FAIR_GROUP_SCHED */
2583 
2584 #ifdef CONFIG_SMP
2585 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2586 static const u32 runnable_avg_yN_inv[] = {
2587 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2588 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2589 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2590 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2591 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2592 	0x85aac367, 0x82cd8698,
2593 };
2594 
2595 /*
2596  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2597  * over-estimates when re-combining.
2598  */
2599 static const u32 runnable_avg_yN_sum[] = {
2600 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2601 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2602 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2603 };
2604 
2605 /*
2606  * Precomputed \Sum y^k { 1<=k<=n, where n%32=0 }. Values are rolled down to
2607  * lower integers. See Documentation/scheduler/sched-avg.txt for how these
2608  * were generated:
2609  */
2610 static const u32 __accumulated_sum_N32[] = {
2611 	    0, 23371, 35056, 40899, 43820, 45281,
2612 	46011, 46376, 46559, 46650, 46696, 46719,
2613 };
2614 
2615 /*
2616  * Approximate:
2617  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2618  */
2619 static __always_inline u64 decay_load(u64 val, u64 n)
2620 {
2621 	unsigned int local_n;
2622 
2623 	if (!n)
2624 		return val;
2625 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2626 		return 0;
2627 
2628 	/* after bounds checking we can collapse to 32-bit */
2629 	local_n = n;
2630 
2631 	/*
2632 	 * As y^PERIOD = 1/2, we can combine
2633 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2634 	 * With a look-up table which covers y^n (n<PERIOD)
2635 	 *
2636 	 * To achieve constant time decay_load.
2637 	 */
2638 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2639 		val >>= local_n / LOAD_AVG_PERIOD;
2640 		local_n %= LOAD_AVG_PERIOD;
2641 	}
2642 
2643 	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2644 	return val;
2645 }
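/*
 * Illustrative example (worked values, not part of the lookup tables above):
 * because y^32 ~= 0.5, a load of 1024 roughly halves every 32 periods:
 *
 *	decay_load(1024,  0) == 1024
 *	decay_load(1024, 32) ~=  512	(val >> 1, then * y^0)
 *	decay_load(1024, 36) ~=  469	(val >> 1, then * y^4)
 *	decay_load(1024, 64) ~=  256	(val >> 2)
 *
 * (results are floored, so the exact values can come out one lower)
 */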
2646 
2647 /*
2648  * For updates fully spanning n periods, the contribution to runnable
2649  * average will be: \Sum 1024*y^n
2650  *
2651  * We can compute this reasonably efficiently by combining:
2652  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2653  */
2654 static u32 __compute_runnable_contrib(u64 n)
2655 {
2656 	u32 contrib = 0;
2657 
2658 	if (likely(n <= LOAD_AVG_PERIOD))
2659 		return runnable_avg_yN_sum[n];
2660 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2661 		return LOAD_AVG_MAX;
2662 
2663 	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
2664 	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
2665 	n %= LOAD_AVG_PERIOD;
2666 	contrib = decay_load(contrib, n);
2667 	return contrib + runnable_avg_yN_sum[n];
2668 }
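/*
 * Illustrative example (worked values for the combination above): for
 * n = 40 fully runnable periods,
 *
 *	contrib = decay_load(__accumulated_sum_N32[1], 8) + runnable_avg_yN_sum[8]
 *		~= decay_load(23371, 8) + 7437
 *		~= 19652 + 7437 ~= 27089
 *
 * and for very large n the sum saturates at LOAD_AVG_MAX.
 */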
2669 
2670 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2671 
2672 /*
2673  * We can represent the historical contribution to runnable average as the
2674  * coefficients of a geometric series.  To do this we sub-divide our runnable
2675  * history into segments of approximately 1ms (1024us); label the segment that
2676  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2677  *
2678  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2679  *      p0            p1           p2
2680  *     (now)       (~1ms ago)  (~2ms ago)
2681  *
2682  * Let u_i denote the fraction of p_i that the entity was runnable.
2683  *
2684  * We then designate the fractions u_i as our co-efficients, yielding the
2685  * We then designate the fractions u_i as our coefficients, yielding the
2686  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2687  *
2688  * We choose y based on the width of a reasonable scheduling period, fixing:
2689  *   y^32 = 0.5
2690  *
2691  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2692  * approximately half as much as the contribution to load within the last ms
2693  * (u_0).
2694  *
2695  * When a period "rolls over" and we have new u_0`, multiplying the previous
2696  * sum again by y is sufficient to update:
2697  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2698  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2699  */
2700 static __always_inline int
2701 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2702 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
2703 {
2704 	u64 delta, scaled_delta, periods;
2705 	u32 contrib;
2706 	unsigned int delta_w, scaled_delta_w, decayed = 0;
2707 	unsigned long scale_freq, scale_cpu;
2708 
2709 	delta = now - sa->last_update_time;
2710 	/*
2711 	 * This should only happen when time goes backwards, which it
2712 	 * unfortunately does during sched clock init when we swap over to TSC.
2713 	 */
2714 	if ((s64)delta < 0) {
2715 		sa->last_update_time = now;
2716 		return 0;
2717 	}
2718 
2719 	/*
2720 	 * Use 1024ns as the unit of measurement since it's a reasonable
2721 	 * approximation of 1us and fast to compute.
2722 	 */
2723 	delta >>= 10;
2724 	if (!delta)
2725 		return 0;
2726 	sa->last_update_time = now;
2727 
2728 	scale_freq = arch_scale_freq_capacity(NULL, cpu);
2729 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2730 
2731 	/* delta_w is the amount already accumulated against our next period */
2732 	delta_w = sa->period_contrib;
2733 	if (delta + delta_w >= 1024) {
2734 		decayed = 1;
2735 
2736 		/* how much is left for the next period is not known yet; start over */
2737 		sa->period_contrib = 0;
2738 
2739 		/*
2740 		 * Now that we know we're crossing a period boundary, figure
2741 		 * out how much from delta we need to complete the current
2742 		 * period and accrue it.
2743 		 */
2744 		delta_w = 1024 - delta_w;
2745 		scaled_delta_w = cap_scale(delta_w, scale_freq);
2746 		if (weight) {
2747 			sa->load_sum += weight * scaled_delta_w;
2748 			if (cfs_rq) {
2749 				cfs_rq->runnable_load_sum +=
2750 						weight * scaled_delta_w;
2751 			}
2752 		}
2753 		if (running)
2754 			sa->util_sum += scaled_delta_w * scale_cpu;
2755 
2756 		delta -= delta_w;
2757 
2758 		/* Figure out how many additional periods this update spans */
2759 		periods = delta / 1024;
2760 		delta %= 1024;
2761 
2762 		sa->load_sum = decay_load(sa->load_sum, periods + 1);
2763 		if (cfs_rq) {
2764 			cfs_rq->runnable_load_sum =
2765 				decay_load(cfs_rq->runnable_load_sum, periods + 1);
2766 		}
2767 		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2768 
2769 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2770 		contrib = __compute_runnable_contrib(periods);
2771 		contrib = cap_scale(contrib, scale_freq);
2772 		if (weight) {
2773 			sa->load_sum += weight * contrib;
2774 			if (cfs_rq)
2775 				cfs_rq->runnable_load_sum += weight * contrib;
2776 		}
2777 		if (running)
2778 			sa->util_sum += contrib * scale_cpu;
2779 	}
2780 
2781 	/* Remainder of delta accrued against u_0` */
2782 	scaled_delta = cap_scale(delta, scale_freq);
2783 	if (weight) {
2784 		sa->load_sum += weight * scaled_delta;
2785 		if (cfs_rq)
2786 			cfs_rq->runnable_load_sum += weight * scaled_delta;
2787 	}
2788 	if (running)
2789 		sa->util_sum += scaled_delta * scale_cpu;
2790 
2791 	sa->period_contrib += delta;
2792 
2793 	if (decayed) {
2794 		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2795 		if (cfs_rq) {
2796 			cfs_rq->runnable_load_avg =
2797 				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2798 		}
2799 		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2800 	}
2801 
2802 	return decayed;
2803 }
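/*
 * Illustrative example (a simplified scalar model, not used by the kernel):
 * the per-period recurrence documented above. Each 1024us period the old sum
 * decays by y (y^32 = 0.5) and the new period's runnable contribution
 * (0..1024) is added, so an always-runnable entity's sum converges towards
 * LOAD_AVG_MAX and its load_avg = load_sum / LOAD_AVG_MAX settles near its
 * full weight.
 */
#if 0
static u64 example_period_rollover(u64 prev_sum, u32 runnable_in_period)
{
	/* sum' = y * sum + u_0, with u_0 in [0, 1024] */
	return decay_load(prev_sum, 1) + runnable_in_period;
}
#endif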
2804 
2805 #ifdef CONFIG_FAIR_GROUP_SCHED
2806 /*
2807  * Updating tg's load_avg is necessary before update_cfs_shares() (which is
2808  * done) and effective_load() (which is not done because it is too costly).
2809  */
2810 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2811 {
2812 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2813 
2814 	/*
2815 	 * No need to update load_avg for root_task_group as it is not used.
2816 	 */
2817 	if (cfs_rq->tg == &root_task_group)
2818 		return;
2819 
2820 	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2821 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
2822 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2823 	}
2824 }
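/*
 * Illustrative example (worked numbers for the 1/64 filter above): if this
 * cfs_rq last contributed 6400 to tg->load_avg, a new value is only
 * propagated once its load_avg has drifted by more than 100, which limits
 * how often the shared atomic (and its cacheline) is written from every CPU.
 */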
2825 
2826 /*
2827  * Called within set_task_rq() right before setting a task's cpu. The
2828  * caller only guarantees p->pi_lock is held; no other assumptions,
2829  * including the state of rq->lock, should be made.
2830  */
2831 void set_task_rq_fair(struct sched_entity *se,
2832 		      struct cfs_rq *prev, struct cfs_rq *next)
2833 {
2834 	if (!sched_feat(ATTACH_AGE_LOAD))
2835 		return;
2836 
2837 	/*
2838 	 * We are supposed to update the task to "current" time, then it's up to
2839 	 * date and ready to go to the new CPU/cfs_rq. But we have difficulty in
2840 	 * getting what the current time is, so simply throw away the out-of-date
2841 	 * time. This will result in the wakee task being less decayed, but
2842 	 * giving the wakee more load is not a bad thing.
2843 	 */
2844 	if (se->avg.last_update_time && prev) {
2845 		u64 p_last_update_time;
2846 		u64 n_last_update_time;
2847 
2848 #ifndef CONFIG_64BIT
2849 		u64 p_last_update_time_copy;
2850 		u64 n_last_update_time_copy;
2851 
2852 		do {
2853 			p_last_update_time_copy = prev->load_last_update_time_copy;
2854 			n_last_update_time_copy = next->load_last_update_time_copy;
2855 
2856 			smp_rmb();
2857 
2858 			p_last_update_time = prev->avg.last_update_time;
2859 			n_last_update_time = next->avg.last_update_time;
2860 
2861 		} while (p_last_update_time != p_last_update_time_copy ||
2862 			 n_last_update_time != n_last_update_time_copy);
2863 #else
2864 		p_last_update_time = prev->avg.last_update_time;
2865 		n_last_update_time = next->avg.last_update_time;
2866 #endif
2867 		__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2868 				  &se->avg, 0, 0, NULL);
2869 		se->avg.last_update_time = n_last_update_time;
2870 	}
2871 }
2872 #else /* CONFIG_FAIR_GROUP_SCHED */
2873 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2874 #endif /* CONFIG_FAIR_GROUP_SCHED */
2875 
2876 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2877 
2878 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2879 {
2880 	struct rq *rq = rq_of(cfs_rq);
2881 	int cpu = cpu_of(rq);
2882 
2883 	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2884 		unsigned long max = rq->cpu_capacity_orig;
2885 
2886 		/*
2887 		 * There are a few boundary cases this might miss but it should
2888 		 * get called often enough that this should (hopefully) not be
2889 		 * a real problem -- added to that it only calls on the local
2890 		 * CPU, so if we enqueue remotely we'll miss an update, but
2891 		 * the next tick/schedule should update.
2892 		 *
2893 		 * It will not get called when we go idle, because the idle
2894 		 * thread is a different class (!fair), nor will the utilization
2895 		 * number include things like RT tasks.
2896 		 *
2897 		 * As is, the util number is not freq-invariant (we'd have to
2898 		 * implement arch_scale_freq_capacity() for that).
2899 		 *
2900 		 * See cpu_util().
2901 		 */
2902 		cpufreq_update_util(rq_clock(rq),
2903 				    min(cfs_rq->avg.util_avg, max), max);
2904 	}
2905 }
2906 
2907 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2908 static inline int
2909 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2910 {
2911 	struct sched_avg *sa = &cfs_rq->avg;
2912 	int decayed, removed_load = 0, removed_util = 0;
2913 
2914 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2915 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2916 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
2917 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2918 		removed_load = 1;
2919 	}
2920 
2921 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2922 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2923 		sa->util_avg = max_t(long, sa->util_avg - r, 0);
2924 		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2925 		removed_util = 1;
2926 	}
2927 
2928 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2929 		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2930 
2931 #ifndef CONFIG_64BIT
2932 	smp_wmb();
2933 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
2934 #endif
2935 
2936 	if (update_freq && (decayed || removed_util))
2937 		cfs_rq_util_change(cfs_rq);
2938 
2939 	return decayed || removed_load;
2940 }
2941 
2942 /* Update task and its cfs_rq load average */
2943 static inline void update_load_avg(struct sched_entity *se, int update_tg)
2944 {
2945 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2946 	u64 now = cfs_rq_clock_task(cfs_rq);
2947 	struct rq *rq = rq_of(cfs_rq);
2948 	int cpu = cpu_of(rq);
2949 
2950 	/*
2951 	 * Track task load average for carrying it to a new CPU after migration,
2952 	 * and track group sched_entity load average for task_h_load calc in migration
2953 	 */
2954 	__update_load_avg(now, cpu, &se->avg,
2955 			  se->on_rq * scale_load_down(se->load.weight),
2956 			  cfs_rq->curr == se, NULL);
2957 
2958 	if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
2959 		update_tg_load_avg(cfs_rq, 0);
2960 }
2961 
2962 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2963 {
2964 	if (!sched_feat(ATTACH_AGE_LOAD))
2965 		goto skip_aging;
2966 
2967 	/*
2968 	 * If we got migrated (either between CPUs or between cgroups) we'll
2969 	 * have aged the average right before clearing @last_update_time.
2970 	 */
2971 	if (se->avg.last_update_time) {
2972 		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2973 				  &se->avg, 0, 0, NULL);
2974 
2975 		/*
2976 		 * XXX: we could have just aged the entire load away if we've been
2977 		 * absent from the fair class for too long.
2978 		 */
2979 	}
2980 
2981 skip_aging:
2982 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
2983 	cfs_rq->avg.load_avg += se->avg.load_avg;
2984 	cfs_rq->avg.load_sum += se->avg.load_sum;
2985 	cfs_rq->avg.util_avg += se->avg.util_avg;
2986 	cfs_rq->avg.util_sum += se->avg.util_sum;
2987 
2988 	cfs_rq_util_change(cfs_rq);
2989 }
2990 
2991 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2992 {
2993 	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2994 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
2995 			  cfs_rq->curr == se, NULL);
2996 
2997 	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2998 	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2999 	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
3000 	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
3001 
3002 	cfs_rq_util_change(cfs_rq);
3003 }
3004 
3005 /* Add the load generated by se into cfs_rq's load average */
3006 static inline void
3007 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3008 {
3009 	struct sched_avg *sa = &se->avg;
3010 	u64 now = cfs_rq_clock_task(cfs_rq);
3011 	int migrated, decayed;
3012 
3013 	migrated = !sa->last_update_time;
3014 	if (!migrated) {
3015 		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3016 			se->on_rq * scale_load_down(se->load.weight),
3017 			cfs_rq->curr == se, NULL);
3018 	}
3019 
3020 	decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
3021 
3022 	cfs_rq->runnable_load_avg += sa->load_avg;
3023 	cfs_rq->runnable_load_sum += sa->load_sum;
3024 
3025 	if (migrated)
3026 		attach_entity_load_avg(cfs_rq, se);
3027 
3028 	if (decayed || migrated)
3029 		update_tg_load_avg(cfs_rq, 0);
3030 }
3031 
3032 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
3033 static inline void
3034 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3035 {
3036 	update_load_avg(se, 1);
3037 
3038 	cfs_rq->runnable_load_avg =
3039 		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3040 	cfs_rq->runnable_load_sum =
3041 		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3042 }
3043 
3044 #ifndef CONFIG_64BIT
3045 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3046 {
3047 	u64 last_update_time_copy;
3048 	u64 last_update_time;
3049 
3050 	do {
3051 		last_update_time_copy = cfs_rq->load_last_update_time_copy;
3052 		smp_rmb();
3053 		last_update_time = cfs_rq->avg.last_update_time;
3054 	} while (last_update_time != last_update_time_copy);
3055 
3056 	return last_update_time;
3057 }
3058 #else
3059 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3060 {
3061 	return cfs_rq->avg.last_update_time;
3062 }
3063 #endif
3064 
3065 /*
3066  * Task first catches up with cfs_rq, and then subtracts
3067  * itself from the cfs_rq (task must be off the queue now).
3068  */
3069 void remove_entity_load_avg(struct sched_entity *se)
3070 {
3071 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3072 	u64 last_update_time;
3073 
3074 	/*
3075 	 * A newly created task or a never-used group entity should not be
3076 	 * removed from its (source) cfs_rq
3077 	 */
3078 	if (se->avg.last_update_time == 0)
3079 		return;
3080 
3081 	last_update_time = cfs_rq_last_update_time(cfs_rq);
3082 
3083 	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3084 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3085 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3086 }
3087 
3088 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3089 {
3090 	return cfs_rq->runnable_load_avg;
3091 }
3092 
3093 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3094 {
3095 	return cfs_rq->avg.load_avg;
3096 }
3097 
3098 static int idle_balance(struct rq *this_rq);
3099 
3100 #else /* CONFIG_SMP */
3101 
3102 static inline void update_load_avg(struct sched_entity *se, int not_used)
3103 {
3104 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3105 	struct rq *rq = rq_of(cfs_rq);
3106 
3107 	cpufreq_trigger_update(rq_clock(rq));
3108 }
3109 
3110 static inline void
3111 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3112 static inline void
3113 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3114 static inline void remove_entity_load_avg(struct sched_entity *se) {}
3115 
3116 static inline void
3117 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3118 static inline void
3119 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3120 
3121 static inline int idle_balance(struct rq *rq)
3122 {
3123 	return 0;
3124 }
3125 
3126 #endif /* CONFIG_SMP */
3127 
3128 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3129 {
3130 #ifdef CONFIG_SCHEDSTATS
3131 	struct task_struct *tsk = NULL;
3132 
3133 	if (entity_is_task(se))
3134 		tsk = task_of(se);
3135 
3136 	if (se->statistics.sleep_start) {
3137 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3138 
3139 		if ((s64)delta < 0)
3140 			delta = 0;
3141 
3142 		if (unlikely(delta > se->statistics.sleep_max))
3143 			se->statistics.sleep_max = delta;
3144 
3145 		se->statistics.sleep_start = 0;
3146 		se->statistics.sum_sleep_runtime += delta;
3147 
3148 		if (tsk) {
3149 			account_scheduler_latency(tsk, delta >> 10, 1);
3150 			trace_sched_stat_sleep(tsk, delta);
3151 		}
3152 	}
3153 	if (se->statistics.block_start) {
3154 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3155 
3156 		if ((s64)delta < 0)
3157 			delta = 0;
3158 
3159 		if (unlikely(delta > se->statistics.block_max))
3160 			se->statistics.block_max = delta;
3161 
3162 		se->statistics.block_start = 0;
3163 		se->statistics.sum_sleep_runtime += delta;
3164 
3165 		if (tsk) {
3166 			if (tsk->in_iowait) {
3167 				se->statistics.iowait_sum += delta;
3168 				se->statistics.iowait_count++;
3169 				trace_sched_stat_iowait(tsk, delta);
3170 			}
3171 
3172 			trace_sched_stat_blocked(tsk, delta);
3173 
3174 			/*
3175 			 * Blocking time is in units of nanosecs, so shift by
3176 			 * 20 to get a milliseconds-range estimation of the
3177 			 * amount of time that the task spent sleeping:
3178 			 */
3179 			if (unlikely(prof_on == SLEEP_PROFILING)) {
3180 				profile_hits(SLEEP_PROFILING,
3181 						(void *)get_wchan(tsk),
3182 						delta >> 20);
3183 			}
3184 			account_scheduler_latency(tsk, delta >> 10, 0);
3185 		}
3186 	}
3187 #endif
3188 }
3189 
3190 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3191 {
3192 #ifdef CONFIG_SCHED_DEBUG
3193 	s64 d = se->vruntime - cfs_rq->min_vruntime;
3194 
3195 	if (d < 0)
3196 		d = -d;
3197 
3198 	if (d > 3*sysctl_sched_latency)
3199 		schedstat_inc(cfs_rq, nr_spread_over);
3200 #endif
3201 }
3202 
3203 static void
3204 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3205 {
3206 	u64 vruntime = cfs_rq->min_vruntime;
3207 
3208 	/*
3209 	 * The 'current' period is already promised to the current tasks,
3210 	 * however the extra weight of the new task will slow them down a
3211 	 * little; place the new task so that it fits in the slot that
3212 	 * stays open at the end.
3213 	 */
3214 	if (initial && sched_feat(START_DEBIT))
3215 		vruntime += sched_vslice(cfs_rq, se);
3216 
3217 	/* sleeps up to a single latency don't count. */
3218 	if (!initial) {
3219 		unsigned long thresh = sysctl_sched_latency;
3220 
3221 		/*
3222 		 * Halve their sleep time's effect, to allow
3223 		 * for a gentler effect of sleepers:
3224 		 */
3225 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
3226 			thresh >>= 1;
3227 
3228 		vruntime -= thresh;
3229 	}
3230 
3231 	/* ensure we never gain time by being placed backwards. */
3232 	se->vruntime = max_vruntime(se->vruntime, vruntime);
3233 }
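/*
 * Illustrative example (a simplified sketch, not used by the kernel): with
 * GENTLE_FAIR_SLEEPERS enabled, a waking sleeper is credited half of
 * sysctl_sched_latency, but is never placed behind where it already was.
 */
#if 0
static u64 example_sleeper_vruntime(u64 cfs_min_vruntime, u64 se_vruntime)
{
	u64 vruntime = cfs_min_vruntime - (sysctl_sched_latency >> 1);

	return max_vruntime(se_vruntime, vruntime);
}
#endif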
3234 
3235 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3236 
3237 static inline void check_schedstat_required(void)
3238 {
3239 #ifdef CONFIG_SCHEDSTATS
3240 	if (schedstat_enabled())
3241 		return;
3242 
3243 	/* Force schedstat enabled if a dependent tracepoint is active */
3244 	if (trace_sched_stat_wait_enabled()    ||
3245 			trace_sched_stat_sleep_enabled()   ||
3246 			trace_sched_stat_iowait_enabled()  ||
3247 			trace_sched_stat_blocked_enabled() ||
3248 			trace_sched_stat_runtime_enabled())  {
3249 		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3250 			     "stat_blocked and stat_runtime require the "
3251 			     "kernel parameter schedstats=enabled or "
3252 			     "kernel.sched_schedstats=1\n");
3253 	}
3254 #endif
3255 }
3256 
3257 
3258 /*
3259  * MIGRATION
3260  *
3261  *	dequeue
3262  *	  update_curr()
3263  *	    update_min_vruntime()
3264  *	  vruntime -= min_vruntime
3265  *
3266  *	enqueue
3267  *	  update_curr()
3268  *	    update_min_vruntime()
3269  *	  vruntime += min_vruntime
3270  *
3271  * this way the vruntime transition between RQs is done when both
3272  * min_vruntime are up-to-date.
3273  *
3274  * WAKEUP (remote)
3275  *
3276  *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
3277  *	  vruntime -= min_vruntime
3278  *
3279  *	enqueue
3280  *	  update_curr()
3281  *	    update_min_vruntime()
3282  *	  vruntime += min_vruntime
3283  *
3284  * this way we don't have the most up-to-date min_vruntime on the originating
3285  * CPU, but we do have an up-to-date min_vruntime on the destination CPU.
3286  */
3287 
3288 static void
3289 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3290 {
3291 	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3292 	bool curr = cfs_rq->curr == se;
3293 
3294 	/*
3295 	 * If we're the current task, we must renormalise before calling
3296 	 * update_curr().
3297 	 */
3298 	if (renorm && curr)
3299 		se->vruntime += cfs_rq->min_vruntime;
3300 
3301 	update_curr(cfs_rq);
3302 
3303 	/*
3304 	 * Otherwise, renormalise after, such that we're placed at the current
3305 	 * moment in time, instead of some random moment in the past. Being
3306 	 * placed in the past could significantly boost this task to the
3307 	 * fairness detriment of existing tasks.
3308 	 */
3309 	if (renorm && !curr)
3310 		se->vruntime += cfs_rq->min_vruntime;
3311 
3312 	enqueue_entity_load_avg(cfs_rq, se);
3313 	account_entity_enqueue(cfs_rq, se);
3314 	update_cfs_shares(cfs_rq);
3315 
3316 	if (flags & ENQUEUE_WAKEUP) {
3317 		place_entity(cfs_rq, se, 0);
3318 		if (schedstat_enabled())
3319 			enqueue_sleeper(cfs_rq, se);
3320 	}
3321 
3322 	check_schedstat_required();
3323 	if (schedstat_enabled()) {
3324 		update_stats_enqueue(cfs_rq, se);
3325 		check_spread(cfs_rq, se);
3326 	}
3327 	if (!curr)
3328 		__enqueue_entity(cfs_rq, se);
3329 	se->on_rq = 1;
3330 
3331 	if (cfs_rq->nr_running == 1) {
3332 		list_add_leaf_cfs_rq(cfs_rq);
3333 		check_enqueue_throttle(cfs_rq);
3334 	}
3335 }
3336 
3337 static void __clear_buddies_last(struct sched_entity *se)
3338 {
3339 	for_each_sched_entity(se) {
3340 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3341 		if (cfs_rq->last != se)
3342 			break;
3343 
3344 		cfs_rq->last = NULL;
3345 	}
3346 }
3347 
3348 static void __clear_buddies_next(struct sched_entity *se)
3349 {
3350 	for_each_sched_entity(se) {
3351 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3352 		if (cfs_rq->next != se)
3353 			break;
3354 
3355 		cfs_rq->next = NULL;
3356 	}
3357 }
3358 
3359 static void __clear_buddies_skip(struct sched_entity *se)
3360 {
3361 	for_each_sched_entity(se) {
3362 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3363 		if (cfs_rq->skip != se)
3364 			break;
3365 
3366 		cfs_rq->skip = NULL;
3367 	}
3368 }
3369 
3370 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3371 {
3372 	if (cfs_rq->last == se)
3373 		__clear_buddies_last(se);
3374 
3375 	if (cfs_rq->next == se)
3376 		__clear_buddies_next(se);
3377 
3378 	if (cfs_rq->skip == se)
3379 		__clear_buddies_skip(se);
3380 }
3381 
3382 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3383 
3384 static void
3385 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3386 {
3387 	/*
3388 	 * Update run-time statistics of the 'current'.
3389 	 */
3390 	update_curr(cfs_rq);
3391 	dequeue_entity_load_avg(cfs_rq, se);
3392 
3393 	if (schedstat_enabled())
3394 		update_stats_dequeue(cfs_rq, se, flags);
3395 
3396 	clear_buddies(cfs_rq, se);
3397 
3398 	if (se != cfs_rq->curr)
3399 		__dequeue_entity(cfs_rq, se);
3400 	se->on_rq = 0;
3401 	account_entity_dequeue(cfs_rq, se);
3402 
3403 	/*
3404 	 * Normalize the entity after updating the min_vruntime because the
3405 	 * update can refer to the ->curr item and we need to reflect this
3406 	 * movement in our normalized position.
3407 	 */
3408 	if (!(flags & DEQUEUE_SLEEP))
3409 		se->vruntime -= cfs_rq->min_vruntime;
3410 
3411 	/* return excess runtime on last dequeue */
3412 	return_cfs_rq_runtime(cfs_rq);
3413 
3414 	update_min_vruntime(cfs_rq);
3415 	update_cfs_shares(cfs_rq);
3416 }
3417 
3418 /*
3419  * Preempt the current task with a newly woken task if needed:
3420  */
3421 static void
3422 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3423 {
3424 	unsigned long ideal_runtime, delta_exec;
3425 	struct sched_entity *se;
3426 	s64 delta;
3427 
3428 	ideal_runtime = sched_slice(cfs_rq, curr);
3429 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3430 	if (delta_exec > ideal_runtime) {
3431 		resched_curr(rq_of(cfs_rq));
3432 		/*
3433 		 * The current task ran long enough, ensure it doesn't get
3434 		 * re-elected due to buddy favours.
3435 		 */
3436 		clear_buddies(cfs_rq, curr);
3437 		return;
3438 	}
3439 
3440 	/*
3441 	 * Ensure that a task that missed wakeup preemption by a
3442 	 * narrow margin doesn't have to wait for a full slice.
3443 	 * This also mitigates buddy induced latencies under load.
3444 	 */
3445 	if (delta_exec < sysctl_sched_min_granularity)
3446 		return;
3447 
3448 	se = __pick_first_entity(cfs_rq);
3449 	delta = curr->vruntime - se->vruntime;
3450 
3451 	if (delta < 0)
3452 		return;
3453 
3454 	if (delta > ideal_runtime)
3455 		resched_curr(rq_of(cfs_rq));
3456 }
3457 
3458 static void
3459 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3460 {
3461 	/* 'current' is not kept within the tree. */
3462 	if (se->on_rq) {
3463 		/*
3464 		 * Any task has to be enqueued before it gets to execute on
3465 		 * a CPU. So account for the time it spent waiting on the
3466 		 * runqueue.
3467 		 */
3468 		if (schedstat_enabled())
3469 			update_stats_wait_end(cfs_rq, se);
3470 		__dequeue_entity(cfs_rq, se);
3471 		update_load_avg(se, 1);
3472 	}
3473 
3474 	update_stats_curr_start(cfs_rq, se);
3475 	cfs_rq->curr = se;
3476 #ifdef CONFIG_SCHEDSTATS
3477 	/*
3478 	 * Track our maximum slice length, if the CPU's load is at
3479 	 * least twice that of our own weight (i.e. dont track it
3480 	 * least twice that of our own weight (i.e. don't track it
3481 	 */
3482 	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3483 		se->statistics.slice_max = max(se->statistics.slice_max,
3484 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
3485 	}
3486 #endif
3487 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
3488 }
3489 
3490 static int
3491 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3492 
3493 /*
3494  * Pick the next process, keeping these things in mind, in this order:
3495  * 1) keep things fair between processes/task groups
3496  * 2) pick the "next" process, since someone really wants that to run
3497  * 3) pick the "last" process, for cache locality
3498  * 4) do not run the "skip" process, if something else is available
3499  */
3500 static struct sched_entity *
3501 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3502 {
3503 	struct sched_entity *left = __pick_first_entity(cfs_rq);
3504 	struct sched_entity *se;
3505 
3506 	/*
3507 	 * If curr is set we have to see if it's left of the leftmost entity
3508 	 * still in the tree, provided there was anything in the tree at all.
3509 	 */
3510 	if (!left || (curr && entity_before(curr, left)))
3511 		left = curr;
3512 
3513 	se = left; /* ideally we run the leftmost entity */
3514 
3515 	/*
3516 	 * Avoid running the skip buddy, if running something else can
3517 	 * be done without getting too unfair.
3518 	 */
3519 	if (cfs_rq->skip == se) {
3520 		struct sched_entity *second;
3521 
3522 		if (se == curr) {
3523 			second = __pick_first_entity(cfs_rq);
3524 		} else {
3525 			second = __pick_next_entity(se);
3526 			if (!second || (curr && entity_before(curr, second)))
3527 				second = curr;
3528 		}
3529 
3530 		if (second && wakeup_preempt_entity(second, left) < 1)
3531 			se = second;
3532 	}
3533 
3534 	/*
3535 	 * Prefer last buddy, try to return the CPU to a preempted task.
3536 	 */
3537 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3538 		se = cfs_rq->last;
3539 
3540 	/*
3541 	 * Someone really wants this to run. If it's not unfair, run it.
3542 	 */
3543 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3544 		se = cfs_rq->next;
3545 
3546 	clear_buddies(cfs_rq, se);
3547 
3548 	return se;
3549 }
3550 
3551 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3552 
3553 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3554 {
3555 	/*
3556 	 * If still on the runqueue then deactivate_task()
3557 	 * was not called and update_curr() has to be done:
3558 	 */
3559 	if (prev->on_rq)
3560 		update_curr(cfs_rq);
3561 
3562 	/* throttle cfs_rqs exceeding runtime */
3563 	check_cfs_rq_runtime(cfs_rq);
3564 
3565 	if (schedstat_enabled()) {
3566 		check_spread(cfs_rq, prev);
3567 		if (prev->on_rq)
3568 			update_stats_wait_start(cfs_rq, prev);
3569 	}
3570 
3571 	if (prev->on_rq) {
3572 		/* Put 'current' back into the tree. */
3573 		__enqueue_entity(cfs_rq, prev);
3574 		/* in !on_rq case, update occurred at dequeue */
3575 		update_load_avg(prev, 0);
3576 	}
3577 	cfs_rq->curr = NULL;
3578 }
3579 
3580 static void
3581 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3582 {
3583 	/*
3584 	 * Update run-time statistics of the 'current'.
3585 	 */
3586 	update_curr(cfs_rq);
3587 
3588 	/*
3589 	 * Ensure that runnable average is periodically updated.
3590 	 */
3591 	update_load_avg(curr, 1);
3592 	update_cfs_shares(cfs_rq);
3593 
3594 #ifdef CONFIG_SCHED_HRTICK
3595 	/*
3596 	 * queued ticks are scheduled to match the slice, so don't bother
3597 	 * validating it and just reschedule.
3598 	 */
3599 	if (queued) {
3600 		resched_curr(rq_of(cfs_rq));
3601 		return;
3602 	}
3603 	/*
3604 	 * don't let the period tick interfere with the hrtick preemption
3605 	 */
3606 	if (!sched_feat(DOUBLE_TICK) &&
3607 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3608 		return;
3609 #endif
3610 
3611 	if (cfs_rq->nr_running > 1)
3612 		check_preempt_tick(cfs_rq, curr);
3613 }
3614 
3615 
3616 /**************************************************
3617  * CFS bandwidth control machinery
3618  */
3619 
3620 #ifdef CONFIG_CFS_BANDWIDTH
3621 
3622 #ifdef HAVE_JUMP_LABEL
3623 static struct static_key __cfs_bandwidth_used;
3624 
3625 static inline bool cfs_bandwidth_used(void)
3626 {
3627 	return static_key_false(&__cfs_bandwidth_used);
3628 }
3629 
3630 void cfs_bandwidth_usage_inc(void)
3631 {
3632 	static_key_slow_inc(&__cfs_bandwidth_used);
3633 }
3634 
3635 void cfs_bandwidth_usage_dec(void)
3636 {
3637 	static_key_slow_dec(&__cfs_bandwidth_used);
3638 }
3639 #else /* HAVE_JUMP_LABEL */
3640 static bool cfs_bandwidth_used(void)
3641 {
3642 	return true;
3643 }
3644 
3645 void cfs_bandwidth_usage_inc(void) {}
3646 void cfs_bandwidth_usage_dec(void) {}
3647 #endif /* HAVE_JUMP_LABEL */
3648 
3649 /*
3650  * default period for cfs group bandwidth.
3651  * default: 0.1s, units: nanoseconds
3652  */
3653 static inline u64 default_cfs_period(void)
3654 {
3655 	return 100000000ULL;
3656 }
3657 
3658 static inline u64 sched_cfs_bandwidth_slice(void)
3659 {
3660 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3661 }
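
/*
 * Worked example (editor's addition, illustrative values): with the 100ms
 * default period above, a group configured with a 25ms quota may consume at
 * most 25% of one CPU per period. Each per-cpu cfs_rq draws runtime from the
 * group's global pool one bandwidth slice at a time; assuming
 * sysctl_sched_cfs_bandwidth_slice is at its usual default of 5000us, that
 * is 5ms of runtime per refill.
 */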
3662 
3663 /*
3664  * Replenish runtime according to assigned quota and update expiration time.
3665  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3666  * additional synchronization around rq->lock.
3667  *
3668  * requires cfs_b->lock
3669  */
3670 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3671 {
3672 	u64 now;
3673 
3674 	if (cfs_b->quota == RUNTIME_INF)
3675 		return;
3676 
3677 	now = sched_clock_cpu(smp_processor_id());
3678 	cfs_b->runtime = cfs_b->quota;
3679 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3680 }
3681 
3682 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3683 {
3684 	return &tg->cfs_bandwidth;
3685 }
3686 
3687 /* rq_clock_task() normalized against any time this cfs_rq has spent throttled */
3688 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3689 {
3690 	if (unlikely(cfs_rq->throttle_count))
3691 		return cfs_rq->throttled_clock_task;
3692 
3693 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3694 }
3695 
3696 /* returns 0 on failure to allocate runtime */
3697 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3698 {
3699 	struct task_group *tg = cfs_rq->tg;
3700 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3701 	u64 amount = 0, min_amount, expires;
3702 
3703 	/* note: this is a positive sum as runtime_remaining <= 0 */
3704 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3705 
3706 	raw_spin_lock(&cfs_b->lock);
3707 	if (cfs_b->quota == RUNTIME_INF)
3708 		amount = min_amount;
3709 	else {
3710 		start_cfs_bandwidth(cfs_b);
3711 
3712 		if (cfs_b->runtime > 0) {
3713 			amount = min(cfs_b->runtime, min_amount);
3714 			cfs_b->runtime -= amount;
3715 			cfs_b->idle = 0;
3716 		}
3717 	}
3718 	expires = cfs_b->runtime_expires;
3719 	raw_spin_unlock(&cfs_b->lock);
3720 
3721 	cfs_rq->runtime_remaining += amount;
3722 	/*
3723 	 * we may have advanced our local expiration to account for allowed
3724 	 * spread between our sched_clock and the one on which runtime was
3725 	 * issued.
3726 	 */
3727 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3728 		cfs_rq->runtime_expires = expires;
3729 
3730 	return cfs_rq->runtime_remaining > 0;
3731 }
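
/*
 * Worked example (editor's addition): if this cfs_rq has overrun its local
 * runtime by 2ms (runtime_remaining == -2ms) and the slice is 5ms, then
 * min_amount = 5ms - (-2ms) = 7ms: enough to cover the deficit and still
 * leave a full slice of positive runtime, provided the global pool can
 * supply that much.
 */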
3732 
3733 /*
3734  * Note: This depends on the synchronization provided by sched_clock and the
3735  * fact that rq->clock snapshots this value.
3736  */
3737 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3738 {
3739 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3740 
3741 	/* if the deadline is ahead of our clock, nothing to do */
3742 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3743 		return;
3744 
3745 	if (cfs_rq->runtime_remaining < 0)
3746 		return;
3747 
3748 	/*
3749 	 * If the local deadline has passed we have to consider the
3750 	 * possibility that our sched_clock is 'fast' and the global deadline
3751 	 * has not truly expired.
3752 	 *
3753 	 * Fortunately we can determine whether this is the case by checking
3754 	 * whether the global deadline has advanced. It is valid to compare
3755 	 * cfs_b->runtime_expires without any locks since we only care about
3756 	 * exact equality, so a partial write will still work.
3757 	 */
3758 
3759 	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3760 		/* extend local deadline, drift is bounded above by 2 ticks */
3761 		cfs_rq->runtime_expires += TICK_NSEC;
3762 	} else {
3763 		/* global deadline is ahead, expiration has passed */
3764 		cfs_rq->runtime_remaining = 0;
3765 	}
3766 }
3767 
3768 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3769 {
3770 	/* dock delta_exec before expiring quota (as it could span periods) */
3771 	cfs_rq->runtime_remaining -= delta_exec;
3772 	expire_cfs_rq_runtime(cfs_rq);
3773 
3774 	if (likely(cfs_rq->runtime_remaining > 0))
3775 		return;
3776 
3777 	/*
3778 	 * if we're unable to extend our runtime we resched so that the active
3779 	 * hierarchy can be throttled
3780 	 */
3781 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3782 		resched_curr(rq_of(cfs_rq));
3783 }
3784 
3785 static __always_inline
3786 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3787 {
3788 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3789 		return;
3790 
3791 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3792 }
3793 
3794 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3795 {
3796 	return cfs_bandwidth_used() && cfs_rq->throttled;
3797 }
3798 
3799 /* check whether cfs_rq, or any parent, is throttled */
3800 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3801 {
3802 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3803 }
3804 
3805 /*
3806  * Ensure that neither of the group entities corresponding to src_cpu or
3807  * dest_cpu are members of a throttled hierarchy when performing group
3808  * load-balance operations.
3809  */
3810 static inline int throttled_lb_pair(struct task_group *tg,
3811 				    int src_cpu, int dest_cpu)
3812 {
3813 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3814 
3815 	src_cfs_rq = tg->cfs_rq[src_cpu];
3816 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3817 
3818 	return throttled_hierarchy(src_cfs_rq) ||
3819 	       throttled_hierarchy(dest_cfs_rq);
3820 }
3821 
3822 /* updated child weight may affect parent so we have to do this bottom up */
3823 static int tg_unthrottle_up(struct task_group *tg, void *data)
3824 {
3825 	struct rq *rq = data;
3826 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3827 
3828 	cfs_rq->throttle_count--;
3829 #ifdef CONFIG_SMP
3830 	if (!cfs_rq->throttle_count) {
3831 		/* adjust cfs_rq_clock_task() */
3832 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3833 					     cfs_rq->throttled_clock_task;
3834 	}
3835 #endif
3836 
3837 	return 0;
3838 }
3839 
3840 static int tg_throttle_down(struct task_group *tg, void *data)
3841 {
3842 	struct rq *rq = data;
3843 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3844 
3845 	/* group is entering throttled state, stop time */
3846 	if (!cfs_rq->throttle_count)
3847 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
3848 	cfs_rq->throttle_count++;
3849 
3850 	return 0;
3851 }
3852 
3853 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3854 {
3855 	struct rq *rq = rq_of(cfs_rq);
3856 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3857 	struct sched_entity *se;
3858 	long task_delta, dequeue = 1;
3859 	bool empty;
3860 
3861 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3862 
3863 	/* freeze hierarchy runnable averages while throttled */
3864 	rcu_read_lock();
3865 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3866 	rcu_read_unlock();
3867 
3868 	task_delta = cfs_rq->h_nr_running;
3869 	for_each_sched_entity(se) {
3870 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3871 		/* throttled entity or throttle-on-deactivate */
3872 		if (!se->on_rq)
3873 			break;
3874 
3875 		if (dequeue)
3876 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3877 		qcfs_rq->h_nr_running -= task_delta;
3878 
3879 		if (qcfs_rq->load.weight)
3880 			dequeue = 0;
3881 	}
3882 
3883 	if (!se)
3884 		sub_nr_running(rq, task_delta);
3885 
3886 	cfs_rq->throttled = 1;
3887 	cfs_rq->throttled_clock = rq_clock(rq);
3888 	raw_spin_lock(&cfs_b->lock);
3889 	empty = list_empty(&cfs_b->throttled_cfs_rq);
3890 
3891 	/*
3892 	 * Add to the _head_ of the list, so that an already-started
3893 	 * distribute_cfs_runtime will not see us
3894 	 */
3895 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3896 
3897 	/*
3898 	 * If we're the first throttled cfs_rq, make sure the bandwidth
3899 	 * timer is running.
3900 	 */
3901 	if (empty)
3902 		start_cfs_bandwidth(cfs_b);
3903 
3904 	raw_spin_unlock(&cfs_b->lock);
3905 }
3906 
3907 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3908 {
3909 	struct rq *rq = rq_of(cfs_rq);
3910 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3911 	struct sched_entity *se;
3912 	int enqueue = 1;
3913 	long task_delta;
3914 
3915 	se = cfs_rq->tg->se[cpu_of(rq)];
3916 
3917 	cfs_rq->throttled = 0;
3918 
3919 	update_rq_clock(rq);
3920 
3921 	raw_spin_lock(&cfs_b->lock);
3922 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3923 	list_del_rcu(&cfs_rq->throttled_list);
3924 	raw_spin_unlock(&cfs_b->lock);
3925 
3926 	/* update hierarchical throttle state */
3927 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3928 
3929 	if (!cfs_rq->load.weight)
3930 		return;
3931 
3932 	task_delta = cfs_rq->h_nr_running;
3933 	for_each_sched_entity(se) {
3934 		if (se->on_rq)
3935 			enqueue = 0;
3936 
3937 		cfs_rq = cfs_rq_of(se);
3938 		if (enqueue)
3939 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3940 		cfs_rq->h_nr_running += task_delta;
3941 
3942 		if (cfs_rq_throttled(cfs_rq))
3943 			break;
3944 	}
3945 
3946 	if (!se)
3947 		add_nr_running(rq, task_delta);
3948 
3949 	/* determine whether we need to wake up potentially idle cpu */
3950 	if (rq->curr == rq->idle && rq->cfs.nr_running)
3951 		resched_curr(rq);
3952 }
3953 
3954 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3955 		u64 remaining, u64 expires)
3956 {
3957 	struct cfs_rq *cfs_rq;
3958 	u64 runtime;
3959 	u64 starting_runtime = remaining;
3960 
3961 	rcu_read_lock();
3962 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3963 				throttled_list) {
3964 		struct rq *rq = rq_of(cfs_rq);
3965 
3966 		raw_spin_lock(&rq->lock);
3967 		if (!cfs_rq_throttled(cfs_rq))
3968 			goto next;
3969 
3970 		runtime = -cfs_rq->runtime_remaining + 1;
3971 		if (runtime > remaining)
3972 			runtime = remaining;
3973 		remaining -= runtime;
3974 
3975 		cfs_rq->runtime_remaining += runtime;
3976 		cfs_rq->runtime_expires = expires;
3977 
3978 		/* we check whether we're throttled above */
3979 		if (cfs_rq->runtime_remaining > 0)
3980 			unthrottle_cfs_rq(cfs_rq);
3981 
3982 next:
3983 		raw_spin_unlock(&rq->lock);
3984 
3985 		if (!remaining)
3986 			break;
3987 	}
3988 	rcu_read_unlock();
3989 
3990 	return starting_runtime - remaining;
3991 }
3992 
3993 /*
3994  * Responsible for refilling a task_group's bandwidth and unthrottling its
3995  * cfs_rqs as appropriate. If there has been no activity within the last
3996  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3997  * used to track this state.
3998  */
3999 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4000 {
4001 	u64 runtime, runtime_expires;
4002 	int throttled;
4003 
4004 	/* no need to continue the timer with no bandwidth constraint */
4005 	if (cfs_b->quota == RUNTIME_INF)
4006 		goto out_deactivate;
4007 
4008 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4009 	cfs_b->nr_periods += overrun;
4010 
4011 	/*
4012 	 * idle depends on !throttled (for the case of a large deficit), and if
4013 	 * we're going inactive then everything else can be deferred
4014 	 */
4015 	if (cfs_b->idle && !throttled)
4016 		goto out_deactivate;
4017 
4018 	__refill_cfs_bandwidth_runtime(cfs_b);
4019 
4020 	if (!throttled) {
4021 		/* mark as potentially idle for the upcoming period */
4022 		cfs_b->idle = 1;
4023 		return 0;
4024 	}
4025 
4026 	/* account preceding periods in which throttling occurred */
4027 	cfs_b->nr_throttled += overrun;
4028 
4029 	runtime_expires = cfs_b->runtime_expires;
4030 
4031 	/*
4032 	 * This check is repeated as we are holding onto the new bandwidth while
4033 	 * we unthrottle. This can potentially race with an unthrottled group
4034 	 * trying to acquire new bandwidth from the global pool. This can result
4035 	 * in us over-using our runtime if it is all used during this loop, but
4036 	 * only by limited amounts in that extreme case.
4037 	 */
4038 	while (throttled && cfs_b->runtime > 0) {
4039 		runtime = cfs_b->runtime;
4040 		raw_spin_unlock(&cfs_b->lock);
4041 		/* we can't nest cfs_b->lock while distributing bandwidth */
4042 		runtime = distribute_cfs_runtime(cfs_b, runtime,
4043 						 runtime_expires);
4044 		raw_spin_lock(&cfs_b->lock);
4045 
4046 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4047 
4048 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
4049 	}
4050 
4051 	/*
4052 	 * While we are ensured activity in the period following an
4053 	 * unthrottle, this also covers the case in which the new bandwidth is
4054 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
4055 	 * timer to remain active while there are any throttled entities.)
4056 	 */
4057 	cfs_b->idle = 0;
4058 
4059 	return 0;
4060 
4061 out_deactivate:
4062 	return 1;
4063 }
4064 
4065 /* a cfs_rq won't donate quota below this amount */
4066 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4067 /* minimum remaining period time to redistribute slack quota */
4068 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4069 /* how long we wait to gather additional slack before distributing */
4070 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4071 
4072 /*
4073  * Are we near the end of the current quota period?
4074  *
4075  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4076  * hrtimer base being cleared by hrtimer_start. In the case of
4077  * migrate_hrtimers, base is never cleared, so we are fine.
4078  */
4079 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4080 {
4081 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
4082 	u64 remaining;
4083 
4084 	/* if the call-back is running a quota refresh is already occurring */
4085 	if (hrtimer_callback_running(refresh_timer))
4086 		return 1;
4087 
4088 	/* is a quota refresh about to occur? */
4089 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4090 	if (remaining < min_expire)
4091 		return 1;
4092 
4093 	return 0;
4094 }
4095 
4096 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4097 {
4098 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4099 
4100 	/* if there's a quota refresh soon don't bother with slack */
4101 	if (runtime_refresh_within(cfs_b, min_left))
4102 		return;
4103 
4104 	hrtimer_start(&cfs_b->slack_timer,
4105 			ns_to_ktime(cfs_bandwidth_slack_period),
4106 			HRTIMER_MODE_REL);
4107 }
4108 
4109 /* we know any runtime found here is valid as update_curr() precedes return */
4110 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4111 {
4112 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4113 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4114 
4115 	if (slack_runtime <= 0)
4116 		return;
4117 
4118 	raw_spin_lock(&cfs_b->lock);
4119 	if (cfs_b->quota != RUNTIME_INF &&
4120 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4121 		cfs_b->runtime += slack_runtime;
4122 
4123 		/* we are under rq->lock, defer unthrottling using a timer */
4124 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4125 		    !list_empty(&cfs_b->throttled_cfs_rq))
4126 			start_cfs_slack_bandwidth(cfs_b);
4127 	}
4128 	raw_spin_unlock(&cfs_b->lock);
4129 
4130 	/* even if it's not valid for return we don't want to try again */
4131 	cfs_rq->runtime_remaining -= slack_runtime;
4132 }
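
/*
 * Worked example (editor's addition): a cfs_rq that goes idle holding 4ms of
 * local runtime keeps min_cfs_rq_runtime (1ms) and returns the remaining 3ms
 * to the global pool above. If the pool then exceeds one bandwidth slice and
 * other cfs_rqs are throttled, the slack timer is armed for
 * cfs_bandwidth_slack_period (5ms), unless a regular quota refresh is due
 * within cfs_bandwidth_slack_period + min_bandwidth_expiration (7ms).
 */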
4133 
4134 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4135 {
4136 	if (!cfs_bandwidth_used())
4137 		return;
4138 
4139 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4140 		return;
4141 
4142 	__return_cfs_rq_runtime(cfs_rq);
4143 }
4144 
4145 /*
4146  * This is done with a timer (instead of inline with bandwidth return) since
4147  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4148  */
4149 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4150 {
4151 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4152 	u64 expires;
4153 
4154 	/* confirm we're still not at a refresh boundary */
4155 	raw_spin_lock(&cfs_b->lock);
4156 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4157 		raw_spin_unlock(&cfs_b->lock);
4158 		return;
4159 	}
4160 
4161 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4162 		runtime = cfs_b->runtime;
4163 
4164 	expires = cfs_b->runtime_expires;
4165 	raw_spin_unlock(&cfs_b->lock);
4166 
4167 	if (!runtime)
4168 		return;
4169 
4170 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4171 
4172 	raw_spin_lock(&cfs_b->lock);
4173 	if (expires == cfs_b->runtime_expires)
4174 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
4175 	raw_spin_unlock(&cfs_b->lock);
4176 }
4177 
4178 /*
4179  * When a group wakes up we want to make sure that its quota is not already
4180  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4181  * runtime as update_curr() throttling cannot trigger until it's on-rq.
4182  */
4183 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4184 {
4185 	if (!cfs_bandwidth_used())
4186 		return;
4187 
4188 	/* an active group must be handled by the update_curr()->put() path */
4189 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4190 		return;
4191 
4192 	/* ensure the group is not already throttled */
4193 	if (cfs_rq_throttled(cfs_rq))
4194 		return;
4195 
4196 	/* update runtime allocation */
4197 	account_cfs_rq_runtime(cfs_rq, 0);
4198 	if (cfs_rq->runtime_remaining <= 0)
4199 		throttle_cfs_rq(cfs_rq);
4200 }
4201 
4202 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4203 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4204 {
4205 	if (!cfs_bandwidth_used())
4206 		return false;
4207 
4208 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4209 		return false;
4210 
4211 	/*
4212 	 * it's possible for a throttled entity to be forced into a running
4213 	 * state (e.g. set_curr_task); in this case we're finished.
4214 	 */
4215 	if (cfs_rq_throttled(cfs_rq))
4216 		return true;
4217 
4218 	throttle_cfs_rq(cfs_rq);
4219 	return true;
4220 }
4221 
4222 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4223 {
4224 	struct cfs_bandwidth *cfs_b =
4225 		container_of(timer, struct cfs_bandwidth, slack_timer);
4226 
4227 	do_sched_cfs_slack_timer(cfs_b);
4228 
4229 	return HRTIMER_NORESTART;
4230 }
4231 
4232 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4233 {
4234 	struct cfs_bandwidth *cfs_b =
4235 		container_of(timer, struct cfs_bandwidth, period_timer);
4236 	int overrun;
4237 	int idle = 0;
4238 
4239 	raw_spin_lock(&cfs_b->lock);
4240 	for (;;) {
4241 		overrun = hrtimer_forward_now(timer, cfs_b->period);
4242 		if (!overrun)
4243 			break;
4244 
4245 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
4246 	}
4247 	if (idle)
4248 		cfs_b->period_active = 0;
4249 	raw_spin_unlock(&cfs_b->lock);
4250 
4251 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4252 }
4253 
4254 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4255 {
4256 	raw_spin_lock_init(&cfs_b->lock);
4257 	cfs_b->runtime = 0;
4258 	cfs_b->quota = RUNTIME_INF;
4259 	cfs_b->period = ns_to_ktime(default_cfs_period());
4260 
4261 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4262 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4263 	cfs_b->period_timer.function = sched_cfs_period_timer;
4264 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4265 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
4266 }
4267 
4268 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4269 {
4270 	cfs_rq->runtime_enabled = 0;
4271 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
4272 }
4273 
4274 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4275 {
4276 	lockdep_assert_held(&cfs_b->lock);
4277 
4278 	if (!cfs_b->period_active) {
4279 		cfs_b->period_active = 1;
4280 		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4281 		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4282 	}
4283 }
4284 
4285 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4286 {
4287 	/* init_cfs_bandwidth() was not called */
4288 	if (!cfs_b->throttled_cfs_rq.next)
4289 		return;
4290 
4291 	hrtimer_cancel(&cfs_b->period_timer);
4292 	hrtimer_cancel(&cfs_b->slack_timer);
4293 }
4294 
4295 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4296 {
4297 	struct cfs_rq *cfs_rq;
4298 
4299 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4300 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4301 
4302 		raw_spin_lock(&cfs_b->lock);
4303 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4304 		raw_spin_unlock(&cfs_b->lock);
4305 	}
4306 }
4307 
4308 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4309 {
4310 	struct cfs_rq *cfs_rq;
4311 
4312 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4313 		if (!cfs_rq->runtime_enabled)
4314 			continue;
4315 
4316 		/*
4317 		 * clock_task is not advancing so we just need to make sure
4318 		 * there's some valid quota amount
4319 		 */
4320 		cfs_rq->runtime_remaining = 1;
4321 		/*
4322 		 * Offline rq is schedulable till cpu is completely disabled
4323 		 * in take_cpu_down(), so we prevent new cfs throttling here.
4324 		 */
4325 		cfs_rq->runtime_enabled = 0;
4326 
4327 		if (cfs_rq_throttled(cfs_rq))
4328 			unthrottle_cfs_rq(cfs_rq);
4329 	}
4330 }
4331 
4332 #else /* CONFIG_CFS_BANDWIDTH */
4333 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4334 {
4335 	return rq_clock_task(rq_of(cfs_rq));
4336 }
4337 
4338 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4339 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4340 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4341 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4342 
4343 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4344 {
4345 	return 0;
4346 }
4347 
4348 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4349 {
4350 	return 0;
4351 }
4352 
4353 static inline int throttled_lb_pair(struct task_group *tg,
4354 				    int src_cpu, int dest_cpu)
4355 {
4356 	return 0;
4357 }
4358 
4359 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4360 
4361 #ifdef CONFIG_FAIR_GROUP_SCHED
4362 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4363 #endif
4364 
4365 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4366 {
4367 	return NULL;
4368 }
4369 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4370 static inline void update_runtime_enabled(struct rq *rq) {}
4371 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4372 
4373 #endif /* CONFIG_CFS_BANDWIDTH */
4374 
4375 /**************************************************
4376  * CFS operations on tasks:
4377  */
4378 
4379 #ifdef CONFIG_SCHED_HRTICK
4380 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4381 {
4382 	struct sched_entity *se = &p->se;
4383 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4384 
4385 	WARN_ON(task_rq(p) != rq);
4386 
4387 	if (cfs_rq->nr_running > 1) {
4388 		u64 slice = sched_slice(cfs_rq, se);
4389 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4390 		s64 delta = slice - ran;
4391 
4392 		if (delta < 0) {
4393 			if (rq->curr == p)
4394 				resched_curr(rq);
4395 			return;
4396 		}
4397 		hrtick_start(rq, delta);
4398 	}
4399 }
4400 
4401 /*
4402  * called from enqueue/dequeue and updates the hrtick when the
4403  * current task is from our class and nr_running is low enough
4404  * to matter.
4405  */
4406 static void hrtick_update(struct rq *rq)
4407 {
4408 	struct task_struct *curr = rq->curr;
4409 
4410 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4411 		return;
4412 
4413 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4414 		hrtick_start_fair(rq, curr);
4415 }
4416 #else /* !CONFIG_SCHED_HRTICK */
4417 static inline void
4418 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4419 {
4420 }
4421 
4422 static inline void hrtick_update(struct rq *rq)
4423 {
4424 }
4425 #endif
4426 
4427 /*
4428  * The enqueue_task method is called before nr_running is
4429  * increased. Here we update the fair scheduling stats and
4430  * then put the task into the rbtree:
4431  */
4432 static void
4433 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4434 {
4435 	struct cfs_rq *cfs_rq;
4436 	struct sched_entity *se = &p->se;
4437 
4438 	for_each_sched_entity(se) {
4439 		if (se->on_rq)
4440 			break;
4441 		cfs_rq = cfs_rq_of(se);
4442 		enqueue_entity(cfs_rq, se, flags);
4443 
4444 		/*
4445 		 * end evaluation on encountering a throttled cfs_rq
4446 		 *
4447 		 * note: in the case of encountering a throttled cfs_rq we will
4448 		 * post the final h_nr_running increment below.
4449 		 */
4450 		if (cfs_rq_throttled(cfs_rq))
4451 			break;
4452 		cfs_rq->h_nr_running++;
4453 
4454 		flags = ENQUEUE_WAKEUP;
4455 	}
4456 
4457 	for_each_sched_entity(se) {
4458 		cfs_rq = cfs_rq_of(se);
4459 		cfs_rq->h_nr_running++;
4460 
4461 		if (cfs_rq_throttled(cfs_rq))
4462 			break;
4463 
4464 		update_load_avg(se, 1);
4465 		update_cfs_shares(cfs_rq);
4466 	}
4467 
4468 	if (!se)
4469 		add_nr_running(rq, 1);
4470 
4471 	hrtick_update(rq);
4472 }
4473 
4474 static void set_next_buddy(struct sched_entity *se);
4475 
4476 /*
4477  * The dequeue_task method is called before nr_running is
4478  * decreased. We remove the task from the rbtree and
4479  * update the fair scheduling stats:
4480  */
4481 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4482 {
4483 	struct cfs_rq *cfs_rq;
4484 	struct sched_entity *se = &p->se;
4485 	int task_sleep = flags & DEQUEUE_SLEEP;
4486 
4487 	for_each_sched_entity(se) {
4488 		cfs_rq = cfs_rq_of(se);
4489 		dequeue_entity(cfs_rq, se, flags);
4490 
4491 		/*
4492 		 * end evaluation on encountering a throttled cfs_rq
4493 		 *
4494 		 * note: in the case of encountering a throttled cfs_rq we will
4495 		 * post the final h_nr_running decrement below.
4496 		 */
4497 		if (cfs_rq_throttled(cfs_rq))
4498 			break;
4499 		cfs_rq->h_nr_running--;
4500 
4501 		/* Don't dequeue parent if it has other entities besides us */
4502 		if (cfs_rq->load.weight) {
4503 			/*
4504 			 * Bias pick_next to pick a task from this cfs_rq, as
4505 			 * p is sleeping when it is within its sched_slice.
4506 			 */
4507 			if (task_sleep && parent_entity(se))
4508 				set_next_buddy(parent_entity(se));
4509 
4510 			/* avoid re-evaluating load for this entity */
4511 			se = parent_entity(se);
4512 			break;
4513 		}
4514 		flags |= DEQUEUE_SLEEP;
4515 	}
4516 
4517 	for_each_sched_entity(se) {
4518 		cfs_rq = cfs_rq_of(se);
4519 		cfs_rq->h_nr_running--;
4520 
4521 		if (cfs_rq_throttled(cfs_rq))
4522 			break;
4523 
4524 		update_load_avg(se, 1);
4525 		update_cfs_shares(cfs_rq);
4526 	}
4527 
4528 	if (!se)
4529 		sub_nr_running(rq, 1);
4530 
4531 	hrtick_update(rq);
4532 }
4533 
4534 #ifdef CONFIG_SMP
4535 #ifdef CONFIG_NO_HZ_COMMON
4536 /*
4537  * per rq 'load' array crap; XXX kill this.
4538  */
4539 
4540 /*
4541  * The exact cpuload calculated at every tick would be:
4542  *
4543  *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
4544  *
4545  * If a cpu misses updates for n ticks (as it was idle) and update gets
4546  * called on the n+1-th tick when cpu may be busy, then we have:
4547  *
4548  *   load_n   = (1 - 1/2^i)^n * load_0
4549  *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
4550  *
4551  * decay_load_missed() below does efficient calculation of
4552  *
4553  *   load' = (1 - 1/2^i)^n * load
4554  *
4555  * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
4556  * This allows us to precompute the above in said factors, thereby allowing the
4557  * reduction of an arbitrary n in O(log_2 n) steps. (See also
4558  * fixed_power_int())
4559  *
4560  * The calculation is approximated on a 128 point scale.
4561  */
4562 #define DEGRADE_SHIFT		7
4563 
4564 static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4565 static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4566 	{   0,   0,  0,  0,  0,  0, 0, 0 },
4567 	{  64,  32,  8,  0,  0,  0, 0, 0 },
4568 	{  96,  72, 40, 12,  1,  0, 0, 0 },
4569 	{ 112,  98, 75, 43, 15,  1, 0, 0 },
4570 	{ 120, 112, 98, 76, 45, 16, 2, 0 }
4571 };
4572 
4573 /*
4574  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4575  * would be when CPU is idle and so we just decay the old load without
4576  * adding any new load.
4577  */
4578 static unsigned long
4579 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4580 {
4581 	int j = 0;
4582 
4583 	if (!missed_updates)
4584 		return load;
4585 
4586 	if (missed_updates >= degrade_zero_ticks[idx])
4587 		return 0;
4588 
4589 	if (idx == 1)
4590 		return load >> missed_updates;
4591 
4592 	while (missed_updates) {
4593 		if (missed_updates % 2)
4594 			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4595 
4596 		missed_updates >>= 1;
4597 		j++;
4598 	}
4599 	return load;
4600 }
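
/*
 * Worked example (editor's addition): for idx = 2 the per-tick decay factor
 * is (1 - 1/2^2) = 3/4, and degrade_factor[2] holds 128 * (3/4)^(2^j), i.e.
 * {96, 72, 40, 12, 1, ...}. A CPU that missed 3 updates therefore computes
 *
 *   load' = (((load * 96) >> 7) * 72) >> 7  ~=  load * (3/4)^3 = load * 27/64
 *
 * in two multiply/shift steps rather than three, using the binary
 * decomposition 3 = 2^0 + 2^1.
 */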
4601 #endif /* CONFIG_NO_HZ_COMMON */
4602 
4603 /**
4604  * __cpu_load_update - update the rq->cpu_load[] statistics
4605  * @this_rq: The rq to update statistics for
4606  * @this_load: The current load
4607  * @pending_updates: The number of missed updates
4608  *
4609  * Update rq->cpu_load[] statistics. This function is usually called every
4610  * scheduler tick (TICK_NSEC).
4611  *
4612  * This function computes a decaying average:
4613  *
4614  *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
4615  *
4616  * Because of NOHZ it might not get called on every tick, which is why we need
4617  * the @pending_updates argument.
4618  *
4619  *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
4620  *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
4621  *             = A * (A * load[i]_n-2 + B) + B
4622  *             = A * (A * (A * load[i]_n-3 + B) + B) + B
4623  *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
4624  *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
4625  *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
4626  *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
4627  *
4628  * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
4629  * any change in load would have resulted in the tick being turned back on.
4630  *
4631  * For regular NOHZ, this reduces to:
4632  *
4633  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
4634  *
4635  * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
4636  * term.
4637  */
4638 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
4639 			    unsigned long pending_updates)
4640 {
4641 	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
4642 	int i, scale;
4643 
4644 	this_rq->nr_load_updates++;
4645 
4646 	/* Update our load: */
4647 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4648 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4649 		unsigned long old_load, new_load;
4650 
4651 		/* scale is effectively 1 << i now, and >> i divides by scale */
4652 
4653 		old_load = this_rq->cpu_load[i];
4654 #ifdef CONFIG_NO_HZ_COMMON
4655 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
4656 		if (tickless_load) {
4657 			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
4658 			/*
4659 			 * old_load can never be a negative value because a
4660 			 * decayed tickless_load cannot be greater than the
4661 			 * original tickless_load.
4662 			 */
4663 			old_load += tickless_load;
4664 		}
4665 #endif
4666 		new_load = this_load;
4667 		/*
4668 		 * Round up the averaging division if load is increasing. This
4669 		 * prevents us from getting stuck on 9 if the load is 10, for
4670 		 * example.
4671 		 */
4672 		if (new_load > old_load)
4673 			new_load += scale - 1;
4674 
4675 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4676 	}
4677 
4678 	sched_avg_update(this_rq);
4679 }
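
/*
 * Worked example (editor's addition): for i = 1 (scale = 2) the update is
 * load[1]' = (load[1] + new_load) / 2. With old cpu_load[1] = 128 and a
 * current load of 0 this yields 64, halving the history every tick. The
 * round-up above matters when load rises: with old_load = 9 and
 * this_load = 10, new_load becomes 10 + (2 - 1) = 11 and (9 + 11) >> 1 = 10,
 * so the average reaches the new value instead of sticking at 9.
 */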
4680 
4681 /* Used instead of source_load when we know the type == 0 */
4682 static unsigned long weighted_cpuload(const int cpu)
4683 {
4684 	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4685 }
4686 
4687 #ifdef CONFIG_NO_HZ_COMMON
4688 /*
4689  * There is no sane way to deal with nohz on smp when using jiffies because the
4690  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4691  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4692  *
4693  * Therefore we need to avoid the delta approach from the regular tick when
4694  * possible since that would seriously skew the load calculation. This is why we
4695  * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
4696  * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
4697  * loop exit, nohz_idle_balance, nohz full exit...)
4698  *
4699  * This means we might still be one tick off for nohz periods.
4700  */
4701 
4702 static void cpu_load_update_nohz(struct rq *this_rq,
4703 				 unsigned long curr_jiffies,
4704 				 unsigned long load)
4705 {
4706 	unsigned long pending_updates;
4707 
4708 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4709 	if (pending_updates) {
4710 		this_rq->last_load_update_tick = curr_jiffies;
4711 		/*
4712 		 * In the regular NOHZ case, we were idle, this means load 0.
4713 		 * In the NOHZ_FULL case, we were non-idle, we should consider
4714 		 * its weighted load.
4715 		 */
4716 		cpu_load_update(this_rq, load, pending_updates);
4717 	}
4718 }
4719 
4720 /*
4721  * Called from nohz_idle_balance() to update the load ratings before doing the
4722  * idle balance.
4723  */
4724 static void cpu_load_update_idle(struct rq *this_rq)
4725 {
4726 	/*
4727 	 * bail if there's load or we're actually up-to-date.
4728 	 */
4729 	if (weighted_cpuload(cpu_of(this_rq)))
4730 		return;
4731 
4732 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
4733 }
4734 
4735 /*
4736  * Record CPU load on nohz entry so we know the tickless load to account
4737  * on nohz exit. cpu_load[0] happens then to be updated more frequently
4738  * than other cpu_load[idx] but it should be fine as cpu_load readers
4739  * shouldn't rely on synchronized cpu_load[*] updates.
4740  */
4741 void cpu_load_update_nohz_start(void)
4742 {
4743 	struct rq *this_rq = this_rq();
4744 
4745 	/*
4746 	 * This is all lockless but should be fine. If weighted_cpuload changes
4747 	 * concurrently we'll exit nohz. And the cpu_load write can race with
4748 	 * cpu_load_update_idle(), but both updaters would be writing the same value.
4749 	 */
4750 	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
4751 }
4752 
4753 /*
4754  * Account the tickless load in the end of a nohz frame.
4755  */
4756 void cpu_load_update_nohz_stop(void)
4757 {
4758 	unsigned long curr_jiffies = READ_ONCE(jiffies);
4759 	struct rq *this_rq = this_rq();
4760 	unsigned long load;
4761 
4762 	if (curr_jiffies == this_rq->last_load_update_tick)
4763 		return;
4764 
4765 	load = weighted_cpuload(cpu_of(this_rq));
4766 	raw_spin_lock(&this_rq->lock);
4767 	update_rq_clock(this_rq);
4768 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
4769 	raw_spin_unlock(&this_rq->lock);
4770 }
4771 #else /* !CONFIG_NO_HZ_COMMON */
4772 static inline void cpu_load_update_nohz(struct rq *this_rq,
4773 					unsigned long curr_jiffies,
4774 					unsigned long load) { }
4775 #endif /* CONFIG_NO_HZ_COMMON */
4776 
4777 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
4778 {
4779 #ifdef CONFIG_NO_HZ_COMMON
4780 	/* See the mess around cpu_load_update_nohz(). */
4781 	this_rq->last_load_update_tick = READ_ONCE(jiffies);
4782 #endif
4783 	cpu_load_update(this_rq, load, 1);
4784 }
4785 
4786 /*
4787  * Called from scheduler_tick()
4788  */
4789 void cpu_load_update_active(struct rq *this_rq)
4790 {
4791 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
4792 
4793 	if (tick_nohz_tick_stopped())
4794 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
4795 	else
4796 		cpu_load_update_periodic(this_rq, load);
4797 }
4798 
4799 /*
4800  * Return a low guess at the load of a migration-source cpu weighted
4801  * according to the scheduling class and "nice" value.
4802  *
4803  * We want to under-estimate the load of migration sources, to
4804  * balance conservatively.
4805  */
4806 static unsigned long source_load(int cpu, int type)
4807 {
4808 	struct rq *rq = cpu_rq(cpu);
4809 	unsigned long total = weighted_cpuload(cpu);
4810 
4811 	if (type == 0 || !sched_feat(LB_BIAS))
4812 		return total;
4813 
4814 	return min(rq->cpu_load[type-1], total);
4815 }
4816 
4817 /*
4818  * Return a high guess at the load of a migration-target cpu weighted
4819  * according to the scheduling class and "nice" value.
4820  */
4821 static unsigned long target_load(int cpu, int type)
4822 {
4823 	struct rq *rq = cpu_rq(cpu);
4824 	unsigned long total = weighted_cpuload(cpu);
4825 
4826 	if (type == 0 || !sched_feat(LB_BIAS))
4827 		return total;
4828 
4829 	return max(rq->cpu_load[type-1], total);
4830 }
4831 
4832 static unsigned long capacity_of(int cpu)
4833 {
4834 	return cpu_rq(cpu)->cpu_capacity;
4835 }
4836 
4837 static unsigned long capacity_orig_of(int cpu)
4838 {
4839 	return cpu_rq(cpu)->cpu_capacity_orig;
4840 }
4841 
4842 static unsigned long cpu_avg_load_per_task(int cpu)
4843 {
4844 	struct rq *rq = cpu_rq(cpu);
4845 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4846 	unsigned long load_avg = weighted_cpuload(cpu);
4847 
4848 	if (nr_running)
4849 		return load_avg / nr_running;
4850 
4851 	return 0;
4852 }
4853 
4854 #ifdef CONFIG_FAIR_GROUP_SCHED
4855 /*
4856  * effective_load() calculates the load change as seen from the root_task_group
4857  *
4858  * Adding load to a group doesn't make a group heavier, but can cause movement
4859  * of group shares between cpus. Assuming the shares were perfectly aligned one
4860  * can calculate the shift in shares.
4861  *
4862  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4863  * on this @cpu and results in a total addition (subtraction) of @wg to the
4864  * total group weight.
4865  *
4866  * Given a runqueue weight distribution (rw_i) we can compute a shares
4867  * distribution (s_i) using:
4868  *
4869  *   s_i = rw_i / \Sum rw_j						(1)
4870  *
4871  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4872  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4873  * shares distribution (s_i):
4874  *
4875  *   rw_i = {   2,   4,   1,   0 }
4876  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4877  *
4878  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4879  * task used to run on and the CPU the waker is running on), we need to
4880  * compute the effect of waking a task on either CPU and, in case of a sync
4881  * wakeup, compute the effect of the current task going to sleep.
4882  *
4883  * So for a change of @wl to the local @cpu with an overall group weight change
4884  * of @wg we can compute the new shares distribution (s'_i) using:
4885  *
4886  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
4887  *
4888  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4889  * differences in waking a task to CPU 0. The additional task changes the
4890  * weight and shares distributions like:
4891  *
4892  *   rw'_i = {   3,   4,   1,   0 }
4893  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4894  *
4895  * We can then compute the difference in effective weight by using:
4896  *
4897  *   dw_i = S * (s'_i - s_i)						(3)
4898  *
4899  * Where 'S' is the group weight as seen by its parent.
4900  *
4901  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4902  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4903  * 4/7) times the weight of the group.
4904  */
4905 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4906 {
4907 	struct sched_entity *se = tg->se[cpu];
4908 
4909 	if (!tg->parent)	/* the trivial, non-cgroup case */
4910 		return wl;
4911 
4912 	for_each_sched_entity(se) {
4913 		long w, W;
4914 
4915 		tg = se->my_q->tg;
4916 
4917 		/*
4918 		 * W = @wg + \Sum rw_j
4919 		 */
4920 		W = wg + calc_tg_weight(tg, se->my_q);
4921 
4922 		/*
4923 		 * w = rw_i + @wl
4924 		 */
4925 		w = cfs_rq_load_avg(se->my_q) + wl;
4926 
4927 		/*
4928 		 * wl = S * s'_i; see (2)
4929 		 */
4930 		if (W > 0 && w < W)
4931 			wl = (w * (long)tg->shares) / W;
4932 		else
4933 			wl = tg->shares;
4934 
4935 		/*
4936 		 * Per the above, wl is the new se->load.weight value; since
4937 		 * those are clipped to [MIN_SHARES, ...) do so now. See
4938 		 * calc_cfs_shares().
4939 		 */
4940 		if (wl < MIN_SHARES)
4941 			wl = MIN_SHARES;
4942 
4943 		/*
4944 		 * wl = dw_i = S * (s'_i - s_i); see (3)
4945 		 */
4946 		wl -= se->avg.load_avg;
4947 
4948 		/*
4949 		 * Recursively apply this logic to all parent groups to compute
4950 		 * the final effective load change on the root group. Since
4951 		 * only the @tg group gets extra weight, all parent groups can
4952 		 * only redistribute existing shares. @wl is the shift in shares
4953 		 * resulting from this level per the above.
4954 		 */
4955 		wg = 0;
4956 	}
4957 
4958 	return wl;
4959 }
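
/*
 * Worked example (editor's addition): continuing the rw_i = {2, 4, 1, 0}
 * scenario from the comment above with an assumed parent-visible group
 * weight of S = 1024, waking one more task on CPU 0 moves its share from
 * 2/7 to 3/8, so dw_0 = 1024 * (3/8 - 2/7) = 1024 * 5/56 ~= +91, while
 * CPU 1 sees dw_1 = 1024 * (4/8 - 4/7) = -1024 * 4/56 ~= -73 even though
 * nothing changed there directly.
 */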
4960 #else
4961 
4962 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4963 {
4964 	return wl;
4965 }
4966 
4967 #endif
4968 
4969 static void record_wakee(struct task_struct *p)
4970 {
4971 	/*
4972 	 * Only decay a single time; tasks that have less than 1 wakeup per
4973 	 * jiffy will not have built up many flips.
4974 	 */
4975 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4976 		current->wakee_flips >>= 1;
4977 		current->wakee_flip_decay_ts = jiffies;
4978 	}
4979 
4980 	if (current->last_wakee != p) {
4981 		current->last_wakee = p;
4982 		current->wakee_flips++;
4983 	}
4984 }
4985 
4986 /*
4987  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
4988  *
4989  * A waker of many should wake a different task than the one last awakened
4990  * at a frequency roughly N times higher than one of its wakees.
4991  *
4992  * In order to determine whether we should let the load spread vs consolidating
4993  * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
4994  * partner, and a factor of llc_size higher frequency in the other.
4995  *
4996  * With both conditions met, we can be relatively sure that the relationship is
4997  * non-monogamous, with partner count exceeding socket size.
4998  *
4999  * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5000  * whatever is irrelevant; the spread criterion is that the apparent partner
5001  * count exceeds the socket size.
5002  */
5003 static int wake_wide(struct task_struct *p)
5004 {
5005 	unsigned int master = current->wakee_flips;
5006 	unsigned int slave = p->wakee_flips;
5007 	int factor = this_cpu_read(sd_llc_size);
5008 
5009 	if (master < slave)
5010 		swap(master, slave);
5011 	if (slave < factor || master < slave * factor)
5012 		return 0;
5013 	return 1;
5014 }
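
/*
 * Worked example (editor's addition): with an LLC spanning 4 CPUs
 * (factor = 4), a waker with wakee_flips = 9 waking a task with
 * wakee_flips = 2 returns 0 (slave < factor), so an affine wakeup is still
 * considered. A dispatcher with wakee_flips = 40 waking a worker with
 * wakee_flips = 5 returns 1 (5 >= 4 and 40 >= 5 * 4), so the wakeup is
 * spread instead of being pulled onto the waker's cache domain.
 */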
5015 
5016 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
5017 {
5018 	s64 this_load, load;
5019 	s64 this_eff_load, prev_eff_load;
5020 	int idx, this_cpu, prev_cpu;
5021 	struct task_group *tg;
5022 	unsigned long weight;
5023 	int balanced;
5024 
5025 	idx	  = sd->wake_idx;
5026 	this_cpu  = smp_processor_id();
5027 	prev_cpu  = task_cpu(p);
5028 	load	  = source_load(prev_cpu, idx);
5029 	this_load = target_load(this_cpu, idx);
5030 
5031 	/*
5032 	 * If sync wakeup then subtract the (maximum possible)
5033 	 * effect of the currently running task from the load
5034 	 * of the current CPU:
5035 	 */
5036 	if (sync) {
5037 		tg = task_group(current);
5038 		weight = current->se.avg.load_avg;
5039 
5040 		this_load += effective_load(tg, this_cpu, -weight, -weight);
5041 		load += effective_load(tg, prev_cpu, 0, -weight);
5042 	}
5043 
5044 	tg = task_group(p);
5045 	weight = p->se.avg.load_avg;
5046 
5047 	/*
5048 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
5049 	 * due to the sync cause above having dropped this_load to 0, we'll
5050 	 * always have an imbalance, but there's really nothing you can do
5051 	 * about that, so that's good too.
5052 	 *
5053 	 * Otherwise check if either cpus are near enough in load to allow this
5054 	 * task to be woken on this_cpu.
5055 	 */
5056 	this_eff_load = 100;
5057 	this_eff_load *= capacity_of(prev_cpu);
5058 
5059 	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5060 	prev_eff_load *= capacity_of(this_cpu);
5061 
5062 	if (this_load > 0) {
5063 		this_eff_load *= this_load +
5064 			effective_load(tg, this_cpu, weight, weight);
5065 
5066 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
5067 	}
5068 
5069 	balanced = this_eff_load <= prev_eff_load;
5070 
5071 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
5072 
5073 	if (!balanced)
5074 		return 0;
5075 
5076 	schedstat_inc(sd, ttwu_move_affine);
5077 	schedstat_inc(p, se.statistics.nr_wakeups_affine);
5078 
5079 	return 1;
5080 }
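
/*
 * Worked example (editor's addition, illustrative numbers): with
 * imbalance_pct = 125 and equal CPU capacities, prev_eff_load starts at
 * 100 + (125 - 100) / 2 = 112 versus 100 for this_eff_load, giving the
 * previous CPU a ~12% handicap. If the effective load on this_cpu works out
 * to 500 and on prev_cpu to 470, then 100 * 500 <= 112 * 470 holds, the
 * wakeup is considered balanced, and it may be pulled to this_cpu.
 */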
5081 
5082 /*
5083  * find_idlest_group finds and returns the least busy CPU group within the
5084  * domain.
5085  */
5086 static struct sched_group *
5087 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5088 		  int this_cpu, int sd_flag)
5089 {
5090 	struct sched_group *idlest = NULL, *group = sd->groups;
5091 	unsigned long min_load = ULONG_MAX, this_load = 0;
5092 	int load_idx = sd->forkexec_idx;
5093 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
5094 
5095 	if (sd_flag & SD_BALANCE_WAKE)
5096 		load_idx = sd->wake_idx;
5097 
5098 	do {
5099 		unsigned long load, avg_load;
5100 		int local_group;
5101 		int i;
5102 
5103 		/* Skip over this group if it has no CPUs allowed */
5104 		if (!cpumask_intersects(sched_group_cpus(group),
5105 					tsk_cpus_allowed(p)))
5106 			continue;
5107 
5108 		local_group = cpumask_test_cpu(this_cpu,
5109 					       sched_group_cpus(group));
5110 
5111 		/* Tally up the load of all CPUs in the group */
5112 		avg_load = 0;
5113 
5114 		for_each_cpu(i, sched_group_cpus(group)) {
5115 			/* Bias balancing toward cpus of our domain */
5116 			if (local_group)
5117 				load = source_load(i, load_idx);
5118 			else
5119 				load = target_load(i, load_idx);
5120 
5121 			avg_load += load;
5122 		}
5123 
5124 		/* Adjust by relative CPU capacity of the group */
5125 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
5126 
5127 		if (local_group) {
5128 			this_load = avg_load;
5129 		} else if (avg_load < min_load) {
5130 			min_load = avg_load;
5131 			idlest = group;
5132 		}
5133 	} while (group = group->next, group != sd->groups);
5134 
5135 	if (!idlest || 100*this_load < imbalance*min_load)
5136 		return NULL;
5137 	return idlest;
5138 }
5139 
5140 /*
5141  * find_idlest_cpu - find the idlest cpu among the cpus in group.
5142  */
5143 static int
5144 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5145 {
5146 	unsigned long load, min_load = ULONG_MAX;
5147 	unsigned int min_exit_latency = UINT_MAX;
5148 	u64 latest_idle_timestamp = 0;
5149 	int least_loaded_cpu = this_cpu;
5150 	int shallowest_idle_cpu = -1;
5151 	int i;
5152 
5153 	/* Traverse only the allowed CPUs */
5154 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5155 		if (idle_cpu(i)) {
5156 			struct rq *rq = cpu_rq(i);
5157 			struct cpuidle_state *idle = idle_get_state(rq);
5158 			if (idle && idle->exit_latency < min_exit_latency) {
5159 				/*
5160 				 * We give priority to a CPU whose idle state
5161 				 * has the smallest exit latency irrespective
5162 				 * of any idle timestamp.
5163 				 */
5164 				min_exit_latency = idle->exit_latency;
5165 				latest_idle_timestamp = rq->idle_stamp;
5166 				shallowest_idle_cpu = i;
5167 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
5168 				   rq->idle_stamp > latest_idle_timestamp) {
5169 				/*
5170 				 * If equal or no active idle state, then
5171 				 * the most recently idled CPU might have
5172 				 * a warmer cache.
5173 				 */
5174 				latest_idle_timestamp = rq->idle_stamp;
5175 				shallowest_idle_cpu = i;
5176 			}
5177 		} else if (shallowest_idle_cpu == -1) {
5178 			load = weighted_cpuload(i);
5179 			if (load < min_load || (load == min_load && i == this_cpu)) {
5180 				min_load = load;
5181 				least_loaded_cpu = i;
5182 			}
5183 		}
5184 	}
5185 
5186 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5187 }
5188 
5189 /*
5190  * Try and locate an idle CPU in the sched_domain.
5191  */
5192 static int select_idle_sibling(struct task_struct *p, int target)
5193 {
5194 	struct sched_domain *sd;
5195 	struct sched_group *sg;
5196 	int i = task_cpu(p);
5197 
5198 	if (idle_cpu(target))
5199 		return target;
5200 
5201 	/*
5202 	 * If the previous cpu is cache affine and idle, don't be stupid.
5203 	 */
5204 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
5205 		return i;
5206 
5207 	/*
5208 	 * Otherwise, iterate the domains and find an eligible idle cpu.
5209 	 *
5210 	 * A completely idle sched group at higher domains is more
5211 	 * desirable than an idle group at a lower level, because lower
5212 	 * domains have smaller groups and usually share hardware
5213 	 * resources which causes tasks to contend on them, e.g. x86
5214 	 * hyperthread siblings in the lowest domain (SMT) can contend
5215 	 * on the shared cpu pipeline.
5216 	 *
5217 	 * However, while we prefer idle groups at higher domains
5218 	 * finding an idle cpu at the lowest domain is still better than
5219 	 * returning 'target', which we've already established, isn't
5220 	 * idle.
5221 	 */
5222 	sd = rcu_dereference(per_cpu(sd_llc, target));
5223 	for_each_lower_domain(sd) {
5224 		sg = sd->groups;
5225 		do {
5226 			if (!cpumask_intersects(sched_group_cpus(sg),
5227 						tsk_cpus_allowed(p)))
5228 				goto next;
5229 
5230 			/* Ensure the entire group is idle */
5231 			for_each_cpu(i, sched_group_cpus(sg)) {
5232 				if (i == target || !idle_cpu(i))
5233 					goto next;
5234 			}
5235 
5236 			/*
5237 			 * It doesn't matter which cpu we pick, the
5238 			 * whole group is idle.
5239 			 */
5240 			target = cpumask_first_and(sched_group_cpus(sg),
5241 					tsk_cpus_allowed(p));
5242 			goto done;
5243 next:
5244 			sg = sg->next;
5245 		} while (sg != sd->groups);
5246 	}
5247 done:
5248 	return target;
5249 }
5250 
5251 /*
5252  * cpu_util returns the amount of capacity of a CPU that is used by CFS
5253  * tasks. The unit of the return value must be the one of capacity so we can
5254  * compare the utilization with the capacity of the CPU that is available for
5255  * CFS task (ie cpu_capacity).
5256  *
5257  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5258  * recent utilization of currently non-runnable tasks on a CPU. It represents
5259  * the amount of utilization of a CPU in the range [0..capacity_orig] where
5260  * capacity_orig is the cpu_capacity available at the highest frequency
5261  * (arch_scale_freq_capacity()).
5262  * The utilization of a CPU converges towards a sum equal to or less than the
5263  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
5264  * the running time on this CPU scaled by capacity_curr.
5265  *
5266  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
5267  * higher than capacity_orig because of unfortunate rounding in
5268  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
5269  * the average stabilizes with the new running time. We need to check that the
5270  * utilization stays within the range of [0..capacity_orig] and cap it if
5271  * necessary. Without utilization capping, a group could be seen as overloaded
5272  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
5273  * available capacity. We allow utilization to overshoot capacity_curr (but not
5274  * capacity_orig) as it is useful for predicting the capacity required after task
5275  * migrations (scheduler-driven DVFS).
5276  */
5277 static int cpu_util(int cpu)
5278 {
5279 	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
5280 	unsigned long capacity = capacity_orig_of(cpu);
5281 
5282 	return (util >= capacity) ? capacity : util;
5283 }
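
/*
 * Worked example (editor's addition): a CPU with capacity_orig = 1024 whose
 * cfs.avg.util_avg has transiently grown to 1100 (e.g. right after several
 * task migrations) reports a utilization of 1024 here, so group-level sums
 * cannot overstate how much capacity is genuinely available.
 */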
5284 
5285 /*
5286  * select_task_rq_fair: Select target runqueue for the waking task in domains
5287  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5288  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
5289  *
5290  * Balances load by selecting the idlest cpu in the idlest group, or under
5291  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
5292  *
5293  * Returns the target cpu number.
5294  *
5295  * preempt must be disabled.
5296  */
5297 static int
5298 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5299 {
5300 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5301 	int cpu = smp_processor_id();
5302 	int new_cpu = prev_cpu;
5303 	int want_affine = 0;
5304 	int sync = wake_flags & WF_SYNC;
5305 
5306 	if (sd_flag & SD_BALANCE_WAKE) {
5307 		record_wakee(p);
5308 		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5309 	}
5310 
5311 	rcu_read_lock();
5312 	for_each_domain(cpu, tmp) {
5313 		if (!(tmp->flags & SD_LOAD_BALANCE))
5314 			break;
5315 
5316 		/*
5317 		 * If both cpu and prev_cpu are part of this domain,
5318 		 * cpu is a valid SD_WAKE_AFFINE target.
5319 		 */
5320 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5321 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5322 			affine_sd = tmp;
5323 			break;
5324 		}
5325 
5326 		if (tmp->flags & sd_flag)
5327 			sd = tmp;
5328 		else if (!want_affine)
5329 			break;
5330 	}
5331 
5332 	if (affine_sd) {
5333 		sd = NULL; /* Prefer wake_affine over balance flags */
5334 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5335 			new_cpu = cpu;
5336 	}
5337 
5338 	if (!sd) {
5339 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5340 			new_cpu = select_idle_sibling(p, new_cpu);
5341 
5342 	} else while (sd) {
5343 		struct sched_group *group;
5344 		int weight;
5345 
5346 		if (!(sd->flags & sd_flag)) {
5347 			sd = sd->child;
5348 			continue;
5349 		}
5350 
5351 		group = find_idlest_group(sd, p, cpu, sd_flag);
5352 		if (!group) {
5353 			sd = sd->child;
5354 			continue;
5355 		}
5356 
5357 		new_cpu = find_idlest_cpu(group, p, cpu);
5358 		if (new_cpu == -1 || new_cpu == cpu) {
5359 			/* Now try balancing at a lower domain level of cpu */
5360 			sd = sd->child;
5361 			continue;
5362 		}
5363 
5364 		/* Now try balancing at a lower domain level of new_cpu */
5365 		cpu = new_cpu;
5366 		weight = sd->span_weight;
5367 		sd = NULL;
5368 		for_each_domain(cpu, tmp) {
5369 			if (weight <= tmp->span_weight)
5370 				break;
5371 			if (tmp->flags & sd_flag)
5372 				sd = tmp;
5373 		}
5374 		/* while loop will break here if sd == NULL */
5375 	}
5376 	rcu_read_unlock();
5377 
5378 	return new_cpu;
5379 }
5380 
5381 /*
5382  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5383  * cfs_rq_of(p) references at time of call are still valid and identify the
5384  * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
5385  */
5386 static void migrate_task_rq_fair(struct task_struct *p)
5387 {
5388 	/*
5389 	 * As blocked tasks retain absolute vruntime the migration needs to
5390 	 * deal with this by subtracting the old and adding the new
5391 	 * min_vruntime -- the latter is done by enqueue_entity() when placing
5392 	 * the task on the new runqueue.
5393 	 */
5394 	if (p->state == TASK_WAKING) {
5395 		struct sched_entity *se = &p->se;
5396 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
5397 		u64 min_vruntime;
5398 
5399 #ifndef CONFIG_64BIT
5400 		u64 min_vruntime_copy;
5401 
5402 		do {
5403 			min_vruntime_copy = cfs_rq->min_vruntime_copy;
5404 			smp_rmb();
5405 			min_vruntime = cfs_rq->min_vruntime;
5406 		} while (min_vruntime != min_vruntime_copy);
5407 #else
5408 		min_vruntime = cfs_rq->min_vruntime;
5409 #endif
5410 
5411 		se->vruntime -= min_vruntime;
5412 	}
5413 
5414 	/*
5415 	 * We are supposed to update the task to the "current" time so that it is
5416 	 * up to date and ready to go to the new CPU/cfs_rq. But determining what
5417 	 * the current time is here is difficult, so simply throw away the
5418 	 * out-of-date time. This leaves the wakee task slightly less decayed,
5419 	 * but giving the wakee a bit more load is not a bad trade-off.
5420 	 */
5421 	remove_entity_load_avg(&p->se);
5422 
5423 	/* Tell new CPU we are migrated */
5424 	p->se.avg.last_update_time = 0;
5425 
5426 	/* We have migrated, no longer consider this task hot */
5427 	p->se.exec_start = 0;
5428 }
5429 
5430 static void task_dead_fair(struct task_struct *p)
5431 {
5432 	remove_entity_load_avg(&p->se);
5433 }
5434 #endif /* CONFIG_SMP */
5435 
5436 static unsigned long
5437 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5438 {
5439 	unsigned long gran = sysctl_sched_wakeup_granularity;
5440 
5441 	/*
5442 	 * Since it is curr that is running now, convert the gran from
5443 	 * real-time to virtual-time in its units.
5444 	 *
5445 	 * By using 'se' instead of 'curr' we penalize light tasks, so
5446 	 * they get preempted more easily. That is, if 'se' < 'curr' then
5447 	 * the resulting gran will be larger, therefore penalizing the
5448 	 * lighter task; if, on the other hand, 'se' > 'curr' then the
5449 	 * resulting gran will be smaller, again penalizing the lighter task.
5450 	 *
5451 	 * This is especially important for buddies when the leftmost
5452 	 * task is higher priority than the buddy.
5453 	 */
5454 	return calc_delta_fair(gran, se);
5455 }
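
/*
 * Worked example (illustrative, assuming default tunables and the standard
 * nice-to-weight table): with sysctl_sched_wakeup_granularity at 1ms, a
 * waking nice-0 task (weight 1024) gets a virtual granularity of about 1ms,
 * while a waking nice+5 task (weight 335) gets roughly 1ms * 1024/335 ~= 3ms;
 * curr's vruntime must exceed the wakee's by more than that before
 * wakeup_preempt_entity() allows the preemption.
 */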
5456 
5457 /*
5458  * Should 'se' preempt 'curr'.
5459  *
5460  *             |s1
5461  *        |s2
5462  *   |s3
5463  *         g
5464  *      |<--->|c
5465  *
5466  *  w(c, s1) = -1
5467  *  w(c, s2) =  0
5468  *  w(c, s3) =  1
5469  *
5470  */
5471 static int
5472 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5473 {
5474 	s64 gran, vdiff = curr->vruntime - se->vruntime;
5475 
5476 	if (vdiff <= 0)
5477 		return -1;
5478 
5479 	gran = wakeup_gran(curr, se);
5480 	if (vdiff > gran)
5481 		return 1;
5482 
5483 	return 0;
5484 }
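
/*
 * Illustrative numbers for the diagram above, with gran converted to, say,
 * 1ms of virtual time: if se->vruntime >= curr->vruntime (case s1) the
 * result is -1; if curr->vruntime exceeds se->vruntime by 0.5ms (case s2,
 * within gran) the result is 0; if it exceeds it by 2ms (case s3) the
 * result is 1 and the wakeup preempts.
 */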
5485 
5486 static void set_last_buddy(struct sched_entity *se)
5487 {
5488 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5489 		return;
5490 
5491 	for_each_sched_entity(se)
5492 		cfs_rq_of(se)->last = se;
5493 }
5494 
5495 static void set_next_buddy(struct sched_entity *se)
5496 {
5497 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5498 		return;
5499 
5500 	for_each_sched_entity(se)
5501 		cfs_rq_of(se)->next = se;
5502 }
5503 
5504 static void set_skip_buddy(struct sched_entity *se)
5505 {
5506 	for_each_sched_entity(se)
5507 		cfs_rq_of(se)->skip = se;
5508 }
5509 
5510 /*
5511  * Preempt the current task with a newly woken task if needed:
5512  */
5513 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5514 {
5515 	struct task_struct *curr = rq->curr;
5516 	struct sched_entity *se = &curr->se, *pse = &p->se;
5517 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5518 	int scale = cfs_rq->nr_running >= sched_nr_latency;
5519 	int next_buddy_marked = 0;
5520 
5521 	if (unlikely(se == pse))
5522 		return;
5523 
5524 	/*
5525 	 * This is possible from callers such as attach_tasks(), in which we
5526 	 * unconditionally check_preempt_curr() after an enqueue (which may have
5527 	 * led to a throttle).  This both saves work and prevents false
5528 	 * next-buddy nomination below.
5529 	 */
5530 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5531 		return;
5532 
5533 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5534 		set_next_buddy(pse);
5535 		next_buddy_marked = 1;
5536 	}
5537 
5538 	/*
5539 	 * We can come here with TIF_NEED_RESCHED already set from new task
5540 	 * wake up path.
5541 	 *
5542 	 * Note: this also catches the edge-case of curr being in a throttled
5543 	 * group (e.g. via set_curr_task), since update_curr() (in the
5544 	 * enqueue of curr) will have resulted in resched being set.  This
5545 	 * prevents us from potentially nominating it as a false LAST_BUDDY
5546 	 * below.
5547 	 */
5548 	if (test_tsk_need_resched(curr))
5549 		return;
5550 
5551 	/* Idle tasks are by definition preempted by non-idle tasks. */
5552 	if (unlikely(curr->policy == SCHED_IDLE) &&
5553 	    likely(p->policy != SCHED_IDLE))
5554 		goto preempt;
5555 
5556 	/*
5557 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5558 	 * is driven by the tick):
5559 	 */
5560 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5561 		return;
5562 
5563 	find_matching_se(&se, &pse);
5564 	update_curr(cfs_rq_of(se));
5565 	BUG_ON(!pse);
5566 	if (wakeup_preempt_entity(se, pse) == 1) {
5567 		/*
5568 		 * Bias pick_next to pick the sched entity that is
5569 		 * triggering this preemption.
5570 		 */
5571 		if (!next_buddy_marked)
5572 			set_next_buddy(pse);
5573 		goto preempt;
5574 	}
5575 
5576 	return;
5577 
5578 preempt:
5579 	resched_curr(rq);
5580 	/*
5581 	 * Only set the backward buddy when the current task is still
5582 	 * on the rq. This can happen when a wakeup gets interleaved
5583 	 * with schedule on the ->pre_schedule() or idle_balance()
5584 	 * point, either of which can drop the rq lock.
5585 	 *
5586 	 * Also, during early boot the idle thread is in the fair class,
5587 	 * for obvious reasons it's a bad idea to schedule back to it.
5588 	 */
5589 	if (unlikely(!se->on_rq || curr == rq->idle))
5590 		return;
5591 
5592 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5593 		set_last_buddy(se);
5594 }
5595 
5596 static struct task_struct *
5597 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
5598 {
5599 	struct cfs_rq *cfs_rq = &rq->cfs;
5600 	struct sched_entity *se;
5601 	struct task_struct *p;
5602 	int new_tasks;
5603 
5604 again:
5605 #ifdef CONFIG_FAIR_GROUP_SCHED
5606 	if (!cfs_rq->nr_running)
5607 		goto idle;
5608 
5609 	if (prev->sched_class != &fair_sched_class)
5610 		goto simple;
5611 
5612 	/*
5613 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5614 	 * likely that the next task is from the same cgroup as the current one.
5615 	 *
5616 	 * Therefore attempt to avoid putting and setting the entire cgroup
5617 	 * hierarchy, only change the part that actually changes.
5618 	 */
5619 
5620 	do {
5621 		struct sched_entity *curr = cfs_rq->curr;
5622 
5623 		/*
5624 		 * Since we got here without doing put_prev_entity() we also
5625 		 * have to consider cfs_rq->curr. If it is still a runnable
5626 		 * entity, update_curr() will update its vruntime, otherwise
5627 		 * forget we've ever seen it.
5628 		 */
5629 		if (curr) {
5630 			if (curr->on_rq)
5631 				update_curr(cfs_rq);
5632 			else
5633 				curr = NULL;
5634 
5635 			/*
5636 			 * This call to check_cfs_rq_runtime() will do the
5637 			 * throttle and dequeue its entity in the parent(s).
5638 			 * Therefore the 'simple' nr_running test will indeed
5639 			 * be correct.
5640 			 */
5641 			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5642 				goto simple;
5643 		}
5644 
5645 		se = pick_next_entity(cfs_rq, curr);
5646 		cfs_rq = group_cfs_rq(se);
5647 	} while (cfs_rq);
5648 
5649 	p = task_of(se);
5650 
5651 	/*
5652 	 * Since we haven't yet done put_prev_entity() and the selected task
5653 	 * may be a different task than the one we started out with, try to
5654 	 * touch the least number of cfs_rqs.
5655 	 */
5656 	if (prev != p) {
5657 		struct sched_entity *pse = &prev->se;
5658 
5659 		while (!(cfs_rq = is_same_group(se, pse))) {
5660 			int se_depth = se->depth;
5661 			int pse_depth = pse->depth;
5662 
5663 			if (se_depth <= pse_depth) {
5664 				put_prev_entity(cfs_rq_of(pse), pse);
5665 				pse = parent_entity(pse);
5666 			}
5667 			if (se_depth >= pse_depth) {
5668 				set_next_entity(cfs_rq_of(se), se);
5669 				se = parent_entity(se);
5670 			}
5671 		}
5672 
5673 		put_prev_entity(cfs_rq, pse);
5674 		set_next_entity(cfs_rq, se);
5675 	}
5676 
5677 	if (hrtick_enabled(rq))
5678 		hrtick_start_fair(rq, p);
5679 
5680 	return p;
5681 simple:
5682 	cfs_rq = &rq->cfs;
5683 #endif
5684 
5685 	if (!cfs_rq->nr_running)
5686 		goto idle;
5687 
5688 	put_prev_task(rq, prev);
5689 
5690 	do {
5691 		se = pick_next_entity(cfs_rq, NULL);
5692 		set_next_entity(cfs_rq, se);
5693 		cfs_rq = group_cfs_rq(se);
5694 	} while (cfs_rq);
5695 
5696 	p = task_of(se);
5697 
5698 	if (hrtick_enabled(rq))
5699 		hrtick_start_fair(rq, p);
5700 
5701 	return p;
5702 
5703 idle:
5704 	/*
5705 	 * This is OK because current is on_cpu, which prevents it from being
5706 	 * picked for load-balance, and preemption/IRQs are still disabled,
5707 	 * avoiding further scheduler activity on it; we are also being very
5708 	 * careful to re-start the picking loop.
5709 	 */
5710 	lockdep_unpin_lock(&rq->lock, cookie);
5711 	new_tasks = idle_balance(rq);
5712 	lockdep_repin_lock(&rq->lock, cookie);
5713 	/*
5714 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
5715 	 * possible for any higher priority task to appear. In that case we
5716 	 * must re-start the pick_next_entity() loop.
5717 	 */
5718 	if (new_tasks < 0)
5719 		return RETRY_TASK;
5720 
5721 	if (new_tasks > 0)
5722 		goto again;
5723 
5724 	return NULL;
5725 }
5726 
5727 /*
5728  * Account for a descheduled task:
5729  */
5730 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5731 {
5732 	struct sched_entity *se = &prev->se;
5733 	struct cfs_rq *cfs_rq;
5734 
5735 	for_each_sched_entity(se) {
5736 		cfs_rq = cfs_rq_of(se);
5737 		put_prev_entity(cfs_rq, se);
5738 	}
5739 }
5740 
5741 /*
5742  * sched_yield() is very simple
5743  *
5744  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5745  */
5746 static void yield_task_fair(struct rq *rq)
5747 {
5748 	struct task_struct *curr = rq->curr;
5749 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5750 	struct sched_entity *se = &curr->se;
5751 
5752 	/*
5753 	 * Are we the only task in the tree?
5754 	 */
5755 	if (unlikely(rq->nr_running == 1))
5756 		return;
5757 
5758 	clear_buddies(cfs_rq, se);
5759 
5760 	if (curr->policy != SCHED_BATCH) {
5761 		update_rq_clock(rq);
5762 		/*
5763 		 * Update run-time statistics of the 'current'.
5764 		 */
5765 		update_curr(cfs_rq);
5766 		/*
5767 		 * Tell update_rq_clock() that we've just updated,
5768 		 * so we don't do microscopic update in schedule()
5769 		 * and double the fastpath cost.
5770 		 */
5771 		rq_clock_skip_update(rq, true);
5772 	}
5773 
5774 	set_skip_buddy(se);
5775 }
5776 
5777 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5778 {
5779 	struct sched_entity *se = &p->se;
5780 
5781 	/* throttled hierarchies are not runnable */
5782 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5783 		return false;
5784 
5785 	/* Tell the scheduler that we'd really like pse to run next. */
5786 	set_next_buddy(se);
5787 
5788 	yield_task_fair(rq);
5789 
5790 	return true;
5791 }
5792 
5793 #ifdef CONFIG_SMP
5794 /**************************************************
5795  * Fair scheduling class load-balancing methods.
5796  *
5797  * BASICS
5798  *
5799  * The purpose of load-balancing is to achieve the same basic fairness the
5800  * per-cpu scheduler provides, namely provide a proportional amount of compute
5801  * time to each task. This is expressed in the following equation:
5802  *
5803  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5804  *
5805  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5806  * W_i,0 is defined as:
5807  *
5808  *   W_i,0 = \Sum_j w_i,j                                             (2)
5809  *
5810  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5811  * is derived from the nice value as per sched_prio_to_weight[].
5812  *
5813  * The weight average is an exponential decay average of the instantaneous
5814  * weight:
5815  *
5816  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5817  *
5818  * C_i is the compute capacity of cpu i; typically it is the
5819  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5820  * can also include other factors [XXX].
5821  *
5822  * To achieve this balance we define a measure of imbalance which follows
5823  * directly from (1):
5824  *
5825  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5826  *
5827  * We then move tasks around to minimize the imbalance. In the continuous
5828  * function space it is obvious this converges; in the discrete case we get
5829  * a few fun cases generally called infeasible weight scenarios.
5830  *
5831  * [XXX expand on:
5832  *     - infeasible weights;
5833  *     - local vs global optima in the discrete case. ]
5834  *
5835  *
5836  * SCHED DOMAINS
5837  *
5838  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5839  * for all i,j solution, we create a tree of cpus that follows the hardware
5840  * topology where each level pairs two lower groups (or better). This results
5841  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5842  * tree to only the first of the previous level and we decrease the frequency
5843  * of load-balance at each level inversely proportional to the number of cpus in
5844  * the groups.
5845  *
5846  * This yields:
5847  *
5848  *     log_2 n     1     n
5849  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5850  *     i = 0      2^i   2^i
5851  *                               `- size of each group
5852  *         |         |     `- number of cpus doing load-balance
5853  *         |         `- freq
5854  *         `- sum over all levels
5855  *
5856  * Coupled with a limit on how many tasks we can migrate every balance pass,
5857  * this makes (5) the runtime complexity of the balancer.
5858  *
5859  * An important property here is that each CPU is still (indirectly) connected
5860  * to every other cpu in at most O(log n) steps:
5861  *
5862  * The adjacency matrix of the resulting graph is given by:
5863  *
5864  *             log_2 n
5865  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5866  *             k = 0
5867  *
5868  * And you'll find that:
5869  *
5870  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5871  *
5872  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5873  * The task movement gives a factor of O(m), giving a convergence complexity
5874  * of:
5875  *
5876  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5877  *
5878  *
5879  * WORK CONSERVING
5880  *
5881  * In order to avoid CPUs going idle while there's still work to do, new idle
5882  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5883  * tree itself instead of relying on other CPUs to bring it work.
5884  *
5885  * This adds some complexity to both (5) and (8) but it reduces the total idle
5886  * time.
5887  *
5888  * [XXX more?]
5889  *
5890  *
5891  * CGROUPS
5892  *
5893  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5894  *
5895  *                                s_k,i
5896  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5897  *                                 S_k
5898  *
5899  * Where
5900  *
5901  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5902  *
5903  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5904  *
5905  * The big problem is S_k: it's a global sum needed to compute a local (W_i)
5906  * property.
5907  *
5908  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5909  *      rewrite all of this once again.]
5910  */
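
/*
 * Two illustrative spot-checks of the math above (editorial examples, not
 * derived from the code):
 *
 *  - Equation (5): the summand reduces to n/2^i, so the sum is bounded by
 *    n * \Sum 1/2^i < 2n, i.e. O(n) work per full balance sweep.
 *
 *  - Equations (9)/(10): a cgroup k with shares w_k = 1024 whose runnable
 *    tasks have total weight S_k = 2048 system-wide, 512 of which sit on
 *    cpu i (s_k,i = 512), contributes 1024 * 512/2048 = 256 to W_i,0 --
 *    a quarter of the group's shares for the quarter of its runnable
 *    weight that lives on cpu i.
 */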
5911 
5912 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5913 
5914 enum fbq_type { regular, remote, all };
5915 
5916 #define LBF_ALL_PINNED	0x01
5917 #define LBF_NEED_BREAK	0x02
5918 #define LBF_DST_PINNED  0x04
5919 #define LBF_SOME_PINNED	0x08
5920 
5921 struct lb_env {
5922 	struct sched_domain	*sd;
5923 
5924 	struct rq		*src_rq;
5925 	int			src_cpu;
5926 
5927 	int			dst_cpu;
5928 	struct rq		*dst_rq;
5929 
5930 	struct cpumask		*dst_grpmask;
5931 	int			new_dst_cpu;
5932 	enum cpu_idle_type	idle;
5933 	long			imbalance;
5934 	/* The set of CPUs under consideration for load-balancing */
5935 	struct cpumask		*cpus;
5936 
5937 	unsigned int		flags;
5938 
5939 	unsigned int		loop;
5940 	unsigned int		loop_break;
5941 	unsigned int		loop_max;
5942 
5943 	enum fbq_type		fbq_type;
5944 	struct list_head	tasks;
5945 };
5946 
5947 /*
5948  * Is this task likely cache-hot:
5949  */
5950 static int task_hot(struct task_struct *p, struct lb_env *env)
5951 {
5952 	s64 delta;
5953 
5954 	lockdep_assert_held(&env->src_rq->lock);
5955 
5956 	if (p->sched_class != &fair_sched_class)
5957 		return 0;
5958 
5959 	if (unlikely(p->policy == SCHED_IDLE))
5960 		return 0;
5961 
5962 	/*
5963 	 * Buddy candidates are cache hot:
5964 	 */
5965 	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5966 			(&p->se == cfs_rq_of(&p->se)->next ||
5967 			 &p->se == cfs_rq_of(&p->se)->last))
5968 		return 1;
5969 
5970 	if (sysctl_sched_migration_cost == -1)
5971 		return 1;
5972 	if (sysctl_sched_migration_cost == 0)
5973 		return 0;
5974 
5975 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5976 
5977 	return delta < (s64)sysctl_sched_migration_cost;
5978 }
5979 
5980 #ifdef CONFIG_NUMA_BALANCING
5981 /*
5982  * Returns 1, if task migration degrades locality
5983  * Returns 0, if task migration improves locality i.e migration preferred.
5984  * Returns -1, if task migration is not affected by locality.
5985  */
5986 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5987 {
5988 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5989 	unsigned long src_faults, dst_faults;
5990 	int src_nid, dst_nid;
5991 
5992 	if (!static_branch_likely(&sched_numa_balancing))
5993 		return -1;
5994 
5995 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5996 		return -1;
5997 
5998 	src_nid = cpu_to_node(env->src_cpu);
5999 	dst_nid = cpu_to_node(env->dst_cpu);
6000 
6001 	if (src_nid == dst_nid)
6002 		return -1;
6003 
6004 	/* Migrating away from the preferred node is always bad. */
6005 	if (src_nid == p->numa_preferred_nid) {
6006 		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
6007 			return 1;
6008 		else
6009 			return -1;
6010 	}
6011 
6012 	/* Encourage migration to the preferred node. */
6013 	if (dst_nid == p->numa_preferred_nid)
6014 		return 0;
6015 
6016 	if (numa_group) {
6017 		src_faults = group_faults(p, src_nid);
6018 		dst_faults = group_faults(p, dst_nid);
6019 	} else {
6020 		src_faults = task_faults(p, src_nid);
6021 		dst_faults = task_faults(p, dst_nid);
6022 	}
6023 
6024 	return dst_faults < src_faults;
6025 }
6026 
6027 #else
6028 static inline int migrate_degrades_locality(struct task_struct *p,
6029 					     struct lb_env *env)
6030 {
6031 	return -1;
6032 }
6033 #endif
6034 
6035 /*
6036  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6037  */
6038 static
6039 int can_migrate_task(struct task_struct *p, struct lb_env *env)
6040 {
6041 	int tsk_cache_hot;
6042 
6043 	lockdep_assert_held(&env->src_rq->lock);
6044 
6045 	/*
6046 	 * We do not migrate tasks that are:
6047 	 * 1) throttled_lb_pair, or
6048 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
6049 	 * 3) running (obviously), or
6050 	 * 4) are cache-hot on their current CPU.
6051 	 */
6052 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
6053 		return 0;
6054 
6055 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6056 		int cpu;
6057 
6058 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
6059 
6060 		env->flags |= LBF_SOME_PINNED;
6061 
6062 		/*
6063 		 * Remember if this task can be migrated to any other cpu in
6064 		 * our sched_group. We may want to revisit it if we couldn't
6065 		 * meet load balance goals by pulling other tasks on src_cpu.
6066 		 *
6067 		 * Also avoid computing new_dst_cpu if we have already computed
6068 		 * one in current iteration.
6069 		 */
6070 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
6071 			return 0;
6072 
6073 		/* Avoid re-selecting dst_cpu via env's cpus */
6074 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
6075 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6076 				env->flags |= LBF_DST_PINNED;
6077 				env->new_dst_cpu = cpu;
6078 				break;
6079 			}
6080 		}
6081 
6082 		return 0;
6083 	}
6084 
6085 	/* Record that we found at least one task that could run on dst_cpu */
6086 	env->flags &= ~LBF_ALL_PINNED;
6087 
6088 	if (task_running(env->src_rq, p)) {
6089 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
6090 		return 0;
6091 	}
6092 
6093 	/*
6094 	 * Aggressive migration if:
6095 	 * 1) the destination numa node is preferred, or
6096 	 * 2) task is cache cold, or
6097 	 * 3) too many balance attempts have failed.
6098 	 */
6099 	tsk_cache_hot = migrate_degrades_locality(p, env);
6100 	if (tsk_cache_hot == -1)
6101 		tsk_cache_hot = task_hot(p, env);
6102 
6103 	if (tsk_cache_hot <= 0 ||
6104 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6105 		if (tsk_cache_hot == 1) {
6106 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
6107 			schedstat_inc(p, se.statistics.nr_forced_migrations);
6108 		}
6109 		return 1;
6110 	}
6111 
6112 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
6113 	return 0;
6114 }
6115 
6116 /*
6117  * detach_task() -- detach the task for the migration specified in env
6118  */
6119 static void detach_task(struct task_struct *p, struct lb_env *env)
6120 {
6121 	lockdep_assert_held(&env->src_rq->lock);
6122 
6123 	p->on_rq = TASK_ON_RQ_MIGRATING;
6124 	deactivate_task(env->src_rq, p, 0);
6125 	set_task_cpu(p, env->dst_cpu);
6126 }
6127 
6128 /*
6129  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
6130  * part of active balancing operations within "domain".
6131  *
6132  * Returns a task if successful and NULL otherwise.
6133  */
6134 static struct task_struct *detach_one_task(struct lb_env *env)
6135 {
6136 	struct task_struct *p, *n;
6137 
6138 	lockdep_assert_held(&env->src_rq->lock);
6139 
6140 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6141 		if (!can_migrate_task(p, env))
6142 			continue;
6143 
6144 		detach_task(p, env);
6145 
6146 		/*
6147 		 * Right now, this is only the second place where
6148 		 * lb_gained[env->idle] is updated (other is detach_tasks)
6149 		 * so we can safely collect stats here rather than
6150 		 * inside detach_tasks().
6151 		 */
6152 		schedstat_inc(env->sd, lb_gained[env->idle]);
6153 		return p;
6154 	}
6155 	return NULL;
6156 }
6157 
6158 static const unsigned int sched_nr_migrate_break = 32;
6159 
6160 /*
6161  * detach_tasks() -- tries to detach up to imbalance weighted load from
6162  * busiest_rq, as part of a balancing operation within domain "sd".
6163  *
6164  * Returns number of detached tasks if successful and 0 otherwise.
6165  */
6166 static int detach_tasks(struct lb_env *env)
6167 {
6168 	struct list_head *tasks = &env->src_rq->cfs_tasks;
6169 	struct task_struct *p;
6170 	unsigned long load;
6171 	int detached = 0;
6172 
6173 	lockdep_assert_held(&env->src_rq->lock);
6174 
6175 	if (env->imbalance <= 0)
6176 		return 0;
6177 
6178 	while (!list_empty(tasks)) {
6179 		/*
6180 		 * We don't want to steal all the tasks, otherwise we may be treated
6181 		 * likewise, which could at worst lead to a livelock.
6182 		 */
6183 		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
6184 			break;
6185 
6186 		p = list_first_entry(tasks, struct task_struct, se.group_node);
6187 
6188 		env->loop++;
6189 		/* We've more or less seen every task there is, call it quits */
6190 		if (env->loop > env->loop_max)
6191 			break;
6192 
6193 		/* take a breather every nr_migrate tasks */
6194 		if (env->loop > env->loop_break) {
6195 			env->loop_break += sched_nr_migrate_break;
6196 			env->flags |= LBF_NEED_BREAK;
6197 			break;
6198 		}
6199 
6200 		if (!can_migrate_task(p, env))
6201 			goto next;
6202 
6203 		load = task_h_load(p);
6204 
6205 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
6206 			goto next;
6207 
6208 		if ((load / 2) > env->imbalance)
6209 			goto next;
6210 
6211 		detach_task(p, env);
6212 		list_add(&p->se.group_node, &env->tasks);
6213 
6214 		detached++;
6215 		env->imbalance -= load;
6216 
6217 #ifdef CONFIG_PREEMPT
6218 		/*
6219 		 * NEWIDLE balancing is a source of latency, so preemptible
6220 		 * kernels will stop after the first task is detached to minimize
6221 		 * the critical section.
6222 		 */
6223 		if (env->idle == CPU_NEWLY_IDLE)
6224 			break;
6225 #endif
6226 
6227 		/*
6228 		 * We only want to steal up to the prescribed amount of
6229 		 * weighted load.
6230 		 */
6231 		if (env->imbalance <= 0)
6232 			break;
6233 
6234 		continue;
6235 next:
6236 		list_move_tail(&p->se.group_node, tasks);
6237 	}
6238 
6239 	/*
6240 	 * Right now, this is one of only two places we collect this stat
6241 	 * so we can safely collect detach_one_task() stats here rather
6242 	 * than inside detach_one_task().
6243 	 */
6244 	schedstat_add(env->sd, lb_gained[env->idle], detached);
6245 
6246 	return detached;
6247 }
6248 
6249 /*
6250  * attach_task() -- attach the task detached by detach_task() to its new rq.
6251  */
6252 static void attach_task(struct rq *rq, struct task_struct *p)
6253 {
6254 	lockdep_assert_held(&rq->lock);
6255 
6256 	BUG_ON(task_rq(p) != rq);
6257 	activate_task(rq, p, 0);
6258 	p->on_rq = TASK_ON_RQ_QUEUED;
6259 	check_preempt_curr(rq, p, 0);
6260 }
6261 
6262 /*
6263  * attach_one_task() -- attaches the task returned from detach_one_task() to
6264  * its new rq.
6265  */
6266 static void attach_one_task(struct rq *rq, struct task_struct *p)
6267 {
6268 	raw_spin_lock(&rq->lock);
6269 	attach_task(rq, p);
6270 	raw_spin_unlock(&rq->lock);
6271 }
6272 
6273 /*
6274  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
6275  * new rq.
6276  */
6277 static void attach_tasks(struct lb_env *env)
6278 {
6279 	struct list_head *tasks = &env->tasks;
6280 	struct task_struct *p;
6281 
6282 	raw_spin_lock(&env->dst_rq->lock);
6283 
6284 	while (!list_empty(tasks)) {
6285 		p = list_first_entry(tasks, struct task_struct, se.group_node);
6286 		list_del_init(&p->se.group_node);
6287 
6288 		attach_task(env->dst_rq, p);
6289 	}
6290 
6291 	raw_spin_unlock(&env->dst_rq->lock);
6292 }
6293 
6294 #ifdef CONFIG_FAIR_GROUP_SCHED
6295 static void update_blocked_averages(int cpu)
6296 {
6297 	struct rq *rq = cpu_rq(cpu);
6298 	struct cfs_rq *cfs_rq;
6299 	unsigned long flags;
6300 
6301 	raw_spin_lock_irqsave(&rq->lock, flags);
6302 	update_rq_clock(rq);
6303 
6304 	/*
6305 	 * Iterates the task_group tree in a bottom up fashion, see
6306 	 * list_add_leaf_cfs_rq() for details.
6307 	 */
6308 	for_each_leaf_cfs_rq(rq, cfs_rq) {
6309 		/* throttled entities do not contribute to load */
6310 		if (throttled_hierarchy(cfs_rq))
6311 			continue;
6312 
6313 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
6314 			update_tg_load_avg(cfs_rq, 0);
6315 	}
6316 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6317 }
6318 
6319 /*
6320  * Compute the hierarchical load factor for cfs_rq and all its ancestors.
6321  * This needs to be done in a top-down fashion because the load of a child
6322  * group is a fraction of its parent's load.
6323  */
6324 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6325 {
6326 	struct rq *rq = rq_of(cfs_rq);
6327 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6328 	unsigned long now = jiffies;
6329 	unsigned long load;
6330 
6331 	if (cfs_rq->last_h_load_update == now)
6332 		return;
6333 
6334 	cfs_rq->h_load_next = NULL;
6335 	for_each_sched_entity(se) {
6336 		cfs_rq = cfs_rq_of(se);
6337 		cfs_rq->h_load_next = se;
6338 		if (cfs_rq->last_h_load_update == now)
6339 			break;
6340 	}
6341 
6342 	if (!se) {
6343 		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6344 		cfs_rq->last_h_load_update = now;
6345 	}
6346 
6347 	while ((se = cfs_rq->h_load_next) != NULL) {
6348 		load = cfs_rq->h_load;
6349 		load = div64_ul(load * se->avg.load_avg,
6350 			cfs_rq_load_avg(cfs_rq) + 1);
6351 		cfs_rq = group_cfs_rq(se);
6352 		cfs_rq->h_load = load;
6353 		cfs_rq->last_h_load_update = now;
6354 	}
6355 }
6356 
6357 static unsigned long task_h_load(struct task_struct *p)
6358 {
6359 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
6360 
6361 	update_cfs_rq_h_load(cfs_rq);
6362 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6363 			cfs_rq_load_avg(cfs_rq) + 1);
6364 }
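
/*
 * Worked example with illustrative numbers: suppose the root cfs_rq has
 * load_avg = 2048 and the group's sched_entity on this cpu has
 * avg.load_avg = 512; update_cfs_rq_h_load() then gives the group's cfs_rq
 * an h_load of roughly 2048 * 512 / 2049 ~= 511.  A task with
 * avg.load_avg = 256 on that group cfs_rq (whose own load_avg is 1024)
 * reports task_h_load() ~= 256 * 511 / 1025 ~= 127, i.e. its share of the
 * group's share of the root's load.
 */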
6365 #else
6366 static inline void update_blocked_averages(int cpu)
6367 {
6368 	struct rq *rq = cpu_rq(cpu);
6369 	struct cfs_rq *cfs_rq = &rq->cfs;
6370 	unsigned long flags;
6371 
6372 	raw_spin_lock_irqsave(&rq->lock, flags);
6373 	update_rq_clock(rq);
6374 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
6375 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6376 }
6377 
6378 static unsigned long task_h_load(struct task_struct *p)
6379 {
6380 	return p->se.avg.load_avg;
6381 }
6382 #endif
6383 
6384 /********** Helpers for find_busiest_group ************************/
6385 
6386 enum group_type {
6387 	group_other = 0,
6388 	group_imbalanced,
6389 	group_overloaded,
6390 };
6391 
6392 /*
6393  * sg_lb_stats - stats of a sched_group required for load_balancing
6394  */
6395 struct sg_lb_stats {
6396 	unsigned long avg_load; /* Avg load across the CPUs of the group */
6397 	unsigned long group_load; /* Total load over the CPUs of the group */
6398 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6399 	unsigned long load_per_task;
6400 	unsigned long group_capacity;
6401 	unsigned long group_util; /* Total utilization of the group */
6402 	unsigned int sum_nr_running; /* Nr tasks running in the group */
6403 	unsigned int idle_cpus;
6404 	unsigned int group_weight;
6405 	enum group_type group_type;
6406 	int group_no_capacity;
6407 #ifdef CONFIG_NUMA_BALANCING
6408 	unsigned int nr_numa_running;
6409 	unsigned int nr_preferred_running;
6410 #endif
6411 };
6412 
6413 /*
6414  * sd_lb_stats - Structure to store the statistics of a sched_domain
6415  *		 during load balancing.
6416  */
6417 struct sd_lb_stats {
6418 	struct sched_group *busiest;	/* Busiest group in this sd */
6419 	struct sched_group *local;	/* Local group in this sd */
6420 	unsigned long total_load;	/* Total load of all groups in sd */
6421 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
6422 	unsigned long avg_load;	/* Average load across all groups in sd */
6423 
6424 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6425 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
6426 };
6427 
6428 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6429 {
6430 	/*
6431 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6432 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
6433 	 * We must however clear busiest_stat::avg_load because
6434 	 * update_sd_pick_busiest() reads this before assignment.
6435 	 */
6436 	*sds = (struct sd_lb_stats){
6437 		.busiest = NULL,
6438 		.local = NULL,
6439 		.total_load = 0UL,
6440 		.total_capacity = 0UL,
6441 		.busiest_stat = {
6442 			.avg_load = 0UL,
6443 			.sum_nr_running = 0,
6444 			.group_type = group_other,
6445 		},
6446 	};
6447 }
6448 
6449 /**
6450  * get_sd_load_idx - Obtain the load index for a given sched domain.
6451  * @sd: The sched_domain whose load_idx is to be obtained.
6452  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6453  *
6454  * Return: The load index.
6455  */
6456 static inline int get_sd_load_idx(struct sched_domain *sd,
6457 					enum cpu_idle_type idle)
6458 {
6459 	int load_idx;
6460 
6461 	switch (idle) {
6462 	case CPU_NOT_IDLE:
6463 		load_idx = sd->busy_idx;
6464 		break;
6465 
6466 	case CPU_NEWLY_IDLE:
6467 		load_idx = sd->newidle_idx;
6468 		break;
6469 	default:
6470 		load_idx = sd->idle_idx;
6471 		break;
6472 	}
6473 
6474 	return load_idx;
6475 }
6476 
6477 static unsigned long scale_rt_capacity(int cpu)
6478 {
6479 	struct rq *rq = cpu_rq(cpu);
6480 	u64 total, used, age_stamp, avg;
6481 	s64 delta;
6482 
6483 	/*
6484 	 * Since we're reading these variables without serialization make sure
6485 	 * Since we're reading these variables without serialization, make sure
6486 	 */
6487 	age_stamp = READ_ONCE(rq->age_stamp);
6488 	avg = READ_ONCE(rq->rt_avg);
6489 	delta = __rq_clock_broken(rq) - age_stamp;
6490 
6491 	if (unlikely(delta < 0))
6492 		delta = 0;
6493 
6494 	total = sched_avg_period() + delta;
6495 
6496 	used = div_u64(avg, total);
6497 
6498 	if (likely(used < SCHED_CAPACITY_SCALE))
6499 		return SCHED_CAPACITY_SCALE - used;
6500 
6501 	return 1;
6502 }
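
/*
 * Illustrative example: if RT/IRQ activity accounted for roughly a quarter
 * of the recent averaging window, avg/total comes out near
 * SCHED_CAPACITY_SCALE/4 = 256 and this function returns about 768, i.e.
 * roughly 75% of the CPU is left for CFS tasks.
 */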
6503 
6504 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6505 {
6506 	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6507 	struct sched_group *sdg = sd->groups;
6508 
6509 	cpu_rq(cpu)->cpu_capacity_orig = capacity;
6510 
6511 	capacity *= scale_rt_capacity(cpu);
6512 	capacity >>= SCHED_CAPACITY_SHIFT;
6513 
6514 	if (!capacity)
6515 		capacity = 1;
6516 
6517 	cpu_rq(cpu)->cpu_capacity = capacity;
6518 	sdg->sgc->capacity = capacity;
6519 }
6520 
6521 void update_group_capacity(struct sched_domain *sd, int cpu)
6522 {
6523 	struct sched_domain *child = sd->child;
6524 	struct sched_group *group, *sdg = sd->groups;
6525 	unsigned long capacity;
6526 	unsigned long interval;
6527 
6528 	interval = msecs_to_jiffies(sd->balance_interval);
6529 	interval = clamp(interval, 1UL, max_load_balance_interval);
6530 	sdg->sgc->next_update = jiffies + interval;
6531 
6532 	if (!child) {
6533 		update_cpu_capacity(sd, cpu);
6534 		return;
6535 	}
6536 
6537 	capacity = 0;
6538 
6539 	if (child->flags & SD_OVERLAP) {
6540 		/*
6541 		 * SD_OVERLAP domains cannot assume that child groups
6542 		 * span the current group.
6543 		 */
6544 
6545 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
6546 			struct sched_group_capacity *sgc;
6547 			struct rq *rq = cpu_rq(cpu);
6548 
6549 			/*
6550 			 * build_sched_domains() -> init_sched_groups_capacity()
6551 			 * gets here before we've attached the domains to the
6552 			 * runqueues.
6553 			 *
6554 			 * Use capacity_of(), which is set irrespective of domains
6555 			 * in update_cpu_capacity().
6556 			 *
6557 			 * This avoids capacity from being 0 and
6558 			 * causing divide-by-zero issues on boot.
6559 			 */
6560 			if (unlikely(!rq->sd)) {
6561 				capacity += capacity_of(cpu);
6562 				continue;
6563 			}
6564 
6565 			sgc = rq->sd->groups->sgc;
6566 			capacity += sgc->capacity;
6567 		}
6568 	} else  {
6569 		/*
6570 		 * !SD_OVERLAP domains can assume that child groups
6571 		 * span the current group.
6572 		 */
6573 
6574 		group = child->groups;
6575 		do {
6576 			capacity += group->sgc->capacity;
6577 			group = group->next;
6578 		} while (group != child->groups);
6579 	}
6580 
6581 	sdg->sgc->capacity = capacity;
6582 }
6583 
6584 /*
6585  * Check whether the capacity of the rq has been noticeably reduced by side
6586  * activity. The imbalance_pct is used for the threshold.
6587  * Return true if the capacity is reduced.
6588  */
6589 static inline int
6590 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6591 {
6592 	return ((rq->cpu_capacity * sd->imbalance_pct) <
6593 				(rq->cpu_capacity_orig * 100));
6594 }
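
/*
 * Illustrative example: with the common imbalance_pct of 125 and
 * cpu_capacity_orig = 1024, this reports reduced capacity once cpu_capacity
 * drops below 1024 * 100 / 125 ~= 819, i.e. once more than about 20% of the
 * CPU is consumed by side activity.
 */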
6595 
6596 /*
6597  * Group imbalance indicates (and tries to solve) the problem where balancing
6598  * groups is inadequate due to tsk_cpus_allowed() constraints.
6599  *
6600  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6601  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6602  * Something like:
6603  *
6604  * 	{ 0 1 2 3 } { 4 5 6 7 }
6605  * 	        *     * * *
6606  *
6607  * If we were to balance group-wise we'd place two tasks in the first group and
6608  * two tasks in the second group. Clearly this is undesired as it will overload
6609  * cpu 3 and leave one of the cpus in the second group unused.
6610  *
6611  * The current solution to this issue is detecting the skew in the first group
6612  * by noticing the lower domain failed to reach balance and had difficulty
6613  * moving tasks due to affinity constraints.
6614  *
6615  * When this is so detected, this group becomes a candidate for busiest; see
6616  * update_sd_pick_busiest(). And calculate_imbalance() and
6617  * find_busiest_group() avoid some of the usual balance conditions to allow it
6618  * to create an effective group imbalance.
6619  *
6620  * This is a somewhat tricky proposition since the next run might not find the
6621  * group imbalance and decide the groups need to be balanced again. A most
6622  * subtle and fragile situation.
6623  */
6624 
6625 static inline int sg_imbalanced(struct sched_group *group)
6626 {
6627 	return group->sgc->imbalance;
6628 }
6629 
6630 /*
6631  * group_has_capacity returns true if the group has spare capacity that could
6632  * be used by some tasks.
6633  * We consider that a group has spare capacity if the number of tasks is
6634  * smaller than the number of CPUs or if the utilization is lower than the
6635  * available capacity for CFS tasks.
6636  * For the latter, we use a threshold to stabilize the state, to take into
6637  * account the variance of the tasks' load and to return true only if the
6638  * available capacity is meaningful for the load balancer.
6639  * As an example, an available capacity of 1% can appear but it doesn't bring
6640  * any benefit to the load balancer.
6641  */
6642 static inline bool
6643 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6644 {
6645 	if (sgs->sum_nr_running < sgs->group_weight)
6646 		return true;
6647 
6648 	if ((sgs->group_capacity * 100) >
6649 			(sgs->group_util * env->sd->imbalance_pct))
6650 		return true;
6651 
6652 	return false;
6653 }
6654 
6655 /*
6656  *  group_is_overloaded returns true if the group has more tasks than it can
6657  *  handle.
6658  *  group_is_overloaded is not equivalent to !group_has_capacity because a
6659  *  group with exactly the right number of tasks has no more spare capacity
6660  *  but is not overloaded, so both group_has_capacity and group_is_overloaded
6661  *  return false.
6662  */
6663 static inline bool
6664 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6665 {
6666 	if (sgs->sum_nr_running <= sgs->group_weight)
6667 		return false;
6668 
6669 	if ((sgs->group_capacity * 100) <
6670 			(sgs->group_util * env->sd->imbalance_pct))
6671 		return true;
6672 
6673 	return false;
6674 }
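
/*
 * Illustrative example tying the two helpers above together: for a 2-CPU
 * group with group_capacity = 2048 and imbalance_pct = 125, the utilization
 * threshold is 2048 * 100 / 125 ~= 1638 (80%).  With one runnable task the
 * group always has capacity; with three tasks and group_util above ~1638 it
 * is overloaded; with exactly two tasks and utilization around the threshold
 * both helpers return false, which is the in-between state described above.
 */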
6675 
6676 static inline enum
6677 group_type group_classify(struct sched_group *group,
6678 			  struct sg_lb_stats *sgs)
6679 {
6680 	if (sgs->group_no_capacity)
6681 		return group_overloaded;
6682 
6683 	if (sg_imbalanced(group))
6684 		return group_imbalanced;
6685 
6686 	return group_other;
6687 }
6688 
6689 /**
6690  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6691  * @env: The load balancing environment.
6692  * @group: sched_group whose statistics are to be updated.
6693  * @load_idx: Load index of sched_domain of this_cpu for load calc.
6694  * @local_group: Does group contain this_cpu.
6695  * @sgs: variable to hold the statistics for this group.
6696  * @overload: Indicate more than one runnable task for any CPU.
6697  */
6698 static inline void update_sg_lb_stats(struct lb_env *env,
6699 			struct sched_group *group, int load_idx,
6700 			int local_group, struct sg_lb_stats *sgs,
6701 			bool *overload)
6702 {
6703 	unsigned long load;
6704 	int i, nr_running;
6705 
6706 	memset(sgs, 0, sizeof(*sgs));
6707 
6708 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6709 		struct rq *rq = cpu_rq(i);
6710 
6711 		/* Bias balancing toward cpus of our domain */
6712 		if (local_group)
6713 			load = target_load(i, load_idx);
6714 		else
6715 			load = source_load(i, load_idx);
6716 
6717 		sgs->group_load += load;
6718 		sgs->group_util += cpu_util(i);
6719 		sgs->sum_nr_running += rq->cfs.h_nr_running;
6720 
6721 		nr_running = rq->nr_running;
6722 		if (nr_running > 1)
6723 			*overload = true;
6724 
6725 #ifdef CONFIG_NUMA_BALANCING
6726 		sgs->nr_numa_running += rq->nr_numa_running;
6727 		sgs->nr_preferred_running += rq->nr_preferred_running;
6728 #endif
6729 		sgs->sum_weighted_load += weighted_cpuload(i);
6730 		/*
6731 		 * No need to call idle_cpu() if nr_running is not 0
6732 		 */
6733 		if (!nr_running && idle_cpu(i))
6734 			sgs->idle_cpus++;
6735 	}
6736 
6737 	/* Adjust by relative CPU capacity of the group */
6738 	sgs->group_capacity = group->sgc->capacity;
6739 	sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6740 
6741 	if (sgs->sum_nr_running)
6742 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6743 
6744 	sgs->group_weight = group->group_weight;
6745 
6746 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
6747 	sgs->group_type = group_classify(group, sgs);
6748 }
6749 
6750 /**
6751  * update_sd_pick_busiest - return 1 on busiest group
6752  * @env: The load balancing environment.
6753  * @sds: sched_domain statistics
6754  * @sg: sched_group candidate to be checked for being the busiest
6755  * @sgs: sched_group statistics
6756  *
6757  * Determine if @sg is a busier group than the previously selected
6758  * busiest group.
6759  *
6760  * Return: %true if @sg is a busier group than the previously selected
6761  * busiest group. %false otherwise.
6762  */
6763 static bool update_sd_pick_busiest(struct lb_env *env,
6764 				   struct sd_lb_stats *sds,
6765 				   struct sched_group *sg,
6766 				   struct sg_lb_stats *sgs)
6767 {
6768 	struct sg_lb_stats *busiest = &sds->busiest_stat;
6769 
6770 	if (sgs->group_type > busiest->group_type)
6771 		return true;
6772 
6773 	if (sgs->group_type < busiest->group_type)
6774 		return false;
6775 
6776 	if (sgs->avg_load <= busiest->avg_load)
6777 		return false;
6778 
6779 	/* This is the busiest node in its class. */
6780 	if (!(env->sd->flags & SD_ASYM_PACKING))
6781 		return true;
6782 
6783 	/* No ASYM_PACKING if target cpu is already busy */
6784 	if (env->idle == CPU_NOT_IDLE)
6785 		return true;
6786 	/*
6787 	 * ASYM_PACKING needs to move all the work to the lowest
6788 	 * numbered CPUs in the group, therefore mark all groups
6789 	 * higher than ourself as busy.
6790 	 * higher than ourselves as busy.
6791 	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6792 		if (!sds->busiest)
6793 			return true;
6794 
6795 		/* Prefer to move work from the highest possible cpu */
6796 		if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
6797 			return true;
6798 	}
6799 
6800 	return false;
6801 }
6802 
6803 #ifdef CONFIG_NUMA_BALANCING
6804 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6805 {
6806 	if (sgs->sum_nr_running > sgs->nr_numa_running)
6807 		return regular;
6808 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
6809 		return remote;
6810 	return all;
6811 }
6812 
6813 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6814 {
6815 	if (rq->nr_running > rq->nr_numa_running)
6816 		return regular;
6817 	if (rq->nr_running > rq->nr_preferred_running)
6818 		return remote;
6819 	return all;
6820 }
6821 #else
6822 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6823 {
6824 	return all;
6825 }
6826 
6827 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6828 {
6829 	return regular;
6830 }
6831 #endif /* CONFIG_NUMA_BALANCING */
6832 
6833 /**
6834  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
6835  * @env: The load balancing environment.
6836  * @sds: variable to hold the statistics for this sched_domain.
6837  */
6838 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6839 {
6840 	struct sched_domain *child = env->sd->child;
6841 	struct sched_group *sg = env->sd->groups;
6842 	struct sg_lb_stats tmp_sgs;
6843 	int load_idx, prefer_sibling = 0;
6844 	bool overload = false;
6845 
6846 	if (child && child->flags & SD_PREFER_SIBLING)
6847 		prefer_sibling = 1;
6848 
6849 	load_idx = get_sd_load_idx(env->sd, env->idle);
6850 
6851 	do {
6852 		struct sg_lb_stats *sgs = &tmp_sgs;
6853 		int local_group;
6854 
6855 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6856 		if (local_group) {
6857 			sds->local = sg;
6858 			sgs = &sds->local_stat;
6859 
6860 			if (env->idle != CPU_NEWLY_IDLE ||
6861 			    time_after_eq(jiffies, sg->sgc->next_update))
6862 				update_group_capacity(env->sd, env->dst_cpu);
6863 		}
6864 
6865 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6866 						&overload);
6867 
6868 		if (local_group)
6869 			goto next_group;
6870 
6871 		/*
6872 		 * In case the child domain prefers tasks go to siblings
6873 		 * In case the child domain prefers tasks to go to siblings
6874 		 * and move all the excess tasks away. We lower the capacity
6875 		 * of a group only if the local group has the capacity to fit
6876 		 * these excess tasks. The extra check prevents the case where
6877 		 * you always pull from the heaviest group when it is already
6878 		 * under-utilized (possible when a large-weight task outweighs
6879 		 * the rest of the tasks on the system).
6880 		 */
6881 		if (prefer_sibling && sds->local &&
6882 		    group_has_capacity(env, &sds->local_stat) &&
6883 		    (sgs->sum_nr_running > 1)) {
6884 			sgs->group_no_capacity = 1;
6885 			sgs->group_type = group_classify(sg, sgs);
6886 		}
6887 
6888 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6889 			sds->busiest = sg;
6890 			sds->busiest_stat = *sgs;
6891 		}
6892 
6893 next_group:
6894 		/* Now, start updating sd_lb_stats */
6895 		sds->total_load += sgs->group_load;
6896 		sds->total_capacity += sgs->group_capacity;
6897 
6898 		sg = sg->next;
6899 	} while (sg != env->sd->groups);
6900 
6901 	if (env->sd->flags & SD_NUMA)
6902 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6903 
6904 	if (!env->sd->parent) {
6905 		/* update overload indicator if we are at root domain */
6906 		if (env->dst_rq->rd->overload != overload)
6907 			env->dst_rq->rd->overload = overload;
6908 	}
6909 
6910 }
6911 
6912 /**
6913  * check_asym_packing - Check to see if the group is packed into the
6914  *			sched domain.
6915  *
6916  * This is primarily intended to be used at the sibling level.  Some
6917  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6918  * case of POWER7, it can move to lower SMT modes only when higher
6919  * threads are idle.  When in lower SMT modes, the threads will
6920  * perform better since they share less core resources.  Hence when we
6921  * have idle threads, we want them to be the higher ones.
6922  *
6923  * This packing function is run on idle threads.  It checks to see if
6924  * the busiest CPU in this domain (core in the P7 case) has a higher
6925  * CPU number than the packing function is being run on.  Here we are
6926  * assuming lower CPU number will be equivalent to lower a SMT thread
6927  * number.
6928  *
6929  * Return: 1 when packing is required and a task should be moved to
6930  * this CPU.  The amount of the imbalance is returned in *imbalance.
6931  *
6932  * @env: The load balancing environment.
6933  * @sds: Statistics of the sched_domain which is to be packed
6934  */
6935 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6936 {
6937 	int busiest_cpu;
6938 
6939 	if (!(env->sd->flags & SD_ASYM_PACKING))
6940 		return 0;
6941 
6942 	if (env->idle == CPU_NOT_IDLE)
6943 		return 0;
6944 
6945 	if (!sds->busiest)
6946 		return 0;
6947 
6948 	busiest_cpu = group_first_cpu(sds->busiest);
6949 	if (env->dst_cpu > busiest_cpu)
6950 		return 0;
6951 
6952 	env->imbalance = DIV_ROUND_CLOSEST(
6953 		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6954 		SCHED_CAPACITY_SCALE);
6955 
6956 	return 1;
6957 }
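
/*
 * Illustrative scenario: on an SMT sibling domain with SD_ASYM_PACKING
 * (e.g. POWER7-style), if this runs on an idle dst_cpu 0 while the busiest
 * group's first CPU is 1, it returns 1 and sets env->imbalance to roughly
 * the busiest group's load, so the work gets packed onto the lower-numbered
 * thread.
 */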
6958 
6959 /**
6960  * fix_small_imbalance - Calculate the minor imbalance that exists
6961  *			amongst the groups of a sched_domain, during
6962  *			load balancing.
6963  * @env: The load balancing environment.
6964  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6965  */
6966 static inline
6967 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6968 {
6969 	unsigned long tmp, capa_now = 0, capa_move = 0;
6970 	unsigned int imbn = 2;
6971 	unsigned long scaled_busy_load_per_task;
6972 	struct sg_lb_stats *local, *busiest;
6973 
6974 	local = &sds->local_stat;
6975 	busiest = &sds->busiest_stat;
6976 
6977 	if (!local->sum_nr_running)
6978 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6979 	else if (busiest->load_per_task > local->load_per_task)
6980 		imbn = 1;
6981 
6982 	scaled_busy_load_per_task =
6983 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6984 		busiest->group_capacity;
6985 
6986 	if (busiest->avg_load + scaled_busy_load_per_task >=
6987 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
6988 		env->imbalance = busiest->load_per_task;
6989 		return;
6990 	}
6991 
6992 	/*
6993 	 * OK, we don't have enough imbalance to justify moving tasks,
6994 	 * however we may be able to increase total CPU capacity used by
6995 	 * moving them.
6996 	 */
6997 
6998 	capa_now += busiest->group_capacity *
6999 			min(busiest->load_per_task, busiest->avg_load);
7000 	capa_now += local->group_capacity *
7001 			min(local->load_per_task, local->avg_load);
7002 	capa_now /= SCHED_CAPACITY_SCALE;
7003 
7004 	/* Amount of load we'd subtract */
7005 	if (busiest->avg_load > scaled_busy_load_per_task) {
7006 		capa_move += busiest->group_capacity *
7007 			    min(busiest->load_per_task,
7008 				busiest->avg_load - scaled_busy_load_per_task);
7009 	}
7010 
7011 	/* Amount of load we'd add */
7012 	if (busiest->avg_load * busiest->group_capacity <
7013 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
7014 		tmp = (busiest->avg_load * busiest->group_capacity) /
7015 		      local->group_capacity;
7016 	} else {
7017 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7018 		      local->group_capacity;
7019 	}
7020 	capa_move += local->group_capacity *
7021 		    min(local->load_per_task, local->avg_load + tmp);
7022 	capa_move /= SCHED_CAPACITY_SCALE;
7023 
7024 	/* Move if we gain throughput */
7025 	if (capa_move > capa_now)
7026 		env->imbalance = busiest->load_per_task;
7027 }
7028 
7029 /**
7030  * calculate_imbalance - Calculate the amount of imbalance present within the
7031  *			 groups of a given sched_domain during load balance.
7032  * @env: load balance environment
7033  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
7034  */
7035 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7036 {
7037 	unsigned long max_pull, load_above_capacity = ~0UL;
7038 	struct sg_lb_stats *local, *busiest;
7039 
7040 	local = &sds->local_stat;
7041 	busiest = &sds->busiest_stat;
7042 
7043 	if (busiest->group_type == group_imbalanced) {
7044 		/*
7045 		 * In the group_imb case we cannot rely on group-wide averages
7046 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
7047 		 */
7048 		busiest->load_per_task =
7049 			min(busiest->load_per_task, sds->avg_load);
7050 	}
7051 
7052 	/*
7053 	 * Avg load of busiest sg can be less and avg load of local sg can
7054 	 * be greater than avg load across all sgs of sd because avg load
7055 	 * factors in sg capacity and sgs with smaller group_type are
7056 	 * skipped when updating the busiest sg:
7057 	 */
7058 	if (busiest->avg_load <= sds->avg_load ||
7059 	    local->avg_load >= sds->avg_load) {
7060 		env->imbalance = 0;
7061 		return fix_small_imbalance(env, sds);
7062 	}
7063 
7064 	/*
7065 	 * If there aren't any idle cpus, avoid creating some.
7066 	 */
7067 	if (busiest->group_type == group_overloaded &&
7068 	    local->group_type   == group_overloaded) {
7069 		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7070 		if (load_above_capacity > busiest->group_capacity) {
7071 			load_above_capacity -= busiest->group_capacity;
7072 			load_above_capacity *= NICE_0_LOAD;
7073 			load_above_capacity /= busiest->group_capacity;
7074 		} else
7075 			load_above_capacity = ~0UL;
7076 	}
7077 
7078 	/*
7079 	 * We're trying to get all the cpus to the average_load, so we don't
7080 	 * want to push ourselves above the average load, nor do we wish to
7081 	 * reduce the max loaded cpu below the average load. At the same time,
7082 	 * we also don't want to reduce the group load below the group
7083 	 * capacity. Thus we look for the minimum possible imbalance.
7084 	 */
7085 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
7086 
7087 	/* How much load to actually move to equalise the imbalance */
7088 	env->imbalance = min(
7089 		max_pull * busiest->group_capacity,
7090 		(sds->avg_load - local->avg_load) * local->group_capacity
7091 	) / SCHED_CAPACITY_SCALE;
7092 
7093 	/*
7094 	 * If *imbalance is less than the average load per runnable task,
7095 	 * there is no guarantee that any tasks will be moved, so consider
7096 	 * bumping its value to force at least one task to be
7097 	 * moved.
7098 	 */
7099 	if (env->imbalance < busiest->load_per_task)
7100 		return fix_small_imbalance(env, sds);
7101 }
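
/*
 * Worked example with illustrative numbers: busiest->avg_load = 1536,
 * local->avg_load = 512, sds->avg_load = 1024, both group capacities 1024,
 * and load_above_capacity left at ~0UL (the groups are not both
 * overloaded).  Then max_pull = 1536 - 1024 = 512 and env->imbalance =
 * min(512 * 1024, 512 * 1024) / 1024 = 512: enough load is moved to bring
 * both groups to the domain average without pushing the local group above
 * it.
 */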
7102 
7103 /******* find_busiest_group() helpers end here *********************/
7104 
7105 /**
7106  * find_busiest_group - Returns the busiest group within the sched_domain
7107  * if there is an imbalance.
7108  *
7109  * Also calculates the amount of weighted load which should be moved
7110  * to restore balance.
7111  *
7112  * @env: The load balancing environment.
7113  *
7114  * Return: The busiest group if an imbalance exists, NULL otherwise.
7115  */
7116 static struct sched_group *find_busiest_group(struct lb_env *env)
7117 {
7118 	struct sg_lb_stats *local, *busiest;
7119 	struct sd_lb_stats sds;
7120 
7121 	init_sd_lb_stats(&sds);
7122 
7123 	/*
7124 	 * Compute the various statistics relevant for load balancing at
7125 	 * this level.
7126 	 */
7127 	update_sd_lb_stats(env, &sds);
7128 	local = &sds.local_stat;
7129 	busiest = &sds.busiest_stat;
7130 
7131 	/* ASYM feature bypasses nice load balance check */
7132 	if (check_asym_packing(env, &sds))
7133 		return sds.busiest;
7134 
7135 	/* There is no busy sibling group to pull tasks from */
7136 	if (!sds.busiest || busiest->sum_nr_running == 0)
7137 		goto out_balanced;
7138 
7139 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
7140 						/ sds.total_capacity;
7141 
7142 	/*
7143 	 * If the busiest group is imbalanced the below checks don't
7144 	 * work because they assume all things are equal, which typically
7145 	 * isn't true due to cpus_allowed constraints and the like.
7146 	 */
7147 	if (busiest->group_type == group_imbalanced)
7148 		goto force_balance;
7149 
7150 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
7151 	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
7152 	    busiest->group_no_capacity)
7153 		goto force_balance;
7154 
7155 	/*
7156 	 * If the local group is busier than the selected busiest group
7157 	 * don't try and pull any tasks.
7158 	 */
7159 	if (local->avg_load >= busiest->avg_load)
7160 		goto out_balanced;
7161 
7162 	/*
7163 	 * Don't pull any tasks if this group is already above the domain
7164 	 * average load.
7165 	 */
7166 	if (local->avg_load >= sds.avg_load)
7167 		goto out_balanced;
7168 
7169 	if (env->idle == CPU_IDLE) {
7170 		/*
7171 		 * This cpu is idle. If the busiest group is not overloaded
7172 		 * and there is no imbalance between this and busiest group
7173 		 * wrt idle cpus, it is balanced. The imbalance only becomes
7174 		 * significant if the diff is greater than 1; otherwise we
7175 		 * might end up just moving the imbalance to another group.
7176 		 */
7177 		if ((busiest->group_type != group_overloaded) &&
7178 				(local->idle_cpus <= (busiest->idle_cpus + 1)))
7179 			goto out_balanced;
7180 	} else {
7181 		/*
7182 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7183 		 * imbalance_pct to be conservative.
7184 		 */
7185 		if (100 * busiest->avg_load <=
7186 				env->sd->imbalance_pct * local->avg_load)
7187 			goto out_balanced;
7188 	}
7189 
7190 force_balance:
7191 	/* Looks like there is an imbalance. Compute it */
7192 	calculate_imbalance(env, &sds);
7193 	return sds.busiest;
7194 
7195 out_balanced:
7196 	env->imbalance = 0;
7197 	return NULL;
7198 }
7199 
7200 /*
7201  * find_busiest_queue - find the busiest runqueue among the cpus in group.
7202  */
7203 static struct rq *find_busiest_queue(struct lb_env *env,
7204 				     struct sched_group *group)
7205 {
7206 	struct rq *busiest = NULL, *rq;
7207 	unsigned long busiest_load = 0, busiest_capacity = 1;
7208 	int i;
7209 
7210 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
7211 		unsigned long capacity, wl;
7212 		enum fbq_type rt;
7213 
7214 		rq = cpu_rq(i);
7215 		rt = fbq_classify_rq(rq);
7216 
7217 		/*
7218 		 * We classify groups/runqueues into three groups:
7219 		 *  - regular: there are !numa tasks
7220 		 *  - remote:  there are numa tasks that run on the 'wrong' node
7221 		 *  - all:     there is no distinction
7222 		 *
7223 		 * In order to avoid migrating ideally placed numa tasks,
7224 		 * ignore them when there are better options.
7225 		 *
7226 		 * If we ignore the actual busiest queue to migrate another
7227 		 * task, the next balance pass can still reduce the busiest
7228 		 * queue by moving tasks around inside the node.
7229 		 *
7230 		 * If we cannot move enough load due to this classification
7231 		 * the next pass will adjust the group classification and
7232 		 * allow migration of more tasks.
7233 		 *
7234 		 * Both cases only affect the total convergence complexity.
7235 		 */
7236 		if (rt > env->fbq_type)
7237 			continue;
7238 
7239 		capacity = capacity_of(i);
7240 
7241 		wl = weighted_cpuload(i);
7242 
7243 		/*
7244 		 * When comparing with imbalance, use weighted_cpuload()
7245 		 * which is not scaled with the cpu capacity.
7246 		 */
7247 
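		/*
		 * A single task whose load already exceeds the imbalance
		 * cannot be split, so skip such an rq unless its capacity is
		 * noticeably reduced by rt/irq pressure.
		 */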
7248 		if (rq->nr_running == 1 && wl > env->imbalance &&
7249 		    !check_cpu_capacity(rq, env->sd))
7250 			continue;
7251 
7252 		/*
7253 		 * For the load comparisons with the other cpu's, consider
7254 		 * the weighted_cpuload() scaled with the cpu capacity, so
7255 		 * that the load can be moved away from the cpu that is
7256 		 * potentially running at a lower capacity.
7257 		 *
7258 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
7259 		 * multiplication to rid ourselves of the division works out
7260 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
7261 		 * our previous maximum.
7262 		 * our previous maximum.
		 */
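		/*
		 * For instance (illustrative values): wl_i = 768 on
		 * capacity_i = 512 beats a previous best of wl_j = 1024 on
		 * capacity_j = 1024, since 768 * 1024 > 1024 * 512.
		 */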
7263 		if (wl * busiest_capacity > busiest_load * capacity) {
7264 			busiest_load = wl;
7265 			busiest_capacity = capacity;
7266 			busiest = rq;
7267 		}
7268 	}
7269 
7270 	return busiest;
7271 }
7272 
7273 /*
7274  * Max backoff if we encounter pinned tasks. The value is pretty arbitrary;
7275  * it only matters that it is large enough.
7276  */
7277 #define MAX_PINNED_INTERVAL	512
7278 
7279 /* Working cpumask for load_balance and load_balance_newidle. */
7280 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7281 
7282 static int need_active_balance(struct lb_env *env)
7283 {
7284 	struct sched_domain *sd = env->sd;
7285 
7286 	if (env->idle == CPU_NEWLY_IDLE) {
7287 
7288 		/*
7289 		 * ASYM_PACKING needs to force migrate tasks from busy but
7290 		 * higher numbered CPUs in order to pack all tasks in the
7291 		 * lowest numbered CPUs.
7292 		 */
7293 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7294 			return 1;
7295 	}
7296 
7297 	/*
7298 	 * The dst_cpu is idle and the src_cpu has only 1 CFS task.
7299 	 * It's worth migrating the task if the src_cpu's capacity is reduced
7300 	 * because of other sched_class tasks or IRQs, while more capacity
7301 	 * stays available on dst_cpu.
7302 	 */
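	/*
	 * E.g. with an imbalance_pct of 117 (illustrative; the actual value
	 * is per-domain), the pull happens only when
	 * capacity_of(src) * 117 < capacity_of(dst) * 100, i.e. when the
	 * source has lost roughly 15% of its capacity relative to dst.
	 */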
7303 	if ((env->idle != CPU_NOT_IDLE) &&
7304 	    (env->src_rq->cfs.h_nr_running == 1)) {
7305 		if ((check_cpu_capacity(env->src_rq, sd)) &&
7306 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
7307 			return 1;
7308 	}
7309 
7310 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7311 }
7312 
7313 static int active_load_balance_cpu_stop(void *data);
7314 
7315 static int should_we_balance(struct lb_env *env)
7316 {
7317 	struct sched_group *sg = env->sd->groups;
7318 	struct cpumask *sg_cpus, *sg_mask;
7319 	int cpu, balance_cpu = -1;
7320 
7321 	/*
7322 	 * In the newly idle case, we will allow all the cpus
7323 	 * to do the newly idle load balance.
7324 	 */
7325 	if (env->idle == CPU_NEWLY_IDLE)
7326 		return 1;
7327 
7328 	sg_cpus = sched_group_cpus(sg);
7329 	sg_mask = sched_group_mask(sg);
7330 	/* Try to find first idle cpu */
7331 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
7332 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
7333 			continue;
7334 
7335 		balance_cpu = cpu;
7336 		break;
7337 	}
7338 
7339 	if (balance_cpu == -1)
7340 		balance_cpu = group_balance_cpu(sg);
7341 
7342 	/*
7343 	 * The first idle cpu, or else the first (busiest) cpu in this sched
7344 	 * group, is eligible for load balancing at this and above domains.
7345 	 */
7346 	return balance_cpu == env->dst_cpu;
7347 }
7348 
7349 /*
7350  * Check this_cpu to ensure it is balanced within domain. Attempt to move
7351  * tasks if there is an imbalance.
7352  */
7353 static int load_balance(int this_cpu, struct rq *this_rq,
7354 			struct sched_domain *sd, enum cpu_idle_type idle,
7355 			int *continue_balancing)
7356 {
7357 	int ld_moved, cur_ld_moved, active_balance = 0;
7358 	struct sched_domain *sd_parent = sd->parent;
7359 	struct sched_group *group;
7360 	struct rq *busiest;
7361 	unsigned long flags;
7362 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
7363 
7364 	struct lb_env env = {
7365 		.sd		= sd,
7366 		.dst_cpu	= this_cpu,
7367 		.dst_rq		= this_rq,
7368 		.dst_grpmask    = sched_group_cpus(sd->groups),
7369 		.idle		= idle,
7370 		.loop_break	= sched_nr_migrate_break,
7371 		.cpus		= cpus,
7372 		.fbq_type	= all,
7373 		.tasks		= LIST_HEAD_INIT(env.tasks),
7374 	};
7375 
7376 	/*
7377 	 * For NEWLY_IDLE load_balancing, we don't need to consider
7378 	 * other cpus in our group
7379 	 */
7380 	if (idle == CPU_NEWLY_IDLE)
7381 		env.dst_grpmask = NULL;
7382 
7383 	cpumask_copy(cpus, cpu_active_mask);
7384 
7385 	schedstat_inc(sd, lb_count[idle]);
7386 
7387 redo:
7388 	if (!should_we_balance(&env)) {
7389 		*continue_balancing = 0;
7390 		goto out_balanced;
7391 	}
7392 
7393 	group = find_busiest_group(&env);
7394 	if (!group) {
7395 		schedstat_inc(sd, lb_nobusyg[idle]);
7396 		goto out_balanced;
7397 	}
7398 
7399 	busiest = find_busiest_queue(&env, group);
7400 	if (!busiest) {
7401 		schedstat_inc(sd, lb_nobusyq[idle]);
7402 		goto out_balanced;
7403 	}
7404 
7405 	BUG_ON(busiest == env.dst_rq);
7406 
7407 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7408 
7409 	env.src_cpu = busiest->cpu;
7410 	env.src_rq = busiest;
7411 
7412 	ld_moved = 0;
7413 	if (busiest->nr_running > 1) {
7414 		/*
7415 		 * Attempt to move tasks. If find_busiest_group has found
7416 		 * an imbalance but busiest->nr_running <= 1, the group is
7417 		 * still unbalanced. ld_moved simply stays zero, so it is
7418 		 * correctly treated as an imbalance.
7419 		 */
7420 		env.flags |= LBF_ALL_PINNED;
7421 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
7422 
7423 more_balance:
7424 		raw_spin_lock_irqsave(&busiest->lock, flags);
7425 
7426 		/*
7427 		 * cur_ld_moved - load moved in current iteration
7428 		 * ld_moved     - cumulative load moved across iterations
7429 		 */
7430 		cur_ld_moved = detach_tasks(&env);
7431 
7432 		/*
7433 		 * We've detached some tasks from busiest_rq. Every detached
7434 		 * task is marked "TASK_ON_RQ_MIGRATING", so we can safely
7435 		 * unlock busiest->lock and be sure that nobody can
7436 		 * manipulate the tasks in parallel.
7437 		 * See task_rq_lock() family for the details.
7438 		 */
7439 
7440 		raw_spin_unlock(&busiest->lock);
7441 
7442 		if (cur_ld_moved) {
7443 			attach_tasks(&env);
7444 			ld_moved += cur_ld_moved;
7445 		}
7446 
7447 		local_irq_restore(flags);
7448 
7449 		if (env.flags & LBF_NEED_BREAK) {
7450 			env.flags &= ~LBF_NEED_BREAK;
7451 			goto more_balance;
7452 		}
7453 
7454 		/*
7455 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7456 		 * us and move them to an alternate dst_cpu in our sched_group
7457 		 * where they can run. The upper limit on how many times we
7458 		 * iterate on same src_cpu is dependent on number of cpus in our
7459 		 * sched_group.
7460 		 *
7461 		 * This changes load balance semantics a bit on who can move
7462 		 * load to a given_cpu. In addition to the given_cpu itself
7463 		 * (or an ilb_cpu acting on its behalf where given_cpu is
7464 		 * nohz-idle), we now have balance_cpu in a position to move
7465 		 * load to given_cpu. In rare situations, this may cause
7466 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7467 		 * _independently_ and at the _same_ time to move some load to
7468 		 * given_cpu), causing excess load to be moved to given_cpu.
7469 		 * This however should not happen often in practice, and
7470 		 * moreover subsequent load balance cycles should correct the
7471 		 * excess load moved.
7472 		 */
7473 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7474 
7475 			/* Prevent re-selecting dst_cpu via env's cpus */
7476 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
7477 
7478 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
7479 			env.dst_cpu	 = env.new_dst_cpu;
7480 			env.flags	&= ~LBF_DST_PINNED;
7481 			env.loop	 = 0;
7482 			env.loop_break	 = sched_nr_migrate_break;
7483 
7484 			/*
7485 			 * Go back to "more_balance" rather than "redo" since we
7486 			 * need to continue with same src_cpu.
7487 			 */
7488 			goto more_balance;
7489 		}
7490 
7491 		/*
7492 		 * We failed to reach balance because of affinity.
7493 		 */
7494 		if (sd_parent) {
7495 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7496 
7497 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7498 				*group_imbalance = 1;
7499 		}
7500 
7501 		/* All tasks on this runqueue were pinned by CPU affinity */
7502 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
7503 			cpumask_clear_cpu(cpu_of(busiest), cpus);
7504 			if (!cpumask_empty(cpus)) {
7505 				env.loop = 0;
7506 				env.loop_break = sched_nr_migrate_break;
7507 				goto redo;
7508 			}
7509 			goto out_all_pinned;
7510 		}
7511 	}
7512 
7513 	if (!ld_moved) {
7514 		schedstat_inc(sd, lb_failed[idle]);
7515 		/*
7516 		 * Increment the failure counter only on periodic balance.
7517 		 * We do not want newidle balance, which can be very
7518 		 * frequent, to pollute the failure counter and cause
7519 		 * excessive cache_hot migrations and active balances.
7520 		 */
7521 		if (idle != CPU_NEWLY_IDLE)
7522 			sd->nr_balance_failed++;
7523 
7524 		if (need_active_balance(&env)) {
7525 			raw_spin_lock_irqsave(&busiest->lock, flags);
7526 
7527 			/* don't kick the active_load_balance_cpu_stop
7528 			 * if the curr task on the busiest cpu can't be
7529 			 * moved to this_cpu
7530 			 */
7531 			if (!cpumask_test_cpu(this_cpu,
7532 					tsk_cpus_allowed(busiest->curr))) {
7533 				raw_spin_unlock_irqrestore(&busiest->lock,
7534 							    flags);
7535 				env.flags |= LBF_ALL_PINNED;
7536 				goto out_one_pinned;
7537 			}
7538 
7539 			/*
7540 			 * ->active_balance synchronizes accesses to
7541 			 * ->active_balance_work.  Once set, it's cleared
7542 			 * only after active load balance is finished.
7543 			 */
7544 			if (!busiest->active_balance) {
7545 				busiest->active_balance = 1;
7546 				busiest->push_cpu = this_cpu;
7547 				active_balance = 1;
7548 			}
7549 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
7550 
7551 			if (active_balance) {
7552 				stop_one_cpu_nowait(cpu_of(busiest),
7553 					active_load_balance_cpu_stop, busiest,
7554 					&busiest->active_balance_work);
7555 			}
7556 
7557 			/* We've kicked active balancing, force task migration. */
7558 			sd->nr_balance_failed = sd->cache_nice_tries+1;
7559 		}
7560 	} else
7561 		sd->nr_balance_failed = 0;
7562 
7563 	if (likely(!active_balance)) {
7564 		/* We were unbalanced, so reset the balancing interval */
7565 		sd->balance_interval = sd->min_interval;
7566 	} else {
7567 		/*
7568 		 * If we've begun active balancing, start to back off. This
7569 		 * case may not be covered by the all_pinned logic if there
7570 		 * is only 1 task on the busy runqueue (because we don't call
7571 		 * detach_tasks).
7572 		 */
7573 		if (sd->balance_interval < sd->max_interval)
7574 			sd->balance_interval *= 2;
7575 	}
7576 
7577 	goto out;
7578 
7579 out_balanced:
7580 	/*
7581 	 * We reach balance although we may have faced some affinity
7582 	 * constraints. Clear the imbalance flag if it was set.
7583 	 */
7584 	if (sd_parent) {
7585 		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7586 
7587 		if (*group_imbalance)
7588 			*group_imbalance = 0;
7589 	}
7590 
7591 out_all_pinned:
7592 	/*
7593 	 * We reach balance because all tasks are pinned at this level so
7594 	 * we can't migrate them. Leave the imbalance flag set so the parent
7595 	 * level can try to migrate them.
7596 	 */
7597 	schedstat_inc(sd, lb_balanced[idle]);
7598 
7599 	sd->nr_balance_failed = 0;
7600 
7601 out_one_pinned:
7602 	/* tune up the balancing interval */
7603 	if (((env.flags & LBF_ALL_PINNED) &&
7604 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
7605 			(sd->balance_interval < sd->max_interval))
7606 		sd->balance_interval *= 2;
7607 
7608 	ld_moved = 0;
7609 out:
7610 	return ld_moved;
7611 }
7612 
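/*
 * Illustrative example (assumed values, not taken from this file): with a
 * balance_interval of 8 ms and a busy_factor of 32, a busy cpu ends up
 * rebalancing roughly every 256 ms; the result is then converted to jiffies
 * and clamped to [1, max_load_balance_interval].
 */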
7613 static inline unsigned long
7614 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7615 {
7616 	unsigned long interval = sd->balance_interval;
7617 
7618 	if (cpu_busy)
7619 		interval *= sd->busy_factor;
7620 
7621 	/* scale ms to jiffies */
7622 	interval = msecs_to_jiffies(interval);
7623 	interval = clamp(interval, 1UL, max_load_balance_interval);
7624 
7625 	return interval;
7626 }
7627 
7628 static inline void
7629 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7630 {
7631 	unsigned long interval, next;
7632 
7633 	interval = get_sd_balance_interval(sd, cpu_busy);
7634 	next = sd->last_balance + interval;
7635 
7636 	if (time_after(*next_balance, next))
7637 		*next_balance = next;
7638 }
7639 
7640 /*
7641  * idle_balance is called by schedule() if this_cpu is about to become
7642  * idle. Attempts to pull tasks from other CPUs.
7643  */
7644 static int idle_balance(struct rq *this_rq)
7645 {
7646 	unsigned long next_balance = jiffies + HZ;
7647 	int this_cpu = this_rq->cpu;
7648 	struct sched_domain *sd;
7649 	int pulled_task = 0;
7650 	u64 curr_cost = 0;
7651 
7652 	/*
7653 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
7654 	 * measure the duration of idle_balance() as idle time.
7655 	 */
7656 	this_rq->idle_stamp = rq_clock(this_rq);
7657 
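	/*
	 * Bail-out note: if the rq's recent average idle period is shorter
	 * than sysctl_sched_migration_cost, or the root domain is not
	 * overloaded, a newidle balance is unlikely to pay for itself, so
	 * only the next_balance bookkeeping is done below.
	 */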
7658 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7659 	    !this_rq->rd->overload) {
7660 		rcu_read_lock();
7661 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
7662 		if (sd)
7663 			update_next_balance(sd, 0, &next_balance);
7664 		rcu_read_unlock();
7665 
7666 		goto out;
7667 	}
7668 
7669 	raw_spin_unlock(&this_rq->lock);
7670 
7671 	update_blocked_averages(this_cpu);
7672 	rcu_read_lock();
7673 	for_each_domain(this_cpu, sd) {
7674 		int continue_balancing = 1;
7675 		u64 t0, domain_cost;
7676 
7677 		if (!(sd->flags & SD_LOAD_BALANCE))
7678 			continue;
7679 
7680 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7681 			update_next_balance(sd, 0, &next_balance);
7682 			break;
7683 		}
7684 
7685 		if (sd->flags & SD_BALANCE_NEWIDLE) {
7686 			t0 = sched_clock_cpu(this_cpu);
7687 
7688 			pulled_task = load_balance(this_cpu, this_rq,
7689 						   sd, CPU_NEWLY_IDLE,
7690 						   &continue_balancing);
7691 
7692 			domain_cost = sched_clock_cpu(this_cpu) - t0;
7693 			if (domain_cost > sd->max_newidle_lb_cost)
7694 				sd->max_newidle_lb_cost = domain_cost;
7695 
7696 			curr_cost += domain_cost;
7697 		}
7698 
7699 		update_next_balance(sd, 0, &next_balance);
7700 
7701 		/*
7702 		 * Stop searching for tasks to pull if there are
7703 		 * now runnable tasks on this rq.
7704 		 */
7705 		if (pulled_task || this_rq->nr_running > 0)
7706 			break;
7707 	}
7708 	rcu_read_unlock();
7709 
7710 	raw_spin_lock(&this_rq->lock);
7711 
7712 	if (curr_cost > this_rq->max_idle_balance_cost)
7713 		this_rq->max_idle_balance_cost = curr_cost;
7714 
7715 	/*
7716 	 * While browsing the domains we released the rq lock, so a task could
7717 	 * have been enqueued in the meantime. Since we're not going idle,
7718 	 * pretend we pulled a task.
7719 	 */
7720 	if (this_rq->cfs.h_nr_running && !pulled_task)
7721 		pulled_task = 1;
7722 
7723 out:
7724 	/* Move the next balance forward */
7725 	if (time_after(this_rq->next_balance, next_balance))
7726 		this_rq->next_balance = next_balance;
7727 
7728 	/* Is there a task of a high priority class? */
7729 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7730 		pulled_task = -1;
7731 
7732 	if (pulled_task)
7733 		this_rq->idle_stamp = 0;
7734 
7735 	return pulled_task;
7736 }
7737 
7738 /*
7739  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7740  * running tasks off the busiest CPU onto idle CPUs. It requires at
7741  * least 1 task to be running on each physical CPU where possible, and
7742  * avoids physical / logical imbalances.
7743  */
7744 static int active_load_balance_cpu_stop(void *data)
7745 {
7746 	struct rq *busiest_rq = data;
7747 	int busiest_cpu = cpu_of(busiest_rq);
7748 	int target_cpu = busiest_rq->push_cpu;
7749 	struct rq *target_rq = cpu_rq(target_cpu);
7750 	struct sched_domain *sd;
7751 	struct task_struct *p = NULL;
7752 
7753 	raw_spin_lock_irq(&busiest_rq->lock);
7754 
7755 	/* make sure the requested cpu hasn't gone down in the meantime */
7756 	if (unlikely(busiest_cpu != smp_processor_id() ||
7757 		     !busiest_rq->active_balance))
7758 		goto out_unlock;
7759 
7760 	/* Is there any task to move? */
7761 	if (busiest_rq->nr_running <= 1)
7762 		goto out_unlock;
7763 
7764 	/*
7765 	 * This condition is "impossible"; if it occurs
7766 	 * we need to fix it. Originally reported by
7767 	 * Bjorn Helgaas on a 128-cpu setup.
7768 	 */
7769 	BUG_ON(busiest_rq == target_rq);
7770 
7771 	/* Search for an sd spanning us and the target CPU. */
7772 	rcu_read_lock();
7773 	for_each_domain(target_cpu, sd) {
7774 		if ((sd->flags & SD_LOAD_BALANCE) &&
7775 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7776 				break;
7777 	}
7778 
7779 	if (likely(sd)) {
7780 		struct lb_env env = {
7781 			.sd		= sd,
7782 			.dst_cpu	= target_cpu,
7783 			.dst_rq		= target_rq,
7784 			.src_cpu	= busiest_rq->cpu,
7785 			.src_rq		= busiest_rq,
7786 			.idle		= CPU_IDLE,
7787 		};
7788 
7789 		schedstat_inc(sd, alb_count);
7790 
7791 		p = detach_one_task(&env);
7792 		if (p) {
7793 			schedstat_inc(sd, alb_pushed);
7794 			/* Active balancing done, reset the failure counter. */
7795 			sd->nr_balance_failed = 0;
7796 		} else {
7797 			schedstat_inc(sd, alb_failed);
7798 		}
7799 	}
7800 	rcu_read_unlock();
7801 out_unlock:
7802 	busiest_rq->active_balance = 0;
7803 	raw_spin_unlock(&busiest_rq->lock);
7804 
7805 	if (p)
7806 		attach_one_task(target_rq, p);
7807 
7808 	local_irq_enable();
7809 
7810 	return 0;
7811 }
7812 
7813 static inline int on_null_domain(struct rq *rq)
7814 {
7815 	return unlikely(!rcu_dereference_sched(rq->sd));
7816 }
7817 
7818 #ifdef CONFIG_NO_HZ_COMMON
7819 /*
7820  * idle load balancing details
7821  * - When one of the busy CPUs notices that idle rebalancing may be
7822  *   needed, it will kick the idle load balancer, which then does idle
7823  *   load balancing for all the idle CPUs.
7824  */
7825 static struct {
7826 	cpumask_var_t idle_cpus_mask;
7827 	atomic_t nr_cpus;
7828 	unsigned long next_balance;     /* in jiffy units */
7829 } nohz ____cacheline_aligned;
7830 
7831 static inline int find_new_ilb(void)
7832 {
7833 	int ilb = cpumask_first(nohz.idle_cpus_mask);
7834 
7835 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
7836 		return ilb;
7837 
7838 	return nr_cpu_ids;
7839 }
7840 
7841 /*
7842  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7843  * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
7844  * CPU (if there is one).
7845  */
7846 static void nohz_balancer_kick(void)
7847 {
7848 	int ilb_cpu;
7849 
7850 	nohz.next_balance++;
7851 
7852 	ilb_cpu = find_new_ilb();
7853 
7854 	if (ilb_cpu >= nr_cpu_ids)
7855 		return;
7856 
7857 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7858 		return;
7859 	/*
7860 	 * Use smp_send_reschedule() instead of resched_cpu().
7861 	 * This way we generate a sched IPI on the target cpu which
7862 	 * is idle. And the softirq performing nohz idle load balance
7863 	 * will be run before returning from the IPI.
7864 	 */
7865 	smp_send_reschedule(ilb_cpu);
7866 	return;
7867 }
7868 
7869 void nohz_balance_exit_idle(unsigned int cpu)
7870 {
7871 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7872 		/*
7873 		 * Completely isolated CPUs never get set here, so we must test.
7874 		 */
7875 		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7876 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7877 			atomic_dec(&nohz.nr_cpus);
7878 		}
7879 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7880 	}
7881 }
7882 
7883 static inline void set_cpu_sd_state_busy(void)
7884 {
7885 	struct sched_domain *sd;
7886 	int cpu = smp_processor_id();
7887 
7888 	rcu_read_lock();
7889 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7890 
7891 	if (!sd || !sd->nohz_idle)
7892 		goto unlock;
7893 	sd->nohz_idle = 0;
7894 
7895 	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7896 unlock:
7897 	rcu_read_unlock();
7898 }
7899 
7900 void set_cpu_sd_state_idle(void)
7901 {
7902 	struct sched_domain *sd;
7903 	int cpu = smp_processor_id();
7904 
7905 	rcu_read_lock();
7906 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7907 
7908 	if (!sd || sd->nohz_idle)
7909 		goto unlock;
7910 	sd->nohz_idle = 1;
7911 
7912 	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7913 unlock:
7914 	rcu_read_unlock();
7915 }
7916 
7917 /*
7918  * This routine will record that the cpu is going idle with tick stopped.
7919  * This info will be used in performing idle load balancing in the future.
7920  */
7921 void nohz_balance_enter_idle(int cpu)
7922 {
7923 	/*
7924 	 * If this cpu is going down, then nothing needs to be done.
7925 	 */
7926 	if (!cpu_active(cpu))
7927 		return;
7928 
7929 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7930 		return;
7931 
7932 	/*
7933 	 * If we're a completely isolated CPU, we don't play.
7934 	 */
7935 	if (on_null_domain(cpu_rq(cpu)))
7936 		return;
7937 
7938 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7939 	atomic_inc(&nohz.nr_cpus);
7940 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7941 }
7942 #endif
7943 
7944 static DEFINE_SPINLOCK(balancing);
7945 
7946 /*
7947  * Scale the max load_balance interval with the number of CPUs in the system.
7948  * This trades load-balance latency on larger machines for less cross talk.
7949  */
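/*
 * For example (assuming HZ == 250 and 8 online cpus), the cap works out to
 * 250 * 8 / 10 = 200 jiffies, i.e. about 800 ms between balance attempts at
 * most.
 */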
7950 void update_max_interval(void)
7951 {
7952 	max_load_balance_interval = HZ*num_online_cpus()/10;
7953 }
7954 
7955 /*
7956  * It checks each scheduling domain to see if it is due to be balanced,
7957  * and initiates a balancing operation if so.
7958  *
7959  * Balancing parameters are set up in init_sched_domains.
7960  */
7961 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7962 {
7963 	int continue_balancing = 1;
7964 	int cpu = rq->cpu;
7965 	unsigned long interval;
7966 	struct sched_domain *sd;
7967 	/* Earliest time when we have to do rebalance again */
7968 	unsigned long next_balance = jiffies + 60*HZ;
7969 	int update_next_balance = 0;
7970 	int need_serialize, need_decay = 0;
7971 	u64 max_cost = 0;
7972 
7973 	update_blocked_averages(cpu);
7974 
7975 	rcu_read_lock();
7976 	for_each_domain(cpu, sd) {
7977 		/*
7978 		 * Decay the newidle max times here because this is a regular
7979 		 * visit to all the domains. Decay ~1% per second.
7980 		 */
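		/*
		 * (253/256 is roughly 0.988, so each pass removes a little
		 * over 1% of the recorded cost; passes are spaced HZ jiffies
		 * apart via next_decay_max_lb_cost.)
		 */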
7981 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
7982 			sd->max_newidle_lb_cost =
7983 				(sd->max_newidle_lb_cost * 253) / 256;
7984 			sd->next_decay_max_lb_cost = jiffies + HZ;
7985 			need_decay = 1;
7986 		}
7987 		max_cost += sd->max_newidle_lb_cost;
7988 
7989 		if (!(sd->flags & SD_LOAD_BALANCE))
7990 			continue;
7991 
7992 		/*
7993 		 * Stop the load balance at this level. There is another
7994 		 * CPU in our sched group which is doing load balancing more
7995 		 * actively.
7996 		 */
7997 		if (!continue_balancing) {
7998 			if (need_decay)
7999 				continue;
8000 			break;
8001 		}
8002 
8003 		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8004 
8005 		need_serialize = sd->flags & SD_SERIALIZE;
8006 		if (need_serialize) {
8007 			if (!spin_trylock(&balancing))
8008 				goto out;
8009 		}
8010 
8011 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
8012 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
8013 				/*
8014 				 * The LBF_DST_PINNED logic could have changed
8015 				 * env->dst_cpu, so we can't know our idle
8016 				 * state even if we migrated tasks. Update it.
8017 				 */
8018 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
8019 			}
8020 			sd->last_balance = jiffies;
8021 			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8022 		}
8023 		if (need_serialize)
8024 			spin_unlock(&balancing);
8025 out:
8026 		if (time_after(next_balance, sd->last_balance + interval)) {
8027 			next_balance = sd->last_balance + interval;
8028 			update_next_balance = 1;
8029 		}
8030 	}
8031 	if (need_decay) {
8032 		/*
8033 		 * Ensure the rq-wide value also decays but keep it at a
8034 		 * reasonable floor to avoid funnies with rq->avg_idle.
8035 		 */
8036 		rq->max_idle_balance_cost =
8037 			max((u64)sysctl_sched_migration_cost, max_cost);
8038 	}
8039 	rcu_read_unlock();
8040 
8041 	/*
8042 	 * next_balance will be updated only when there is a need.
8043 	 * When the cpu is attached to a null domain, for example, it will
8044 	 * not be updated.
8045 	 */
8046 	if (likely(update_next_balance)) {
8047 		rq->next_balance = next_balance;
8048 
8049 #ifdef CONFIG_NO_HZ_COMMON
8050 		/*
8051 		 * If this CPU has been elected to perform the nohz idle
8052 		 * balance, the other idle CPUs have already rebalanced with
8053 		 * nohz_idle_balance() and nohz.next_balance has been
8054 		 * updated accordingly. This CPU is now running the idle load
8055 		 * balance for itself and we need to update the
8056 		 * nohz.next_balance accordingly.
8057 		 */
8058 		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
8059 			nohz.next_balance = rq->next_balance;
8060 #endif
8061 	}
8062 }
8063 
8064 #ifdef CONFIG_NO_HZ_COMMON
8065 /*
8066  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8067  * rebalancing for all the cpus whose scheduler ticks are stopped.
8068  */
8069 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
8070 {
8071 	int this_cpu = this_rq->cpu;
8072 	struct rq *rq;
8073 	int balance_cpu;
8074 	/* Earliest time when we have to do rebalance again */
8075 	unsigned long next_balance = jiffies + 60*HZ;
8076 	int update_next_balance = 0;
8077 
8078 	if (idle != CPU_IDLE ||
8079 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8080 		goto end;
8081 
8082 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8083 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8084 			continue;
8085 
8086 		/*
8087 		 * If this cpu gets work to do, stop the load balancing
8088 		 * work being done for other cpus. The next load
8089 		 * balancing owner will pick it up.
8090 		 */
8091 		if (need_resched())
8092 			break;
8093 
8094 		rq = cpu_rq(balance_cpu);
8095 
8096 		/*
8097 		 * If time for next balance is due,
8098 		 * do the balance.
8099 		 */
8100 		if (time_after_eq(jiffies, rq->next_balance)) {
8101 			raw_spin_lock_irq(&rq->lock);
8102 			update_rq_clock(rq);
8103 			cpu_load_update_idle(rq);
8104 			raw_spin_unlock_irq(&rq->lock);
8105 			rebalance_domains(rq, CPU_IDLE);
8106 		}
8107 
8108 		if (time_after(next_balance, rq->next_balance)) {
8109 			next_balance = rq->next_balance;
8110 			update_next_balance = 1;
8111 		}
8112 	}
8113 
8114 	/*
8115 	 * next_balance will be updated only when there is a need.
8116 	 * When the CPU is attached to a null domain, for example, it will
8117 	 * not be updated.
8118 	 */
8119 	if (likely(update_next_balance))
8120 		nohz.next_balance = next_balance;
8121 end:
8122 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8123 }
8124 
8125 /*
8126  * Current heuristic for kicking the idle load balancer in the presence
8127  * of an idle cpu in the system.
8128  *   - This rq has more than one task.
8129  *   - This rq has at least one CFS task and the capacity of the CPU is
8130  *     significantly reduced because of RT tasks or IRQs.
8131  *   - At the parent of the LLC scheduler domain level, this cpu's scheduler
8132  *     group has multiple busy cpus.
8133  *   - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
8134  *     domain span are idle.
8135  */
8136 static inline bool nohz_kick_needed(struct rq *rq)
8137 {
8138 	unsigned long now = jiffies;
8139 	struct sched_domain *sd;
8140 	struct sched_group_capacity *sgc;
8141 	int nr_busy, cpu = rq->cpu;
8142 	bool kick = false;
8143 
8144 	if (unlikely(rq->idle_balance))
8145 		return false;
8146 
8147 	/*
8148 	 * We may have been in ticked or tickless idle mode recently. At the
8149 	 * first busy tick after returning from idle, update the busy stats.
8150 	 */
8151 	set_cpu_sd_state_busy();
8152 	nohz_balance_exit_idle(cpu);
8153 
8154 	/*
8155 	 * None are in tickless mode and hence no need for NOHZ idle load
8156 	 * balancing.
8157 	 */
8158 	if (likely(!atomic_read(&nohz.nr_cpus)))
8159 		return false;
8160 
8161 	if (time_before(now, nohz.next_balance))
8162 		return false;
8163 
8164 	if (rq->nr_running >= 2)
8165 		return true;
8166 
8167 	rcu_read_lock();
8168 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
8169 	if (sd) {
8170 		sgc = sd->groups->sgc;
8171 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
8172 
8173 		if (nr_busy > 1) {
8174 			kick = true;
8175 			goto unlock;
8176 		}
8177 
8178 	}
8179 
8180 	sd = rcu_dereference(rq->sd);
8181 	if (sd) {
8182 		if ((rq->cfs.h_nr_running >= 1) &&
8183 				check_cpu_capacity(rq, sd)) {
8184 			kick = true;
8185 			goto unlock;
8186 		}
8187 	}
8188 
8189 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
8190 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
8191 				  sched_domain_span(sd)) < cpu)) {
8192 		kick = true;
8193 		goto unlock;
8194 	}
8195 
8196 unlock:
8197 	rcu_read_unlock();
8198 	return kick;
8199 }
8200 #else
8201 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
8202 #endif
8203 
8204 /*
8205  * run_rebalance_domains is triggered when needed from the scheduler tick.
8206  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
8207  */
8208 static void run_rebalance_domains(struct softirq_action *h)
8209 {
8210 	struct rq *this_rq = this_rq();
8211 	enum cpu_idle_type idle = this_rq->idle_balance ?
8212 						CPU_IDLE : CPU_NOT_IDLE;
8213 
8214 	/*
8215 	 * If this cpu has a pending nohz_balance_kick, then do the
8216 	 * balancing on behalf of the other idle cpus whose ticks are
8217 	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
8218 	 * give the idle cpus a chance to load balance. Else we may
8219 	 * load balance only within the local sched_domain hierarchy
8220 	 * and abort nohz_idle_balance altogether if we pull some load.
8221 	 */
8222 	nohz_idle_balance(this_rq, idle);
8223 	rebalance_domains(this_rq, idle);
8224 }
8225 
8226 /*
8227  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
8228  */
8229 void trigger_load_balance(struct rq *rq)
8230 {
8231 	/* Don't need to rebalance while attached to NULL domain */
8232 	if (unlikely(on_null_domain(rq)))
8233 		return;
8234 
8235 	if (time_after_eq(jiffies, rq->next_balance))
8236 		raise_softirq(SCHED_SOFTIRQ);
8237 #ifdef CONFIG_NO_HZ_COMMON
8238 	if (nohz_kick_needed(rq))
8239 		nohz_balancer_kick();
8240 #endif
8241 }
8242 
8243 static void rq_online_fair(struct rq *rq)
8244 {
8245 	update_sysctl();
8246 
8247 	update_runtime_enabled(rq);
8248 }
8249 
8250 static void rq_offline_fair(struct rq *rq)
8251 {
8252 	update_sysctl();
8253 
8254 	/* Ensure any throttled groups are reachable by pick_next_task */
8255 	unthrottle_offline_cfs_rqs(rq);
8256 }
8257 
8258 #endif /* CONFIG_SMP */
8259 
8260 /*
8261  * scheduler tick hitting a task of our scheduling class:
8262  */
8263 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
8264 {
8265 	struct cfs_rq *cfs_rq;
8266 	struct sched_entity *se = &curr->se;
8267 
8268 	for_each_sched_entity(se) {
8269 		cfs_rq = cfs_rq_of(se);
8270 		entity_tick(cfs_rq, se, queued);
8271 	}
8272 
8273 	if (static_branch_unlikely(&sched_numa_balancing))
8274 		task_tick_numa(rq, curr);
8275 }
8276 
8277 /*
8278  * called on fork with the child task as argument from the parent's context
8279  *  - child not yet on the tasklist
8280  *  - preemption disabled
8281  */
8282 static void task_fork_fair(struct task_struct *p)
8283 {
8284 	struct cfs_rq *cfs_rq;
8285 	struct sched_entity *se = &p->se, *curr;
8286 	int this_cpu = smp_processor_id();
8287 	struct rq *rq = this_rq();
8288 	unsigned long flags;
8289 
8290 	raw_spin_lock_irqsave(&rq->lock, flags);
8291 
8292 	update_rq_clock(rq);
8293 
8294 	cfs_rq = task_cfs_rq(current);
8295 	curr = cfs_rq->curr;
8296 
8297 	/*
8298 	 * Not only the cpu but also the task_group of the parent might have
8299 	 * been changed after parent->se.parent,cfs_rq were copied to
8300 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make the
8301 	 * child's pointers point to valid ones.
8302 	 */
8303 	rcu_read_lock();
8304 	__set_task_cpu(p, this_cpu);
8305 	rcu_read_unlock();
8306 
8307 	update_curr(cfs_rq);
8308 
8309 	if (curr)
8310 		se->vruntime = curr->vruntime;
8311 	place_entity(cfs_rq, se, 1);
8312 
8313 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
8314 		/*
8315 		 * Upon rescheduling, sched_class::put_prev_task() will place
8316 		 * 'current' within the tree based on its new key value.
8317 		 */
8318 		swap(curr->vruntime, se->vruntime);
8319 		resched_curr(rq);
8320 	}
8321 
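	/*
	 * Make the child's vruntime relative to this cfs_rq; it may be
	 * enqueued on a different cpu at wake_up_new_task() time, where the
	 * destination cfs_rq's min_vruntime is added back on enqueue.
	 */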
8322 	se->vruntime -= cfs_rq->min_vruntime;
8323 
8324 	raw_spin_unlock_irqrestore(&rq->lock, flags);
8325 }
8326 
8327 /*
8328  * Priority of the task has changed. Check to see if we preempt
8329  * the current task.
8330  */
8331 static void
8332 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
8333 {
8334 	if (!task_on_rq_queued(p))
8335 		return;
8336 
8337 	/*
8338 	 * Reschedule if we are currently running on this runqueue and
8339 	 * our priority decreased, or if we are not currently running on
8340 	 * this runqueue and our priority is higher than the current's
8341 	 */
8342 	if (rq->curr == p) {
8343 		if (p->prio > oldprio)
8344 			resched_curr(rq);
8345 	} else
8346 		check_preempt_curr(rq, p, 0);
8347 }
8348 
8349 static inline bool vruntime_normalized(struct task_struct *p)
8350 {
8351 	struct sched_entity *se = &p->se;
8352 
8353 	/*
8354 	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
8355 	 * the dequeue_entity(.flags=0) will already have normalized the
8356 	 * vruntime.
8357 	 */
8358 	if (p->on_rq)
8359 		return true;
8360 
8361 	/*
8362 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
8363 	 * But there are some cases where it has already been normalized:
8364 	 *
8365 	 * - A forked child which is waiting to be woken up by
8366 	 *   wake_up_new_task().
8367 	 * - A task which has been woken up by try_to_wake_up() and is
8368 	 *   waiting to actually be woken up by sched_ttwu_pending().
8369 	 */
8370 	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
8371 		return true;
8372 
8373 	return false;
8374 }
8375 
8376 static void detach_task_cfs_rq(struct task_struct *p)
8377 {
8378 	struct sched_entity *se = &p->se;
8379 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
8380 
8381 	if (!vruntime_normalized(p)) {
8382 		/*
8383 		 * Fix up our vruntime so that the current sleep doesn't
8384 		 * cause 'unlimited' sleep bonus.
8385 		 */
8386 		place_entity(cfs_rq, se, 0);
8387 		se->vruntime -= cfs_rq->min_vruntime;
8388 	}
8389 
8390 	/* Catch up with the cfs_rq and remove our load when we leave */
8391 	detach_entity_load_avg(cfs_rq, se);
8392 }
8393 
8394 static void attach_task_cfs_rq(struct task_struct *p)
8395 {
8396 	struct sched_entity *se = &p->se;
8397 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
8398 
8399 #ifdef CONFIG_FAIR_GROUP_SCHED
8400 	/*
8401 	 * Since the real depth could have been changed (only the FAIR
8402 	 * class maintains the depth value), reset the depth properly.
8403 	 */
8404 	se->depth = se->parent ? se->parent->depth + 1 : 0;
8405 #endif
8406 
8407 	/* Synchronize task with its cfs_rq */
8408 	attach_entity_load_avg(cfs_rq, se);
8409 
8410 	if (!vruntime_normalized(p))
8411 		se->vruntime += cfs_rq->min_vruntime;
8412 }
8413 
8414 static void switched_from_fair(struct rq *rq, struct task_struct *p)
8415 {
8416 	detach_task_cfs_rq(p);
8417 }
8418 
8419 static void switched_to_fair(struct rq *rq, struct task_struct *p)
8420 {
8421 	attach_task_cfs_rq(p);
8422 
8423 	if (task_on_rq_queued(p)) {
8424 		/*
8425 		 * We were most likely switched from sched_rt, so
8426 		 * kick off the schedule if running, otherwise just see
8427 		 * if we can still preempt the current task.
8428 		 */
8429 		if (rq->curr == p)
8430 			resched_curr(rq);
8431 		else
8432 			check_preempt_curr(rq, p, 0);
8433 	}
8434 }
8435 
8436 /*
8437  * Account for a task changing its policy or group. This routine is
8438  * mostly called to set the cfs_rq->curr field when a task migrates
8439  * between groups/classes.
8440  */
8441 static void set_curr_task_fair(struct rq *rq)
8442 {
8443 	struct sched_entity *se = &rq->curr->se;
8444 
8445 	for_each_sched_entity(se) {
8446 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
8447 
8448 		set_next_entity(cfs_rq, se);
8449 		/* ensure bandwidth has been allocated on our new cfs_rq */
8450 		account_cfs_rq_runtime(cfs_rq, 0);
8451 	}
8452 }
8453 
8454 void init_cfs_rq(struct cfs_rq *cfs_rq)
8455 {
8456 	cfs_rq->tasks_timeline = RB_ROOT;
8457 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8458 #ifndef CONFIG_64BIT
8459 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8460 #endif
8461 #ifdef CONFIG_SMP
8462 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
8463 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
8464 #endif
8465 }
8466 
8467 #ifdef CONFIG_FAIR_GROUP_SCHED
8468 static void task_move_group_fair(struct task_struct *p)
8469 {
8470 	detach_task_cfs_rq(p);
8471 	set_task_rq(p, task_cpu(p));
8472 
8473 #ifdef CONFIG_SMP
8474 	/* Tell the se that its cfs_rq has been changed -- it migrated */
8475 	p->se.avg.last_update_time = 0;
8476 #endif
8477 	attach_task_cfs_rq(p);
8478 }
8479 
8480 void free_fair_sched_group(struct task_group *tg)
8481 {
8482 	int i;
8483 
8484 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8485 
8486 	for_each_possible_cpu(i) {
8487 		if (tg->cfs_rq)
8488 			kfree(tg->cfs_rq[i]);
8489 		if (tg->se)
8490 			kfree(tg->se[i]);
8491 	}
8492 
8493 	kfree(tg->cfs_rq);
8494 	kfree(tg->se);
8495 }
8496 
8497 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8498 {
8499 	struct cfs_rq *cfs_rq;
8500 	struct sched_entity *se;
8501 	int i;
8502 
8503 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8504 	if (!tg->cfs_rq)
8505 		goto err;
8506 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8507 	if (!tg->se)
8508 		goto err;
8509 
8510 	tg->shares = NICE_0_LOAD;
8511 
8512 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8513 
8514 	for_each_possible_cpu(i) {
8515 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8516 				      GFP_KERNEL, cpu_to_node(i));
8517 		if (!cfs_rq)
8518 			goto err;
8519 
8520 		se = kzalloc_node(sizeof(struct sched_entity),
8521 				  GFP_KERNEL, cpu_to_node(i));
8522 		if (!se)
8523 			goto err_free_rq;
8524 
8525 		init_cfs_rq(cfs_rq);
8526 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8527 		init_entity_runnable_average(se);
8528 		post_init_entity_util_avg(se);
8529 	}
8530 
8531 	return 1;
8532 
8533 err_free_rq:
8534 	kfree(cfs_rq);
8535 err:
8536 	return 0;
8537 }
8538 
8539 void unregister_fair_sched_group(struct task_group *tg)
8540 {
8541 	unsigned long flags;
8542 	struct rq *rq;
8543 	int cpu;
8544 
8545 	for_each_possible_cpu(cpu) {
8546 		if (tg->se[cpu])
8547 			remove_entity_load_avg(tg->se[cpu]);
8548 
8549 		/*
8550 		 * Only empty task groups can be destroyed, so we can speculatively
8551 		 * check on_list without danger of it being re-added.
8552 		 */
8553 		if (!tg->cfs_rq[cpu]->on_list)
8554 			continue;
8555 
8556 		rq = cpu_rq(cpu);
8557 
8558 		raw_spin_lock_irqsave(&rq->lock, flags);
8559 		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8560 		raw_spin_unlock_irqrestore(&rq->lock, flags);
8561 	}
8562 }
8563 
8564 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8565 			struct sched_entity *se, int cpu,
8566 			struct sched_entity *parent)
8567 {
8568 	struct rq *rq = cpu_rq(cpu);
8569 
8570 	cfs_rq->tg = tg;
8571 	cfs_rq->rq = rq;
8572 	init_cfs_rq_runtime(cfs_rq);
8573 
8574 	tg->cfs_rq[cpu] = cfs_rq;
8575 	tg->se[cpu] = se;
8576 
8577 	/* se could be NULL for root_task_group */
8578 	if (!se)
8579 		return;
8580 
8581 	if (!parent) {
8582 		se->cfs_rq = &rq->cfs;
8583 		se->depth = 0;
8584 	} else {
8585 		se->cfs_rq = parent->my_q;
8586 		se->depth = parent->depth + 1;
8587 	}
8588 
8589 	se->my_q = cfs_rq;
8590 	/* guarantee group entities always have weight */
8591 	update_load_set(&se->load, NICE_0_LOAD);
8592 	se->parent = parent;
8593 }
8594 
8595 static DEFINE_MUTEX(shares_mutex);
8596 
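/*
 * Usage sketch (an assumption, not spelled out in this file): this is the
 * backend for the cpu cgroup's shares knob; e.g. a group configured with
 * 2048 gets roughly twice the weight of a sibling left at the 1024 default,
 * after clamping to [MIN_SHARES, MAX_SHARES].
 */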
8597 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8598 {
8599 	int i;
8600 	unsigned long flags;
8601 
8602 	/*
8603 	 * We can't change the weight of the root cgroup.
8604 	 */
8605 	if (!tg->se[0])
8606 		return -EINVAL;
8607 
8608 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8609 
8610 	mutex_lock(&shares_mutex);
8611 	if (tg->shares == shares)
8612 		goto done;
8613 
8614 	tg->shares = shares;
8615 	for_each_possible_cpu(i) {
8616 		struct rq *rq = cpu_rq(i);
8617 		struct sched_entity *se;
8618 
8619 		se = tg->se[i];
8620 		/* Propagate contribution to hierarchy */
8621 		raw_spin_lock_irqsave(&rq->lock, flags);
8622 
8623 		/* Possible calls to update_curr() need rq clock */
8624 		update_rq_clock(rq);
8625 		for_each_sched_entity(se)
8626 			update_cfs_shares(group_cfs_rq(se));
8627 		raw_spin_unlock_irqrestore(&rq->lock, flags);
8628 	}
8629 
8630 done:
8631 	mutex_unlock(&shares_mutex);
8632 	return 0;
8633 }
8634 #else /* CONFIG_FAIR_GROUP_SCHED */
8635 
8636 void free_fair_sched_group(struct task_group *tg) { }
8637 
8638 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8639 {
8640 	return 1;
8641 }
8642 
8643 void unregister_fair_sched_group(struct task_group *tg) { }
8644 
8645 #endif /* CONFIG_FAIR_GROUP_SCHED */
8646 
8647 
8648 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8649 {
8650 	struct sched_entity *se = &task->se;
8651 	unsigned int rr_interval = 0;
8652 
8653 	/*
8654 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8655 	 * idle runqueue:
8656 	 */
8657 	if (rq->cfs.load.weight)
8658 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8659 
8660 	return rr_interval;
8661 }
8662 
8663 /*
8664  * All the scheduling class methods:
8665  */
8666 const struct sched_class fair_sched_class = {
8667 	.next			= &idle_sched_class,
8668 	.enqueue_task		= enqueue_task_fair,
8669 	.dequeue_task		= dequeue_task_fair,
8670 	.yield_task		= yield_task_fair,
8671 	.yield_to_task		= yield_to_task_fair,
8672 
8673 	.check_preempt_curr	= check_preempt_wakeup,
8674 
8675 	.pick_next_task		= pick_next_task_fair,
8676 	.put_prev_task		= put_prev_task_fair,
8677 
8678 #ifdef CONFIG_SMP
8679 	.select_task_rq		= select_task_rq_fair,
8680 	.migrate_task_rq	= migrate_task_rq_fair,
8681 
8682 	.rq_online		= rq_online_fair,
8683 	.rq_offline		= rq_offline_fair,
8684 
8685 	.task_dead		= task_dead_fair,
8686 	.set_cpus_allowed	= set_cpus_allowed_common,
8687 #endif
8688 
8689 	.set_curr_task          = set_curr_task_fair,
8690 	.task_tick		= task_tick_fair,
8691 	.task_fork		= task_fork_fair,
8692 
8693 	.prio_changed		= prio_changed_fair,
8694 	.switched_from		= switched_from_fair,
8695 	.switched_to		= switched_to_fair,
8696 
8697 	.get_rr_interval	= get_rr_interval_fair,
8698 
8699 	.update_curr		= update_curr_fair,
8700 
8701 #ifdef CONFIG_FAIR_GROUP_SCHED
8702 	.task_move_group	= task_move_group_fair,
8703 #endif
8704 };
8705 
8706 #ifdef CONFIG_SCHED_DEBUG
8707 void print_cfs_stats(struct seq_file *m, int cpu)
8708 {
8709 	struct cfs_rq *cfs_rq;
8710 
8711 	rcu_read_lock();
8712 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8713 		print_cfs_rq(m, cpu, cfs_rq);
8714 	rcu_read_unlock();
8715 }
8716 
8717 #ifdef CONFIG_NUMA_BALANCING
8718 void show_numa_stats(struct task_struct *p, struct seq_file *m)
8719 {
8720 	int node;
8721 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8722 
8723 	for_each_online_node(node) {
8724 		if (p->numa_faults) {
8725 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8726 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8727 		}
8728 		if (p->numa_group) {
8729 			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
8730 			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8731 		}
8732 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8733 	}
8734 }
8735 #endif /* CONFIG_NUMA_BALANCING */
8736 #endif /* CONFIG_SCHED_DEBUG */
8737 
8738 __init void init_sched_fair_class(void)
8739 {
8740 #ifdef CONFIG_SMP
8741 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8742 
8743 #ifdef CONFIG_NO_HZ_COMMON
8744 	nohz.next_balance = jiffies;
8745 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8746 #endif
8747 #endif /* SMP */
8748 
8749 }
8750