xref: /openbmc/linux/kernel/sched/fair.c (revision a06c488d)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 
34 #include <trace/events/sched.h>
35 
36 #include "sched.h"
37 
/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * The normalized_* variable holds the unscaled base value;
 * update_sysctl() multiplies it by the scaling factor to obtain the
 * effective sysctl_* value.
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * Number of runnable tasks beyond which the scheduling period is
 * stretched; is kept at
 * sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

/*
 * NOTE(review): presumably the assumed cost of migrating a task, in
 * nanoseconds (500000 = 0.5 msec) - units are not stated here; confirm
 * against the users of this variable.
 */
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
116 
117 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118 {
119 	lw->weight += inc;
120 	lw->inv_weight = 0;
121 }
122 
123 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
124 {
125 	lw->weight -= dec;
126 	lw->inv_weight = 0;
127 }
128 
129 static inline void update_load_set(struct load_weight *lw, unsigned long w)
130 {
131 	lw->weight = w;
132 	lw->inv_weight = 0;
133 }
134 
135 /*
136  * Increase the granularity value when there are more CPUs,
137  * because with more CPUs the 'effective latency' as visible
138  * to users decreases. But the relationship is not linear,
139  * so pick a second-best guess by going with the log2 of the
140  * number of CPUs.
141  *
142  * This idea comes from the SD scheduler of Con Kolivas:
143  */
144 static unsigned int get_update_sysctl_factor(void)
145 {
146 	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 	unsigned int factor;
148 
149 	switch (sysctl_sched_tunable_scaling) {
150 	case SCHED_TUNABLESCALING_NONE:
151 		factor = 1;
152 		break;
153 	case SCHED_TUNABLESCALING_LINEAR:
154 		factor = cpus;
155 		break;
156 	case SCHED_TUNABLESCALING_LOG:
157 	default:
158 		factor = 1 + ilog2(cpus);
159 		break;
160 	}
161 
162 	return factor;
163 }
164 
165 static void update_sysctl(void)
166 {
167 	unsigned int factor = get_update_sysctl_factor();
168 
169 #define SET_SYSCTL(name) \
170 	(sysctl_##name = (factor) * normalized_sysctl_##name)
171 	SET_SYSCTL(sched_min_granularity);
172 	SET_SYSCTL(sched_latency);
173 	SET_SYSCTL(sched_wakeup_granularity);
174 #undef SET_SYSCTL
175 }
176 
/*
 * Initialize the effective granularity/latency tunables by scaling the
 * normalized defaults (delegates entirely to update_sysctl()).
 */
void sched_init_granularity(void)
{
	update_sysctl();
}
181 
182 #define WMULT_CONST	(~0U)
183 #define WMULT_SHIFT	32
184 
185 static void __update_inv_weight(struct load_weight *lw)
186 {
187 	unsigned long w;
188 
189 	if (likely(lw->inv_weight))
190 		return;
191 
192 	w = scale_load_down(lw->weight);
193 
194 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
195 		lw->inv_weight = 1;
196 	else if (unlikely(!w))
197 		lw->inv_weight = WMULT_CONST;
198 	else
199 		lw->inv_weight = WMULT_CONST / w;
200 }
201 
202 /*
203  * delta_exec * weight / lw.weight
204  *   OR
205  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
206  *
207  * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
208  * we're guaranteed shift stays positive because inv_weight is guaranteed to
209  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
210  *
211  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
212  * weight/lw.weight <= 1, and therefore our shift will also be positive.
213  */
214 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
215 {
216 	u64 fact = scale_load_down(weight);
217 	int shift = WMULT_SHIFT;
218 
219 	__update_inv_weight(lw);
220 
221 	if (unlikely(fact >> 32)) {
222 		while (fact >> 32) {
223 			fact >>= 1;
224 			shift--;
225 		}
226 	}
227 
228 	/* hint to use a 32x32->64 mul */
229 	fact = (u64)(u32)fact * lw->inv_weight;
230 
231 	while (fact >> 32) {
232 		fact >>= 1;
233 		shift--;
234 	}
235 
236 	return mul_u64_u32_shr(delta_exec, fact, shift);
237 }
238 
239 
/*
 * Declared ahead of use so code below can refer to the CFS class.
 * NOTE(review): the initialized definition is not visible in this part of
 * the file - presumably it appears further down.
 */
const struct sched_class fair_sched_class;
241 
242 /**************************************************************
243  * CFS operations on generic schedulable entities:
244  */
245 
246 #ifdef CONFIG_FAIR_GROUP_SCHED
247 
/*
 * cpu runqueue to which this cfs_rq is attached.
 * With group scheduling the cfs_rq stores an explicit pointer to its rq.
 */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}
253 
/*
 * An entity is a task if it doesn't "own" a runqueue.
 * The argument is parenthesized so that any pointer expression, not just
 * a plain identifier, expands correctly.
 */
#define entity_is_task(se)	(!(se)->my_q)
256 
/*
 * Map a scheduling entity back to the task_struct that embeds it.
 * With CONFIG_SCHED_DEBUG a one-time warning is emitted if @se is not a
 * task entity (i.e. entity_is_task() is false).
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}
264 
/*
 * Walk up scheduling entities hierarchy.
 * @se is both the starting point and the loop cursor: it is advanced to
 * its parent on every iteration and is NULL once the loop runs to
 * completion. The argument is parenthesized for macro hygiene.
 */
#define for_each_sched_entity(se) \
		for (; (se); (se) = (se)->parent)
268 
/* Return the cfs_rq recorded in task @p's scheduling entity. */
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}
273 
/*
 * runqueue on which this entity is (to be) queued.
 * With group scheduling each entity carries this pointer itself.
 */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}
279 
/*
 * runqueue "owned" by this group.
 * For a task entity my_q is NULL (see entity_is_task()).
 */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}
285 
286 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287 {
288 	if (!cfs_rq->on_list) {
289 		/*
290 		 * Ensure we either appear before our parent (if already
291 		 * enqueued) or force our parent to appear after us when it is
292 		 * enqueued.  The fact that we always enqueue bottom-up
293 		 * reduces this to two cases.
294 		 */
295 		if (cfs_rq->tg->parent &&
296 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
297 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
298 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
299 		} else {
300 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
301 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
302 		}
303 
304 		cfs_rq->on_list = 1;
305 	}
306 }
307 
308 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
309 {
310 	if (cfs_rq->on_list) {
311 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
312 		cfs_rq->on_list = 0;
313 	}
314 }
315 
/*
 * Iterate through all leaf cfs_rq's on a runqueue.
 * @rq is parenthesized so that any pointer expression may be passed;
 * @cfs_rq is the loop cursor.
 */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)
319 
320 /* Do the two (enqueued) entities belong to the same group ? */
321 static inline struct cfs_rq *
322 is_same_group(struct sched_entity *se, struct sched_entity *pse)
323 {
324 	if (se->cfs_rq == pse->cfs_rq)
325 		return se->cfs_rq;
326 
327 	return NULL;
328 }
329 
/* Return the entity one level up in the hierarchy (se->parent). */
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}
334 
335 static void
336 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
337 {
338 	int se_depth, pse_depth;
339 
340 	/*
341 	 * preemption test can be made between sibling entities who are in the
342 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
343 	 * both tasks until we find their ancestors who are siblings of common
344 	 * parent.
345 	 */
346 
347 	/* First walk up until both entities are at same depth */
348 	se_depth = (*se)->depth;
349 	pse_depth = (*pse)->depth;
350 
351 	while (se_depth > pse_depth) {
352 		se_depth--;
353 		*se = parent_entity(*se);
354 	}
355 
356 	while (pse_depth > se_depth) {
357 		pse_depth--;
358 		*pse = parent_entity(*pse);
359 	}
360 
361 	while (!is_same_group(*se, *pse)) {
362 		*se = parent_entity(*se);
363 		*pse = parent_entity(*pse);
364 	}
365 }
366 
367 #else	/* !CONFIG_FAIR_GROUP_SCHED */
368 
/*
 * Map a scheduling entity back to the task_struct that embeds it.
 * Without group scheduling every entity is a task (entity_is_task() is
 * the constant 1), so no check is needed.
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}
373 
/*
 * Without group scheduling the cfs_rq is the 'cfs' member embedded in
 * struct rq, so the owning rq is recovered with container_of().
 */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}
378 
/* Without group scheduling every entity is a task. */
#define entity_is_task(se)	1
380 
/*
 * Without group scheduling there is no hierarchy: the loop body runs at
 * most once, after which @se is set to NULL. The argument is
 * parenthesized for macro hygiene.
 */
#define for_each_sched_entity(se) \
		for (; (se); (se) = NULL)
383 
/* Return the cfs_rq embedded in the runqueue that task_rq(@p) yields. */
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}
388 
389 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
390 {
391 	struct task_struct *p = task_of(se);
392 	struct rq *rq = task_rq(p);
393 
394 	return &rq->cfs;
395 }
396 
/*
 * runqueue "owned" by this group.
 * Without group scheduling no entity owns a runqueue, so this is always
 * NULL.
 */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}
402 
/* No-op: this configuration keeps no leaf cfs_rq list. */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

/* No-op counterpart of list_add_leaf_cfs_rq(). */
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
410 
/*
 * Without group scheduling the only cfs_rq on a runqueue is the embedded
 * rq->cfs, so the body runs exactly once with @cfs_rq pointing at it.
 * Arguments are parenthesized for macro hygiene.
 */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for ((cfs_rq) = &(rq)->cfs; (cfs_rq); (cfs_rq) = NULL)
413 
/* Without group scheduling entities have no parent. */
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

/* No-op: without group scheduling *se and *pse are left unchanged. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
423 
424 #endif	/* CONFIG_FAIR_GROUP_SCHED */
425 
/*
 * Forward declaration: update_curr() below calls this with the execution
 * time it just accounted. The definition is not visible in this part of
 * the file.
 */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
428 
429 /**************************************************************
430  * Scheduling class tree data structure manipulation methods:
431  */
432 
433 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
434 {
435 	s64 delta = (s64)(vruntime - max_vruntime);
436 	if (delta > 0)
437 		max_vruntime = vruntime;
438 
439 	return max_vruntime;
440 }
441 
442 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
443 {
444 	s64 delta = (s64)(vruntime - min_vruntime);
445 	if (delta < 0)
446 		min_vruntime = vruntime;
447 
448 	return min_vruntime;
449 }
450 
451 static inline int entity_before(struct sched_entity *a,
452 				struct sched_entity *b)
453 {
454 	return (s64)(a->vruntime - b->vruntime) < 0;
455 }
456 
457 static void update_min_vruntime(struct cfs_rq *cfs_rq)
458 {
459 	u64 vruntime = cfs_rq->min_vruntime;
460 
461 	if (cfs_rq->curr)
462 		vruntime = cfs_rq->curr->vruntime;
463 
464 	if (cfs_rq->rb_leftmost) {
465 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 						   struct sched_entity,
467 						   run_node);
468 
469 		if (!cfs_rq->curr)
470 			vruntime = se->vruntime;
471 		else
472 			vruntime = min_vruntime(vruntime, se->vruntime);
473 	}
474 
475 	/* ensure we never gain time by being placed backwards. */
476 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477 #ifndef CONFIG_64BIT
478 	smp_wmb();
479 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480 #endif
481 }
482 
483 /*
484  * Enqueue an entity into the rb-tree:
485  */
486 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487 {
488 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 	struct rb_node *parent = NULL;
490 	struct sched_entity *entry;
491 	int leftmost = 1;
492 
493 	/*
494 	 * Find the right place in the rbtree:
495 	 */
496 	while (*link) {
497 		parent = *link;
498 		entry = rb_entry(parent, struct sched_entity, run_node);
499 		/*
500 		 * We dont care about collisions. Nodes with
501 		 * the same key stay together.
502 		 */
503 		if (entity_before(se, entry)) {
504 			link = &parent->rb_left;
505 		} else {
506 			link = &parent->rb_right;
507 			leftmost = 0;
508 		}
509 	}
510 
511 	/*
512 	 * Maintain a cache of leftmost tree entries (it is frequently
513 	 * used):
514 	 */
515 	if (leftmost)
516 		cfs_rq->rb_leftmost = &se->run_node;
517 
518 	rb_link_node(&se->run_node, parent, link);
519 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520 }
521 
522 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523 {
524 	if (cfs_rq->rb_leftmost == &se->run_node) {
525 		struct rb_node *next_node;
526 
527 		next_node = rb_next(&se->run_node);
528 		cfs_rq->rb_leftmost = next_node;
529 	}
530 
531 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532 }
533 
534 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535 {
536 	struct rb_node *left = cfs_rq->rb_leftmost;
537 
538 	if (!left)
539 		return NULL;
540 
541 	return rb_entry(left, struct sched_entity, run_node);
542 }
543 
544 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545 {
546 	struct rb_node *next = rb_next(&se->run_node);
547 
548 	if (!next)
549 		return NULL;
550 
551 	return rb_entry(next, struct sched_entity, run_node);
552 }
553 
554 #ifdef CONFIG_SCHED_DEBUG
555 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556 {
557 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558 
559 	if (!last)
560 		return NULL;
561 
562 	return rb_entry(last, struct sched_entity, run_node);
563 }
564 
565 /**************************************************************
566  * Scheduling class statistics methods:
567  */
568 
569 int sched_proc_update_handler(struct ctl_table *table, int write,
570 		void __user *buffer, size_t *lenp,
571 		loff_t *ppos)
572 {
573 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 	unsigned int factor = get_update_sysctl_factor();
575 
576 	if (ret || !write)
577 		return ret;
578 
579 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 					sysctl_sched_min_granularity);
581 
582 #define WRT_SYSCTL(name) \
583 	(normalized_sysctl_##name = sysctl_##name / (factor))
584 	WRT_SYSCTL(sched_min_granularity);
585 	WRT_SYSCTL(sched_latency);
586 	WRT_SYSCTL(sched_wakeup_granularity);
587 #undef WRT_SYSCTL
588 
589 	return 0;
590 }
591 #endif
592 
593 /*
594  * delta /= w
595  */
596 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
597 {
598 	if (unlikely(se->load.weight != NICE_0_LOAD))
599 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
600 
601 	return delta;
602 }
603 
604 /*
605  * The idea is to set a period in which each task runs once.
606  *
607  * When there are too many tasks (sched_nr_latency) we have to stretch
608  * this period because otherwise the slices get too small.
609  *
610  * p = (nr <= nl) ? l : l*nr/nl
611  */
612 static u64 __sched_period(unsigned long nr_running)
613 {
614 	if (unlikely(nr_running > sched_nr_latency))
615 		return nr_running * sysctl_sched_min_granularity;
616 	else
617 		return sysctl_sched_latency;
618 }
619 
620 /*
621  * We calculate the wall-time slice from the period by taking a part
622  * proportional to the weight.
623  *
624  * s = p*P[w/rw]
625  */
626 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
627 {
628 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
629 
630 	for_each_sched_entity(se) {
631 		struct load_weight *load;
632 		struct load_weight lw;
633 
634 		cfs_rq = cfs_rq_of(se);
635 		load = &cfs_rq->load;
636 
637 		if (unlikely(!se->on_rq)) {
638 			lw = cfs_rq->load;
639 
640 			update_load_add(&lw, se->load.weight);
641 			load = &lw;
642 		}
643 		slice = __calc_delta(slice, se->load.weight, load);
644 	}
645 	return slice;
646 }
647 
648 /*
649  * We calculate the vruntime slice of a to-be-inserted task.
650  *
651  * vs = s/w
652  */
653 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
654 {
655 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
656 }
657 
658 #ifdef CONFIG_SMP
/* Forward declarations; the definitions are not visible in this part of the file. */
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);

/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
 * dependent on this value, so they must be regenerated if it changes.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
670 
671 /* Give new sched_entity start runnable values to heavy its load in infant time */
672 void init_entity_runnable_average(struct sched_entity *se)
673 {
674 	struct sched_avg *sa = &se->avg;
675 
676 	sa->last_update_time = 0;
677 	/*
678 	 * sched_avg's period_contrib should be strictly less then 1024, so
679 	 * we give it 1023 to make sure it is almost a period (1024us), and
680 	 * will definitely be update (after enqueue).
681 	 */
682 	sa->period_contrib = 1023;
683 	sa->load_avg = scale_load_down(se->load.weight);
684 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685 	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
686 	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
687 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
688 }
689 
/* Forward declarations; the definitions are not visible in this part of the file. */
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
692 #else
/* !CONFIG_SMP: nothing to initialize, so this is a no-op. */
void init_entity_runnable_average(struct sched_entity *se)
{
}
696 #endif
697 
698 /*
699  * Update the current task's runtime statistics.
700  */
701 static void update_curr(struct cfs_rq *cfs_rq)
702 {
703 	struct sched_entity *curr = cfs_rq->curr;
704 	u64 now = rq_clock_task(rq_of(cfs_rq));
705 	u64 delta_exec;
706 
707 	if (unlikely(!curr))
708 		return;
709 
710 	delta_exec = now - curr->exec_start;
711 	if (unlikely((s64)delta_exec <= 0))
712 		return;
713 
714 	curr->exec_start = now;
715 
716 	schedstat_set(curr->statistics.exec_max,
717 		      max(delta_exec, curr->statistics.exec_max));
718 
719 	curr->sum_exec_runtime += delta_exec;
720 	schedstat_add(cfs_rq, exec_clock, delta_exec);
721 
722 	curr->vruntime += calc_delta_fair(delta_exec, curr);
723 	update_min_vruntime(cfs_rq);
724 
725 	if (entity_is_task(curr)) {
726 		struct task_struct *curtask = task_of(curr);
727 
728 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
729 		cpuacct_charge(curtask, delta_exec);
730 		account_group_exec_runtime(curtask, delta_exec);
731 	}
732 
733 	account_cfs_rq_runtime(cfs_rq, delta_exec);
734 }
735 
736 static void update_curr_fair(struct rq *rq)
737 {
738 	update_curr(cfs_rq_of(&rq->curr->se));
739 }
740 
741 #ifdef CONFIG_SCHEDSTATS
742 static inline void
743 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
744 {
745 	u64 wait_start = rq_clock(rq_of(cfs_rq));
746 
747 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
748 	    likely(wait_start > se->statistics.wait_start))
749 		wait_start -= se->statistics.wait_start;
750 
751 	se->statistics.wait_start = wait_start;
752 }
753 
754 static void
755 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
756 {
757 	struct task_struct *p;
758 	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
759 
760 	if (entity_is_task(se)) {
761 		p = task_of(se);
762 		if (task_on_rq_migrating(p)) {
763 			/*
764 			 * Preserve migrating task's wait time so wait_start
765 			 * time stamp can be adjusted to accumulate wait time
766 			 * prior to migration.
767 			 */
768 			se->statistics.wait_start = delta;
769 			return;
770 		}
771 		trace_sched_stat_wait(p, delta);
772 	}
773 
774 	se->statistics.wait_max = max(se->statistics.wait_max, delta);
775 	se->statistics.wait_count++;
776 	se->statistics.wait_sum += delta;
777 	se->statistics.wait_start = 0;
778 }
779 #else
/* !CONFIG_SCHEDSTATS: wait statistics are not collected; no-op. */
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}

/* !CONFIG_SCHEDSTATS: wait statistics are not collected; no-op. */
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
789 #endif
790 
791 /*
792  * Task is being enqueued - update stats:
793  */
794 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
795 {
796 	/*
797 	 * Are we enqueueing a waiting task? (for current tasks
798 	 * a dequeue/enqueue event is a NOP)
799 	 */
800 	if (se != cfs_rq->curr)
801 		update_stats_wait_start(cfs_rq, se);
802 }
803 
804 static inline void
805 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
806 {
807 	/*
808 	 * Mark the end of the wait period if dequeueing a
809 	 * waiting task:
810 	 */
811 	if (se != cfs_rq->curr)
812 		update_stats_wait_end(cfs_rq, se);
813 }
814 
/*
 * We are picking a new current task - update its stats.
 *
 * exec_start is the timestamp update_curr() later subtracts from the task
 * clock to obtain the time the entity has run.
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
826 
827 /**************************************************
828  * Scheduling class queueing methods:
829  */
830 
831 #ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
845 
846 static unsigned int task_nr_scan_windows(struct task_struct *p)
847 {
848 	unsigned long rss = 0;
849 	unsigned long nr_scan_pages;
850 
851 	/*
852 	 * Calculations based on RSS as non-present and empty pages are skipped
853 	 * by the PTE scanner and NUMA hinting faults should be trapped based
854 	 * on resident pages
855 	 */
856 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
857 	rss = get_mm_rss(p->mm);
858 	if (!rss)
859 		rss = nr_scan_pages;
860 
861 	rss = round_up(rss, nr_scan_pages);
862 	return rss / nr_scan_pages;
863 }
864 
865 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
866 #define MAX_SCAN_WINDOW 2560
867 
868 static unsigned int task_scan_min(struct task_struct *p)
869 {
870 	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
871 	unsigned int scan, floor;
872 	unsigned int windows = 1;
873 
874 	if (scan_size < MAX_SCAN_WINDOW)
875 		windows = MAX_SCAN_WINDOW / scan_size;
876 	floor = 1000 / windows;
877 
878 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
879 	return max_t(unsigned int, floor, scan);
880 }
881 
882 static unsigned int task_scan_max(struct task_struct *p)
883 {
884 	unsigned int smin = task_scan_min(p);
885 	unsigned int smax;
886 
887 	/* Watch for min being lower than max due to floor calculations */
888 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
889 	return max(smin, smax);
890 }
891 
892 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
893 {
894 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
895 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
896 }
897 
898 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
899 {
900 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
901 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
902 }
903 
904 struct numa_group {
905 	atomic_t refcount;
906 
907 	spinlock_t lock; /* nr_tasks, tasks */
908 	int nr_tasks;
909 	pid_t gid;
910 
911 	struct rcu_head rcu;
912 	nodemask_t active_nodes;
913 	unsigned long total_faults;
914 	/*
915 	 * Faults_cpu is used to decide whether memory should move
916 	 * towards the CPU. As a consequence, these stats are weighted
917 	 * more by CPU use than by memory faults.
918 	 */
919 	unsigned long *faults_cpu;
920 	unsigned long faults[0];
921 };
922 
/* Shared or private faults: the two fault types tracked per node. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality: one set of fault types for each. */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers: each stat is kept twice. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
931 
932 pid_t task_numa_group_id(struct task_struct *p)
933 {
934 	return p->numa_group ? p->numa_group->gid : 0;
935 }
936 
937 /*
938  * The averaged statistics, shared & private, memory & cpu,
939  * occupy the first half of the array. The second half of the
940  * array is for current counters, which are averaged into the
941  * first set by task_numa_placement.
942  */
943 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
944 {
945 	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
946 }
947 
948 static inline unsigned long task_faults(struct task_struct *p, int nid)
949 {
950 	if (!p->numa_faults)
951 		return 0;
952 
953 	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
954 		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
955 }
956 
957 static inline unsigned long group_faults(struct task_struct *p, int nid)
958 {
959 	if (!p->numa_group)
960 		return 0;
961 
962 	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
963 		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
964 }
965 
966 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
967 {
968 	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
969 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
970 }
971 
972 /* Handle placement on systems where not all nodes are directly connected. */
973 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
974 					int maxdist, bool task)
975 {
976 	unsigned long score = 0;
977 	int node;
978 
979 	/*
980 	 * All nodes are directly connected, and the same distance
981 	 * from each other. No need for fancy placement algorithms.
982 	 */
983 	if (sched_numa_topology_type == NUMA_DIRECT)
984 		return 0;
985 
986 	/*
987 	 * This code is called for each node, introducing N^2 complexity,
988 	 * which should be ok given the number of nodes rarely exceeds 8.
989 	 */
990 	for_each_online_node(node) {
991 		unsigned long faults;
992 		int dist = node_distance(nid, node);
993 
994 		/*
995 		 * The furthest away nodes in the system are not interesting
996 		 * for placement; nid was already counted.
997 		 */
998 		if (dist == sched_max_numa_distance || node == nid)
999 			continue;
1000 
1001 		/*
1002 		 * On systems with a backplane NUMA topology, compare groups
1003 		 * of nodes, and move tasks towards the group with the most
1004 		 * memory accesses. When comparing two nodes at distance
1005 		 * "hoplimit", only nodes closer by than "hoplimit" are part
1006 		 * of each group. Skip other nodes.
1007 		 */
1008 		if (sched_numa_topology_type == NUMA_BACKPLANE &&
1009 					dist > maxdist)
1010 			continue;
1011 
1012 		/* Add up the faults from nearby nodes. */
1013 		if (task)
1014 			faults = task_faults(p, node);
1015 		else
1016 			faults = group_faults(p, node);
1017 
1018 		/*
1019 		 * On systems with a glueless mesh NUMA topology, there are
1020 		 * no fixed "groups of nodes". Instead, nodes that are not
1021 		 * directly connected bounce traffic through intermediate
1022 		 * nodes; a numa_group can occupy any set of nodes.
1023 		 * The further away a node is, the less the faults count.
1024 		 * This seems to result in good task placement.
1025 		 */
1026 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1027 			faults *= (sched_max_numa_distance - dist);
1028 			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1029 		}
1030 
1031 		score += faults;
1032 	}
1033 
1034 	return score;
1035 }
1036 
1037 /*
1038  * These return the fraction of accesses done by a particular task, or
1039  * task group, on a particular numa node.  The group weight is given a
1040  * larger multiplier, in order to group tasks together that are almost
1041  * evenly spread out between numa nodes.
1042  */
1043 static inline unsigned long task_weight(struct task_struct *p, int nid,
1044 					int dist)
1045 {
1046 	unsigned long faults, total_faults;
1047 
1048 	if (!p->numa_faults)
1049 		return 0;
1050 
1051 	total_faults = p->total_numa_faults;
1052 
1053 	if (!total_faults)
1054 		return 0;
1055 
1056 	faults = task_faults(p, nid);
1057 	faults += score_nearby_nodes(p, nid, dist, true);
1058 
1059 	return 1000 * faults / total_faults;
1060 }
1061 
1062 static inline unsigned long group_weight(struct task_struct *p, int nid,
1063 					 int dist)
1064 {
1065 	unsigned long faults, total_faults;
1066 
1067 	if (!p->numa_group)
1068 		return 0;
1069 
1070 	total_faults = p->numa_group->total_faults;
1071 
1072 	if (!total_faults)
1073 		return 0;
1074 
1075 	faults = group_faults(p, nid);
1076 	faults += score_nearby_nodes(p, nid, dist, false);
1077 
1078 	return 1000 * faults / total_faults;
1079 }
1080 
1081 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1082 				int src_nid, int dst_cpu)
1083 {
1084 	struct numa_group *ng = p->numa_group;
1085 	int dst_nid = cpu_to_node(dst_cpu);
1086 	int last_cpupid, this_cpupid;
1087 
1088 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1089 
1090 	/*
1091 	 * Multi-stage node selection is used in conjunction with a periodic
1092 	 * migration fault to build a temporal task<->page relation. By using
1093 	 * a two-stage filter we remove short/unlikely relations.
1094 	 *
1095 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1096 	 * a task's usage of a particular page (n_p) per total usage of this
1097 	 * page (n_t) (in a given time-span) to a probability.
1098 	 *
1099 	 * Our periodic faults will sample this probability and getting the
1100 	 * same result twice in a row, given these samples are fully
1101 	 * independent, is then given by P(n)^2, provided our sample period
1102 	 * is sufficiently short compared to the usage pattern.
1103 	 *
1104 	 * This quadric squishes small probabilities, making it less likely we
1105 	 * act on an unlikely task<->page relation.
1106 	 */
1107 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1108 	if (!cpupid_pid_unset(last_cpupid) &&
1109 				cpupid_to_nid(last_cpupid) != dst_nid)
1110 		return false;
1111 
1112 	/* Always allow migrate on private faults */
1113 	if (cpupid_match_pid(p, last_cpupid))
1114 		return true;
1115 
1116 	/* A shared fault, but p->numa_group has not been set up yet. */
1117 	if (!ng)
1118 		return true;
1119 
1120 	/*
1121 	 * Do not migrate if the destination is not a node that
1122 	 * is actively used by this numa group.
1123 	 */
1124 	if (!node_isset(dst_nid, ng->active_nodes))
1125 		return false;
1126 
1127 	/*
1128 	 * Source is a node that is not actively used by this
1129 	 * numa group, while the destination is. Migrate.
1130 	 */
1131 	if (!node_isset(src_nid, ng->active_nodes))
1132 		return true;
1133 
1134 	/*
1135 	 * Both source and destination are nodes in active
1136 	 * use by this numa group. Maximize memory bandwidth
1137 	 * by migrating from more heavily used groups, to less
1138 	 * heavily used ones, spreading the load around.
1139 	 * Use a 1/4 hysteresis to avoid spurious page movement.
1140 	 */
1141 	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1142 }
1143 
/*
 * Forward declarations of static helpers defined elsewhere in this file.
 * The NUMA balancing code below needs the prototypes before the
 * definitions are seen; e.g. update_numa_stats() calls weighted_cpuload()
 * and capacity_of().
 */
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1149 
/*
 * Cached statistics for all CPUs within a node.
 *
 * Filled in by update_numa_stats(), which zeroes the structure and then
 * accumulates the per-CPU values of every CPU in the node's cpumask.
 */
struct numa_stats {
	/* Sum of rq->nr_running over the node's CPUs */
	unsigned long nr_running;
	/* Sum of weighted_cpuload() over the node's CPUs */
	unsigned long load;

	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;

	/* Approximate capacity in terms of runnable tasks on a node */
	unsigned long task_capacity;
	/* Non-zero when nr_running is below task_capacity */
	int has_free_capacity;
};
1162 
1163 /*
1164  * XXX borrowed from update_sg_lb_stats
1165  */
1166 static void update_numa_stats(struct numa_stats *ns, int nid)
1167 {
1168 	int smt, cpu, cpus = 0;
1169 	unsigned long capacity;
1170 
1171 	memset(ns, 0, sizeof(*ns));
1172 	for_each_cpu(cpu, cpumask_of_node(nid)) {
1173 		struct rq *rq = cpu_rq(cpu);
1174 
1175 		ns->nr_running += rq->nr_running;
1176 		ns->load += weighted_cpuload(cpu);
1177 		ns->compute_capacity += capacity_of(cpu);
1178 
1179 		cpus++;
1180 	}
1181 
1182 	/*
1183 	 * If we raced with hotplug and there are no CPUs left in our mask
1184 	 * the @ns structure is NULL'ed and task_numa_compare() will
1185 	 * not find this node attractive.
1186 	 *
1187 	 * We'll either bail at !has_free_capacity, or we'll detect a huge
1188 	 * imbalance and bail there.
1189 	 */
1190 	if (!cpus)
1191 		return;
1192 
1193 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1194 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1195 	capacity = cpus / smt; /* cores */
1196 
1197 	ns->task_capacity = min_t(unsigned, capacity,
1198 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1199 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1200 }
1201 
1202 struct task_numa_env {
1203 	struct task_struct *p;
1204 
1205 	int src_cpu, src_nid;
1206 	int dst_cpu, dst_nid;
1207 
1208 	struct numa_stats src_stats, dst_stats;
1209 
1210 	int imbalance_pct;
1211 	int dist;
1212 
1213 	struct task_struct *best_task;
1214 	long best_imp;
1215 	int best_cpu;
1216 };
1217 
1218 static void task_numa_assign(struct task_numa_env *env,
1219 			     struct task_struct *p, long imp)
1220 {
1221 	if (env->best_task)
1222 		put_task_struct(env->best_task);
1223 	if (p)
1224 		get_task_struct(p);
1225 
1226 	env->best_task = p;
1227 	env->best_imp = imp;
1228 	env->best_cpu = env->dst_cpu;
1229 }
1230 
1231 static bool load_too_imbalanced(long src_load, long dst_load,
1232 				struct task_numa_env *env)
1233 {
1234 	long imb, old_imb;
1235 	long orig_src_load, orig_dst_load;
1236 	long src_capacity, dst_capacity;
1237 
1238 	/*
1239 	 * The load is corrected for the CPU capacity available on each node.
1240 	 *
1241 	 * src_load        dst_load
1242 	 * ------------ vs ---------
1243 	 * src_capacity    dst_capacity
1244 	 */
1245 	src_capacity = env->src_stats.compute_capacity;
1246 	dst_capacity = env->dst_stats.compute_capacity;
1247 
1248 	/* We care about the slope of the imbalance, not the direction. */
1249 	if (dst_load < src_load)
1250 		swap(dst_load, src_load);
1251 
1252 	/* Is the difference below the threshold? */
1253 	imb = dst_load * src_capacity * 100 -
1254 	      src_load * dst_capacity * env->imbalance_pct;
1255 	if (imb <= 0)
1256 		return false;
1257 
1258 	/*
1259 	 * The imbalance is above the allowed threshold.
1260 	 * Compare it with the old imbalance.
1261 	 */
1262 	orig_src_load = env->src_stats.load;
1263 	orig_dst_load = env->dst_stats.load;
1264 
1265 	if (orig_dst_load < orig_src_load)
1266 		swap(orig_dst_load, orig_src_load);
1267 
1268 	old_imb = orig_dst_load * src_capacity * 100 -
1269 		  orig_src_load * dst_capacity * env->imbalance_pct;
1270 
1271 	/* Would this change make things worse? */
1272 	return (imb > old_imb);
1273 }
1274 
/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source task was migrated to the target dst_cpu, taking
 * into account that it might be best if the task running on the dst_cpu
 * were exchanged with the source task.
 *
 * @env:      migration state; env->dst_cpu is the candidate CPU
 * @taskimp:  task_weight() gain of env->p on the destination node
 * @groupimp: group_weight() gain of env->p on the destination node
 *
 * If the candidate beats env->best_imp and does not unbalance the nodes
 * too much, it is recorded through task_numa_assign(); otherwise @env is
 * left untouched.
 */
static void task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp)
{
	struct rq *src_rq = cpu_rq(env->src_cpu);
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	struct task_struct *cur;
	long src_load, dst_load;
	long load;
	/* Tasks in a numa_group are judged by the group's improvement. */
	long imp = env->p->numa_group ? groupimp : taskimp;
	/* Score of simply moving env->p, without swapping with anyone. */
	long moveimp = imp;
	int dist = env->dist;

	rcu_read_lock();

	raw_spin_lock_irq(&dst_rq->lock);
	cur = dst_rq->curr;
	/*
	 * No need to move the exiting task, and this ensures that ->curr
	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
	 * is safe under RCU read lock.
	 * Note that rcu_read_lock() itself can't protect from the final
	 * put_task_struct() after the last schedule().
	 */
	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
		cur = NULL;
	raw_spin_unlock_irq(&dst_rq->lock);

	/*
	 * Because we have preemption enabled we can get migrated around and
	 * end up trying to select ourselves (current == env->p) as a swap
	 * candidate.
	 */
	if (cur == env->p)
		goto unlock;

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 */
	if (cur) {
		/* Skip this swap candidate if cannot move to the source cpu */
		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
			goto unlock;

		/*
		 * If dst and source tasks are in the same NUMA group, or not
		 * in any group then look only at task weights.
		 */
		if (cur->numa_group == env->p->numa_group) {
			imp = taskimp + task_weight(cur, env->src_nid, dist) -
			      task_weight(cur, env->dst_nid, dist);
			/*
			 * Add some hysteresis to prevent swapping the
			 * tasks within a group over tiny differences.
			 */
			if (cur->numa_group)
				imp -= imp/16;
		} else {
			/*
			 * Compare the group weights. If a task is all by
			 * itself (not part of a group), use the task weight
			 * instead.
			 */
			if (cur->numa_group)
				imp += group_weight(cur, env->src_nid, dist) -
				       group_weight(cur, env->dst_nid, dist);
			else
				imp += task_weight(cur, env->src_nid, dist) -
				       task_weight(cur, env->dst_nid, dist);
		}
	}

	/* Neither a swap nor a plain move beats the best candidate so far. */
	if (imp <= env->best_imp && moveimp <= env->best_imp)
		goto unlock;

	if (!cur) {
		/* Is there capacity at our destination? */
		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
		    !env->dst_stats.has_free_capacity)
			goto unlock;

		goto balance;
	}

	/* Balance doesn't matter much if we're running a task per cpu */
	if (imp > env->best_imp && src_rq->nr_running == 1 &&
			dst_rq->nr_running == 1)
		goto assign;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
balance:
	/* Node loads as they would be after moving env->p to the destination. */
	load = task_h_load(env->p);
	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	if (moveimp > imp && moveimp > env->best_imp) {
		/*
		 * If the improvement from just moving env->p direction is
		 * better than swapping tasks around, check if a move is
		 * possible. Store a slightly smaller score than moveimp,
		 * so an actually idle CPU will win.
		 */
		if (!load_too_imbalanced(src_load, dst_load, env)) {
			imp = moveimp - 1;
			/* Record a plain move: no swap partner. */
			cur = NULL;
			goto assign;
		}
	}

	if (imp <= env->best_imp)
		goto unlock;

	/* For a swap, also account for cur moving the other way. */
	if (cur) {
		load = task_h_load(cur);
		dst_load -= load;
		src_load += load;
	}

	if (load_too_imbalanced(src_load, dst_load, env))
		goto unlock;

	/*
	 * One idle CPU per node is evaluated for a task numa move.
	 * Call select_idle_sibling to maybe find a better one.
	 */
	if (!cur)
		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);

assign:
	/* Stores env->dst_cpu as best_cpu and pins cur (if any) as best_task. */
	task_numa_assign(env, cur, imp);
unlock:
	rcu_read_unlock();
}
1418 
1419 static void task_numa_find_cpu(struct task_numa_env *env,
1420 				long taskimp, long groupimp)
1421 {
1422 	int cpu;
1423 
1424 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1425 		/* Skip this CPU if the source task cannot migrate */
1426 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1427 			continue;
1428 
1429 		env->dst_cpu = cpu;
1430 		task_numa_compare(env, taskimp, groupimp);
1431 	}
1432 }
1433 
1434 /* Only move tasks to a NUMA node less busy than the current node. */
1435 static bool numa_has_capacity(struct task_numa_env *env)
1436 {
1437 	struct numa_stats *src = &env->src_stats;
1438 	struct numa_stats *dst = &env->dst_stats;
1439 
1440 	if (src->has_free_capacity && !dst->has_free_capacity)
1441 		return false;
1442 
1443 	/*
1444 	 * Only consider a task move if the source has a higher load
1445 	 * than the destination, corrected for CPU capacity on each node.
1446 	 *
1447 	 *      src->load                dst->load
1448 	 * --------------------- vs ---------------------
1449 	 * src->compute_capacity    dst->compute_capacity
1450 	 */
1451 	if (src->load * dst->compute_capacity * env->imbalance_pct >
1452 
1453 	    dst->load * src->compute_capacity * 100)
1454 		return true;
1455 
1456 	return false;
1457 }
1458 
1459 static int task_numa_migrate(struct task_struct *p)
1460 {
1461 	struct task_numa_env env = {
1462 		.p = p,
1463 
1464 		.src_cpu = task_cpu(p),
1465 		.src_nid = task_node(p),
1466 
1467 		.imbalance_pct = 112,
1468 
1469 		.best_task = NULL,
1470 		.best_imp = 0,
1471 		.best_cpu = -1
1472 	};
1473 	struct sched_domain *sd;
1474 	unsigned long taskweight, groupweight;
1475 	int nid, ret, dist;
1476 	long taskimp, groupimp;
1477 
1478 	/*
1479 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1480 	 * imbalance and would be the first to start moving tasks about.
1481 	 *
1482 	 * And we want to avoid any moving of tasks about, as that would create
1483 	 * random movement of tasks -- counter the numa conditions we're trying
1484 	 * to satisfy here.
1485 	 */
1486 	rcu_read_lock();
1487 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1488 	if (sd)
1489 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1490 	rcu_read_unlock();
1491 
1492 	/*
1493 	 * Cpusets can break the scheduler domain tree into smaller
1494 	 * balance domains, some of which do not cross NUMA boundaries.
1495 	 * Tasks that are "trapped" in such domains cannot be migrated
1496 	 * elsewhere, so there is no point in (re)trying.
1497 	 */
1498 	if (unlikely(!sd)) {
1499 		p->numa_preferred_nid = task_node(p);
1500 		return -EINVAL;
1501 	}
1502 
1503 	env.dst_nid = p->numa_preferred_nid;
1504 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1505 	taskweight = task_weight(p, env.src_nid, dist);
1506 	groupweight = group_weight(p, env.src_nid, dist);
1507 	update_numa_stats(&env.src_stats, env.src_nid);
1508 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1509 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1510 	update_numa_stats(&env.dst_stats, env.dst_nid);
1511 
1512 	/* Try to find a spot on the preferred nid. */
1513 	if (numa_has_capacity(&env))
1514 		task_numa_find_cpu(&env, taskimp, groupimp);
1515 
1516 	/*
1517 	 * Look at other nodes in these cases:
1518 	 * - there is no space available on the preferred_nid
1519 	 * - the task is part of a numa_group that is interleaved across
1520 	 *   multiple NUMA nodes; in order to better consolidate the group,
1521 	 *   we need to check other locations.
1522 	 */
1523 	if (env.best_cpu == -1 || (p->numa_group &&
1524 			nodes_weight(p->numa_group->active_nodes) > 1)) {
1525 		for_each_online_node(nid) {
1526 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1527 				continue;
1528 
1529 			dist = node_distance(env.src_nid, env.dst_nid);
1530 			if (sched_numa_topology_type == NUMA_BACKPLANE &&
1531 						dist != env.dist) {
1532 				taskweight = task_weight(p, env.src_nid, dist);
1533 				groupweight = group_weight(p, env.src_nid, dist);
1534 			}
1535 
1536 			/* Only consider nodes where both task and groups benefit */
1537 			taskimp = task_weight(p, nid, dist) - taskweight;
1538 			groupimp = group_weight(p, nid, dist) - groupweight;
1539 			if (taskimp < 0 && groupimp < 0)
1540 				continue;
1541 
1542 			env.dist = dist;
1543 			env.dst_nid = nid;
1544 			update_numa_stats(&env.dst_stats, env.dst_nid);
1545 			if (numa_has_capacity(&env))
1546 				task_numa_find_cpu(&env, taskimp, groupimp);
1547 		}
1548 	}
1549 
1550 	/*
1551 	 * If the task is part of a workload that spans multiple NUMA nodes,
1552 	 * and is migrating into one of the workload's active nodes, remember
1553 	 * this node as the task's preferred numa node, so the workload can
1554 	 * settle down.
1555 	 * A task that migrated to a second choice node will be better off
1556 	 * trying for a better one later. Do not set the preferred node here.
1557 	 */
1558 	if (p->numa_group) {
1559 		if (env.best_cpu == -1)
1560 			nid = env.src_nid;
1561 		else
1562 			nid = env.dst_nid;
1563 
1564 		if (node_isset(nid, p->numa_group->active_nodes))
1565 			sched_setnuma(p, env.dst_nid);
1566 	}
1567 
1568 	/* No better CPU than the current one was found. */
1569 	if (env.best_cpu == -1)
1570 		return -EAGAIN;
1571 
1572 	/*
1573 	 * Reset the scan period if the task is being rescheduled on an
1574 	 * alternative node to recheck if the tasks is now properly placed.
1575 	 */
1576 	p->numa_scan_period = task_scan_min(p);
1577 
1578 	if (env.best_task == NULL) {
1579 		ret = migrate_task_to(p, env.best_cpu);
1580 		if (ret != 0)
1581 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1582 		return ret;
1583 	}
1584 
1585 	ret = migrate_swap(p, env.best_task);
1586 	if (ret != 0)
1587 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1588 	put_task_struct(env.best_task);
1589 	return ret;
1590 }
1591 
1592 /* Attempt to migrate a task to a CPU on the preferred node. */
1593 static void numa_migrate_preferred(struct task_struct *p)
1594 {
1595 	unsigned long interval = HZ;
1596 
1597 	/* This task has no NUMA fault statistics yet */
1598 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1599 		return;
1600 
1601 	/* Periodically retry migrating the task to the preferred node */
1602 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1603 	p->numa_migrate_retry = jiffies + interval;
1604 
1605 	/* Success if task is already running on preferred CPU */
1606 	if (task_node(p) == p->numa_preferred_nid)
1607 		return;
1608 
1609 	/* Otherwise, try migrate to a CPU on the preferred node */
1610 	task_numa_migrate(p);
1611 }
1612 
1613 /*
1614  * Find the nodes on which the workload is actively running. We do this by
1615  * tracking the nodes from which NUMA hinting faults are triggered. This can
1616  * be different from the set of nodes where the workload's memory is currently
1617  * located.
1618  *
1619  * The bitmask is used to make smarter decisions on when to do NUMA page
1620  * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1621  * are added when they cause over 6/16 of the maximum number of faults, but
1622  * only removed when they drop below 3/16.
1623  */
1624 static void update_numa_active_node_mask(struct numa_group *numa_group)
1625 {
1626 	unsigned long faults, max_faults = 0;
1627 	int nid;
1628 
1629 	for_each_online_node(nid) {
1630 		faults = group_faults_cpu(numa_group, nid);
1631 		if (faults > max_faults)
1632 			max_faults = faults;
1633 	}
1634 
1635 	for_each_online_node(nid) {
1636 		faults = group_faults_cpu(numa_group, nid);
1637 		if (!node_isset(nid, numa_group->active_nodes)) {
1638 			if (faults > max_faults * 6 / 16)
1639 				node_set(nid, numa_group->active_nodes);
1640 		} else if (faults < max_faults * 3 / 16)
1641 			node_clear(nid, numa_group->active_nodes);
1642 	}
1643 }
1644 
1645 /*
1646  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1647  * increments. The more local the fault statistics are, the higher the scan
1648  * period will be for the next scan window. If local/(local+remote) ratio is
1649  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1650  * the scan period will decrease. Aim for 70% local accesses.
1651  */
1652 #define NUMA_PERIOD_SLOTS 10
1653 #define NUMA_PERIOD_THRESHOLD 7
1654 
1655 /*
1656  * Increase the scan period (slow down scanning) if the majority of
1657  * our memory is already on our local node, or if the majority of
1658  * the page accesses are shared with other processes.
1659  * Otherwise, decrease the scan period.
1660  */
1661 static void update_task_scan_period(struct task_struct *p,
1662 			unsigned long shared, unsigned long private)
1663 {
1664 	unsigned int period_slot;
1665 	int ratio;
1666 	int diff;
1667 
1668 	unsigned long remote = p->numa_faults_locality[0];
1669 	unsigned long local = p->numa_faults_locality[1];
1670 
1671 	/*
1672 	 * If there were no record hinting faults then either the task is
1673 	 * completely idle or all activity is areas that are not of interest
1674 	 * to automatic numa balancing. Related to that, if there were failed
1675 	 * migration then it implies we are migrating too quickly or the local
1676 	 * node is overloaded. In either case, scan slower
1677 	 */
1678 	if (local + shared == 0 || p->numa_faults_locality[2]) {
1679 		p->numa_scan_period = min(p->numa_scan_period_max,
1680 			p->numa_scan_period << 1);
1681 
1682 		p->mm->numa_next_scan = jiffies +
1683 			msecs_to_jiffies(p->numa_scan_period);
1684 
1685 		return;
1686 	}
1687 
1688 	/*
1689 	 * Prepare to scale scan period relative to the current period.
1690 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1691 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1692 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1693 	 */
1694 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1695 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1696 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1697 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1698 		if (!slot)
1699 			slot = 1;
1700 		diff = slot * period_slot;
1701 	} else {
1702 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1703 
1704 		/*
1705 		 * Scale scan rate increases based on sharing. There is an
1706 		 * inverse relationship between the degree of sharing and
1707 		 * the adjustment made to the scanning period. Broadly
1708 		 * speaking the intent is that there is little point
1709 		 * scanning faster if shared accesses dominate as it may
1710 		 * simply bounce migrations uselessly
1711 		 */
1712 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1713 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1714 	}
1715 
1716 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1717 			task_scan_min(p), task_scan_max(p));
1718 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1719 }
1720 
1721 /*
1722  * Get the fraction of time the task has been running since the last
1723  * NUMA placement cycle. The scheduler keeps similar statistics, but
1724  * decays those on a 32ms period, which is orders of magnitude off
1725  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1726  * stats only if the task is so new there are no NUMA statistics yet.
1727  */
1728 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1729 {
1730 	u64 runtime, delta, now;
1731 	/* Use the start of this time slice to avoid calculations. */
1732 	now = p->se.exec_start;
1733 	runtime = p->se.sum_exec_runtime;
1734 
1735 	if (p->last_task_numa_placement) {
1736 		delta = runtime - p->last_sum_exec_runtime;
1737 		*period = now - p->last_task_numa_placement;
1738 	} else {
1739 		delta = p->se.avg.load_sum / p->se.load.weight;
1740 		*period = LOAD_AVG_MAX;
1741 	}
1742 
1743 	p->last_sum_exec_runtime = runtime;
1744 	p->last_task_numa_placement = now;
1745 
1746 	return delta;
1747 }
1748 
/*
 * Determine the preferred nid for a task in a numa_group. This needs to
 * be done in a way that produces consistent results with group_weight,
 * otherwise workloads might not converge.
 *
 * @p:   task whose numa_group statistics are used
 * @nid: node to fall back to; returned unchanged on NUMA_DIRECT systems
 *       and whenever no better node is found
 *
 * Returns the node id selected for the group.
 */
static int preferred_group_nid(struct task_struct *p, int nid)
{
	nodemask_t nodes;
	int dist;

	/* Direct connections between all NUMA nodes. */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return nid;

	/*
	 * On a system with glueless mesh NUMA topology, group_weight
	 * scores nodes according to the number of NUMA hinting faults on
	 * both the node itself, and on nearby nodes.
	 */
	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
		unsigned long score, max_score = 0;
		int node, max_node = nid;

		dist = sched_max_numa_distance;

		/* Pick the online node with the highest group_weight(). */
		for_each_online_node(node) {
			score = group_weight(p, node, dist);
			if (score > max_score) {
				max_score = score;
				max_node = node;
			}
		}
		return max_node;
	}

	/*
	 * Finding the preferred nid in a system with NUMA backplane
	 * interconnect topology is more involved. The goal is to locate
	 * tasks from numa_groups near each other in the system, and
	 * untangle workloads from different sides of the system. This requires
	 * searching down the hierarchy of node groups, recursively searching
	 * inside the highest scoring group of nodes. The nodemask tricks
	 * keep the complexity of the search down.
	 */
	nodes = node_online_map;
	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		nodemask_t max_group = NODE_MASK_NONE;
		int a, b;

		/* Are there nodes at this distance from each other? */
		if (!find_numa_distance(dist))
			continue;

		for_each_node_mask(a, nodes) {
			unsigned long faults = 0;
			nodemask_t this_group;
			nodes_clear(this_group);

			/* Sum group's NUMA faults; includes a==b case. */
			for_each_node_mask(b, nodes) {
				if (node_distance(a, b) < dist) {
					faults += group_faults(p, b);
					node_set(b, this_group);
					/*
					 * b now belongs to a's group; removing
					 * it from @nodes keeps it from being
					 * counted again in this round.
					 */
					node_clear(b, nodes);
				}
			}

			/* Remember the top group. */
			if (faults > max_faults) {
				max_faults = faults;
				max_group = this_group;
				/*
				 * subtle: at the smallest distance there is
				 * just one node left in each "group", the
				 * winner is the preferred nid.
				 */
				nid = a;
			}
		}
		/* Next round, evaluate the nodes within max_group. */
		if (!max_faults)
			break;
		nodes = max_group;
	}
	return nid;
}
1836 
/*
 * task_numa_placement - fold new fault samples into @p's NUMA statistics
 * and re-evaluate its preferred node.
 *
 * Does nothing unless p->mm->numa_scan_seq has advanced since the last
 * call.  Otherwise, for every online node, the existing fault counts are
 * decayed by half and the counts buffered since the last scan are added;
 * the same deltas are applied to the task's numa_group, if any.  The scan
 * period is then adapted and, if any faults were recorded, the preferred
 * node is updated and a migration towards it is attempted.
 */
static void task_numa_placement(struct task_struct *p)
{
	int seq, nid, max_nid = -1, max_group_nid = -1;
	unsigned long max_faults = 0, max_group_faults = 0;
	/* Buffered faults seen in this window, indexed by the priv loop below. */
	unsigned long fault_types[2] = { 0, 0 };
	unsigned long total_faults;
	u64 runtime, period;
	spinlock_t *group_lock = NULL;

	/*
	 * The p->mm->numa_scan_seq field gets updated without
	 * exclusive access. Use READ_ONCE() here to ensure
	 * that the field is read in a single access:
	 */
	seq = READ_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	/* Remote plus local faults; used to normalize the CPU fault weights. */
	total_faults = p->numa_faults_locality[0] +
		       p->numa_faults_locality[1];
	runtime = numa_get_avg_runtime(p, &period);

	/* If the task is part of a group prevent parallel updates to group stats */
	if (p->numa_group) {
		group_lock = &p->numa_group->lock;
		spin_lock_irq(group_lock);
	}

	/* Find the node with the highest number of faults */
	for_each_online_node(nid) {
		/* Keep track of the offsets in numa_faults array */
		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
		unsigned long faults = 0, group_faults = 0;
		int priv;

		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
			long diff, f_diff, f_weight;

			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

			/* Decay existing window, copy faults since last scan */
			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
			fault_types[priv] += p->numa_faults[membuf_idx];
			p->numa_faults[membuf_idx] = 0;

			/*
			 * Normalize the faults_from, so all tasks in a group
			 * count according to CPU use, instead of by the raw
			 * number of faults. Tasks with little runtime have
			 * little over-all impact on throughput, and thus their
			 * faults are less important.
			 */
			f_weight = div64_u64(runtime << 16, period + 1);
			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				   (total_faults + 1);
			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
			p->numa_faults[cpubuf_idx] = 0;

			/* Apply the deltas: new = old - old/2 + buffered. */
			p->numa_faults[mem_idx] += diff;
			p->numa_faults[cpu_idx] += f_diff;
			faults += p->numa_faults[mem_idx];
			p->total_numa_faults += diff;
			if (p->numa_group) {
				/*
				 * safe because we can only change our own group
				 *
				 * mem_idx represents the offset for a given
				 * nid and priv in a specific region because it
				 * is at the beginning of the numa_faults array.
				 */
				p->numa_group->faults[mem_idx] += diff;
				p->numa_group->faults_cpu[mem_idx] += f_diff;
				p->numa_group->total_faults += diff;
				group_faults += p->numa_group->faults[mem_idx];
			}
		}

		/* Track the node with the most task faults ... */
		if (faults > max_faults) {
			max_faults = faults;
			max_nid = nid;
		}

		/* ... and the node with the most group faults. */
		if (group_faults > max_group_faults) {
			max_group_faults = group_faults;
			max_group_nid = nid;
		}
	}

	update_task_scan_period(p, fault_types[0], fault_types[1]);

	/* For grouped tasks the group's statistics decide the preferred node. */
	if (p->numa_group) {
		update_numa_active_node_mask(p->numa_group);
		spin_unlock_irq(group_lock);
		max_nid = preferred_group_nid(p, max_group_nid);
	}

	if (max_faults) {
		/* Set the new preferred node */
		if (max_nid != p->numa_preferred_nid)
			sched_setnuma(p, max_nid);

		if (task_node(p) != p->numa_preferred_nid)
			numa_migrate_preferred(p);
	}
}
1947 
1948 static inline int get_numa_group(struct numa_group *grp)
1949 {
1950 	return atomic_inc_not_zero(&grp->refcount);
1951 }
1952 
1953 static inline void put_numa_group(struct numa_group *grp)
1954 {
1955 	if (atomic_dec_and_test(&grp->refcount))
1956 		kfree_rcu(grp, rcu);
1957 }
1958 
/*
 * task_numa_group - set up @p's numa_group and possibly join another one
 * @p:      task that took the fault
 * @cpupid: cpu/pid value identifying the other accessor of the page
 * @flags:  TNF_* fault flags; TNF_SHARED allows joining across processes
 * @priv:   out parameter; set to !join once the join decision is reached
 *          (left untouched on the earlier no_join exits)
 *
 * If @p has no numa_group yet, a private one is allocated and seeded with
 * the task's fault statistics.  Then the task currently running on the CPU
 * encoded in @cpupid is inspected; if its pid matches and it belongs to a
 * different, suitable group, @p's statistics are moved over and @p joins
 * that group.
 */
static void task_numa_group(struct task_struct *p, int cpupid, int flags,
			int *priv)
{
	struct numa_group *grp, *my_grp;
	struct task_struct *tsk;
	bool join = false;
	int cpu = cpupid_to_cpu(cpupid);
	int i;

	if (unlikely(!p->numa_group)) {
		/* Group header plus 4 counters per node for the faults array. */
		unsigned int size = sizeof(struct numa_group) +
				    4*nr_node_ids*sizeof(unsigned long);

		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
		if (!grp)
			return;

		atomic_set(&grp->refcount, 1);
		spin_lock_init(&grp->lock);
		grp->gid = p->pid;
		/* Second half of the array tracks nids where faults happen */
		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
						nr_node_ids;

		node_set(task_node(current), grp->active_nodes);

		/* Seed the new group with the task's own statistics. */
		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] = p->numa_faults[i];

		grp->total_faults = p->total_numa_faults;

		grp->nr_tasks++;
		rcu_assign_pointer(p->numa_group, grp);
	}

	rcu_read_lock();
	/* Task currently running on the CPU recorded in @cpupid. */
	tsk = READ_ONCE(cpu_rq(cpu)->curr);

	if (!cpupid_match_pid(tsk, cpupid))
		goto no_join;

	grp = rcu_dereference(tsk->numa_group);
	if (!grp)
		goto no_join;

	my_grp = p->numa_group;
	if (grp == my_grp)
		goto no_join;

	/*
	 * Only join the other group if its bigger; if we're the bigger group,
	 * the other task will join us.
	 */
	if (my_grp->nr_tasks > grp->nr_tasks)
		goto no_join;

	/*
	 * Tie-break on the grp address.
	 */
	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
		goto no_join;

	/* Always join threads in the same process. */
	if (tsk->mm == current->mm)
		join = true;

	/* Simple filter to avoid false positives due to PID collisions */
	if (flags & TNF_SHARED)
		join = true;

	/* Update priv based on whether false sharing was detected */
	*priv = !join;

	/* Pin the target group; give up if it is already being torn down. */
	if (join && !get_numa_group(grp))
		goto no_join;

	rcu_read_unlock();

	if (!join)
		return;

	BUG_ON(irqs_disabled());
	double_lock_irq(&my_grp->lock, &grp->lock);

	/* Move this task's contribution from the old group to the new one. */
	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
		my_grp->faults[i] -= p->numa_faults[i];
		grp->faults[i] += p->numa_faults[i];
	}
	my_grp->total_faults -= p->total_numa_faults;
	grp->total_faults += p->total_numa_faults;

	my_grp->nr_tasks--;
	grp->nr_tasks++;

	spin_unlock(&my_grp->lock);
	spin_unlock_irq(&grp->lock);

	rcu_assign_pointer(p->numa_group, grp);

	/* Drop the task's reference on the group it just left. */
	put_numa_group(my_grp);
	return;

no_join:
	rcu_read_unlock();
	return;
}
2065 
2066 void task_numa_free(struct task_struct *p)
2067 {
2068 	struct numa_group *grp = p->numa_group;
2069 	void *numa_faults = p->numa_faults;
2070 	unsigned long flags;
2071 	int i;
2072 
2073 	if (grp) {
2074 		spin_lock_irqsave(&grp->lock, flags);
2075 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2076 			grp->faults[i] -= p->numa_faults[i];
2077 		grp->total_faults -= p->total_numa_faults;
2078 
2079 		grp->nr_tasks--;
2080 		spin_unlock_irqrestore(&grp->lock, flags);
2081 		RCU_INIT_POINTER(p->numa_group, NULL);
2082 		put_numa_group(grp);
2083 	}
2084 
2085 	p->numa_faults = NULL;
2086 	kfree(numa_faults);
2087 }
2088 
2089 /*
2090  * Got a PROT_NONE fault for a page on @node.
2091  */
2092 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2093 {
2094 	struct task_struct *p = current;
2095 	bool migrated = flags & TNF_MIGRATED;
2096 	int cpu_node = task_node(current);
2097 	int local = !!(flags & TNF_FAULT_LOCAL);
2098 	int priv;
2099 
2100 	if (!static_branch_likely(&sched_numa_balancing))
2101 		return;
2102 
2103 	/* for example, ksmd faulting in a user's mm */
2104 	if (!p->mm)
2105 		return;
2106 
2107 	/* Allocate buffer to track faults on a per-node basis */
2108 	if (unlikely(!p->numa_faults)) {
2109 		int size = sizeof(*p->numa_faults) *
2110 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2111 
2112 		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2113 		if (!p->numa_faults)
2114 			return;
2115 
2116 		p->total_numa_faults = 0;
2117 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2118 	}
2119 
2120 	/*
2121 	 * First accesses are treated as private, otherwise consider accesses
2122 	 * to be private if the accessing pid has not changed
2123 	 */
2124 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2125 		priv = 1;
2126 	} else {
2127 		priv = cpupid_match_pid(p, last_cpupid);
2128 		if (!priv && !(flags & TNF_NO_GROUP))
2129 			task_numa_group(p, last_cpupid, flags, &priv);
2130 	}
2131 
2132 	/*
2133 	 * If a workload spans multiple NUMA nodes, a shared fault that
2134 	 * occurs wholly within the set of nodes that the workload is
2135 	 * actively using should be counted as local. This allows the
2136 	 * scan rate to slow down when a workload has settled down.
2137 	 */
2138 	if (!priv && !local && p->numa_group &&
2139 			node_isset(cpu_node, p->numa_group->active_nodes) &&
2140 			node_isset(mem_node, p->numa_group->active_nodes))
2141 		local = 1;
2142 
2143 	task_numa_placement(p);
2144 
2145 	/*
2146 	 * Retry task to preferred node migration periodically, in case it
2147 	 * case it previously failed, or the scheduler moved us.
2148 	 */
2149 	if (time_after(jiffies, p->numa_migrate_retry))
2150 		numa_migrate_preferred(p);
2151 
2152 	if (migrated)
2153 		p->numa_pages_migrated += pages;
2154 	if (flags & TNF_MIGRATE_FAIL)
2155 		p->numa_faults_locality[2] += pages;
2156 
2157 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2158 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2159 	p->numa_faults_locality[local] += pages;
2160 }
2161 
2162 static void reset_ptenuma_scan(struct task_struct *p)
2163 {
2164 	/*
2165 	 * We only did a read acquisition of the mmap sem, so
2166 	 * p->mm->numa_scan_seq is written to without exclusive access
2167 	 * and the update is not guaranteed to be atomic. That's not
2168 	 * much of an issue though, since this is just used for
2169 	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2170 	 * expensive, to avoid any form of compiler optimizations:
2171 	 */
2172 	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2173 	p->mm->numa_scan_offset = 0;
2174 }
2175 
2176 /*
2177  * The expensive part of numa migration is done from task_work context.
2178  * Triggered from task_tick_numa().
2179  */
2180 void task_numa_work(struct callback_head *work)
2181 {
2182 	unsigned long migrate, next_scan, now = jiffies;
2183 	struct task_struct *p = current;
2184 	struct mm_struct *mm = p->mm;
2185 	u64 runtime = p->se.sum_exec_runtime;
2186 	struct vm_area_struct *vma;
2187 	unsigned long start, end;
2188 	unsigned long nr_pte_updates = 0;
2189 	long pages, virtpages;
2190 
2191 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2192 
2193 	work->next = work; /* protect against double add */
2194 	/*
2195 	 * Who cares about NUMA placement when they're dying.
2196 	 *
2197 	 * NOTE: make sure not to dereference p->mm before this check,
2198 	 * exit_task_work() happens _after_ exit_mm() so we could be called
2199 	 * without p->mm even though we still had it when we enqueued this
2200 	 * work.
2201 	 */
2202 	if (p->flags & PF_EXITING)
2203 		return;
2204 
2205 	if (!mm->numa_next_scan) {
2206 		mm->numa_next_scan = now +
2207 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2208 	}
2209 
2210 	/*
2211 	 * Enforce maximal scan/migration frequency..
2212 	 */
2213 	migrate = mm->numa_next_scan;
2214 	if (time_before(now, migrate))
2215 		return;
2216 
2217 	if (p->numa_scan_period == 0) {
2218 		p->numa_scan_period_max = task_scan_max(p);
2219 		p->numa_scan_period = task_scan_min(p);
2220 	}
2221 
2222 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2223 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2224 		return;
2225 
2226 	/*
2227 	 * Delay this task enough that another task of this mm will likely win
2228 	 * the next time around.
2229 	 */
2230 	p->node_stamp += 2 * TICK_NSEC;
2231 
2232 	start = mm->numa_scan_offset;
2233 	pages = sysctl_numa_balancing_scan_size;
2234 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2235 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
2236 	if (!pages)
2237 		return;
2238 
2239 
2240 	down_read(&mm->mmap_sem);
2241 	vma = find_vma(mm, start);
2242 	if (!vma) {
2243 		reset_ptenuma_scan(p);
2244 		start = 0;
2245 		vma = mm->mmap;
2246 	}
2247 	for (; vma; vma = vma->vm_next) {
2248 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2249 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2250 			continue;
2251 		}
2252 
2253 		/*
2254 		 * Shared library pages mapped by multiple processes are not
2255 		 * migrated as it is expected they are cache replicated. Avoid
2256 		 * hinting faults in read-only file-backed mappings or the vdso
2257 		 * as migrating the pages will be of marginal benefit.
2258 		 */
2259 		if (!vma->vm_mm ||
2260 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2261 			continue;
2262 
2263 		/*
2264 		 * Skip inaccessible VMAs to avoid any confusion between
2265 		 * PROT_NONE and NUMA hinting ptes
2266 		 */
2267 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2268 			continue;
2269 
2270 		do {
2271 			start = max(start, vma->vm_start);
2272 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2273 			end = min(end, vma->vm_end);
2274 			nr_pte_updates = change_prot_numa(vma, start, end);
2275 
2276 			/*
2277 			 * Try to scan sysctl_numa_balancing_size worth of
2278 			 * hpages that have at least one present PTE that
2279 			 * is not already pte-numa. If the VMA contains
2280 			 * areas that are unused or already full of prot_numa
2281 			 * PTEs, scan up to virtpages, to skip through those
2282 			 * areas faster.
2283 			 */
2284 			if (nr_pte_updates)
2285 				pages -= (end - start) >> PAGE_SHIFT;
2286 			virtpages -= (end - start) >> PAGE_SHIFT;
2287 
2288 			start = end;
2289 			if (pages <= 0 || virtpages <= 0)
2290 				goto out;
2291 
2292 			cond_resched();
2293 		} while (end != vma->vm_end);
2294 	}
2295 
2296 out:
2297 	/*
2298 	 * It is possible to reach the end of the VMA list but the last few
2299 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2300 	 * would find the !migratable VMA on the next scan but not reset the
2301 	 * scanner to the start so check it now.
2302 	 */
2303 	if (vma)
2304 		mm->numa_scan_offset = start;
2305 	else
2306 		reset_ptenuma_scan(p);
2307 	up_read(&mm->mmap_sem);
2308 
2309 	/*
2310 	 * Make sure tasks use at least 32x as much time to run other code
2311 	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2312 	 * Usually update_task_scan_period slows down scanning enough; on an
2313 	 * overloaded system we need to limit overhead on a per task basis.
2314 	 */
2315 	if (unlikely(p->se.sum_exec_runtime != runtime)) {
2316 		u64 diff = p->se.sum_exec_runtime - runtime;
2317 		p->node_stamp += 32 * diff;
2318 	}
2319 }
2320 
2321 /*
2322  * Drive the periodic memory faults..
2323  */
2324 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2325 {
2326 	struct callback_head *work = &curr->numa_work;
2327 	u64 period, now;
2328 
2329 	/*
2330 	 * We don't care about NUMA placement if we don't have memory.
2331 	 */
2332 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2333 		return;
2334 
2335 	/*
2336 	 * Using runtime rather than walltime has the dual advantage that
2337 	 * we (mostly) drive the selection from busy threads and that the
2338 	 * task needs to have done some actual work before we bother with
2339 	 * NUMA placement.
2340 	 */
2341 	now = curr->se.sum_exec_runtime;
2342 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2343 
2344 	if (now > curr->node_stamp + period) {
2345 		if (!curr->node_stamp)
2346 			curr->numa_scan_period = task_scan_min(curr);
2347 		curr->node_stamp += period;
2348 
2349 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2350 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2351 			task_work_add(curr, work, true);
2352 		}
2353 	}
2354 }
2355 #else
/* !CONFIG_NUMA_BALANCING: the tick has no NUMA work to drive. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr) {}
2359 
/* !CONFIG_NUMA_BALANCING: nothing to account on enqueue. */
static inline void
account_numa_enqueue(struct rq *rq, struct task_struct *p) {}
2363 
/* !CONFIG_NUMA_BALANCING: nothing to account on dequeue. */
static inline void
account_numa_dequeue(struct rq *rq, struct task_struct *p) {}
2367 #endif /* CONFIG_NUMA_BALANCING */
2368 
2369 static void
2370 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2371 {
2372 	update_load_add(&cfs_rq->load, se->load.weight);
2373 	if (!parent_entity(se))
2374 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2375 #ifdef CONFIG_SMP
2376 	if (entity_is_task(se)) {
2377 		struct rq *rq = rq_of(cfs_rq);
2378 
2379 		account_numa_enqueue(rq, task_of(se));
2380 		list_add(&se->group_node, &rq->cfs_tasks);
2381 	}
2382 #endif
2383 	cfs_rq->nr_running++;
2384 }
2385 
2386 static void
2387 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2388 {
2389 	update_load_sub(&cfs_rq->load, se->load.weight);
2390 	if (!parent_entity(se))
2391 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2392 	if (entity_is_task(se)) {
2393 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2394 		list_del_init(&se->group_node);
2395 	}
2396 	cfs_rq->nr_running--;
2397 }
2398 
2399 #ifdef CONFIG_FAIR_GROUP_SCHED
2400 # ifdef CONFIG_SMP
2401 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2402 {
2403 	long tg_weight;
2404 
2405 	/*
2406 	 * Use this CPU's real-time load instead of the last load contribution
2407 	 * as the updating of the contribution is delayed, and we will use the
2408 	 * the real-time load to calc the share. See update_tg_load_avg().
2409 	 */
2410 	tg_weight = atomic_long_read(&tg->load_avg);
2411 	tg_weight -= cfs_rq->tg_load_avg_contrib;
2412 	tg_weight += cfs_rq->load.weight;
2413 
2414 	return tg_weight;
2415 }
2416 
2417 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2418 {
2419 	long tg_weight, load, shares;
2420 
2421 	tg_weight = calc_tg_weight(tg, cfs_rq);
2422 	load = cfs_rq->load.weight;
2423 
2424 	shares = (tg->shares * load);
2425 	if (tg_weight)
2426 		shares /= tg_weight;
2427 
2428 	if (shares < MIN_SHARES)
2429 		shares = MIN_SHARES;
2430 	if (shares > tg->shares)
2431 		shares = tg->shares;
2432 
2433 	return shares;
2434 }
2435 # else /* CONFIG_SMP */
2436 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2437 {
2438 	return tg->shares;
2439 }
2440 # endif /* CONFIG_SMP */
2441 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2442 			    unsigned long weight)
2443 {
2444 	if (se->on_rq) {
2445 		/* commit outstanding execution time */
2446 		if (cfs_rq->curr == se)
2447 			update_curr(cfs_rq);
2448 		account_entity_dequeue(cfs_rq, se);
2449 	}
2450 
2451 	update_load_set(&se->load, weight);
2452 
2453 	if (se->on_rq)
2454 		account_entity_enqueue(cfs_rq, se);
2455 }
2456 
2457 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2458 
2459 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2460 {
2461 	struct task_group *tg;
2462 	struct sched_entity *se;
2463 	long shares;
2464 
2465 	tg = cfs_rq->tg;
2466 	se = tg->se[cpu_of(rq_of(cfs_rq))];
2467 	if (!se || throttled_hierarchy(cfs_rq))
2468 		return;
2469 #ifndef CONFIG_SMP
2470 	if (likely(se->load.weight == tg->shares))
2471 		return;
2472 #endif
2473 	shares = calc_cfs_shares(cfs_rq, tg);
2474 
2475 	reweight_entity(cfs_rq_of(se), se, shares);
2476 }
2477 #else /* CONFIG_FAIR_GROUP_SCHED */
/* !CONFIG_FAIR_GROUP_SCHED: there are no group shares to update. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq) {}
2481 #endif /* CONFIG_FAIR_GROUP_SCHED */
2482 
2483 #ifdef CONFIG_SMP
2484 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2485 static const u32 runnable_avg_yN_inv[] = {
2486 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2487 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2488 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2489 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2490 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2491 	0x85aac367, 0x82cd8698,
2492 };
2493 
2494 /*
2495  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2496  * over-estimates when re-combining.
2497  */
2498 static const u32 runnable_avg_yN_sum[] = {
2499 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2500 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2501 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2502 };
2503 
2504 /*
2505  * Approximate:
2506  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2507  */
2508 static __always_inline u64 decay_load(u64 val, u64 n)
2509 {
2510 	unsigned int local_n;
2511 
2512 	if (!n)
2513 		return val;
2514 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2515 		return 0;
2516 
2517 	/* after bounds checking we can collapse to 32-bit */
2518 	local_n = n;
2519 
2520 	/*
2521 	 * As y^PERIOD = 1/2, we can combine
2522 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2523 	 * With a look-up table which covers y^n (n<PERIOD)
2524 	 *
2525 	 * To achieve constant time decay_load.
2526 	 */
2527 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2528 		val >>= local_n / LOAD_AVG_PERIOD;
2529 		local_n %= LOAD_AVG_PERIOD;
2530 	}
2531 
2532 	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2533 	return val;
2534 }
2535 
2536 /*
2537  * For updates fully spanning n periods, the contribution to runnable
2538  * average will be: \Sum 1024*y^n
2539  *
2540  * We can compute this reasonably efficiently by combining:
2541  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2542  */
2543 static u32 __compute_runnable_contrib(u64 n)
2544 {
2545 	u32 contrib = 0;
2546 
2547 	if (likely(n <= LOAD_AVG_PERIOD))
2548 		return runnable_avg_yN_sum[n];
2549 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2550 		return LOAD_AVG_MAX;
2551 
2552 	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2553 	do {
2554 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2555 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2556 
2557 		n -= LOAD_AVG_PERIOD;
2558 	} while (n > LOAD_AVG_PERIOD);
2559 
2560 	contrib = decay_load(contrib, n);
2561 	return contrib + runnable_avg_yN_sum[n];
2562 }
2563 
2564 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2565 #error "load tracking assumes 2^10 as unit"
2566 #endif
2567 
/* Scale @v by capacity factor @s, which is in units of 2^SCHED_CAPACITY_SHIFT. */
#define cap_scale(v, s) (((v) * (s)) >> SCHED_CAPACITY_SHIFT)
2569 
2570 /*
2571  * We can represent the historical contribution to runnable average as the
2572  * coefficients of a geometric series.  To do this we sub-divide our runnable
2573  * history into segments of approximately 1ms (1024us); label the segment that
2574  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2575  *
2576  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2577  *      p0            p1           p2
2578  *     (now)       (~1ms ago)  (~2ms ago)
2579  *
2580  * Let u_i denote the fraction of p_i that the entity was runnable.
2581  *
2582  * We then designate the fractions u_i as our co-efficients, yielding the
2583  * following representation of historical load:
2584  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2585  *
2586  * We choose y based on the with of a reasonably scheduling period, fixing:
2587  *   y^32 = 0.5
2588  *
2589  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2590  * approximately half as much as the contribution to load within the last ms
2591  * (u_0).
2592  *
2593  * When a period "rolls over" and we have new u_0`, multiplying the previous
2594  * sum again by y is sufficient to update:
2595  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2596  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2597  */
2598 static __always_inline int
2599 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2600 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
2601 {
2602 	u64 delta, scaled_delta, periods;
2603 	u32 contrib;
2604 	unsigned int delta_w, scaled_delta_w, decayed = 0;
2605 	unsigned long scale_freq, scale_cpu;
2606 
2607 	delta = now - sa->last_update_time;
2608 	/*
2609 	 * This should only happen when time goes backwards, which it
2610 	 * unfortunately does during sched clock init when we swap over to TSC.
2611 	 */
2612 	if ((s64)delta < 0) {
2613 		sa->last_update_time = now;
2614 		return 0;
2615 	}
2616 
2617 	/*
2618 	 * Use 1024ns as the unit of measurement since it's a reasonable
2619 	 * approximation of 1us and fast to compute.
2620 	 */
2621 	delta >>= 10;
2622 	if (!delta)
2623 		return 0;
2624 	sa->last_update_time = now;
2625 
2626 	scale_freq = arch_scale_freq_capacity(NULL, cpu);
2627 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2628 
2629 	/* delta_w is the amount already accumulated against our next period */
2630 	delta_w = sa->period_contrib;
2631 	if (delta + delta_w >= 1024) {
2632 		decayed = 1;
2633 
2634 		/* how much left for next period will start over, we don't know yet */
2635 		sa->period_contrib = 0;
2636 
2637 		/*
2638 		 * Now that we know we're crossing a period boundary, figure
2639 		 * out how much from delta we need to complete the current
2640 		 * period and accrue it.
2641 		 */
2642 		delta_w = 1024 - delta_w;
2643 		scaled_delta_w = cap_scale(delta_w, scale_freq);
2644 		if (weight) {
2645 			sa->load_sum += weight * scaled_delta_w;
2646 			if (cfs_rq) {
2647 				cfs_rq->runnable_load_sum +=
2648 						weight * scaled_delta_w;
2649 			}
2650 		}
2651 		if (running)
2652 			sa->util_sum += scaled_delta_w * scale_cpu;
2653 
2654 		delta -= delta_w;
2655 
2656 		/* Figure out how many additional periods this update spans */
2657 		periods = delta / 1024;
2658 		delta %= 1024;
2659 
2660 		sa->load_sum = decay_load(sa->load_sum, periods + 1);
2661 		if (cfs_rq) {
2662 			cfs_rq->runnable_load_sum =
2663 				decay_load(cfs_rq->runnable_load_sum, periods + 1);
2664 		}
2665 		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2666 
2667 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2668 		contrib = __compute_runnable_contrib(periods);
2669 		contrib = cap_scale(contrib, scale_freq);
2670 		if (weight) {
2671 			sa->load_sum += weight * contrib;
2672 			if (cfs_rq)
2673 				cfs_rq->runnable_load_sum += weight * contrib;
2674 		}
2675 		if (running)
2676 			sa->util_sum += contrib * scale_cpu;
2677 	}
2678 
2679 	/* Remainder of delta accrued against u_0` */
2680 	scaled_delta = cap_scale(delta, scale_freq);
2681 	if (weight) {
2682 		sa->load_sum += weight * scaled_delta;
2683 		if (cfs_rq)
2684 			cfs_rq->runnable_load_sum += weight * scaled_delta;
2685 	}
2686 	if (running)
2687 		sa->util_sum += scaled_delta * scale_cpu;
2688 
2689 	sa->period_contrib += delta;
2690 
2691 	if (decayed) {
2692 		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2693 		if (cfs_rq) {
2694 			cfs_rq->runnable_load_avg =
2695 				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2696 		}
2697 		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2698 	}
2699 
2700 	return decayed;
2701 }
2702 
2703 #ifdef CONFIG_FAIR_GROUP_SCHED
2704 /*
2705  * Updating tg's load_avg is necessary before update_cfs_share (which is done)
2706  * and effective_load (which is not done because it is too costly).
2707  */
2708 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2709 {
2710 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2711 
2712 	/*
2713 	 * No need to update load_avg for root_task_group as it is not used.
2714 	 */
2715 	if (cfs_rq->tg == &root_task_group)
2716 		return;
2717 
2718 	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2719 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
2720 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2721 	}
2722 }
2723 
2724 /*
2725  * Called within set_task_rq() right before setting a task's cpu. The
2726  * caller only guarantees p->pi_lock is held; no other assumptions,
2727  * including the state of rq->lock, should be made.
2728  */
2729 void set_task_rq_fair(struct sched_entity *se,
2730 		      struct cfs_rq *prev, struct cfs_rq *next)
2731 {
2732 	if (!sched_feat(ATTACH_AGE_LOAD))
2733 		return;
2734 
2735 	/*
2736 	 * We are supposed to update the task to "current" time, then its up to
2737 	 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
2738 	 * getting what current time is, so simply throw away the out-of-date
2739 	 * time. This will result in the wakee task is less decayed, but giving
2740 	 * the wakee more load sounds not bad.
2741 	 */
2742 	if (se->avg.last_update_time && prev) {
2743 		u64 p_last_update_time;
2744 		u64 n_last_update_time;
2745 
2746 #ifndef CONFIG_64BIT
2747 		u64 p_last_update_time_copy;
2748 		u64 n_last_update_time_copy;
2749 
2750 		do {
2751 			p_last_update_time_copy = prev->load_last_update_time_copy;
2752 			n_last_update_time_copy = next->load_last_update_time_copy;
2753 
2754 			smp_rmb();
2755 
2756 			p_last_update_time = prev->avg.last_update_time;
2757 			n_last_update_time = next->avg.last_update_time;
2758 
2759 		} while (p_last_update_time != p_last_update_time_copy ||
2760 			 n_last_update_time != n_last_update_time_copy);
2761 #else
2762 		p_last_update_time = prev->avg.last_update_time;
2763 		n_last_update_time = next->avg.last_update_time;
2764 #endif
2765 		__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2766 				  &se->avg, 0, 0, NULL);
2767 		se->avg.last_update_time = n_last_update_time;
2768 	}
2769 }
2770 #else /* CONFIG_FAIR_GROUP_SCHED */
/* !CONFIG_FAIR_GROUP_SCHED: there is no task group load to propagate. */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}
2772 #endif /* CONFIG_FAIR_GROUP_SCHED */
2773 
2774 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2775 
2776 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2777 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2778 {
2779 	struct sched_avg *sa = &cfs_rq->avg;
2780 	int decayed, removed = 0;
2781 
2782 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2783 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2784 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
2785 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2786 		removed = 1;
2787 	}
2788 
2789 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2790 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2791 		sa->util_avg = max_t(long, sa->util_avg - r, 0);
2792 		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2793 	}
2794 
2795 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2796 		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2797 
2798 #ifndef CONFIG_64BIT
2799 	smp_wmb();
2800 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
2801 #endif
2802 
2803 	return decayed || removed;
2804 }
2805 
2806 /* Update task and its cfs_rq load average */
2807 static inline void update_load_avg(struct sched_entity *se, int update_tg)
2808 {
2809 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2810 	u64 now = cfs_rq_clock_task(cfs_rq);
2811 	int cpu = cpu_of(rq_of(cfs_rq));
2812 
2813 	/*
2814 	 * Track task load average for carrying it to new CPU after migrated, and
2815 	 * track group sched_entity load average for task_h_load calc in migration
2816 	 */
2817 	__update_load_avg(now, cpu, &se->avg,
2818 			  se->on_rq * scale_load_down(se->load.weight),
2819 			  cfs_rq->curr == se, NULL);
2820 
2821 	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2822 		update_tg_load_avg(cfs_rq, 0);
2823 }
2824 
2825 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2826 {
2827 	if (!sched_feat(ATTACH_AGE_LOAD))
2828 		goto skip_aging;
2829 
2830 	/*
2831 	 * If we got migrated (either between CPUs or between cgroups) we'll
2832 	 * have aged the average right before clearing @last_update_time.
2833 	 */
2834 	if (se->avg.last_update_time) {
2835 		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2836 				  &se->avg, 0, 0, NULL);
2837 
2838 		/*
2839 		 * XXX: we could have just aged the entire load away if we've been
2840 		 * absent from the fair class for too long.
2841 		 */
2842 	}
2843 
2844 skip_aging:
2845 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
2846 	cfs_rq->avg.load_avg += se->avg.load_avg;
2847 	cfs_rq->avg.load_sum += se->avg.load_sum;
2848 	cfs_rq->avg.util_avg += se->avg.util_avg;
2849 	cfs_rq->avg.util_sum += se->avg.util_sum;
2850 }
2851 
2852 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2853 {
2854 	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2855 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
2856 			  cfs_rq->curr == se, NULL);
2857 
2858 	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2859 	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2860 	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
2861 	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2862 }
2863 
2864 /* Add the load generated by se into cfs_rq's load average */
2865 static inline void
2866 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2867 {
2868 	struct sched_avg *sa = &se->avg;
2869 	u64 now = cfs_rq_clock_task(cfs_rq);
2870 	int migrated, decayed;
2871 
2872 	migrated = !sa->last_update_time;
2873 	if (!migrated) {
2874 		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2875 			se->on_rq * scale_load_down(se->load.weight),
2876 			cfs_rq->curr == se, NULL);
2877 	}
2878 
2879 	decayed = update_cfs_rq_load_avg(now, cfs_rq);
2880 
2881 	cfs_rq->runnable_load_avg += sa->load_avg;
2882 	cfs_rq->runnable_load_sum += sa->load_sum;
2883 
2884 	if (migrated)
2885 		attach_entity_load_avg(cfs_rq, se);
2886 
2887 	if (decayed || migrated)
2888 		update_tg_load_avg(cfs_rq, 0);
2889 }
2890 
2891 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
2892 static inline void
2893 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2894 {
2895 	update_load_avg(se, 1);
2896 
2897 	cfs_rq->runnable_load_avg =
2898 		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2899 	cfs_rq->runnable_load_sum =
2900 		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2901 }
2902 
2903 #ifndef CONFIG_64BIT
2904 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
2905 {
2906 	u64 last_update_time_copy;
2907 	u64 last_update_time;
2908 
2909 	do {
2910 		last_update_time_copy = cfs_rq->load_last_update_time_copy;
2911 		smp_rmb();
2912 		last_update_time = cfs_rq->avg.last_update_time;
2913 	} while (last_update_time != last_update_time_copy);
2914 
2915 	return last_update_time;
2916 }
2917 #else
2918 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
2919 {
2920 	return cfs_rq->avg.last_update_time;
2921 }
2922 #endif
2923 
2924 /*
2925  * Task first catches up with cfs_rq, and then subtract
2926  * itself from the cfs_rq (task must be off the queue now).
2927  */
2928 void remove_entity_load_avg(struct sched_entity *se)
2929 {
2930 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2931 	u64 last_update_time;
2932 
2933 	/*
2934 	 * Newly created task or never used group entity should not be removed
2935 	 * from its (source) cfs_rq
2936 	 */
2937 	if (se->avg.last_update_time == 0)
2938 		return;
2939 
2940 	last_update_time = cfs_rq_last_update_time(cfs_rq);
2941 
2942 	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2943 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2944 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2945 }
2946 
2947 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
2948 {
2949 	return cfs_rq->runnable_load_avg;
2950 }
2951 
2952 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
2953 {
2954 	return cfs_rq->avg.load_avg;
2955 }
2956 
2957 static int idle_balance(struct rq *this_rq);
2958 
2959 #else /* CONFIG_SMP */
2960 
/*
 * !CONFIG_SMP: the load-average hooks compile to no-ops and idle_balance()
 * always reports 0.
 */
static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
}

static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se)
{
}

static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se)
{
}

static inline void remove_entity_load_avg(struct sched_entity *se)
{
}

static inline void attach_entity_load_avg(struct cfs_rq *cfs_rq,
					  struct sched_entity *se)
{
}

static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq,
					  struct sched_entity *se)
{
}

static inline int idle_balance(struct rq *rq)
{
	return 0;
}
2977 
2978 #endif /* CONFIG_SMP */
2979 
/*
 * With CONFIG_SCHEDSTATS: close out @se's pending sleep and block intervals
 * and fold them into its statistics. Without it this is a no-op.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = entity_is_task(se) ? task_of(se) : NULL;

	if (se->statistics.sleep_start) {
		u64 slept = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;

		/* A negative interval is treated as zero. */
		if ((s64)slept < 0)
			slept = 0;

		if (unlikely(slept > se->statistics.sleep_max))
			se->statistics.sleep_max = slept;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += slept;

		if (tsk) {
			account_scheduler_latency(tsk, slept >> 10, 1);
			trace_sched_stat_sleep(tsk, slept);
		}
	}

	if (se->statistics.block_start) {
		u64 blocked = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;

		if ((s64)blocked < 0)
			blocked = 0;

		if (unlikely(blocked > se->statistics.block_max))
			se->statistics.block_max = blocked;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += blocked;

		/* The remaining accounting applies to tasks only. */
		if (!tsk)
			return;

		if (tsk->in_iowait) {
			se->statistics.iowait_sum += blocked;
			se->statistics.iowait_count++;
			trace_sched_stat_iowait(tsk, blocked);
		}

		trace_sched_stat_blocked(tsk, blocked);

		/*
		 * Blocking time is in units of nanosecs, so shift by
		 * 20 to get a milliseconds-range estimation of the
		 * amount of time that the task spent sleeping:
		 */
		if (unlikely(prof_on == SLEEP_PROFILING))
			profile_hits(SLEEP_PROFILING,
				     (void *)get_wchan(tsk),
				     blocked >> 20);

		account_scheduler_latency(tsk, blocked >> 10, 0);
	}
#endif
}
3041 
/*
 * Debug aid (CONFIG_SCHED_DEBUG only): count entities whose vruntime is
 * more than three scheduling latencies away from the queue's
 * min_vruntime, in either direction.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 lag = se->vruntime - cfs_rq->min_vruntime;
	s64 dist = lag < 0 ? -lag : lag;

	if (dist > 3 * sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
3054 
3055 static void
3056 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3057 {
3058 	u64 vruntime = cfs_rq->min_vruntime;
3059 
3060 	/*
3061 	 * The 'current' period is already promised to the current tasks,
3062 	 * however the extra weight of the new task will slow them down a
3063 	 * little, place the new task so that it fits in the slot that
3064 	 * stays open at the end.
3065 	 */
3066 	if (initial && sched_feat(START_DEBIT))
3067 		vruntime += sched_vslice(cfs_rq, se);
3068 
3069 	/* sleeps up to a single latency don't count. */
3070 	if (!initial) {
3071 		unsigned long thresh = sysctl_sched_latency;
3072 
3073 		/*
3074 		 * Halve their sleep time's effect, to allow
3075 		 * for a gentler effect of sleepers:
3076 		 */
3077 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
3078 			thresh >>= 1;
3079 
3080 		vruntime -= thresh;
3081 	}
3082 
3083 	/* ensure we never gain time by being placed backwards. */
3084 	se->vruntime = max_vruntime(se->vruntime, vruntime);
3085 }
3086 
3087 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3088 
3089 static void
3090 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3091 {
3092 	/*
3093 	 * Update the normalized vruntime before updating min_vruntime
3094 	 * through calling update_curr().
3095 	 */
3096 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3097 		se->vruntime += cfs_rq->min_vruntime;
3098 
3099 	/*
3100 	 * Update run-time statistics of the 'current'.
3101 	 */
3102 	update_curr(cfs_rq);
3103 	enqueue_entity_load_avg(cfs_rq, se);
3104 	account_entity_enqueue(cfs_rq, se);
3105 	update_cfs_shares(cfs_rq);
3106 
3107 	if (flags & ENQUEUE_WAKEUP) {
3108 		place_entity(cfs_rq, se, 0);
3109 		enqueue_sleeper(cfs_rq, se);
3110 	}
3111 
3112 	update_stats_enqueue(cfs_rq, se);
3113 	check_spread(cfs_rq, se);
3114 	if (se != cfs_rq->curr)
3115 		__enqueue_entity(cfs_rq, se);
3116 	se->on_rq = 1;
3117 
3118 	if (cfs_rq->nr_running == 1) {
3119 		list_add_leaf_cfs_rq(cfs_rq);
3120 		check_enqueue_throttle(cfs_rq);
3121 	}
3122 }
3123 
3124 static void __clear_buddies_last(struct sched_entity *se)
3125 {
3126 	for_each_sched_entity(se) {
3127 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3128 		if (cfs_rq->last != se)
3129 			break;
3130 
3131 		cfs_rq->last = NULL;
3132 	}
3133 }
3134 
3135 static void __clear_buddies_next(struct sched_entity *se)
3136 {
3137 	for_each_sched_entity(se) {
3138 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3139 		if (cfs_rq->next != se)
3140 			break;
3141 
3142 		cfs_rq->next = NULL;
3143 	}
3144 }
3145 
3146 static void __clear_buddies_skip(struct sched_entity *se)
3147 {
3148 	for_each_sched_entity(se) {
3149 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3150 		if (cfs_rq->skip != se)
3151 			break;
3152 
3153 		cfs_rq->skip = NULL;
3154 	}
3155 }
3156 
3157 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3158 {
3159 	if (cfs_rq->last == se)
3160 		__clear_buddies_last(se);
3161 
3162 	if (cfs_rq->next == se)
3163 		__clear_buddies_next(se);
3164 
3165 	if (cfs_rq->skip == se)
3166 		__clear_buddies_skip(se);
3167 }
3168 
3169 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3170 
3171 static void
3172 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3173 {
3174 	/*
3175 	 * Update run-time statistics of the 'current'.
3176 	 */
3177 	update_curr(cfs_rq);
3178 	dequeue_entity_load_avg(cfs_rq, se);
3179 
3180 	update_stats_dequeue(cfs_rq, se);
3181 	if (flags & DEQUEUE_SLEEP) {
3182 #ifdef CONFIG_SCHEDSTATS
3183 		if (entity_is_task(se)) {
3184 			struct task_struct *tsk = task_of(se);
3185 
3186 			if (tsk->state & TASK_INTERRUPTIBLE)
3187 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3188 			if (tsk->state & TASK_UNINTERRUPTIBLE)
3189 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3190 		}
3191 #endif
3192 	}
3193 
3194 	clear_buddies(cfs_rq, se);
3195 
3196 	if (se != cfs_rq->curr)
3197 		__dequeue_entity(cfs_rq, se);
3198 	se->on_rq = 0;
3199 	account_entity_dequeue(cfs_rq, se);
3200 
3201 	/*
3202 	 * Normalize the entity after updating the min_vruntime because the
3203 	 * update can refer to the ->curr item and we need to reflect this
3204 	 * movement in our normalized position.
3205 	 */
3206 	if (!(flags & DEQUEUE_SLEEP))
3207 		se->vruntime -= cfs_rq->min_vruntime;
3208 
3209 	/* return excess runtime on last dequeue */
3210 	return_cfs_rq_runtime(cfs_rq);
3211 
3212 	update_min_vruntime(cfs_rq);
3213 	update_cfs_shares(cfs_rq);
3214 }
3215 
3216 /*
3217  * Preempt the current task with a newly woken task if needed:
3218  */
3219 static void
3220 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3221 {
3222 	unsigned long ideal_runtime, delta_exec;
3223 	struct sched_entity *se;
3224 	s64 delta;
3225 
3226 	ideal_runtime = sched_slice(cfs_rq, curr);
3227 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3228 	if (delta_exec > ideal_runtime) {
3229 		resched_curr(rq_of(cfs_rq));
3230 		/*
3231 		 * The current task ran long enough, ensure it doesn't get
3232 		 * re-elected due to buddy favours.
3233 		 */
3234 		clear_buddies(cfs_rq, curr);
3235 		return;
3236 	}
3237 
3238 	/*
3239 	 * Ensure that a task that missed wakeup preemption by a
3240 	 * narrow margin doesn't have to wait for a full slice.
3241 	 * This also mitigates buddy induced latencies under load.
3242 	 */
3243 	if (delta_exec < sysctl_sched_min_granularity)
3244 		return;
3245 
3246 	se = __pick_first_entity(cfs_rq);
3247 	delta = curr->vruntime - se->vruntime;
3248 
3249 	if (delta < 0)
3250 		return;
3251 
3252 	if (delta > ideal_runtime)
3253 		resched_curr(rq_of(cfs_rq));
3254 }
3255 
3256 static void
3257 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3258 {
3259 	/* 'current' is not kept within the tree. */
3260 	if (se->on_rq) {
3261 		/*
3262 		 * Any task has to be enqueued before it get to execute on
3263 		 * a CPU. So account for the time it spent waiting on the
3264 		 * runqueue.
3265 		 */
3266 		update_stats_wait_end(cfs_rq, se);
3267 		__dequeue_entity(cfs_rq, se);
3268 		update_load_avg(se, 1);
3269 	}
3270 
3271 	update_stats_curr_start(cfs_rq, se);
3272 	cfs_rq->curr = se;
3273 #ifdef CONFIG_SCHEDSTATS
3274 	/*
3275 	 * Track our maximum slice length, if the CPU's load is at
3276 	 * least twice that of our own weight (i.e. dont track it
3277 	 * when there are only lesser-weight tasks around):
3278 	 */
3279 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3280 		se->statistics.slice_max = max(se->statistics.slice_max,
3281 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
3282 	}
3283 #endif
3284 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
3285 }
3286 
3287 static int
3288 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3289 
3290 /*
3291  * Pick the next process, keeping these things in mind, in this order:
3292  * 1) keep things fair between processes/task groups
3293  * 2) pick the "next" process, since someone really wants that to run
3294  * 3) pick the "last" process, for cache locality
3295  * 4) do not run the "skip" process, if something else is available
3296  */
3297 static struct sched_entity *
3298 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3299 {
3300 	struct sched_entity *left = __pick_first_entity(cfs_rq);
3301 	struct sched_entity *se;
3302 
3303 	/*
3304 	 * If curr is set we have to see if its left of the leftmost entity
3305 	 * still in the tree, provided there was anything in the tree at all.
3306 	 */
3307 	if (!left || (curr && entity_before(curr, left)))
3308 		left = curr;
3309 
3310 	se = left; /* ideally we run the leftmost entity */
3311 
3312 	/*
3313 	 * Avoid running the skip buddy, if running something else can
3314 	 * be done without getting too unfair.
3315 	 */
3316 	if (cfs_rq->skip == se) {
3317 		struct sched_entity *second;
3318 
3319 		if (se == curr) {
3320 			second = __pick_first_entity(cfs_rq);
3321 		} else {
3322 			second = __pick_next_entity(se);
3323 			if (!second || (curr && entity_before(curr, second)))
3324 				second = curr;
3325 		}
3326 
3327 		if (second && wakeup_preempt_entity(second, left) < 1)
3328 			se = second;
3329 	}
3330 
3331 	/*
3332 	 * Prefer last buddy, try to return the CPU to a preempted task.
3333 	 */
3334 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3335 		se = cfs_rq->last;
3336 
3337 	/*
3338 	 * Someone really wants this to run. If it's not unfair, run it.
3339 	 */
3340 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3341 		se = cfs_rq->next;
3342 
3343 	clear_buddies(cfs_rq, se);
3344 
3345 	return se;
3346 }
3347 
3348 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3349 
3350 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3351 {
3352 	/*
3353 	 * If still on the runqueue then deactivate_task()
3354 	 * was not called and update_curr() has to be done:
3355 	 */
3356 	if (prev->on_rq)
3357 		update_curr(cfs_rq);
3358 
3359 	/* throttle cfs_rqs exceeding runtime */
3360 	check_cfs_rq_runtime(cfs_rq);
3361 
3362 	check_spread(cfs_rq, prev);
3363 	if (prev->on_rq) {
3364 		update_stats_wait_start(cfs_rq, prev);
3365 		/* Put 'current' back into the tree. */
3366 		__enqueue_entity(cfs_rq, prev);
3367 		/* in !on_rq case, update occurred at dequeue */
3368 		update_load_avg(prev, 0);
3369 	}
3370 	cfs_rq->curr = NULL;
3371 }
3372 
3373 static void
3374 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3375 {
3376 	/*
3377 	 * Update run-time statistics of the 'current'.
3378 	 */
3379 	update_curr(cfs_rq);
3380 
3381 	/*
3382 	 * Ensure that runnable average is periodically updated.
3383 	 */
3384 	update_load_avg(curr, 1);
3385 	update_cfs_shares(cfs_rq);
3386 
3387 #ifdef CONFIG_SCHED_HRTICK
3388 	/*
3389 	 * queued ticks are scheduled to match the slice, so don't bother
3390 	 * validating it and just reschedule.
3391 	 */
3392 	if (queued) {
3393 		resched_curr(rq_of(cfs_rq));
3394 		return;
3395 	}
3396 	/*
3397 	 * don't let the period tick interfere with the hrtick preemption
3398 	 */
3399 	if (!sched_feat(DOUBLE_TICK) &&
3400 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3401 		return;
3402 #endif
3403 
3404 	if (cfs_rq->nr_running > 1)
3405 		check_preempt_tick(cfs_rq, curr);
3406 }
3407 
3408 
3409 /**************************************************
3410  * CFS bandwidth control machinery
3411  */
3412 
3413 #ifdef CONFIG_CFS_BANDWIDTH
3414 
3415 #ifdef HAVE_JUMP_LABEL
3416 static struct static_key __cfs_bandwidth_used;
3417 
3418 static inline bool cfs_bandwidth_used(void)
3419 {
3420 	return static_key_false(&__cfs_bandwidth_used);
3421 }
3422 
3423 void cfs_bandwidth_usage_inc(void)
3424 {
3425 	static_key_slow_inc(&__cfs_bandwidth_used);
3426 }
3427 
3428 void cfs_bandwidth_usage_dec(void)
3429 {
3430 	static_key_slow_dec(&__cfs_bandwidth_used);
3431 }
3432 #else /* HAVE_JUMP_LABEL */
3433 static bool cfs_bandwidth_used(void)
3434 {
3435 	return true;
3436 }
3437 
/* No static key to maintain without jump labels: both are no-ops. */
void cfs_bandwidth_usage_inc(void)
{
}

void cfs_bandwidth_usage_dec(void)
{
}
3440 #endif /* HAVE_JUMP_LABEL */
3441 
3442 /*
3443  * default period for cfs group bandwidth.
3444  * default: 0.1s, units: nanoseconds
3445  */
3446 static inline u64 default_cfs_period(void)
3447 {
3448 	return 100000000ULL;
3449 }
3450 
3451 static inline u64 sched_cfs_bandwidth_slice(void)
3452 {
3453 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3454 }
3455 
3456 /*
3457  * Replenish runtime according to assigned quota and update expiration time.
3458  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3459  * additional synchronization around rq->lock.
3460  *
3461  * requires cfs_b->lock
3462  */
3463 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3464 {
3465 	u64 now;
3466 
3467 	if (cfs_b->quota == RUNTIME_INF)
3468 		return;
3469 
3470 	now = sched_clock_cpu(smp_processor_id());
3471 	cfs_b->runtime = cfs_b->quota;
3472 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3473 }
3474 
3475 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3476 {
3477 	return &tg->cfs_bandwidth;
3478 }
3479 
3480 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3481 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3482 {
3483 	if (unlikely(cfs_rq->throttle_count))
3484 		return cfs_rq->throttled_clock_task;
3485 
3486 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3487 }
3488 
3489 /* returns 0 on failure to allocate runtime */
3490 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3491 {
3492 	struct task_group *tg = cfs_rq->tg;
3493 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3494 	u64 amount = 0, min_amount, expires;
3495 
3496 	/* note: this is a positive sum as runtime_remaining <= 0 */
3497 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3498 
3499 	raw_spin_lock(&cfs_b->lock);
3500 	if (cfs_b->quota == RUNTIME_INF)
3501 		amount = min_amount;
3502 	else {
3503 		start_cfs_bandwidth(cfs_b);
3504 
3505 		if (cfs_b->runtime > 0) {
3506 			amount = min(cfs_b->runtime, min_amount);
3507 			cfs_b->runtime -= amount;
3508 			cfs_b->idle = 0;
3509 		}
3510 	}
3511 	expires = cfs_b->runtime_expires;
3512 	raw_spin_unlock(&cfs_b->lock);
3513 
3514 	cfs_rq->runtime_remaining += amount;
3515 	/*
3516 	 * we may have advanced our local expiration to account for allowed
3517 	 * spread between our sched_clock and the one on which runtime was
3518 	 * issued.
3519 	 */
3520 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3521 		cfs_rq->runtime_expires = expires;
3522 
3523 	return cfs_rq->runtime_remaining > 0;
3524 }
3525 
3526 /*
3527  * Note: This depends on the synchronization provided by sched_clock and the
3528  * fact that rq->clock snapshots this value.
3529  */
3530 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3531 {
3532 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3533 
3534 	/* if the deadline is ahead of our clock, nothing to do */
3535 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3536 		return;
3537 
3538 	if (cfs_rq->runtime_remaining < 0)
3539 		return;
3540 
3541 	/*
3542 	 * If the local deadline has passed we have to consider the
3543 	 * possibility that our sched_clock is 'fast' and the global deadline
3544 	 * has not truly expired.
3545 	 *
3546 	 * Fortunately we can check determine whether this the case by checking
3547 	 * whether the global deadline has advanced. It is valid to compare
3548 	 * cfs_b->runtime_expires without any locks since we only care about
3549 	 * exact equality, so a partial write will still work.
3550 	 */
3551 
3552 	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3553 		/* extend local deadline, drift is bounded above by 2 ticks */
3554 		cfs_rq->runtime_expires += TICK_NSEC;
3555 	} else {
3556 		/* global deadline is ahead, expiration has passed */
3557 		cfs_rq->runtime_remaining = 0;
3558 	}
3559 }
3560 
3561 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3562 {
3563 	/* dock delta_exec before expiring quota (as it could span periods) */
3564 	cfs_rq->runtime_remaining -= delta_exec;
3565 	expire_cfs_rq_runtime(cfs_rq);
3566 
3567 	if (likely(cfs_rq->runtime_remaining > 0))
3568 		return;
3569 
3570 	/*
3571 	 * if we're unable to extend our runtime we resched so that the active
3572 	 * hierarchy can be throttled
3573 	 */
3574 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3575 		resched_curr(rq_of(cfs_rq));
3576 }
3577 
3578 static __always_inline
3579 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3580 {
3581 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3582 		return;
3583 
3584 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3585 }
3586 
3587 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3588 {
3589 	return cfs_bandwidth_used() && cfs_rq->throttled;
3590 }
3591 
3592 /* check whether cfs_rq, or any parent, is throttled */
3593 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3594 {
3595 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3596 }
3597 
3598 /*
3599  * Ensure that neither of the group entities corresponding to src_cpu or
3600  * dest_cpu are members of a throttled hierarchy when performing group
3601  * load-balance operations.
3602  */
3603 static inline int throttled_lb_pair(struct task_group *tg,
3604 				    int src_cpu, int dest_cpu)
3605 {
3606 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3607 
3608 	src_cfs_rq = tg->cfs_rq[src_cpu];
3609 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3610 
3611 	return throttled_hierarchy(src_cfs_rq) ||
3612 	       throttled_hierarchy(dest_cfs_rq);
3613 }
3614 
3615 /* updated child weight may affect parent so we have to do this bottom up */
3616 static int tg_unthrottle_up(struct task_group *tg, void *data)
3617 {
3618 	struct rq *rq = data;
3619 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3620 
3621 	cfs_rq->throttle_count--;
3622 #ifdef CONFIG_SMP
3623 	if (!cfs_rq->throttle_count) {
3624 		/* adjust cfs_rq_clock_task() */
3625 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3626 					     cfs_rq->throttled_clock_task;
3627 	}
3628 #endif
3629 
3630 	return 0;
3631 }
3632 
3633 static int tg_throttle_down(struct task_group *tg, void *data)
3634 {
3635 	struct rq *rq = data;
3636 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3637 
3638 	/* group is entering throttled state, stop time */
3639 	if (!cfs_rq->throttle_count)
3640 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
3641 	cfs_rq->throttle_count++;
3642 
3643 	return 0;
3644 }
3645 
3646 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3647 {
3648 	struct rq *rq = rq_of(cfs_rq);
3649 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3650 	struct sched_entity *se;
3651 	long task_delta, dequeue = 1;
3652 	bool empty;
3653 
3654 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3655 
3656 	/* freeze hierarchy runnable averages while throttled */
3657 	rcu_read_lock();
3658 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3659 	rcu_read_unlock();
3660 
3661 	task_delta = cfs_rq->h_nr_running;
3662 	for_each_sched_entity(se) {
3663 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3664 		/* throttled entity or throttle-on-deactivate */
3665 		if (!se->on_rq)
3666 			break;
3667 
3668 		if (dequeue)
3669 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3670 		qcfs_rq->h_nr_running -= task_delta;
3671 
3672 		if (qcfs_rq->load.weight)
3673 			dequeue = 0;
3674 	}
3675 
3676 	if (!se)
3677 		sub_nr_running(rq, task_delta);
3678 
3679 	cfs_rq->throttled = 1;
3680 	cfs_rq->throttled_clock = rq_clock(rq);
3681 	raw_spin_lock(&cfs_b->lock);
3682 	empty = list_empty(&cfs_b->throttled_cfs_rq);
3683 
3684 	/*
3685 	 * Add to the _head_ of the list, so that an already-started
3686 	 * distribute_cfs_runtime will not see us
3687 	 */
3688 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3689 
3690 	/*
3691 	 * If we're the first throttled task, make sure the bandwidth
3692 	 * timer is running.
3693 	 */
3694 	if (empty)
3695 		start_cfs_bandwidth(cfs_b);
3696 
3697 	raw_spin_unlock(&cfs_b->lock);
3698 }
3699 
3700 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3701 {
3702 	struct rq *rq = rq_of(cfs_rq);
3703 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3704 	struct sched_entity *se;
3705 	int enqueue = 1;
3706 	long task_delta;
3707 
3708 	se = cfs_rq->tg->se[cpu_of(rq)];
3709 
3710 	cfs_rq->throttled = 0;
3711 
3712 	update_rq_clock(rq);
3713 
3714 	raw_spin_lock(&cfs_b->lock);
3715 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3716 	list_del_rcu(&cfs_rq->throttled_list);
3717 	raw_spin_unlock(&cfs_b->lock);
3718 
3719 	/* update hierarchical throttle state */
3720 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3721 
3722 	if (!cfs_rq->load.weight)
3723 		return;
3724 
3725 	task_delta = cfs_rq->h_nr_running;
3726 	for_each_sched_entity(se) {
3727 		if (se->on_rq)
3728 			enqueue = 0;
3729 
3730 		cfs_rq = cfs_rq_of(se);
3731 		if (enqueue)
3732 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3733 		cfs_rq->h_nr_running += task_delta;
3734 
3735 		if (cfs_rq_throttled(cfs_rq))
3736 			break;
3737 	}
3738 
3739 	if (!se)
3740 		add_nr_running(rq, task_delta);
3741 
3742 	/* determine whether we need to wake up potentially idle cpu */
3743 	if (rq->curr == rq->idle && rq->cfs.nr_running)
3744 		resched_curr(rq);
3745 }
3746 
3747 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3748 		u64 remaining, u64 expires)
3749 {
3750 	struct cfs_rq *cfs_rq;
3751 	u64 runtime;
3752 	u64 starting_runtime = remaining;
3753 
3754 	rcu_read_lock();
3755 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3756 				throttled_list) {
3757 		struct rq *rq = rq_of(cfs_rq);
3758 
3759 		raw_spin_lock(&rq->lock);
3760 		if (!cfs_rq_throttled(cfs_rq))
3761 			goto next;
3762 
3763 		runtime = -cfs_rq->runtime_remaining + 1;
3764 		if (runtime > remaining)
3765 			runtime = remaining;
3766 		remaining -= runtime;
3767 
3768 		cfs_rq->runtime_remaining += runtime;
3769 		cfs_rq->runtime_expires = expires;
3770 
3771 		/* we check whether we're throttled above */
3772 		if (cfs_rq->runtime_remaining > 0)
3773 			unthrottle_cfs_rq(cfs_rq);
3774 
3775 next:
3776 		raw_spin_unlock(&rq->lock);
3777 
3778 		if (!remaining)
3779 			break;
3780 	}
3781 	rcu_read_unlock();
3782 
3783 	return starting_runtime - remaining;
3784 }
3785 
3786 /*
3787  * Responsible for refilling a task_group's bandwidth and unthrottling its
3788  * cfs_rqs as appropriate. If there has been no activity within the last
3789  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3790  * used to track this state.
3791  */
3792 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3793 {
3794 	u64 runtime, runtime_expires;
3795 	int throttled;
3796 
3797 	/* no need to continue the timer with no bandwidth constraint */
3798 	if (cfs_b->quota == RUNTIME_INF)
3799 		goto out_deactivate;
3800 
3801 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3802 	cfs_b->nr_periods += overrun;
3803 
3804 	/*
3805 	 * idle depends on !throttled (for the case of a large deficit), and if
3806 	 * we're going inactive then everything else can be deferred
3807 	 */
3808 	if (cfs_b->idle && !throttled)
3809 		goto out_deactivate;
3810 
3811 	__refill_cfs_bandwidth_runtime(cfs_b);
3812 
3813 	if (!throttled) {
3814 		/* mark as potentially idle for the upcoming period */
3815 		cfs_b->idle = 1;
3816 		return 0;
3817 	}
3818 
3819 	/* account preceding periods in which throttling occurred */
3820 	cfs_b->nr_throttled += overrun;
3821 
3822 	runtime_expires = cfs_b->runtime_expires;
3823 
3824 	/*
3825 	 * This check is repeated as we are holding onto the new bandwidth while
3826 	 * we unthrottle. This can potentially race with an unthrottled group
3827 	 * trying to acquire new bandwidth from the global pool. This can result
3828 	 * in us over-using our runtime if it is all used during this loop, but
3829 	 * only by limited amounts in that extreme case.
3830 	 */
3831 	while (throttled && cfs_b->runtime > 0) {
3832 		runtime = cfs_b->runtime;
3833 		raw_spin_unlock(&cfs_b->lock);
3834 		/* we can't nest cfs_b->lock while distributing bandwidth */
3835 		runtime = distribute_cfs_runtime(cfs_b, runtime,
3836 						 runtime_expires);
3837 		raw_spin_lock(&cfs_b->lock);
3838 
3839 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3840 
3841 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
3842 	}
3843 
3844 	/*
3845 	 * While we are ensured activity in the period following an
3846 	 * unthrottle, this also covers the case in which the new bandwidth is
3847 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
3848 	 * timer to remain active while there are any throttled entities.)
3849 	 */
3850 	cfs_b->idle = 0;
3851 
3852 	return 0;
3853 
3854 out_deactivate:
3855 	return 1;
3856 }
3857 
3858 /* a cfs_rq won't donate quota below this amount */
3859 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3860 /* minimum remaining period time to redistribute slack quota */
3861 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3862 /* how long we wait to gather additional slack before distributing */
3863 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3864 
3865 /*
3866  * Are we near the end of the current quota period?
3867  *
3868  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3869  * hrtimer base being cleared by hrtimer_start. In the case of
3870  * migrate_hrtimers, base is never cleared, so we are fine.
3871  */
3872 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3873 {
3874 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
3875 	u64 remaining;
3876 
3877 	/* if the call-back is running a quota refresh is already occurring */
3878 	if (hrtimer_callback_running(refresh_timer))
3879 		return 1;
3880 
3881 	/* is a quota refresh about to occur? */
3882 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3883 	if (remaining < min_expire)
3884 		return 1;
3885 
3886 	return 0;
3887 }
3888 
3889 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3890 {
3891 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3892 
3893 	/* if there's a quota refresh soon don't bother with slack */
3894 	if (runtime_refresh_within(cfs_b, min_left))
3895 		return;
3896 
3897 	hrtimer_start(&cfs_b->slack_timer,
3898 			ns_to_ktime(cfs_bandwidth_slack_period),
3899 			HRTIMER_MODE_REL);
3900 }
3901 
3902 /* we know any runtime found here is valid as update_curr() precedes return */
3903 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3904 {
3905 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3906 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3907 
3908 	if (slack_runtime <= 0)
3909 		return;
3910 
3911 	raw_spin_lock(&cfs_b->lock);
3912 	if (cfs_b->quota != RUNTIME_INF &&
3913 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3914 		cfs_b->runtime += slack_runtime;
3915 
3916 		/* we are under rq->lock, defer unthrottling using a timer */
3917 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3918 		    !list_empty(&cfs_b->throttled_cfs_rq))
3919 			start_cfs_slack_bandwidth(cfs_b);
3920 	}
3921 	raw_spin_unlock(&cfs_b->lock);
3922 
3923 	/* even if it's not valid for return we don't want to try again */
3924 	cfs_rq->runtime_remaining -= slack_runtime;
3925 }
3926 
3927 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3928 {
3929 	if (!cfs_bandwidth_used())
3930 		return;
3931 
3932 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3933 		return;
3934 
3935 	__return_cfs_rq_runtime(cfs_rq);
3936 }
3937 
3938 /*
3939  * This is done with a timer (instead of inline with bandwidth return) since
3940  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3941  */
3942 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3943 {
3944 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3945 	u64 expires;
3946 
3947 	/* confirm we're still not at a refresh boundary */
3948 	raw_spin_lock(&cfs_b->lock);
3949 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3950 		raw_spin_unlock(&cfs_b->lock);
3951 		return;
3952 	}
3953 
3954 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3955 		runtime = cfs_b->runtime;
3956 
3957 	expires = cfs_b->runtime_expires;
3958 	raw_spin_unlock(&cfs_b->lock);
3959 
3960 	if (!runtime)
3961 		return;
3962 
3963 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3964 
3965 	raw_spin_lock(&cfs_b->lock);
3966 	if (expires == cfs_b->runtime_expires)
3967 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
3968 	raw_spin_unlock(&cfs_b->lock);
3969 }
3970 
3971 /*
3972  * When a group wakes up we want to make sure that its quota is not already
3973  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3974  * runtime as update_curr() throttling can not not trigger until it's on-rq.
3975  */
3976 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3977 {
3978 	if (!cfs_bandwidth_used())
3979 		return;
3980 
3981 	/* an active group must be handled by the update_curr()->put() path */
3982 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3983 		return;
3984 
3985 	/* ensure the group is not already throttled */
3986 	if (cfs_rq_throttled(cfs_rq))
3987 		return;
3988 
3989 	/* update runtime allocation */
3990 	account_cfs_rq_runtime(cfs_rq, 0);
3991 	if (cfs_rq->runtime_remaining <= 0)
3992 		throttle_cfs_rq(cfs_rq);
3993 }
3994 
3995 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3996 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3997 {
3998 	if (!cfs_bandwidth_used())
3999 		return false;
4000 
4001 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4002 		return false;
4003 
4004 	/*
4005 	 * it's possible for a throttled entity to be forced into a running
4006 	 * state (e.g. set_curr_task), in this case we're finished.
4007 	 */
4008 	if (cfs_rq_throttled(cfs_rq))
4009 		return true;
4010 
4011 	throttle_cfs_rq(cfs_rq);
4012 	return true;
4013 }
4014 
4015 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4016 {
4017 	struct cfs_bandwidth *cfs_b =
4018 		container_of(timer, struct cfs_bandwidth, slack_timer);
4019 
4020 	do_sched_cfs_slack_timer(cfs_b);
4021 
4022 	return HRTIMER_NORESTART;
4023 }
4024 
4025 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4026 {
4027 	struct cfs_bandwidth *cfs_b =
4028 		container_of(timer, struct cfs_bandwidth, period_timer);
4029 	int overrun;
4030 	int idle = 0;
4031 
4032 	raw_spin_lock(&cfs_b->lock);
4033 	for (;;) {
4034 		overrun = hrtimer_forward_now(timer, cfs_b->period);
4035 		if (!overrun)
4036 			break;
4037 
4038 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
4039 	}
4040 	if (idle)
4041 		cfs_b->period_active = 0;
4042 	raw_spin_unlock(&cfs_b->lock);
4043 
4044 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4045 }
4046 
4047 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4048 {
4049 	raw_spin_lock_init(&cfs_b->lock);
4050 	cfs_b->runtime = 0;
4051 	cfs_b->quota = RUNTIME_INF;
4052 	cfs_b->period = ns_to_ktime(default_cfs_period());
4053 
4054 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4055 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4056 	cfs_b->period_timer.function = sched_cfs_period_timer;
4057 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4058 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
4059 }
4060 
4061 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4062 {
4063 	cfs_rq->runtime_enabled = 0;
4064 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
4065 }
4066 
4067 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4068 {
4069 	lockdep_assert_held(&cfs_b->lock);
4070 
4071 	if (!cfs_b->period_active) {
4072 		cfs_b->period_active = 1;
4073 		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4074 		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4075 	}
4076 }
4077 
4078 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4079 {
4080 	/* init_cfs_bandwidth() was not called */
4081 	if (!cfs_b->throttled_cfs_rq.next)
4082 		return;
4083 
4084 	hrtimer_cancel(&cfs_b->period_timer);
4085 	hrtimer_cancel(&cfs_b->slack_timer);
4086 }
4087 
4088 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4089 {
4090 	struct cfs_rq *cfs_rq;
4091 
4092 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4093 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4094 
4095 		raw_spin_lock(&cfs_b->lock);
4096 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4097 		raw_spin_unlock(&cfs_b->lock);
4098 	}
4099 }
4100 
4101 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4102 {
4103 	struct cfs_rq *cfs_rq;
4104 
4105 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4106 		if (!cfs_rq->runtime_enabled)
4107 			continue;
4108 
4109 		/*
4110 		 * clock_task is not advancing so we just need to make sure
4111 		 * there's some valid quota amount
4112 		 */
4113 		cfs_rq->runtime_remaining = 1;
4114 		/*
4115 		 * Offline rq is schedulable till cpu is completely disabled
4116 		 * in take_cpu_down(), so we prevent new cfs throttling here.
4117 		 */
4118 		cfs_rq->runtime_enabled = 0;
4119 
4120 		if (cfs_rq_throttled(cfs_rq))
4121 			unthrottle_cfs_rq(cfs_rq);
4122 	}
4123 }
4124 
4125 #else /* CONFIG_CFS_BANDWIDTH */
4126 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4127 {
4128 	return rq_clock_task(rq_of(cfs_rq));
4129 }
4130 
/*
 * !CONFIG_CFS_BANDWIDTH stubs: runtime accounting, throttle checks and
 * runtime return are all no-ops; check_cfs_rq_runtime() never reports a
 * throttle.
 */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4135 
/* !CONFIG_CFS_BANDWIDTH stub: a cfs_rq is never reported as throttled. */
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return 0;
}
4140 
/* !CONFIG_CFS_BANDWIDTH stub: always returns 0. */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return 0;
}
4145 
/* !CONFIG_CFS_BANDWIDTH stub: always returns 0 for any src/dest cpu pair. */
static inline int throttled_lb_pair(struct task_group *tg,
				    int src_cpu, int dest_cpu)
{
	return 0;
}
4151 
/* !CONFIG_CFS_BANDWIDTH stub: there is no bandwidth state to initialise. */
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4153 
#ifdef CONFIG_FAIR_GROUP_SCHED
/* !CONFIG_CFS_BANDWIDTH stub, only provided when group scheduling is built in. */
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
4157 
/* !CONFIG_CFS_BANDWIDTH stub: task groups carry no cfs_bandwidth, so return NULL. */
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
/* !CONFIG_CFS_BANDWIDTH stubs: the remaining bandwidth hooks are no-ops. */
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4165 
4166 #endif /* CONFIG_CFS_BANDWIDTH */
4167 
4168 /**************************************************
4169  * CFS operations on tasks:
4170  */
4171 
4172 #ifdef CONFIG_SCHED_HRTICK
4173 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4174 {
4175 	struct sched_entity *se = &p->se;
4176 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4177 
4178 	WARN_ON(task_rq(p) != rq);
4179 
4180 	if (cfs_rq->nr_running > 1) {
4181 		u64 slice = sched_slice(cfs_rq, se);
4182 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4183 		s64 delta = slice - ran;
4184 
4185 		if (delta < 0) {
4186 			if (rq->curr == p)
4187 				resched_curr(rq);
4188 			return;
4189 		}
4190 		hrtick_start(rq, delta);
4191 	}
4192 }
4193 
4194 /*
4195  * called from enqueue/dequeue and updates the hrtick when the
4196  * current task is from our class and nr_running is low enough
4197  * to matter.
4198  */
4199 static void hrtick_update(struct rq *rq)
4200 {
4201 	struct task_struct *curr = rq->curr;
4202 
4203 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4204 		return;
4205 
4206 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4207 		hrtick_start_fair(rq, curr);
4208 }
4209 #else /* !CONFIG_SCHED_HRTICK */
/* !CONFIG_SCHED_HRTICK stub: no high-resolution tick to arm. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
4214 
/* !CONFIG_SCHED_HRTICK stub: nothing to update. */
static inline void hrtick_update(struct rq *rq)
{
}
4218 #endif
4219 
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 *
 * @rq:    runqueue the task is added to
 * @p:     task being enqueued
 * @flags: enqueue flags handed to enqueue_entity() for the task's own entity;
 *         every further level walked is enqueued with ENQUEUE_WAKEUP
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	/*
	 * First pass: starting at the task's entity, enqueue each entity
	 * that is not yet on its cfs_rq and count it in h_nr_running.
	 */
	for_each_sched_entity(se) {
		/* already queued at this level; the second pass continues here */
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running increment below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running++;

		/* levels above the first are enqueued as wakeups */
		flags = ENQUEUE_WAKEUP;
	}

	/*
	 * Second pass: continue from wherever the first pass stopped. Each
	 * remaining level gets its h_nr_running bumped; load and shares are
	 * refreshed unless the level is throttled, which ends the walk.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running++;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_load_avg(se, 1);
		update_cfs_shares(cfs_rq);
	}

	/*
	 * se is NULL only if the walk reached the top without hitting a
	 * throttled cfs_rq; only then is the task counted on the rq.
	 */
	if (!se)
		add_nr_running(rq, 1);

	hrtick_update(rq);
}
4266 
/* forward declaration; used by dequeue_task_fair() below */
static void set_next_buddy(struct sched_entity *se);

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 *
 * @rq:    runqueue the task is removed from
 * @p:     task being dequeued
 * @flags: dequeue flags handed to dequeue_entity(); DEQUEUE_SLEEP is added
 *         for every level above the task's own entity
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	/* remember whether the task itself is going to sleep */
	int task_sleep = flags & DEQUEUE_SLEEP;

	/*
	 * First pass: dequeue the task's entity and then each parent entity
	 * whose cfs_rq has become empty (zero load weight) as a result.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running--;

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && parent_entity(se))
				set_next_buddy(parent_entity(se));

			/* avoid re-evaluating load for this entity */
			se = parent_entity(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	/*
	 * Second pass: continue from wherever the first pass stopped. Each
	 * remaining level gets its h_nr_running dropped; load and shares are
	 * refreshed unless the level is throttled, which ends the walk.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running--;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_load_avg(se, 1);
		update_cfs_shares(cfs_rq);
	}

	/*
	 * se is NULL only if the walk reached the top without hitting a
	 * throttled cfs_rq; only then is the task removed from the rq count.
	 */
	if (!se)
		sub_nr_running(rq, 1);

	hrtick_update(rq);
}
4326 
4327 #ifdef CONFIG_SMP
4328 
/*
 * per rq 'load' array crap; XXX kill this.
 */
4332 
/*
 * The exact cpuload calculated at every tick would be:
 *
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
 *
 * If a cpu misses updates for n ticks (as it was idle) and update gets
 * called on the n+1-th tick when cpu may be busy, then we have:
 *
 *   load_n   = (1 - 1/2^i)^n * load_0
 *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
 *
 * decay_load_missed() below does efficient calculation of
 *
 *   load' = (1 - 1/2^i)^n * load
 *
 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
 * This allows us to precompute the above in said factors, thereby allowing the
 * reduction of an arbitrary n in O(log_2 n) steps. (See also
 * fixed_power_int())
 *
 * The calculation is approximated on a 128 point scale.
 */
/* factors below are fixed point with 1.0 == 1 << DEGRADE_SHIFT (128) */
#define DEGRADE_SHIFT		7

/*
 * degrade_zero_ticks[idx]: number of missed ticks at or beyond which
 * decay_load_missed() treats the load at index idx as fully decayed (0).
 */
static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
/*
 * degrade_factor[idx][j]: (1 - 1/2^idx)^(2^j) on the 128 point scale, i.e.
 * the factor applied by decay_load_missed() when bit j of the missed tick
 * count is set.
 */
static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 }
};
4365 
4366 /*
4367  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4368  * would be when CPU is idle and so we just decay the old load without
4369  * adding any new load.
4370  */
4371 static unsigned long
4372 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4373 {
4374 	int j = 0;
4375 
4376 	if (!missed_updates)
4377 		return load;
4378 
4379 	if (missed_updates >= degrade_zero_ticks[idx])
4380 		return 0;
4381 
4382 	if (idx == 1)
4383 		return load >> missed_updates;
4384 
4385 	while (missed_updates) {
4386 		if (missed_updates % 2)
4387 			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4388 
4389 		missed_updates >>= 1;
4390 		j++;
4391 	}
4392 	return load;
4393 }
4394 
4395 /**
4396  * __update_cpu_load - update the rq->cpu_load[] statistics
4397  * @this_rq: The rq to update statistics for
4398  * @this_load: The current load
4399  * @pending_updates: The number of missed updates
4400  * @active: !0 for NOHZ_FULL
4401  *
4402  * Update rq->cpu_load[] statistics. This function is usually called every
4403  * scheduler tick (TICK_NSEC).
4404  *
4405  * This function computes a decaying average:
4406  *
4407  *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
4408  *
4409  * Because of NOHZ it might not get called on every tick which gives need for
4410  * the @pending_updates argument.
4411  *
4412  *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
4413  *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
4414  *             = A * (A * load[i]_n-2 + B) + B
4415  *             = A * (A * (A * load[i]_n-3 + B) + B) + B
4416  *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
4417  *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
4418  *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
4419  *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
4420  *
4421  * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
4422  * any change in load would have resulted in the tick being turned back on.
4423  *
4424  * For regular NOHZ, this reduces to:
4425  *
4426  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
4427  *
4428  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
4429  * term. See the @active paramter.
4430  */
4431 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4432 			      unsigned long pending_updates, int active)
4433 {
4434 	unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
4435 	int i, scale;
4436 
4437 	this_rq->nr_load_updates++;
4438 
4439 	/* Update our load: */
4440 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4441 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4442 		unsigned long old_load, new_load;
4443 
4444 		/* scale is effectively 1 << i now, and >> i divides by scale */
4445 
4446 		old_load = this_rq->cpu_load[i] - tickless_load;
4447 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
4448 		old_load += tickless_load;
4449 		new_load = this_load;
4450 		/*
4451 		 * Round up the averaging division if load is increasing. This
4452 		 * prevents us from getting stuck on 9 if the load is 10, for
4453 		 * example.
4454 		 */
4455 		if (new_load > old_load)
4456 			new_load += scale - 1;
4457 
4458 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4459 	}
4460 
4461 	sched_avg_update(this_rq);
4462 }
4463 
4464 /* Used instead of source_load when we know the type == 0 */
4465 static unsigned long weighted_cpuload(const int cpu)
4466 {
4467 	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4468 }
4469 
4470 #ifdef CONFIG_NO_HZ_COMMON
4471 /*
4472  * There is no sane way to deal with nohz on smp when using jiffies because the
4473  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4474  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4475  *
4476  * Therefore we cannot use the delta approach from the regular tick since that
4477  * would seriously skew the load calculation. However we'll make do for those
4478  * updates happening while idle (nohz_idle_balance) or coming out of idle
4479  * (tick_nohz_idle_exit).
4480  *
4481  * This means we might still be one tick off for nohz periods.
4482  */
4483 
4484 /*
4485  * Called from nohz_idle_balance() to update the load ratings before doing the
4486  * idle balance.
4487  */
4488 static void update_idle_cpu_load(struct rq *this_rq)
4489 {
4490 	unsigned long curr_jiffies = READ_ONCE(jiffies);
4491 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
4492 	unsigned long pending_updates;
4493 
4494 	/*
4495 	 * bail if there's load or we're actually up-to-date.
4496 	 */
4497 	if (load || curr_jiffies == this_rq->last_load_update_tick)
4498 		return;
4499 
4500 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4501 	this_rq->last_load_update_tick = curr_jiffies;
4502 
4503 	__update_cpu_load(this_rq, load, pending_updates, 0);
4504 }
4505 
4506 /*
4507  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4508  */
4509 void update_cpu_load_nohz(int active)
4510 {
4511 	struct rq *this_rq = this_rq();
4512 	unsigned long curr_jiffies = READ_ONCE(jiffies);
4513 	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
4514 	unsigned long pending_updates;
4515 
4516 	if (curr_jiffies == this_rq->last_load_update_tick)
4517 		return;
4518 
4519 	raw_spin_lock(&this_rq->lock);
4520 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4521 	if (pending_updates) {
4522 		this_rq->last_load_update_tick = curr_jiffies;
4523 		/*
4524 		 * In the regular NOHZ case, we were idle, this means load 0.
4525 		 * In the NOHZ_FULL case, we were non-idle, we should consider
4526 		 * its weighted load.
4527 		 */
4528 		__update_cpu_load(this_rq, load, pending_updates, active);
4529 	}
4530 	raw_spin_unlock(&this_rq->lock);
4531 }
#endif /* CONFIG_NO_HZ_COMMON */
4533 
4534 /*
4535  * Called from scheduler_tick()
4536  */
4537 void update_cpu_load_active(struct rq *this_rq)
4538 {
4539 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
4540 	/*
4541 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4542 	 */
4543 	this_rq->last_load_update_tick = jiffies;
4544 	__update_cpu_load(this_rq, load, 1, 1);
4545 }
4546 
4547 /*
4548  * Return a low guess at the load of a migration-source cpu weighted
4549  * according to the scheduling class and "nice" value.
4550  *
4551  * We want to under-estimate the load of migration sources, to
4552  * balance conservatively.
4553  */
4554 static unsigned long source_load(int cpu, int type)
4555 {
4556 	struct rq *rq = cpu_rq(cpu);
4557 	unsigned long total = weighted_cpuload(cpu);
4558 
4559 	if (type == 0 || !sched_feat(LB_BIAS))
4560 		return total;
4561 
4562 	return min(rq->cpu_load[type-1], total);
4563 }
4564 
4565 /*
4566  * Return a high guess at the load of a migration-target cpu weighted
4567  * according to the scheduling class and "nice" value.
4568  */
4569 static unsigned long target_load(int cpu, int type)
4570 {
4571 	struct rq *rq = cpu_rq(cpu);
4572 	unsigned long total = weighted_cpuload(cpu);
4573 
4574 	if (type == 0 || !sched_feat(LB_BIAS))
4575 		return total;
4576 
4577 	return max(rq->cpu_load[type-1], total);
4578 }
4579 
4580 static unsigned long capacity_of(int cpu)
4581 {
4582 	return cpu_rq(cpu)->cpu_capacity;
4583 }
4584 
4585 static unsigned long capacity_orig_of(int cpu)
4586 {
4587 	return cpu_rq(cpu)->cpu_capacity_orig;
4588 }
4589 
4590 static unsigned long cpu_avg_load_per_task(int cpu)
4591 {
4592 	struct rq *rq = cpu_rq(cpu);
4593 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4594 	unsigned long load_avg = weighted_cpuload(cpu);
4595 
4596 	if (nr_running)
4597 		return load_avg / nr_running;
4598 
4599 	return 0;
4600 }
4601 
4602 static void record_wakee(struct task_struct *p)
4603 {
4604 	/*
4605 	 * Rough decay (wiping) for cost saving, don't worry
4606 	 * about the boundary, really active task won't care
4607 	 * about the loss.
4608 	 */
4609 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4610 		current->wakee_flips >>= 1;
4611 		current->wakee_flip_decay_ts = jiffies;
4612 	}
4613 
4614 	if (current->last_wakee != p) {
4615 		current->last_wakee = p;
4616 		current->wakee_flips++;
4617 	}
4618 }
4619 
/*
 * Called for a task @p that is being woken: subtract the min_vruntime of the
 * cfs_rq its entity currently belongs to from the entity's vruntime, then
 * record the wakeup in the waker's wakee statistics.
 */
static void task_waking_fair(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 min_vruntime;

#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;

	/*
	 * Without CONFIG_64BIT: read the copy first, then min_vruntime after
	 * a read barrier, and retry until both agree.
	 * NOTE(review): presumably guards against a torn 64-bit read racing
	 * with the writer; confirm against where min_vruntime_copy is written.
	 */
	do {
		min_vruntime_copy = cfs_rq->min_vruntime_copy;
		smp_rmb();
		min_vruntime = cfs_rq->min_vruntime;
	} while (min_vruntime != min_vruntime_copy);
#else
	min_vruntime = cfs_rq->min_vruntime;
#endif

	/* vruntime is now relative to this cfs_rq's min_vruntime */
	se->vruntime -= min_vruntime;
	record_wakee(p);
}
4641 
4642 #ifdef CONFIG_FAIR_GROUP_SCHED
4643 /*
4644  * effective_load() calculates the load change as seen from the root_task_group
4645  *
4646  * Adding load to a group doesn't make a group heavier, but can cause movement
4647  * of group shares between cpus. Assuming the shares were perfectly aligned one
4648  * can calculate the shift in shares.
4649  *
4650  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4651  * on this @cpu and results in a total addition (subtraction) of @wg to the
4652  * total group weight.
4653  *
4654  * Given a runqueue weight distribution (rw_i) we can compute a shares
4655  * distribution (s_i) using:
4656  *
4657  *   s_i = rw_i / \Sum rw_j						(1)
4658  *
4659  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4660  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4661  * shares distribution (s_i):
4662  *
4663  *   rw_i = {   2,   4,   1,   0 }
4664  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4665  *
4666  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4667  * task used to run on and the CPU the waker is running on), we need to
4668  * compute the effect of waking a task on either CPU and, in case of a sync
4669  * wakeup, compute the effect of the current task going to sleep.
4670  *
 * So for a change of @wl to the local @cpu with an overall group weight change
 * of @wg we can compute the new shares distribution (s'_i) using:
4673  *
4674  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
4675  *
4676  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4677  * differences in waking a task to CPU 0. The additional task changes the
4678  * weight and shares distributions like:
4679  *
4680  *   rw'_i = {   3,   4,   1,   0 }
4681  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4682  *
4683  * We can then compute the difference in effective weight by using:
4684  *
4685  *   dw_i = S * (s'_i - s_i)						(3)
4686  *
4687  * Where 'S' is the group weight as seen by its parent.
4688  *
4689  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4690  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4691  * 4/7) times the weight of the group.
4692  */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	/*
	 * @tg:  group the weight is added to (or removed from)
	 * @cpu: cpu on which the local weight changes by @wl
	 * @wl:  local weight change; on return, the resulting load change
	 *       after walking up through all parent groups
	 * @wg:  change of the total group weight
	 */
	struct sched_entity *se = tg->se[cpu];

	if (!tg->parent)	/* the trivial, non-cgroup case */
		return wl;

	/* walk from @tg's entity on @cpu upwards, one group level per pass */
	for_each_sched_entity(se) {
		long w, W;

		/* the group whose per-cpu runqueue this entity represents */
		tg = se->my_q->tg;

		/*
		 * W = @wg + \Sum rw_j
		 */
		W = wg + calc_tg_weight(tg, se->my_q);

		/*
		 * w = rw_i + @wl
		 */
		w = cfs_rq_load_avg(se->my_q) + wl;

		/*
		 * wl = S * s'_i; see (2)
		 */
		if (W > 0 && w < W)
			wl = (w * (long)tg->shares) / W;
		else
			/* s'_i would be >= 1 (or W is not positive): use all shares */
			wl = tg->shares;

		/*
		 * Per the above, wl is the new se->load.weight value; since
		 * those are clipped to [MIN_SHARES, ...) do so now. See
		 * calc_cfs_shares().
		 */
		if (wl < MIN_SHARES)
			wl = MIN_SHARES;

		/*
		 * wl = dw_i = S * (s'_i - s_i); see (3)
		 */
		wl -= se->avg.load_avg;

		/*
		 * Recursively apply this logic to all parent groups to compute
		 * the final effective load change on the root group. Since
		 * only the @tg group gets extra weight, all parent groups can
		 * only redistribute existing shares. @wl is the shift in shares
		 * resulting from this level per the above.
		 */
		wg = 0;
	}

	return wl;
}
4748 #else
4749 
/*
 * !CONFIG_FAIR_GROUP_SCHED variant: there is no group hierarchy, so the
 * load change is @wl itself.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	return wl;
}
4754 
4755 #endif
4756 
4757 /*
4758  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
4759  * A waker of many should wake a different task than the one last awakened
4760  * at a frequency roughly N times higher than one of its wakees.  In order
4761  * to determine whether we should let the load spread vs consolodating to
4762  * shared cache, we look for a minimum 'flip' frequency of llc_size in one
4763  * partner, and a factor of lls_size higher frequency in the other.  With
4764  * both conditions met, we can be relatively sure that the relationship is
4765  * non-monogamous, with partner count exceeding socket size.  Waker/wakee
4766  * being client/server, worker/dispatcher, interrupt source or whatever is
4767  * irrelevant, spread criteria is apparent partner count exceeds socket size.
4768  */
4769 static int wake_wide(struct task_struct *p)
4770 {
4771 	unsigned int master = current->wakee_flips;
4772 	unsigned int slave = p->wakee_flips;
4773 	int factor = this_cpu_read(sd_llc_size);
4774 
4775 	if (master < slave)
4776 		swap(master, slave);
4777 	if (slave < factor || master < slave * factor)
4778 		return 0;
4779 	return 1;
4780 }
4781 
/*
 * Decide whether task @p, which last ran on task_cpu(p), may be woken on the
 * waking cpu instead.
 *
 * @sd:   domain supplying wake_idx and imbalance_pct
 * @p:    task being woken
 * @sync: non-zero for a sync wakeup; the waker's own load is then discounted
 *
 * Returns 1 when the resulting loads are considered balanced, 0 otherwise.
 */
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
	s64 this_load, load;
	s64 this_eff_load, prev_eff_load;
	int idx, this_cpu, prev_cpu;
	struct task_group *tg;
	unsigned long weight;
	int balanced;

	idx	  = sd->wake_idx;
	this_cpu  = smp_processor_id();
	prev_cpu  = task_cpu(p);
	/* low guess for the previous cpu, high guess for this cpu */
	load	  = source_load(prev_cpu, idx);
	this_load = target_load(this_cpu, idx);

	/*
	 * If sync wakeup then subtract the (maximum possible)
	 * effect of the currently running task from the load
	 * of the current CPU:
	 */
	if (sync) {
		tg = task_group(current);
		weight = current->se.avg.load_avg;

		this_load += effective_load(tg, this_cpu, -weight, -weight);
		load += effective_load(tg, prev_cpu, 0, -weight);
	}

	/* from here on, tg and weight describe the task being woken */
	tg = task_group(p);
	weight = p->se.avg.load_avg;

	/*
	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
	 * due to the sync cause above having dropped this_load to 0, we'll
	 * always have an imbalance, but there's really nothing you can do
	 * about that, so that's good too.
	 *
	 * Otherwise check if either cpus are near enough in load to allow this
	 * task to be woken on this_cpu.
	 */
	/*
	 * Each side is multiplied by the capacity of the *other* cpu, so the
	 * final comparison relates capacity-relative loads without dividing.
	 */
	this_eff_load = 100;
	this_eff_load *= capacity_of(prev_cpu);

	/* the previous cpu's side gets half of the domain's imbalance margin */
	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
	prev_eff_load *= capacity_of(this_cpu);

	/* loads are only factored in while this cpu still has load */
	if (this_load > 0) {
		this_eff_load *= this_load +
			effective_load(tg, this_cpu, weight, weight);

		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
	}

	balanced = this_eff_load <= prev_eff_load;

	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);

	if (!balanced)
		return 0;

	schedstat_inc(sd, ttwu_move_affine);
	schedstat_inc(p, se.statistics.nr_wakeups_affine);

	return 1;
}
4847 
4848 /*
4849  * find_idlest_group finds and returns the least busy CPU group within the
4850  * domain.
4851  */
4852 static struct sched_group *
4853 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4854 		  int this_cpu, int sd_flag)
4855 {
4856 	struct sched_group *idlest = NULL, *group = sd->groups;
4857 	unsigned long min_load = ULONG_MAX, this_load = 0;
4858 	int load_idx = sd->forkexec_idx;
4859 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
4860 
4861 	if (sd_flag & SD_BALANCE_WAKE)
4862 		load_idx = sd->wake_idx;
4863 
4864 	do {
4865 		unsigned long load, avg_load;
4866 		int local_group;
4867 		int i;
4868 
4869 		/* Skip over this group if it has no CPUs allowed */
4870 		if (!cpumask_intersects(sched_group_cpus(group),
4871 					tsk_cpus_allowed(p)))
4872 			continue;
4873 
4874 		local_group = cpumask_test_cpu(this_cpu,
4875 					       sched_group_cpus(group));
4876 
4877 		/* Tally up the load of all CPUs in the group */
4878 		avg_load = 0;
4879 
4880 		for_each_cpu(i, sched_group_cpus(group)) {
4881 			/* Bias balancing toward cpus of our domain */
4882 			if (local_group)
4883 				load = source_load(i, load_idx);
4884 			else
4885 				load = target_load(i, load_idx);
4886 
4887 			avg_load += load;
4888 		}
4889 
4890 		/* Adjust by relative CPU capacity of the group */
4891 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4892 
4893 		if (local_group) {
4894 			this_load = avg_load;
4895 		} else if (avg_load < min_load) {
4896 			min_load = avg_load;
4897 			idlest = group;
4898 		}
4899 	} while (group = group->next, group != sd->groups);
4900 
4901 	if (!idlest || 100*this_load < imbalance*min_load)
4902 		return NULL;
4903 	return idlest;
4904 }
4905 
4906 /*
4907  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4908  */
4909 static int
4910 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4911 {
4912 	unsigned long load, min_load = ULONG_MAX;
4913 	unsigned int min_exit_latency = UINT_MAX;
4914 	u64 latest_idle_timestamp = 0;
4915 	int least_loaded_cpu = this_cpu;
4916 	int shallowest_idle_cpu = -1;
4917 	int i;
4918 
4919 	/* Traverse only the allowed CPUs */
4920 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4921 		if (idle_cpu(i)) {
4922 			struct rq *rq = cpu_rq(i);
4923 			struct cpuidle_state *idle = idle_get_state(rq);
4924 			if (idle && idle->exit_latency < min_exit_latency) {
4925 				/*
4926 				 * We give priority to a CPU whose idle state
4927 				 * has the smallest exit latency irrespective
4928 				 * of any idle timestamp.
4929 				 */
4930 				min_exit_latency = idle->exit_latency;
4931 				latest_idle_timestamp = rq->idle_stamp;
4932 				shallowest_idle_cpu = i;
4933 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
4934 				   rq->idle_stamp > latest_idle_timestamp) {
4935 				/*
4936 				 * If equal or no active idle state, then
4937 				 * the most recently idled CPU might have
4938 				 * a warmer cache.
4939 				 */
4940 				latest_idle_timestamp = rq->idle_stamp;
4941 				shallowest_idle_cpu = i;
4942 			}
4943 		} else if (shallowest_idle_cpu == -1) {
4944 			load = weighted_cpuload(i);
4945 			if (load < min_load || (load == min_load && i == this_cpu)) {
4946 				min_load = load;
4947 				least_loaded_cpu = i;
4948 			}
4949 		}
4950 	}
4951 
4952 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4953 }
4954 
/*
 * Try and locate an idle CPU in the sched_domain.
 *
 * @p:      task to place
 * @target: preferred cpu
 *
 * Returns @target if it is idle, else @p's previous cpu if that one shares
 * cache with @target and is idle, else the first allowed cpu of a fully idle
 * group found while scanning downwards from @target's sd_llc domain, else
 * @target.
 */
static int select_idle_sibling(struct task_struct *p, int target)
{
	struct sched_domain *sd;
	struct sched_group *sg;
	/* starts out as the task's previous cpu; reused as loop cursor below */
	int i = task_cpu(p);

	if (idle_cpu(target))
		return target;

	/*
	 * If the previous cpu is cache affine and idle, don't be stupid.
	 */
	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
		return i;

	/*
	 * Otherwise, iterate the domains and find an eligible idle cpu.
	 */
	/* NOTE(review): rcu_dereference() suggests callers hold rcu_read_lock() - confirm */
	sd = rcu_dereference(per_cpu(sd_llc, target));
	for_each_lower_domain(sd) {
		sg = sd->groups;
		do {
			/* the task must be allowed on at least one cpu of the group */
			if (!cpumask_intersects(sched_group_cpus(sg),
						tsk_cpus_allowed(p)))
				goto next;

			/* every cpu of the group must be idle, and none may be target */
			for_each_cpu(i, sched_group_cpus(sg)) {
				if (i == target || !idle_cpu(i))
					goto next;
			}

			/* fully idle group: take its first cpu the task may run on */
			target = cpumask_first_and(sched_group_cpus(sg),
					tsk_cpus_allowed(p));
			goto done;
next:
			sg = sg->next;
		} while (sg != sd->groups);
	}
done:
	return target;
}
4999 
5000 /*
5001  * cpu_util returns the amount of capacity of a CPU that is used by CFS
5002  * tasks. The unit of the return value must be the one of capacity so we can
5003  * compare the utilization with the capacity of the CPU that is available for
5004  * CFS task (ie cpu_capacity).
5005  *
5006  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5007  * recent utilization of currently non-runnable tasks on a CPU. It represents
5008  * the amount of utilization of a CPU in the range [0..capacity_orig] where
5009  * capacity_orig is the cpu_capacity available at the highest frequency
5010  * (arch_scale_freq_capacity()).
5011  * The utilization of a CPU converges towards a sum equal to or less than the
5012  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
5013  * the running time on this CPU scaled by capacity_curr.
5014  *
5015  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
5016  * higher than capacity_orig because of unfortunate rounding in
5017  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
5018  * the average stabilizes with the new running time. We need to check that the
5019  * utilization stays within the range of [0..capacity_orig] and cap it if
5020  * necessary. Without utilization capping, a group could be seen as overloaded
5021  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
5022  * available capacity. We allow utilization to overshoot capacity_curr (but not
5023  * capacity_orig) as it useful for predicting the capacity required after task
5024  * migrations (scheduler-driven DVFS).
5025  */
5026 static int cpu_util(int cpu)
5027 {
5028 	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
5029 	unsigned long capacity = capacity_orig_of(cpu);
5030 
5031 	return (util >= capacity) ? capacity : util;
5032 }
5033 
5034 /*
5035  * select_task_rq_fair: Select target runqueue for the waking task in domains
5036  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5037  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
5038  *
5039  * Balances load by selecting the idlest cpu in the idlest group, or under
5040  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
5041  *
      * @p:		task for which a cpu is being selected
      * @prev_cpu:	starting value of the result; returned unchanged when
      *		nothing below picks another cpu
      * @sd_flag:	balance flag that a domain must carry to be considered
      * @wake_flags:	only WF_SYNC is examined here; it is handed to
      *		wake_affine()
      *
5042  * Returns the target cpu number.
5043  *
5044  * preempt must be disabled.
5045  */
5046 static int
5047 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5048 {
5049 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5050 	int cpu = smp_processor_id();
5051 	int new_cpu = prev_cpu;
5052 	int want_affine = 0;
5053 	int sync = wake_flags & WF_SYNC;
5054 
     	/*
     	 * An affine wakeup is only considered for SD_BALANCE_WAKE, and only
     	 * if wake_wide() does not object and the current cpu is in p's
     	 * allowed mask.
     	 */
5055 	if (sd_flag & SD_BALANCE_WAKE)
5056 		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5057 
5058 	rcu_read_lock();
     	/*
     	 * Walk the domains of the current cpu. The walk stops at the first
     	 * domain without SD_LOAD_BALANCE or at the first usable affine
     	 * domain; otherwise 'sd' ends up as the last visited domain that
     	 * has sd_flag set.
     	 */
5059 	for_each_domain(cpu, tmp) {
5060 		if (!(tmp->flags & SD_LOAD_BALANCE))
5061 			break;
5062 
5063 		/*
5064 		 * If both cpu and prev_cpu are part of this domain,
5065 		 * cpu is a valid SD_WAKE_AFFINE target.
5066 		 */
5067 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5068 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5069 			affine_sd = tmp;
5070 			break;
5071 		}
5072 
5073 		if (tmp->flags & sd_flag)
5074 			sd = tmp;
5075 		else if (!want_affine)
5076 			break;
5077 	}
5078 
5079 	if (affine_sd) {
5080 		sd = NULL; /* Prefer wake_affine over balance flags */
5081 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5082 			new_cpu = cpu;
5083 	}
5084 
     	/*
     	 * Without a balancing domain, wakeups look for an idle sibling of
     	 * new_cpu. With one, descend the hierarchy and pick the idlest
     	 * group and cpu at each level.
     	 */
5085 	if (!sd) {
5086 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5087 			new_cpu = select_idle_sibling(p, new_cpu);
5088 
5089 	} else while (sd) {
5090 		struct sched_group *group;
5091 		int weight;
5092 
     		/* Domains lacking sd_flag are skipped on the way down. */
5093 		if (!(sd->flags & sd_flag)) {
5094 			sd = sd->child;
5095 			continue;
5096 		}
5097 
5098 		group = find_idlest_group(sd, p, cpu, sd_flag);
5099 		if (!group) {
5100 			sd = sd->child;
5101 			continue;
5102 		}
5103 
5104 		new_cpu = find_idlest_cpu(group, p, cpu);
5105 		if (new_cpu == -1 || new_cpu == cpu) {
5106 			/* Now try balancing at a lower domain level of cpu */
5107 			sd = sd->child;
5108 			continue;
5109 		}
5110 
5111 		/* Now try balancing at a lower domain level of new_cpu */
5112 		cpu = new_cpu;
5113 		weight = sd->span_weight;
5114 		sd = NULL;
     		/*
     		 * Among new_cpu's domains that are strictly smaller than the
     		 * one just handled, keep the last one carrying sd_flag.
     		 */
5115 		for_each_domain(cpu, tmp) {
5116 			if (weight <= tmp->span_weight)
5117 				break;
5118 			if (tmp->flags & sd_flag)
5119 				sd = tmp;
5120 		}
5121 		/* while loop will break here if sd == NULL */
5122 	}
5123 	rcu_read_unlock();
5124 
5125 	return new_cpu;
5126 }
5127 
5128 /*
5129  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5130  * cfs_rq_of(p) references at time of call are still valid and identify the
5131  * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
5132  */
5133 static void migrate_task_rq_fair(struct task_struct *p)
5134 {
5135 	/*
5136 	 * We are supposed to update the task to "current" time, then its up to date
5137 	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5138 	 * what current time is, so simply throw away the out-of-date time. This
5139 	 * will result in the wakee task is less decayed, but giving the wakee more
5140 	 * load sounds not bad.
5141 	 */
5142 	remove_entity_load_avg(&p->se);
5143 
5144 	/* Tell new CPU we are migrated */
5145 	p->se.avg.last_update_time = 0;
5146 
5147 	/* We have migrated, no longer consider this task hot */
5148 	p->se.exec_start = 0;
5149 }
5150 
5151 static void task_dead_fair(struct task_struct *p)
5152 {
5153 	remove_entity_load_avg(&p->se);
5154 }
5155 #endif /* CONFIG_SMP */
5156 
5157 static unsigned long
5158 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5159 {
5160 	unsigned long gran = sysctl_sched_wakeup_granularity;
5161 
5162 	/*
5163 	 * Since its curr running now, convert the gran from real-time
5164 	 * to virtual-time in his units.
5165 	 *
5166 	 * By using 'se' instead of 'curr' we penalize light tasks, so
5167 	 * they get preempted easier. That is, if 'se' < 'curr' then
5168 	 * the resulting gran will be larger, therefore penalizing the
5169 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
5170 	 * be smaller, again penalizing the lighter task.
5171 	 *
5172 	 * This is especially important for buddies when the leftmost
5173 	 * task is higher priority than the buddy.
5174 	 */
5175 	return calc_delta_fair(gran, se);
5176 }
5177 
5178 /*
5179  * Should 'se' preempt 'curr'.
5180  *
5181  *             |s1
5182  *        |s2
5183  *   |s3
5184  *         g
5185  *      |<--->|c
5186  *
5187  *  w(c, s1) = -1
5188  *  w(c, s2) =  0
5189  *  w(c, s3) =  1
5190  *
5191  */
5192 static int
5193 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5194 {
5195 	s64 gran, vdiff = curr->vruntime - se->vruntime;
5196 
5197 	if (vdiff <= 0)
5198 		return -1;
5199 
5200 	gran = wakeup_gran(curr, se);
5201 	if (vdiff > gran)
5202 		return 1;
5203 
5204 	return 0;
5205 }
5206 
5207 static void set_last_buddy(struct sched_entity *se)
5208 {
5209 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5210 		return;
5211 
5212 	for_each_sched_entity(se)
5213 		cfs_rq_of(se)->last = se;
5214 }
5215 
5216 static void set_next_buddy(struct sched_entity *se)
5217 {
5218 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5219 		return;
5220 
5221 	for_each_sched_entity(se)
5222 		cfs_rq_of(se)->next = se;
5223 }
5224 
5225 static void set_skip_buddy(struct sched_entity *se)
5226 {
5227 	for_each_sched_entity(se)
5228 		cfs_rq_of(se)->skip = se;
5229 }
5230 
5231 /*
5232  * Preempt the current task with a newly woken task if needed:
      *
      * @rq:		runqueue whose current task may get preempted
      * @p:		the newly woken task
      * @wake_flags:	only WF_FORK is examined here; it suppresses the
      *		NEXT_BUDDY nomination
      *
      * Preemption is requested through resched_curr(rq). Depending on the
      * NEXT_BUDDY/LAST_BUDDY features the function also nominates buddies.
5233  */
5234 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5235 {
5236 	struct task_struct *curr = rq->curr;
5237 	struct sched_entity *se = &curr->se, *pse = &p->se;
5238 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
     	/* Both buddy nominations below are gated on this. */
5239 	int scale = cfs_rq->nr_running >= sched_nr_latency;
5240 	int next_buddy_marked = 0;
5241 
5242 	if (unlikely(se == pse))
5243 		return;
5244 
5245 	/*
5246 	 * This is possible from callers such as attach_tasks(), in which we
5247 	 * unconditionally check_preempt_curr() after an enqueue (which may have
5248 	 * led to a throttle).  This both saves work and prevents false
5249 	 * next-buddy nomination below.
5250 	 */
5251 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5252 		return;
5253 
5254 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5255 		set_next_buddy(pse);
5256 		next_buddy_marked = 1;
5257 	}
5258 
5259 	/*
5260 	 * We can come here with TIF_NEED_RESCHED already set from new task
5261 	 * wake up path.
5262 	 *
5263 	 * Note: this also catches the edge-case of curr being in a throttled
5264 	 * group (e.g. via set_curr_task), since update_curr() (in the
5265 	 * enqueue of curr) will have resulted in resched being set.  This
5266 	 * prevents us from potentially nominating it as a false LAST_BUDDY
5267 	 * below.
5268 	 */
5269 	if (test_tsk_need_resched(curr))
5270 		return;
5271 
5272 	/* Idle tasks are by definition preempted by non-idle tasks. */
5273 	if (unlikely(curr->policy == SCHED_IDLE) &&
5274 	    likely(p->policy != SCHED_IDLE))
5275 		goto preempt;
5276 
5277 	/*
5278 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5279 	 * is driven by the tick):
5280 	 */
5281 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5282 		return;
5283 
     	/*
     	 * From here on se/pse are whatever find_matching_se() left in them;
     	 * the vruntime comparison below is done on that pair.
     	 */
5284 	find_matching_se(&se, &pse);
5285 	update_curr(cfs_rq_of(se));
5286 	BUG_ON(!pse);
5287 	if (wakeup_preempt_entity(se, pse) == 1) {
5288 		/*
5289 		 * Bias pick_next to pick the sched entity that is
5290 		 * triggering this preemption.
5291 		 */
5292 		if (!next_buddy_marked)
5293 			set_next_buddy(pse);
5294 		goto preempt;
5295 	}
5296 
5297 	return;
5298 
5299 preempt:
5300 	resched_curr(rq);
5301 	/*
5302 	 * Only set the backward buddy when the current task is still
5303 	 * on the rq. This can happen when a wakeup gets interleaved
5304 	 * with schedule on the ->pre_schedule() or idle_balance()
5305 	 * point, either of which can drop the rq lock.
5306 	 *
5307 	 * Also, during early boot the idle thread is in the fair class,
5308 	 * for obvious reasons it's a bad idea to schedule back to it.
5309 	 */
5310 	if (unlikely(!se->on_rq || curr == rq->idle))
5311 		return;
5312 
5313 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5314 		set_last_buddy(se);
5315 }
5316 
     /*
      * pick_next_task_fair - choose the next fair-class task to run on @rq.
      *
      * @rq:	runqueue to pick from
      * @prev:	the task that was running before this pick
      *
      * Returns the chosen task. When the cfs_rq has nothing runnable,
      * idle_balance() is tried: a negative result yields RETRY_TASK, a positive
      * one restarts the pick, and zero yields NULL.
      *
      * With CONFIG_FAIR_GROUP_SCHED, and when @prev itself belongs to the fair
      * class, the pick is done without a full put_prev_task(); only the parts
      * of the hierarchy that differ between @prev and the new task are put and
      * set. All other cases go through the 'simple' path.
      */
5317 static struct task_struct *
5318 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
5319 {
5320 	struct cfs_rq *cfs_rq = &rq->cfs;
5321 	struct sched_entity *se;
5322 	struct task_struct *p;
5323 	int new_tasks;
5324 
5325 again:
5326 #ifdef CONFIG_FAIR_GROUP_SCHED
5327 	if (!cfs_rq->nr_running)
5328 		goto idle;
5329 
5330 	if (prev->sched_class != &fair_sched_class)
5331 		goto simple;
5332 
5333 	/*
5334 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5335 	 * likely that a next task is from the same cgroup as the current.
5336 	 *
5337 	 * Therefore attempt to avoid putting and setting the entire cgroup
5338 	 * hierarchy, only change the part that actually changes.
5339 	 */
5340 
5341 	do {
5342 		struct sched_entity *curr = cfs_rq->curr;
5343 
5344 		/*
5345 		 * Since we got here without doing put_prev_entity() we also
5346 		 * have to consider cfs_rq->curr. If it is still a runnable
5347 		 * entity, update_curr() will update its vruntime, otherwise
5348 		 * forget we've ever seen it.
5349 		 */
5350 		if (curr) {
5351 			if (curr->on_rq)
5352 				update_curr(cfs_rq);
5353 			else
5354 				curr = NULL;
5355 
5356 			/*
5357 			 * This call to check_cfs_rq_runtime() will do the
5358 			 * throttle and dequeue its entity in the parent(s).
5359 			 * Therefore the 'simple' nr_running test will indeed
5360 			 * be correct.
5361 			 */
5362 			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5363 				goto simple;
5364 		}
5365 
     		/* Descend one level: the picked entity may itself be a group. */
5366 		se = pick_next_entity(cfs_rq, curr);
5367 		cfs_rq = group_cfs_rq(se);
5368 	} while (cfs_rq);
5369 
5370 	p = task_of(se);
5371 
5372 	/*
5373 	 * Since we haven't yet done put_prev_entity and if the selected task
5374 	 * is a different task than we started out with, try and touch the
5375 	 * least amount of cfs_rqs.
5376 	 */
5377 	if (prev != p) {
5378 		struct sched_entity *pse = &prev->se;
5379 
     		/*
     		 * Walk both entities upwards, deeper one first, until
     		 * is_same_group() reports a common cfs_rq. Levels left behind
     		 * on prev's side are put, those on the new side are set.
     		 */
5380 		while (!(cfs_rq = is_same_group(se, pse))) {
5381 			int se_depth = se->depth;
5382 			int pse_depth = pse->depth;
5383 
5384 			if (se_depth <= pse_depth) {
5385 				put_prev_entity(cfs_rq_of(pse), pse);
5386 				pse = parent_entity(pse);
5387 			}
5388 			if (se_depth >= pse_depth) {
5389 				set_next_entity(cfs_rq_of(se), se);
5390 				se = parent_entity(se);
5391 			}
5392 		}
5393 
5394 		put_prev_entity(cfs_rq, pse);
5395 		set_next_entity(cfs_rq, se);
5396 	}
5397 
5398 	if (hrtick_enabled(rq))
5399 		hrtick_start_fair(rq, p);
5400 
5401 	return p;
5402 simple:
5403 	cfs_rq = &rq->cfs;
5404 #endif
5405 
5406 	if (!cfs_rq->nr_running)
5407 		goto idle;
5408 
5409 	put_prev_task(rq, prev);
5410 
     	/* Pick and set an entity at every level down to a task. */
5411 	do {
5412 		se = pick_next_entity(cfs_rq, NULL);
5413 		set_next_entity(cfs_rq, se);
5414 		cfs_rq = group_cfs_rq(se);
5415 	} while (cfs_rq);
5416 
5417 	p = task_of(se);
5418 
5419 	if (hrtick_enabled(rq))
5420 		hrtick_start_fair(rq, p);
5421 
5422 	return p;
5423 
5424 idle:
5425 	/*
5426 	 * This is OK, because current is on_cpu, which avoids it being picked
5427 	 * for load-balance and preemption/IRQs are still disabled avoiding
5428 	 * further scheduler activity on it and we're being very careful to
5429 	 * re-start the picking loop.
5430 	 */
5431 	lockdep_unpin_lock(&rq->lock);
5432 	new_tasks = idle_balance(rq);
5433 	lockdep_pin_lock(&rq->lock);
5434 	/*
5435 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
5436 	 * possible for any higher priority task to appear. In that case we
5437 	 * must re-start the pick_next_entity() loop.
5438 	 */
5439 	if (new_tasks < 0)
5440 		return RETRY_TASK;
5441 
5442 	if (new_tasks > 0)
5443 		goto again;
5444 
5445 	return NULL;
5446 }
5447 
5448 /*
5449  * Account for a descheduled task:
5450  */
5451 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5452 {
5453 	struct sched_entity *se = &prev->se;
5454 	struct cfs_rq *cfs_rq;
5455 
5456 	for_each_sched_entity(se) {
5457 		cfs_rq = cfs_rq_of(se);
5458 		put_prev_entity(cfs_rq, se);
5459 	}
5460 }
5461 
5462 /*
5463  * sched_yield() is very simple
5464  *
5465  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5466  */
5467 static void yield_task_fair(struct rq *rq)
5468 {
5469 	struct task_struct *curr = rq->curr;
5470 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5471 	struct sched_entity *se = &curr->se;
5472 
5473 	/*
5474 	 * Are we the only task in the tree?
5475 	 */
5476 	if (unlikely(rq->nr_running == 1))
5477 		return;
5478 
5479 	clear_buddies(cfs_rq, se);
5480 
5481 	if (curr->policy != SCHED_BATCH) {
5482 		update_rq_clock(rq);
5483 		/*
5484 		 * Update run-time statistics of the 'current'.
5485 		 */
5486 		update_curr(cfs_rq);
5487 		/*
5488 		 * Tell update_rq_clock() that we've just updated,
5489 		 * so we don't do microscopic update in schedule()
5490 		 * and double the fastpath cost.
5491 		 */
5492 		rq_clock_skip_update(rq, true);
5493 	}
5494 
5495 	set_skip_buddy(se);
5496 }
5497 
5498 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5499 {
5500 	struct sched_entity *se = &p->se;
5501 
5502 	/* throttled hierarchies are not runnable */
5503 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5504 		return false;
5505 
5506 	/* Tell the scheduler that we'd really like pse to run next. */
5507 	set_next_buddy(se);
5508 
5509 	yield_task_fair(rq);
5510 
5511 	return true;
5512 }
5513 
5514 #ifdef CONFIG_SMP
5515 /**************************************************
5516  * Fair scheduling class load-balancing methods.
5517  *
5518  * BASICS
5519  *
5520  * The purpose of load-balancing is to achieve the same basic fairness the
5521  * per-cpu scheduler provides, namely provide a proportional amount of compute
5522  * time to each task. This is expressed in the following equation:
5523  *
5524  *   W_i,n/C_i == W_j,n/C_j for all i,j                               (1)
5525  *
5526  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5527  * W_i,0 is defined as:
5528  *
5529  *   W_i,0 = \Sum_j w_i,j                                             (2)
5530  *
5531  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5532  * is derived from the nice value as per prio_to_weight[].
5533  *
5534  * The weight average is an exponential decay average of the instantaneous
5535  * weight:
5536  *
5537  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5538  *
5539  * C_i is the compute capacity of cpu i, typically it is the
5540  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5541  * can also include other factors [XXX].
5542  *
5543  * To achieve this balance we define a measure of imbalance which follows
5544  * directly from (1):
5545  *
5546  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5547  *
5548  * We then move tasks around to minimize the imbalance. In the continuous
5549  * function space it is obvious this converges, in the discrete case we get
5550  * a few fun cases generally called infeasible weight scenarios.
5551  *
5552  * [XXX expand on:
5553  *     - infeasible weights;
5554  *     - local vs global optima in the discrete case. ]
5555  *
5556  *
5557  * SCHED DOMAINS
5558  *
5559  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5560  * for all i,j solution, we create a tree of cpus that follows the hardware
5561  * topology where each level pairs two lower groups (or better). This results
5562  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5563  * tree to only the first of the previous level and we decrease the frequency
5564  * of load-balance at each level inv. proportional to the number of cpus in
5565  * the groups.
5566  *
5567  * This yields:
5568  *
5569  *     log_2 n     1     n
5570  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5571  *     i = 0      2^i   2^i
5572  *                               `- size of each group
5573  *         |         |     `- number of cpus doing load-balance
5574  *         |         `- freq
5575  *         `- sum over all levels
5576  *
5577  * Coupled with a limit on how many tasks we can migrate every balance pass,
5578  * this makes (5) the runtime complexity of the balancer.
5579  *
5580  * An important property here is that each CPU is still (indirectly) connected
5581  * to every other cpu in at most O(log n) steps:
5582  *
5583  * The adjacency matrix of the resulting graph is given by:
5584  *
5585  *             log_2 n
5586  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5587  *             k = 0
5588  *
5589  * And you'll find that:
5590  *
5591  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5592  *
5593  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5594  * The task movement gives a factor of O(m), giving a convergence complexity
5595  * of:
5596  *
5597  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5598  *
5599  *
5600  * WORK CONSERVING
5601  *
5602  * In order to avoid CPUs going idle while there's still work to do, new idle
5603  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5604  * tree itself instead of relying on other CPUs to bring it work.
5605  *
5606  * This adds some complexity to both (5) and (8) but it reduces the total idle
5607  * time.
5608  *
5609  * [XXX more?]
5610  *
5611  *
5612  * CGROUPS
5613  *
5614  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5615  *
5616  *                                s_k,i
5617  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5618  *                                 S_k
5619  *
5620  * Where
5621  *
5622  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5623  *
5624  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5625  *
5626  * The big problem is S_k, it's a global sum needed to compute a local (W_i)
5627  * property.
5628  *
5629  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5630  *      rewrite all of this once again.]
5631  */
5632 
     /*
      * Initialised to HZ/10, i.e. a tenth of a second worth of jiffies.
      * NOTE(review): presumably an upper bound on the balance interval; the
      * users are outside this part of the file -- confirm.
      */
5633 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5634 
     /*
      * Value type of lb_env::fbq_type.
      * NOTE(review): the meaning of the individual values is not visible in
      * this part of the file.
      */
5635 enum fbq_type { regular, remote, all };
5636 
     /*
      * Bits of lb_env::flags, as used by can_migrate_task() and detach_tasks():
      *
      * LBF_ALL_PINNED  - cleared once a task that is allowed on dst_cpu is seen
      * LBF_NEED_BREAK  - set when detach_tasks() stops at a loop_break boundary
      * LBF_DST_PINNED  - set when an alternative new_dst_cpu was recorded
      * LBF_SOME_PINNED - set when a task is not allowed on dst_cpu
      */
5637 #define LBF_ALL_PINNED	0x01
5638 #define LBF_NEED_BREAK	0x02
5639 #define LBF_DST_PINNED  0x04
5640 #define LBF_SOME_PINNED	0x08
5641 
     /*
      * lb_env - state of one load-balancing operation, handed to the helpers
      * below (can_migrate_task(), detach_*(), attach_tasks(), ...).
      */
5642 struct lb_env {
     	/* Domain being balanced; supplies flags, failure counters and stats. */
5643 	struct sched_domain	*sd;
5644 
     	/* Runqueue/cpu tasks are taken from; src_rq->lock is held by helpers. */
5645 	struct rq		*src_rq;
5646 	int			src_cpu;
5647 
     	/* Cpu/runqueue tasks are moved to. */
5648 	int			dst_cpu;
5649 	struct rq		*dst_rq;
5650 
     	/*
     	 * can_migrate_task() searches dst_grpmask (restricted to 'cpus') for
     	 * an alternative cpu when a task is not allowed on dst_cpu, and
     	 * stores the result in new_dst_cpu.
     	 */
5651 	struct cpumask		*dst_grpmask;
5652 	int			new_dst_cpu;
     	/* Idle type; also the index into the per-idle-type schedstats. */
5653 	enum cpu_idle_type	idle;
     	/* Weighted load still to be moved; detach_tasks() decrements it. */
5654 	long			imbalance;
5655 	/* The set of CPUs under consideration for load-balancing */
5656 	struct cpumask		*cpus;
5657 
     	/* LBF_* bits. */
5658 	unsigned int		flags;
5659 
     	/*
     	 * detach_tasks() bookkeeping: 'loop' counts examined tasks, the scan
     	 * pauses once it exceeds 'loop_break' and ends once it exceeds
     	 * 'loop_max'.
     	 */
5660 	unsigned int		loop;
5661 	unsigned int		loop_break;
5662 	unsigned int		loop_max;
5663 
5664 	enum fbq_type		fbq_type;
     	/* Tasks detached by detach_tasks(), consumed by attach_tasks(). */
5665 	struct list_head	tasks;
5666 };
5667 
5668 /*
5669  * Is this task likely cache-hot:
5670  */
5671 static int task_hot(struct task_struct *p, struct lb_env *env)
5672 {
5673 	s64 delta;
5674 
5675 	lockdep_assert_held(&env->src_rq->lock);
5676 
5677 	if (p->sched_class != &fair_sched_class)
5678 		return 0;
5679 
5680 	if (unlikely(p->policy == SCHED_IDLE))
5681 		return 0;
5682 
5683 	/*
5684 	 * Buddy candidates are cache hot:
5685 	 */
5686 	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5687 			(&p->se == cfs_rq_of(&p->se)->next ||
5688 			 &p->se == cfs_rq_of(&p->se)->last))
5689 		return 1;
5690 
5691 	if (sysctl_sched_migration_cost == -1)
5692 		return 1;
5693 	if (sysctl_sched_migration_cost == 0)
5694 		return 0;
5695 
5696 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5697 
5698 	return delta < (s64)sysctl_sched_migration_cost;
5699 }
5700 
5701 #ifdef CONFIG_NUMA_BALANCING
5702 /*
5703  * Returns 1, if task migration degrades locality
5704  * Returns 0, if task migration improves locality i.e migration preferred.
5705  * Returns -1, if task migration is not affected by locality.
5706  */
5707 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5708 {
5709 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5710 	unsigned long src_faults, dst_faults;
5711 	int src_nid, dst_nid;
5712 
5713 	if (!static_branch_likely(&sched_numa_balancing))
5714 		return -1;
5715 
5716 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5717 		return -1;
5718 
5719 	src_nid = cpu_to_node(env->src_cpu);
5720 	dst_nid = cpu_to_node(env->dst_cpu);
5721 
5722 	if (src_nid == dst_nid)
5723 		return -1;
5724 
5725 	/* Migrating away from the preferred node is always bad. */
5726 	if (src_nid == p->numa_preferred_nid) {
5727 		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
5728 			return 1;
5729 		else
5730 			return -1;
5731 	}
5732 
5733 	/* Encourage migration to the preferred node. */
5734 	if (dst_nid == p->numa_preferred_nid)
5735 		return 0;
5736 
5737 	if (numa_group) {
5738 		src_faults = group_faults(p, src_nid);
5739 		dst_faults = group_faults(p, dst_nid);
5740 	} else {
5741 		src_faults = task_faults(p, src_nid);
5742 		dst_faults = task_faults(p, dst_nid);
5743 	}
5744 
5745 	return dst_faults < src_faults;
5746 }
5747 
5748 #else
     /*
      * Stub for kernels built without CONFIG_NUMA_BALANCING: always returns
      * -1, which can_migrate_task() treats as "fall back to task_hot()".
      */
5749 static inline int migrate_degrades_locality(struct task_struct *p,
5750 					     struct lb_env *env)
5751 {
5752 	return -1;
5753 }
5754 #endif
5755 
5756 /*
5757  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
      *
      * Here "rq" is env->src_rq and "this_cpu" is env->dst_cpu.
      *
      * Returns 1 if @p may be migrated, 0 otherwise. env->src_rq->lock must be
      * held. Side effects on @env:
      *  - LBF_SOME_PINNED is set when @p is not allowed on dst_cpu; in that
      *    case an allowed cpu from dst_grpmask may be recorded in new_dst_cpu
      *    together with LBF_DST_PINNED;
      *  - LBF_ALL_PINNED is cleared as soon as @p is allowed on dst_cpu.
5758  */
5759 static
5760 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5761 {
5762 	int tsk_cache_hot;
5763 
5764 	lockdep_assert_held(&env->src_rq->lock);
5765 
5766 	/*
5767 	 * We do not migrate tasks that are:
5768 	 * 1) throttled_lb_pair, or
5769 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
5770 	 * 3) running (obviously), or
5771 	 * 4) are cache-hot on their current CPU.
5772 	 */
5773 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5774 		return 0;
5775 
5776 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5777 		int cpu;
5778 
5779 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5780 
5781 		env->flags |= LBF_SOME_PINNED;
5782 
5783 		/*
5784 		 * Remember if this task can be migrated to any other cpu in
5785 		 * our sched_group. We may want to revisit it if we couldn't
5786 		 * meet load balance goals by pulling other tasks on src_cpu.
5787 		 *
5788 		 * Also avoid computing new_dst_cpu if we have already computed
5789 		 * one in current iteration.
5790 		 */
5791 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5792 			return 0;
5793 
5794 		/* Prevent to re-select dst_cpu via env's cpus */
5795 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5796 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5797 				env->flags |= LBF_DST_PINNED;
5798 				env->new_dst_cpu = cpu;
5799 				break;
5800 			}
5801 		}
5802 
5803 		return 0;
5804 	}
5805 
5806 	/* Record that we found at least one task that could run on dst_cpu */
5807 	env->flags &= ~LBF_ALL_PINNED;
5808 
5809 	if (task_running(env->src_rq, p)) {
5810 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5811 		return 0;
5812 	}
5813 
5814 	/*
5815 	 * Aggressive migration if:
5816 	 * 1) destination numa is preferred
5817 	 * 2) task is cache cold, or
5818 	 * 3) too many balance attempts have failed.
5819 	 */
     	/* The NUMA verdict wins; -1 there means "ask task_hot() instead". */
5820 	tsk_cache_hot = migrate_degrades_locality(p, env);
5821 	if (tsk_cache_hot == -1)
5822 		tsk_cache_hot = task_hot(p, env);
5823 
5824 	if (tsk_cache_hot <= 0 ||
5825 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
     		/* A hot task is only moved here because balancing kept failing. */
5826 		if (tsk_cache_hot == 1) {
5827 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5828 			schedstat_inc(p, se.statistics.nr_forced_migrations);
5829 		}
5830 		return 1;
5831 	}
5832 
5833 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5834 	return 0;
5835 }
5836 
5837 /*
5838  * detach_task() -- detach the task for the migration specified in env
5839  */
5840 static void detach_task(struct task_struct *p, struct lb_env *env)
5841 {
5842 	lockdep_assert_held(&env->src_rq->lock);
5843 
5844 	p->on_rq = TASK_ON_RQ_MIGRATING;
5845 	deactivate_task(env->src_rq, p, 0);
5846 	set_task_cpu(p, env->dst_cpu);
5847 }
5848 
5849 /*
5850  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5851  * part of active balancing operations within "domain".
5852  *
5853  * Returns a task if successful and NULL otherwise.
5854  */
5855 static struct task_struct *detach_one_task(struct lb_env *env)
5856 {
5857 	struct task_struct *p, *n;
5858 
5859 	lockdep_assert_held(&env->src_rq->lock);
5860 
5861 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5862 		if (!can_migrate_task(p, env))
5863 			continue;
5864 
5865 		detach_task(p, env);
5866 
5867 		/*
5868 		 * Right now, this is only the second place where
5869 		 * lb_gained[env->idle] is updated (other is detach_tasks)
5870 		 * so we can safely collect stats here rather than
5871 		 * inside detach_tasks().
5872 		 */
5873 		schedstat_inc(env->sd, lb_gained[env->idle]);
5874 		return p;
5875 	}
5876 	return NULL;
5877 }
5878 
     /* Step by which detach_tasks() advances env->loop_break after a pause. */
5879 static const unsigned int sched_nr_migrate_break = 32;
5880 
5881 /*
5882  * detach_tasks() -- tries to detach up to imbalance weighted load from
5883  * busiest_rq, as part of a balancing operation within domain "sd".
5884  *
      * The source runqueue is env->src_rq, whose lock must be held. Detached
      * tasks are put on env->tasks and their load is subtracted from
      * env->imbalance. Tasks that are examined but not taken are rotated to
      * the tail of src_rq->cfs_tasks.
      *
5885  * Returns number of detached tasks if successful and 0 otherwise.
5886  */
5887 static int detach_tasks(struct lb_env *env)
5888 {
5889 	struct list_head *tasks = &env->src_rq->cfs_tasks;
5890 	struct task_struct *p;
5891 	unsigned long load;
5892 	int detached = 0;
5893 
5894 	lockdep_assert_held(&env->src_rq->lock);
5895 
5896 	if (env->imbalance <= 0)
5897 		return 0;
5898 
5899 	while (!list_empty(tasks)) {
5900 		/*
5901 		 * We don't want to steal all, otherwise we may be treated likewise,
5902 		 * which could at worst lead to a livelock crash.
5903 		 */
5904 		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
5905 			break;
5906 
5907 		p = list_first_entry(tasks, struct task_struct, se.group_node);
5908 
5909 		env->loop++;
5910 		/* We've more or less seen every task there is, call it quits */
5911 		if (env->loop > env->loop_max)
5912 			break;
5913 
5914 		/* take a breather every nr_migrate tasks */
5915 		if (env->loop > env->loop_break) {
5916 			env->loop_break += sched_nr_migrate_break;
5917 			env->flags |= LBF_NEED_BREAK;
5918 			break;
5919 		}
5920 
5921 		if (!can_migrate_task(p, env))
5922 			goto next;
5923 
5924 		load = task_h_load(p);
5925 
     		/* LB_MIN: leave tiny tasks alone unless balancing already failed. */
5926 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5927 			goto next;
5928 
     		/* Skip tasks whose half load already exceeds the imbalance. */
5929 		if ((load / 2) > env->imbalance)
5930 			goto next;
5931 
5932 		detach_task(p, env);
5933 		list_add(&p->se.group_node, &env->tasks);
5934 
5935 		detached++;
5936 		env->imbalance -= load;
5937 
5938 #ifdef CONFIG_PREEMPT
5939 		/*
5940 		 * NEWIDLE balancing is a source of latency, so preemptible
5941 		 * kernels will stop after the first task is detached to minimize
5942 		 * the critical section.
5943 		 */
5944 		if (env->idle == CPU_NEWLY_IDLE)
5945 			break;
5946 #endif
5947 
5948 		/*
5949 		 * We only want to steal up to the prescribed amount of
5950 		 * weighted load.
5951 		 */
5952 		if (env->imbalance <= 0)
5953 			break;
5954 
5955 		continue;
5956 next:
     		/* Rotate the rejected task to the tail so the scan moves on. */
5957 		list_move_tail(&p->se.group_node, tasks);
5958 	}
5959 
5960 	/*
5961 	 * Right now, this is one of only two places we collect this stat
5962 	 * so we can safely collect detach_one_task() stats here rather
5963 	 * than inside detach_one_task().
5964 	 */
5965 	schedstat_add(env->sd, lb_gained[env->idle], detached);
5966 
5967 	return detached;
5968 }
5969 
5970 /*
5971  * attach_task() -- attach the task detached by detach_task() to its new rq.
      *
      * @rq->lock must be held and @p's runqueue must already be @rq (enforced
      * by the BUG_ON below). The task is activated, marked
      * TASK_ON_RQ_QUEUED, and then checked against the task currently running
      * on @rq for preemption.
5972  */
5973 static void attach_task(struct rq *rq, struct task_struct *p)
5974 {
5975 	lockdep_assert_held(&rq->lock);
5976 
5977 	BUG_ON(task_rq(p) != rq);
5978 	activate_task(rq, p, 0);
5979 	p->on_rq = TASK_ON_RQ_QUEUED;
5980 	check_preempt_curr(rq, p, 0);
5981 }
5982 
5983 /*
5984  * attach_one_task() -- attaches the task returned from detach_one_task() to
5985  * its new rq.
      *
      * Takes and releases @rq->lock around attach_task(), so the caller must
      * not already hold it.
5986  */
5987 static void attach_one_task(struct rq *rq, struct task_struct *p)
5988 {
5989 	raw_spin_lock(&rq->lock);
5990 	attach_task(rq, p);
5991 	raw_spin_unlock(&rq->lock);
5992 }
5993 
5994 /*
5995  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5996  * new rq.
5997  */
5998 static void attach_tasks(struct lb_env *env)
5999 {
6000 	struct list_head *tasks = &env->tasks;
6001 	struct task_struct *p;
6002 
6003 	raw_spin_lock(&env->dst_rq->lock);
6004 
6005 	while (!list_empty(tasks)) {
6006 		p = list_first_entry(tasks, struct task_struct, se.group_node);
6007 		list_del_init(&p->se.group_node);
6008 
6009 		attach_task(env->dst_rq, p);
6010 	}
6011 
6012 	raw_spin_unlock(&env->dst_rq->lock);
6013 }
6014 
6015 #ifdef CONFIG_FAIR_GROUP_SCHED
6016 static void update_blocked_averages(int cpu)
6017 {
6018 	struct rq *rq = cpu_rq(cpu);
6019 	struct cfs_rq *cfs_rq;
6020 	unsigned long flags;
6021 
6022 	raw_spin_lock_irqsave(&rq->lock, flags);
6023 	update_rq_clock(rq);
6024 
6025 	/*
6026 	 * Iterates the task_group tree in a bottom up fashion, see
6027 	 * list_add_leaf_cfs_rq() for details.
6028 	 */
6029 	for_each_leaf_cfs_rq(rq, cfs_rq) {
6030 		/* throttled entities do not contribute to load */
6031 		if (throttled_hierarchy(cfs_rq))
6032 			continue;
6033 
6034 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
6035 			update_tg_load_avg(cfs_rq, 0);
6036 	}
6037 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6038 }
6039 
6040 /*
6041  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
6042  * This needs to be done in a top-down fashion because the load of a child
6043  * group is a fraction of its parents load.
      *
      * The result is cached per cfs_rq and stamped with jiffies in
      * last_h_load_update, so at most one recomputation happens per jiffy.
      *
      * Two passes are used: the first walks upwards and records in
      * h_load_next, on each ancestor cfs_rq, the entity leading back down; the
      * second follows those links downwards and scales h_load at each level.
6044  */
6045 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6046 {
6047 	struct rq *rq = rq_of(cfs_rq);
6048 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6049 	unsigned long now = jiffies;
6050 	unsigned long load;
6051 
     	/* Already up to date for this jiffy. */
6052 	if (cfs_rq->last_h_load_update == now)
6053 		return;
6054 
     	/* Terminates the downward walk at the cfs_rq we were called for. */
6055 	cfs_rq->h_load_next = NULL;
     	/* Pass 1: climb until an up-to-date ancestor or the top is reached. */
6056 	for_each_sched_entity(se) {
6057 		cfs_rq = cfs_rq_of(se);
6058 		cfs_rq->h_load_next = se;
6059 		if (cfs_rq->last_h_load_update == now)
6060 			break;
6061 	}
6062 
     	/*
     	 * The loop ran off the top: seed the topmost cfs_rq with its own
     	 * load average.
     	 */
6063 	if (!se) {
6064 		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6065 		cfs_rq->last_h_load_update = now;
6066 	}
6067 
     	/*
     	 * Pass 2: walk back down. Each child gets the parent's h_load scaled
     	 * by the entity's share of the parent's load average; the "+ 1"
     	 * keeps the divisor non-zero.
     	 */
6068 	while ((se = cfs_rq->h_load_next) != NULL) {
6069 		load = cfs_rq->h_load;
6070 		load = div64_ul(load * se->avg.load_avg,
6071 			cfs_rq_load_avg(cfs_rq) + 1);
6072 		cfs_rq = group_cfs_rq(se);
6073 		cfs_rq->h_load = load;
6074 		cfs_rq->last_h_load_update = now;
6075 	}
6076 }
6077 
6078 static unsigned long task_h_load(struct task_struct *p)
6079 {
6080 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
6081 
6082 	update_cfs_rq_h_load(cfs_rq);
6083 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6084 			cfs_rq_load_avg(cfs_rq) + 1);
6085 }
6086 #else
6087 static inline void update_blocked_averages(int cpu)
6088 {
6089 	struct rq *rq = cpu_rq(cpu);
6090 	struct cfs_rq *cfs_rq = &rq->cfs;
6091 	unsigned long flags;
6092 
6093 	raw_spin_lock_irqsave(&rq->lock, flags);
6094 	update_rq_clock(rq);
6095 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
6096 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6097 }
6098 
6099 static unsigned long task_h_load(struct task_struct *p)
6100 {
6101 	return p->se.avg.load_avg;
6102 }
6103 #endif
6104 
6105 /********** Helpers for find_busiest_group ************************/
6106 
/*
 * Classification of a sched_group, as computed by group_classify().
 *
 * The numeric order is significant: update_sd_pick_busiest() compares
 * these values and prefers the group with the larger one.
 */
enum group_type {
	group_other		= 0,	/* neither of the cases below */
	group_imbalanced	= 1,	/* sg_imbalanced() reports an imbalance */
	group_overloaded	= 2,	/* sg_lb_stats::group_no_capacity is set */
};
6112 
6113 /*
6114  * sg_lb_stats - stats of a sched_group required for load_balancing
6115  */
6116 struct sg_lb_stats {
6117 	unsigned long avg_load; /*Avg load across the CPUs of the group */
6118 	unsigned long group_load; /* Total load over the CPUs of the group */
6119 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6120 	unsigned long load_per_task;
6121 	unsigned long group_capacity;
6122 	unsigned long group_util; /* Total utilization of the group */
6123 	unsigned int sum_nr_running; /* Nr tasks running in the group */
6124 	unsigned int idle_cpus;
6125 	unsigned int group_weight;
6126 	enum group_type group_type;
6127 	int group_no_capacity;
6128 #ifdef CONFIG_NUMA_BALANCING
6129 	unsigned int nr_numa_running;
6130 	unsigned int nr_preferred_running;
6131 #endif
6132 };
6133 
6134 /*
6135  * sd_lb_stats - Structure to store the statistics of a sched_domain
6136  *		 during load balancing.
6137  */
6138 struct sd_lb_stats {
6139 	struct sched_group *busiest;	/* Busiest group in this sd */
6140 	struct sched_group *local;	/* Local group in this sd */
6141 	unsigned long total_load;	/* Total load of all groups in sd */
6142 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
6143 	unsigned long avg_load;	/* Average load across all groups in sd */
6144 
6145 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6146 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
6147 };
6148 
6149 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6150 {
6151 	/*
6152 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6153 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
6154 	 * We must however clear busiest_stat::avg_load because
6155 	 * update_sd_pick_busiest() reads this before assignment.
6156 	 */
6157 	*sds = (struct sd_lb_stats){
6158 		.busiest = NULL,
6159 		.local = NULL,
6160 		.total_load = 0UL,
6161 		.total_capacity = 0UL,
6162 		.busiest_stat = {
6163 			.avg_load = 0UL,
6164 			.sum_nr_running = 0,
6165 			.group_type = group_other,
6166 		},
6167 	};
6168 }
6169 
6170 /**
6171  * get_sd_load_idx - Obtain the load index for a given sched domain.
6172  * @sd: The sched_domain whose load_idx is to be obtained.
6173  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6174  *
6175  * Return: The load index.
6176  */
6177 static inline int get_sd_load_idx(struct sched_domain *sd,
6178 					enum cpu_idle_type idle)
6179 {
6180 	int load_idx;
6181 
6182 	switch (idle) {
6183 	case CPU_NOT_IDLE:
6184 		load_idx = sd->busy_idx;
6185 		break;
6186 
6187 	case CPU_NEWLY_IDLE:
6188 		load_idx = sd->newidle_idx;
6189 		break;
6190 	default:
6191 		load_idx = sd->idle_idx;
6192 		break;
6193 	}
6194 
6195 	return load_idx;
6196 }
6197 
6198 static unsigned long scale_rt_capacity(int cpu)
6199 {
6200 	struct rq *rq = cpu_rq(cpu);
6201 	u64 total, used, age_stamp, avg;
6202 	s64 delta;
6203 
6204 	/*
6205 	 * Since we're reading these variables without serialization make sure
6206 	 * we read them once before doing sanity checks on them.
6207 	 */
6208 	age_stamp = READ_ONCE(rq->age_stamp);
6209 	avg = READ_ONCE(rq->rt_avg);
6210 	delta = __rq_clock_broken(rq) - age_stamp;
6211 
6212 	if (unlikely(delta < 0))
6213 		delta = 0;
6214 
6215 	total = sched_avg_period() + delta;
6216 
6217 	used = div_u64(avg, total);
6218 
6219 	if (likely(used < SCHED_CAPACITY_SCALE))
6220 		return SCHED_CAPACITY_SCALE - used;
6221 
6222 	return 1;
6223 }
6224 
6225 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6226 {
6227 	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6228 	struct sched_group *sdg = sd->groups;
6229 
6230 	cpu_rq(cpu)->cpu_capacity_orig = capacity;
6231 
6232 	capacity *= scale_rt_capacity(cpu);
6233 	capacity >>= SCHED_CAPACITY_SHIFT;
6234 
6235 	if (!capacity)
6236 		capacity = 1;
6237 
6238 	cpu_rq(cpu)->cpu_capacity = capacity;
6239 	sdg->sgc->capacity = capacity;
6240 }
6241 
6242 void update_group_capacity(struct sched_domain *sd, int cpu)
6243 {
6244 	struct sched_domain *child = sd->child;
6245 	struct sched_group *group, *sdg = sd->groups;
6246 	unsigned long capacity;
6247 	unsigned long interval;
6248 
6249 	interval = msecs_to_jiffies(sd->balance_interval);
6250 	interval = clamp(interval, 1UL, max_load_balance_interval);
6251 	sdg->sgc->next_update = jiffies + interval;
6252 
6253 	if (!child) {
6254 		update_cpu_capacity(sd, cpu);
6255 		return;
6256 	}
6257 
6258 	capacity = 0;
6259 
6260 	if (child->flags & SD_OVERLAP) {
6261 		/*
6262 		 * SD_OVERLAP domains cannot assume that child groups
6263 		 * span the current group.
6264 		 */
6265 
6266 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
6267 			struct sched_group_capacity *sgc;
6268 			struct rq *rq = cpu_rq(cpu);
6269 
6270 			/*
6271 			 * build_sched_domains() -> init_sched_groups_capacity()
6272 			 * gets here before we've attached the domains to the
6273 			 * runqueues.
6274 			 *
6275 			 * Use capacity_of(), which is set irrespective of domains
6276 			 * in update_cpu_capacity().
6277 			 *
6278 			 * This avoids capacity from being 0 and
6279 			 * causing divide-by-zero issues on boot.
6280 			 */
6281 			if (unlikely(!rq->sd)) {
6282 				capacity += capacity_of(cpu);
6283 				continue;
6284 			}
6285 
6286 			sgc = rq->sd->groups->sgc;
6287 			capacity += sgc->capacity;
6288 		}
6289 	} else  {
6290 		/*
6291 		 * !SD_OVERLAP domains can assume that child groups
6292 		 * span the current group.
6293 		 */
6294 
6295 		group = child->groups;
6296 		do {
6297 			capacity += group->sgc->capacity;
6298 			group = group->next;
6299 		} while (group != child->groups);
6300 	}
6301 
6302 	sdg->sgc->capacity = capacity;
6303 }
6304 
6305 /*
6306  * Check whether the capacity of the rq has been noticeably reduced by side
6307  * activity. The imbalance_pct is used for the threshold.
6308  * Return true is the capacity is reduced
6309  */
6310 static inline int
6311 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6312 {
6313 	return ((rq->cpu_capacity * sd->imbalance_pct) <
6314 				(rq->cpu_capacity_orig * 100));
6315 }
6316 
6317 /*
6318  * Group imbalance indicates (and tries to solve) the problem where balancing
6319  * groups is inadequate due to tsk_cpus_allowed() constraints.
6320  *
6321  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6322  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6323  * Something like:
6324  *
6325  * 	{ 0 1 2 3 } { 4 5 6 7 }
6326  * 	        *     * * *
6327  *
6328  * If we were to balance group-wise we'd place two tasks in the first group and
6329  * two tasks in the second group. Clearly this is undesired as it will overload
6330  * cpu 3 and leave one of the cpus in the second group unused.
6331  *
6332  * The current solution to this issue is detecting the skew in the first group
6333  * by noticing the lower domain failed to reach balance and had difficulty
6334  * moving tasks due to affinity constraints.
6335  *
6336  * When this is so detected; this group becomes a candidate for busiest; see
6337  * update_sd_pick_busiest(). And calculate_imbalance() and
6338  * find_busiest_group() avoid some of the usual balance conditions to allow it
6339  * to create an effective group imbalance.
6340  *
6341  * This is a somewhat tricky proposition since the next run might not find the
6342  * group imbalance and decide the groups need to be balanced again. A most
6343  * subtle and fragile situation.
6344  */
6345 
6346 static inline int sg_imbalanced(struct sched_group *group)
6347 {
6348 	return group->sgc->imbalance;
6349 }
6350 
6351 /*
6352  * group_has_capacity returns true if the group has spare capacity that could
6353  * be used by some tasks.
6354  * We consider that a group has spare capacity if the  * number of task is
6355  * smaller than the number of CPUs or if the utilization is lower than the
6356  * available capacity for CFS tasks.
6357  * For the latter, we use a threshold to stabilize the state, to take into
6358  * account the variance of the tasks' load and to return true if the available
6359  * capacity in meaningful for the load balancer.
6360  * As an example, an available capacity of 1% can appear but it doesn't make
6361  * any benefit for the load balance.
6362  */
6363 static inline bool
6364 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6365 {
6366 	if (sgs->sum_nr_running < sgs->group_weight)
6367 		return true;
6368 
6369 	if ((sgs->group_capacity * 100) >
6370 			(sgs->group_util * env->sd->imbalance_pct))
6371 		return true;
6372 
6373 	return false;
6374 }
6375 
6376 /*
6377  *  group_is_overloaded returns true if the group has more tasks than it can
6378  *  handle.
6379  *  group_is_overloaded is not equals to !group_has_capacity because a group
6380  *  with the exact right number of tasks, has no more spare capacity but is not
6381  *  overloaded so both group_has_capacity and group_is_overloaded return
6382  *  false.
6383  */
6384 static inline bool
6385 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6386 {
6387 	if (sgs->sum_nr_running <= sgs->group_weight)
6388 		return false;
6389 
6390 	if ((sgs->group_capacity * 100) <
6391 			(sgs->group_util * env->sd->imbalance_pct))
6392 		return true;
6393 
6394 	return false;
6395 }
6396 
6397 static inline enum
6398 group_type group_classify(struct sched_group *group,
6399 			  struct sg_lb_stats *sgs)
6400 {
6401 	if (sgs->group_no_capacity)
6402 		return group_overloaded;
6403 
6404 	if (sg_imbalanced(group))
6405 		return group_imbalanced;
6406 
6407 	return group_other;
6408 }
6409 
6410 /**
6411  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6412  * @env: The load balancing environment.
6413  * @group: sched_group whose statistics are to be updated.
6414  * @load_idx: Load index of sched_domain of this_cpu for load calc.
6415  * @local_group: Does group contain this_cpu.
6416  * @sgs: variable to hold the statistics for this group.
6417  * @overload: Indicate more than one runnable task for any CPU.
6418  */
6419 static inline void update_sg_lb_stats(struct lb_env *env,
6420 			struct sched_group *group, int load_idx,
6421 			int local_group, struct sg_lb_stats *sgs,
6422 			bool *overload)
6423 {
6424 	unsigned long load;
6425 	int i, nr_running;
6426 
6427 	memset(sgs, 0, sizeof(*sgs));
6428 
6429 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6430 		struct rq *rq = cpu_rq(i);
6431 
6432 		/* Bias balancing toward cpus of our domain */
6433 		if (local_group)
6434 			load = target_load(i, load_idx);
6435 		else
6436 			load = source_load(i, load_idx);
6437 
6438 		sgs->group_load += load;
6439 		sgs->group_util += cpu_util(i);
6440 		sgs->sum_nr_running += rq->cfs.h_nr_running;
6441 
6442 		nr_running = rq->nr_running;
6443 		if (nr_running > 1)
6444 			*overload = true;
6445 
6446 #ifdef CONFIG_NUMA_BALANCING
6447 		sgs->nr_numa_running += rq->nr_numa_running;
6448 		sgs->nr_preferred_running += rq->nr_preferred_running;
6449 #endif
6450 		sgs->sum_weighted_load += weighted_cpuload(i);
6451 		/*
6452 		 * No need to call idle_cpu() if nr_running is not 0
6453 		 */
6454 		if (!nr_running && idle_cpu(i))
6455 			sgs->idle_cpus++;
6456 	}
6457 
6458 	/* Adjust by relative CPU capacity of the group */
6459 	sgs->group_capacity = group->sgc->capacity;
6460 	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6461 
6462 	if (sgs->sum_nr_running)
6463 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6464 
6465 	sgs->group_weight = group->group_weight;
6466 
6467 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
6468 	sgs->group_type = group_classify(group, sgs);
6469 }
6470 
6471 /**
6472  * update_sd_pick_busiest - return 1 on busiest group
6473  * @env: The load balancing environment.
6474  * @sds: sched_domain statistics
6475  * @sg: sched_group candidate to be checked for being the busiest
6476  * @sgs: sched_group statistics
6477  *
6478  * Determine if @sg is a busier group than the previously selected
6479  * busiest group.
6480  *
6481  * Return: %true if @sg is a busier group than the previously selected
6482  * busiest group. %false otherwise.
6483  */
6484 static bool update_sd_pick_busiest(struct lb_env *env,
6485 				   struct sd_lb_stats *sds,
6486 				   struct sched_group *sg,
6487 				   struct sg_lb_stats *sgs)
6488 {
6489 	struct sg_lb_stats *busiest = &sds->busiest_stat;
6490 
6491 	if (sgs->group_type > busiest->group_type)
6492 		return true;
6493 
6494 	if (sgs->group_type < busiest->group_type)
6495 		return false;
6496 
6497 	if (sgs->avg_load <= busiest->avg_load)
6498 		return false;
6499 
6500 	/* This is the busiest node in its class. */
6501 	if (!(env->sd->flags & SD_ASYM_PACKING))
6502 		return true;
6503 
6504 	/*
6505 	 * ASYM_PACKING needs to move all the work to the lowest
6506 	 * numbered CPUs in the group, therefore mark all groups
6507 	 * higher than ourself as busy.
6508 	 */
6509 	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6510 		if (!sds->busiest)
6511 			return true;
6512 
6513 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
6514 			return true;
6515 	}
6516 
6517 	return false;
6518 }
6519 
6520 #ifdef CONFIG_NUMA_BALANCING
6521 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6522 {
6523 	if (sgs->sum_nr_running > sgs->nr_numa_running)
6524 		return regular;
6525 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
6526 		return remote;
6527 	return all;
6528 }
6529 
6530 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6531 {
6532 	if (rq->nr_running > rq->nr_numa_running)
6533 		return regular;
6534 	if (rq->nr_running > rq->nr_preferred_running)
6535 		return remote;
6536 	return all;
6537 }
6538 #else
6539 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6540 {
6541 	return all;
6542 }
6543 
6544 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6545 {
6546 	return regular;
6547 }
6548 #endif /* CONFIG_NUMA_BALANCING */
6549 
6550 /**
6551  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
6552  * @env: The load balancing environment.
6553  * @sds: variable to hold the statistics for this sched_domain.
6554  */
6555 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6556 {
6557 	struct sched_domain *child = env->sd->child;
6558 	struct sched_group *sg = env->sd->groups;
6559 	struct sg_lb_stats tmp_sgs;
6560 	int load_idx, prefer_sibling = 0;
6561 	bool overload = false;
6562 
6563 	if (child && child->flags & SD_PREFER_SIBLING)
6564 		prefer_sibling = 1;
6565 
6566 	load_idx = get_sd_load_idx(env->sd, env->idle);
6567 
6568 	do {
6569 		struct sg_lb_stats *sgs = &tmp_sgs;
6570 		int local_group;
6571 
6572 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6573 		if (local_group) {
6574 			sds->local = sg;
6575 			sgs = &sds->local_stat;
6576 
6577 			if (env->idle != CPU_NEWLY_IDLE ||
6578 			    time_after_eq(jiffies, sg->sgc->next_update))
6579 				update_group_capacity(env->sd, env->dst_cpu);
6580 		}
6581 
6582 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6583 						&overload);
6584 
6585 		if (local_group)
6586 			goto next_group;
6587 
6588 		/*
6589 		 * In case the child domain prefers tasks go to siblings
6590 		 * first, lower the sg capacity so that we'll try
6591 		 * and move all the excess tasks away. We lower the capacity
6592 		 * of a group only if the local group has the capacity to fit
6593 		 * these excess tasks. The extra check prevents the case where
6594 		 * you always pull from the heaviest group when it is already
6595 		 * under-utilized (possible with a large weight task outweighs
6596 		 * the tasks on the system).
6597 		 */
6598 		if (prefer_sibling && sds->local &&
6599 		    group_has_capacity(env, &sds->local_stat) &&
6600 		    (sgs->sum_nr_running > 1)) {
6601 			sgs->group_no_capacity = 1;
6602 			sgs->group_type = group_classify(sg, sgs);
6603 		}
6604 
6605 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6606 			sds->busiest = sg;
6607 			sds->busiest_stat = *sgs;
6608 		}
6609 
6610 next_group:
6611 		/* Now, start updating sd_lb_stats */
6612 		sds->total_load += sgs->group_load;
6613 		sds->total_capacity += sgs->group_capacity;
6614 
6615 		sg = sg->next;
6616 	} while (sg != env->sd->groups);
6617 
6618 	if (env->sd->flags & SD_NUMA)
6619 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6620 
6621 	if (!env->sd->parent) {
6622 		/* update overload indicator if we are at root domain */
6623 		if (env->dst_rq->rd->overload != overload)
6624 			env->dst_rq->rd->overload = overload;
6625 	}
6626 
6627 }
6628 
6629 /**
6630  * check_asym_packing - Check to see if the group is packed into the
6631  *			sched doman.
6632  *
6633  * This is primarily intended to used at the sibling level.  Some
6634  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6635  * case of POWER7, it can move to lower SMT modes only when higher
6636  * threads are idle.  When in lower SMT modes, the threads will
6637  * perform better since they share less core resources.  Hence when we
6638  * have idle threads, we want them to be the higher ones.
6639  *
6640  * This packing function is run on idle threads.  It checks to see if
6641  * the busiest CPU in this domain (core in the P7 case) has a higher
6642  * CPU number than the packing function is being run on.  Here we are
6643  * assuming lower CPU number will be equivalent to lower a SMT thread
6644  * number.
6645  *
6646  * Return: 1 when packing is required and a task should be moved to
6647  * this CPU.  The amount of the imbalance is returned in *imbalance.
6648  *
6649  * @env: The load balancing environment.
6650  * @sds: Statistics of the sched_domain which is to be packed
6651  */
6652 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6653 {
6654 	int busiest_cpu;
6655 
6656 	if (!(env->sd->flags & SD_ASYM_PACKING))
6657 		return 0;
6658 
6659 	if (!sds->busiest)
6660 		return 0;
6661 
6662 	busiest_cpu = group_first_cpu(sds->busiest);
6663 	if (env->dst_cpu > busiest_cpu)
6664 		return 0;
6665 
6666 	env->imbalance = DIV_ROUND_CLOSEST(
6667 		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6668 		SCHED_CAPACITY_SCALE);
6669 
6670 	return 1;
6671 }
6672 
6673 /**
6674  * fix_small_imbalance - Calculate the minor imbalance that exists
6675  *			amongst the groups of a sched_domain, during
6676  *			load balancing.
6677  * @env: The load balancing environment.
6678  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6679  */
6680 static inline
6681 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6682 {
6683 	unsigned long tmp, capa_now = 0, capa_move = 0;
6684 	unsigned int imbn = 2;
6685 	unsigned long scaled_busy_load_per_task;
6686 	struct sg_lb_stats *local, *busiest;
6687 
6688 	local = &sds->local_stat;
6689 	busiest = &sds->busiest_stat;
6690 
6691 	if (!local->sum_nr_running)
6692 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6693 	else if (busiest->load_per_task > local->load_per_task)
6694 		imbn = 1;
6695 
6696 	scaled_busy_load_per_task =
6697 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6698 		busiest->group_capacity;
6699 
6700 	if (busiest->avg_load + scaled_busy_load_per_task >=
6701 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
6702 		env->imbalance = busiest->load_per_task;
6703 		return;
6704 	}
6705 
6706 	/*
6707 	 * OK, we don't have enough imbalance to justify moving tasks,
6708 	 * however we may be able to increase total CPU capacity used by
6709 	 * moving them.
6710 	 */
6711 
6712 	capa_now += busiest->group_capacity *
6713 			min(busiest->load_per_task, busiest->avg_load);
6714 	capa_now += local->group_capacity *
6715 			min(local->load_per_task, local->avg_load);
6716 	capa_now /= SCHED_CAPACITY_SCALE;
6717 
6718 	/* Amount of load we'd subtract */
6719 	if (busiest->avg_load > scaled_busy_load_per_task) {
6720 		capa_move += busiest->group_capacity *
6721 			    min(busiest->load_per_task,
6722 				busiest->avg_load - scaled_busy_load_per_task);
6723 	}
6724 
6725 	/* Amount of load we'd add */
6726 	if (busiest->avg_load * busiest->group_capacity <
6727 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6728 		tmp = (busiest->avg_load * busiest->group_capacity) /
6729 		      local->group_capacity;
6730 	} else {
6731 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6732 		      local->group_capacity;
6733 	}
6734 	capa_move += local->group_capacity *
6735 		    min(local->load_per_task, local->avg_load + tmp);
6736 	capa_move /= SCHED_CAPACITY_SCALE;
6737 
6738 	/* Move if we gain throughput */
6739 	if (capa_move > capa_now)
6740 		env->imbalance = busiest->load_per_task;
6741 }
6742 
6743 /**
6744  * calculate_imbalance - Calculate the amount of imbalance present within the
6745  *			 groups of a given sched_domain during load balance.
6746  * @env: load balance environment
6747  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
6748  */
6749 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6750 {
6751 	unsigned long max_pull, load_above_capacity = ~0UL;
6752 	struct sg_lb_stats *local, *busiest;
6753 
6754 	local = &sds->local_stat;
6755 	busiest = &sds->busiest_stat;
6756 
6757 	if (busiest->group_type == group_imbalanced) {
6758 		/*
6759 		 * In the group_imb case we cannot rely on group-wide averages
6760 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
6761 		 */
6762 		busiest->load_per_task =
6763 			min(busiest->load_per_task, sds->avg_load);
6764 	}
6765 
6766 	/*
6767 	 * In the presence of smp nice balancing, certain scenarios can have
6768 	 * max load less than avg load(as we skip the groups at or below
6769 	 * its cpu_capacity, while calculating max_load..)
6770 	 */
6771 	if (busiest->avg_load <= sds->avg_load ||
6772 	    local->avg_load >= sds->avg_load) {
6773 		env->imbalance = 0;
6774 		return fix_small_imbalance(env, sds);
6775 	}
6776 
6777 	/*
6778 	 * If there aren't any idle cpus, avoid creating some.
6779 	 */
6780 	if (busiest->group_type == group_overloaded &&
6781 	    local->group_type   == group_overloaded) {
6782 		load_above_capacity = busiest->sum_nr_running *
6783 					SCHED_LOAD_SCALE;
6784 		if (load_above_capacity > busiest->group_capacity)
6785 			load_above_capacity -= busiest->group_capacity;
6786 		else
6787 			load_above_capacity = ~0UL;
6788 	}
6789 
6790 	/*
6791 	 * We're trying to get all the cpus to the average_load, so we don't
6792 	 * want to push ourselves above the average load, nor do we wish to
6793 	 * reduce the max loaded cpu below the average load. At the same time,
6794 	 * we also don't want to reduce the group load below the group capacity
6795 	 * (so that we can implement power-savings policies etc). Thus we look
6796 	 * for the minimum possible imbalance.
6797 	 */
6798 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6799 
6800 	/* How much load to actually move to equalise the imbalance */
6801 	env->imbalance = min(
6802 		max_pull * busiest->group_capacity,
6803 		(sds->avg_load - local->avg_load) * local->group_capacity
6804 	) / SCHED_CAPACITY_SCALE;
6805 
6806 	/*
6807 	 * if *imbalance is less than the average load per runnable task
6808 	 * there is no guarantee that any tasks will be moved so we'll have
6809 	 * a think about bumping its value to force at least one task to be
6810 	 * moved
6811 	 */
6812 	if (env->imbalance < busiest->load_per_task)
6813 		return fix_small_imbalance(env, sds);
6814 }
6815 
6816 /******* find_busiest_group() helpers end here *********************/
6817 
6818 /**
6819  * find_busiest_group - Returns the busiest group within the sched_domain
6820  * if there is an imbalance. If there isn't an imbalance, and
6821  * the user has opted for power-savings, it returns a group whose
6822  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
6823  * such a group exists.
6824  *
6825  * Also calculates the amount of weighted load which should be moved
6826  * to restore balance.
6827  *
6828  * @env: The load balancing environment.
6829  *
6830  * Return:	- The busiest group if imbalance exists.
6831  *		- If no imbalance and user has opted for power-savings balance,
6832  *		   return the least loaded group whose CPUs can be
6833  *		   put to idle by rebalancing its tasks onto our group.
6834  */
6835 static struct sched_group *find_busiest_group(struct lb_env *env)
6836 {
6837 	struct sg_lb_stats *local, *busiest;
6838 	struct sd_lb_stats sds;
6839 
6840 	init_sd_lb_stats(&sds);
6841 
6842 	/*
6843 	 * Compute the various statistics relavent for load balancing at
6844 	 * this level.
6845 	 */
6846 	update_sd_lb_stats(env, &sds);
6847 	local = &sds.local_stat;
6848 	busiest = &sds.busiest_stat;
6849 
6850 	/* ASYM feature bypasses nice load balance check */
6851 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6852 	    check_asym_packing(env, &sds))
6853 		return sds.busiest;
6854 
6855 	/* There is no busy sibling group to pull tasks from */
6856 	if (!sds.busiest || busiest->sum_nr_running == 0)
6857 		goto out_balanced;
6858 
6859 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6860 						/ sds.total_capacity;
6861 
6862 	/*
6863 	 * If the busiest group is imbalanced the below checks don't
6864 	 * work because they assume all things are equal, which typically
6865 	 * isn't true due to cpus_allowed constraints and the like.
6866 	 */
6867 	if (busiest->group_type == group_imbalanced)
6868 		goto force_balance;
6869 
6870 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6871 	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
6872 	    busiest->group_no_capacity)
6873 		goto force_balance;
6874 
6875 	/*
6876 	 * If the local group is busier than the selected busiest group
6877 	 * don't try and pull any tasks.
6878 	 */
6879 	if (local->avg_load >= busiest->avg_load)
6880 		goto out_balanced;
6881 
6882 	/*
6883 	 * Don't pull any tasks if this group is already above the domain
6884 	 * average load.
6885 	 */
6886 	if (local->avg_load >= sds.avg_load)
6887 		goto out_balanced;
6888 
6889 	if (env->idle == CPU_IDLE) {
6890 		/*
6891 		 * This cpu is idle. If the busiest group is not overloaded
6892 		 * and there is no imbalance between this and busiest group
6893 		 * wrt idle cpus, it is balanced. The imbalance becomes
6894 		 * significant if the diff is greater than 1 otherwise we
6895 		 * might end up to just move the imbalance on another group
6896 		 */
6897 		if ((busiest->group_type != group_overloaded) &&
6898 				(local->idle_cpus <= (busiest->idle_cpus + 1)))
6899 			goto out_balanced;
6900 	} else {
6901 		/*
6902 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
6903 		 * imbalance_pct to be conservative.
6904 		 */
6905 		if (100 * busiest->avg_load <=
6906 				env->sd->imbalance_pct * local->avg_load)
6907 			goto out_balanced;
6908 	}
6909 
6910 force_balance:
6911 	/* Looks like there is an imbalance. Compute it */
6912 	calculate_imbalance(env, &sds);
6913 	return sds.busiest;
6914 
6915 out_balanced:
6916 	env->imbalance = 0;
6917 	return NULL;
6918 }
6919 
6920 /*
6921  * find_busiest_queue - find the busiest runqueue among the cpus in group.
6922  */
6923 static struct rq *find_busiest_queue(struct lb_env *env,
6924 				     struct sched_group *group)
6925 {
6926 	struct rq *busiest = NULL, *rq;
6927 	unsigned long busiest_load = 0, busiest_capacity = 1;
6928 	int i;
6929 
6930 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6931 		unsigned long capacity, wl;
6932 		enum fbq_type rt;
6933 
6934 		rq = cpu_rq(i);
6935 		rt = fbq_classify_rq(rq);
6936 
6937 		/*
6938 		 * We classify groups/runqueues into three groups:
6939 		 *  - regular: there are !numa tasks
6940 		 *  - remote:  there are numa tasks that run on the 'wrong' node
6941 		 *  - all:     there is no distinction
6942 		 *
6943 		 * In order to avoid migrating ideally placed numa tasks,
6944 		 * ignore those when there's better options.
6945 		 *
6946 		 * If we ignore the actual busiest queue to migrate another
6947 		 * task, the next balance pass can still reduce the busiest
6948 		 * queue by moving tasks around inside the node.
6949 		 *
6950 		 * If we cannot move enough load due to this classification
6951 		 * the next pass will adjust the group classification and
6952 		 * allow migration of more tasks.
6953 		 *
6954 		 * Both cases only affect the total convergence complexity.
6955 		 */
6956 		if (rt > env->fbq_type)
6957 			continue;
6958 
6959 		capacity = capacity_of(i);
6960 
6961 		wl = weighted_cpuload(i);
6962 
6963 		/*
6964 		 * When comparing with imbalance, use weighted_cpuload()
6965 		 * which is not scaled with the cpu capacity.
6966 		 */
6967 
6968 		if (rq->nr_running == 1 && wl > env->imbalance &&
6969 		    !check_cpu_capacity(rq, env->sd))
6970 			continue;
6971 
6972 		/*
6973 		 * For the load comparisons with the other cpu's, consider
6974 		 * the weighted_cpuload() scaled with the cpu capacity, so
6975 		 * that the load can be moved away from the cpu that is
6976 		 * potentially running at a lower capacity.
6977 		 *
6978 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
6979 		 * multiplication to rid ourselves of the division works out
6980 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
6981 		 * our previous maximum.
6982 		 */
6983 		if (wl * busiest_capacity > busiest_load * capacity) {
6984 			busiest_load = wl;
6985 			busiest_capacity = capacity;
6986 			busiest = rq;
6987 		}
6988 	}
6989 
6990 	return busiest;
6991 }
6992 
/*
 * Upper bound for the backoff applied when pinned tasks are encountered.
 * The exact value is fairly arbitrary; it only has to be large enough.
 */
#define MAX_PINNED_INTERVAL	512

/* Per-CPU scratch cpumask used by load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7001 
7002 static int need_active_balance(struct lb_env *env)
7003 {
7004 	struct sched_domain *sd = env->sd;
7005 
7006 	if (env->idle == CPU_NEWLY_IDLE) {
7007 
7008 		/*
7009 		 * ASYM_PACKING needs to force migrate tasks from busy but
7010 		 * higher numbered CPUs in order to pack all tasks in the
7011 		 * lowest numbered CPUs.
7012 		 */
7013 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7014 			return 1;
7015 	}
7016 
7017 	/*
7018 	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
7019 	 * It's worth migrating the task if the src_cpu's capacity is reduced
7020 	 * because of other sched_class or IRQs if more capacity stays
7021 	 * available on dst_cpu.
7022 	 */
7023 	if ((env->idle != CPU_NOT_IDLE) &&
7024 	    (env->src_rq->cfs.h_nr_running == 1)) {
7025 		if ((check_cpu_capacity(env->src_rq, sd)) &&
7026 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
7027 			return 1;
7028 	}
7029 
7030 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7031 }
7032 
/* Defined further down; load_balance() hands it to the cpu stopper. */
static int active_load_balance_cpu_stop(void *data);
7034 
7035 static int should_we_balance(struct lb_env *env)
7036 {
7037 	struct sched_group *sg = env->sd->groups;
7038 	struct cpumask *sg_cpus, *sg_mask;
7039 	int cpu, balance_cpu = -1;
7040 
7041 	/*
7042 	 * In the newly idle case, we will allow all the cpu's
7043 	 * to do the newly idle load balance.
7044 	 */
7045 	if (env->idle == CPU_NEWLY_IDLE)
7046 		return 1;
7047 
7048 	sg_cpus = sched_group_cpus(sg);
7049 	sg_mask = sched_group_mask(sg);
7050 	/* Try to find first idle cpu */
7051 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
7052 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
7053 			continue;
7054 
7055 		balance_cpu = cpu;
7056 		break;
7057 	}
7058 
7059 	if (balance_cpu == -1)
7060 		balance_cpu = group_balance_cpu(sg);
7061 
7062 	/*
7063 	 * First idle cpu or the first cpu(busiest) in this sched group
7064 	 * is eligible for doing load balancing at this and above domains.
7065 	 */
7066 	return balance_cpu == env->dst_cpu;
7067 }
7068 
7069 /*
7070  * Check this_cpu to ensure it is balanced within domain. Attempt to move
7071  * tasks if there is an imbalance.
7072  */
7073 static int load_balance(int this_cpu, struct rq *this_rq,
7074 			struct sched_domain *sd, enum cpu_idle_type idle,
7075 			int *continue_balancing)
7076 {
7077 	int ld_moved, cur_ld_moved, active_balance = 0;
7078 	struct sched_domain *sd_parent = sd->parent;
7079 	struct sched_group *group;
7080 	struct rq *busiest;
7081 	unsigned long flags;
7082 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
7083 
7084 	struct lb_env env = {
7085 		.sd		= sd,
7086 		.dst_cpu	= this_cpu,
7087 		.dst_rq		= this_rq,
7088 		.dst_grpmask    = sched_group_cpus(sd->groups),
7089 		.idle		= idle,
7090 		.loop_break	= sched_nr_migrate_break,
7091 		.cpus		= cpus,
7092 		.fbq_type	= all,
7093 		.tasks		= LIST_HEAD_INIT(env.tasks),
7094 	};
7095 
7096 	/*
7097 	 * For NEWLY_IDLE load_balancing, we don't need to consider
7098 	 * other cpus in our group
7099 	 */
7100 	if (idle == CPU_NEWLY_IDLE)
7101 		env.dst_grpmask = NULL;
7102 
7103 	cpumask_copy(cpus, cpu_active_mask);
7104 
7105 	schedstat_inc(sd, lb_count[idle]);
7106 
7107 redo:
7108 	if (!should_we_balance(&env)) {
7109 		*continue_balancing = 0;
7110 		goto out_balanced;
7111 	}
7112 
7113 	group = find_busiest_group(&env);
7114 	if (!group) {
7115 		schedstat_inc(sd, lb_nobusyg[idle]);
7116 		goto out_balanced;
7117 	}
7118 
7119 	busiest = find_busiest_queue(&env, group);
7120 	if (!busiest) {
7121 		schedstat_inc(sd, lb_nobusyq[idle]);
7122 		goto out_balanced;
7123 	}
7124 
7125 	BUG_ON(busiest == env.dst_rq);
7126 
7127 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7128 
7129 	env.src_cpu = busiest->cpu;
7130 	env.src_rq = busiest;
7131 
7132 	ld_moved = 0;
7133 	if (busiest->nr_running > 1) {
7134 		/*
7135 		 * Attempt to move tasks. If find_busiest_group has found
7136 		 * an imbalance but busiest->nr_running <= 1, the group is
7137 		 * still unbalanced. ld_moved simply stays zero, so it is
7138 		 * correctly treated as an imbalance.
7139 		 */
7140 		env.flags |= LBF_ALL_PINNED;
7141 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
7142 
7143 more_balance:
7144 		raw_spin_lock_irqsave(&busiest->lock, flags);
7145 
7146 		/*
7147 		 * cur_ld_moved - load moved in current iteration
7148 		 * ld_moved     - cumulative load moved across iterations
7149 		 */
7150 		cur_ld_moved = detach_tasks(&env);
7151 
7152 		/*
7153 		 * We've detached some tasks from busiest_rq. Every
7154 		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
7155 		 * unlock busiest->lock, and we are able to be sure
7156 		 * that nobody can manipulate the tasks in parallel.
7157 		 * See task_rq_lock() family for the details.
7158 		 */
7159 
7160 		raw_spin_unlock(&busiest->lock);
7161 
7162 		if (cur_ld_moved) {
7163 			attach_tasks(&env);
7164 			ld_moved += cur_ld_moved;
7165 		}
7166 
7167 		local_irq_restore(flags);
7168 
7169 		if (env.flags & LBF_NEED_BREAK) {
7170 			env.flags &= ~LBF_NEED_BREAK;
7171 			goto more_balance;
7172 		}
7173 
7174 		/*
7175 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7176 		 * us and move them to an alternate dst_cpu in our sched_group
7177 		 * where they can run. The upper limit on how many times we
7178 		 * iterate on same src_cpu is dependent on number of cpus in our
7179 		 * sched_group.
7180 		 *
7181 		 * This changes load balance semantics a bit on who can move
7182 		 * load to a given_cpu. In addition to the given_cpu itself
7183 		 * (or a ilb_cpu acting on its behalf where given_cpu is
7184 		 * nohz-idle), we now have balance_cpu in a position to move
7185 		 * load to given_cpu. In rare situations, this may cause
7186 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7187 		 * _independently_ and at _same_ time to move some load to
7188 		 * given_cpu) causing exceess load to be moved to given_cpu.
7189 		 * This however should not happen so much in practice and
7190 		 * moreover subsequent load balance cycles should correct the
7191 		 * excess load moved.
7192 		 */
7193 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7194 
7195 			/* Prevent to re-select dst_cpu via env's cpus */
7196 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
7197 
7198 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
7199 			env.dst_cpu	 = env.new_dst_cpu;
7200 			env.flags	&= ~LBF_DST_PINNED;
7201 			env.loop	 = 0;
7202 			env.loop_break	 = sched_nr_migrate_break;
7203 
7204 			/*
7205 			 * Go back to "more_balance" rather than "redo" since we
7206 			 * need to continue with same src_cpu.
7207 			 */
7208 			goto more_balance;
7209 		}
7210 
7211 		/*
7212 		 * We failed to reach balance because of affinity.
7213 		 */
7214 		if (sd_parent) {
7215 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7216 
7217 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7218 				*group_imbalance = 1;
7219 		}
7220 
7221 		/* All tasks on this runqueue were pinned by CPU affinity */
7222 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
7223 			cpumask_clear_cpu(cpu_of(busiest), cpus);
7224 			if (!cpumask_empty(cpus)) {
7225 				env.loop = 0;
7226 				env.loop_break = sched_nr_migrate_break;
7227 				goto redo;
7228 			}
7229 			goto out_all_pinned;
7230 		}
7231 	}
7232 
7233 	if (!ld_moved) {
7234 		schedstat_inc(sd, lb_failed[idle]);
7235 		/*
7236 		 * Increment the failure counter only on periodic balance.
7237 		 * We do not want newidle balance, which can be very
7238 		 * frequent, pollute the failure counter causing
7239 		 * excessive cache_hot migrations and active balances.
7240 		 */
7241 		if (idle != CPU_NEWLY_IDLE)
7242 			sd->nr_balance_failed++;
7243 
7244 		if (need_active_balance(&env)) {
7245 			raw_spin_lock_irqsave(&busiest->lock, flags);
7246 
7247 			/* don't kick the active_load_balance_cpu_stop,
7248 			 * if the curr task on busiest cpu can't be
7249 			 * moved to this_cpu
7250 			 */
7251 			if (!cpumask_test_cpu(this_cpu,
7252 					tsk_cpus_allowed(busiest->curr))) {
7253 				raw_spin_unlock_irqrestore(&busiest->lock,
7254 							    flags);
7255 				env.flags |= LBF_ALL_PINNED;
7256 				goto out_one_pinned;
7257 			}
7258 
7259 			/*
7260 			 * ->active_balance synchronizes accesses to
7261 			 * ->active_balance_work.  Once set, it's cleared
7262 			 * only after active load balance is finished.
7263 			 */
7264 			if (!busiest->active_balance) {
7265 				busiest->active_balance = 1;
7266 				busiest->push_cpu = this_cpu;
7267 				active_balance = 1;
7268 			}
7269 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
7270 
7271 			if (active_balance) {
7272 				stop_one_cpu_nowait(cpu_of(busiest),
7273 					active_load_balance_cpu_stop, busiest,
7274 					&busiest->active_balance_work);
7275 			}
7276 
7277 			/*
7278 			 * We've kicked active balancing, reset the failure
7279 			 * counter.
7280 			 */
7281 			sd->nr_balance_failed = sd->cache_nice_tries+1;
7282 		}
7283 	} else
7284 		sd->nr_balance_failed = 0;
7285 
7286 	if (likely(!active_balance)) {
7287 		/* We were unbalanced, so reset the balancing interval */
7288 		sd->balance_interval = sd->min_interval;
7289 	} else {
7290 		/*
7291 		 * If we've begun active balancing, start to back off. This
7292 		 * case may not be covered by the all_pinned logic if there
7293 		 * is only 1 task on the busy runqueue (because we don't call
7294 		 * detach_tasks).
7295 		 */
7296 		if (sd->balance_interval < sd->max_interval)
7297 			sd->balance_interval *= 2;
7298 	}
7299 
7300 	goto out;
7301 
7302 out_balanced:
7303 	/*
7304 	 * We reach balance although we may have faced some affinity
7305 	 * constraints. Clear the imbalance flag if it was set.
7306 	 */
7307 	if (sd_parent) {
7308 		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7309 
7310 		if (*group_imbalance)
7311 			*group_imbalance = 0;
7312 	}
7313 
7314 out_all_pinned:
7315 	/*
7316 	 * We reach balance because all tasks are pinned at this level so
7317 	 * we can't migrate them. Let the imbalance flag set so parent level
7318 	 * can try to migrate them.
7319 	 */
7320 	schedstat_inc(sd, lb_balanced[idle]);
7321 
7322 	sd->nr_balance_failed = 0;
7323 
7324 out_one_pinned:
7325 	/* tune up the balancing interval */
7326 	if (((env.flags & LBF_ALL_PINNED) &&
7327 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
7328 			(sd->balance_interval < sd->max_interval))
7329 		sd->balance_interval *= 2;
7330 
7331 	ld_moved = 0;
7332 out:
7333 	return ld_moved;
7334 }
7335 
7336 static inline unsigned long
7337 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7338 {
7339 	unsigned long interval = sd->balance_interval;
7340 
7341 	if (cpu_busy)
7342 		interval *= sd->busy_factor;
7343 
7344 	/* scale ms to jiffies */
7345 	interval = msecs_to_jiffies(interval);
7346 	interval = clamp(interval, 1UL, max_load_balance_interval);
7347 
7348 	return interval;
7349 }
7350 
7351 static inline void
7352 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7353 {
7354 	unsigned long interval, next;
7355 
7356 	interval = get_sd_balance_interval(sd, cpu_busy);
7357 	next = sd->last_balance + interval;
7358 
7359 	if (time_after(*next_balance, next))
7360 		*next_balance = next;
7361 }
7362 
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 *
 * this_rq->lock is dropped while the domains are walked and re-taken
 * afterwards; the early bail-out path does not touch the lock.
 *
 * Returns:
 *   the load_balance() result (> 0) if tasks were pulled,
 *   1  if nothing was pulled but CFS tasks showed up on this_rq meanwhile,
 *   -1 if this_rq has runnable tasks that are not CFS tasks,
 *   0  otherwise.
 */
static int idle_balance(struct rq *this_rq)
{
	unsigned long next_balance = jiffies + HZ;
	int this_cpu = this_rq->cpu;
	struct sched_domain *sd;
	int pulled_task = 0;
	u64 curr_cost = 0;

	/*
	 * Set idle_stamp before doing any balancing work, such that the
	 * time spent balancing in here is measured as idle time.
	 */
	this_rq->idle_stamp = rq_clock(this_rq);

	/*
	 * Not worth balancing: either this cpu is expected to be idle for
	 * less than the migration cost, or the root domain is not flagged
	 * as overloaded. Only refresh next_balance from the base domain.
	 */
	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
	    !this_rq->rd->overload) {
		rcu_read_lock();
		sd = rcu_dereference_check_sched_domain(this_rq->sd);
		if (sd)
			update_next_balance(sd, 0, &next_balance);
		rcu_read_unlock();

		goto out;
	}

	raw_spin_unlock(&this_rq->lock);

	update_blocked_averages(this_cpu);
	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		int continue_balancing = 1;
		u64 t0, domain_cost;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		/*
		 * Stop once the cost spent so far plus the worst case seen
		 * for this domain exceeds the expected idle time.
		 */
		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			update_next_balance(sd, 0, &next_balance);
			break;
		}

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			t0 = sched_clock_cpu(this_cpu);

			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE,
						   &continue_balancing);

			/* Track the per-domain worst case and the running total. */
			domain_cost = sched_clock_cpu(this_cpu) - t0;
			if (domain_cost > sd->max_newidle_lb_cost)
				sd->max_newidle_lb_cost = domain_cost;

			curr_cost += domain_cost;
		}

		update_next_balance(sd, 0, &next_balance);

		/*
		 * Stop searching for tasks to pull if there are
		 * now runnable tasks on this rq.
		 */
		if (pulled_task || this_rq->nr_running > 0)
			break;
	}
	rcu_read_unlock();

	raw_spin_lock(&this_rq->lock);

	if (curr_cost > this_rq->max_idle_balance_cost)
		this_rq->max_idle_balance_cost = curr_cost;

	/*
	 * While browsing the domains, we released the rq lock, a task could
	 * have been enqueued in the meantime. Since we're not going idle,
	 * pretend we pulled a task.
	 */
	if (this_rq->cfs.h_nr_running && !pulled_task)
		pulled_task = 1;

out:
	/* Move the next balance forward */
	if (time_after(this_rq->next_balance, next_balance))
		this_rq->next_balance = next_balance;

	/* Is there a task of a high priority class? */
	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
		pulled_task = -1;

	/* We are not going idle after all, so drop the idle timestamp. */
	if (pulled_task)
		this_rq->idle_stamp = 0;

	return pulled_task;
}
7460 
/*
 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 *
 * @data: the busiest runqueue (struct rq *); its ->push_cpu names the
 *        target cpu.
 *
 * Always returns 0. ->active_balance of the busiest rq is cleared on
 * every path.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	struct task_struct *p = NULL;

	raw_spin_lock_irq(&busiest_rq->lock);

	/* make sure the requested cpu hasn't gone down in the meantime */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				break;
	}

	/* sd is NULL here if the walk ended without finding such a domain. */
	if (likely(sd)) {
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= target_cpu,
			.dst_rq		= target_rq,
			.src_cpu	= busiest_rq->cpu,
			.src_rq		= busiest_rq,
			.idle		= CPU_IDLE,
		};

		schedstat_inc(sd, alb_count);

		p = detach_one_task(&env);
		if (p)
			schedstat_inc(sd, alb_pushed);
		else
			schedstat_inc(sd, alb_failed);
	}
	rcu_read_unlock();
out_unlock:
	busiest_rq->active_balance = 0;
	raw_spin_unlock(&busiest_rq->lock);

	/* Attach with busiest_rq->lock dropped but interrupts still off. */
	if (p)
		attach_one_task(target_rq, p);

	local_irq_enable();

	return 0;
}
7532 
7533 static inline int on_null_domain(struct rq *rq)
7534 {
7535 	return unlikely(!rcu_dereference_sched(rq->sd));
7536 }
7537 
7538 #ifdef CONFIG_NO_HZ_COMMON
/*
 * idle load balancing details
 * - When one of the busy CPUs notice that there may be an idle rebalancing
 *   needed, they will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
static struct {
	/* cpus that registered via nohz_balance_enter_idle() */
	cpumask_var_t idle_cpus_mask;
	/* incremented/decremented together with the bits in idle_cpus_mask */
	atomic_t nr_cpus;
	unsigned long next_balance;     /* in jiffy units */
} nohz ____cacheline_aligned;
7550 
7551 static inline int find_new_ilb(void)
7552 {
7553 	int ilb = cpumask_first(nohz.idle_cpus_mask);
7554 
7555 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
7556 		return ilb;
7557 
7558 	return nr_cpu_ids;
7559 }
7560 
7561 /*
7562  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7563  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
7564  * CPU (if there is one).
7565  */
7566 static void nohz_balancer_kick(void)
7567 {
7568 	int ilb_cpu;
7569 
7570 	nohz.next_balance++;
7571 
7572 	ilb_cpu = find_new_ilb();
7573 
7574 	if (ilb_cpu >= nr_cpu_ids)
7575 		return;
7576 
7577 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7578 		return;
7579 	/*
7580 	 * Use smp_send_reschedule() instead of resched_cpu().
7581 	 * This way we generate a sched IPI on the target cpu which
7582 	 * is idle. And the softirq performing nohz idle load balance
7583 	 * will be run before returning from the IPI.
7584 	 */
7585 	smp_send_reschedule(ilb_cpu);
7586 	return;
7587 }
7588 
7589 static inline void nohz_balance_exit_idle(int cpu)
7590 {
7591 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7592 		/*
7593 		 * Completely isolated CPUs don't ever set, so we must test.
7594 		 */
7595 		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7596 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7597 			atomic_dec(&nohz.nr_cpus);
7598 		}
7599 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7600 	}
7601 }
7602 
7603 static inline void set_cpu_sd_state_busy(void)
7604 {
7605 	struct sched_domain *sd;
7606 	int cpu = smp_processor_id();
7607 
7608 	rcu_read_lock();
7609 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7610 
7611 	if (!sd || !sd->nohz_idle)
7612 		goto unlock;
7613 	sd->nohz_idle = 0;
7614 
7615 	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7616 unlock:
7617 	rcu_read_unlock();
7618 }
7619 
7620 void set_cpu_sd_state_idle(void)
7621 {
7622 	struct sched_domain *sd;
7623 	int cpu = smp_processor_id();
7624 
7625 	rcu_read_lock();
7626 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7627 
7628 	if (!sd || sd->nohz_idle)
7629 		goto unlock;
7630 	sd->nohz_idle = 1;
7631 
7632 	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7633 unlock:
7634 	rcu_read_unlock();
7635 }
7636 
7637 /*
7638  * This routine will record that the cpu is going idle with tick stopped.
7639  * This info will be used in performing idle load balancing in the future.
7640  */
7641 void nohz_balance_enter_idle(int cpu)
7642 {
7643 	/*
7644 	 * If this cpu is going down, then nothing needs to be done.
7645 	 */
7646 	if (!cpu_active(cpu))
7647 		return;
7648 
7649 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7650 		return;
7651 
7652 	/*
7653 	 * If we're a completely isolated CPU, we don't play.
7654 	 */
7655 	if (on_null_domain(cpu_rq(cpu)))
7656 		return;
7657 
7658 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7659 	atomic_inc(&nohz.nr_cpus);
7660 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7661 }
7662 
7663 static int sched_ilb_notifier(struct notifier_block *nfb,
7664 					unsigned long action, void *hcpu)
7665 {
7666 	switch (action & ~CPU_TASKS_FROZEN) {
7667 	case CPU_DYING:
7668 		nohz_balance_exit_idle(smp_processor_id());
7669 		return NOTIFY_OK;
7670 	default:
7671 		return NOTIFY_DONE;
7672 	}
7673 }
7674 #endif
7675 
/* Taken (trylock) by rebalance_domains() around SD_SERIALIZE domains. */
static DEFINE_SPINLOCK(balancing);
7677 
7678 /*
7679  * Scale the max load_balance interval with the number of CPUs in the system.
7680  * This trades load-balance latency on larger machines for less cross talk.
7681  */
7682 void update_max_interval(void)
7683 {
7684 	max_load_balance_interval = HZ*num_online_cpus()/10;
7685 }
7686 
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 *
 * @rq:   runqueue whose domain hierarchy is walked
 * @idle: idle state to balance with; re-evaluated after a successful
 *        load_balance()
 *
 * Also decays the newidle cost estimates of the domains and, when any
 * domain was balanced or is due later, updates rq->next_balance.
 */
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
	int continue_balancing = 1;
	int cpu = rq->cpu;
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	/* NOTE: this local flag shadows the update_next_balance() helper. */
	int update_next_balance = 0;
	int need_serialize, need_decay = 0;
	u64 max_cost = 0;

	update_blocked_averages(cpu);

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		/*
		 * Decay the newidle max times here because this is a regular
		 * visit to all the domains. Decay ~1% per second.
		 */
		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
			sd->max_newidle_lb_cost =
				(sd->max_newidle_lb_cost * 253) / 256;
			sd->next_decay_max_lb_cost = jiffies + HZ;
			need_decay = 1;
		}
		max_cost += sd->max_newidle_lb_cost;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!continue_balancing) {
			/* Keep walking only to decay the remaining domains. */
			if (need_decay)
				continue;
			break;
		}

		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

		need_serialize = sd->flags & SD_SERIALIZE;
		if (need_serialize) {
			/* Somebody else holds the lock: skip the balance itself. */
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				/*
				 * The LBF_DST_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
			/* The idle state may have changed, so recompute. */
			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}
	}
	if (need_decay) {
		/*
		 * Ensure the rq-wide value also decays but keep it at a
		 * reasonable floor to avoid funnies with rq->avg_idle.
		 */
		rq->max_idle_balance_cost =
			max((u64)sysctl_sched_migration_cost, max_cost);
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance)) {
		rq->next_balance = next_balance;

#ifdef CONFIG_NO_HZ_COMMON
		/*
		 * If this CPU has been elected to perform the nohz idle
		 * balance. Other idle CPUs have already rebalanced with
		 * nohz_idle_balance() and nohz.next_balance has been
		 * updated accordingly. This CPU is now running the idle load
		 * balance for itself and we need to update the
		 * nohz.next_balance accordingly.
		 */
		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
			nohz.next_balance = rq->next_balance;
#endif
	}
}
7795 
7796 #ifdef CONFIG_NO_HZ_COMMON
7797 /*
7798  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
7799  * rebalancing for all the cpus for whom scheduler ticks are stopped.
7800  */
7801 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7802 {
7803 	int this_cpu = this_rq->cpu;
7804 	struct rq *rq;
7805 	int balance_cpu;
7806 	/* Earliest time when we have to do rebalance again */
7807 	unsigned long next_balance = jiffies + 60*HZ;
7808 	int update_next_balance = 0;
7809 
7810 	if (idle != CPU_IDLE ||
7811 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7812 		goto end;
7813 
7814 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7815 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7816 			continue;
7817 
7818 		/*
7819 		 * If this cpu gets work to do, stop the load balancing
7820 		 * work being done for other cpus. Next load
7821 		 * balancing owner will pick it up.
7822 		 */
7823 		if (need_resched())
7824 			break;
7825 
7826 		rq = cpu_rq(balance_cpu);
7827 
7828 		/*
7829 		 * If time for next balance is due,
7830 		 * do the balance.
7831 		 */
7832 		if (time_after_eq(jiffies, rq->next_balance)) {
7833 			raw_spin_lock_irq(&rq->lock);
7834 			update_rq_clock(rq);
7835 			update_idle_cpu_load(rq);
7836 			raw_spin_unlock_irq(&rq->lock);
7837 			rebalance_domains(rq, CPU_IDLE);
7838 		}
7839 
7840 		if (time_after(next_balance, rq->next_balance)) {
7841 			next_balance = rq->next_balance;
7842 			update_next_balance = 1;
7843 		}
7844 	}
7845 
7846 	/*
7847 	 * next_balance will be updated only when there is a need.
7848 	 * When the CPU is attached to null domain for ex, it will not be
7849 	 * updated.
7850 	 */
7851 	if (likely(update_next_balance))
7852 		nohz.next_balance = next_balance;
7853 end:
7854 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7855 }
7856 
7857 /*
7858  * Current heuristic for kicking the idle load balancer in the presence
7859  * of an idle cpu in the system.
7860  *   - This rq has more than one task.
7861  *   - This rq has at least one CFS task and the capacity of the CPU is
7862  *     significantly reduced because of RT tasks or IRQs.
7863  *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
7864  *     multiple busy cpu.
7865  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7866  *     domain span are idle.
7867  */
7868 static inline bool nohz_kick_needed(struct rq *rq)
7869 {
7870 	unsigned long now = jiffies;
7871 	struct sched_domain *sd;
7872 	struct sched_group_capacity *sgc;
7873 	int nr_busy, cpu = rq->cpu;
7874 	bool kick = false;
7875 
7876 	if (unlikely(rq->idle_balance))
7877 		return false;
7878 
7879        /*
7880 	* We may be recently in ticked or tickless idle mode. At the first
7881 	* busy tick after returning from idle, we will update the busy stats.
7882 	*/
7883 	set_cpu_sd_state_busy();
7884 	nohz_balance_exit_idle(cpu);
7885 
7886 	/*
7887 	 * None are in tickless mode and hence no need for NOHZ idle load
7888 	 * balancing.
7889 	 */
7890 	if (likely(!atomic_read(&nohz.nr_cpus)))
7891 		return false;
7892 
7893 	if (time_before(now, nohz.next_balance))
7894 		return false;
7895 
7896 	if (rq->nr_running >= 2)
7897 		return true;
7898 
7899 	rcu_read_lock();
7900 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7901 	if (sd) {
7902 		sgc = sd->groups->sgc;
7903 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
7904 
7905 		if (nr_busy > 1) {
7906 			kick = true;
7907 			goto unlock;
7908 		}
7909 
7910 	}
7911 
7912 	sd = rcu_dereference(rq->sd);
7913 	if (sd) {
7914 		if ((rq->cfs.h_nr_running >= 1) &&
7915 				check_cpu_capacity(rq, sd)) {
7916 			kick = true;
7917 			goto unlock;
7918 		}
7919 	}
7920 
7921 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
7922 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7923 				  sched_domain_span(sd)) < cpu)) {
7924 		kick = true;
7925 		goto unlock;
7926 	}
7927 
7928 unlock:
7929 	rcu_read_unlock();
7930 	return kick;
7931 }
7932 #else
/* !CONFIG_NO_HZ_COMMON: empty stub so run_rebalance_domains() can call it. */
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7934 #endif
7935 
7936 /*
7937  * run_rebalance_domains is triggered when needed from the scheduler tick.
7938  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7939  */
7940 static void run_rebalance_domains(struct softirq_action *h)
7941 {
7942 	struct rq *this_rq = this_rq();
7943 	enum cpu_idle_type idle = this_rq->idle_balance ?
7944 						CPU_IDLE : CPU_NOT_IDLE;
7945 
7946 	/*
7947 	 * If this cpu has a pending nohz_balance_kick, then do the
7948 	 * balancing on behalf of the other idle cpus whose ticks are
7949 	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
7950 	 * give the idle cpus a chance to load balance. Else we may
7951 	 * load balance only within the local sched_domain hierarchy
7952 	 * and abort nohz_idle_balance altogether if we pull some load.
7953 	 */
7954 	nohz_idle_balance(this_rq, idle);
7955 	rebalance_domains(this_rq, idle);
7956 }
7957 
7958 /*
7959  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7960  */
7961 void trigger_load_balance(struct rq *rq)
7962 {
7963 	/* Don't need to rebalance while attached to NULL domain */
7964 	if (unlikely(on_null_domain(rq)))
7965 		return;
7966 
7967 	if (time_after_eq(jiffies, rq->next_balance))
7968 		raise_softirq(SCHED_SOFTIRQ);
7969 #ifdef CONFIG_NO_HZ_COMMON
7970 	if (nohz_kick_needed(rq))
7971 		nohz_balancer_kick();
7972 #endif
7973 }
7974 
/*
 * Fair-class hook for a runqueue coming online: refreshes the scaled
 * sysctl tunables and the runtime-enabled state of @rq.
 */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();

	update_runtime_enabled(rq);
}
7981 
/*
 * Fair-class hook for a runqueue going offline: refreshes the scaled
 * sysctl tunables and unthrottles the cfs_rqs of @rq.
 */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}
7989 
7990 #endif /* CONFIG_SMP */
7991 
7992 /*
7993  * scheduler tick hitting a task of our scheduling class:
7994  */
7995 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7996 {
7997 	struct cfs_rq *cfs_rq;
7998 	struct sched_entity *se = &curr->se;
7999 
8000 	for_each_sched_entity(se) {
8001 		cfs_rq = cfs_rq_of(se);
8002 		entity_tick(cfs_rq, se, queued);
8003 	}
8004 
8005 	if (static_branch_unlikely(&sched_numa_balancing))
8006 		task_tick_numa(rq, curr);
8007 }
8008 
/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 *
 * Gives the child @p its initial vruntime, derived from the parent's
 * current entity, and leaves it stored relative to cfs_rq->min_vruntime.
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	/*
	 * Not only the cpu but also the task_group of the parent might have
	 * been changed after parent->se.parent,cfs_rq were copied to
	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
	 * of child point to valid ones.
	 */
	rcu_read_lock();
	__set_task_cpu(p, this_cpu);
	rcu_read_unlock();

	update_curr(cfs_rq);

	/* Start the child from the parent's vruntime, then place it. */
	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_curr(rq);
	}

	/* Store the vruntime relative to this cfs_rq's min_vruntime. */
	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
8058 
8059 /*
8060  * Priority of the task has changed. Check to see if we preempt
8061  * the current task.
8062  */
8063 static void
8064 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
8065 {
8066 	if (!task_on_rq_queued(p))
8067 		return;
8068 
8069 	/*
8070 	 * Reschedule if we are currently running on this runqueue and
8071 	 * our priority decreased, or if we are not currently running on
8072 	 * this runqueue and our priority is higher than the current's
8073 	 */
8074 	if (rq->curr == p) {
8075 		if (p->prio > oldprio)
8076 			resched_curr(rq);
8077 	} else
8078 		check_preempt_curr(rq, p, 0);
8079 }
8080 
8081 static inline bool vruntime_normalized(struct task_struct *p)
8082 {
8083 	struct sched_entity *se = &p->se;
8084 
8085 	/*
8086 	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
8087 	 * the dequeue_entity(.flags=0) will already have normalized the
8088 	 * vruntime.
8089 	 */
8090 	if (p->on_rq)
8091 		return true;
8092 
8093 	/*
8094 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
8095 	 * But there are some cases where it has already been normalized:
8096 	 *
8097 	 * - A forked child which is waiting for being woken up by
8098 	 *   wake_up_new_task().
8099 	 * - A task which has been woken up by try_to_wake_up() and
8100 	 *   waiting for actually being woken up by sched_ttwu_pending().
8101 	 */
8102 	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
8103 		return true;
8104 
8105 	return false;
8106 }
8107 
8108 static void detach_task_cfs_rq(struct task_struct *p)
8109 {
8110 	struct sched_entity *se = &p->se;
8111 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
8112 
8113 	if (!vruntime_normalized(p)) {
8114 		/*
8115 		 * Fix up our vruntime so that the current sleep doesn't
8116 		 * cause 'unlimited' sleep bonus.
8117 		 */
8118 		place_entity(cfs_rq, se, 0);
8119 		se->vruntime -= cfs_rq->min_vruntime;
8120 	}
8121 
8122 	/* Catch up with the cfs_rq and remove our load when we leave */
8123 	detach_entity_load_avg(cfs_rq, se);
8124 }
8125 
8126 static void attach_task_cfs_rq(struct task_struct *p)
8127 {
8128 	struct sched_entity *se = &p->se;
8129 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
8130 
8131 #ifdef CONFIG_FAIR_GROUP_SCHED
8132 	/*
8133 	 * Since the real-depth could have been changed (only FAIR
8134 	 * class maintain depth value), reset depth properly.
8135 	 */
8136 	se->depth = se->parent ? se->parent->depth + 1 : 0;
8137 #endif
8138 
8139 	/* Synchronize task with its cfs_rq */
8140 	attach_entity_load_avg(cfs_rq, se);
8141 
8142 	if (!vruntime_normalized(p))
8143 		se->vruntime += cfs_rq->min_vruntime;
8144 }
8145 
/* @p is leaving the fair class: detach it from its cfs_rq. @rq is unused. */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
8150 
8151 static void switched_to_fair(struct rq *rq, struct task_struct *p)
8152 {
8153 	attach_task_cfs_rq(p);
8154 
8155 	if (task_on_rq_queued(p)) {
8156 		/*
8157 		 * We were most likely switched from sched_rt, so
8158 		 * kick off the schedule if running, otherwise just see
8159 		 * if we can still preempt the current task.
8160 		 */
8161 		if (rq->curr == p)
8162 			resched_curr(rq);
8163 		else
8164 			check_preempt_curr(rq, p, 0);
8165 	}
8166 }
8167 
8168 /* Account for a task changing its policy or group.
8169  *
8170  * This routine is mostly called to set cfs_rq->curr field when a task
8171  * migrates between groups/classes.
8172  */
8173 static void set_curr_task_fair(struct rq *rq)
8174 {
8175 	struct sched_entity *se = &rq->curr->se;
8176 
8177 	for_each_sched_entity(se) {
8178 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
8179 
8180 		set_next_entity(cfs_rq, se);
8181 		/* ensure bandwidth has been allocated on our new cfs_rq */
8182 		account_cfs_rq_runtime(cfs_rq, 0);
8183 	}
8184 }
8185 
8186 void init_cfs_rq(struct cfs_rq *cfs_rq)
8187 {
8188 	cfs_rq->tasks_timeline = RB_ROOT;
8189 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8190 #ifndef CONFIG_64BIT
8191 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8192 #endif
8193 #ifdef CONFIG_SMP
8194 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
8195 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
8196 #endif
8197 }
8198 
8199 #ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Move @p to another task group: detach it from its current cfs_rq,
 * re-point it at the runqueue structures for its cpu, then attach it again.
 */
static void task_move_group_fair(struct task_struct *p)
{
	/* Leave the old cfs_rq first ... */
	detach_task_cfs_rq(p);

	/* ... switch over to the runqueue structures for this cpu ... */
	set_task_rq(p, task_cpu(p));

#ifdef CONFIG_SMP
	/* Tell that the se's cfs_rq has been changed -- migrated. */
	p->se.avg.last_update_time = 0;
#endif

	/* ... and finally join the new cfs_rq. */
	attach_task_cfs_rq(p);
}
8211 
8212 void free_fair_sched_group(struct task_group *tg)
8213 {
8214 	int i;
8215 
8216 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8217 
8218 	for_each_possible_cpu(i) {
8219 		if (tg->cfs_rq)
8220 			kfree(tg->cfs_rq[i]);
8221 		if (tg->se) {
8222 			if (tg->se[i])
8223 				remove_entity_load_avg(tg->se[i]);
8224 			kfree(tg->se[i]);
8225 		}
8226 	}
8227 
8228 	kfree(tg->cfs_rq);
8229 	kfree(tg->se);
8230 }
8231 
8232 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8233 {
8234 	struct cfs_rq *cfs_rq;
8235 	struct sched_entity *se;
8236 	int i;
8237 
8238 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8239 	if (!tg->cfs_rq)
8240 		goto err;
8241 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8242 	if (!tg->se)
8243 		goto err;
8244 
8245 	tg->shares = NICE_0_LOAD;
8246 
8247 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8248 
8249 	for_each_possible_cpu(i) {
8250 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8251 				      GFP_KERNEL, cpu_to_node(i));
8252 		if (!cfs_rq)
8253 			goto err;
8254 
8255 		se = kzalloc_node(sizeof(struct sched_entity),
8256 				  GFP_KERNEL, cpu_to_node(i));
8257 		if (!se)
8258 			goto err_free_rq;
8259 
8260 		init_cfs_rq(cfs_rq);
8261 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8262 		init_entity_runnable_average(se);
8263 	}
8264 
8265 	return 1;
8266 
8267 err_free_rq:
8268 	kfree(cfs_rq);
8269 err:
8270 	return 0;
8271 }
8272 
8273 void unregister_fair_sched_group(struct task_group *tg, int cpu)
8274 {
8275 	struct rq *rq = cpu_rq(cpu);
8276 	unsigned long flags;
8277 
8278 	/*
8279 	* Only empty task groups can be destroyed; so we can speculatively
8280 	* check on_list without danger of it being re-added.
8281 	*/
8282 	if (!tg->cfs_rq[cpu]->on_list)
8283 		return;
8284 
8285 	raw_spin_lock_irqsave(&rq->lock, flags);
8286 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8287 	raw_spin_unlock_irqrestore(&rq->lock, flags);
8288 }
8289 
8290 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8291 			struct sched_entity *se, int cpu,
8292 			struct sched_entity *parent)
8293 {
8294 	struct rq *rq = cpu_rq(cpu);
8295 
8296 	cfs_rq->tg = tg;
8297 	cfs_rq->rq = rq;
8298 	init_cfs_rq_runtime(cfs_rq);
8299 
8300 	tg->cfs_rq[cpu] = cfs_rq;
8301 	tg->se[cpu] = se;
8302 
8303 	/* se could be NULL for root_task_group */
8304 	if (!se)
8305 		return;
8306 
8307 	if (!parent) {
8308 		se->cfs_rq = &rq->cfs;
8309 		se->depth = 0;
8310 	} else {
8311 		se->cfs_rq = parent->my_q;
8312 		se->depth = parent->depth + 1;
8313 	}
8314 
8315 	se->my_q = cfs_rq;
8316 	/* guarantee group entities always have weight */
8317 	update_load_set(&se->load, NICE_0_LOAD);
8318 	se->parent = parent;
8319 }
8320 
8321 static DEFINE_MUTEX(shares_mutex);
8322 
8323 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8324 {
8325 	int i;
8326 	unsigned long flags;
8327 
8328 	/*
8329 	 * We can't change the weight of the root cgroup.
8330 	 */
8331 	if (!tg->se[0])
8332 		return -EINVAL;
8333 
8334 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8335 
8336 	mutex_lock(&shares_mutex);
8337 	if (tg->shares == shares)
8338 		goto done;
8339 
8340 	tg->shares = shares;
8341 	for_each_possible_cpu(i) {
8342 		struct rq *rq = cpu_rq(i);
8343 		struct sched_entity *se;
8344 
8345 		se = tg->se[i];
8346 		/* Propagate contribution to hierarchy */
8347 		raw_spin_lock_irqsave(&rq->lock, flags);
8348 
8349 		/* Possible calls to update_curr() need rq clock */
8350 		update_rq_clock(rq);
8351 		for_each_sched_entity(se)
8352 			update_cfs_shares(group_cfs_rq(se));
8353 		raw_spin_unlock_irqrestore(&rq->lock, flags);
8354 	}
8355 
8356 done:
8357 	mutex_unlock(&shares_mutex);
8358 	return 0;
8359 }
8360 #else /* CONFIG_FAIR_GROUP_SCHED */
8361 
/* Without CONFIG_FAIR_GROUP_SCHED there is no per-group state to free. */
void free_fair_sched_group(struct task_group *tg)
{
}
8363 
/*
 * Without CONFIG_FAIR_GROUP_SCHED there is nothing to allocate: always
 * report success (1). Both arguments are ignored.
 */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	const int success = 1;

	return success;
}
8368 
/* Without CONFIG_FAIR_GROUP_SCHED there is nothing to unregister. */
void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
8370 
8371 #endif /* CONFIG_FAIR_GROUP_SCHED */
8372 
8373 
8374 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8375 {
8376 	struct sched_entity *se = &task->se;
8377 	unsigned int rr_interval = 0;
8378 
8379 	/*
8380 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8381 	 * idle runqueue:
8382 	 */
8383 	if (rq->cfs.load.weight)
8384 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8385 
8386 	return rr_interval;
8387 }
8388 
8389 /*
8390  * All the scheduling class methods:
8391  */
8392 const struct sched_class fair_sched_class = {
8393 	.next			= &idle_sched_class,
8394 	.enqueue_task		= enqueue_task_fair,
8395 	.dequeue_task		= dequeue_task_fair,
8396 	.yield_task		= yield_task_fair,
8397 	.yield_to_task		= yield_to_task_fair,
8398 
8399 	.check_preempt_curr	= check_preempt_wakeup,
8400 
8401 	.pick_next_task		= pick_next_task_fair,
8402 	.put_prev_task		= put_prev_task_fair,
8403 
8404 #ifdef CONFIG_SMP
8405 	.select_task_rq		= select_task_rq_fair,
8406 	.migrate_task_rq	= migrate_task_rq_fair,
8407 
8408 	.rq_online		= rq_online_fair,
8409 	.rq_offline		= rq_offline_fair,
8410 
8411 	.task_waking		= task_waking_fair,
8412 	.task_dead		= task_dead_fair,
8413 	.set_cpus_allowed	= set_cpus_allowed_common,
8414 #endif
8415 
8416 	.set_curr_task          = set_curr_task_fair,
8417 	.task_tick		= task_tick_fair,
8418 	.task_fork		= task_fork_fair,
8419 
8420 	.prio_changed		= prio_changed_fair,
8421 	.switched_from		= switched_from_fair,
8422 	.switched_to		= switched_to_fair,
8423 
8424 	.get_rr_interval	= get_rr_interval_fair,
8425 
8426 	.update_curr		= update_curr_fair,
8427 
8428 #ifdef CONFIG_FAIR_GROUP_SCHED
8429 	.task_move_group	= task_move_group_fair,
8430 #endif
8431 };
8432 
8433 #ifdef CONFIG_SCHED_DEBUG
8434 void print_cfs_stats(struct seq_file *m, int cpu)
8435 {
8436 	struct cfs_rq *cfs_rq;
8437 
8438 	rcu_read_lock();
8439 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8440 		print_cfs_rq(m, cpu, cfs_rq);
8441 	rcu_read_unlock();
8442 }
8443 
8444 #ifdef CONFIG_NUMA_BALANCING
8445 void show_numa_stats(struct task_struct *p, struct seq_file *m)
8446 {
8447 	int node;
8448 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8449 
8450 	for_each_online_node(node) {
8451 		if (p->numa_faults) {
8452 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8453 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8454 		}
8455 		if (p->numa_group) {
8456 			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
8457 			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8458 		}
8459 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8460 	}
8461 }
8462 #endif /* CONFIG_NUMA_BALANCING */
8463 #endif /* CONFIG_SCHED_DEBUG */
8464 
8465 __init void init_sched_fair_class(void)
8466 {
8467 #ifdef CONFIG_SMP
8468 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8469 
8470 #ifdef CONFIG_NO_HZ_COMMON
8471 	nohz.next_balance = jiffies;
8472 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8473 	cpu_notifier(sched_ilb_notifier, 0);
8474 #endif
8475 #endif /* SMP */
8476 
8477 }
8478